summary refs log tree commit diff
path: root/usr/src
diff options
context:
space:
mode:
Diffstat (limited to 'usr/src')
-rw-r--r--usr/src/cmd/cmd-inet/usr.bin/netstat/netstat.c378
-rw-r--r--usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_main.c2
-rw-r--r--usr/src/cmd/cmd-inet/usr.lib/mdnsd/mDNSUNP.c9
-rw-r--r--usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c18
-rw-r--r--usr/src/cmd/cmd-inet/usr.sbin/ping/ping.c51
-rw-r--r--usr/src/cmd/cmd-inet/usr.sbin/route.c47
-rw-r--r--usr/src/cmd/cmd-inet/usr.sbin/traceroute/traceroute.c20
-rw-r--r--usr/src/cmd/devfsadm/misc_link.c3
-rw-r--r--usr/src/cmd/mdb/common/modules/arp/arp.c665
-rw-r--r--usr/src/cmd/mdb/common/modules/genunix/genunix.c4
-rw-r--r--usr/src/cmd/mdb/common/modules/genunix/net.c195
-rw-r--r--usr/src/cmd/mdb/common/modules/genunix/net.h3
-rw-r--r--usr/src/cmd/mdb/common/modules/genunix/streams.c1
-rw-r--r--usr/src/cmd/mdb/common/modules/genunix/vfs.c44
-rw-r--r--usr/src/cmd/mdb/common/modules/ip/ip.c735
-rw-r--r--usr/src/cmd/mdb/common/modules/sctp/sctp.c212
-rw-r--r--usr/src/common/net/patricia/radix.c148
-rw-r--r--usr/src/lib/brand/native/zone/platform.xml2
-rw-r--r--usr/src/lib/brand/solaris10/zone/platform.xml2
-rw-r--r--usr/src/pkgdefs/SUNWckr/prototype_com1
-rw-r--r--usr/src/pkgdefs/SUNWckr/prototype_i3862
-rw-r--r--usr/src/pkgdefs/SUNWckr/prototype_sparc1
-rw-r--r--usr/src/pkgdefs/SUNWhea/prototype_com1
-rw-r--r--usr/src/pkgdefs/etc/exception_list_i3861
-rw-r--r--usr/src/pkgdefs/etc/exception_list_sparc1
-rw-r--r--usr/src/tools/scripts/bfu.sh6
-rw-r--r--usr/src/uts/common/Makefile.files14
-rw-r--r--usr/src/uts/common/fs/sockfs/sockcommon.h3
-rw-r--r--usr/src/uts/common/fs/sockfs/sockcommon_sops.c3
-rw-r--r--usr/src/uts/common/fs/sockfs/sockcommon_subr.c8
-rw-r--r--usr/src/uts/common/fs/sockfs/socktpi.c199
-rw-r--r--usr/src/uts/common/inet/Makefile4
-rw-r--r--usr/src/uts/common/inet/arp.h143
-rw-r--r--usr/src/uts/common/inet/arp/arp.c4883
-rw-r--r--usr/src/uts/common/inet/arp/arp_netinfo.c376
-rw-r--r--usr/src/uts/common/inet/arp/arpddi.c20
-rw-r--r--usr/src/uts/common/inet/arp_impl.h253
-rw-r--r--usr/src/uts/common/inet/ip.h2395
-rw-r--r--usr/src/uts/common/inet/ip/conn_opt.c2933
-rw-r--r--usr/src/uts/common/inet/ip/icmp.c6776
-rw-r--r--usr/src/uts/common/inet/ip/icmp_opt_data.c201
-rw-r--r--usr/src/uts/common/inet/ip/igmp.c763
-rw-r--r--usr/src/uts/common/inet/ip/ip.c24206
-rw-r--r--usr/src/uts/common/inet/ip/ip2mac.c254
-rw-r--r--usr/src/uts/common/inet/ip/ip6.c11729
-rw-r--r--usr/src/uts/common/inet/ip/ip6_asp.c16
-rw-r--r--usr/src/uts/common/inet/ip/ip6_if.c1782
-rw-r--r--usr/src/uts/common/inet/ip/ip6_input.c2749
-rw-r--r--usr/src/uts/common/inet/ip/ip6_ire.c3123
-rw-r--r--usr/src/uts/common/inet/ip/ip6_output.c1315
-rw-r--r--usr/src/uts/common/inet/ip/ip6_rts.c27
-rw-r--r--usr/src/uts/common/inet/ip/ip_arp.c2468
-rw-r--r--usr/src/uts/common/inet/ip/ip_attr.c1338
-rw-r--r--usr/src/uts/common/inet/ip/ip_dce.c873
-rw-r--r--usr/src/uts/common/inet/ip/ip_ftable.c2682
-rw-r--r--usr/src/uts/common/inet/ip/ip_helper_stream.c372
-rw-r--r--usr/src/uts/common/inet/ip/ip_if.c8230
-rw-r--r--usr/src/uts/common/inet/ip/ip_input.c3095
-rw-r--r--usr/src/uts/common/inet/ip/ip_ire.c6513
-rw-r--r--usr/src/uts/common/inet/ip/ip_mroute.c611
-rw-r--r--usr/src/uts/common/inet/ip/ip_multi.c3981
-rw-r--r--usr/src/uts/common/inet/ip/ip_ndp.c5399
-rw-r--r--usr/src/uts/common/inet/ip/ip_netinfo.c460
-rw-r--r--usr/src/uts/common/inet/ip/ip_opt_data.c301
-rw-r--r--usr/src/uts/common/inet/ip/ip_output.c2554
-rw-r--r--usr/src/uts/common/inet/ip/ip_rts.c923
-rw-r--r--usr/src/uts/common/inet/ip/ip_sadb.c189
-rw-r--r--usr/src/uts/common/inet/ip/ip_srcid.c8
-rw-r--r--usr/src/uts/common/inet/ip/ipclassifier.c1118
-rw-r--r--usr/src/uts/common/inet/ip/ipdrop.c114
-rw-r--r--usr/src/uts/common/inet/ip/ipmp.c462
-rw-r--r--usr/src/uts/common/inet/ip/ipsec_loader.c4
-rw-r--r--usr/src/uts/common/inet/ip/ipsecah.c1653
-rw-r--r--usr/src/uts/common/inet/ip/ipsecesp.c1604
-rw-r--r--usr/src/uts/common/inet/ip/keysock.c8
-rw-r--r--usr/src/uts/common/inet/ip/keysock_opt_data.c16
-rw-r--r--usr/src/uts/common/inet/ip/rts.c456
-rw-r--r--usr/src/uts/common/inet/ip/rts_opt_data.c30
-rw-r--r--usr/src/uts/common/inet/ip/sadb.c1065
-rw-r--r--usr/src/uts/common/inet/ip/spd.c2213
-rw-r--r--usr/src/uts/common/inet/ip/spdsock.c50
-rw-r--r--usr/src/uts/common/inet/ip/spdsock_opt_data.c9
-rw-r--r--usr/src/uts/common/inet/ip/tn_ipopt.c550
-rw-r--r--usr/src/uts/common/inet/ip/tnet.c311
-rw-r--r--usr/src/uts/common/inet/ip2mac_impl.h8
-rw-r--r--usr/src/uts/common/inet/ip6.h202
-rw-r--r--usr/src/uts/common/inet/ip_arp.h136
-rw-r--r--usr/src/uts/common/inet/ip_ftable.h22
-rw-r--r--usr/src/uts/common/inet/ip_if.h146
-rw-r--r--usr/src/uts/common/inet/ip_impl.h410
-rw-r--r--usr/src/uts/common/inet/ip_ire.h438
-rw-r--r--usr/src/uts/common/inet/ip_multi.h100
-rw-r--r--usr/src/uts/common/inet/ip_ndp.h390
-rw-r--r--usr/src/uts/common/inet/ip_netinfo.h5
-rw-r--r--usr/src/uts/common/inet/ip_rts.h11
-rw-r--r--usr/src/uts/common/inet/ip_stack.h156
-rw-r--r--usr/src/uts/common/inet/ipclassifier.h511
-rw-r--r--usr/src/uts/common/inet/ipdrop.h4
-rw-r--r--usr/src/uts/common/inet/ipp_common.h16
-rw-r--r--usr/src/uts/common/inet/ipsec_impl.h114
-rw-r--r--usr/src/uts/common/inet/ipsec_info.h257
-rw-r--r--usr/src/uts/common/inet/ipsecah.h9
-rw-r--r--usr/src/uts/common/inet/ipsecesp.h7
-rw-r--r--usr/src/uts/common/inet/iptun/iptun.c1180
-rw-r--r--usr/src/uts/common/inet/iptun/iptun_dev.c4
-rw-r--r--usr/src/uts/common/inet/iptun/iptun_impl.h39
-rw-r--r--usr/src/uts/common/inet/keysock.h6
-rw-r--r--usr/src/uts/common/inet/kssl/ksslrec.c2
-rw-r--r--usr/src/uts/common/inet/mi.c4
-rw-r--r--usr/src/uts/common/inet/mib2.h39
-rw-r--r--usr/src/uts/common/inet/optcom.c463
-rw-r--r--usr/src/uts/common/inet/optcom.h43
-rw-r--r--usr/src/uts/common/inet/proto_set.c31
-rw-r--r--usr/src/uts/common/inet/proto_set.h4
-rw-r--r--usr/src/uts/common/inet/rawip_impl.h76
-rw-r--r--usr/src/uts/common/inet/rts_impl.h28
-rw-r--r--usr/src/uts/common/inet/sadb.h123
-rw-r--r--usr/src/uts/common/inet/sctp/sctp.c988
-rw-r--r--usr/src/uts/common/inet/sctp/sctp_addr.c71
-rw-r--r--usr/src/uts/common/inet/sctp/sctp_addr.h5
-rw-r--r--usr/src/uts/common/inet/sctp/sctp_asconf.c33
-rw-r--r--usr/src/uts/common/inet/sctp/sctp_asconf.h6
-rw-r--r--usr/src/uts/common/inet/sctp/sctp_bind.c95
-rw-r--r--usr/src/uts/common/inet/sctp/sctp_common.c717
-rw-r--r--usr/src/uts/common/inet/sctp/sctp_conn.c275
-rw-r--r--usr/src/uts/common/inet/sctp/sctp_cookie.c234
-rw-r--r--usr/src/uts/common/inet/sctp/sctp_error.c284
-rw-r--r--usr/src/uts/common/inet/sctp/sctp_hash.c389
-rw-r--r--usr/src/uts/common/inet/sctp/sctp_heartbeat.c35
-rw-r--r--usr/src/uts/common/inet/sctp/sctp_impl.h186
-rw-r--r--usr/src/uts/common/inet/sctp/sctp_init.c55
-rw-r--r--usr/src/uts/common/inet/sctp/sctp_input.c683
-rw-r--r--usr/src/uts/common/inet/sctp/sctp_ioc.c91
-rw-r--r--usr/src/uts/common/inet/sctp/sctp_notify.c7
-rw-r--r--usr/src/uts/common/inet/sctp/sctp_opt_data.c1203
-rw-r--r--usr/src/uts/common/inet/sctp/sctp_output.c77
-rw-r--r--usr/src/uts/common/inet/sctp/sctp_param.c11
-rw-r--r--usr/src/uts/common/inet/sctp/sctp_shutdown.c151
-rw-r--r--usr/src/uts/common/inet/sctp/sctp_snmp.c13
-rw-r--r--usr/src/uts/common/inet/sctp/sctp_stack.h15
-rw-r--r--usr/src/uts/common/inet/sctp/sctp_timer.c22
-rw-r--r--usr/src/uts/common/inet/sctp_ip.h40
-rw-r--r--usr/src/uts/common/inet/sctp_itf.h4
-rw-r--r--usr/src/uts/common/inet/sockmods/socksctp.c14
-rw-r--r--usr/src/uts/common/inet/sockmods/socksctp.h2
-rw-r--r--usr/src/uts/common/inet/sockmods/socksctpsubr.c6
-rw-r--r--usr/src/uts/common/inet/spdsock.h4
-rw-r--r--usr/src/uts/common/inet/squeue.c112
-rw-r--r--usr/src/uts/common/inet/tcp.h271
-rw-r--r--usr/src/uts/common/inet/tcp/tcp.c12129
-rw-r--r--usr/src/uts/common/inet/tcp/tcp_fusion.c440
-rw-r--r--usr/src/uts/common/inet/tcp/tcp_kssl.c57
-rw-r--r--usr/src/uts/common/inet/tcp/tcp_opt_data.c144
-rw-r--r--usr/src/uts/common/inet/tcp_impl.h45
-rw-r--r--usr/src/uts/common/inet/tcp_stack.h43
-rw-r--r--usr/src/uts/common/inet/udp/udp.c7703
-rw-r--r--usr/src/uts/common/inet/udp/udp_opt_data.c185
-rw-r--r--usr/src/uts/common/inet/udp_impl.h187
-rw-r--r--usr/src/uts/common/io/dld/dld_proto.c2
-rw-r--r--usr/src/uts/common/io/ib/clients/rds/rds_opt.c11
-rw-r--r--usr/src/uts/common/io/ib/clients/rds/rdsddi.c6
-rw-r--r--usr/src/uts/common/io/ib/clients/rds/rdssubr.c21
-rw-r--r--usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp.c316
-rw-r--r--usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp_link.c731
-rw-r--r--usr/src/uts/common/io/mac/mac_util.c2
-rw-r--r--usr/src/uts/common/io/softmac/softmac_dev.c6
-rw-r--r--usr/src/uts/common/io/softmac/softmac_fp.c25
-rw-r--r--usr/src/uts/common/io/stream.c4
-rw-r--r--usr/src/uts/common/io/strplumb.c155
-rw-r--r--usr/src/uts/common/io/tl.c16
-rw-r--r--usr/src/uts/common/io/warlock/ibcm.wlcmd4
-rw-r--r--usr/src/uts/common/ipp/dlcosmk/dlcosmk.c8
-rw-r--r--usr/src/uts/common/ipp/ipgpc/classifierddi.c7
-rw-r--r--usr/src/uts/common/ktli/t_kutil.c6
-rw-r--r--usr/src/uts/common/net/route.h3
-rw-r--r--usr/src/uts/common/netinet/in.h1
-rw-r--r--usr/src/uts/common/netinet/ip_mroute.h13
-rw-r--r--usr/src/uts/common/os/ip_cksum.c8
-rw-r--r--usr/src/uts/common/os/strsubr.c4
-rw-r--r--usr/src/uts/common/sys/dld.h3
-rw-r--r--usr/src/uts/common/sys/dlpi.h43
-rw-r--r--usr/src/uts/common/sys/ib/mgt/ibcm/ibcm_arp.h21
-rw-r--r--usr/src/uts/common/sys/iphada.h144
-rw-r--r--usr/src/uts/common/sys/pattr.h7
-rw-r--r--usr/src/uts/common/sys/softmac_impl.h4
-rw-r--r--usr/src/uts/common/sys/squeue.h13
-rw-r--r--usr/src/uts/common/sys/squeue_impl.h6
-rw-r--r--usr/src/uts/common/sys/stream.h6
-rw-r--r--usr/src/uts/common/sys/tsol/tnet.h39
-rw-r--r--usr/src/uts/intel/Makefile.intel.shared1
-rw-r--r--usr/src/uts/intel/arp/Makefile6
-rw-r--r--usr/src/uts/intel/arp/arp.global-objs.debug649
-rw-r--r--usr/src/uts/intel/ia32/ml/modstubs.s3
-rw-r--r--usr/src/uts/intel/ip/ip.global-objs.debug6485
-rw-r--r--usr/src/uts/intel/ip/ip.global-objs.obj6480
-rw-r--r--usr/src/uts/sparc/Makefile.sparc.shared2
-rw-r--r--usr/src/uts/sparc/arp/Makefile6
-rw-r--r--usr/src/uts/sparc/arp/arp.global-objs.debug649
-rw-r--r--usr/src/uts/sparc/ip/ip.global-objs.debug6481
-rw-r--r--usr/src/uts/sparc/ip/ip.global-objs.obj6476
-rw-r--r--usr/src/uts/sparc/ml/modstubs.s3
201 files changed, 58700 insertions, 91699 deletions
diff --git a/usr/src/cmd/cmd-inet/usr.bin/netstat/netstat.c b/usr/src/cmd/cmd-inet/usr.bin/netstat/netstat.c
index 96bcec530c..1919d21356 100644
--- a/usr/src/cmd/cmd-inet/usr.bin/netstat/netstat.c
+++ b/usr/src/cmd/cmd-inet/usr.bin/netstat/netstat.c
@@ -196,6 +196,7 @@ static void ire_report(const mib_item_t *item);
static void tcp_report(const mib_item_t *item);
static void udp_report(const mib_item_t *item);
static void group_report(mib_item_t *item);
+static void dce_report(mib_item_t *item);
static void print_ip_stats(mib2_ip_t *ip);
static void print_icmp_stats(mib2_icmp_t *icmp);
static void print_ip6_stats(mib2_ipv6IfStatsEntry_t *ip6);
@@ -236,7 +237,7 @@ static void fatal(int errcode, char *str1, ...);
static boolean_t Aflag = B_FALSE; /* All sockets/ifs/rtng-tbls */
-static boolean_t Dflag = B_FALSE; /* Debug Info */
+static boolean_t Dflag = B_FALSE; /* DCE info */
static boolean_t Iflag = B_FALSE; /* IP Traffic Interfaces */
static boolean_t Mflag = B_FALSE; /* STREAMS Memory Statistics */
static boolean_t Nflag = B_FALSE; /* Numeric Network Addresses */
@@ -248,6 +249,7 @@ static boolean_t Pflag = B_FALSE; /* Net to Media Tables */
static boolean_t Gflag = B_FALSE; /* Multicast group membership */
static boolean_t MMflag = B_FALSE; /* Multicast routing table */
static boolean_t DHCPflag = B_FALSE; /* DHCP statistics */
+static boolean_t Xflag = B_FALSE; /* Debug Info */
static int v4compat = 0; /* Compatible printing format for status */
@@ -276,6 +278,8 @@ static int ipv6NetToMediaEntrySize;
static int ipv6MemberEntrySize;
static int ipv6GroupSourceEntrySize;
+static int ipDestEntrySize;
+
static int transportMLPSize;
static int tcpConnEntrySize;
static int tcp6ConnEntrySize;
@@ -298,7 +302,7 @@ static m_label_t *zone_security_label = NULL;
/* Flags on routes */
#define FLF_A 0x00000001
-#define FLF_B 0x00000002
+#define FLF_b 0x00000002
#define FLF_D 0x00000004
#define FLF_G 0x00000008
#define FLF_H 0x00000010
@@ -306,7 +310,12 @@ static m_label_t *zone_security_label = NULL;
#define FLF_U 0x00000040
#define FLF_M 0x00000080
#define FLF_S 0x00000100
-static const char flag_list[] = "ABDGHLUMS";
+#define FLF_C 0x00000200 /* IRE_IF_CLONE */
+#define FLF_I 0x00000400 /* RTF_INDIRECT */
+#define FLF_R 0x00000800 /* RTF_REJECT */
+#define FLF_B 0x00001000 /* RTF_BLACKHOLE */
+
+static const char flag_list[] = "AbDGHLUMSCIRB";
typedef struct filter_rule filter_t;
@@ -379,14 +388,15 @@ main(int argc, char **argv)
(void) setlocale(LC_ALL, "");
(void) textdomain(TEXT_DOMAIN);
- while ((c = getopt(argc, argv, "adimnrspMgvf:P:I:DRT:")) != -1) {
+ while ((c = getopt(argc, argv, "adimnrspMgvxf:P:I:DRT:")) != -1) {
switch ((char)c) {
case 'a': /* all connections */
Aflag = B_TRUE;
break;
- case 'd': /* turn on debugging */
+ case 'd': /* DCE info */
Dflag = B_TRUE;
+ IFLAGMOD(Iflag_only, 1, 0); /* see macro def'n */
break;
case 'i': /* interface (ill/ipif report) */
@@ -438,6 +448,10 @@ main(int argc, char **argv)
IFLAGMOD(Iflag_only, 1, 0); /* see macro def'n */
break;
+ case 'x': /* turn on debugging */
+ Xflag = B_TRUE;
+ break;
+
case 'f':
process_filter(optarg);
break;
@@ -603,7 +617,7 @@ main(int argc, char **argv)
mib_item_destroy(&previtem);
}
- if (!(Iflag || Rflag || Sflag || Mflag ||
+ if (!(Dflag || Iflag || Rflag || Sflag || Mflag ||
MMflag || Pflag || Gflag || DHCPflag)) {
if (protocol_selected(IPPROTO_UDP))
udp_report(item);
@@ -634,12 +648,14 @@ main(int argc, char **argv)
if (family_selected(AF_INET6))
ndp_report(item);
}
+ if (Dflag)
+ dce_report(item);
mib_item_destroy(&curritem);
}
/* netstat: AF_UNIX behaviour */
if (family_selected(AF_UNIX) &&
- (!(Iflag || Rflag || Sflag || Mflag ||
+ (!(Dflag || Iflag || Rflag || Sflag || Mflag ||
MMflag || Pflag || Gflag)))
unixpr(kc);
(void) kstat_close(kc);
@@ -729,7 +745,7 @@ mibget(int sd)
* us information concerning IRE_MARK_TESTHIDDEN routes.
*/
req = (struct opthdr *)&tor[1];
- req->level = EXPER_IP_AND_TESTHIDDEN;
+ req->level = EXPER_IP_AND_ALL_IRES;
req->name = 0;
req->len = 0;
@@ -755,7 +771,7 @@ mibget(int sd)
getcode = getmsg(sd, &ctlbuf, (struct strbuf *)0, &flags);
if (getcode == -1) {
perror("mibget getmsg(ctl) failed");
- if (Dflag) {
+ if (Xflag) {
(void) fputs("# level name len\n",
stderr);
i = 0;
@@ -774,7 +790,7 @@ mibget(int sd)
toa->PRIM_type == T_OPTMGMT_ACK &&
toa->MGMT_flags == T_SUCCESS &&
req->len == 0) {
- if (Dflag)
+ if (Xflag)
(void) printf("mibget getmsg() %d returned "
"EOD (level %ld, name %ld)\n",
j, req->level, req->name);
@@ -826,7 +842,7 @@ mibget(int sd)
last_item->valp = malloc((int)req->len);
if (last_item->valp == NULL)
goto error_exit;
- if (Dflag)
+ if (Xflag)
(void) printf("msg %d: group = %4d mib_id = %5d"
"length = %d\n",
j, last_item->group, last_item->mib_id,
@@ -1754,6 +1770,7 @@ mib_get_constants(mib_item_t *item)
ipGroupSourceEntrySize = ip->ipGroupSourceEntrySize;
ipRouteAttributeSize = ip->ipRouteAttributeSize;
transportMLPSize = ip->transportMLPSize;
+ ipDestEntrySize = ip->ipDestEntrySize;
assert(IS_P2ALIGNED(ipAddrEntrySize,
sizeof (mib2_ipAddrEntry_t *)));
assert(IS_P2ALIGNED(ipRouteEntrySize,
@@ -1850,7 +1867,7 @@ mib_get_constants(mib_item_t *item)
}
} /* 'for' loop 1 ends */
- if (Dflag) {
+ if (Xflag) {
(void) puts("mib_get_constants:");
(void) printf("\tipv6IfStatsEntrySize %d\n",
ipv6IfStatsEntrySize);
@@ -1872,6 +1889,7 @@ mib_get_constants(mib_item_t *item)
ipv6MemberEntrySize);
(void) printf("\tipv6IfIcmpEntrySize %d\n",
ipv6IfIcmpEntrySize);
+ (void) printf("\tipDestEntrySize %d\n", ipDestEntrySize);
(void) printf("\ttransportMLPSize %d\n", transportMLPSize);
(void) printf("\ttcpConnEntrySize %d\n", tcpConnEntrySize);
(void) printf("\ttcp6ConnEntrySize %d\n", tcp6ConnEntrySize);
@@ -1895,7 +1913,7 @@ stat_report(mib_item_t *item)
/* 'for' loop 1: */
for (; item; item = item->next_item) {
- if (Dflag) {
+ if (Xflag) {
(void) printf("\n--- Entry %d ---\n", ++jtemp);
(void) printf("Group = %d, mib_id = %d, "
"length = %d, valp = 0x%p\n",
@@ -2542,7 +2560,7 @@ mrt_stat_report(mib_item_t *curritem)
for (tempitem = curritem;
tempitem;
tempitem = tempitem->next_item) {
- if (Dflag) {
+ if (Xflag) {
(void) printf("\n--- Entry %d ---\n", ++jtemp);
(void) printf("Group = %d, mib_id = %d, "
"length = %d, valp = 0x%p\n",
@@ -2603,7 +2621,7 @@ if_report(mib_item_t *item, char *matchname,
/* 'for' loop 1: */
for (; item; item = item->next_item) {
- if (Dflag) {
+ if (Xflag) {
(void) printf("\n--- Entry %d ---\n", ++jtemp);
(void) printf("Group = %d, mib_id = %d, "
"length = %d, valp = 0x%p\n",
@@ -2632,7 +2650,7 @@ if_report(mib_item_t *item, char *matchname,
boolean_t first = B_TRUE;
uint32_t new_ifindex;
- if (Dflag)
+ if (Xflag)
(void) printf("if_report: %d items\n",
(item->length)
/ sizeof (mib2_ipAddrEntry_t));
@@ -2944,7 +2962,7 @@ if_report(mib_item_t *item, char *matchname,
boolean_t first = B_TRUE;
uint32_t new_ifindex;
- if (Dflag)
+ if (Xflag)
(void) printf("if_report: %d items\n",
(item->length)
/ sizeof (mib2_ipv6AddrEntry_t));
@@ -3287,10 +3305,10 @@ if_report_ip4(mib2_ipAddrEntry_t *ap,
(void) pr_netaddr(ap->ipAdEntAddr, ap->ipAdEntNetMask,
abuf, sizeof (abuf));
- (void) printf("%-13s %-14s %-6llu %-5s %-6llu "
+ (void) printf("%-13s %-14s %-6llu %-5s %-6s "
"%-5s %-6s %-6llu\n", abuf,
pr_addr(ap->ipAdEntAddr, dstbuf, sizeof (dstbuf)),
- statptr->ipackets, "N/A", statptr->opackets, "N/A", "N/A",
+ statptr->ipackets, "N/A", "N/A", "N/A", "N/A",
0LL);
}
}
@@ -3337,11 +3355,10 @@ if_report_ip6(mib2_ipv6AddrEntry_t *ap6,
else
(void) pr_prefix6(&ap6->ipv6AddrAddress,
ap6->ipv6AddrPfxLength, abuf, sizeof (abuf));
- (void) printf("%-27s %-27s %-6llu %-5s %-6llu %-5s %-6s\n",
+ (void) printf("%-27s %-27s %-6llu %-5s %-6s %-5s %-6s\n",
abuf, pr_addr6(&ap6->ipv6AddrAddress, dstbuf,
sizeof (dstbuf)),
- statptr->ipackets, "N/A",
- statptr->opackets, "N/A", "N/A");
+ statptr->ipackets, "N/A", "N/A", "N/A", "N/A");
}
}
@@ -3490,7 +3507,7 @@ group_report(mib_item_t *item)
/* 'for' loop 1: */
for (; item; item = item->next_item) {
- if (Dflag) {
+ if (Xflag) {
(void) printf("\n--- Entry %d ---\n", ++jtemp);
(void) printf("Group = %d, mib_id = %d, "
"length = %d, valp = 0x%p\n",
@@ -3501,12 +3518,12 @@ group_report(mib_item_t *item)
switch (item->mib_id) {
case EXPER_IP_GROUP_MEMBERSHIP:
v4grp = item;
- if (Dflag)
+ if (Xflag)
(void) printf("item is v4grp info\n");
break;
case EXPER_IP_GROUP_SOURCES:
v4src = item;
- if (Dflag)
+ if (Xflag)
(void) printf("item is v4src info\n");
break;
default:
@@ -3518,12 +3535,12 @@ group_report(mib_item_t *item)
switch (item->mib_id) {
case EXPER_IP6_GROUP_MEMBERSHIP:
v6grp = item;
- if (Dflag)
+ if (Xflag)
(void) printf("item is v6grp info\n");
break;
case EXPER_IP6_GROUP_SOURCES:
v6src = item;
- if (Dflag)
+ if (Xflag)
(void) printf("item is v6src info\n");
break;
default:
@@ -3533,7 +3550,7 @@ group_report(mib_item_t *item)
}
if (family_selected(AF_INET) && v4grp != NULL) {
- if (Dflag)
+ if (Xflag)
(void) printf("%u records for ipGroupMember:\n",
v4grp->length / sizeof (ip_member_t));
@@ -3564,7 +3581,7 @@ group_report(mib_item_t *item)
if (!Vflag || v4src == NULL)
continue;
- if (Dflag)
+ if (Xflag)
(void) printf("scanning %u ipGroupSource "
"records...\n",
v4src->length/sizeof (ip_grpsrc_t));
@@ -3609,7 +3626,7 @@ group_report(mib_item_t *item)
}
if (family_selected(AF_INET6) && v6grp != NULL) {
- if (Dflag)
+ if (Xflag)
(void) printf("%u records for ipv6GroupMember:\n",
v6grp->length / sizeof (ipv6_member_t));
@@ -3638,7 +3655,7 @@ group_report(mib_item_t *item)
if (!Vflag || v6src == NULL)
continue;
- if (Dflag)
+ if (Xflag)
(void) printf("scanning %u ipv6GroupSource "
"records...\n",
v6src->length/sizeof (ipv6_grpsrc_t));
@@ -3683,6 +3700,126 @@ group_report(mib_item_t *item)
(void) fflush(stdout);
}
+/* --------------------- DCE_REPORT (netstat -d) ------------------------- */
+
+#define FLBUFSIZE 8
+
+/* Assumes flbuf is at least 5 characters; callers use FLBUFSIZE */
+static char *
+dceflags2str(uint32_t flags, char *flbuf)
+{
+ char *str = flbuf;
+
+ if (flags & DCEF_DEFAULT)
+ *str++ = 'D';
+ if (flags & DCEF_PMTU)
+ *str++ = 'P';
+ if (flags & DCEF_UINFO)
+ *str++ = 'U';
+ if (flags & DCEF_TOO_SMALL_PMTU)
+ *str++ = 'S';
+ *str++ = '\0';
+ return (flbuf);
+}
+
+static void
+dce_report(mib_item_t *item)
+{
+ mib_item_t *v4dce = NULL;
+ mib_item_t *v6dce = NULL;
+ int jtemp = 0;
+ char ifname[LIFNAMSIZ + 1];
+ char abuf[MAXHOSTNAMELEN + 1];
+ char flbuf[FLBUFSIZE];
+ boolean_t first;
+ dest_cache_entry_t *dce;
+
+ /* 'for' loop 1: */
+ for (; item; item = item->next_item) {
+ if (Xflag) {
+ (void) printf("\n--- Entry %d ---\n", ++jtemp);
+ (void) printf("Group = %d, mib_id = %d, "
+ "length = %d, valp = 0x%p\n",
+ item->group, item->mib_id, item->length,
+ item->valp);
+ }
+ if (item->group == MIB2_IP && family_selected(AF_INET) &&
+ item->mib_id == EXPER_IP_DCE) {
+ v4dce = item;
+ if (Xflag)
+ (void) printf("item is v4dce info\n");
+ }
+ if (item->group == MIB2_IP6 && family_selected(AF_INET6) &&
+ item->mib_id == EXPER_IP_DCE) {
+ v6dce = item;
+ if (Xflag)
+ (void) printf("item is v6dce info\n");
+ }
+ }
+
+ if (family_selected(AF_INET) && v4dce != NULL) {
+ if (Xflag)
+ (void) printf("%u records for DestCacheEntry:\n",
+ v4dce->length / ipDestEntrySize);
+
+ first = B_TRUE;
+ for (dce = (dest_cache_entry_t *)v4dce->valp;
+ (char *)dce < (char *)v4dce->valp + v4dce->length;
+ /* LINTED: (note 1) */
+ dce = (dest_cache_entry_t *)((char *)dce +
+ ipDestEntrySize)) {
+ if (first) {
+ (void) putchar('\n');
+ (void) puts("Destination Cache Entries: IPv4");
+ (void) puts(
+ "Address PMTU Age Flags");
+ (void) puts(
+ "-------------------- ------ ----- -----");
+ first = B_FALSE;
+ }
+
+ (void) printf("%-20s %6u %5u %-5s\n",
+ pr_addr(dce->DestIpv4Address, abuf, sizeof (abuf)),
+ dce->DestPmtu, dce->DestAge,
+ dceflags2str(dce->DestFlags, flbuf));
+ }
+ }
+
+ if (family_selected(AF_INET6) && v6dce != NULL) {
+ if (Xflag)
+ (void) printf("%u records for DestCacheEntry:\n",
+ v6dce->length / ipDestEntrySize);
+
+ first = B_TRUE;
+ for (dce = (dest_cache_entry_t *)v6dce->valp;
+ (char *)dce < (char *)v6dce->valp + v6dce->length;
+ /* LINTED: (note 1) */
+ dce = (dest_cache_entry_t *)((char *)dce +
+ ipDestEntrySize)) {
+ if (first) {
+ (void) putchar('\n');
+ (void) puts("Destination Cache Entries: IPv6");
+ (void) puts(
+ "Address PMTU "
+ " Age Flags If ");
+ (void) puts(
+ "--------------------------- ------ "
+ "----- ----- ---");
+ first = B_FALSE;
+ }
+
+ (void) printf("%-27s %6u %5u %-5s %s\n",
+ pr_addr6(&dce->DestIpv6Address, abuf,
+ sizeof (abuf)),
+ dce->DestPmtu, dce->DestAge,
+ dceflags2str(dce->DestFlags, flbuf),
+ dce->DestIfindex == 0 ? "" :
+ ifindex2str(dce->DestIfindex, ifname));
+ }
+ }
+ (void) fflush(stdout);
+}
+
/* --------------------- ARP_REPORT (netstat -p) -------------------------- */
static void
@@ -3703,7 +3840,7 @@ arp_report(mib_item_t *item)
/* 'for' loop 1: */
for (; item; item = item->next_item) {
- if (Dflag) {
+ if (Xflag) {
(void) printf("\n--- Entry %d ---\n", ++jtemp);
(void) printf("Group = %d, mib_id = %d, "
"length = %d, valp = 0x%p\n",
@@ -3713,7 +3850,7 @@ arp_report(mib_item_t *item)
if (!(item->group == MIB2_IP && item->mib_id == MIB2_IP_MEDIA))
continue; /* 'for' loop 1 */
- if (Dflag)
+ if (Xflag)
(void) printf("%u records for "
"ipNetToMediaEntryTable:\n",
item->length/sizeof (mib2_ipNetToMediaEntry_t));
@@ -3798,7 +3935,7 @@ ndp_report(mib_item_t *item)
/* 'for' loop 1: */
for (; item; item = item->next_item) {
- if (Dflag) {
+ if (Xflag) {
(void) printf("\n--- Entry %d ---\n", ++jtemp);
(void) printf("Group = %d, mib_id = %d, "
"length = %d, valp = 0x%p\n",
@@ -3973,7 +4110,7 @@ ire_report(const mib_item_t *item)
v4a = v4_attrs;
v6a = v6_attrs;
for (; item != NULL; item = item->next_item) {
- if (Dflag) {
+ if (Xflag) {
(void) printf("\n--- Entry %d ---\n", ++jtemp);
(void) printf("Group = %d, mib_id = %d, "
"length = %d, valp = 0x%p\n",
@@ -3991,7 +4128,7 @@ ire_report(const mib_item_t *item)
else if (item->group == MIB2_IP6 && !family_selected(AF_INET6))
continue; /* 'for' loop 1 */
- if (Dflag) {
+ if (Xflag) {
if (item->group == MIB2_IP) {
(void) printf("%u records for "
"ipRouteEntryTable:\n",
@@ -4161,29 +4298,29 @@ form_v4_route_flags(const mib2_ipRouteEntry_t *rp, char *flags)
flag_b = FLF_U;
(void) strcpy(flags, "U");
- if (rp->ipRouteInfo.re_ire_type == IRE_DEFAULT ||
- rp->ipRouteInfo.re_ire_type == IRE_PREFIX ||
- rp->ipRouteInfo.re_ire_type == IRE_HOST ||
- rp->ipRouteInfo.re_ire_type == IRE_HOST_REDIRECT) {
+ /* RTF_INDIRECT wins over RTF_GATEWAY - don't display both */
+ if (rp->ipRouteInfo.re_flags & RTF_INDIRECT) {
+ (void) strcat(flags, "I");
+ flag_b |= FLF_I;
+ } else if (rp->ipRouteInfo.re_ire_type & IRE_OFFLINK) {
(void) strcat(flags, "G");
flag_b |= FLF_G;
}
- if (rp->ipRouteMask == IP_HOST_MASK) {
+ /* IRE_IF_CLONE wins over RTF_HOST - don't display both */
+ if (rp->ipRouteInfo.re_ire_type & IRE_IF_CLONE) {
+ (void) strcat(flags, "C");
+ flag_b |= FLF_C;
+ } else if (rp->ipRouteMask == IP_HOST_MASK) {
(void) strcat(flags, "H");
flag_b |= FLF_H;
}
- if (rp->ipRouteInfo.re_ire_type == IRE_HOST_REDIRECT) {
+ if (rp->ipRouteInfo.re_flags & RTF_DYNAMIC) {
(void) strcat(flags, "D");
flag_b |= FLF_D;
}
- if (rp->ipRouteInfo.re_ire_type == IRE_CACHE) {
- /* Address resolution */
- (void) strcat(flags, "A");
- flag_b |= FLF_A;
- }
if (rp->ipRouteInfo.re_ire_type == IRE_BROADCAST) { /* Broadcast */
- (void) strcat(flags, "B");
- flag_b |= FLF_B;
+ (void) strcat(flags, "b");
+ flag_b |= FLF_b;
}
if (rp->ipRouteInfo.re_ire_type == IRE_LOCAL) { /* Local */
(void) strcat(flags, "L");
@@ -4197,6 +4334,14 @@ form_v4_route_flags(const mib2_ipRouteEntry_t *rp, char *flags)
(void) strcat(flags, "S"); /* Setsrc */
flag_b |= FLF_S;
}
+ if (rp->ipRouteInfo.re_flags & RTF_REJECT) {
+ (void) strcat(flags, "R");
+ flag_b |= FLF_R;
+ }
+ if (rp->ipRouteInfo.re_flags & RTF_BLACKHOLE) {
+ (void) strcat(flags, "B");
+ flag_b |= FLF_B;
+ }
return (flag_b);
}
@@ -4205,9 +4350,9 @@ static const char ire_hdr_v4[] =
static const char ire_hdr_v4_compat[] =
"\n%s Table:\n";
static const char ire_hdr_v4_verbose[] =
-" Destination Mask Gateway Device Mxfrg "
-"Rtt Ref Flg Out In/Fwd %s\n"
-"-------------------- --------------- -------------------- ------ ----- "
+" Destination Mask Gateway Device "
+" MTU Ref Flg Out In/Fwd %s\n"
+"-------------------- --------------- -------------------- ------ "
"----- --- --- ----- ------ %s\n";
static const char ire_hdr_v4_normal[] =
@@ -4226,8 +4371,10 @@ ire_report_item_v4(const mib2_ipRouteEntry_t *rp, boolean_t first,
char flags[10]; /* RTF_ flags */
uint_t flag_b;
- if (!(Aflag || (rp->ipRouteInfo.re_ire_type != IRE_CACHE &&
+ if (!(Aflag || (rp->ipRouteInfo.re_ire_type != IRE_IF_CLONE &&
rp->ipRouteInfo.re_ire_type != IRE_BROADCAST &&
+ rp->ipRouteInfo.re_ire_type != IRE_MULTICAST &&
+ rp->ipRouteInfo.re_ire_type != IRE_NOROUTE &&
rp->ipRouteInfo.re_ire_type != IRE_LOCAL))) {
return (first);
}
@@ -4253,15 +4400,13 @@ ire_report_item_v4(const mib2_ipRouteEntry_t *rp, boolean_t first,
dstbuf, sizeof (dstbuf));
}
if (Vflag) {
- (void) printf("%-20s %-15s %-20s %-6s %5u%c %4u %3u "
+ (void) printf("%-20s %-15s %-20s %-6s %5u %3u "
"%-4s%6u %6u %s\n",
dstbuf,
pr_mask(rp->ipRouteMask, maskbuf, sizeof (maskbuf)),
pr_addrnz(rp->ipRouteNextHop, gwbuf, sizeof (gwbuf)),
octetstr(&rp->ipRouteIfIndex, 'a', ifname, sizeof (ifname)),
rp->ipRouteInfo.re_max_frag,
- rp->ipRouteInfo.re_frag_flag ? '*' : ' ',
- rp->ipRouteInfo.re_rtt,
rp->ipRouteInfo.re_ref,
flags,
rp->ipRouteInfo.re_obpkt,
@@ -4391,58 +4536,39 @@ ire_filter_match_v6(const mib2_ipv6RouteEntry_t *rp6, uint_t flag_b)
return (B_TRUE);
}
-static const char ire_hdr_v6[] =
-"\n%s Table: IPv6\n";
-static const char ire_hdr_v6_verbose[] =
-" Destination/Mask Gateway If PMTU Rtt "
-"Ref Flags Out In/Fwd %s\n"
-"--------------------------- --------------------------- ----- ------ ----- "
-"--- ----- ------ ------ %s\n";
-static const char ire_hdr_v6_normal[] =
-" Destination/Mask Gateway Flags Ref Use "
-" If %s\n"
-"--------------------------- --------------------------- ----- --- ------- "
-"----- %s\n";
-
-static boolean_t
-ire_report_item_v6(const mib2_ipv6RouteEntry_t *rp6, boolean_t first,
- const sec_attr_list_t *attrs)
+/*
+ * Given an IPv6 MIB2 route entry, form the list of flags for the
+ * route.
+ */
+static uint_t
+form_v6_route_flags(const mib2_ipv6RouteEntry_t *rp6, char *flags)
{
- char dstbuf[MAXHOSTNAMELEN + 1];
- char gwbuf[MAXHOSTNAMELEN + 1];
- char ifname[LIFNAMSIZ + 1];
- char flags[10]; /* RTF_ flags */
- uint_t flag_b;
-
- if (!(Aflag || (rp6->ipv6RouteInfo.re_ire_type != IRE_CACHE &&
- rp6->ipv6RouteInfo.re_ire_type != IRE_LOCAL))) {
- return (first);
- }
+ uint_t flag_b;
flag_b = FLF_U;
(void) strcpy(flags, "U");
- if (rp6->ipv6RouteInfo.re_ire_type == IRE_DEFAULT ||
- rp6->ipv6RouteInfo.re_ire_type == IRE_PREFIX ||
- rp6->ipv6RouteInfo.re_ire_type == IRE_HOST ||
- rp6->ipv6RouteInfo.re_ire_type == IRE_HOST_REDIRECT) {
+ /* RTF_INDIRECT wins over RTF_GATEWAY - don't display both */
+ if (rp6->ipv6RouteInfo.re_flags & RTF_INDIRECT) {
+ (void) strcat(flags, "I");
+ flag_b |= FLF_I;
+ } else if (rp6->ipv6RouteInfo.re_ire_type & IRE_OFFLINK) {
(void) strcat(flags, "G");
flag_b |= FLF_G;
}
- if (rp6->ipv6RoutePfxLength == IPV6_ABITS) {
+ /* IRE_IF_CLONE wins over RTF_HOST - don't display both */
+ if (rp6->ipv6RouteInfo.re_ire_type & IRE_IF_CLONE) {
+ (void) strcat(flags, "C");
+ flag_b |= FLF_C;
+ } else if (rp6->ipv6RoutePfxLength == IPV6_ABITS) {
(void) strcat(flags, "H");
flag_b |= FLF_H;
}
- if (rp6->ipv6RouteInfo.re_ire_type == IRE_HOST_REDIRECT) {
+ if (rp6->ipv6RouteInfo.re_flags & RTF_DYNAMIC) {
(void) strcat(flags, "D");
flag_b |= FLF_D;
}
- if (rp6->ipv6RouteInfo.re_ire_type == IRE_CACHE) {
- /* Address resolution */
- (void) strcat(flags, "A");
- flag_b |= FLF_A;
- }
if (rp6->ipv6RouteInfo.re_ire_type == IRE_LOCAL) { /* Local */
(void) strcat(flags, "L");
flag_b |= FLF_L;
@@ -4455,6 +4581,48 @@ ire_report_item_v6(const mib2_ipv6RouteEntry_t *rp6, boolean_t first,
(void) strcat(flags, "S"); /* Setsrc */
flag_b |= FLF_S;
}
+ if (rp6->ipv6RouteInfo.re_flags & RTF_REJECT) {
+ (void) strcat(flags, "R");
+ flag_b |= FLF_R;
+ }
+ if (rp6->ipv6RouteInfo.re_flags & RTF_BLACKHOLE) {
+ (void) strcat(flags, "B");
+ flag_b |= FLF_B;
+ }
+ return (flag_b);
+}
+
+static const char ire_hdr_v6[] =
+"\n%s Table: IPv6\n";
+static const char ire_hdr_v6_verbose[] =
+" Destination/Mask Gateway If MTU "
+"Ref Flags Out In/Fwd %s\n"
+"--------------------------- --------------------------- ----- ----- "
+"--- ----- ------ ------ %s\n";
+static const char ire_hdr_v6_normal[] =
+" Destination/Mask Gateway Flags Ref Use "
+" If %s\n"
+"--------------------------- --------------------------- ----- --- ------- "
+"----- %s\n";
+
+static boolean_t
+ire_report_item_v6(const mib2_ipv6RouteEntry_t *rp6, boolean_t first,
+ const sec_attr_list_t *attrs)
+{
+ char dstbuf[MAXHOSTNAMELEN + 1];
+ char gwbuf[MAXHOSTNAMELEN + 1];
+ char ifname[LIFNAMSIZ + 1];
+ char flags[10]; /* RTF_ flags */
+ uint_t flag_b;
+
+ if (!(Aflag || (rp6->ipv6RouteInfo.re_ire_type != IRE_IF_CLONE &&
+ rp6->ipv6RouteInfo.re_ire_type != IRE_MULTICAST &&
+ rp6->ipv6RouteInfo.re_ire_type != IRE_NOROUTE &&
+ rp6->ipv6RouteInfo.re_ire_type != IRE_LOCAL))) {
+ return (first);
+ }
+
+ flag_b = form_v6_route_flags(rp6, flags);
if (!ire_filter_match_v6(rp6, flag_b))
return (first);
@@ -4468,7 +4636,7 @@ ire_report_item_v6(const mib2_ipv6RouteEntry_t *rp6, boolean_t first,
}
if (Vflag) {
- (void) printf("%-27s %-27s %-5s %5u%c %5u %3u "
+ (void) printf("%-27s %-27s %-5s %5u %3u "
"%-5s %6u %6u %s\n",
pr_prefix6(&rp6->ipv6RouteDest,
rp6->ipv6RoutePfxLength, dstbuf, sizeof (dstbuf)),
@@ -4478,8 +4646,6 @@ ire_report_item_v6(const mib2_ipv6RouteEntry_t *rp6, boolean_t first,
octetstr(&rp6->ipv6RouteIfIndex, 'a',
ifname, sizeof (ifname)),
rp6->ipv6RouteInfo.re_max_frag,
- rp6->ipv6RouteInfo.re_frag_flag ? '*' : ' ',
- rp6->ipv6RouteInfo.re_rtt,
rp6->ipv6RouteInfo.re_ref,
flags,
rp6->ipv6RouteInfo.re_obpkt,
@@ -4617,7 +4783,7 @@ tcp_report(const mib_item_t *item)
v4a = v4_attrs;
v6a = v6_attrs;
for (; item != NULL; item = item->next_item) {
- if (Dflag) {
+ if (Xflag) {
(void) printf("\n--- Entry %d ---\n", ++jtemp);
(void) printf("Group = %d, mib_id = %d, "
"length = %d, valp = 0x%p\n",
@@ -4841,7 +5007,7 @@ udp_report(const mib_item_t *item)
v6a = v6_attrs;
/* 'for' loop 1: */
for (; item; item = item->next_item) {
- if (Dflag) {
+ if (Xflag) {
(void) printf("\n--- Entry %d ---\n", ++jtemp);
(void) printf("Group = %d, mib_id = %d, "
"length = %d, valp = 0x%p\n",
@@ -4916,10 +5082,7 @@ udp_report_item_v4(const mib2_udpEntry_t *ude, boolean_t first,
"",
miudp_state(ude->udpEntryInfo.ue_state, attr));
- /*
- * UDP sockets don't have remote attributes, so there's no need to
- * print them here.
- */
+ print_transport_label(attr);
return (first);
}
@@ -4956,10 +5119,7 @@ udp_report_item_v6(const mib2_udp6Entry_t *ude6, boolean_t first,
miudp_state(ude6->udp6EntryInfo.ue_state, attr),
ifnamep == NULL ? "" : ifnamep);
- /*
- * UDP sockets don't have remote attributes, so there's no need to
- * print them here.
- */
+ print_transport_label(attr);
return (first);
}
@@ -5321,7 +5481,7 @@ mrt_report(mib_item_t *item)
/* 'for' loop 1: */
for (; item; item = item->next_item) {
- if (Dflag) {
+ if (Xflag) {
(void) printf("\n--- Entry %d ---\n", ++jtemp);
(void) printf("Group = %d, mib_id = %d, "
"length = %d, valp = 0x%p\n",
@@ -5334,7 +5494,7 @@ mrt_report(mib_item_t *item)
switch (item->mib_id) {
case EXPER_DVMRP_VIF:
- if (Dflag)
+ if (Xflag)
(void) printf("%u records for ipVifTable:\n",
item->length/sizeof (struct vifctl));
if (item->length/sizeof (struct vifctl) == 0) {
@@ -5377,7 +5537,7 @@ mrt_report(mib_item_t *item)
break;
case EXPER_DVMRP_MRT:
- if (Dflag)
+ if (Xflag)
(void) printf("%u records for ipMfcTable:\n",
item->length/sizeof (struct vifctl));
if (item->length/sizeof (struct vifctl) == 0) {
diff --git a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_main.c b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_main.c
index 28416c4d7f..c0621996d3 100644
--- a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_main.c
+++ b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_main.c
@@ -2875,7 +2875,7 @@ mibwalk(void (*proc)(mib_item_t *))
* us information concerning IRE_MARK_TESTHIDDEN routes.
*/
req = (struct opthdr *)&tor[1];
- req->level = EXPER_IP_AND_TESTHIDDEN;
+ req->level = EXPER_IP_AND_ALL_IRES;
req->name = 0;
req->len = 0;
diff --git a/usr/src/cmd/cmd-inet/usr.lib/mdnsd/mDNSUNP.c b/usr/src/cmd/cmd-inet/usr.lib/mdnsd/mDNSUNP.c
index b76341e303..2cea11b454 100644
--- a/usr/src/cmd/cmd-inet/usr.lib/mdnsd/mDNSUNP.c
+++ b/usr/src/cmd/cmd-inet/usr.lib/mdnsd/mDNSUNP.c
@@ -407,6 +407,15 @@ select_src_ifi_info_solaris(int sockfd, int numifs,
if (ifflags & (IFF_NOXMIT | IFF_NOLOCAL | IFF_PRIVATE))
continue;
+ /* A DHCP client will have IFF_UP set yet the address is zero. Ignore */
+ if (lifr->lifr_addr.ss_family == AF_INET) {
+ struct sockaddr_in *sinptr;
+
+ sinptr = (struct sockaddr_in *) &lifr->lifr_addr;
+ if (sinptr->sin_addr.s_addr == INADDR_ANY)
+ continue;
+ }
+
if (*best_lifr != NULL) {
/*
* Check if we found a better interface by checking
diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c
index 506b15a307..868f9ab5e2 100644
--- a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c
+++ b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c
@@ -3541,18 +3541,6 @@ ifplumb(const char *linkname, const char *ifname, boolean_t genppa, int af)
Perror2_exit("I_PUSH", IP_MOD_NAME);
/*
- * Push the ARP module onto the interface stream. IP uses
- * this to send resolution requests up to ARP. We need to
- * do this before the SLIFNAME ioctl is sent down because
- * the interface becomes publicly known as soon as the SLIFNAME
- * ioctl completes. Thus some other process trying to bring up
- * the interface after SLIFNAME but before we have pushed ARP
- * could hang. We pop the module again later if it is not needed.
- */
- if (ioctl(ip_fd, I_PUSH, ARP_MOD_NAME) == -1)
- Perror2_exit("I_PUSH", ARP_MOD_NAME);
-
- /*
* Prepare to set IFF_IPV4/IFF_IPV6 flags as part of SIOCSLIFNAME.
* (At this point in time the kernel also allows an override of the
* IFF_CANTCHANGE flags.)
@@ -3679,12 +3667,6 @@ ifplumb(const char *linkname, const char *ifname, boolean_t genppa, int af)
(void) putchar('\n');
}
- /* Check if arp is not actually needed */
- if (lifr.lifr_flags & (IFF_NOARP|IFF_IPV6)) {
- if (ioctl(ip_fd, I_POP, 0) == -1)
- Perror2_exit("I_POP", ARP_MOD_NAME);
- }
-
/*
* Open "/dev/udp" for use as a multiplexor to PLINK the
* interface stream under. We use "/dev/udp" instead of "/dev/ip"
diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ping/ping.c b/usr/src/cmd/cmd-inet/usr.sbin/ping/ping.c
index 2a4ff60d57..d851dce613 100644
--- a/usr/src/cmd/cmd-inet/usr.sbin/ping/ping.c
+++ b/usr/src/cmd/cmd-inet/usr.sbin/ping/ping.c
@@ -159,6 +159,7 @@ static int moptions; /* multicast options */
int npackets; /* number of packets to send */
static ushort_t tos; /* type-of-service value */
static int hoplimit = -1; /* time-to-live value */
+static int dontfrag; /* IP*_DONTFRAG */
static int timeout = TIMEOUT; /* timeout value (sec) for probes */
static struct if_entry out_if; /* interface argument */
int ident; /* ID for this ping run */
@@ -268,7 +269,7 @@ main(int argc, char *argv[])
setbuf(stdout, (char *)0);
while ((c = getopt(argc, argv,
- "abA:c:dF:G:g:I:i:LlnN:P:p:rRSsTt:UvX:x:Y0123?")) != -1) {
+ "abA:c:dDF:G:g:I:i:LlnN:P:p:rRSsTt:UvX:x:Y0123?")) != -1) {
switch ((char)c) {
case 'A':
if (strcmp(optarg, "inet") == 0) {
@@ -301,6 +302,10 @@ main(int argc, char *argv[])
options |= SO_DEBUG;
break;
+ case 'D':
+ dontfrag = 1;
+ break;
+
case 'b':
bypass = _B_TRUE;
break;
@@ -1303,8 +1308,6 @@ setup_socket(int family, int *send_sockp, int *recv_sockp, int *if_index,
}
}
- if (nexthop != NULL && !use_udp)
- set_nexthop(family, ai_nexthop, recv_sock);
/*
* We always receive on raw icmp socket. But the sending socket can be
* raw icmp or udp, depending on the use of -U flag.
@@ -1332,9 +1335,6 @@ setup_socket(int family, int *send_sockp, int *recv_sockp, int *if_index,
}
}
- if (nexthop != NULL)
- set_nexthop(family, ai_nexthop, send_sock);
-
/*
* In order to distinguish replies to our UDP probes from
* other pings', we need to know our source port number.
@@ -1368,6 +1368,9 @@ setup_socket(int family, int *send_sockp, int *recv_sockp, int *if_index,
send_sock = recv_sock;
}
+ if (nexthop != NULL)
+ set_nexthop(family, ai_nexthop, send_sock);
+
int_op = 48 * 1024;
if (int_op < datalen)
int_op = datalen;
@@ -1431,6 +1434,7 @@ setup_socket(int family, int *send_sockp, int *recv_sockp, int *if_index,
if (moptions & MULTICAST_TTL) {
char_op = hoplimit;
+ /* Applies to unicast and multicast. */
if (family == AF_INET) {
if (setsockopt(send_sock, IPPROTO_IP, IP_MULTICAST_TTL,
(char *)&char_op, sizeof (char)) == -1) {
@@ -1454,7 +1458,10 @@ setup_socket(int family, int *send_sockp, int *recv_sockp, int *if_index,
*/
}
- /* did the user specify an interface? */
+ /*
+ * did the user specify an interface?
+ * Applies to unicast, broadcast and multicast.
+ */
if (moptions & MULTICAST_IF) {
struct ifaddrlist *al = NULL; /* interface list */
struct ifaddrlist *my_if;
@@ -1496,6 +1503,8 @@ setup_socket(int family, int *send_sockp, int *recv_sockp, int *if_index,
}
if (family == AF_INET) {
+ struct in_pktinfo pktinfo;
+
if (setsockopt(send_sock, IPPROTO_IP, IP_MULTICAST_IF,
(char *)&my_if->addr.addr,
sizeof (struct in_addr)) == -1) {
@@ -1504,6 +1513,15 @@ setup_socket(int family, int *send_sockp, int *recv_sockp, int *if_index,
strerror(errno));
exit(EXIT_FAILURE);
}
+ bzero(&pktinfo, sizeof (pktinfo));
+ pktinfo.ipi_ifindex = my_if->index;
+ if (setsockopt(send_sock, IPPROTO_IP, IP_PKTINFO,
+ (char *)&pktinfo, sizeof (pktinfo)) == -1) {
+ Fprintf(stderr, "%s: setsockopt "
+ "IP_PKTINFO %s\n", progname,
+ strerror(errno));
+ exit(EXIT_FAILURE);
+ }
} else {
/*
* the outgoing interface is set in set_ancillary_data()
@@ -1525,6 +1543,23 @@ setup_socket(int family, int *send_sockp, int *recv_sockp, int *if_index,
}
}
+ /* We enable or disable to not depend on the kernel default */
+ if (family == AF_INET) {
+ if (setsockopt(send_sock, IPPROTO_IP, IP_DONTFRAG,
+ (char *)&dontfrag, sizeof (dontfrag)) == -1) {
+ Fprintf(stderr, "%s: setsockopt IP_DONTFRAG %s\n",
+ progname, strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+ } else {
+ if (setsockopt(send_sock, IPPROTO_IPV6, IPV6_DONTFRAG,
+ (char *)&dontfrag, sizeof (dontfrag)) == -1) {
+ Fprintf(stderr, "%s: setsockopt IPV6_DONTFRAG %s\n",
+ progname, strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+ }
+
/* receiving IPv6 extension headers in verbose mode */
if (verbose && family == AF_INET6) {
if (setsockopt(recv_sock, IPPROTO_IPV6, IPV6_RECVHOPOPTS,
@@ -2336,7 +2371,7 @@ usage(char *cmdname)
Fprintf(stderr, "usage: %s host [timeout]\n", cmdname);
Fprintf(stderr,
/* CSTYLED */
-"usage: %s -s [-l | U] [abdLnRrv] [-A addr_family] [-c traffic_class]\n\t"
+"usage: %s -s [-l | U] [abdDLnRrv] [-A addr_family] [-c traffic_class]\n\t"
"[-g gateway [-g gateway ...]] [-N nexthop] [-F flow_label] [-I interval]\n\t"
"[-i interface] [-P tos] [-p port] [-t ttl] host [data_size] [npackets]\n",
cmdname);
diff --git a/usr/src/cmd/cmd-inet/usr.sbin/route.c b/usr/src/cmd/cmd-inet/usr.sbin/route.c
index b4b16d6755..aedef45409 100644
--- a/usr/src/cmd/cmd-inet/usr.sbin/route.c
+++ b/usr/src/cmd/cmd-inet/usr.sbin/route.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -45,8 +45,6 @@
* @(#)linkaddr.c 8.1 (Berkeley) 6/4/93
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/param.h>
#include <sys/file.h>
#include <sys/socket.h>
@@ -175,6 +173,8 @@ static struct keytab {
{"show", K_SHOW},
#define K_SECATTR 43
{"secattr", K_SECATTR},
+#define K_INDIRECT 44
+ {"indirect", K_INDIRECT},
{0, 0}
};
@@ -655,7 +655,7 @@ flushroutes(int argc, char *argv[])
(char *)rp < (char *)item->valp + item->length;
/* LINTED */
rp = (mib2_ipRouteEntry_t *)
- ((char *)rp + ipRouteEntrySize)) {
+ ((char *)rp + ipRouteEntrySize)) {
delRouteEntry(rp, NULL, seqno);
seqno++;
}
@@ -670,7 +670,7 @@ flushroutes(int argc, char *argv[])
if (item->group == MIB2_IP6) {
ipv6RouteEntrySize =
((mib2_ipv6IfStatsEntry_t *)item->valp)->
- ipv6RouteEntrySize;
+ ipv6RouteEntrySize;
assert(IS_P2ALIGNED(ipv6RouteEntrySize,
sizeof (mib2_ipv6RouteEntry_t *)));
break;
@@ -692,7 +692,7 @@ flushroutes(int argc, char *argv[])
(char *)rp6 < (char *)item->valp + item->length;
/* LINTED */
rp6 = (mib2_ipv6RouteEntry_t *)
- ((char *)rp6 + ipv6RouteEntrySize)) {
+ ((char *)rp6 + ipv6RouteEntrySize)) {
delRouteEntry(NULL, rp6, seqno);
seqno++;
}
@@ -812,7 +812,7 @@ delRouteEntry(mib2_ipRouteEntry_t *rp, mib2_ipv6RouteEntry_t *rp6, int seqno)
(void) printf("%-20.20s ",
rtm->rtm_flags & RTF_HOST ? routename(sa) :
- netname(sa));
+ netname(sa));
/* LINTED */
sa = (struct sockaddr *)(salen(sa) + (char *)sa);
(void) printf("%-20.20s ", routename(sa));
@@ -861,7 +861,7 @@ routename(const struct sockaddr *sa)
cp = "default";
if (cp == NULL && !nflag) {
hp = gethostbyaddr((char *)&in, sizeof (struct in_addr),
- AF_INET);
+ AF_INET);
if (hp != NULL) {
if (((cp = strchr(hp->h_name, '.')) != NULL) &&
(strcmp(cp + 1, domain) == 0))
@@ -892,7 +892,7 @@ routename(const struct sockaddr *sa)
cp = "default";
if (cp == NULL && !nflag) {
hp = getipnodebyaddr((char *)&in6,
- sizeof (struct in6_addr), AF_INET6, &error_num);
+ sizeof (struct in6_addr), AF_INET6, &error_num);
if (hp != NULL) {
if (((cp = strchr(hp->h_name, '.')) != NULL) &&
(strcmp(cp + 1, domain) == 0))
@@ -1120,8 +1120,8 @@ print_rtcmd_short(FILE *to, rtcmd_irep_t *rcip, boolean_t gw_good,
break;
case AF_INET6:
if (inet_ntop(AF_INET6,
- &rcip->ri_gate.sin6.sin6_addr, obuf,
- INET6_ADDRSTRLEN) != NULL) {
+ &rcip->ri_gate.sin6.sin6_addr, obuf,
+ INET6_ADDRSTRLEN) != NULL) {
if (nflag) {
(void) fprintf(to, ": gateway %s",
obuf);
@@ -1405,6 +1405,9 @@ args_to_rtcmd(rtcmd_irep_t *rcip, char **argv, char *cmd_string)
return (B_FALSE);
}
break;
+ case K_INDIRECT:
+ rcip->ri_flags |= RTF_INDIRECT;
+ break;
default:
if (dash_keyword) {
syntax_bad_keyword(tok + 1);
@@ -1479,8 +1482,8 @@ args_to_rtcmd(rtcmd_irep_t *rcip, char **argv, char *cmd_string)
}
if (rcip->ri_af == AF_INET6 &&
memcmp(&rcip->ri_mask.sin6.sin6_addr,
- &in6_host_mask,
- sizeof (struct in6_addr)) == 0) {
+ &in6_host_mask,
+ sizeof (struct in6_addr)) == 0) {
rcip->ri_flags |= RTF_HOST;
}
} else {
@@ -1853,8 +1856,8 @@ newroute(char **argv)
break;
case AF_INET6:
if (inet_ntop(AF_INET6,
- (void *)&newrt->ri_dst.sin6.sin6_addr,
- obuf, INET6_ADDRSTRLEN) != NULL) {
+ (void *)&newrt->ri_dst.sin6.sin6_addr,
+ obuf, INET6_ADDRSTRLEN) != NULL) {
(void) printf(" %s", obuf);
break;
}
@@ -2236,7 +2239,7 @@ in_getaddr(char *s, struct sockaddr_in *sin, int *plenp, int which,
inet_lnaof(sin->sin_addr) == INADDR_ANY)) {
/* This looks like a network address. */
inet_makenetandmask(rcip, ntohl(val),
- sin);
+ sin);
}
}
return (B_TRUE);
@@ -2562,7 +2565,7 @@ static char metricnames[] =
static char routeflags[] =
"\1UP\2GATEWAY\3HOST\4REJECT\5DYNAMIC\6MODIFIED\7DONE\010MASK_PRESENT"
"\011CLONING\012XRESOLVE\013LLINFO\014STATIC\015BLACKHOLE"
- "\016PRIVATE\017PROTO2\020PROTO1\021MULTIRT\022SETSRC";
+ "\016PRIVATE\017PROTO2\020PROTO1\021MULTIRT\022SETSRC\023INDIRECT";
static char ifnetflags[] =
"\1UP\2BROADCAST\3DEBUG\4LOOPBACK\5PTP\6NOTRAILERS\7RUNNING\010NOARP"
"\011PPROMISC\012ALLMULTI\013INTELLIGENT\014MULTICAST"
@@ -2623,7 +2626,7 @@ print_rtmsg(struct rt_msghdr *rtm, int msglen)
break;
default:
(void) printf("pid: %ld, seq %d, errno %d, flags:",
- rtm->rtm_pid, rtm->rtm_seq, rtm->rtm_errno);
+ rtm->rtm_pid, rtm->rtm_seq, rtm->rtm_errno);
bprintf(stdout, rtm->rtm_flags, routeflags);
pmsg_common(rtm, msglen);
break;
@@ -2649,7 +2652,7 @@ print_getmsg(rtcmd_irep_t *req_rt, struct rt_msghdr *rtm, int msglen)
if (rtm->rtm_msglen > (ushort_t)msglen) {
(void) fprintf(stderr,
gettext("message length mismatch, in packet %d, "
- "returned %d\n"), rtm->rtm_msglen, msglen);
+ "returned %d\n"), rtm->rtm_msglen, msglen);
}
if (rtm->rtm_errno) {
(void) fprintf(stderr, "RTM_GET: %s (errno %d)\n",
@@ -2675,7 +2678,7 @@ print_getmsg(rtcmd_irep_t *req_rt, struct rt_msghdr *rtm, int msglen)
case RTA_IFP:
if (sa->sa_family == AF_LINK &&
((struct sockaddr_dl *)sa)->
- sdl_nlen != 0)
+ sdl_nlen != 0)
ifp = (struct sockaddr_dl *)sa;
break;
case RTA_SRC:
@@ -3122,8 +3125,8 @@ mibget(int sd)
(void) fprintf(stderr, gettext("mibget %d gives "
"T_ERROR_ACK: TLI_error = 0x%lx, UNIX_error = "
"0x%lx\n"), j, tea->TLI_error, tea->UNIX_error);
- errno = (tea->TLI_error == TSYSERR)
- ? tea->UNIX_error : EPROTO;
+ errno = (tea->TLI_error == TSYSERR) ?
+ tea->UNIX_error : EPROTO;
break;
}
diff --git a/usr/src/cmd/cmd-inet/usr.sbin/traceroute/traceroute.c b/usr/src/cmd/cmd-inet/usr.sbin/traceroute/traceroute.c
index cae75df60d..b8b56259ad 100644
--- a/usr/src/cmd/cmd-inet/usr.sbin/traceroute/traceroute.c
+++ b/usr/src/cmd/cmd-inet/usr.sbin/traceroute/traceroute.c
@@ -166,6 +166,7 @@ boolean_t useicmp = _B_FALSE; /* use icmp echo instead of udp packets */
boolean_t docksum = _B_TRUE; /* calculate checksums */
static boolean_t collect_stat = _B_FALSE; /* print statistics */
boolean_t settos = _B_FALSE; /* set type-of-service field */
+int dontfrag = 0; /* IP*_DONTFRAG */
static int max_timeout = 5; /* quit after this consecutive timeouts */
static boolean_t probe_all = _B_FALSE; /* probe all the IFs of the target */
static boolean_t pick_src = _B_FALSE; /* traceroute picks the src address */
@@ -315,6 +316,7 @@ main(int argc, char **argv)
case 'F':
off = IP_DF;
+ dontfrag = 1;
break;
case 'g':
@@ -1361,6 +1363,24 @@ setup_socket(struct pr_set *pr, int packet_len)
exit(EXIT_FAILURE);
}
}
+
+ /* We enable or disable to not depend on the kernel default */
+ if (pr->family == AF_INET) {
+ if (setsockopt(ssock, IPPROTO_IP, IP_DONTFRAG,
+ (char *)&dontfrag, sizeof (dontfrag)) == -1) {
+ Fprintf(stderr, "%s: IP_DONTFRAG %s\n", prog,
+ strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+ } else {
+ if (setsockopt(ssock, IPPROTO_IPV6, IPV6_DONTFRAG,
+ (char *)&dontfrag, sizeof (dontfrag)) == -1) {
+ Fprintf(stderr, "%s: IPV6_DONTFRAG %s\n", prog,
+ strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+ }
+
if (pr->family == AF_INET) {
rcvsock4 = rsock;
sndsock4 = ssock;
diff --git a/usr/src/cmd/devfsadm/misc_link.c b/usr/src/cmd/devfsadm/misc_link.c
index 222699e479..84cdb42377 100644
--- a/usr/src/cmd/devfsadm/misc_link.c
+++ b/usr/src/cmd/devfsadm/misc_link.c
@@ -104,8 +104,7 @@ static devfsadm_create_t misc_cbt[] = {
"(^ip$)|(^tcp$)|(^udp$)|(^icmp$)|(^sctp$)|"
"(^ip6$)|(^tcp6$)|(^udp6$)|(^icmp6$)|(^sctp6$)|"
"(^rts$)|(^arp$)|(^ipsecah$)|(^ipsecesp$)|(^keysock$)|(^spdsock$)|"
- "(^nca$)|(^rds$)|(^sdp$)|(^ipnet$)|(^dlpistub$)|(^iptunq)|"
- "(^bpf$)",
+ "(^nca$)|(^rds$)|(^sdp$)|(^ipnet$)|(^dlpistub$)|(^bpf$)",
TYPE_EXACT | DRV_RE, ILEVEL_1, minor_name
},
{ "pseudo", "ddi_pseudo",
diff --git a/usr/src/cmd/mdb/common/modules/arp/arp.c b/usr/src/cmd/mdb/common/modules/arp/arp.c
index f36a81170e..f97cdaab42 100644
--- a/usr/src/cmd/mdb/common/modules/arp/arp.c
+++ b/usr/src/cmd/mdb/common/modules/arp/arp.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <stdio.h>
#include <sys/types.h>
#include <sys/stropts.h>
@@ -36,7 +34,6 @@
#include <inet/common.h>
#include <inet/mi.h>
#include <inet/arp.h>
-#include <inet/arp_impl.h>
#include <inet/ip.h>
#include <netinet/arp.h>
@@ -50,541 +47,10 @@ typedef struct {
} arp_cmd_tbl;
/*
- * Table of ARP commands and structure types used for messages between ARP and
- * IP.
- */
-static const arp_cmd_tbl act_list[] = {
- { AR_ENTRY_ADD, "AR_ENTRY_ADD", "arp`area_t" },
- { AR_ENTRY_DELETE, "AR_ENTRY_DELETE", "arp`ared_t" },
- { AR_ENTRY_QUERY, "AR_ENTRY_QUERY", "arp`areq_t" },
- { AR_ENTRY_SQUERY, "AR_ENTRY_SQUERY", "arp`area_t" },
- { AR_MAPPING_ADD, "AR_MAPPING_ADD", "arp`arma_t" },
- { AR_CLIENT_NOTIFY, "AR_CLIENT_NOTIFY", "arp`arcn_t" },
- { AR_INTERFACE_UP, "AR_INTERFACE_UP", "arp`arc_t" },
- { AR_INTERFACE_DOWN, "AR_INTERFACE_DOWN", "arp`arc_t" },
- { AR_INTERFACE_ON, "AR_INTERFACE_ON", "arp`arc_t" },
- { AR_INTERFACE_OFF, "AR_INTERFACE_OFF", "arp`arc_t" },
- { AR_DLPIOP_DONE, "AR_DLPIOP_DONE", "arp`arc_t" },
- { AR_ARP_CLOSING, "AR_ARP_CLOSING", "arp`arc_t" },
- { AR_ARP_EXTEND, "AR_ARP_EXTEND", "arp`arc_t" },
- { 0, "unknown command", "arp`arc_t" }
-};
-
-/*
- * State information kept during walk over ACE hash table and unhashed mask
- * list.
- */
-typedef struct ace_walk_data {
- ace_t *awd_hash_tbl[ARP_HASH_SIZE];
- ace_t *awd_masks;
- int awd_idx;
-} ace_walk_data_t;
-
-/*
- * Given the kernel address of an arl_t, return the stackid
+ * removed all the ace/arl related stuff. The only thing that remains
+ * is code for dealing with ioctls and printing out arp header that
+ * should probably be moved into the ip/mdb module.
*/
-static int
-arl_to_stackid(uintptr_t addr)
-{
- arl_t arl;
- queue_t rq;
- ar_t ar;
- arp_stack_t ass;
- netstack_t nss;
-
- if (mdb_vread(&arl, sizeof (arl), addr) == -1) {
- mdb_warn("failed to read arl_t %p", addr);
- return (0);
- }
-
- addr = (uintptr_t)arl.arl_rq;
- if (mdb_vread(&rq, sizeof (rq), addr) == -1) {
- mdb_warn("failed to read queue_t %p", addr);
- return (0);
- }
-
- addr = (uintptr_t)rq.q_ptr;
- if (mdb_vread(&ar, sizeof (ar), addr) == -1) {
- mdb_warn("failed to read ar_t %p", addr);
- return (0);
- }
-
- addr = (uintptr_t)ar.ar_as;
- if (mdb_vread(&ass, sizeof (ass), addr) == -1) {
- mdb_warn("failed to read arp_stack_t %p", addr);
- return (0);
- }
- addr = (uintptr_t)ass.as_netstack;
- if (mdb_vread(&nss, sizeof (nss), addr) == -1) {
- mdb_warn("failed to read netstack_t %p", addr);
- return (0);
- }
- return (nss.netstack_stackid);
-}
-
-static int
-arp_stacks_walk_init(mdb_walk_state_t *wsp)
-{
- if (mdb_layered_walk("netstack", wsp) == -1) {
- mdb_warn("can't walk 'netstack'");
- return (WALK_ERR);
- }
- return (WALK_NEXT);
-}
-
-static int
-arp_stacks_walk_step(mdb_walk_state_t *wsp)
-{
- uintptr_t addr;
- netstack_t nss;
-
- if (mdb_vread(&nss, sizeof (nss), wsp->walk_addr) == -1) {
- mdb_warn("can't read netstack at %p", wsp->walk_addr);
- return (WALK_ERR);
- }
- addr = (uintptr_t)nss.netstack_modules[NS_ARP];
-
- return (wsp->walk_callback(addr, wsp->walk_layer, wsp->walk_cbdata));
-}
-
-static int
-arl_stack_walk_init(mdb_walk_state_t *wsp)
-{
- uintptr_t addr;
-
- if (wsp->walk_addr == NULL) {
- mdb_warn("arl_stack supports only local walks\n");
- return (WALK_ERR);
- }
-
- addr = wsp->walk_addr + OFFSETOF(arp_stack_t, as_arl_head);
- if (mdb_vread(&wsp->walk_addr, sizeof (wsp->walk_addr),
- addr) == -1) {
- mdb_warn("failed to read 'arl_g_head'");
- return (WALK_ERR);
- }
- return (WALK_NEXT);
-}
-
-static int
-arl_stack_walk_step(mdb_walk_state_t *wsp)
-{
- uintptr_t addr = wsp->walk_addr;
- arl_t arl;
-
- if (wsp->walk_addr == NULL)
- return (WALK_DONE);
-
- if (mdb_vread(&arl, sizeof (arl), addr) == -1) {
- mdb_warn("failed to read arl_t at %p", addr);
- return (WALK_ERR);
- }
-
- wsp->walk_addr = (uintptr_t)arl.arl_next;
-
- return ((*wsp->walk_callback)(addr, &arl, wsp->walk_cbdata));
-}
-
-static int
-arl_walk_init(mdb_walk_state_t *wsp)
-{
- if (mdb_layered_walk("arp_stacks", wsp) == -1) {
- mdb_warn("can't walk 'arp_stacks'");
- return (WALK_ERR);
- }
-
- return (WALK_NEXT);
-}
-
-static int
-arl_walk_step(mdb_walk_state_t *wsp)
-{
- if (mdb_pwalk("arl_stack", wsp->walk_callback,
- wsp->walk_cbdata, wsp->walk_addr) == -1) {
- mdb_warn("couldn't walk 'arl_stack' at %p", wsp->walk_addr);
- return (WALK_ERR);
- }
- return (WALK_NEXT);
-}
-
-/*
- * Called with walk_addr being the address of arp_stack_t
- */
-static int
-ace_stack_walk_init(mdb_walk_state_t *wsp)
-{
- ace_walk_data_t *aw;
- uintptr_t addr;
-
- if (wsp->walk_addr == NULL) {
- mdb_warn("ace_stack supports only local walks\n");
- return (WALK_ERR);
- }
-
- aw = mdb_alloc(sizeof (ace_walk_data_t), UM_SLEEP);
-
- addr = wsp->walk_addr + OFFSETOF(arp_stack_t, as_ce_hash_tbl);
- if (mdb_vread(aw->awd_hash_tbl, sizeof (aw->awd_hash_tbl),
- addr) == -1) {
- mdb_warn("failed to read 'as_ce_hash_tbl'");
- mdb_free(aw, sizeof (ace_walk_data_t));
- return (WALK_ERR);
- }
-
- addr = wsp->walk_addr + OFFSETOF(arp_stack_t, as_ce_mask_entries);
- if (mdb_vread(&aw->awd_masks, sizeof (aw->awd_masks),
- addr) == -1) {
- mdb_warn("failed to read 'as_ce_mask_entries'");
- mdb_free(aw, sizeof (ace_walk_data_t));
- return (WALK_ERR);
- }
-
- /* The step routine will start off by incrementing to index 0 */
- aw->awd_idx = -1;
- wsp->walk_addr = 0;
- wsp->walk_data = aw;
-
- return (WALK_NEXT);
-}
-
-static int
-ace_stack_walk_step(mdb_walk_state_t *wsp)
-{
- uintptr_t addr;
- ace_walk_data_t *aw = wsp->walk_data;
- ace_t ace;
-
- /*
- * If we're at the end of the previous list, then find the start of the
- * next list to process.
- */
- while (wsp->walk_addr == NULL) {
- if (aw->awd_idx == ARP_HASH_SIZE)
- return (WALK_DONE);
- if (++aw->awd_idx == ARP_HASH_SIZE) {
- wsp->walk_addr = (uintptr_t)aw->awd_masks;
- } else {
- wsp->walk_addr =
- (uintptr_t)aw->awd_hash_tbl[aw->awd_idx];
- }
- }
-
- addr = wsp->walk_addr;
- if (mdb_vread(&ace, sizeof (ace), addr) == -1) {
- mdb_warn("failed to read ace_t at %p", addr);
- return (WALK_ERR);
- }
-
- wsp->walk_addr = (uintptr_t)ace.ace_next;
-
- return (wsp->walk_callback(addr, &ace, wsp->walk_cbdata));
-}
-
-static void
-ace_stack_walk_fini(mdb_walk_state_t *wsp)
-{
- mdb_free(wsp->walk_data, sizeof (ace_walk_data_t));
-}
-
-static int
-ace_walk_init(mdb_walk_state_t *wsp)
-{
- if (mdb_layered_walk("arp_stacks", wsp) == -1) {
- mdb_warn("can't walk 'arp_stacks'");
- return (WALK_ERR);
- }
-
- return (WALK_NEXT);
-}
-
-static int
-ace_walk_step(mdb_walk_state_t *wsp)
-{
- if (mdb_pwalk("ace_stack", wsp->walk_callback,
- wsp->walk_cbdata, wsp->walk_addr) == -1) {
- mdb_warn("couldn't walk 'ace_stack' at %p", wsp->walk_addr);
- return (WALK_ERR);
- }
- return (WALK_NEXT);
-}
-
-
-/* Common routine to produce an 'ar' text description */
-static void
-ar_describe(const ar_t *ar, char *buf, size_t nbytes, boolean_t addmac)
-{
- if (ar->ar_arl == NULL) {
- queue_t wq, ipq;
- ill_t ill;
- char name[LIFNAMSIZ];
- GElf_Sym sym;
- boolean_t nextip;
-
- if (mdb_vread(&wq, sizeof (wq), (uintptr_t)ar->ar_wq) == -1 ||
- mdb_vread(&ipq, sizeof (ipq), (uintptr_t)wq.q_next) == -1)
- return;
-
- nextip =
- (mdb_lookup_by_obj("ip", "ipwinit", &sym) == 0 &&
- (uintptr_t)sym.st_value == (uintptr_t)ipq.q_qinfo);
-
- if (!ar->ar_on_ill_stream) {
- (void) strcpy(buf, nextip ? "Client" : "Unknown");
- return;
- }
-
- if (!nextip ||
- mdb_vread(&ill, sizeof (ill), (uintptr_t)ipq.q_ptr) == -1 ||
- mdb_readstr(name, sizeof (name),
- (uintptr_t)ill.ill_name) == -1) {
- return;
- }
- (void) mdb_snprintf(buf, nbytes, "IP %s", name);
- } else {
- arl_t arl;
- arlphy_t ap;
- ssize_t retv;
- uint32_t alen;
- uchar_t macaddr[ARP_MAX_ADDR_LEN];
-
- if (mdb_vread(&arl, sizeof (arl), (uintptr_t)ar->ar_arl) == -1)
- return;
- retv = mdb_snprintf(buf, nbytes, "ARP %s ", arl.arl_name);
- if (retv >= nbytes || !addmac)
- return;
- if (mdb_vread(&ap, sizeof (ap), (uintptr_t)arl.arl_phy) == -1)
- return;
- alen = ap.ap_hw_addrlen;
- if (ap.ap_hw_addr == NULL || alen == 0 ||
- alen > sizeof (macaddr))
- return;
- if (mdb_vread(macaddr, alen, (uintptr_t)ap.ap_hw_addr) == -1)
- return;
- mdb_mac_addr(macaddr, alen, buf + retv, nbytes - retv);
- }
-}
-
-/* ARGSUSED2 */
-static int
-ar_cb(uintptr_t addr, const void *arptr, void *dummy)
-{
- const ar_t *ar = arptr;
- char ardesc[sizeof ("ARP ") + LIFNAMSIZ];
-
- ar_describe(ar, ardesc, sizeof (ardesc), B_FALSE);
- mdb_printf("%?p %?p %?p %s\n", addr, ar->ar_wq, ar->ar_arl, ardesc);
- return (WALK_NEXT);
-}
-
-/*
- * Print out ARP client structures.
- */
-/* ARGSUSED2 */
-static int
-ar_cmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
-{
- ar_t ar;
-
- if (DCMD_HDRSPEC(flags) && !(flags & DCMD_PIPE_OUT)) {
- mdb_printf("%<u>%?s %?s %?s %s%</u>\n",
- "AR", "WQ", "ARL", "TYPE");
- }
-
- if (flags & DCMD_ADDRSPEC) {
- if (mdb_vread(&ar, sizeof (ar), addr) == -1) {
- mdb_warn("failed to read ar_t at %p", addr);
- return (DCMD_ERR);
- }
- (void) ar_cb(addr, &ar, NULL);
- } else {
- if (mdb_walk("ar", ar_cb, NULL) == -1) {
- mdb_warn("cannot walk ar_t structures");
- return (DCMD_ERR);
- }
- }
- return (DCMD_OK);
-}
-
-/* ARGSUSED2 */
-static int
-arl_cb(uintptr_t addr, const void *arlptr, void *dummy)
-{
- const arl_t *arl = arlptr;
- arlphy_t ap;
- uchar_t macaddr[ARP_MAX_ADDR_LEN];
- char macstr[ARP_MAX_ADDR_LEN*3];
- char flags[4];
- const char *primstr;
-
- mdb_printf("%?p ", addr);
- if (arl->arl_dlpi_pending == DL_PRIM_INVAL)
- mdb_printf("%16s", "--");
- else if ((primstr = mdb_dlpi_prim(arl->arl_dlpi_pending)) != NULL)
- mdb_printf("%16s", primstr);
- else
- mdb_printf("%16x", arl->arl_dlpi_pending);
-
- if (mdb_vread(&ap, sizeof (ap), (uintptr_t)arl->arl_phy) == -1 ||
- ap.ap_hw_addrlen == 0 || ap.ap_hw_addrlen > sizeof (macaddr)) {
- (void) strcpy(macstr, "--");
- } else if (mdb_vread(macaddr, ap.ap_hw_addrlen,
- (uintptr_t)ap.ap_hw_addr) == -1) {
- (void) strcpy(macstr, "?");
- } else {
- mdb_mac_addr(macaddr, ap.ap_hw_addrlen, macstr,
- sizeof (macstr));
- }
-
- /* Print both the link-layer state and the NOARP flag */
- flags[0] = '\0';
- if (arl->arl_flags & ARL_F_NOARP)
- (void) strcat(flags, "N");
- switch (arl->arl_state) {
- case ARL_S_DOWN:
- (void) strcat(flags, "d");
- break;
- case ARL_S_PENDING:
- (void) strcat(flags, "P");
- break;
- case ARL_S_UP:
- (void) strcat(flags, "U");
- break;
- default:
- (void) strcat(flags, "?");
- break;
- }
- mdb_printf(" %8d %-3s %-9s %-17s %5d\n",
- mdb_mblk_count(arl->arl_dlpi_deferred), flags, arl->arl_name,
- macstr, arl_to_stackid((uintptr_t)addr));
- return (WALK_NEXT);
-}
-
-/*
- * Print out ARP link-layer elements.
- */
-/* ARGSUSED2 */
-static int
-arl_cmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
-{
- arl_t arl;
-
- if (DCMD_HDRSPEC(flags) && !(flags & DCMD_PIPE_OUT)) {
- mdb_printf("%<u>%?s %16s %8s %3s %9s %-17s %5s%</u>\n",
- "ARL", "DLPI REQ", "DLPI CNT", "FLG", "INTERFACE",
- "HWADDR", "STACK");
- }
-
- if (flags & DCMD_ADDRSPEC) {
- if (mdb_vread(&arl, sizeof (arl), addr) == -1) {
- mdb_warn("failed to read arl_t at %p", addr);
- return (DCMD_ERR);
- }
- (void) arl_cb(addr, &arl, NULL);
- } else {
- if (mdb_walk("arl", arl_cb, NULL) == -1) {
- mdb_warn("cannot walk arl_t structures");
- return (DCMD_ERR);
- }
- }
- return (DCMD_OK);
-}
-
-/* ARGSUSED2 */
-static int
-ace_cb(uintptr_t addr, const void *aceptr, void *dummy)
-{
- const ace_t *ace = aceptr;
- uchar_t macaddr[ARP_MAX_ADDR_LEN];
- char macstr[ARP_MAX_ADDR_LEN*3];
- /* The %b format isn't compact enough for long listings */
- static const char ace_flags[] = "SPDRMLdA ofya";
- const char *cp;
- char flags[sizeof (ace_flags)], *fp;
- int flg;
- in_addr_t inaddr, mask;
- char addrstr[sizeof ("255.255.255.255/32")];
-
- /* Walk the list of flags and produce a string */
- cp = ace_flags;
- fp = flags;
- for (flg = 1; *cp != '\0'; flg <<= 1, cp++) {
- if ((flg & ace->ace_flags) && *cp != ' ')
- *fp++ = *cp;
- }
- *fp = '\0';
-
- /* If it's not resolved, then it has no hardware address */
- if (!(ace->ace_flags & ACE_F_RESOLVED) ||
- ace->ace_hw_addr_length == 0 ||
- ace->ace_hw_addr_length > sizeof (macaddr)) {
- (void) strcpy(macstr, "--");
- } else if (mdb_vread(macaddr, ace->ace_hw_addr_length,
- (uintptr_t)ace->ace_hw_addr) == -1) {
- (void) strcpy(macstr, "?");
- } else {
- mdb_mac_addr(macaddr, ace->ace_hw_addr_length, macstr,
- sizeof (macstr));
- }
-
- /*
- * Nothing other than IP uses ARP these days, so we don't try very hard
- * here to switch out on ARP protocol type. (Note that ARP protocol
- * types are roughly Ethertypes, but are allocated separately at IANA.)
- */
- if (ace->ace_proto != IP_ARP_PROTO_TYPE) {
- (void) mdb_snprintf(addrstr, sizeof (addrstr),
- "Unknown proto %x", ace->ace_proto);
- } else if (mdb_vread(&inaddr, sizeof (inaddr),
- (uintptr_t)ace->ace_proto_addr) != -1 &&
- mdb_vread(&mask, sizeof (mask), (uintptr_t)ace->ace_proto_mask) !=
- -1) {
- /*
- * If it's the standard host mask, then print it normally.
- * Otherwise, use "/n" notation.
- */
- if (mask == (in_addr_t)~0) {
- (void) mdb_snprintf(addrstr, sizeof (addrstr), "%I",
- inaddr);
- } else {
- (void) mdb_snprintf(addrstr, sizeof (addrstr), "%I/%d",
- inaddr, mask == 0 ? 0 : 33 - mdb_ffs(mask));
- }
- } else {
- (void) strcpy(addrstr, "?");
- }
- mdb_printf("%?p %-18s %-8s %-17s %5d\n", addr, addrstr, flags,
- macstr, arl_to_stackid((uintptr_t)ace->ace_arl));
- return (WALK_NEXT);
-}
-
-/*
- * Print out ARP cache entry (ace_t) elements.
- */
-/* ARGSUSED2 */
-static int
-ace_cmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
-{
- ace_t ace;
-
- if (DCMD_HDRSPEC(flags) && !(flags & DCMD_PIPE_OUT)) {
- mdb_printf("%<u>%?s %-18s %-8s %-17s %5s%</u>\n",
- "ACE", "PROTOADDR", "FLAGS", "HWADDR", "STACK");
- }
-
- if (flags & DCMD_ADDRSPEC) {
- if (mdb_vread(&ace, sizeof (ace), addr) == -1) {
- mdb_warn("failed to read ace_t at %p", addr);
- return (DCMD_ERR);
- }
- (void) ace_cb(addr, &ace, NULL);
- } else {
- if (mdb_walk("ace", ace_cb, NULL) == -1) {
- mdb_warn("cannot walk ace_t structures");
- return (DCMD_ERR);
- }
- }
- return (DCMD_OK);
-}
/*
* Print an ARP hardware and protocol address pair; used when printing an ARP
@@ -696,148 +162,25 @@ arphdr_cmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
return (DCMD_OK);
}
-/*
- * Print out an arp command formatted in a reasonable manner. This implements
- * the type switch used by ARP.
- *
- * It could also dump the data that follows the header (using offset and length
- * in the various structures), but it currently does not.
- */
-/* ARGSUSED2 */
-static int
-arpcmd_cmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
-{
- arc_t arc;
- const arp_cmd_tbl *tp;
- mdb_arg_t subargv;
-
- if (!(flags & DCMD_ADDRSPEC)) {
- mdb_warn("address required to print ARP command\n");
- return (DCMD_ERR);
- }
- if (mdb_vread(&arc, sizeof (arc), addr) == -1) {
- mdb_warn("unable to read arc_t at %p", addr);
- return (DCMD_ERR);
- }
- for (tp = act_list; tp->act_cmd != 0; tp++)
- if (tp->act_cmd == arc.arc_cmd)
- break;
- mdb_printf("%p %s (%s) = ", addr, tp->act_name, tp->act_type);
- subargv.a_type = MDB_TYPE_STRING;
- subargv.a_un.a_str = tp->act_type;
- if (mdb_call_dcmd("print", addr, DCMD_ADDRSPEC, 1, &subargv) == -1)
- return (DCMD_ERR);
- else
- return (DCMD_OK);
-}
-
-static size_t
-mi_osize(const queue_t *q)
-{
- /*
- * The code in common/inet/mi.c allocates an extra word to store the
- * size of the allocation. An mi_o_s is thus a size_t plus an mi_o_s.
- */
- struct mi_block {
- size_t mi_nbytes;
- struct mi_o_s mi_o;
- } m;
-
- if (mdb_vread(&m, sizeof (m), (uintptr_t)q->q_ptr - sizeof (m)) != -1)
- return (m.mi_nbytes - sizeof (m));
-
- return (0);
-}
-
-/*
- * This is called when ::stream is used and an ARP module is seen on the
- * stream. Determine what sort of ARP usage is involved and show an
- * appropriate message.
- */
-static void
-arp_qinfo(const queue_t *qp, char *buf, size_t nbytes)
-{
- size_t size = mi_osize(qp);
- ar_t ar;
-
- if (size != sizeof (ar_t))
- return;
- if (mdb_vread(&ar, sizeof (ar), (uintptr_t)qp->q_ptr) == -1)
- return;
- ar_describe(&ar, buf, nbytes, B_TRUE);
-}
-
-static uintptr_t
-arp_rnext(const queue_t *q)
-{
- size_t size = mi_osize(q);
- ar_t ar;
-
- if (size == sizeof (ar_t) && mdb_vread(&ar, sizeof (ar),
- (uintptr_t)q->q_ptr) != -1)
- return ((uintptr_t)ar.ar_rq);
-
- return (NULL);
-}
-
-static uintptr_t
-arp_wnext(const queue_t *q)
-{
- size_t size = mi_osize(q);
- ar_t ar;
-
- if (size == sizeof (ar_t) && mdb_vread(&ar, sizeof (ar),
- (uintptr_t)q->q_ptr) != -1)
- return ((uintptr_t)ar.ar_wq);
-
- return (NULL);
-}
-
static const mdb_dcmd_t dcmds[] = {
- { "ar", "?", "display ARP client streams for all stacks",
- ar_cmd, NULL },
- { "arl", "?", "display ARP link layers for all stacks", arl_cmd, NULL },
- { "ace", "?", "display ARP cache entries for all stacks",
- ace_cmd, NULL },
{ "arphdr", ":", "display an ARP header", arphdr_cmd, NULL },
- { "arpcmd", ":", "display an ARP command", arpcmd_cmd, NULL },
{ NULL }
};
/* Note: ar_t walker is in genunix.c and net.c; generic MI walker */
static const mdb_walker_t walkers[] = {
- { "arl", "walk list of arl_t links for all stacks",
- arl_walk_init, arl_walk_step, NULL },
- { "arl_stack", "walk list of arl_t links",
- arl_stack_walk_init, arl_stack_walk_step, NULL },
- { "ace", "walk list of ace_t entries for all stacks",
- ace_walk_init, ace_walk_step, NULL },
- { "ace_stack", "walk list of ace_t entries",
- ace_stack_walk_init, ace_stack_walk_step, ace_stack_walk_fini },
- { "arp_stacks", "walk all the arp_stack_t",
- arp_stacks_walk_init, arp_stacks_walk_step, NULL },
{ NULL }
};
-static const mdb_qops_t arp_qops = { arp_qinfo, arp_rnext, arp_wnext };
static const mdb_modinfo_t modinfo = { MDB_API_VERSION, dcmds, walkers };
const mdb_modinfo_t *
_mdb_init(void)
{
- GElf_Sym sym;
-
- if (mdb_lookup_by_obj("arp", "winit", &sym) == 0)
- mdb_qops_install(&arp_qops, (uintptr_t)sym.st_value);
-
return (&modinfo);
}
void
_mdb_fini(void)
{
- GElf_Sym sym;
-
- if (mdb_lookup_by_obj("arp", "winit", &sym) == 0)
- mdb_qops_remove(&arp_qops, (uintptr_t)sym.st_value);
}
diff --git a/usr/src/cmd/mdb/common/modules/genunix/genunix.c b/usr/src/cmd/mdb/common/modules/genunix/genunix.c
index 3e49d9a99c..e6fe3f7dcf 100644
--- a/usr/src/cmd/mdb/common/modules/genunix/genunix.c
+++ b/usr/src/cmd/mdb/common/modules/genunix/genunix.c
@@ -4770,8 +4770,6 @@ static const mdb_walker_t walkers[] = {
NULL, modchain_walk_step, NULL },
/* from net.c */
- { "ar", "walk ar_t structures using MI for all stacks",
- mi_payload_walk_init, mi_payload_walk_step, NULL, &mi_ar_arg },
{ "icmp", "walk ICMP control structures using MI for all stacks",
mi_payload_walk_init, mi_payload_walk_step, NULL,
&mi_icmp_arg },
@@ -4779,8 +4777,6 @@ static const mdb_walker_t walkers[] = {
mi_walk_init, mi_walk_step, mi_walk_fini, NULL },
{ "sonode", "given a sonode, walk its children",
sonode_walk_init, sonode_walk_step, sonode_walk_fini, NULL },
- { "ar_stacks", "walk all the ar_stack_t",
- ar_stacks_walk_init, ar_stacks_walk_step, NULL },
{ "icmp_stacks", "walk all the icmp_stack_t",
icmp_stacks_walk_init, icmp_stacks_walk_step, NULL },
{ "tcp_stacks", "walk all the tcp_stack_t",
diff --git a/usr/src/cmd/mdb/common/modules/genunix/net.c b/usr/src/cmd/mdb/common/modules/genunix/net.c
index d9f4717d7e..23d6202fff 100644
--- a/usr/src/cmd/mdb/common/modules/genunix/net.c
+++ b/usr/src/cmd/mdb/common/modules/genunix/net.c
@@ -45,7 +45,6 @@
#include <sys/socketvar.h>
#include <sys/cred_impl.h>
#include <inet/udp_impl.h>
-#include <inet/arp_impl.h>
#include <inet/rawip_impl.h>
#include <inet/mi.h>
#include <fs/sockfs/socktpi_impl.h>
@@ -71,31 +70,6 @@ typedef struct netstat_cb_data_s {
int af;
} netstat_cb_data_t;
-/* Walkers for various *_stack_t */
-int
-ar_stacks_walk_init(mdb_walk_state_t *wsp)
-{
- if (mdb_layered_walk("netstack", wsp) == -1) {
- mdb_warn("can't walk 'netstack'");
- return (WALK_ERR);
- }
- return (WALK_NEXT);
-}
-
-int
-ar_stacks_walk_step(mdb_walk_state_t *wsp)
-{
- uintptr_t kaddr;
- netstack_t nss;
-
- if (mdb_vread(&nss, sizeof (nss), wsp->walk_addr) == -1) {
- mdb_warn("can't read netstack at %p", wsp->walk_addr);
- return (WALK_ERR);
- }
- kaddr = (uintptr_t)nss.netstack_modules[NS_ARP];
- return (wsp->walk_callback(kaddr, wsp->walk_layer, wsp->walk_cbdata));
-}
-
int
icmp_stacks_walk_init(mdb_walk_state_t *wsp)
{
@@ -201,15 +175,15 @@ net_tcp_active(const tcp_t *tcp)
static int
net_tcp_ipv4(const tcp_t *tcp)
{
- return ((tcp->tcp_ipversion == IPV4_VERSION) ||
- (IN6_IS_ADDR_UNSPECIFIED(&tcp->tcp_ip_src_v6) &&
+ return ((tcp->tcp_connp->conn_ipversion == IPV4_VERSION) ||
+ (IN6_IS_ADDR_UNSPECIFIED(&tcp->tcp_connp->conn_laddr_v6) &&
(tcp->tcp_state <= TCPS_LISTEN)));
}
static int
net_tcp_ipv6(const tcp_t *tcp)
{
- return (tcp->tcp_ipversion == IPV6_VERSION);
+ return (tcp->tcp_connp->conn_ipversion == IPV6_VERSION);
}
static int
@@ -222,15 +196,15 @@ net_udp_active(const udp_t *udp)
static int
net_udp_ipv4(const udp_t *udp)
{
- return ((udp->udp_ipversion == IPV4_VERSION) ||
- (IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src) &&
+ return ((udp->udp_connp->conn_ipversion == IPV4_VERSION) ||
+ (IN6_IS_ADDR_UNSPECIFIED(&udp->udp_connp->conn_laddr_v6) &&
(udp->udp_state <= TS_IDLE)));
}
static int
net_udp_ipv6(const udp_t *udp)
{
- return (udp->udp_ipversion == IPV6_VERSION);
+ return (udp->udp_connp->conn_ipversion == IPV6_VERSION);
}
int
@@ -399,11 +373,6 @@ mi_payload_walk_step(mdb_walk_state_t *wsp)
return (WALK_NEXT);
}
-const mi_payload_walk_arg_t mi_ar_arg = {
- "ar_stacks", OFFSETOF(arp_stack_t, as_head), sizeof (ar_t),
- MI_PAYLOAD_DEVICE | MI_PAYLOAD_MODULE
-};
-
const mi_payload_walk_arg_t mi_icmp_arg = {
"icmp_stacks", OFFSETOF(icmp_stack_t, is_head), sizeof (icmp_t),
MI_PAYLOAD_DEVICE | MI_PAYLOAD_MODULE
@@ -632,7 +601,7 @@ netstat_tcp_cb(uintptr_t kaddr, const void *walk_data, void *cb_data)
tcp_kaddr = (uintptr_t)connp->conn_tcp;
if (mdb_vread(&tcps, sizeof (tcp_t), tcp_kaddr) == -1) {
- mdb_warn("failed to read tcp_t at %p", kaddr);
+ mdb_warn("failed to read tcp_t at %p", tcp_kaddr);
return (WALK_ERR);
}
@@ -648,13 +617,13 @@ netstat_tcp_cb(uintptr_t kaddr, const void *walk_data, void *cb_data)
mdb_printf("%0?p %2i ", tcp_kaddr, tcp->tcp_state);
if (af == AF_INET) {
- net_ipv4addrport_pr(&tcp->tcp_ip_src_v6, tcp->tcp_lport);
+ net_ipv4addrport_pr(&connp->conn_laddr_v6, connp->conn_lport);
mdb_printf(" ");
- net_ipv4addrport_pr(&tcp->tcp_remote_v6, tcp->tcp_fport);
+ net_ipv4addrport_pr(&connp->conn_faddr_v6, connp->conn_fport);
} else if (af == AF_INET6) {
- net_ipv6addrport_pr(&tcp->tcp_ip_src_v6, tcp->tcp_lport);
+ net_ipv6addrport_pr(&connp->conn_laddr_v6, connp->conn_lport);
mdb_printf(" ");
- net_ipv6addrport_pr(&tcp->tcp_remote_v6, tcp->tcp_fport);
+ net_ipv6addrport_pr(&connp->conn_faddr_v6, connp->conn_fport);
}
mdb_printf(" %5i", ns_to_stackid((uintptr_t)connp->conn_netstack));
mdb_printf(" %4i\n", connp->conn_zoneid);
@@ -687,6 +656,9 @@ netstat_udp_cb(uintptr_t kaddr, const void *walk_data, void *cb_data)
return (WALK_ERR);
}
+ connp->conn_udp = &udp;
+ udp.udp_connp = connp;
+
if (!((opts & NETSTAT_ALL) || net_udp_active(&udp)) ||
(af == AF_INET && !net_udp_ipv4(&udp)) ||
(af == AF_INET6 && !net_udp_ipv6(&udp))) {
@@ -704,13 +676,13 @@ netstat_udp_cb(uintptr_t kaddr, const void *walk_data, void *cb_data)
mdb_printf("%0?p %10s ", (uintptr_t)connp->conn_udp, state);
if (af == AF_INET) {
- net_ipv4addrport_pr(&udp.udp_v6src, udp.udp_port);
+ net_ipv4addrport_pr(&connp->conn_laddr_v6, connp->conn_lport);
mdb_printf(" ");
- net_ipv4addrport_pr(&udp.udp_v6dst, udp.udp_dstport);
+ net_ipv4addrport_pr(&connp->conn_faddr_v6, connp->conn_fport);
} else if (af == AF_INET6) {
- net_ipv6addrport_pr(&udp.udp_v6src, udp.udp_port);
+ net_ipv6addrport_pr(&connp->conn_laddr_v6, connp->conn_lport);
mdb_printf(" ");
- net_ipv6addrport_pr(&udp.udp_v6dst, udp.udp_dstport);
+ net_ipv6addrport_pr(&connp->conn_faddr_v6, connp->conn_fport);
}
mdb_printf(" %5i", ns_to_stackid((uintptr_t)connp->conn_netstack));
mdb_printf(" %4i\n", connp->conn_zoneid);
@@ -740,8 +712,11 @@ netstat_icmp_cb(uintptr_t kaddr, const void *walk_data, void *cb_data)
return (WALK_ERR);
}
- if ((af == AF_INET && icmp.icmp_ipversion != IPV4_VERSION) ||
- (af == AF_INET6 && icmp.icmp_ipversion != IPV6_VERSION)) {
+ connp->conn_icmp = &icmp;
+ icmp.icmp_connp = connp;
+
+ if ((af == AF_INET && connp->conn_ipversion != IPV4_VERSION) ||
+ (af == AF_INET6 && connp->conn_ipversion != IPV6_VERSION)) {
return (WALK_NEXT);
}
@@ -756,16 +731,16 @@ netstat_icmp_cb(uintptr_t kaddr, const void *walk_data, void *cb_data)
mdb_printf("%0?p %10s ", (uintptr_t)connp->conn_icmp, state);
if (af == AF_INET) {
- mdb_printf("%*I ", ADDR_V4_WIDTH,
- V4_PART_OF_V6((icmp.icmp_v6src)));
- mdb_printf("%*I ", ADDR_V4_WIDTH,
- V4_PART_OF_V6((icmp.icmp_v6dst.sin6_addr)));
+ net_ipv4addrport_pr(&connp->conn_laddr_v6, connp->conn_lport);
+ mdb_printf(" ");
+ net_ipv4addrport_pr(&connp->conn_faddr_v6, connp->conn_fport);
} else if (af == AF_INET6) {
- mdb_printf("%*N ", ADDR_V6_WIDTH, &icmp.icmp_v6src);
- mdb_printf("%*N ", ADDR_V6_WIDTH, &icmp.icmp_v6dst);
+ net_ipv6addrport_pr(&connp->conn_laddr_v6, connp->conn_lport);
+ mdb_printf(" ");
+ net_ipv6addrport_pr(&connp->conn_faddr_v6, connp->conn_fport);
}
mdb_printf(" %5i", ns_to_stackid((uintptr_t)connp->conn_netstack));
- mdb_printf(" %4i\n", icmp.icmp_zoneid);
+ mdb_printf(" %4i\n", connp->conn_zoneid);
return (WALK_NEXT);
}
@@ -881,57 +856,57 @@ get_ifname(const ire_t *ire, char *intf)
ill_t ill;
*intf = '\0';
- if (ire->ire_type == IRE_CACHE) {
- queue_t stq;
-
- if (mdb_vread(&stq, sizeof (stq), (uintptr_t)ire->ire_stq) ==
- -1)
- return;
- if (mdb_vread(&ill, sizeof (ill), (uintptr_t)stq.q_ptr) == -1)
+ if (ire->ire_ill != NULL) {
+ if (mdb_vread(&ill, sizeof (ill),
+ (uintptr_t)ire->ire_ill) == -1)
return;
(void) mdb_readstr(intf, MIN(LIFNAMSIZ, ill.ill_name_length),
(uintptr_t)ill.ill_name);
- } else if (ire->ire_ipif != NULL) {
- ipif_t ipif;
- char *cp;
-
- if (mdb_vread(&ipif, sizeof (ipif),
- (uintptr_t)ire->ire_ipif) == -1)
- return;
- if (mdb_vread(&ill, sizeof (ill), (uintptr_t)ipif.ipif_ill) ==
- -1)
- return;
- (void) mdb_readstr(intf, MIN(LIFNAMSIZ, ill.ill_name_length),
- (uintptr_t)ill.ill_name);
- if (ipif.ipif_id != 0) {
- cp = intf + strlen(intf);
- (void) mdb_snprintf(cp, LIFNAMSIZ + 1 - (cp - intf),
- ":%u", ipif.ipif_id);
- }
}
}
+const in6_addr_t ipv6_all_ones =
+ { 0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU };
+
static void
-get_v4flags(const ire_t *ire, char *flags)
+get_ireflags(const ire_t *ire, char *flags)
{
(void) strcpy(flags, "U");
- if (ire->ire_type == IRE_DEFAULT || ire->ire_type == IRE_PREFIX ||
- ire->ire_type == IRE_HOST || ire->ire_type == IRE_HOST_REDIRECT)
+ /* RTF_INDIRECT wins over RTF_GATEWAY - don't display both */
+ if (ire->ire_flags & RTF_INDIRECT)
+ (void) strcat(flags, "I");
+ else if (ire->ire_type & IRE_OFFLINK)
(void) strcat(flags, "G");
- if (ire->ire_mask == IP_HOST_MASK)
- (void) strcat(flags, "H");
- if (ire->ire_type == IRE_HOST_REDIRECT)
+
+ /* IRE_IF_CLONE wins over RTF_HOST - don't display both */
+ if (ire->ire_type & IRE_IF_CLONE)
+ (void) strcat(flags, "C");
+ else if (ire->ire_ipversion == IPV4_VERSION) {
+ if (ire->ire_mask == IP_HOST_MASK)
+ (void) strcat(flags, "H");
+ } else {
+ if (IN6_ARE_ADDR_EQUAL(&ire->ire_mask_v6, &ipv6_all_ones))
+ (void) strcat(flags, "H");
+ }
+
+ if (ire->ire_flags & RTF_DYNAMIC)
(void) strcat(flags, "D");
- if (ire->ire_type == IRE_CACHE)
- (void) strcat(flags, "A");
if (ire->ire_type == IRE_BROADCAST)
- (void) strcat(flags, "B");
+ (void) strcat(flags, "b");
+ if (ire->ire_type == IRE_MULTICAST)
+ (void) strcat(flags, "m");
if (ire->ire_type == IRE_LOCAL)
(void) strcat(flags, "L");
+ if (ire->ire_type == IRE_NOROUTE)
+ (void) strcat(flags, "N");
if (ire->ire_flags & RTF_MULTIRT)
(void) strcat(flags, "M");
if (ire->ire_flags & RTF_SETSRC)
(void) strcat(flags, "S");
+ if (ire->ire_flags & RTF_REJECT)
+ (void) strcat(flags, "R");
+ if (ire->ire_flags & RTF_BLACKHOLE)
+ (void) strcat(flags, "B");
}
static int
@@ -945,8 +920,10 @@ netstat_irev4_cb(uintptr_t kaddr, const void *walk_data, void *cb_data)
if (ire->ire_ipversion != IPV4_VERSION)
return (WALK_NEXT);
- if (!(*opts & NETSTAT_ALL) && (ire->ire_type == IRE_CACHE ||
- ire->ire_type == IRE_BROADCAST || ire->ire_type == IRE_LOCAL))
+ /* Skip certain IREs by default */
+ if (!(*opts & NETSTAT_ALL) &&
+ (ire->ire_type &
+ (IRE_BROADCAST|IRE_LOCAL|IRE_MULTICAST|IRE_NOROUTE|IRE_IF_CLONE)))
return (WALK_NEXT);
if (*opts & NETSTAT_FIRST) {
@@ -966,10 +943,9 @@ netstat_irev4_cb(uintptr_t kaddr, const void *walk_data, void *cb_data)
}
}
- gate = (ire->ire_type & (IRE_INTERFACE|IRE_LOOPBACK|IRE_BROADCAST)) ?
- ire->ire_src_addr : ire->ire_gateway_addr;
+ gate = ire->ire_gateway_addr;
- get_v4flags(ire, flags);
+ get_ireflags(ire, flags);
get_ifname(ire, intf);
@@ -977,8 +953,8 @@ netstat_irev4_cb(uintptr_t kaddr, const void *walk_data, void *cb_data)
mdb_printf("%?p %-*I %-*I %-*I %-6s %5u%c %4u %3u %-3s %5u "
"%u\n", kaddr, ADDR_V4_WIDTH, ire->ire_addr, ADDR_V4_WIDTH,
ire->ire_mask, ADDR_V4_WIDTH, gate, intf,
- ire->ire_max_frag, ire->ire_frag_flag ? '*' : ' ',
- ire->ire_uinfo.iulp_rtt, ire->ire_refcnt, flags,
+ 0, ' ',
+ ire->ire_metrics.iulp_rtt, ire->ire_refcnt, flags,
ire->ire_ob_pkt_count, ire->ire_ib_pkt_count);
} else {
mdb_printf("%?p %-*I %-*I %-5s %4u %5u %s\n", kaddr,
@@ -1025,7 +1001,10 @@ netstat_irev6_cb(uintptr_t kaddr, const void *walk_data, void *cb_data)
if (ire->ire_ipversion != IPV6_VERSION)
return (WALK_NEXT);
- if (!(*opts & NETSTAT_ALL) && ire->ire_type == IRE_CACHE)
+ /* Skip certain IREs by default */
+ if (!(*opts & NETSTAT_ALL) &&
+ (ire->ire_type &
+ (IRE_BROADCAST|IRE_LOCAL|IRE_MULTICAST|IRE_NOROUTE|IRE_IF_CLONE)))
return (WALK_NEXT);
if (*opts & NETSTAT_FIRST) {
@@ -1045,37 +1024,21 @@ netstat_irev6_cb(uintptr_t kaddr, const void *walk_data, void *cb_data)
}
}
- gatep = (ire->ire_type & (IRE_INTERFACE|IRE_LOOPBACK)) ?
- &ire->ire_src_addr_v6 : &ire->ire_gateway_addr_v6;
+ gatep = &ire->ire_gateway_addr_v6;
masklen = ip_mask_to_plen_v6(&ire->ire_mask_v6);
(void) mdb_snprintf(deststr, sizeof (deststr), "%N/%d",
&ire->ire_addr_v6, masklen);
- (void) strcpy(flags, "U");
- if (ire->ire_type == IRE_DEFAULT || ire->ire_type == IRE_PREFIX ||
- ire->ire_type == IRE_HOST || ire->ire_type == IRE_HOST_REDIRECT)
- (void) strcat(flags, "G");
- if (masklen == IPV6_ABITS)
- (void) strcat(flags, "H");
- if (ire->ire_type == IRE_HOST_REDIRECT)
- (void) strcat(flags, "D");
- if (ire->ire_type == IRE_CACHE)
- (void) strcat(flags, "A");
- if (ire->ire_type == IRE_LOCAL)
- (void) strcat(flags, "L");
- if (ire->ire_flags & RTF_MULTIRT)
- (void) strcat(flags, "M");
- if (ire->ire_flags & RTF_SETSRC)
- (void) strcat(flags, "S");
+ get_ireflags(ire, flags);
get_ifname(ire, intf);
if (*opts & NETSTAT_VERBOSE) {
mdb_printf("%?p %-*s %-*N %-5s %5u%c %5u %3u %-5s %6u %u\n",
kaddr, ADDR_V6_WIDTH+4, deststr, ADDR_V6_WIDTH, gatep,
- intf, ire->ire_max_frag, ire->ire_frag_flag ? '*' : ' ',
- ire->ire_uinfo.iulp_rtt, ire->ire_refcnt,
+ intf, 0, ' ',
+ ire->ire_metrics.iulp_rtt, ire->ire_refcnt,
flags, ire->ire_ob_pkt_count, ire->ire_ib_pkt_count);
} else {
mdb_printf("%?p %-*s %-*N %-5s %3u %6u %s\n", kaddr,
diff --git a/usr/src/cmd/mdb/common/modules/genunix/net.h b/usr/src/cmd/mdb/common/modules/genunix/net.h
index f2d441e78c..f72d75f75a 100644
--- a/usr/src/cmd/mdb/common/modules/genunix/net.h
+++ b/usr/src/cmd/mdb/common/modules/genunix/net.h
@@ -30,7 +30,6 @@
extern "C" {
#endif
-extern struct mi_payload_walk_arg_s mi_ar_arg;
extern struct mi_payload_walk_arg_s mi_icmp_arg;
extern struct mi_payload_walk_arg_s mi_ill_arg;
@@ -42,8 +41,6 @@ extern int mi_walk_step(mdb_walk_state_t *);
extern void mi_walk_fini(mdb_walk_state_t *);
extern int mi_payload_walk_init(mdb_walk_state_t *);
extern int mi_payload_walk_step(mdb_walk_state_t *);
-extern int ar_stacks_walk_init(mdb_walk_state_t *);
-extern int ar_stacks_walk_step(mdb_walk_state_t *);
extern int icmp_stacks_walk_init(mdb_walk_state_t *);
extern int icmp_stacks_walk_step(mdb_walk_state_t *);
extern int tcp_stacks_walk_init(mdb_walk_state_t *);
diff --git a/usr/src/cmd/mdb/common/modules/genunix/streams.c b/usr/src/cmd/mdb/common/modules/genunix/streams.c
index 0458589309..d0095c7752 100644
--- a/usr/src/cmd/mdb/common/modules/genunix/streams.c
+++ b/usr/src/cmd/mdb/common/modules/genunix/streams.c
@@ -172,7 +172,6 @@ static const struct str_flags mbf[] = {
{ SF(0x08), "unused" },
{ SF(MSGMARKNEXT), "Private: b_next's first byte marked" },
{ SF(MSGNOTMARKNEXT), "Private: ... not marked" },
- { SF(MSGHASREF), "Private: msg has reference to owner" },
{ 0, NULL, NULL }
};
diff --git a/usr/src/cmd/mdb/common/modules/genunix/vfs.c b/usr/src/cmd/mdb/common/modules/genunix/vfs.c
index 45dc27af23..8001c41b3c 100644
--- a/usr/src/cmd/mdb/common/modules/genunix/vfs.c
+++ b/usr/src/cmd/mdb/common/modules/genunix/vfs.c
@@ -572,8 +572,9 @@ sctp_getsockaddr(sctp_t *sctp, struct sockaddr *addr)
sin_t *sin4;
int scanned = 0;
boolean_t skip_lback = B_FALSE;
+ conn_t *connp = sctp->sctp_connp;
- addr->sa_family = sctp->sctp_family;
+ addr->sa_family = connp->conn_family;
if (sctp->sctp_nsaddrs == 0)
goto done;
@@ -636,18 +637,18 @@ sctp_getsockaddr(sctp_t *sctp, struct sockaddr *addr)
continue;
}
- switch (sctp->sctp_family) {
+ switch (connp->conn_family) {
case AF_INET:
/* LINTED: alignment */
sin4 = (sin_t *)addr;
if ((sctp->sctp_state <= SCTPS_LISTEN) &&
sctp->sctp_bound_to_all) {
sin4->sin_addr.s_addr = INADDR_ANY;
- sin4->sin_port = sctp->sctp_lport;
+ sin4->sin_port = connp->conn_lport;
} else {
sin4 += added;
sin4->sin_family = AF_INET;
- sin4->sin_port = sctp->sctp_lport;
+ sin4->sin_port = connp->conn_lport;
IN6_V4MAPPED_TO_INADDR(&laddr,
&sin4->sin_addr);
}
@@ -660,15 +661,14 @@ sctp_getsockaddr(sctp_t *sctp, struct sockaddr *addr)
sctp->sctp_bound_to_all) {
bzero(&sin6->sin6_addr,
sizeof (sin6->sin6_addr));
- sin6->sin6_port = sctp->sctp_lport;
+ sin6->sin6_port = connp->conn_lport;
} else {
sin6 += added;
sin6->sin6_family = AF_INET6;
- sin6->sin6_port = sctp->sctp_lport;
+ sin6->sin6_port = connp->conn_lport;
sin6->sin6_addr = laddr;
}
- sin6->sin6_flowinfo = sctp->sctp_ip6h->ip6_vcf &
- ~IPV6_VERS_AND_FLOW_MASK;
+ sin6->sin6_flowinfo = connp->conn_flowinfo;
sin6->sin6_scope_id = 0;
sin6->__sin6_src_id = 0;
break;
@@ -712,11 +712,12 @@ sctp_getpeeraddr(sctp_t *sctp, struct sockaddr *addr)
struct sockaddr_in6 *sin6;
sctp_faddr_t sctp_primary;
in6_addr_t faddr;
+ conn_t *connp = sctp->sctp_connp;
if (sctp->sctp_faddrs == NULL)
return (-1);
- addr->sa_family = sctp->sctp_family;
+ addr->sa_family = connp->conn_family;
if (mdb_vread(&sctp_primary, sizeof (sctp_faddr_t),
(uintptr_t)sctp->sctp_primary) == -1) {
mdb_warn("failed to read sctp primary faddr");
@@ -724,12 +725,12 @@ sctp_getpeeraddr(sctp_t *sctp, struct sockaddr *addr)
}
faddr = sctp_primary.faddr;
- switch (sctp->sctp_family) {
+ switch (connp->conn_family) {
case AF_INET:
/* LINTED: alignment */
sin4 = (struct sockaddr_in *)addr;
IN6_V4MAPPED_TO_INADDR(&faddr, &sin4->sin_addr);
- sin4->sin_port = sctp->sctp_fport;
+ sin4->sin_port = connp->conn_fport;
sin4->sin_family = AF_INET;
break;
@@ -737,7 +738,7 @@ sctp_getpeeraddr(sctp_t *sctp, struct sockaddr *addr)
/* LINTED: alignment */
sin6 = (struct sockaddr_in6 *)addr;
sin6->sin6_addr = faddr;
- sin6->sin6_port = sctp->sctp_fport;
+ sin6->sin6_port = connp->conn_fport;
sin6->sin6_family = AF_INET6;
sin6->sin6_flowinfo = 0;
sin6->sin6_scope_id = 0;
@@ -797,7 +798,7 @@ tcpip_sock_print(struct sonode *socknode)
mdb_printf("socket: ");
mdb_nhconvert(&port, &conn_t.conn_lport, sizeof (port));
- mdb_printf("AF_INET %I %d ", conn_t.conn_src, port);
+ mdb_printf("AF_INET %I %d ", conn_t.conn_laddr_v4, port);
/*
* If this is a listening socket, we don't print
@@ -807,7 +808,8 @@ tcpip_sock_print(struct sonode *socknode)
IPCL_IS_UDP(&conn_t) && IPCL_IS_CONNECTED(&conn_t)) {
mdb_printf("remote: ");
mdb_nhconvert(&port, &conn_t.conn_fport, sizeof (port));
- mdb_printf("AF_INET %I %d ", conn_t.conn_rem, port);
+ mdb_printf("AF_INET %I %d ", conn_t.conn_faddr_v4,
+ port);
}
break;
@@ -826,7 +828,7 @@ tcpip_sock_print(struct sonode *socknode)
mdb_printf("socket: ");
mdb_nhconvert(&port, &conn_t.conn_lport, sizeof (port));
- mdb_printf("AF_INET6 %N %d ", &conn_t.conn_srcv6, port);
+ mdb_printf("AF_INET6 %N %d ", &conn_t.conn_laddr_v4, port);
/*
* If this is a listening socket, we don't print
@@ -836,7 +838,8 @@ tcpip_sock_print(struct sonode *socknode)
IPCL_IS_UDP(&conn_t) && IPCL_IS_CONNECTED(&conn_t)) {
mdb_printf("remote: ");
mdb_nhconvert(&port, &conn_t.conn_fport, sizeof (port));
- mdb_printf("AF_INET6 %N %d ", &conn_t.conn_remv6, port);
+ mdb_printf("AF_INET6 %N %d ", &conn_t.conn_faddr_v6,
+ port);
}
break;
@@ -854,6 +857,7 @@ static int
sctp_sock_print(struct sonode *socknode)
{
sctp_t sctp_t;
+ conn_t conns;
struct sockaddr *laddr = mdb_alloc(sizeof (struct sockaddr), UM_SLEEP);
struct sockaddr *faddr = mdb_alloc(sizeof (struct sockaddr), UM_SLEEP);
@@ -864,6 +868,14 @@ sctp_sock_print(struct sonode *socknode)
return (-1);
}
+ if (mdb_vread(&conns, sizeof (conn_t),
+ (uintptr_t)sctp_t.sctp_connp) == -1) {
+ mdb_warn("failed to read conn_t at %p",
+ (uintptr_t)sctp_t.sctp_connp);
+ return (-1);
+ }
+ sctp_t.sctp_connp = &conns;
+
if (sctp_getsockaddr(&sctp_t, laddr) == 0) {
mdb_printf("socket:");
pfiles_print_addr(laddr);
diff --git a/usr/src/cmd/mdb/common/modules/ip/ip.c b/usr/src/cmd/mdb/common/modules/ip/ip.c
index 28f21efe1f..da94942eae 100644
--- a/usr/src/cmd/mdb/common/modules/ip/ip.c
+++ b/usr/src/cmd/mdb/common/modules/ip/ip.c
@@ -52,6 +52,7 @@
#include <ilb/ilb_nat.h>
#include <ilb/ilb_conn.h>
#include <sys/dlpi.h>
+#include <sys/zone.h>
#include <mdb/mdb_modapi.h>
#include <mdb/mdb_ks.h>
@@ -84,15 +85,20 @@ typedef struct illif_walk_data {
ill_if_t ill_if;
} illif_walk_data_t;
-typedef struct nce_walk_data_s {
- struct ndp_g_s nce_ip_ndp;
- int nce_hash_tbl_index;
- nce_t nce;
-} nce_walk_data_t;
+typedef struct ncec_walk_data_s {
+ struct ndp_g_s ncec_ip_ndp;
+ int ncec_hash_tbl_index;
+ ncec_t ncec;
+} ncec_walk_data_t;
+
+typedef struct ncec_cbdata_s {
+ uintptr_t ncec_addr;
+ int ncec_ipversion;
+} ncec_cbdata_t;
typedef struct nce_cbdata_s {
- uintptr_t nce_addr;
- int nce_ipversion;
+ int nce_ipversion;
+ char nce_ill_name[LIFNAMSIZ];
} nce_cbdata_t;
typedef struct ire_cbdata_s {
@@ -100,6 +106,12 @@ typedef struct ire_cbdata_s {
boolean_t verbose;
} ire_cbdata_t;
+typedef struct zi_cbdata_s {
+ const char *zone_name;
+ ip_stack_t *ipst;
+ boolean_t shared_ip_zone;
+} zi_cbdata_t;
+
typedef struct th_walk_data {
uint_t thw_non_zero_only;
boolean_t thw_match;
@@ -122,6 +134,7 @@ typedef struct ill_walk_data_s {
typedef struct ill_cbdata_s {
uintptr_t ill_addr;
int ill_ipversion;
+ ip_stack_t *ill_ipst;
boolean_t verbose;
} ill_cbdata_t;
@@ -156,7 +169,7 @@ static hash_walk_arg_t bind_hash_arg = {
};
static hash_walk_arg_t proto_hash_arg = {
- OFFSETOF(ip_stack_t, ips_ipcl_proto_fanout),
+ OFFSETOF(ip_stack_t, ips_ipcl_proto_fanout_v4),
0
};
@@ -210,13 +223,15 @@ static void ip_list_walk_fini(mdb_walk_state_t *);
static int srcid_walk_step(mdb_walk_state_t *);
static int ire_format(uintptr_t addr, const void *, void *);
-static int nce_format(uintptr_t addr, const nce_t *nce, int ipversion);
-static int nce(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv);
-static int nce_walk_step(mdb_walk_state_t *wsp);
-static int nce_stack_walk_init(mdb_walk_state_t *wsp);
-static int nce_stack_walk_step(mdb_walk_state_t *wsp);
-static void nce_stack_walk_fini(mdb_walk_state_t *wsp);
-static int nce_cb(uintptr_t addr, const nce_walk_data_t *iw, nce_cbdata_t *id);
+static int ncec_format(uintptr_t addr, const ncec_t *ncec, int ipversion);
+static int ncec(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv);
+static int ncec_walk_step(mdb_walk_state_t *wsp);
+static int ncec_stack_walk_init(mdb_walk_state_t *wsp);
+static int ncec_stack_walk_step(mdb_walk_state_t *wsp);
+static void ncec_stack_walk_fini(mdb_walk_state_t *wsp);
+static int ncec_cb(uintptr_t addr, const ncec_walk_data_t *iw,
+ ncec_cbdata_t *id);
+static char *nce_l2_addr(const nce_t *, const ill_t *);
static int ipcl_hash_walk_init(mdb_walk_state_t *);
static int ipcl_hash_walk_step(mdb_walk_state_t *);
@@ -262,6 +277,69 @@ ips_to_stackid(uintptr_t kaddr)
return (nss.netstack_stackid);
}
+/* ARGSUSED */
+static int
+zone_to_ips_cb(uintptr_t addr, const void *zi_arg, void *zi_cb_arg)
+{
+ zi_cbdata_t *zi_cb = zi_cb_arg;
+ zone_t zone;
+ char zone_name[ZONENAME_MAX];
+ netstack_t ns;
+
+ if (mdb_vread(&zone, sizeof (zone_t), addr) == -1) {
+ mdb_warn("can't read zone at %p", addr);
+ return (WALK_ERR);
+ }
+
+ (void) mdb_readstr(zone_name, ZONENAME_MAX, (uintptr_t)zone.zone_name);
+
+ if (strcmp(zi_cb->zone_name, zone_name) != 0)
+ return (WALK_NEXT);
+
+ zi_cb->shared_ip_zone = (!(zone.zone_flags & ZF_NET_EXCL) &&
+ (strcmp(zone_name, "global") != 0));
+
+ if (mdb_vread(&ns, sizeof (netstack_t), (uintptr_t)zone.zone_netstack)
+ == -1) {
+ mdb_warn("can't read netstack at %p", zone.zone_netstack);
+ return (WALK_ERR);
+ }
+
+ zi_cb->ipst = ns.netstack_ip;
+ return (WALK_DONE);
+}
+
+static ip_stack_t *
+zone_to_ips(const char *zone_name)
+{
+ zi_cbdata_t zi_cb;
+
+ if (zone_name == NULL)
+ return (NULL);
+
+ zi_cb.zone_name = zone_name;
+ zi_cb.ipst = NULL;
+ zi_cb.shared_ip_zone = B_FALSE;
+
+ if (mdb_walk("zone", (mdb_walk_cb_t)zone_to_ips_cb, &zi_cb) == -1) {
+ mdb_warn("failed to walk zone");
+ return (NULL);
+ }
+
+ if (zi_cb.shared_ip_zone) {
+ mdb_warn("%s is a Shared-IP zone, try '-s global' instead\n",
+ zone_name);
+ return (NULL);
+ }
+
+ if (zi_cb.ipst == NULL) {
+ mdb_warn("failed to find zone %s\n", zone_name);
+ return (NULL);
+ }
+
+ return (zi_cb.ipst);
+}
+
int
ip_stacks_walk_init(mdb_walk_state_t *wsp)
{
@@ -529,10 +607,10 @@ illif_help(void)
}
int
-ire_walk_init(mdb_walk_state_t *wsp)
+nce_walk_init(mdb_walk_state_t *wsp)
{
- if (mdb_layered_walk("ire_cache", wsp) == -1) {
- mdb_warn("can't walk 'ire_cache'");
+ if (mdb_layered_walk("nce_cache", wsp) == -1) {
+ mdb_warn("can't walk 'nce_cache'");
return (WALK_ERR);
}
@@ -540,60 +618,129 @@ ire_walk_init(mdb_walk_state_t *wsp)
}
int
-ire_walk_step(mdb_walk_state_t *wsp)
+nce_walk_step(mdb_walk_state_t *wsp)
{
- ire_t ire;
+ nce_t nce;
- if (mdb_vread(&ire, sizeof (ire), wsp->walk_addr) == -1) {
- mdb_warn("can't read ire at %p", wsp->walk_addr);
+ if (mdb_vread(&nce, sizeof (nce), wsp->walk_addr) == -1) {
+ mdb_warn("can't read nce at %p", wsp->walk_addr);
return (WALK_ERR);
}
- return (wsp->walk_callback(wsp->walk_addr, &ire, wsp->walk_cbdata));
+ return (wsp->walk_callback(wsp->walk_addr, &nce, wsp->walk_cbdata));
}
+static int
+nce_format(uintptr_t addr, const nce_t *ncep, void *nce_cb_arg)
+{
+ nce_cbdata_t *nce_cb = nce_cb_arg;
+ ill_t ill;
+ char ill_name[LIFNAMSIZ];
+ ncec_t ncec;
+
+ if (mdb_vread(&ncec, sizeof (ncec),
+ (uintptr_t)ncep->nce_common) == -1) {
+ mdb_warn("can't read ncec at %p", ncep->nce_common);
+ return (WALK_NEXT);
+ }
+ if (nce_cb->nce_ipversion != 0 &&
+ ncec.ncec_ipversion != nce_cb->nce_ipversion)
+ return (WALK_NEXT);
+
+ if (mdb_vread(&ill, sizeof (ill), (uintptr_t)ncep->nce_ill) == -1) {
+ mdb_snprintf(ill_name, sizeof (ill_name), "--");
+ } else {
+ (void) mdb_readstr(ill_name,
+ MIN(LIFNAMSIZ, ill.ill_name_length),
+ (uintptr_t)ill.ill_name);
+ }
+
+ if (nce_cb->nce_ill_name[0] != '\0' &&
+ strncmp(nce_cb->nce_ill_name, ill_name, LIFNAMSIZ) != 0)
+ return (WALK_NEXT);
+
+ if (ncec.ncec_ipversion == IPV6_VERSION) {
+
+ mdb_printf("%?p %5s %-18s %?p %6d %N\n",
+ addr, ill_name,
+ nce_l2_addr(ncep, &ill),
+ ncep->nce_fp_mp,
+ ncep->nce_refcnt,
+ &ncep->nce_addr);
+
+ } else {
+ struct in_addr nceaddr;
+
+ IN6_V4MAPPED_TO_INADDR(&ncep->nce_addr, &nceaddr);
+ mdb_printf("%?p %5s %-18s %?p %6d %I\n",
+ addr, ill_name,
+ nce_l2_addr(ncep, &ill),
+ ncep->nce_fp_mp,
+ ncep->nce_refcnt,
+ nceaddr.s_addr);
+ }
+
+ return (WALK_NEXT);
+}
int
-ire_ctable_walk_step(mdb_walk_state_t *wsp)
+dce_walk_init(mdb_walk_state_t *wsp)
{
- uintptr_t kaddr;
- irb_t *irb;
- uint32_t cache_table_size;
- int i;
- ire_cbdata_t ire_cb;
+ wsp->walk_data = (void *)wsp->walk_addr;
- ire_cb.verbose = B_FALSE;
- ire_cb.ire_ipversion = 0;
+ if (mdb_layered_walk("dce_cache", wsp) == -1) {
+ mdb_warn("can't walk 'dce_cache'");
+ return (WALK_ERR);
+ }
+ return (WALK_NEXT);
+}
- kaddr = wsp->walk_addr + OFFSETOF(ip_stack_t, ips_ip_cache_table_size);
+int
+dce_walk_step(mdb_walk_state_t *wsp)
+{
+ dce_t dce;
- if (mdb_vread(&cache_table_size, sizeof (uint32_t), kaddr) == -1) {
- mdb_warn("can't read ips_ip_cache_table at %p", kaddr);
+ if (mdb_vread(&dce, sizeof (dce), wsp->walk_addr) == -1) {
+ mdb_warn("can't read dce at %p", wsp->walk_addr);
return (WALK_ERR);
}
- kaddr = wsp->walk_addr + OFFSETOF(ip_stack_t, ips_ip_cache_table);
- if (mdb_vread(&kaddr, sizeof (kaddr), kaddr) == -1) {
- mdb_warn("can't read ips_ip_cache_table at %p", kaddr);
+ /* If ip_stack_t is specified, skip DCEs that don't belong to it. */
+ if ((wsp->walk_data != NULL) && (wsp->walk_data != dce.dce_ipst))
+ return (WALK_NEXT);
+
+ return (wsp->walk_callback(wsp->walk_addr, &dce, wsp->walk_cbdata));
+}
+
+int
+ire_walk_init(mdb_walk_state_t *wsp)
+{
+ wsp->walk_data = (void *)wsp->walk_addr;
+
+ if (mdb_layered_walk("ire_cache", wsp) == -1) {
+ mdb_warn("can't walk 'ire_cache'");
return (WALK_ERR);
}
- irb = mdb_alloc(sizeof (irb_t) * cache_table_size, UM_SLEEP|UM_GC);
- if (mdb_vread(irb, sizeof (irb_t) * cache_table_size, kaddr) == -1) {
- mdb_warn("can't read irb at %p", kaddr);
+ return (WALK_NEXT);
+}
+
+int
+ire_walk_step(mdb_walk_state_t *wsp)
+{
+ ire_t ire;
+
+ if (mdb_vread(&ire, sizeof (ire), wsp->walk_addr) == -1) {
+ mdb_warn("can't read ire at %p", wsp->walk_addr);
return (WALK_ERR);
}
- for (i = 0; i < cache_table_size; i++) {
- kaddr = (uintptr_t)irb[i].irb_ire;
- if (mdb_pwalk("ire_next", ire_format, &ire_cb,
- kaddr) == -1) {
- mdb_warn("can't walk 'ire_next' for ire %p", kaddr);
- return (WALK_ERR);
- }
- }
- return (WALK_NEXT);
+ /* If ip_stack_t is specified, skip IREs that don't belong to it. */
+ if ((wsp->walk_data != NULL) && (wsp->walk_data != ire.ire_ipst))
+ return (WALK_NEXT);
+
+ return (wsp->walk_callback(wsp->walk_addr, &ire, wsp->walk_cbdata));
}
/* ARGSUSED */
@@ -633,6 +780,9 @@ ire_format(uintptr_t addr, const void *ire_arg, void *ire_cb_arg)
const ire_t *irep = ire_arg;
ire_cbdata_t *ire_cb = ire_cb_arg;
boolean_t verbose = ire_cb->verbose;
+ ill_t ill;
+ char ill_name[LIFNAMSIZ];
+ boolean_t condemned = irep->ire_generation == IRE_GENERATION_CONDEMNED;
static const mdb_bitmask_t tmasks[] = {
{ "BROADCAST", IRE_BROADCAST, IRE_BROADCAST },
@@ -640,22 +790,12 @@ ire_format(uintptr_t addr, const void *ire_arg, void *ire_cb_arg)
{ "LOCAL", IRE_LOCAL, IRE_LOCAL },
{ "LOOPBACK", IRE_LOOPBACK, IRE_LOOPBACK },
{ "PREFIX", IRE_PREFIX, IRE_PREFIX },
- { "CACHE", IRE_CACHE, IRE_CACHE },
+ { "MULTICAST", IRE_MULTICAST, IRE_MULTICAST },
+ { "NOROUTE", IRE_NOROUTE, IRE_NOROUTE },
{ "IF_NORESOLVER", IRE_IF_NORESOLVER, IRE_IF_NORESOLVER },
{ "IF_RESOLVER", IRE_IF_RESOLVER, IRE_IF_RESOLVER },
+ { "IF_CLONE", IRE_IF_CLONE, IRE_IF_CLONE },
{ "HOST", IRE_HOST, IRE_HOST },
- { "HOST_REDIRECT", IRE_HOST_REDIRECT, IRE_HOST_REDIRECT },
- { NULL, 0, 0 }
- };
-
- static const mdb_bitmask_t mmasks[] = {
- { "CONDEMNED", IRE_MARK_CONDEMNED, IRE_MARK_CONDEMNED },
- { "TESTHIDDEN", IRE_MARK_TESTHIDDEN, IRE_MARK_TESTHIDDEN },
- { "NOADD", IRE_MARK_NOADD, IRE_MARK_NOADD },
- { "TEMPORARY", IRE_MARK_TEMPORARY, IRE_MARK_TEMPORARY },
- { "USESRC", IRE_MARK_USESRC_CHECK, IRE_MARK_USESRC_CHECK },
- { "PRIVATE", IRE_MARK_PRIVATE_ADDR, IRE_MARK_PRIVATE_ADDR },
- { "UNCACHED", IRE_MARK_UNCACHED, IRE_MARK_UNCACHED },
{ NULL, 0, 0 }
};
@@ -678,6 +818,7 @@ ire_format(uintptr_t addr, const void *ire_arg, void *ire_cb_arg)
{ "PROTO1", RTF_PROTO1, RTF_PROTO1 },
{ "MULTIRT", RTF_MULTIRT, RTF_MULTIRT },
{ "SETSRC", RTF_SETSRC, RTF_SETSRC },
+ { "INDIRECT", RTF_INDIRECT, RTF_INDIRECT },
{ NULL, 0, 0 }
};
@@ -685,40 +826,53 @@ ire_format(uintptr_t addr, const void *ire_arg, void *ire_cb_arg)
irep->ire_ipversion != ire_cb->ire_ipversion)
return (WALK_NEXT);
+ if (mdb_vread(&ill, sizeof (ill), (uintptr_t)irep->ire_ill) == -1) {
+ mdb_snprintf(ill_name, sizeof (ill_name), "--");
+ } else {
+ (void) mdb_readstr(ill_name,
+ MIN(LIFNAMSIZ, ill.ill_name_length),
+ (uintptr_t)ill.ill_name);
+ }
+
if (irep->ire_ipversion == IPV6_VERSION && verbose) {
- mdb_printf("%<b>%?p%</b> %40N <%hb>\n"
- "%?s %40N <%hb>\n"
- "%?s %40d %4d <%hb>\n",
- addr, &irep->ire_src_addr_v6, irep->ire_type, tmasks,
- "", &irep->ire_addr_v6, (ushort_t)irep->ire_marks, mmasks,
+ mdb_printf("%<b>%?p%</b>%3s %40N <%hb%s>\n"
+ "%?s %40N\n"
+ "%?s %40d %4d <%hb> %s\n",
+ addr, condemned ? "(C)" : "", &irep->ire_setsrc_addr_v6,
+ irep->ire_type, tmasks,
+ (irep->ire_testhidden ? ", HIDDEN" : ""),
+ "", &irep->ire_addr_v6,
"", ips_to_stackid((uintptr_t)irep->ire_ipst),
irep->ire_zoneid,
- irep->ire_flags, fmasks);
+ irep->ire_flags, fmasks, ill_name);
} else if (irep->ire_ipversion == IPV6_VERSION) {
- mdb_printf("%?p %30N %30N %5d %4d\n",
- addr, &irep->ire_src_addr_v6,
+ mdb_printf("%?p%3s %30N %30N %5d %4d %s\n",
+ addr, condemned ? "(C)" : "", &irep->ire_setsrc_addr_v6,
&irep->ire_addr_v6,
ips_to_stackid((uintptr_t)irep->ire_ipst),
- irep->ire_zoneid);
+ irep->ire_zoneid, ill_name);
} else if (verbose) {
- mdb_printf("%<b>%?p%</b> %40I <%hb>\n"
- "%?s %40I <%hb>\n"
- "%?s %40d %4d <%hb>\n",
- addr, irep->ire_src_addr, irep->ire_type, tmasks,
- "", irep->ire_addr, (ushort_t)irep->ire_marks, mmasks,
+ mdb_printf("%<b>%?p%</b>%3s %40I <%hb%s>\n"
+ "%?s %40I\n"
+ "%?s %40d %4d <%hb> %s\n",
+ addr, condemned ? "(C)" : "", irep->ire_setsrc_addr,
+ irep->ire_type, tmasks,
+ (irep->ire_testhidden ? ", HIDDEN" : ""),
+ "", irep->ire_addr,
"", ips_to_stackid((uintptr_t)irep->ire_ipst),
- irep->ire_zoneid, irep->ire_flags, fmasks);
+ irep->ire_zoneid, irep->ire_flags, fmasks, ill_name);
} else {
- mdb_printf("%?p %30I %30I %5d %4d\n", addr, irep->ire_src_addr,
+ mdb_printf("%?p%3s %30I %30I %5d %4d %s\n", addr,
+ condemned ? "(C)" : "", irep->ire_setsrc_addr,
irep->ire_addr, ips_to_stackid((uintptr_t)irep->ire_ipst),
- irep->ire_zoneid);
+ irep->ire_zoneid, ill_name);
}
return (WALK_NEXT);
@@ -1040,6 +1194,140 @@ ip6hdr(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
}
int
+nce(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
+{
+ nce_t nce;
+ nce_cbdata_t nce_cb;
+ int ipversion = 0;
+ const char *opt_P = NULL, *opt_ill;
+
+ if (mdb_getopts(argc, argv,
+ 'i', MDB_OPT_STR, &opt_ill,
+ 'P', MDB_OPT_STR, &opt_P, NULL) != argc)
+ return (DCMD_USAGE);
+
+ if (opt_P != NULL) {
+ if (strcmp("v4", opt_P) == 0) {
+ ipversion = IPV4_VERSION;
+ } else if (strcmp("v6", opt_P) == 0) {
+ ipversion = IPV6_VERSION;
+ } else {
+ mdb_warn("invalid protocol '%s'\n", opt_P);
+ return (DCMD_USAGE);
+ }
+ }
+
+ if ((flags & DCMD_LOOPFIRST) || !(flags & DCMD_LOOP)) {
+ mdb_printf("%<u>%?s %5s %18s %?s %s %s %</u>\n",
+ "ADDR", "INTF", "LLADDR", "FP_MP", "REFCNT",
+ "NCE_ADDR");
+ }
+
+ bzero(&nce_cb, sizeof (nce_cb));
+ if (opt_ill != NULL) {
+ strcpy(nce_cb.nce_ill_name, opt_ill);
+ }
+ nce_cb.nce_ipversion = ipversion;
+
+ if (flags & DCMD_ADDRSPEC) {
+ (void) mdb_vread(&nce, sizeof (nce_t), addr);
+ (void) nce_format(addr, &nce, &nce_cb);
+ } else if (mdb_walk("nce", (mdb_walk_cb_t)nce_format, &nce_cb) == -1) {
+ mdb_warn("failed to walk ire table");
+ return (DCMD_ERR);
+ }
+
+ return (DCMD_OK);
+}
+
+/* ARGSUSED */
+static int
+dce_format(uintptr_t addr, const dce_t *dcep, void *dce_cb_arg)
+{
+ static const mdb_bitmask_t dmasks[] = {
+ { "D", DCEF_DEFAULT, DCEF_DEFAULT },
+ { "P", DCEF_PMTU, DCEF_PMTU },
+ { "U", DCEF_UINFO, DCEF_UINFO },
+ { "S", DCEF_TOO_SMALL_PMTU, DCEF_TOO_SMALL_PMTU },
+ { NULL, 0, 0 }
+ };
+ char flagsbuf[2 * A_CNT(dmasks)];
+ int ipversion = *(int *)dce_cb_arg;
+ boolean_t condemned = dcep->dce_generation == DCE_GENERATION_CONDEMNED;
+
+ if (ipversion != 0 && ipversion != dcep->dce_ipversion)
+ return (WALK_NEXT);
+
+ mdb_snprintf(flagsbuf, sizeof (flagsbuf), "%b", dcep->dce_flags,
+ dmasks);
+
+ switch (dcep->dce_ipversion) {
+ case IPV4_VERSION:
+ mdb_printf("%<u>%?p%3s %8s %8d %30I %</u>\n", addr, condemned ?
+ "(C)" : "", flagsbuf, dcep->dce_pmtu, &dcep->dce_v4addr);
+ break;
+ case IPV6_VERSION:
+ mdb_printf("%<u>%?p%3s %8s %8d %30N %</u>\n", addr, condemned ?
+ "(C)" : "", flagsbuf, dcep->dce_pmtu, &dcep->dce_v6addr);
+ break;
+ default:
+ mdb_printf("%<u>%?p%3s %8s %8d %30s %</u>\n", addr, condemned ?
+ "(C)" : "", flagsbuf, dcep->dce_pmtu, "");
+ }
+
+ return (WALK_NEXT);
+}
+
+int
+dce(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
+{
+ dce_t dce;
+ const char *opt_P = NULL;
+ const char *zone_name = NULL;
+ ip_stack_t *ipst = NULL;
+ int ipversion = 0;
+
+ if (mdb_getopts(argc, argv,
+ 's', MDB_OPT_STR, &zone_name,
+ 'P', MDB_OPT_STR, &opt_P, NULL) != argc)
+ return (DCMD_USAGE);
+
+ /* Follow the specified zone name to find a ip_stack_t*. */
+ if (zone_name != NULL) {
+ ipst = zone_to_ips(zone_name);
+ if (ipst == NULL)
+ return (DCMD_USAGE);
+ }
+
+ if (opt_P != NULL) {
+ if (strcmp("v4", opt_P) == 0) {
+ ipversion = IPV4_VERSION;
+ } else if (strcmp("v6", opt_P) == 0) {
+ ipversion = IPV6_VERSION;
+ } else {
+ mdb_warn("invalid protocol '%s'\n", opt_P);
+ return (DCMD_USAGE);
+ }
+ }
+
+ if ((flags & DCMD_LOOPFIRST) || !(flags & DCMD_LOOP)) {
+ mdb_printf("%<u>%?s%3s %8s %8s %30s %</u>\n",
+ "ADDR", "", "FLAGS", "PMTU", "DST_ADDR");
+ }
+
+ if (flags & DCMD_ADDRSPEC) {
+ (void) mdb_vread(&dce, sizeof (dce_t), addr);
+ (void) dce_format(addr, &dce, &ipversion);
+ } else if (mdb_pwalk("dce", (mdb_walk_cb_t)dce_format, &ipversion,
+ (uintptr_t)ipst) == -1) {
+ mdb_warn("failed to walk dce cache");
+ return (DCMD_ERR);
+ }
+
+ return (DCMD_OK);
+}
+
+int
ire(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
{
uint_t verbose = FALSE;
@@ -1047,12 +1335,22 @@ ire(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
ire_cbdata_t ire_cb;
int ipversion = 0;
const char *opt_P = NULL;
+ const char *zone_name = NULL;
+ ip_stack_t *ipst = NULL;
if (mdb_getopts(argc, argv,
'v', MDB_OPT_SETBITS, TRUE, &verbose,
+ 's', MDB_OPT_STR, &zone_name,
'P', MDB_OPT_STR, &opt_P, NULL) != argc)
return (DCMD_USAGE);
+ /* Follow the specified zone name to find a ip_stack_t*. */
+ if (zone_name != NULL) {
+ ipst = zone_to_ips(zone_name);
+ if (ipst == NULL)
+ return (DCMD_USAGE);
+ }
+
if (opt_P != NULL) {
if (strcmp("v4", opt_P) == 0) {
ipversion = IPV4_VERSION;
@@ -1069,13 +1367,13 @@ ire(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
if (verbose) {
mdb_printf("%?s %40s %-20s%\n"
"%?s %40s %-20s%\n"
- "%<u>%?s %40s %4s %-20s%</u>\n",
+ "%<u>%?s %40s %4s %-20s %s%</u>\n",
"ADDR", "SRC", "TYPE",
"", "DST", "MARKS",
- "", "STACK", "ZONE", "FLAGS");
+ "", "STACK", "ZONE", "FLAGS", "INTF");
} else {
- mdb_printf("%<u>%?s %30s %30s %5s %4s%</u>\n",
- "ADDR", "SRC", "DST", "STACK", "ZONE");
+ mdb_printf("%<u>%?s %30s %30s %5s %4s %s%</u>\n",
+ "ADDR", "SRC", "DST", "STACK", "ZONE", "INTF");
}
}
@@ -1085,7 +1383,8 @@ ire(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
if (flags & DCMD_ADDRSPEC) {
(void) mdb_vread(&ire, sizeof (ire_t), addr);
(void) ire_format(addr, &ire, &ire_cb);
- } else if (mdb_walk("ire", (mdb_walk_cb_t)ire_format, &ire_cb) == -1) {
+ } else if (mdb_pwalk("ire", (mdb_walk_cb_t)ire_format, &ire_cb,
+ (uintptr_t)ipst) == -1) {
mdb_warn("failed to walk ire table");
return (DCMD_ERR);
}
@@ -1338,7 +1637,7 @@ th_trace(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
static void
th_trace_help(void)
{
- mdb_printf("If given an address of an ill_t, ipif_t, ire_t, or nce_t, "
+ mdb_printf("If given an address of an ill_t, ipif_t, ire_t, or ncec_t, "
"print the\n"
"corresponding th_trace_t structure in detail. Otherwise, if no "
"address is\n"
@@ -1354,8 +1653,8 @@ static const mdb_dcmd_t dcmds[] = {
{ "srcid_status", ":",
"display connection structures from ipcl hash tables",
srcid_status },
- { "ill", "?[-v] [-P v4 | v6]", "display ill_t structures",
- ill, ill_help },
+ { "ill", "?[-v] [-P v4 | v6] [-s exclusive-ip-zone-name]",
+ "display ill_t structures", ill, ill_help },
{ "illif", "?[-P v4 | v6]",
"display or filter IP Lower Level InterFace structures", illif,
illif_help },
@@ -1363,10 +1662,14 @@ static const mdb_dcmd_t dcmds[] = {
{ "ip6hdr", ":[-vf]", "display an IPv6 header", ip6hdr },
{ "ipif", "?[-v] [-P v4 | v6]", "display ipif structures",
ipif, ipif_help },
- { "ire", "?[-v] [-P v4|v6]",
+ { "ire", "?[-v] [-P v4|v6] [-s exclusive-ip-zone-name]",
"display Internet Route Entry structures", ire },
- { "nce", "?[-P v4 | v6]", "display Neighbor Cache Entry structures",
- nce },
+ { "nce", "?[-P v4|v6] [-i <interface>]",
+ "display interface-specific Neighbor Cache structures", nce },
+ { "ncec", "?[-P v4 | v6]", "display Neighbor Cache Entry structures",
+ ncec },
+ { "dce", "?[-P v4|v6] [-s exclusive-ip-zone-name]",
+ "display Destination Cache Entry structures", dce },
{ "squeue", ":[-v]", "print core squeue_t info", squeue,
ip_squeue_help },
{ "tcphdr", ":", "display a TCP header", tcphdr },
@@ -1385,7 +1688,7 @@ static const mdb_walker_t walkers[] = {
{ "illif_stack", "walk list of ill interface types",
illif_stack_walk_init, illif_stack_walk_step,
illif_stack_walk_fini },
- { "ill", "walk list of nce structures for all stacks",
+ { "ill", "walk active ill_t structures for all stacks",
ill_walk_init, ill_walk_step, NULL },
{ "ipif", "walk list of ipif structures for all stacks",
ipif_walk_init, ipif_walk_step, NULL },
@@ -1400,19 +1703,21 @@ static const mdb_walker_t walkers[] = {
&srcid_walk_arg },
{ "ire", "walk active ire_t structures",
ire_walk_init, ire_walk_step, NULL },
- { "ire_ctable", "walk ire_t structures in the ctable",
- ip_stacks_common_walk_init, ire_ctable_walk_step, NULL },
{ "ire_next", "walk ire_t structures in the ctable",
ire_next_walk_init, ire_next_walk_step, NULL },
+ { "nce", "walk active nce_t structures",
+ nce_walk_init, nce_walk_step, NULL },
+ { "dce", "walk active dce_t structures",
+ dce_walk_init, dce_walk_step, NULL },
{ "ip_stacks", "walk all the ip_stack_t",
ip_stacks_walk_init, ip_stacks_walk_step, NULL },
{ "th_hash", "walk all the th_hash_t entries",
th_hash_walk_init, th_hash_walk_step, NULL },
- { "nce", "walk list of nce structures for all stacks",
- ip_stacks_common_walk_init, nce_walk_step, NULL },
- { "nce_stack", "walk list of nce structures",
- nce_stack_walk_init, nce_stack_walk_step,
- nce_stack_walk_fini},
+ { "ncec", "walk list of ncec structures for all stacks",
+ ip_stacks_common_walk_init, ncec_walk_step, NULL },
+ { "ncec_stack", "walk list of ncec structures",
+ ncec_stack_walk_init, ncec_stack_walk_step,
+ ncec_stack_walk_fini},
{ "udp_hash", "walk list of conn_t structures in ips_ipcl_udp_fanout",
ipcl_hash_walk_init, ipcl_hash_walk_step,
ipcl_hash_walk_fini, &udp_hash_arg},
@@ -1471,9 +1776,9 @@ _mdb_fini(void)
}
static char *
-nce_state(int nce_state)
+ncec_state(int ncec_state)
{
- switch (nce_state) {
+ switch (ncec_state) {
case ND_UNCHANGED:
return ("unchanged");
case ND_INCOMPLETE:
@@ -1496,36 +1801,61 @@ nce_state(int nce_state)
}
static char *
-nce_l2_addr(const nce_t *nce, const ill_t *ill)
+ncec_l2_addr(const ncec_t *ncec, const ill_t *ill)
{
uchar_t *h;
static char addr_buf[L2MAXADDRSTRLEN];
- mblk_t mp;
- size_t mblen;
- if (ill->ill_flags & ILLF_XRESOLV) {
- return ("XRESOLV");
+ if (ncec->ncec_lladdr == NULL) {
+ return ("None");
}
- if (nce->nce_res_mp == NULL) {
+ if (ill->ill_net_type == IRE_IF_RESOLVER) {
+
+ if (ill->ill_phys_addr_length == 0)
+ return ("None");
+ h = mdb_zalloc(ill->ill_phys_addr_length, UM_SLEEP);
+ if (mdb_vread(h, ill->ill_phys_addr_length,
+ (uintptr_t)ncec->ncec_lladdr) == -1) {
+ mdb_warn("failed to read hwaddr at %p",
+ ncec->ncec_lladdr);
+ return ("Unknown");
+ }
+ mdb_mac_addr(h, ill->ill_phys_addr_length,
+ addr_buf, sizeof (addr_buf));
+ } else {
return ("None");
}
+ mdb_free(h, ill->ill_phys_addr_length);
+ return (addr_buf);
+}
- if (ill->ill_net_type == IRE_IF_RESOLVER) {
+static char *
+nce_l2_addr(const nce_t *nce, const ill_t *ill)
+{
+ uchar_t *h;
+ static char addr_buf[L2MAXADDRSTRLEN];
+ mblk_t mp;
+ size_t mblen;
+
+ if (nce->nce_dlur_mp == NULL)
+ return ("None");
+ if (ill->ill_net_type == IRE_IF_RESOLVER) {
if (mdb_vread(&mp, sizeof (mblk_t),
- (uintptr_t)nce->nce_res_mp) == -1) {
- mdb_warn("failed to read nce_res_mp at %p",
- nce->nce_res_mp);
+ (uintptr_t)nce->nce_dlur_mp) == -1) {
+ mdb_warn("failed to read nce_dlur_mp at %p",
+ nce->nce_dlur_mp);
+ return ("None");
}
-
- if (ill->ill_nd_lla_len == 0)
+ if (ill->ill_phys_addr_length == 0)
return ("None");
mblen = mp.b_wptr - mp.b_rptr;
if (mblen > (sizeof (dl_unitdata_req_t) + MAX_SAP_LEN) ||
- ill->ill_nd_lla_len > MAX_SAP_LEN ||
- NCE_LL_ADDR_OFFSET(ill) + ill->ill_nd_lla_len > mblen) {
- return ("Truncated");
+ ill->ill_phys_addr_length > MAX_SAP_LEN ||
+ (NCE_LL_ADDR_OFFSET(ill) +
+ ill->ill_phys_addr_length) > mblen) {
+ return ("Unknown");
}
h = mdb_zalloc(mblen, UM_SLEEP);
if (mdb_vread(h, mblen, (uintptr_t)(mp.b_rptr)) == -1) {
@@ -1533,8 +1863,8 @@ nce_l2_addr(const nce_t *nce, const ill_t *ill)
mp.b_rptr + NCE_LL_ADDR_OFFSET(ill));
return ("Unknown");
}
- mdb_mac_addr(h + NCE_LL_ADDR_OFFSET(ill), ill->ill_nd_lla_len,
- addr_buf, sizeof (addr_buf));
+ mdb_mac_addr(h + NCE_LL_ADDR_OFFSET(ill),
+ ill->ill_phys_addr_length, addr_buf, sizeof (addr_buf));
} else {
return ("None");
}
@@ -1543,7 +1873,7 @@ nce_l2_addr(const nce_t *nce, const ill_t *ill)
}
static void
-nce_header(uint_t flags)
+ncec_header(uint_t flags)
{
if ((flags & DCMD_LOOPFIRST) || !(flags & DCMD_LOOP)) {
@@ -1553,10 +1883,10 @@ nce_header(uint_t flags)
}
int
-nce(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
+ncec(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
{
- nce_t nce;
- nce_cbdata_t id;
+ ncec_t ncec;
+ ncec_cbdata_t id;
int ipversion = 0;
const char *opt_P = NULL;
@@ -1577,23 +1907,23 @@ nce(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
if (flags & DCMD_ADDRSPEC) {
- if (mdb_vread(&nce, sizeof (nce_t), addr) == -1) {
- mdb_warn("failed to read nce at %p\n", addr);
+ if (mdb_vread(&ncec, sizeof (ncec_t), addr) == -1) {
+ mdb_warn("failed to read ncec at %p\n", addr);
return (DCMD_ERR);
}
- if (ipversion != 0 && nce.nce_ipversion != ipversion) {
+ if (ipversion != 0 && ncec.ncec_ipversion != ipversion) {
mdb_printf("IP Version mismatch\n");
return (DCMD_ERR);
}
- nce_header(flags);
- return (nce_format(addr, &nce, ipversion));
+ ncec_header(flags);
+ return (ncec_format(addr, &ncec, ipversion));
} else {
- id.nce_addr = addr;
- id.nce_ipversion = ipversion;
- nce_header(flags);
- if (mdb_walk("nce", (mdb_walk_cb_t)nce_cb, &id) == -1) {
- mdb_warn("failed to walk nce table\n");
+ id.ncec_addr = addr;
+ id.ncec_ipversion = ipversion;
+ ncec_header(flags);
+ if (mdb_walk("ncec", (mdb_walk_cb_t)ncec_cb, &id) == -1) {
+ mdb_warn("failed to walk ncec table\n");
return (DCMD_ERR);
}
}
@@ -1601,10 +1931,10 @@ nce(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
}
static int
-nce_format(uintptr_t addr, const nce_t *nce, int ipversion)
+ncec_format(uintptr_t addr, const ncec_t *ncec, int ipversion)
{
- static const mdb_bitmask_t nce_flags[] = {
- { "P", NCE_F_PERMANENT, NCE_F_PERMANENT },
+ static const mdb_bitmask_t ncec_flags[] = {
+ { "P", NCE_F_NONUD, NCE_F_NONUD },
{ "R", NCE_F_ISROUTER, NCE_F_ISROUTER },
{ "N", NCE_F_NONUD, NCE_F_NONUD },
{ "A", NCE_F_ANYCAST, NCE_F_ANYCAST },
@@ -1613,15 +1943,15 @@ nce_format(uintptr_t addr, const nce_t *nce, int ipversion)
{ "B", NCE_F_BCAST, NCE_F_BCAST },
{ NULL, 0, 0 }
};
-#define NCE_MAX_FLAGS (sizeof (nce_flags) / sizeof (mdb_bitmask_t))
+#define NCE_MAX_FLAGS (sizeof (ncec_flags) / sizeof (mdb_bitmask_t))
struct in_addr nceaddr;
ill_t ill;
char ill_name[LIFNAMSIZ];
char flagsbuf[NCE_MAX_FLAGS];
- if (mdb_vread(&ill, sizeof (ill), (uintptr_t)nce->nce_ill) == -1) {
- mdb_warn("failed to read nce_ill at %p",
- nce->nce_ill);
+ if (mdb_vread(&ill, sizeof (ill), (uintptr_t)ncec->ncec_ill) == -1) {
+ mdb_warn("failed to read ncec_ill at %p",
+ ncec->ncec_ill);
return (DCMD_ERR);
}
@@ -1629,33 +1959,33 @@ nce_format(uintptr_t addr, const nce_t *nce, int ipversion)
(uintptr_t)ill.ill_name);
mdb_snprintf(flagsbuf, sizeof (flagsbuf), "%hb",
- nce->nce_flags, nce_flags);
+ ncec->ncec_flags, ncec_flags);
- if (ipversion != 0 && nce->nce_ipversion != ipversion)
+ if (ipversion != 0 && ncec->ncec_ipversion != ipversion)
return (DCMD_OK);
- if (nce->nce_ipversion == IPV4_VERSION) {
- IN6_V4MAPPED_TO_INADDR(&nce->nce_addr, &nceaddr);
+ if (ncec->ncec_ipversion == IPV4_VERSION) {
+ IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &nceaddr);
mdb_printf("%?p %-20s %-10s "
"%-8s "
"%-5s %I\n",
- addr, nce_l2_addr(nce, &ill),
- nce_state(nce->nce_state),
+ addr, ncec_l2_addr(ncec, &ill),
+ ncec_state(ncec->ncec_state),
flagsbuf,
ill_name, nceaddr.s_addr);
} else {
mdb_printf("%?p %-20s %-10s %-8s %-5s %N\n",
- addr, nce_l2_addr(nce, &ill),
- nce_state(nce->nce_state),
+ addr, ncec_l2_addr(ncec, &ill),
+ ncec_state(ncec->ncec_state),
flagsbuf,
- ill_name, &nce->nce_addr);
+ ill_name, &ncec->ncec_addr);
}
return (DCMD_OK);
}
static uintptr_t
-nce_get_next_hash_tbl(uintptr_t start, int *index, struct ndp_g_s ndp)
+ncec_get_next_hash_tbl(uintptr_t start, int *index, struct ndp_g_s ndp)
{
uintptr_t addr = start;
int i = *index;
@@ -1671,7 +2001,7 @@ nce_get_next_hash_tbl(uintptr_t start, int *index, struct ndp_g_s ndp)
}
static int
-nce_walk_step(mdb_walk_state_t *wsp)
+ncec_walk_step(mdb_walk_state_t *wsp)
{
uintptr_t kaddr4, kaddr6;
@@ -1686,15 +2016,15 @@ nce_walk_step(mdb_walk_state_t *wsp)
mdb_warn("can't read ips_ip_cache_table at %p", kaddr6);
return (WALK_ERR);
}
- if (mdb_pwalk("nce_stack", wsp->walk_callback, wsp->walk_cbdata,
+ if (mdb_pwalk("ncec_stack", wsp->walk_callback, wsp->walk_cbdata,
kaddr4) == -1) {
- mdb_warn("couldn't walk 'nce_stack' for ips_ndp4 %p",
+ mdb_warn("couldn't walk 'ncec_stack' for ips_ndp4 %p",
kaddr4);
return (WALK_ERR);
}
- if (mdb_pwalk("nce_stack", wsp->walk_callback,
+ if (mdb_pwalk("ncec_stack", wsp->walk_callback,
wsp->walk_cbdata, kaddr6) == -1) {
- mdb_warn("couldn't walk 'nce_stack' for ips_ndp6 %p",
+ mdb_warn("couldn't walk 'ncec_stack' for ips_ndp6 %p",
kaddr6);
return (WALK_ERR);
}
@@ -1743,7 +2073,7 @@ ipcl_hash_walk_init(mdb_walk_state_t *wsp)
mdb_free(iw, sizeof (ipcl_hash_walk_data_t));
return (WALK_ERR);
}
- if (arg->tbl_off == OFFSETOF(ip_stack_t, ips_ipcl_proto_fanout) ||
+ if (arg->tbl_off == OFFSETOF(ip_stack_t, ips_ipcl_proto_fanout_v4) ||
arg->tbl_off == OFFSETOF(ip_stack_t, ips_ipcl_proto_fanout_v6)) {
iw->hash_tbl_size = IPPROTO_MAX;
} else {
@@ -1809,72 +2139,75 @@ ipcl_hash_walk_fini(mdb_walk_state_t *wsp)
* Called with walk_addr being the address of ips_ndp{4,6}
*/
static int
-nce_stack_walk_init(mdb_walk_state_t *wsp)
+ncec_stack_walk_init(mdb_walk_state_t *wsp)
{
- nce_walk_data_t *nw;
+ ncec_walk_data_t *nw;
if (wsp->walk_addr == NULL) {
- mdb_warn("nce_stack requires ndp_g_s address\n");
+ mdb_warn("ncec_stack requires ndp_g_s address\n");
return (WALK_ERR);
}
- nw = mdb_alloc(sizeof (nce_walk_data_t), UM_SLEEP);
+ nw = mdb_alloc(sizeof (ncec_walk_data_t), UM_SLEEP);
- if (mdb_vread(&nw->nce_ip_ndp, sizeof (struct ndp_g_s),
+ if (mdb_vread(&nw->ncec_ip_ndp, sizeof (struct ndp_g_s),
wsp->walk_addr) == -1) {
mdb_warn("failed to read 'ip_ndp' at %p",
wsp->walk_addr);
- mdb_free(nw, sizeof (nce_walk_data_t));
+ mdb_free(nw, sizeof (ncec_walk_data_t));
return (WALK_ERR);
}
- nw->nce_hash_tbl_index = 0;
- wsp->walk_addr = nce_get_next_hash_tbl(NULL,
- &nw->nce_hash_tbl_index, nw->nce_ip_ndp);
+ /*
+ * ncec_get_next_hash_tbl() starts at ++i , so initialize index to -1
+ */
+ nw->ncec_hash_tbl_index = -1;
+ wsp->walk_addr = ncec_get_next_hash_tbl(NULL,
+ &nw->ncec_hash_tbl_index, nw->ncec_ip_ndp);
wsp->walk_data = nw;
return (WALK_NEXT);
}
static int
-nce_stack_walk_step(mdb_walk_state_t *wsp)
+ncec_stack_walk_step(mdb_walk_state_t *wsp)
{
uintptr_t addr = wsp->walk_addr;
- nce_walk_data_t *nw = wsp->walk_data;
+ ncec_walk_data_t *nw = wsp->walk_data;
if (addr == NULL)
return (WALK_DONE);
- if (mdb_vread(&nw->nce, sizeof (nce_t), addr) == -1) {
- mdb_warn("failed to read nce_t at %p", addr);
+ if (mdb_vread(&nw->ncec, sizeof (ncec_t), addr) == -1) {
+ mdb_warn("failed to read ncec_t at %p", addr);
return (WALK_ERR);
}
- wsp->walk_addr = (uintptr_t)nw->nce.nce_next;
+ wsp->walk_addr = (uintptr_t)nw->ncec.ncec_next;
- wsp->walk_addr = nce_get_next_hash_tbl(wsp->walk_addr,
- &nw->nce_hash_tbl_index, nw->nce_ip_ndp);
+ wsp->walk_addr = ncec_get_next_hash_tbl(wsp->walk_addr,
+ &nw->ncec_hash_tbl_index, nw->ncec_ip_ndp);
return (wsp->walk_callback(addr, nw, wsp->walk_cbdata));
}
static void
-nce_stack_walk_fini(mdb_walk_state_t *wsp)
+ncec_stack_walk_fini(mdb_walk_state_t *wsp)
{
- mdb_free(wsp->walk_data, sizeof (nce_walk_data_t));
+ mdb_free(wsp->walk_data, sizeof (ncec_walk_data_t));
}
/* ARGSUSED */
static int
-nce_cb(uintptr_t addr, const nce_walk_data_t *iw, nce_cbdata_t *id)
+ncec_cb(uintptr_t addr, const ncec_walk_data_t *iw, ncec_cbdata_t *id)
{
- nce_t nce;
+ ncec_t ncec;
- if (mdb_vread(&nce, sizeof (nce_t), addr) == -1) {
- mdb_warn("failed to read nce at %p", addr);
+ if (mdb_vread(&ncec, sizeof (ncec_t), addr) == -1) {
+ mdb_warn("failed to read ncec at %p", addr);
return (WALK_NEXT);
}
- (void) nce_format(addr, &nce, id->nce_ipversion);
+ (void) ncec_format(addr, &ncec, id->ncec_ipversion);
return (WALK_NEXT);
}
@@ -1918,6 +2251,11 @@ ill_cb(uintptr_t addr, const ill_walk_data_t *iw, ill_cbdata_t *id)
mdb_warn("failed to read ill at %p", addr);
return (WALK_NEXT);
}
+
+ /* If ip_stack_t is specified, skip ILLs that don't belong to it. */
+ if (id->ill_ipst != NULL && ill.ill_ipst != id->ill_ipst)
+ return (WALK_NEXT);
+
return (ill_format((uintptr_t)addr, &ill, id));
}
@@ -2013,7 +2351,7 @@ ill_format(uintptr_t addr, const void *illptr, void *ill_cb_arg)
break;
}
cnt = ill->ill_refcnt + ill->ill_ire_cnt + ill->ill_nce_cnt +
- ill->ill_ilm_walker_cnt + ill->ill_ilm_cnt;
+ ill->ill_ilm_cnt + ill->ill_ncec_cnt;
mdb_printf("%-?p %-8s %-3s ",
addr, ill_name, ill->ill_isv6 ? "v6" : "v4");
if (typebuf != NULL)
@@ -2035,11 +2373,10 @@ ill_format(uintptr_t addr, const void *illptr, void *ill_cb_arg)
strlen(sbuf), "", ill->ill_ire_cnt, "ill_ire_cnt");
mdb_printf("%*s %7d %-18s nces referencing this ill\n",
strlen(sbuf), "", ill->ill_nce_cnt, "ill_nce_cnt");
+ mdb_printf("%*s %7d %-18s ncecs referencing this ill\n",
+ strlen(sbuf), "", ill->ill_ncec_cnt, "ill_ncec_cnt");
mdb_printf("%*s %7d %-18s ilms referencing this ill\n",
strlen(sbuf), "", ill->ill_ilm_cnt, "ill_ilm_cnt");
- mdb_printf("%*s %7d %-18s active ilm walkers\n\n",
- strlen(sbuf), "", ill->ill_ilm_walker_cnt,
- "ill_ilm_walker_cnt");
} else {
mdb_printf("%4d %-?p %-llb\n",
cnt, ill->ill_wq,
@@ -2054,14 +2391,24 @@ ill(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
ill_t ill_data;
ill_cbdata_t id;
int ipversion = 0;
+ const char *zone_name = NULL;
const char *opt_P = NULL;
uint_t verbose = FALSE;
+ ip_stack_t *ipst = NULL;
if (mdb_getopts(argc, argv,
'v', MDB_OPT_SETBITS, TRUE, &verbose,
+ 's', MDB_OPT_STR, &zone_name,
'P', MDB_OPT_STR, &opt_P, NULL) != argc)
return (DCMD_USAGE);
+ /* Follow the specified zone name to find a ip_stack_t*. */
+ if (zone_name != NULL) {
+ ipst = zone_to_ips(zone_name);
+ if (ipst == NULL)
+ return (DCMD_USAGE);
+ }
+
if (opt_P != NULL) {
if (strcmp("v4", opt_P) == 0) {
ipversion = IPV4_VERSION;
@@ -2076,6 +2423,7 @@ ill(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
id.verbose = verbose;
id.ill_addr = addr;
id.ill_ipversion = ipversion;
+ id.ill_ipst = ipst;
ill_header(verbose);
if (flags & DCMD_ADDRSPEC) {
@@ -2254,7 +2602,6 @@ ipif_format(uintptr_t addr, const void *ipifptr, void *ipif_cb_arg)
{ "CO", IPIF_CONDEMNED, IPIF_CONDEMNED},
{ "CH", IPIF_CHANGING, IPIF_CHANGING},
{ "SL", IPIF_SET_LINKLOCAL, IPIF_SET_LINKLOCAL},
- { "ZS", IPIF_ZERO_SOURCE, IPIF_ZERO_SOURCE},
{ NULL, 0, 0 }
};
static const mdb_bitmask_t fmasks[] = {
@@ -2299,16 +2646,14 @@ ipif_format(uintptr_t addr, const void *ipifptr, void *ipif_cb_arg)
}
mdb_snprintf(bitfields, sizeof (bitfields), "%s",
ipif->ipif_addr_ready ? ",ADR" : "",
- ipif->ipif_multicast_up ? ",MU" : "",
ipif->ipif_was_up ? ",WU" : "",
- ipif->ipif_was_dup ? ",WD" : "",
- ipif->ipif_joined_allhosts ? ",JA" : "");
+ ipif->ipif_was_dup ? ",WD" : "");
mdb_snprintf(flagsbuf, sizeof (flagsbuf), "%llb%s",
ipif->ipif_flags, fmasks, bitfields);
mdb_snprintf(sflagsbuf, sizeof (sflagsbuf), "%b",
ipif->ipif_state_flags, sfmasks);
- cnt = ipif->ipif_refcnt + ipif->ipif_ire_cnt + ipif->ipif_ilm_cnt;
+ cnt = ipif->ipif_refcnt;
if (ipifcb->ill.ill_isv6) {
mdb_snprintf(addrstr, sizeof (addrstr), "%N",
@@ -2329,12 +2674,6 @@ ipif_format(uintptr_t addr, const void *ipifptr, void *ipif_cb_arg)
mdb_printf("%s |\n%s +---> %4d %-15s "
"Active consistent reader cnt\n",
sbuf, sbuf, ipif->ipif_refcnt, "ipif_refcnt");
- mdb_printf("%*s %10d %-15s "
- "Number of ire's referencing this ipif\n",
- strlen(sbuf), "", ipif->ipif_ire_cnt, "ipif_ire_cnt");
- mdb_printf("%*s %10d %-15s "
- "Number of ilm's referencing this ipif\n\n",
- strlen(sbuf), "", ipif->ipif_ilm_cnt, "ipif_ilm_cnt");
mdb_printf("%-s/%d\n",
addrstr, mask_to_prefixlen(af, &ipif->ipif_v6net_mask));
if (ipifcb->ill.ill_isv6) {
@@ -2473,16 +2812,16 @@ conn_status_cb(uintptr_t addr, const void *walk_data,
mdb_printf("%-?p %-?p %?d %?d\n", addr, conn->conn_wq,
nss.netstack_stackid, conn->conn_zoneid);
- if (conn->conn_af_isv6) {
+ if (conn->conn_family == AF_INET6) {
mdb_snprintf(src_addrstr, sizeof (rem_addrstr), "%N",
- &conn->conn_srcv6);
+ &conn->conn_laddr_v6);
mdb_snprintf(rem_addrstr, sizeof (rem_addrstr), "%N",
- &conn->conn_remv6);
+ &conn->conn_faddr_v6);
} else {
mdb_snprintf(src_addrstr, sizeof (src_addrstr), "%I",
- V4_PART_OF_V6((conn->conn_srcv6)));
+ V4_PART_OF_V6((conn->conn_laddr_v6)));
mdb_snprintf(rem_addrstr, sizeof (rem_addrstr), "%I",
- V4_PART_OF_V6((conn->conn_remv6)));
+ V4_PART_OF_V6((conn->conn_faddr_v6)));
}
mdb_printf("%s:%-5d\n%s:%-5d\n",
src_addrstr, conn->conn_lport, rem_addrstr, conn->conn_fport);
@@ -2519,7 +2858,7 @@ conn_status_help(void)
{
mdb_printf("Prints conn_t structures from the following hash tables: "
"\n\tips_ipcl_udp_fanout\n\tips_ipcl_bind_fanout"
- "\n\tips_ipcl_conn_fanout\n\tips_ipcl_proto_fanout"
+ "\n\tips_ipcl_conn_fanout\n\tips_ipcl_proto_fanout_v4"
"\n\tips_ipcl_proto_fanout_v6\n");
}
diff --git a/usr/src/cmd/mdb/common/modules/sctp/sctp.c b/usr/src/cmd/mdb/common/modules/sctp/sctp.c
index 05f0c385c8..4165a56ca4 100644
--- a/usr/src/cmd/mdb/common/modules/sctp/sctp.c
+++ b/usr/src/cmd/mdb/common/modules/sctp/sctp.c
@@ -20,12 +20,10 @@
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/stream.h>
#include <sys/mdb_modapi.h>
@@ -164,7 +162,7 @@ sctp_faddr(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
mdb_printf("lastactive\t%?ld\thb_secret\t%?#lx\n", fa->lastactive,
fa->hb_secret);
mdb_printf("rxt_unacked\t%?u\n", fa->rxt_unacked);
- mdb_printf("timer_mp\t%?p\tire\t\t%?p\n", fa->timer_mp, fa->ire);
+ mdb_printf("timer_mp\t%?p\tixa\t\t%?p\n", fa->timer_mp, fa->ixa);
mdb_printf("hb_enabled\t%?d\thb_pending\t%?d\n"
"timer_running\t%?d\tdf\t\t%?d\n"
"pmtu_discovered\t%?d\tisv4\t\t%?d\n"
@@ -566,11 +564,12 @@ show_sctp_flags(sctp_t *sctp)
{
mdb_printf("\tunderstands_asconf\t%d\n",
sctp->sctp_understands_asconf);
- mdb_printf("\tdebug\t\t\t%d\n", sctp->sctp_debug);
+ mdb_printf("\tdebug\t\t\t%d\n", sctp->sctp_connp->conn_debug);
mdb_printf("\tcchunk_pend\t\t%d\n", sctp->sctp_cchunk_pend);
- mdb_printf("\tdgram_errind\t\t%d\n", sctp->sctp_dgram_errind);
+ mdb_printf("\tdgram_errind\t\t%d\n",
+ sctp->sctp_connp->conn_dgram_errind);
- mdb_printf("\tlinger\t\t\t%d\n", sctp->sctp_linger);
+ mdb_printf("\tlinger\t\t\t%d\n", sctp->sctp_connp->conn_linger);
if (sctp->sctp_lingering)
return;
mdb_printf("\tlingering\t\t%d\n", sctp->sctp_lingering);
@@ -578,7 +577,8 @@ show_sctp_flags(sctp_t *sctp)
mdb_printf("\tforce_sack\t\t%d\n", sctp->sctp_force_sack);
mdb_printf("\tack_timer_runing\t%d\n", sctp->sctp_ack_timer_running);
- mdb_printf("\trecvdstaddr\t\t%d\n", sctp->sctp_recvdstaddr);
+ mdb_printf("\trecvdstaddr\t\t%d\n",
+ sctp->sctp_connp->conn_recv_ancillary.crb_recvdstaddr);
mdb_printf("\thwcksum\t\t\t%d\n", sctp->sctp_hwcksum);
mdb_printf("\tunderstands_addip\t%d\n", sctp->sctp_understands_addip);
@@ -654,8 +654,8 @@ print_saddr(uintptr_t ptr, const void *addr, void *cbdata)
if (saddr->saddr_ipif_delete_pending == 1)
mdb_printf("/DeletePending");
mdb_printf(")\n");
- mdb_printf("\t\t\tMTU %d id %d zoneid %d IPIF flags %x\n",
- ipif.sctp_ipif_mtu, ipif.sctp_ipif_id,
+ mdb_printf("\t\t\tid %d zoneid %d IPIF flags %x\n",
+ ipif.sctp_ipif_id,
ipif.sctp_ipif_zoneid, ipif.sctp_ipif_flags);
return (WALK_NEXT);
}
@@ -682,8 +682,8 @@ print_faddr(uintptr_t ptr, const void *addr, void *cbdata)
int
sctp(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
{
- sctp_t sctp;
- conn_t connp;
+ sctp_t sctps, *sctp;
+ conn_t conns, *connp;
int i;
uint_t opts = 0;
uint_t paddr = 0;
@@ -692,16 +692,23 @@ sctp(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
if (!(flags & DCMD_ADDRSPEC))
return (DCMD_USAGE);
- if (mdb_vread(&sctp, sizeof (sctp), addr) == -1) {
+ if (mdb_vread(&sctps, sizeof (sctps), addr) == -1) {
mdb_warn("failed to read sctp_t at: %p\n", addr);
return (DCMD_ERR);
}
- if (mdb_vread(&connp, sizeof (connp),
- (uintptr_t)sctp.sctp_connp) == -1) {
- mdb_warn("failed to read conn_t at: %p\n", sctp.sctp_connp);
+ sctp = &sctps;
+
+ if (mdb_vread(&conns, sizeof (conns),
+ (uintptr_t)sctp->sctp_connp) == -1) {
+ mdb_warn("failed to read conn_t at: %p\n", sctp->sctp_connp);
return (DCMD_ERR);
}
+ connp = &conns;
+
+ connp->conn_sctp = sctp;
+ sctp->sctp_connp = connp;
+
if (mdb_getopts(argc, argv,
'a', MDB_OPT_SETBITS, MDB_SCTP_SHOW_ALL, &opts,
'f', MDB_OPT_SETBITS, MDB_SCTP_SHOW_FLAGS, &opts,
@@ -726,7 +733,7 @@ sctp(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
/* non-verbose faddrs, suitable for pipelines to sctp_faddr */
if (paddr != 0) {
sctp_faddr_t faddr, *fp;
- for (fp = sctp.sctp_faddrs; fp != NULL; fp = faddr.next) {
+ for (fp = sctp->sctp_faddrs; fp != NULL; fp = faddr.next) {
if (mdb_vread(&faddr, sizeof (faddr), (uintptr_t)fp)
== -1) {
mdb_warn("failed to read faddr at %p",
@@ -738,16 +745,16 @@ sctp(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
return (DCMD_OK);
}
- mdb_nhconvert(&lport, &sctp.sctp_lport, sizeof (lport));
- mdb_nhconvert(&fport, &sctp.sctp_fport, sizeof (fport));
+ mdb_nhconvert(&lport, &connp->conn_lport, sizeof (lport));
+ mdb_nhconvert(&fport, &connp->conn_fport, sizeof (fport));
mdb_printf("%<u>%p% %22s S=%-6hu D=%-6hu% STACK=%d ZONE=%d%</u>", addr,
- state2str(&sctp), lport, fport,
- ns_to_stackid((uintptr_t)connp.conn_netstack), connp.conn_zoneid);
+ state2str(sctp), lport, fport,
+ ns_to_stackid((uintptr_t)connp->conn_netstack), connp->conn_zoneid);
- if (sctp.sctp_faddrs) {
+ if (sctp->sctp_faddrs) {
sctp_faddr_t faddr;
if (mdb_vread(&faddr, sizeof (faddr),
- (uintptr_t)sctp.sctp_faddrs) != -1)
+ (uintptr_t)sctp->sctp_faddrs) != -1)
mdb_printf("%<u> %N%</u>", &faddr.faddr);
}
mdb_printf("\n");
@@ -756,78 +763,78 @@ sctp(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
mdb_printf("%<b>Local and Peer Addresses%</b>\n");
/* Display source addresses */
- mdb_printf("nsaddrs\t\t%?d\n", sctp.sctp_nsaddrs);
+ mdb_printf("nsaddrs\t\t%?d\n", sctp->sctp_nsaddrs);
(void) mdb_pwalk("sctp_walk_saddr", print_saddr, NULL, addr);
/* Display peer addresses */
- mdb_printf("nfaddrs\t\t%?d\n", sctp.sctp_nfaddrs);
+ mdb_printf("nfaddrs\t\t%?d\n", sctp->sctp_nfaddrs);
i = 1;
(void) mdb_pwalk("sctp_walk_faddr", print_faddr, &i, addr);
mdb_printf("lastfaddr\t%?p\tprimary\t\t%?p\n",
- sctp.sctp_lastfaddr, sctp.sctp_primary);
+ sctp->sctp_lastfaddr, sctp->sctp_primary);
mdb_printf("current\t\t%?p\tlastdata\t%?p\n",
- sctp.sctp_current, sctp.sctp_lastdata);
+ sctp->sctp_current, sctp->sctp_lastdata);
}
if (opts & MDB_SCTP_SHOW_OUT) {
mdb_printf("%<b>Outbound Data%</b>\n");
mdb_printf("xmit_head\t%?p\txmit_tail\t%?p\n",
- sctp.sctp_xmit_head, sctp.sctp_xmit_tail);
+ sctp->sctp_xmit_head, sctp->sctp_xmit_tail);
mdb_printf("xmit_unsent\t%?p\txmit_unsent_tail%?p\n",
- sctp.sctp_xmit_unsent, sctp.sctp_xmit_unsent_tail);
- mdb_printf("xmit_unacked\t%?p\n", sctp.sctp_xmit_unacked);
+ sctp->sctp_xmit_unsent, sctp->sctp_xmit_unsent_tail);
+ mdb_printf("xmit_unacked\t%?p\n", sctp->sctp_xmit_unacked);
mdb_printf("unacked\t\t%?u\tunsent\t\t%?ld\n",
- sctp.sctp_unacked, sctp.sctp_unsent);
+ sctp->sctp_unacked, sctp->sctp_unsent);
mdb_printf("ltsn\t\t%?x\tlastack_rxd\t%?x\n",
- sctp.sctp_ltsn, sctp.sctp_lastack_rxd);
+ sctp->sctp_ltsn, sctp->sctp_lastack_rxd);
mdb_printf("recovery_tsn\t%?x\tadv_pap\t\t%?x\n",
- sctp.sctp_recovery_tsn, sctp.sctp_adv_pap);
+ sctp->sctp_recovery_tsn, sctp->sctp_adv_pap);
mdb_printf("num_ostr\t%?hu\tostrcntrs\t%?p\n",
- sctp.sctp_num_ostr, sctp.sctp_ostrcntrs);
+ sctp->sctp_num_ostr, sctp->sctp_ostrcntrs);
mdb_printf("pad_mp\t\t%?p\terr_chunks\t%?p\n",
- sctp.sctp_pad_mp, sctp.sctp_err_chunks);
- mdb_printf("err_len\t\t%?u\n", sctp.sctp_err_len);
+ sctp->sctp_pad_mp, sctp->sctp_err_chunks);
+ mdb_printf("err_len\t\t%?u\n", sctp->sctp_err_len);
mdb_printf("%<b>Default Send Parameters%</b>\n");
mdb_printf("def_stream\t%?u\tdef_flags\t%?x\n",
- sctp.sctp_def_stream, sctp.sctp_def_flags);
+ sctp->sctp_def_stream, sctp->sctp_def_flags);
mdb_printf("def_ppid\t%?x\tdef_context\t%?x\n",
- sctp.sctp_def_ppid, sctp.sctp_def_context);
+ sctp->sctp_def_ppid, sctp->sctp_def_context);
mdb_printf("def_timetolive\t%?u\n",
- sctp.sctp_def_timetolive);
+ sctp->sctp_def_timetolive);
}
if (opts & MDB_SCTP_SHOW_IN) {
mdb_printf("%<b>Inbound Data%</b>\n");
mdb_printf("sack_info\t%?p\tsack_gaps\t%?d\n",
- sctp.sctp_sack_info, sctp.sctp_sack_gaps);
- dump_sack_info((uintptr_t)sctp.sctp_sack_info);
+ sctp->sctp_sack_info, sctp->sctp_sack_gaps);
+ dump_sack_info((uintptr_t)sctp->sctp_sack_info);
mdb_printf("ftsn\t\t%?x\tlastacked\t%?x\n",
- sctp.sctp_ftsn, sctp.sctp_lastacked);
+ sctp->sctp_ftsn, sctp->sctp_lastacked);
mdb_printf("istr_nmsgs\t%?d\tsack_toggle\t%?d\n",
- sctp.sctp_istr_nmsgs, sctp.sctp_sack_toggle);
- mdb_printf("ack_mp\t\t%?p\n", sctp.sctp_ack_mp);
+ sctp->sctp_istr_nmsgs, sctp->sctp_sack_toggle);
+ mdb_printf("ack_mp\t\t%?p\n", sctp->sctp_ack_mp);
mdb_printf("num_istr\t%?hu\tinstr\t\t%?p\n",
- sctp.sctp_num_istr, sctp.sctp_instr);
- mdb_printf("unord_reass\t%?p\n", sctp.sctp_uo_frags);
+ sctp->sctp_num_istr, sctp->sctp_instr);
+ mdb_printf("unord_reass\t%?p\n", sctp->sctp_uo_frags);
}
if (opts & MDB_SCTP_SHOW_RTT) {
mdb_printf("%<b>RTT Tracking%</b>\n");
mdb_printf("rtt_tsn\t\t%?x\tout_time\t%?ld\n",
- sctp.sctp_rtt_tsn, sctp.sctp_out_time);
+ sctp->sctp_rtt_tsn, sctp->sctp_out_time);
}
if (opts & MDB_SCTP_SHOW_FLOW) {
mdb_printf("%<b>Flow Control%</b>\n");
- mdb_printf("txmit_hiwater\t%?d\n"
- "xmit_lowater\t%?d\tfrwnd\t\t%?u\n"
+ mdb_printf("tconn_sndbuf\t%?d\n"
+ "conn_sndlowat\t%?d\tfrwnd\t\t%?u\n"
"rwnd\t\t%?u\tinitial rwnd\t%?u\n"
- "rxqueued\t%?u\tcwnd_max\t%?u\n", sctp.sctp_xmit_hiwater,
- sctp.sctp_xmit_lowater, sctp.sctp_frwnd,
- sctp.sctp_rwnd, sctp.sctp_irwnd, sctp.sctp_rxqueued,
- sctp.sctp_cwnd_max);
+ "rxqueued\t%?u\tcwnd_max\t%?u\n", connp->conn_sndbuf,
+ connp->conn_sndlowat, sctp->sctp_frwnd,
+ sctp->sctp_rwnd, sctp->sctp_irwnd, sctp->sctp_rxqueued,
+ sctp->sctp_cwnd_max);
}
if (opts & MDB_SCTP_SHOW_HDR) {
@@ -838,21 +845,21 @@ sctp(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
"ipha\t\t%?p\tip6h\t\t%?p\n"
"ip_hdr_len\t%?d\tip_hdr6_len\t%?d\n"
"sctph\t\t%?p\tsctph6\t\t%?p\n"
- "lvtag\t\t%?x\tfvtag\t\t%?x\n", sctp.sctp_iphc,
- sctp.sctp_iphc6, sctp.sctp_iphc_len,
- sctp.sctp_iphc6_len, sctp.sctp_hdr_len,
- sctp.sctp_hdr6_len, sctp.sctp_ipha, sctp.sctp_ip6h,
- sctp.sctp_ip_hdr_len, sctp.sctp_ip_hdr6_len,
- sctp.sctp_sctph, sctp.sctp_sctph6, sctp.sctp_lvtag,
- sctp.sctp_fvtag);
+ "lvtag\t\t%?x\tfvtag\t\t%?x\n", sctp->sctp_iphc,
+ sctp->sctp_iphc6, sctp->sctp_iphc_len,
+ sctp->sctp_iphc6_len, sctp->sctp_hdr_len,
+ sctp->sctp_hdr6_len, sctp->sctp_ipha, sctp->sctp_ip6h,
+ sctp->sctp_ip_hdr_len, sctp->sctp_ip_hdr6_len,
+ sctp->sctp_sctph, sctp->sctp_sctph6, sctp->sctp_lvtag,
+ sctp->sctp_fvtag);
}
if (opts & MDB_SCTP_SHOW_PMTUD) {
mdb_printf("%<b>PMTUd%</b>\n");
mdb_printf("last_mtu_probe\t%?ld\tmtu_probe_intvl\t%?ld\n"
"mss\t\t%?u\n",
- sctp.sctp_last_mtu_probe, sctp.sctp_mtu_probe_intvl,
- sctp.sctp_mss);
+ sctp->sctp_last_mtu_probe, sctp->sctp_mtu_probe_intvl,
+ sctp->sctp_mss);
}
if (opts & MDB_SCTP_SHOW_RXT) {
@@ -862,33 +869,33 @@ sctp(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
"pp_max_rxt\t%?d\trto_max\t\t%?u\n"
"rto_min\t\t%?u\trto_initial\t%?u\n"
"init_rto_max\t%?u\n"
- "rxt_nxttsn\t%?u\trxt_maxtsn\t%?u\n", sctp.sctp_cookie_mp,
- sctp.sctp_strikes, sctp.sctp_max_init_rxt,
- sctp.sctp_pa_max_rxt, sctp.sctp_pp_max_rxt,
- sctp.sctp_rto_max, sctp.sctp_rto_min,
- sctp.sctp_rto_initial, sctp.sctp_init_rto_max,
- sctp.sctp_rxt_nxttsn, sctp.sctp_rxt_maxtsn);
+ "rxt_nxttsn\t%?u\trxt_maxtsn\t%?u\n", sctp->sctp_cookie_mp,
+ sctp->sctp_strikes, sctp->sctp_max_init_rxt,
+ sctp->sctp_pa_max_rxt, sctp->sctp_pp_max_rxt,
+ sctp->sctp_rto_max, sctp->sctp_rto_min,
+ sctp->sctp_rto_initial, sctp->sctp_init_rto_max,
+ sctp->sctp_rxt_nxttsn, sctp->sctp_rxt_maxtsn);
}
if (opts & MDB_SCTP_SHOW_CONN) {
mdb_printf("%<b>Connection State%</b>\n");
mdb_printf("last_secret_update%?ld\n",
- sctp.sctp_last_secret_update);
+ sctp->sctp_last_secret_update);
mdb_printf("secret\t\t");
for (i = 0; i < SCTP_SECRET_LEN; i++) {
if (i % 2 == 0)
- mdb_printf("0x%02x", sctp.sctp_secret[i]);
+ mdb_printf("0x%02x", sctp->sctp_secret[i]);
else
- mdb_printf("%02x ", sctp.sctp_secret[i]);
+ mdb_printf("%02x ", sctp->sctp_secret[i]);
}
mdb_printf("\n");
mdb_printf("old_secret\t");
for (i = 0; i < SCTP_SECRET_LEN; i++) {
if (i % 2 == 0)
- mdb_printf("0x%02x", sctp.sctp_old_secret[i]);
+ mdb_printf("0x%02x", sctp->sctp_old_secret[i]);
else
- mdb_printf("%02x ", sctp.sctp_old_secret[i]);
+ mdb_printf("%02x ", sctp->sctp_old_secret[i]);
}
mdb_printf("\n");
}
@@ -901,40 +908,40 @@ sctp(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
"T2expire\t%?lu\tT3expire\t%?lu\n"
"msgcount\t%?llu\tprsctpdrop\t%?llu\n"
"AssocStartTime\t%?lu\n",
- sctp.sctp_opkts, sctp.sctp_obchunks,
- sctp.sctp_odchunks, sctp.sctp_oudchunks,
- sctp.sctp_rxtchunks, sctp.sctp_T1expire,
- sctp.sctp_T2expire, sctp.sctp_T3expire,
- sctp.sctp_msgcount, sctp.sctp_prsctpdrop,
- sctp.sctp_assoc_start_time);
+ sctp->sctp_opkts, sctp->sctp_obchunks,
+ sctp->sctp_odchunks, sctp->sctp_oudchunks,
+ sctp->sctp_rxtchunks, sctp->sctp_T1expire,
+ sctp->sctp_T2expire, sctp->sctp_T3expire,
+ sctp->sctp_msgcount, sctp->sctp_prsctpdrop,
+ sctp->sctp_assoc_start_time);
mdb_printf("ipkts\t\t%?llu\tibchunks\t%?llu\n"
"idchunks\t%?llu\tiudchunks\t%?llu\n"
"fragdmsgs\t%?llu\treassmsgs\t%?llu\n",
- sctp.sctp_ipkts, sctp.sctp_ibchunks,
- sctp.sctp_idchunks, sctp.sctp_iudchunks,
- sctp.sctp_fragdmsgs, sctp.sctp_reassmsgs);
+ sctp->sctp_ipkts, sctp->sctp_ibchunks,
+ sctp->sctp_idchunks, sctp->sctp_iudchunks,
+ sctp->sctp_fragdmsgs, sctp->sctp_reassmsgs);
}
if (opts & MDB_SCTP_SHOW_HASH) {
mdb_printf("%<b>Hash Tables%</b>\n");
- mdb_printf("conn_hash_next\t%?p\t", sctp.sctp_conn_hash_next);
- mdb_printf("conn_hash_prev\t%?p\n", sctp.sctp_conn_hash_prev);
+ mdb_printf("conn_hash_next\t%?p\t", sctp->sctp_conn_hash_next);
+ mdb_printf("conn_hash_prev\t%?p\n", sctp->sctp_conn_hash_prev);
mdb_printf("listen_hash_next%?p\t",
- sctp.sctp_listen_hash_next);
+ sctp->sctp_listen_hash_next);
mdb_printf("listen_hash_prev%?p\n",
- sctp.sctp_listen_hash_prev);
- mdb_nhconvert(&lport, &sctp.sctp_lport, sizeof (lport));
+ sctp->sctp_listen_hash_prev);
+ mdb_nhconvert(&lport, &connp->conn_lport, sizeof (lport));
mdb_printf("[ listen_hash bucket\t%?d ]\n",
SCTP_LISTEN_HASH(lport));
- mdb_printf("conn_tfp\t%?p\t", sctp.sctp_conn_tfp);
- mdb_printf("listen_tfp\t%?p\n", sctp.sctp_listen_tfp);
+ mdb_printf("conn_tfp\t%?p\t", sctp->sctp_conn_tfp);
+ mdb_printf("listen_tfp\t%?p\n", sctp->sctp_listen_tfp);
mdb_printf("bind_hash\t%?p\tptpbhn\t\t%?p\n",
- sctp.sctp_bind_hash, sctp.sctp_ptpbhn);
+ sctp->sctp_bind_hash, sctp->sctp_ptpbhn);
mdb_printf("bind_lockp\t%?p\n",
- sctp.sctp_bind_lockp);
+ sctp->sctp_bind_lockp);
mdb_printf("[ bind_hash bucket\t%?d ]\n",
SCTP_BIND_HASH(lport));
}
@@ -943,8 +950,8 @@ sctp(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
mdb_printf("%<b>Cleanup / Close%</b>\n");
mdb_printf("shutdown_faddr\t%?p\tclient_errno\t%?d\n"
"lingertime\t%?d\trefcnt\t\t%?hu\n",
- sctp.sctp_shutdown_faddr, sctp.sctp_client_errno,
- sctp.sctp_lingertime, sctp.sctp_refcnt);
+ sctp->sctp_shutdown_faddr, sctp->sctp_client_errno,
+ connp->conn_lingertime, sctp->sctp_refcnt);
}
if (opts & MDB_SCTP_SHOW_MISC) {
@@ -955,24 +962,25 @@ sctp(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
"active\t\t%?ld\ttx_adaptation_code%?x\n"
"rx_adaptation_code%?x\ttimer_mp\t%?p\n"
"partial_delivery_point\t%?d\n",
- sctp.sctp_bound_if, sctp.sctp_heartbeat_mp,
- sctp.sctp_family, sctp.sctp_ipversion,
- sctp.sctp_hb_interval, sctp.sctp_autoclose,
- sctp.sctp_active, sctp.sctp_tx_adaptation_code,
- sctp.sctp_rx_adaptation_code, sctp.sctp_timer_mp,
- sctp.sctp_pd_point);
+ connp->conn_bound_if, sctp->sctp_heartbeat_mp,
+ connp->conn_family,
+ connp->conn_ipversion,
+ sctp->sctp_hb_interval, sctp->sctp_autoclose,
+ sctp->sctp_active, sctp->sctp_tx_adaptation_code,
+ sctp->sctp_rx_adaptation_code, sctp->sctp_timer_mp,
+ sctp->sctp_pd_point);
}
if (opts & MDB_SCTP_SHOW_EXT) {
mdb_printf("%<b>Extensions and Reliable Ctl Chunks%</b>\n");
mdb_printf("cxmit_list\t%?p\tlcsn\t\t%?x\n"
- "fcsn\t\t%?x\n", sctp.sctp_cxmit_list, sctp.sctp_lcsn,
- sctp.sctp_fcsn);
+ "fcsn\t\t%?x\n", sctp->sctp_cxmit_list, sctp->sctp_lcsn,
+ sctp->sctp_fcsn);
}
if (opts & MDB_SCTP_SHOW_FLAGS) {
mdb_printf("%<b>Flags%</b>\n");
- show_sctp_flags(&sctp);
+ show_sctp_flags(sctp);
}
return (DCMD_OK);
diff --git a/usr/src/common/net/patricia/radix.c b/usr/src/common/net/patricia/radix.c
index 9a1d3f78ed..cf2085280f 100644
--- a/usr/src/common/net/patricia/radix.c
+++ b/usr/src/common/net/patricia/radix.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*
* Copyright (c) 1988, 1989, 1993
@@ -367,8 +367,9 @@ rn_match_args(v_arg, head, rn_leaf_fn, rn_leaf_arg)
* is looking for some other criteria as well. Continue
* looking as if the exact match failed.
*/
- if (t->rn_parent->rn_flags & RNF_ROOT) {
- /* hit the top. have to give up */
+ if (t->rn_dupedkey == NULL &&
+ (t->rn_parent->rn_flags & RNF_ROOT)) {
+ /* no more dupedkeys and hit the top. have to give up */
return (NULL);
}
b = 0;
@@ -486,56 +487,70 @@ rn_insert(v_arg, head, dupentry, nodes)
{
caddr_t v = v_arg;
struct radix_node *top = head->rnh_treetop;
+ struct radix_node *p, *x;
int head_off = top->rn_offset, vlen = (int)LEN(v);
struct radix_node *t = rn_search(v_arg, top);
caddr_t cp = v + head_off;
int b;
struct radix_node *tt;
+ caddr_t cp2 = t->rn_key + head_off;
+ int cmp_res;
+ caddr_t cplim = v + vlen;
/*
* Find first bit at which v and t->rn_key differ
*/
- {
- caddr_t cp2 = t->rn_key + head_off;
- int cmp_res;
- caddr_t cplim = v + vlen;
-
- while (cp < cplim)
- if (*cp2++ != *cp++)
- goto on1;
- *dupentry = 1;
- return (t);
+ while (cp < cplim)
+ if (*cp2++ != *cp++)
+ goto on1;
+ *dupentry = 1;
+ return (t);
on1:
- *dupentry = 0;
- cmp_res = (cp[-1] ^ cp2[-1]) & 0xff;
- for (b = (cp - v) << 3; cmp_res; b--)
- cmp_res >>= 1;
- }
- {
- struct radix_node *p, *x = top;
- cp = v;
- do {
- p = x;
- if (cp[x->rn_offset] & x->rn_bmask)
- x = x->rn_right;
- else
- x = x->rn_left;
- } while (b > (unsigned)x->rn_bit);
- /* x->rn_bit < b && x->rn_bit >= 0 */
- t = rn_newpair(v_arg, b, nodes);
- tt = t->rn_left;
- if ((cp[p->rn_offset] & p->rn_bmask) == 0)
- p->rn_left = t;
+ *dupentry = 0;
+ cmp_res = (cp[-1] ^ cp2[-1]) & 0xff;
+ /*
+ * (cp - v) is the number of bytes where the match is relevant.
+ * Multiply by 8 to get number of bits. Then reduce this number
+ * by the trailing bits in the last byte where we have a match
+ * by looking at (cmp_res >> 1) in each iteration below.
+ * Note that v starts at the beginning of the key, so, when key
+ * is a sockaddr structure, the preliminary len/family/port bytes
+ * are accounted for.
+ */
+ for (b = (cp - v) << 3; cmp_res; b--)
+ cmp_res >>= 1;
+ cp = v;
+ x = top;
+ do {
+ p = x;
+ if (cp[x->rn_offset] & x->rn_bmask)
+ x = x->rn_right;
else
- p->rn_right = t;
- x->rn_parent = t;
- t->rn_parent = p;
- if ((cp[t->rn_offset] & t->rn_bmask) == 0) {
- t->rn_right = x;
- } else {
- t->rn_right = tt;
- t->rn_left = x;
- }
+ x = x->rn_left;
+ } while (b > (unsigned)x->rn_bit);
+ /* x->rn_bit < b && x->rn_bit >= 0 */
+ /*
+ * now the rightmost bit where v and rn_key differ (b) is <
+ * x->rn_bit.
+ *
+ * We will add a new branch at p. b cannot equal x->rn_bit
+ * because we know we didn't find a duplicated key.
+ * The tree will be re-adjusted so that t is inserted between p
+ * and x.
+ */
+ t = rn_newpair(v_arg, b, nodes);
+ tt = t->rn_left;
+ if ((cp[p->rn_offset] & p->rn_bmask) == 0)
+ p->rn_left = t;
+ else
+ p->rn_right = t;
+ x->rn_parent = t;
+ t->rn_parent = p;
+ if ((cp[t->rn_offset] & t->rn_bmask) == 0) {
+ t->rn_right = x;
+ } else {
+ t->rn_right = tt;
+ t->rn_left = x;
}
return (tt);
}
@@ -718,6 +733,8 @@ rn_addroute(v_arg, n_arg, head, treenodes)
* find it among possible duplicate key entries
* anyway, so the above test doesn't hurt.
*
+ * Insert treenodes before tt.
+ *
* We sort the masks for a duplicated key the same way as
* in a masklist -- most specific to least specific.
* This may require the unfortunate nuisance of relocating
@@ -758,22 +775,54 @@ rn_addroute(v_arg, n_arg, head, treenodes)
tt->rn_bit = x->rn_bit;
tt->rn_flags |= x->rn_flags & RNF_NORMAL;
}
+ /* BEGIN CSTYLED */
+ /*
+ * at this point the parent-child relationship for p, t, x, tt is
+ * one of the following:
+ * p p
+ * : (left/right child) :
+ * : :
+ * t t
+ * / \ / \
+ * x tt tt x
+ *
+ * tt == saved_tt returned by rn_insert().
+ */
+ /* END CSTYLED */
t = saved_tt->rn_parent;
if (keyduplicated)
goto key_exists;
b_leaf = -1 - t->rn_bit;
+ /*
+ * b_leaf is now normalized to be in the leaf rn_bit format
+ * (it is the rn_bit value of a leaf corresponding to netmask
+ * of t->rn_bit).
+ */
if (t->rn_right == saved_tt)
x = t->rn_left;
else
x = t->rn_right;
- /* Promote general routes from below */
+ /*
+ * Promote general routes from below.
+	 * Identify the less specific netmasks and add them to t->rn_mklist
+ */
if (x->rn_bit < 0) {
- for (mp = &t->rn_mklist; x; x = x->rn_dupedkey)
- if (x->rn_mask && (x->rn_bit >= b_leaf) && x->rn_mklist == 0) {
- *mp = m = rn_new_radix_mask(x, 0);
- if (m)
- mp = &m->rm_mklist;
- }
+ /* x is the sibling node. it is a leaf node. */
+ for (mp = &t->rn_mklist; x; x = x->rn_dupedkey)
+ if (x->rn_mask && (x->rn_bit >= b_leaf) &&
+ x->rn_mklist == 0) {
+ /*
+ * x is the first node in the dupedkey chain
+ * without a mklist, and with a shorter mask
+ * than b_leaf. Create a radix_mask
+ * corresponding to x's mask and add it to
+ * t's rn_mklist. The mask list gets created
+ * in decreasing order of mask length.
+ */
+ *mp = m = rn_new_radix_mask(x, 0);
+ if (m)
+ mp = &m->rm_mklist;
+ }
} else if (x->rn_mklist) {
/*
* Skip over masks whose index is > that of new node
@@ -788,6 +837,7 @@ key_exists:
if ((netmask == 0) || (b > t->rn_bit))
return (tt); /* can't lift at all */
b_leaf = tt->rn_bit;
+ /* b is the index of the netmask */
do {
x = t;
t = t->rn_parent;
diff --git a/usr/src/lib/brand/native/zone/platform.xml b/usr/src/lib/brand/native/zone/platform.xml
index e988200bde..0225a51dc7 100644
--- a/usr/src/lib/brand/native/zone/platform.xml
+++ b/usr/src/lib/brand/native/zone/platform.xml
@@ -106,7 +106,6 @@
<device match="ipsecesp" ip-type="exclusive" />
<device match="ipstate" ip-type="exclusive" />
<device match="ipsync" ip-type="exclusive" />
- <device match="iptunq" ip-type="exclusive" />
<device match="keysock" ip-type="exclusive" />
<device match="rawip" ip-type="exclusive" />
<device match="rawip6" ip-type="exclusive" />
@@ -117,6 +116,7 @@
<device match="spdsock" ip-type="exclusive" />
<device match="sppp" ip-type="exclusive" />
<device match="sppptun" ip-type="exclusive" />
+ <device match="vni" ip-type="exclusive" />
<!-- Renamed devices to create under /dev -->
<device match="zcons/%z/zoneconsole" name="zconsole" />
diff --git a/usr/src/lib/brand/solaris10/zone/platform.xml b/usr/src/lib/brand/solaris10/zone/platform.xml
index fa396ec222..89f7035615 100644
--- a/usr/src/lib/brand/solaris10/zone/platform.xml
+++ b/usr/src/lib/brand/solaris10/zone/platform.xml
@@ -123,7 +123,6 @@
<device match="ipsecesp" ip-type="exclusive" />
<device match="ipstate" ip-type="exclusive" />
<device match="ipsync" ip-type="exclusive" />
- <device match="iptunq" ip-type="exclusive" />
<device match="keysock" ip-type="exclusive" />
<device match="rawip" ip-type="exclusive" />
<device match="rawip6" ip-type="exclusive" />
@@ -134,6 +133,7 @@
<device match="spdsock" ip-type="exclusive" />
<device match="sppp" ip-type="exclusive" />
<device match="sppptun" ip-type="exclusive" />
+ <device match="vni" ip-type="exclusive" />
<!-- Renamed devices to create under /dev -->
<device match="zcons/%z/zoneconsole" name="zconsole" />
diff --git a/usr/src/pkgdefs/SUNWckr/prototype_com b/usr/src/pkgdefs/SUNWckr/prototype_com
index 30679b7037..86489c1422 100644
--- a/usr/src/pkgdefs/SUNWckr/prototype_com
+++ b/usr/src/pkgdefs/SUNWckr/prototype_com
@@ -92,7 +92,6 @@ f none kernel/drv/ippctl.conf 644 root sys
f none kernel/drv/ipsecah.conf 644 root sys
f none kernel/drv/ipsecesp.conf 644 root sys
f none kernel/drv/iptun.conf 644 root sys
-f none kernel/drv/iptunq.conf 644 root sys
f none kernel/drv/iwscn.conf 644 root sys
f none kernel/drv/keysock.conf 644 root sys
f none kernel/drv/kmdb.conf 644 root sys
diff --git a/usr/src/pkgdefs/SUNWckr/prototype_i386 b/usr/src/pkgdefs/SUNWckr/prototype_i386
index 2a6676197e..5f886a8d60 100644
--- a/usr/src/pkgdefs/SUNWckr/prototype_i386
+++ b/usr/src/pkgdefs/SUNWckr/prototype_i386
@@ -103,7 +103,6 @@ f none kernel/drv/ippctl 755 root sys
f none kernel/drv/ipsecah 755 root sys
f none kernel/drv/ipsecesp 755 root sys
f none kernel/drv/iptun 755 root sys
-f none kernel/drv/iptunq 755 root sys
f none kernel/drv/iwscn 755 root sys
f none kernel/drv/kb8042 755 root sys
f none kernel/drv/keysock 755 root sys
@@ -326,7 +325,6 @@ f none kernel/drv/amd64/ippctl 755 root sys
f none kernel/drv/amd64/ipsecah 755 root sys
f none kernel/drv/amd64/ipsecesp 755 root sys
f none kernel/drv/amd64/iptun 755 root sys
-f none kernel/drv/amd64/iptunq 755 root sys
f none kernel/drv/amd64/iwscn 755 root sys
f none kernel/drv/amd64/kb8042 755 root sys
f none kernel/drv/amd64/keysock 755 root sys
diff --git a/usr/src/pkgdefs/SUNWckr/prototype_sparc b/usr/src/pkgdefs/SUNWckr/prototype_sparc
index e086c94862..c2824f989c 100644
--- a/usr/src/pkgdefs/SUNWckr/prototype_sparc
+++ b/usr/src/pkgdefs/SUNWckr/prototype_sparc
@@ -94,7 +94,6 @@ f none kernel/drv/sparcv9/ippctl 755 root sys
f none kernel/drv/sparcv9/ipsecah 755 root sys
f none kernel/drv/sparcv9/ipsecesp 755 root sys
f none kernel/drv/sparcv9/iptun 755 root sys
-f none kernel/drv/sparcv9/iptunq 755 root sys
f none kernel/drv/sparcv9/isp 755 root sys
f none kernel/drv/sparcv9/iwscn 755 root sys
f none kernel/drv/sparcv9/kb8042 755 root sys
diff --git a/usr/src/pkgdefs/SUNWhea/prototype_com b/usr/src/pkgdefs/SUNWhea/prototype_com
index 3129ef6be5..e3bfe3f348 100644
--- a/usr/src/pkgdefs/SUNWhea/prototype_com
+++ b/usr/src/pkgdefs/SUNWhea/prototype_com
@@ -242,6 +242,7 @@ d none usr/include/inet 755 root bin
f none usr/include/inet/arp.h 644 root bin
f none usr/include/inet/common.h 644 root bin
f none usr/include/inet/ip.h 644 root bin
+f none usr/include/inet/ip_arp.h 644 root bin
f none usr/include/inet/ip_if.h 644 root bin
f none usr/include/inet/ip_ire.h 644 root bin
f none usr/include/inet/ip_ftable.h 644 root bin
diff --git a/usr/src/pkgdefs/etc/exception_list_i386 b/usr/src/pkgdefs/etc/exception_list_i386
index 09514a0ecc..ee760eba55 100644
--- a/usr/src/pkgdefs/etc/exception_list_i386
+++ b/usr/src/pkgdefs/etc/exception_list_i386
@@ -365,7 +365,6 @@ usr/lib/amd64/llib-like.ln i386
usr/lib/amd64/libipsecutil.so i386
usr/lib/amd64/llib-lipsecutil.ln i386
#
-usr/include/inet/arp_impl.h i386
usr/include/inet/rawip_impl.h i386
usr/include/inet/udp_impl.h i386
usr/include/inet/tcp_impl.h i386
diff --git a/usr/src/pkgdefs/etc/exception_list_sparc b/usr/src/pkgdefs/etc/exception_list_sparc
index 5a32c55a05..533552b058 100644
--- a/usr/src/pkgdefs/etc/exception_list_sparc
+++ b/usr/src/pkgdefs/etc/exception_list_sparc
@@ -354,7 +354,6 @@ usr/share/lib/locale/com/sun/dhcpmgr/cli/dhcpconfig/ResourceBundle.properties sp
usr/share/lib/locale/com/sun/dhcpmgr/cli/dhtadm/ResourceBundle.properties sparc
usr/share/lib/locale/com/sun/dhcpmgr/cli/pntadm/ResourceBundle.properties sparc
#
-usr/include/inet/arp_impl.h sparc
usr/include/inet/rawip_impl.h sparc
usr/include/inet/udp_impl.h sparc
usr/include/inet/tcp_impl.h sparc
diff --git a/usr/src/tools/scripts/bfu.sh b/usr/src/tools/scripts/bfu.sh
index be820004e4..e4e9a36ab2 100644
--- a/usr/src/tools/scripts/bfu.sh
+++ b/usr/src/tools/scripts/bfu.sh
@@ -8010,6 +8010,12 @@ mondo_loop() {
rm -f $root/kernel/strmod/sparcv9/tun
rm -f $root/kernel/strmod/amd64/tun
+ # Remove obsolete iptunq
+ rm -f $root/kernel/drv/iptunq
+ rm -f $root/kernel/drv/iptunq.conf
+ rm -f $root/kernel/drv/amd64/iptunq
+ rm -f $root/kernel/drv/sparcv9/iptunq
+
#
# Remove libtopo platform XML files that have been replaced by propmap
# files.
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files
index 042685bc5a..550606f39c 100644
--- a/usr/src/uts/common/Makefile.files
+++ b/usr/src/uts/common/Makefile.files
@@ -514,7 +514,7 @@ TOKENMT_OBJS += tokenmt.o tokenmtddi.o
TSWTCL_OBJS += tswtcl.o tswtclddi.o
-ARP_OBJS += arpddi.o arp.o arp_netinfo.o
+ARP_OBJS += arpddi.o
ICMP_OBJS += icmpddi.o
@@ -535,13 +535,15 @@ IP_SCTP_OBJS = sctp.o sctp_opt_data.o sctp_output.o \
sctp_addr.o tn_ipopt.o tnet.o ip_netinfo.o
IP_ILB_OBJS = ilb.o ilb_nat.o ilb_conn.o ilb_alg_hash.o ilb_alg_rr.o
-IP_OBJS += igmp.o ipmp.o ip.o ip6.o ip6_asp.o ip6_if.o ip6_ire.o ip6_rts.o \
- ip_if.o ip_ire.o ip_listutils.o ip_mroute.o \
- ip_multi.o ip2mac.o ip_ndp.o ip_opt_data.o ip_rts.o ip_srcid.o \
+IP_OBJS += igmp.o ipmp.o ip.o ip6.o ip6_asp.o ip6_if.o ip6_ire.o \
+ ip6_rts.o ip_if.o ip_ire.o ip_listutils.o ip_mroute.o \
+ ip_multi.o ip2mac.o ip_ndp.o ip_rts.o ip_srcid.o \
ipddi.o ipdrop.o mi.o nd.o optcom.o snmpcom.o ipsec_loader.o \
spd.o ipclassifier.o inet_common.o ip_squeue.o squeue.o \
ip_sadb.o ip_ftable.o proto_set.o radix.o ip_dummy.o \
- ip_helper_stream.o iptunq.o \
+ ip_helper_stream.o \
+ ip_output.o ip_input.o ip6_input.o ip6_output.o ip_arp.o \
+ conn_opt.o ip_attr.o ip_dce.o \
$(IP_ICMP_OBJS) \
$(IP_RTS_OBJS) \
$(IP_TCP_OBJS) \
@@ -644,8 +646,6 @@ MAC_IB_OBJS += mac_ib.o
IPTUN_OBJS += iptun_dev.o iptun_ctl.o iptun.o
-IPTUNQ_OBJS += iptunq_ddi.o
-
AGGR_OBJS += aggr_dev.o aggr_ctl.o aggr_grp.o aggr_port.o \
aggr_send.o aggr_recv.o aggr_lacp.o
diff --git a/usr/src/uts/common/fs/sockfs/sockcommon.h b/usr/src/uts/common/fs/sockfs/sockcommon.h
index f3ffe456f1..fac10a8935 100644
--- a/usr/src/uts/common/fs/sockfs/sockcommon.h
+++ b/usr/src/uts/common/fs/sockfs/sockcommon.h
@@ -184,8 +184,7 @@ extern int so_dequeue_msg(struct sonode *, mblk_t **, struct uio *,
extern void so_enqueue_msg(struct sonode *, mblk_t *, size_t);
extern void so_process_new_message(struct sonode *, mblk_t *, mblk_t *);
-extern mblk_t *socopyinuio(uio_t *, ssize_t, size_t, ssize_t, size_t, int *,
- cred_t *);
+extern mblk_t *socopyinuio(uio_t *, ssize_t, size_t, ssize_t, size_t, int *);
extern mblk_t *socopyoutuio(mblk_t *, struct uio *, ssize_t, int *);
extern boolean_t somsghasdata(mblk_t *);
diff --git a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c
index 48a3e37921..4521fdd352 100644
--- a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c
+++ b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c
@@ -470,8 +470,7 @@ so_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
so->so_proto_props.sopp_maxpsz,
so->so_proto_props.sopp_wroff,
so->so_proto_props.sopp_maxblk,
- so->so_proto_props.sopp_tail, &error,
- cr)) == NULL) {
+ so->so_proto_props.sopp_tail, &error)) == NULL) {
break;
}
ASSERT(uiop->uio_resid >= 0);
diff --git a/usr/src/uts/common/fs/sockfs/sockcommon_subr.c b/usr/src/uts/common/fs/sockfs/sockcommon_subr.c
index a244c65bc6..9b806d0a4a 100644
--- a/usr/src/uts/common/fs/sockfs/sockcommon_subr.c
+++ b/usr/src/uts/common/fs/sockfs/sockcommon_subr.c
@@ -471,7 +471,7 @@ socket_sendsig(struct sonode *so, int event)
/* Copy userdata into a new mblk_t */
mblk_t *
socopyinuio(uio_t *uiop, ssize_t iosize, size_t wroff, ssize_t maxblk,
- size_t tail_len, int *errorp, cred_t *cr)
+ size_t tail_len, int *errorp)
{
mblk_t *head = NULL, **tail = &head;
@@ -499,11 +499,7 @@ socopyinuio(uio_t *uiop, ssize_t iosize, size_t wroff, ssize_t maxblk,
blocksize = MIN(iosize, maxblk);
ASSERT(blocksize >= 0);
- if (is_system_labeled())
- mp = allocb_cred(wroff + blocksize + tail_len,
- cr, curproc->p_pid);
- else
- mp = allocb(wroff + blocksize + tail_len, BPRI_MED);
+ mp = allocb(wroff + blocksize + tail_len, BPRI_MED);
if (mp == NULL) {
*errorp = ENOMEM;
return (head);
diff --git a/usr/src/uts/common/fs/sockfs/socktpi.c b/usr/src/uts/common/fs/sockfs/socktpi.c
index b2a178fbcb..bfbd67ad81 100644
--- a/usr/src/uts/common/fs/sockfs/socktpi.c
+++ b/usr/src/uts/common/fs/sockfs/socktpi.c
@@ -5506,205 +5506,6 @@ sotpi_setsockopt(struct sonode *so, int level, int option_name,
so_lock_single(so); /* Set SOLOCKED */
mutex_exit(&so->so_lock);
- /*
- * For SOCKET or TCP level options, try to set it here itself
- * provided socket has not been popped and we know the tcp
- * structure (stored in so_priv).
- */
- if ((level == SOL_SOCKET || level == IPPROTO_TCP) &&
- (so->so_family == AF_INET || so->so_family == AF_INET6) &&
- (so->so_version == SOV_SOCKSTREAM) &&
- (so->so_proto_handle != NULL)) {
- tcp_t *tcp = (tcp_t *)so->so_proto_handle;
- boolean_t onoff;
-
-#define intvalue (*(int32_t *)optval)
-
- switch (level) {
- case SOL_SOCKET:
- switch (option_name) { /* Check length param */
- case SO_DEBUG:
- case SO_REUSEADDR:
- case SO_DONTROUTE:
- case SO_BROADCAST:
- case SO_USELOOPBACK:
- case SO_OOBINLINE:
- case SO_DGRAM_ERRIND:
- if (optlen != (t_uscalar_t)sizeof (int32_t)) {
- error = EINVAL;
- eprintsoline(so, error);
- mutex_enter(&so->so_lock);
- goto done2;
- }
- ASSERT(optval);
- onoff = intvalue != 0;
- handled = B_TRUE;
- break;
- case SO_SNDTIMEO:
- case SO_RCVTIMEO:
- if (get_udatamodel() == DATAMODEL_NONE ||
- get_udatamodel() == DATAMODEL_NATIVE) {
- if (optlen !=
- sizeof (struct timeval)) {
- error = EINVAL;
- eprintsoline(so, error);
- mutex_enter(&so->so_lock);
- goto done2;
- }
- } else {
- if (optlen !=
- sizeof (struct timeval32)) {
- error = EINVAL;
- eprintsoline(so, error);
- mutex_enter(&so->so_lock);
- goto done2;
- }
- }
- ASSERT(optval);
- handled = B_TRUE;
- break;
- case SO_LINGER:
- if (optlen !=
- (t_uscalar_t)sizeof (struct linger)) {
- error = EINVAL;
- eprintsoline(so, error);
- mutex_enter(&so->so_lock);
- goto done2;
- }
- ASSERT(optval);
- handled = B_TRUE;
- break;
- }
-
- switch (option_name) { /* Do actions */
- case SO_LINGER: {
- struct linger *lgr = (struct linger *)optval;
-
- if (lgr->l_onoff) {
- tcp->tcp_linger = 1;
- tcp->tcp_lingertime = lgr->l_linger;
- so->so_linger.l_onoff = SO_LINGER;
- so->so_options |= SO_LINGER;
- } else {
- tcp->tcp_linger = 0;
- tcp->tcp_lingertime = 0;
- so->so_linger.l_onoff = 0;
- so->so_options &= ~SO_LINGER;
- }
- so->so_linger.l_linger = lgr->l_linger;
- handled = B_TRUE;
- break;
- }
- case SO_SNDTIMEO:
- case SO_RCVTIMEO: {
- struct timeval tl;
- clock_t val;
-
- if (get_udatamodel() == DATAMODEL_NONE ||
- get_udatamodel() == DATAMODEL_NATIVE)
- bcopy(&tl, (struct timeval *)optval,
- sizeof (struct timeval));
- else
- TIMEVAL32_TO_TIMEVAL(&tl,
- (struct timeval32 *)optval);
- val = tl.tv_sec * 1000 * 1000 + tl.tv_usec;
- if (option_name == SO_RCVTIMEO)
- so->so_rcvtimeo = drv_usectohz(val);
- else
- so->so_sndtimeo = drv_usectohz(val);
- break;
- }
-
- case SO_DEBUG:
- tcp->tcp_debug = onoff;
-#ifdef SOCK_TEST
- if (intvalue & 2)
- sock_test_timelimit = 10 * hz;
- else
- sock_test_timelimit = 0;
-
- if (intvalue & 4)
- do_useracc = 0;
- else
- do_useracc = 1;
-#endif /* SOCK_TEST */
- break;
- case SO_DONTROUTE:
- /*
- * SO_DONTROUTE, SO_USELOOPBACK and
- * SO_BROADCAST are only of interest to IP.
- * We track them here only so
- * that we can report their current value.
- */
- tcp->tcp_dontroute = onoff;
- if (onoff)
- so->so_options |= option_name;
- else
- so->so_options &= ~option_name;
- break;
- case SO_USELOOPBACK:
- tcp->tcp_useloopback = onoff;
- if (onoff)
- so->so_options |= option_name;
- else
- so->so_options &= ~option_name;
- break;
- case SO_BROADCAST:
- tcp->tcp_broadcast = onoff;
- if (onoff)
- so->so_options |= option_name;
- else
- so->so_options &= ~option_name;
- break;
- case SO_REUSEADDR:
- tcp->tcp_reuseaddr = onoff;
- if (onoff)
- so->so_options |= option_name;
- else
- so->so_options &= ~option_name;
- break;
- case SO_OOBINLINE:
- tcp->tcp_oobinline = onoff;
- if (onoff)
- so->so_options |= option_name;
- else
- so->so_options &= ~option_name;
- break;
- case SO_DGRAM_ERRIND:
- tcp->tcp_dgram_errind = onoff;
- if (onoff)
- so->so_options |= option_name;
- else
- so->so_options &= ~option_name;
- break;
- }
- break;
- case IPPROTO_TCP:
- switch (option_name) {
- case TCP_NODELAY:
- if (optlen != (t_uscalar_t)sizeof (int32_t)) {
- error = EINVAL;
- eprintsoline(so, error);
- mutex_enter(&so->so_lock);
- goto done2;
- }
- ASSERT(optval);
- tcp->tcp_naglim = intvalue ? 1 : tcp->tcp_mss;
- handled = B_TRUE;
- break;
- }
- break;
- default:
- handled = B_FALSE;
- break;
- }
- }
-
- if (handled) {
- mutex_enter(&so->so_lock);
- goto done2;
- }
-
optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
optmgmt_req.MGMT_flags = T_NEGOTIATE;
optmgmt_req.OPT_length = (t_scalar_t)sizeof (oh) + optlen;
diff --git a/usr/src/uts/common/inet/Makefile b/usr/src/uts/common/inet/Makefile
index 052c010aea..3d45e4861c 100644
--- a/usr/src/uts/common/inet/Makefile
+++ b/usr/src/uts/common/inet/Makefile
@@ -28,12 +28,12 @@
# include global definitions
include ../../../Makefile.master
-HDRS= arp.h arp_impl.h common.h ipclassifier.h ip.h ip6.h ipdrop.h ipnet.h \
+HDRS= arp.h common.h ipclassifier.h ip.h ip6.h ipdrop.h ipnet.h \
ipsecah.h ipsecesp.h ipsec_info.h iptun.h ip6_asp.h ip_if.h ip_ire.h \
ip_multi.h ip_netinfo.h ip_ndp.h ip_rts.h ipsec_impl.h keysock.h \
led.h mi.h mib2.h nd.h optcom.h sadb.h sctp_itf.h snmpcom.h tcp.h \
tcp_sack.h tcp_stack.h udp_impl.h rawip_impl.h ipp_common.h \
- ip_ftable.h ip_impl.h ip_stack.h tcp_impl.h wifi_ioctl.h \
+ ip_ftable.h ip_impl.h ip_stack.h ip_arp.h tcp_impl.h wifi_ioctl.h \
ip2mac.h ip2mac_impl.h
ROOTDIRS= $(ROOT)/usr/include/inet
diff --git a/usr/src/uts/common/inet/arp.h b/usr/src/uts/common/inet/arp.h
index 4351c91666..de0602e1f7 100644
--- a/usr/src/uts/common/inet/arp.h
+++ b/usr/src/uts/common/inet/arp.h
@@ -28,7 +28,6 @@
#define _INET_ARP_H
#include <sys/types.h>
-#include <net/if.h>
#ifdef __cplusplus
extern "C" {
@@ -45,30 +44,7 @@ extern "C" {
#define RARP_REQUEST 3
#define RARP_RESPONSE 4
-#define AR_IOCTL (((unsigned)'A' & 0xFF)<<8)
-#define CMD_IN_PROGRESS 0x10000
-
-#define AR_ENTRY_ADD (AR_IOCTL + 1)
-#define AR_ENTRY_DELETE (AR_IOCTL + 2)
-#define AR_ENTRY_QUERY (AR_IOCTL + 3)
-#define AR_ENTRY_SQUERY (AR_IOCTL + 6)
-#define AR_MAPPING_ADD (AR_IOCTL + 7)
-#define AR_CLIENT_NOTIFY (AR_IOCTL + 8)
-#define AR_INTERFACE_UP (AR_IOCTL + 9)
-#define AR_INTERFACE_DOWN (AR_IOCTL + 10)
-#define AR_INTERFACE_ON (AR_IOCTL + 12)
-#define AR_INTERFACE_OFF (AR_IOCTL + 13)
-#define AR_DLPIOP_DONE (AR_IOCTL + 14)
-/*
- * This is not an ARP command per se, it is used to interface between
- * ARP and IP during close.
- */
-#define AR_ARP_CLOSING (AR_IOCTL + 16)
-#define AR_ARP_EXTEND (AR_IOCTL + 17)
-#define AR_IPMP_ACTIVATE (AR_IOCTL + 18)
-#define AR_IPMP_DEACTIVATE (AR_IOCTL + 19)
-
-/* Both ace_flags and area_flags; must also modify arp.c in mdb */
+/* ace_flags values; must also modify arp.c in mdb */
#define ACE_F_PERMANENT 0x0001
#define ACE_F_PUBLISH 0x0002
#define ACE_F_DYING 0x0004
@@ -84,123 +60,6 @@ extern "C" {
#define ACE_F_DELAYED 0x0800 /* rescheduled on arp_defend_rate */
#define ACE_F_DAD_ABORTED 0x1000 /* DAD was aborted on link down */
-/* ared_flags */
-#define ARED_F_PRESERVE_PERM 0x0001 /* preserve permanent ace */
-
-/* ARP Command Structures */
-
-/* arc_t - Common command overlay */
-typedef struct ar_cmd_s {
- uint32_t arc_cmd;
- uint32_t arc_name_offset;
- uint32_t arc_name_length;
-} arc_t;
-
-/*
- * NOTE: when using area_t for an AR_ENTRY_SQUERY, the area_hw_addr_offset
- * field isn't what you might think. See comments in ip_multi.c where
- * the routine ill_create_squery() is called, and also in the routine
- * itself, to see how this field is used *only* when the area_t holds
- * an AR_ENTRY_SQUERY.
- */
-typedef struct ar_entry_add_s {
- uint32_t area_cmd;
- uint32_t area_name_offset;
- uint32_t area_name_length;
- uint32_t area_proto;
- uint32_t area_proto_addr_offset;
- uint32_t area_proto_addr_length;
- uint32_t area_proto_mask_offset;
- uint32_t area_flags; /* Same values as ace_flags */
- uint32_t area_hw_addr_offset;
- uint32_t area_hw_addr_length;
-} area_t;
-
-typedef struct ar_entry_delete_s {
- uint32_t ared_cmd;
- uint32_t ared_name_offset;
- uint32_t ared_name_length;
- uint32_t ared_proto;
- uint32_t ared_proto_addr_offset;
- uint32_t ared_proto_addr_length;
- uint32_t ared_flags;
-} ared_t;
-
-typedef struct ar_entry_query_s {
- uint32_t areq_cmd;
- uint32_t areq_name_offset;
- uint32_t areq_name_length;
- uint32_t areq_proto;
- uint32_t areq_target_addr_offset;
- uint32_t areq_target_addr_length;
- uint32_t areq_flags;
- uint32_t areq_sender_addr_offset;
- uint32_t areq_sender_addr_length;
- uint32_t areq_xmit_count; /* 0 ==> cache lookup only */
- uint32_t areq_xmit_interval; /* # of milliseconds; 0: default */
- /* # ofquests to buffer; 0: default */
- uint32_t areq_max_buffered;
- uchar_t areq_sap[8]; /* to insert in returned template */
-} areq_t;
-
-#define AR_EQ_DEFAULT_XMIT_COUNT 6
-#define AR_EQ_DEFAULT_XMIT_INTERVAL 1000
-#define AR_EQ_DEFAULT_MAX_BUFFERED 4
-
-/*
- * Structure used with AR_ENTRY_LLAQUERY to map from the link_addr
- * (in Neighbor Discovery option format excluding the option type and
- * length) to a hardware address.
- * The response has the same format as for an AR_ENTRY_SQUERY - an M_CTL with
- * arel_hw_addr updated.
- * An IPv6 address will be passed in AR_ENTRY_LLAQUERY so that atmip
- * can send it in AR_CLIENT_NOTIFY messages.
- */
-typedef struct ar_entry_llaquery_s {
- uint32_t arel_cmd;
- uint32_t arel_name_offset;
- uint32_t arel_name_length;
- uint32_t arel_link_addr_offset;
- uint32_t arel_link_addr_length;
- uint32_t arel_hw_addr_offset;
- uint32_t arel_hw_addr_length;
- uint32_t arel_ip_addr_offset;
- uint32_t arel_ip_addr_length;
-} arel_t;
-
-typedef struct ar_mapping_add_s {
- uint32_t arma_cmd;
- uint32_t arma_name_offset;
- uint32_t arma_name_length;
- uint32_t arma_proto;
- uint32_t arma_proto_addr_offset;
- uint32_t arma_proto_addr_length;
- uint32_t arma_proto_mask_offset;
- uint32_t arma_proto_extract_mask_offset;
- uint32_t arma_flags;
- uint32_t arma_hw_addr_offset;
- uint32_t arma_hw_addr_length;
- /* Offset were we start placing */
- uint32_t arma_hw_mapping_start;
- /* the mask&proto_addr */
-} arma_t;
-
-/* Structure used to notify ARP of changes to IPMP group topology */
-typedef struct ar_ipmp_event_s {
- uint32_t arie_cmd;
- uint32_t arie_name_offset;
- uint32_t arie_name_length;
- char arie_grifname[LIFNAMSIZ];
-} arie_t;
-
-/* Structure used to notify clients of interesting conditions. */
-typedef struct ar_client_notify_s {
- uint32_t arcn_cmd;
- uint32_t arcn_name_offset;
- uint32_t arcn_name_length;
- uint32_t arcn_code; /* Notification code. */
-} arcn_t;
-
/* Client Notification Codes */
#define AR_CN_BOGON 1
#define AR_CN_ANNOUNCE 2
diff --git a/usr/src/uts/common/inet/arp/arp.c b/usr/src/uts/common/inet/arp/arp.c
deleted file mode 100644
index abdbc39a47..0000000000
--- a/usr/src/uts/common/inet/arp/arp.c
+++ /dev/null
@@ -1,4883 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-/* Copyright (c) 1990 Mentat Inc. */
-
-/* AR - Address Resolution Protocol */
-
-#include <sys/types.h>
-#include <sys/stream.h>
-#include <sys/stropts.h>
-#include <sys/strsubr.h>
-#include <sys/errno.h>
-#include <sys/strlog.h>
-#include <sys/dlpi.h>
-#include <sys/sockio.h>
-#define _SUN_TPI_VERSION 2
-#include <sys/tihdr.h>
-#include <sys/socket.h>
-#include <sys/ddi.h>
-#include <sys/sunddi.h>
-#include <sys/cmn_err.h>
-#include <sys/sdt.h>
-#include <sys/vtrace.h>
-#include <sys/strsun.h>
-#include <sys/policy.h>
-#include <sys/zone.h>
-#include <sys/ethernet.h>
-#include <sys/zone.h>
-#include <sys/random.h>
-#include <sys/sdt.h>
-#include <sys/hook_event.h>
-
-#include <inet/common.h>
-#include <inet/optcom.h>
-#include <inet/mi.h>
-#include <inet/nd.h>
-#include <inet/snmpcom.h>
-#include <net/if.h>
-#include <inet/arp.h>
-#include <netinet/ip6.h>
-#include <netinet/arp.h>
-#include <inet/ip.h>
-#include <inet/ip_ire.h>
-#include <inet/ip_ndp.h>
-#include <inet/mib2.h>
-#include <inet/arp_impl.h>
-
-/*
- * ARP entry life time and design notes
- * ------------------------------------
- *
- * ARP entries (ACEs) must last at least as long as IP knows about a given
- * MAC-IP translation (i.e., as long as the IRE cache entry exists). It's ok
- * if the ARP entry lasts longer, but not ok if it is removed before the IP
- * entry. The reason for this is that if ARP doesn't have an entry, we will be
- * unable to detect the difference between an ARP broadcast that represents no
- * change (same, known address of sender) and one that represents a change (new
- * address for existing entry). In the former case, we must not notify IP, or
- * we can suffer hurricane attack. In the latter case, we must notify IP, or
- * IP will drift out of sync with the network.
- *
- * Note that IP controls the lifetime of entries, not ARP.
- *
- * We don't attempt to reconfirm aging entries. If the system is no longer
- * talking to a given peer, then it doesn't matter if we have the right mapping
- * for that peer. It would be possible to send queries on aging entries that
- * are active, but this isn't done.
- *
- * IPMP Notes
- * ----------
- *
- * ARP is aware of IPMP. In particular, IP notifies ARP about all "active"
- * (able to transmit data packets) interfaces in a given group via
- * AR_IPMP_ACTIVATE and AR_IPMP_DEACTIVATE messages. These messages, combined
- * with the "IPMP arl_t" that ARP creates over the IPMP DLPI stub driver,
- * enable ARP to track all the arl_t's that are in the same group and thus
- * ensure that ACEs are shared across each group and the arl_t that ARP
- * chooses to transmit on for a given ACE is optimal.
- *
- * ARP relies on IP for hardware address updates. In particular, if the
- * hardware address of an interface changes (DL_NOTE_PHYS_ADDR), then IP will
- * bring the interface down and back up -- and as part of bringing it back
- * up, will send messages to ARP that allow it to update the affected arl's
- * with new hardware addresses.
- *
- * N.B.: One side-effect of this approach is that when an interface fails and
- * then starts to repair, it will temporarily populate the ARP cache with
- * addresses that are owned by it rather than the group's arl_t. To address
- * this, we could add more messages (e.g., AR_IPMP_JOIN and AR_IPMP_LEAVE),
- * but as the issue appears to be only cosmetic (redundant entries in the ARP
- * cache during interace repair), we've kept things simple for now.
- */
-
-/*
- * This is used when scanning for "old" (least recently broadcast) ACEs. We
- * don't want to have to walk the list for every single one, so we gather up
- * batches at a time.
- */
-#define ACE_RESCHED_LIST_LEN 8
-
-typedef struct {
- arl_t *art_arl;
- uint_t art_naces;
- ace_t *art_aces[ACE_RESCHED_LIST_LEN];
-} ace_resched_t;
-
-#define ACE_RESOLVED(ace) ((ace)->ace_flags & ACE_F_RESOLVED)
-#define ACE_NONPERM(ace) \
- (((ace)->ace_flags & (ACE_F_RESOLVED | ACE_F_PERMANENT)) == \
- ACE_F_RESOLVED)
-
-#define AR_DEF_XMIT_INTERVAL 500 /* time in milliseconds */
-#define AR_LL_HDR_SLACK 32 /* Leave the lower layer some room */
-
-#define AR_SNMP_MSG T_OPTMGMT_ACK
-#define AR_DRAINING (void *)0x11
-
-/*
- * The IPv4 Link Local address space is special; we do extra duplicate checking
- * there, as the entire assignment mechanism rests on random numbers.
- */
-#define IS_IPV4_LL_SPACE(ptr) (((uchar_t *)ptr)[0] == 169 && \
- ((uchar_t *)ptr)[1] == 254)
-
-/*
- * Check if the command needs to be enqueued by seeing if there are other
- * commands ahead of us or if some DLPI response is being awaited. Usually
- * there would be an enqueued command in the latter case, however if the
- * stream that originated the command has closed, the close would have
- * cleaned up the enqueued command. AR_DRAINING signifies that the command
- * at the head of the arl_queue has been internally dequeued on completion
- * of the previous command and is being called from ar_dlpi_done
- */
-#define CMD_NEEDS_QUEUEING(mp, arl) \
- (mp->b_prev != AR_DRAINING && (arl->arl_queue != NULL || \
- arl->arl_dlpi_pending != DL_PRIM_INVAL))
-
-#define ARH_FIXED_LEN 8
-
-/*
- * Macro used when creating ACEs to determine the arl that should own it.
- */
-#define OWNING_ARL(arl) \
- ((arl)->arl_ipmp_arl != NULL ? (arl)->arl_ipmp_arl : arl)
-
-/*
- * MAC-specific intelligence. Shouldn't be needed, but the DL_INFO_ACK
- * doesn't quite do it for us.
- */
-typedef struct ar_m_s {
- t_uscalar_t ar_mac_type;
- uint32_t ar_mac_arp_hw_type;
- t_scalar_t ar_mac_sap_length;
- uint32_t ar_mac_hw_addr_length;
-} ar_m_t;
-
-typedef struct msg2_args {
- mblk_t *m2a_mpdata;
- mblk_t *m2a_mptail;
-} msg2_args_t;
-
-static mblk_t *ar_alloc(uint32_t cmd, int);
-static int ar_ce_create(arl_t *arl, uint32_t proto, uchar_t *hw_addr,
- uint32_t hw_addr_len, uchar_t *proto_addr,
- uint32_t proto_addr_len, uchar_t *proto_mask,
- uchar_t *proto_extract_mask, uint32_t hw_extract_start,
- uchar_t *sender_addr, uint32_t flags);
-static void ar_ce_delete(ace_t *ace);
-static void ar_ce_delete_per_arl(ace_t *ace, void *arg);
-static ace_t **ar_ce_hash(arp_stack_t *as, uint32_t proto,
- const uchar_t *proto_addr, uint32_t proto_addr_length);
-static ace_t *ar_ce_lookup(arl_t *arl, uint32_t proto,
- const uchar_t *proto_addr, uint32_t proto_addr_length);
-static ace_t *ar_ce_lookup_entry(arl_t *arl, uint32_t proto,
- const uchar_t *proto_addr, uint32_t proto_addr_length);
-static ace_t *ar_ce_lookup_from_area(arp_stack_t *as, mblk_t *mp,
- ace_t *matchfn());
-static ace_t *ar_ce_lookup_mapping(arl_t *arl, uint32_t proto,
- const uchar_t *proto_addr, uint32_t proto_addr_length);
-static ace_t *ar_ce_lookup_permanent(arp_stack_t *as, uint32_t proto,
- uchar_t *proto_addr, uint32_t proto_addr_length);
-static boolean_t ar_ce_resolve(ace_t *ace, const uchar_t *hw_addr,
- uint32_t hw_addr_length);
-static void ar_ce_walk(arp_stack_t *as, void (*pfi)(ace_t *, void *),
- void *arg1);
-
-static void ar_client_notify(const arl_t *arl, mblk_t *mp, int code);
-static int ar_close(queue_t *q);
-static int ar_cmd_dispatch(queue_t *q, mblk_t *mp, boolean_t from_wput);
-static void ar_cmd_drain(arl_t *arl);
-static void ar_cmd_done(arl_t *arl);
-static mblk_t *ar_dlpi_comm(t_uscalar_t prim, size_t size);
-static void ar_dlpi_send(arl_t *, mblk_t *);
-static void ar_dlpi_done(arl_t *, t_uscalar_t);
-static int ar_entry_add(queue_t *q, mblk_t *mp);
-static int ar_entry_delete(queue_t *q, mblk_t *mp);
-static int ar_entry_query(queue_t *q, mblk_t *mp);
-static int ar_entry_squery(queue_t *q, mblk_t *mp);
-static int ar_interface_up(queue_t *q, mblk_t *mp);
-static int ar_interface_down(queue_t *q, mblk_t *mp);
-static int ar_interface_on(queue_t *q, mblk_t *mp);
-static int ar_interface_off(queue_t *q, mblk_t *mp);
-static int ar_ipmp_activate(queue_t *q, mblk_t *mp);
-static int ar_ipmp_deactivate(queue_t *q, mblk_t *mp);
-static void ar_ll_cleanup_arl_queue(queue_t *q);
-static void ar_ll_down(arl_t *arl);
-static arl_t *ar_ll_lookup_by_name(arp_stack_t *as, const char *name);
-static arl_t *ar_ll_lookup_from_mp(arp_stack_t *as, mblk_t *mp);
-static void ar_ll_init(arp_stack_t *, ar_t *, mblk_t *mp);
-static void ar_ll_set_defaults(arl_t *, mblk_t *mp);
-static void ar_ll_clear_defaults(arl_t *);
-static int ar_ll_up(arl_t *arl);
-static int ar_mapping_add(queue_t *q, mblk_t *mp);
-static boolean_t ar_mask_all_ones(uchar_t *mask, uint32_t mask_len);
-static ar_m_t *ar_m_lookup(t_uscalar_t mac_type);
-static int ar_nd_ioctl(queue_t *q, mblk_t *mp);
-static int ar_open(queue_t *q, dev_t *devp, int flag, int sflag,
- cred_t *credp);
-static int ar_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr);
-static boolean_t ar_param_register(IDP *ndp, arpparam_t *arppa, int cnt);
-static int ar_param_set(queue_t *q, mblk_t *mp, char *value,
- caddr_t cp, cred_t *cr);
-static void ar_query_delete(ace_t *ace, void *ar);
-static void ar_query_reply(ace_t *ace, int ret_val,
- uchar_t *proto_addr, uint32_t proto_addr_len);
-static clock_t ar_query_xmit(arp_stack_t *as, ace_t *ace);
-static void ar_rput(queue_t *q, mblk_t *mp_orig);
-static void ar_rput_dlpi(queue_t *q, mblk_t *mp);
-static void ar_set_address(ace_t *ace, uchar_t *addrpos,
- uchar_t *proto_addr, uint32_t proto_addr_len);
-static int ar_slifname(queue_t *q, mblk_t *mp);
-static int ar_set_ppa(queue_t *q, mblk_t *mp);
-static int ar_snmp_msg(queue_t *q, mblk_t *mp_orig);
-static void ar_snmp_msg2(ace_t *, void *);
-static void ar_wput(queue_t *q, mblk_t *mp);
-static void ar_wsrv(queue_t *q);
-static void ar_xmit(arl_t *arl, uint32_t operation, uint32_t proto,
- uint32_t plen, const uchar_t *haddr1, const uchar_t *paddr1,
- const uchar_t *haddr2, const uchar_t *paddr2, const uchar_t *dstaddr,
- arp_stack_t *as);
-static void ar_cmd_enqueue(arl_t *arl, mblk_t *mp, queue_t *q,
- ushort_t cmd, boolean_t);
-static mblk_t *ar_cmd_dequeue(arl_t *arl);
-
-static void *arp_stack_init(netstackid_t stackid, netstack_t *ns);
-static void arp_stack_fini(netstackid_t stackid, void *arg);
-static void arp_stack_shutdown(netstackid_t stackid, void *arg);
-
-boolean_t arp_no_defense = B_FALSE;
-
-/*
- * All of these are alterable, within the min/max values given,
- * at run time. arp_publish_interval and arp_publish_count are
- * set by default to 2 seconds and 5 respectively. This is
- * useful during FAILOVER/FAILBACK to make sure that the ARP
- * packets are not lost. Assumed that it does not affect the
- * normal operations.
- */
-static arpparam_t arp_param_arr[] = {
- /* min max value name */
- { 30000, 3600000, 300000, "arp_cleanup_interval"},
- { 1000, 20000, 2000, "arp_publish_interval"},
- { 1, 20, 5, "arp_publish_count"},
- { 0, 20000, 1000, "arp_probe_delay"},
- { 10, 20000, 1500, "arp_probe_interval"},
- { 0, 20, 3, "arp_probe_count"},
- { 0, 20000, 100, "arp_fastprobe_delay"},
- { 10, 20000, 150, "arp_fastprobe_interval"},
- { 0, 20, 3, "arp_fastprobe_count"},
- { 0, 3600000, 300000, "arp_defend_interval"},
- { 0, 20000, 100, "arp_defend_rate"},
- { 0, 3600000, 15000, "arp_broadcast_interval"},
- { 5, 86400, 3600, "arp_defend_period"}
-};
-#define as_cleanup_interval as_param_arr[0].arp_param_value
-#define as_publish_interval as_param_arr[1].arp_param_value
-#define as_publish_count as_param_arr[2].arp_param_value
-#define as_probe_delay as_param_arr[3].arp_param_value
-#define as_probe_interval as_param_arr[4].arp_param_value
-#define as_probe_count as_param_arr[5].arp_param_value
-#define as_fastprobe_delay as_param_arr[6].arp_param_value
-#define as_fastprobe_interval as_param_arr[7].arp_param_value
-#define as_fastprobe_count as_param_arr[8].arp_param_value
-#define as_defend_interval as_param_arr[9].arp_param_value
-#define as_defend_rate as_param_arr[10].arp_param_value
-#define as_broadcast_interval as_param_arr[11].arp_param_value
-#define as_defend_period as_param_arr[12].arp_param_value
-
-static struct module_info arp_mod_info = {
- 0, "arp", 0, INFPSZ, 512, 128
-};
-
-static struct qinit arprinit = {
- (pfi_t)ar_rput, NULL, ar_open, ar_close, NULL, &arp_mod_info
-};
-
-static struct qinit arpwinit = {
- (pfi_t)ar_wput, (pfi_t)ar_wsrv, ar_open, ar_close, NULL, &arp_mod_info
-};
-
-struct streamtab arpinfo = {
- &arprinit, &arpwinit
-};
-
-/*
- * TODO: we need a better mechanism to set the ARP hardware type since
- * the DLPI mac type does not include enough predefined values.
- */
-static ar_m_t ar_m_tbl[] = {
- { DL_CSMACD, ARPHRD_ETHER, -2, 6}, /* 802.3 */
- { DL_TPB, ARPHRD_IEEE802, -2, 6}, /* 802.4 */
- { DL_TPR, ARPHRD_IEEE802, -2, 6}, /* 802.5 */
- { DL_METRO, ARPHRD_IEEE802, -2, 6}, /* 802.6 */
- { DL_ETHER, ARPHRD_ETHER, -2, 6}, /* Ethernet */
- { DL_FDDI, ARPHRD_ETHER, -2, 6}, /* FDDI */
- { DL_IB, ARPHRD_IB, -2, 20}, /* Infiniband */
- { DL_OTHER, ARPHRD_ETHER, -2, 6}, /* unknown */
-};
-
-/*
- * Note that all routines which need to queue the message for later
- * processing have to be ioctl_aware to be able to queue the complete message.
- * Following are command entry flags in arct_flags
- */
-#define ARF_IOCTL_AWARE 0x1 /* Arp command can come down as M_IOCTL */
-#define ARF_ONLY_CMD 0x2 /* Command is exclusive to ARP */
-#define ARF_WPUT_OK 0x4 /* Command is allowed from ar_wput */
-
-/* ARP Cmd Table entry */
-typedef struct arct_s {
- int (*arct_pfi)(queue_t *, mblk_t *);
- uint32_t arct_cmd;
- int arct_min_len;
- uint32_t arct_flags;
- int arct_priv_req; /* Privilege required for this cmd */
- const char *arct_txt;
-} arct_t;
-
-/*
- * AR_ENTRY_ADD, QUERY and SQUERY are used by sdp, hence they need to
- * have ARF_WPUT_OK set.
- */
-static arct_t ar_cmd_tbl[] = {
- { ar_entry_add, AR_ENTRY_ADD, sizeof (area_t),
- ARF_IOCTL_AWARE | ARF_ONLY_CMD | ARF_WPUT_OK, OP_CONFIG,
- "AR_ENTRY_ADD" },
- { ar_entry_delete, AR_ENTRY_DELETE, sizeof (ared_t),
- ARF_IOCTL_AWARE | ARF_ONLY_CMD, OP_CONFIG, "AR_ENTRY_DELETE" },
- { ar_entry_query, AR_ENTRY_QUERY, sizeof (areq_t),
- ARF_IOCTL_AWARE | ARF_ONLY_CMD | ARF_WPUT_OK, OP_NP,
- "AR_ENTRY_QUERY" },
- { ar_entry_squery, AR_ENTRY_SQUERY, sizeof (area_t),
- ARF_IOCTL_AWARE | ARF_ONLY_CMD | ARF_WPUT_OK, OP_NP,
- "AR_ENTRY_SQUERY" },
- { ar_mapping_add, AR_MAPPING_ADD, sizeof (arma_t),
- ARF_IOCTL_AWARE | ARF_ONLY_CMD, OP_CONFIG, "AR_MAPPING_ADD" },
- { ar_interface_up, AR_INTERFACE_UP, sizeof (arc_t),
- ARF_ONLY_CMD, OP_CONFIG, "AR_INTERFACE_UP" },
- { ar_interface_down, AR_INTERFACE_DOWN, sizeof (arc_t),
- ARF_ONLY_CMD, OP_CONFIG, "AR_INTERFACE_DOWN" },
- { ar_interface_on, AR_INTERFACE_ON, sizeof (arc_t),
- ARF_ONLY_CMD, OP_CONFIG, "AR_INTERFACE_ON" },
- { ar_interface_off, AR_INTERFACE_OFF, sizeof (arc_t),
- ARF_ONLY_CMD, OP_CONFIG, "AR_INTERFACE_OFF" },
- { ar_ipmp_activate, AR_IPMP_ACTIVATE, sizeof (arie_t),
- ARF_ONLY_CMD, OP_CONFIG, "AR_IPMP_ACTIVATE" },
- { ar_ipmp_deactivate, AR_IPMP_DEACTIVATE, sizeof (arie_t),
- ARF_ONLY_CMD, OP_CONFIG, "AR_IPMP_DEACTIVATE" },
- { ar_set_ppa, (uint32_t)IF_UNITSEL, sizeof (int),
- ARF_IOCTL_AWARE | ARF_WPUT_OK, OP_CONFIG, "IF_UNITSEL" },
- { ar_nd_ioctl, ND_GET, 1,
- ARF_IOCTL_AWARE | ARF_WPUT_OK, OP_NP, "ND_GET" },
- { ar_nd_ioctl, ND_SET, 1,
- ARF_IOCTL_AWARE | ARF_WPUT_OK, OP_CONFIG, "ND_SET" },
- { ar_snmp_msg, AR_SNMP_MSG, sizeof (struct T_optmgmt_ack),
- ARF_IOCTL_AWARE | ARF_WPUT_OK | ARF_ONLY_CMD, OP_NP,
- "AR_SNMP_MSG" },
- { ar_slifname, (uint32_t)SIOCSLIFNAME, sizeof (struct lifreq),
- ARF_IOCTL_AWARE | ARF_WPUT_OK, OP_CONFIG, "SIOCSLIFNAME" }
-};
-
-/*
- * Lookup and return an arl appropriate for sending packets with either source
- * hardware address `hw_addr' or source protocol address `ip_addr', in that
- * order. If neither was specified or neither match, return any arl in the
- * same group as `arl'.
- */
-static arl_t *
-ar_ipmp_lookup_xmit_arl(arl_t *arl, uchar_t *hw_addr, uint_t hw_addrlen,
- uchar_t *ip_addr)
-{
- arlphy_t *ap;
- ace_t *src_ace;
- arl_t *xmit_arl = NULL;
- arp_stack_t *as = ARL_TO_ARPSTACK(arl);
-
- ASSERT(arl->arl_flags & ARL_F_IPMP);
-
- if (hw_addr != NULL && hw_addrlen != 0) {
- xmit_arl = as->as_arl_head;
- for (; xmit_arl != NULL; xmit_arl = xmit_arl->arl_next) {
- /*
- * There may be arls with the same HW address that are
- * not in our IPMP group; we don't want those.
- */
- if (xmit_arl->arl_ipmp_arl != arl)
- continue;
-
- ap = xmit_arl->arl_phy;
- if (ap != NULL && ap->ap_hw_addrlen == hw_addrlen &&
- bcmp(ap->ap_hw_addr, hw_addr, hw_addrlen) == 0)
- break;
- }
-
- DTRACE_PROBE4(xmit_arl_hwsrc, arl_t *, arl, arl_t *,
- xmit_arl, uchar_t *, hw_addr, uint_t, hw_addrlen);
- }
-
- if (xmit_arl == NULL && ip_addr != NULL) {
- src_ace = ar_ce_lookup_permanent(as, IP_ARP_PROTO_TYPE, ip_addr,
- IP_ADDR_LEN);
- if (src_ace != NULL)
- xmit_arl = src_ace->ace_xmit_arl;
-
- DTRACE_PROBE4(xmit_arl_ipsrc, arl_t *, arl, arl_t *,
- xmit_arl, uchar_t *, ip_addr, uint_t, IP_ADDR_LEN);
- }
-
- if (xmit_arl == NULL) {
- xmit_arl = as->as_arl_head;
- for (; xmit_arl != NULL; xmit_arl = xmit_arl->arl_next)
- if (xmit_arl->arl_ipmp_arl == arl && xmit_arl != arl)
- break;
-
- DTRACE_PROBE2(xmit_arl_any, arl_t *, arl, arl_t *, xmit_arl);
- }
-
- return (xmit_arl);
-}
-
-/*
- * ARP Cache Entry creation routine.
- * Cache entries are allocated within timer messages and inserted into
- * the global hash list based on protocol and protocol address.
- */
-static int
-ar_ce_create(arl_t *arl, uint_t proto, uchar_t *hw_addr, uint_t hw_addr_len,
- uchar_t *proto_addr, uint_t proto_addr_len, uchar_t *proto_mask,
- uchar_t *proto_extract_mask, uint_t hw_extract_start, uchar_t *sender_addr,
- uint_t flags)
-{
- static ace_t ace_null;
- ace_t *ace;
- ace_t **acep;
- uchar_t *dst;
- mblk_t *mp;
- arp_stack_t *as = ARL_TO_ARPSTACK(arl);
- arl_t *xmit_arl;
- arlphy_t *ap;
-
- if ((flags & ~ACE_EXTERNAL_FLAGS_MASK) || arl == NULL)
- return (EINVAL);
-
- if (proto_addr == NULL || proto_addr_len == 0 ||
- (proto == IP_ARP_PROTO_TYPE && proto_addr_len != IP_ADDR_LEN))
- return (EINVAL);
-
- if (flags & ACE_F_MYADDR)
- flags |= ACE_F_PUBLISH | ACE_F_AUTHORITY;
-
- /*
- * Latch a transmit arl for this ace.
- */
- if (arl->arl_flags & ARL_F_IPMP) {
- ASSERT(proto == IP_ARP_PROTO_TYPE);
- xmit_arl = ar_ipmp_lookup_xmit_arl(arl, hw_addr, hw_addr_len,
- sender_addr);
- } else {
- xmit_arl = arl;
- }
-
- if (xmit_arl == NULL || xmit_arl->arl_phy == NULL)
- return (EINVAL);
-
- ap = xmit_arl->arl_phy;
-
- if (!hw_addr && hw_addr_len == 0) {
- if (flags == ACE_F_PERMANENT) { /* Not publish */
- /* 224.0.0.0 to zero length address */
- flags |= ACE_F_RESOLVED;
- } else { /* local address and unresolved case */
- hw_addr = ap->ap_hw_addr;
- hw_addr_len = ap->ap_hw_addrlen;
- if (flags & ACE_F_PUBLISH)
- flags |= ACE_F_RESOLVED;
- }
- } else {
- flags |= ACE_F_RESOLVED;
- }
-
- /* Handle hw_addr_len == 0 for DL_ENABMULTI_REQ etc. */
- if (hw_addr_len != 0 && hw_addr == NULL)
- return (EINVAL);
- if (hw_addr_len < ap->ap_hw_addrlen && hw_addr_len != 0)
- return (EINVAL);
- if (!proto_extract_mask && (flags & ACE_F_MAPPING))
- return (EINVAL);
-
- /*
- * If the underlying link doesn't have reliable up/down notification or
- * if we're working with the IPv4 169.254.0.0/16 Link Local Address
- * space, then don't use the fast timers. Otherwise, use them.
- */
- if (ap->ap_notifies &&
- !(proto == IP_ARP_PROTO_TYPE && IS_IPV4_LL_SPACE(proto_addr))) {
- flags |= ACE_F_FAST;
- }
-
- /*
- * Allocate the timer block to hold the ace.
- * (ace + proto_addr + proto_addr_mask + proto_extract_mask + hw_addr)
- */
- mp = mi_timer_alloc(sizeof (ace_t) + proto_addr_len + proto_addr_len +
- proto_addr_len + hw_addr_len);
- if (!mp)
- return (ENOMEM);
- ace = (ace_t *)mp->b_rptr;
- *ace = ace_null;
- ace->ace_proto = proto;
- ace->ace_mp = mp;
- ace->ace_arl = arl;
- ace->ace_xmit_arl = xmit_arl;
-
- dst = (uchar_t *)&ace[1];
-
- ace->ace_proto_addr = dst;
- ace->ace_proto_addr_length = proto_addr_len;
- bcopy(proto_addr, dst, proto_addr_len);
- dst += proto_addr_len;
- /*
- * The proto_mask allows us to add entries which will let us respond
- * to requests for a group of addresses. This makes it easy to provide
- * proxy ARP service for machines that don't understand about the local
- * subnet structure, if, for example, there are BSD4.2 systems lurking.
- */
- ace->ace_proto_mask = dst;
- if (proto_mask != NULL) {
- bcopy(proto_mask, dst, proto_addr_len);
- dst += proto_addr_len;
- } else {
- while (proto_addr_len-- > 0)
- *dst++ = (uchar_t)~0;
- }
-
- if (proto_extract_mask != NULL) {
- ace->ace_proto_extract_mask = dst;
- bcopy(proto_extract_mask, dst, ace->ace_proto_addr_length);
- dst += ace->ace_proto_addr_length;
- } else {
- ace->ace_proto_extract_mask = NULL;
- }
- ace->ace_hw_extract_start = hw_extract_start;
- ace->ace_hw_addr_length = hw_addr_len;
- ace->ace_hw_addr = dst;
- if (hw_addr != NULL) {
- bcopy(hw_addr, dst, hw_addr_len);
- dst += hw_addr_len;
- }
-
- ace->ace_flags = flags;
- if (ar_mask_all_ones(ace->ace_proto_mask,
- ace->ace_proto_addr_length)) {
- acep = ar_ce_hash(as, ace->ace_proto, ace->ace_proto_addr,
- ace->ace_proto_addr_length);
- } else {
- acep = &as->as_ce_mask_entries;
- }
- if ((ace->ace_next = *acep) != NULL)
- ace->ace_next->ace_ptpn = &ace->ace_next;
- *acep = ace;
- ace->ace_ptpn = acep;
- return (0);
-}
-
-/* Delete a cache entry. */
-static void
-ar_ce_delete(ace_t *ace)
-{
- ace_t **acep;
-
- /* Get out of the hash list. */
- acep = ace->ace_ptpn;
- if (ace->ace_next)
- ace->ace_next->ace_ptpn = acep;
- acep[0] = ace->ace_next;
- /* Mark it dying in case we have a timer about to fire. */
- ace->ace_flags |= ACE_F_DYING;
- /* Complete any outstanding queries immediately. */
- ar_query_reply(ace, ENXIO, NULL, (uint32_t)0);
- /* Free the timer, immediately, or when it fires. */
- mi_timer_free(ace->ace_mp);
-}
-
-/*
- * ar_ce_walk routine. Delete the ace if it is associated with the arl
- * that is going away.
- */
-static void
-ar_ce_delete_per_arl(ace_t *ace, void *arl)
-{
- if (ace->ace_arl == arl || ace->ace_xmit_arl == arl) {
- ace->ace_flags &= ~ACE_F_PERMANENT;
- ar_ce_delete(ace);
- }
-}
-
-/*
- * ar_ce_walk routine used when deactivating an `arl' in a group. Deletes
- * `ace' if it was using `arl_arg' as its output interface.
- */
-static void
-ar_ce_ipmp_deactivate(ace_t *ace, void *arl_arg)
-{
- arl_t *arl = arl_arg;
-
- ASSERT(!(arl->arl_flags & ARL_F_IPMP));
-
- if (ace->ace_arl == arl) {
- ASSERT(ace->ace_xmit_arl == arl);
- /*
- * This ACE is tied to the arl leaving the group (e.g., an
- * ACE_F_PERMANENT for a test address) and is not used by the
- * group, so we can leave it be.
- */
- return;
- }
-
- if (ace->ace_xmit_arl != arl)
- return;
-
- ASSERT(ace->ace_arl == arl->arl_ipmp_arl);
-
- /*
- * IP should've already sent us messages asking us to move any
- * ACE_F_MYADDR entries to another arl, but there are two exceptions:
- *
- * 1. The group was misconfigured with interfaces that have duplicate
- * hardware addresses, but in.mpathd was unable to offline those
- * duplicate interfaces.
- *
- * 2. The messages from IP were lost or never created (e.g. due to
- * memory pressure).
- *
- * We handle the first case by just quietly deleting the ACE. Since
- * the second case cannot be distinguished from a more serious bug in
- * the IPMP framework, we ASSERT() that this can't happen on DEBUG
- * systems, but quietly delete the ACE on production systems (the
- * deleted ACE will render the IP address unreachable).
- */
- if (ace->ace_flags & ACE_F_MYADDR) {
- arlphy_t *ap = arl->arl_phy;
- uint_t hw_addrlen = ap->ap_hw_addrlen;
-
- ASSERT(hw_addrlen == ace->ace_hw_addr_length &&
- bcmp(ap->ap_hw_addr, ace->ace_hw_addr, hw_addrlen) == 0);
- }
-
- /*
- * NOTE: it's possible this arl got selected as the ace_xmit_arl when
- * creating an ACE_F_PERMANENT ACE on behalf of an SIOCS*ARP ioctl for
- * an IPMP IP interface. But it's still OK for us to delete such an
- * ACE since ipmp_illgrp_refresh_arpent() will ask us to recreate it
- * and we'll pick another arl then.
- */
- ar_ce_delete(ace);
-}
-
-/* Cache entry hash routine, based on protocol and protocol address. */
-static ace_t **
-ar_ce_hash(arp_stack_t *as, uint32_t proto, const uchar_t *proto_addr,
- uint32_t proto_addr_length)
-{
- const uchar_t *up = proto_addr;
- unsigned int hval = proto;
- int len = proto_addr_length;
-
- while (--len >= 0)
- hval ^= *up++;
- return (&as->as_ce_hash_tbl[hval % ARP_HASH_SIZE]);
-}
-
-/* Cache entry lookup. Try to find an ace matching the parameters passed. */
-ace_t *
-ar_ce_lookup(arl_t *arl, uint32_t proto, const uchar_t *proto_addr,
- uint32_t proto_addr_length)
-{
- ace_t *ace;
-
- ace = ar_ce_lookup_entry(arl, proto, proto_addr, proto_addr_length);
- if (!ace)
- ace = ar_ce_lookup_mapping(arl, proto, proto_addr,
- proto_addr_length);
- return (ace);
-}
-
-/*
- * Cache entry lookup. Try to find an ace matching the parameters passed.
- * Look only for exact entries (no mappings)
- */
-static ace_t *
-ar_ce_lookup_entry(arl_t *arl, uint32_t proto, const uchar_t *proto_addr,
- uint32_t proto_addr_length)
-{
- ace_t *ace;
- arp_stack_t *as = ARL_TO_ARPSTACK(arl);
-
- if (!proto_addr)
- return (NULL);
- ace = *ar_ce_hash(as, proto, proto_addr, proto_addr_length);
- for (; ace; ace = ace->ace_next) {
- if ((ace->ace_arl == arl ||
- ace->ace_arl == arl->arl_ipmp_arl) &&
- ace->ace_proto_addr_length == proto_addr_length &&
- ace->ace_proto == proto) {
- int i1 = proto_addr_length;
- uchar_t *ace_addr = ace->ace_proto_addr;
- uchar_t *mask = ace->ace_proto_mask;
- /*
- * Note that the ace_proto_mask is applied to the
- * proto_addr before comparing to the ace_addr.
- */
- do {
- if (--i1 < 0)
- return (ace);
- } while ((proto_addr[i1] & mask[i1]) == ace_addr[i1]);
- }
- }
- return (ace);
-}
-
-/*
- * Extract cache entry lookup parameters from an external command message, then
- * call the supplied match function.
- */
-static ace_t *
-ar_ce_lookup_from_area(arp_stack_t *as, mblk_t *mp, ace_t *matchfn())
-{
- uchar_t *proto_addr;
- area_t *area = (area_t *)mp->b_rptr;
-
- proto_addr = mi_offset_paramc(mp, area->area_proto_addr_offset,
- area->area_proto_addr_length);
- if (!proto_addr)
- return (NULL);
- return ((*matchfn)(ar_ll_lookup_from_mp(as, mp), area->area_proto,
- proto_addr, area->area_proto_addr_length));
-}
-
-/*
- * Cache entry lookup. Try to find an ace matching the parameters passed.
- * Look only for mappings.
- */
-static ace_t *
-ar_ce_lookup_mapping(arl_t *arl, uint32_t proto, const uchar_t *proto_addr,
- uint32_t proto_addr_length)
-{
- ace_t *ace;
- arp_stack_t *as = ARL_TO_ARPSTACK(arl);
-
- if (!proto_addr)
- return (NULL);
- ace = as->as_ce_mask_entries;
- for (; ace; ace = ace->ace_next) {
- if (ace->ace_arl == arl &&
- ace->ace_proto_addr_length == proto_addr_length &&
- ace->ace_proto == proto) {
- int i1 = proto_addr_length;
- uchar_t *ace_addr = ace->ace_proto_addr;
- uchar_t *mask = ace->ace_proto_mask;
- /*
- * Note that the ace_proto_mask is applied to the
- * proto_addr before comparing to the ace_addr.
- */
- do {
- if (--i1 < 0)
- return (ace);
- } while ((proto_addr[i1] & mask[i1]) == ace_addr[i1]);
- }
- }
- return (ace);
-}
-
-/*
- * Look for a permanent entry for proto_addr across all interfaces.
- */
-static ace_t *
-ar_ce_lookup_permanent(arp_stack_t *as, uint32_t proto, uchar_t *proto_addr,
- uint32_t proto_addr_length)
-{
- ace_t *ace;
-
- ace = *ar_ce_hash(as, proto, proto_addr, proto_addr_length);
- for (; ace != NULL; ace = ace->ace_next) {
- if (!(ace->ace_flags & ACE_F_PERMANENT))
- continue;
- if (ace->ace_proto_addr_length == proto_addr_length &&
- ace->ace_proto == proto) {
- int i1 = proto_addr_length;
- uchar_t *ace_addr = ace->ace_proto_addr;
- uchar_t *mask = ace->ace_proto_mask;
-
- /*
- * Note that the ace_proto_mask is applied to the
- * proto_addr before comparing to the ace_addr.
- */
- do {
- if (--i1 < 0)
- return (ace);
- } while ((proto_addr[i1] & mask[i1]) == ace_addr[i1]);
- }
- }
- return (ace);
-}
-
-/*
- * ar_ce_resolve is called when a response comes in to an outstanding request.
- * Returns 'true' if the address has changed and we need to tell the client.
- * (We don't need to tell the client if there's still an outstanding query.)
- */
-static boolean_t
-ar_ce_resolve(ace_t *ace, const uchar_t *hw_addr, uint32_t hw_addr_length)
-{
- boolean_t hwchanged;
-
- if (hw_addr_length == ace->ace_hw_addr_length) {
- ASSERT(ace->ace_hw_addr != NULL);
- hwchanged = bcmp(hw_addr, ace->ace_hw_addr,
- hw_addr_length) != 0;
- if (hwchanged)
- bcopy(hw_addr, ace->ace_hw_addr, hw_addr_length);
- /*
- * No need to bother with ar_query_reply if no queries are
- * waiting.
- */
- ace->ace_flags |= ACE_F_RESOLVED;
- if (ace->ace_query_mp != NULL)
- ar_query_reply(ace, 0, NULL, (uint32_t)0);
- if (hwchanged)
- return (B_TRUE);
- }
- return (B_FALSE);
-}
-
-/*
- * There are 2 functions performed by this function.
- * 1. Resolution of unresolved entries and update of resolved entries.
- * 2. Detection of nodes with our own IP address (duplicates).
- *
- * If the resolving ARL is in the same group as a matching ACE's ARL, then
- * update the ACE. Otherwise, make no updates.
- *
- * For all entries, we first check to see if this is a duplicate (probable
- * loopback) message. If so, then just ignore it.
- *
- * Next, check to see if the entry has completed DAD. If not, then we've
- * failed, because someone is already using the address. Notify IP of the DAD
- * failure and remove the broken ace.
- *
- * Next, we check if we're the authority for this address. If so, then it's
- * time to defend it, because the other node is a duplicate. Report it as a
- * 'bogon' and let IP decide how to defend.
- *
- * Finally, if it's unresolved or if the arls match, we just update the MAC
- * address. This allows a published 'static' entry to be updated by an ARP
- * request from the node for which we're a proxy ARP server.
- *
- * Note that this logic does not update published ARP entries for mismatched
- * arls, as for example when we proxy arp across 2 subnets with differing
- * subnet masks.
- *
- * Return Values below
- */
-
-#define AR_NOTFOUND 1 /* No matching ace found in cache */
-#define AR_MERGED 2 /* Matching ace updated (RFC 826 Merge_flag) */
-#define AR_LOOPBACK 3 /* Our own arp packet was received */
-#define AR_BOGON 4 /* Another host has our IP addr. */
-#define AR_FAILED 5 /* Duplicate Address Detection has failed */
-#define AR_CHANGED 6 /* Address has changed; tell IP (and merged) */
-
-static int
-ar_ce_resolve_all(arl_t *arl, uint32_t proto, const uchar_t *src_haddr,
- uint32_t hlen, const uchar_t *src_paddr, uint32_t plen, arl_t **ace_arlp)
-{
- ace_t *ace;
- ace_t *ace_next;
- int i1;
- const uchar_t *paddr;
- uchar_t *ace_addr;
- uchar_t *mask;
- int retv = AR_NOTFOUND;
- arp_stack_t *as = ARL_TO_ARPSTACK(arl);
-
- ace = *ar_ce_hash(as, proto, src_paddr, plen);
- for (; ace != NULL; ace = ace_next) {
-
- /* ar_ce_resolve may delete the ace; fetch next pointer now */
- ace_next = ace->ace_next;
-
- if (ace->ace_proto_addr_length != plen ||
- ace->ace_proto != proto) {
- continue;
- }
-
- /*
- * Note that the ace_proto_mask is applied to the proto_addr
- * before comparing to the ace_addr.
- */
- paddr = src_paddr;
- i1 = plen;
- ace_addr = ace->ace_proto_addr;
- mask = ace->ace_proto_mask;
- while (--i1 >= 0) {
- if ((*paddr++ & *mask++) != *ace_addr++)
- break;
- }
- if (i1 >= 0)
- continue;
-
- *ace_arlp = ace->ace_arl;
-
- /*
- * If the IP address is ours, and the hardware address matches
- * one of our own arls, then this is a broadcast packet
- * emitted by one of our interfaces, reflected by the switch
- * and received on another interface. We return AR_LOOPBACK.
- */
- if (ace->ace_flags & ACE_F_MYADDR) {
- arl_t *hw_arl = as->as_arl_head;
- arlphy_t *ap;
-
- for (; hw_arl != NULL; hw_arl = hw_arl->arl_next) {
- ap = hw_arl->arl_phy;
- if (ap != NULL && ap->ap_hw_addrlen == hlen &&
- bcmp(ap->ap_hw_addr, src_haddr, hlen) == 0)
- return (AR_LOOPBACK);
- }
- }
-
- /*
- * If the entry is unverified, then we've just verified that
- * someone else already owns this address, because this is a
- * message with the same protocol address but different
- * hardware address. NOTE: the ace_xmit_arl check ensures we
- * don't send duplicate AR_FAILEDs if arl is in an IPMP group.
- */
- if ((ace->ace_flags & ACE_F_UNVERIFIED) &&
- arl == ace->ace_xmit_arl) {
- ar_ce_delete(ace);
- return (AR_FAILED);
- }
-
- /*
- * If the IP address matches ours and we're authoritative for
- * this entry, then some other node is using our IP addr, so
- * return AR_BOGON. Also reset the transmit count to zero so
- * that, if we're currently in initial announcement mode, we
- * switch back to the lazier defense mode. Knowing that
- * there's at least one duplicate out there, we ought not
- * blindly announce. NOTE: the ace_xmit_arl check ensures we
- * don't send duplicate AR_BOGONs if arl is in an IPMP group.
- */
- if ((ace->ace_flags & ACE_F_AUTHORITY) &&
- arl == ace->ace_xmit_arl) {
- ace->ace_xmit_count = 0;
- return (AR_BOGON);
- }
-
- /*
- * Only update this ACE if it's on the same network -- i.e.,
- * it's for our ARL or another ARL in the same IPMP group.
- */
- if (ace->ace_arl == arl || ace->ace_arl == arl->arl_ipmp_arl) {
- if (ar_ce_resolve(ace, src_haddr, hlen))
- retv = AR_CHANGED;
- else if (retv == AR_NOTFOUND)
- retv = AR_MERGED;
- }
- }
-
- if (retv == AR_NOTFOUND)
- *ace_arlp = NULL;
- return (retv);
-}
-
-/* Pass arg1 to the pfi supplied, along with each ace in existence. */
-static void
-ar_ce_walk(arp_stack_t *as, void (*pfi)(ace_t *, void *), void *arg1)
-{
- ace_t *ace;
- ace_t *ace1;
- int i;
-
- for (i = 0; i < ARP_HASH_SIZE; i++) {
- /*
- * We walk the hash chain in a way that allows the current
- * ace to get blown off by the called routine.
- */
- for (ace = as->as_ce_hash_tbl[i]; ace; ace = ace1) {
- ace1 = ace->ace_next;
- (*pfi)(ace, arg1);
- }
- }
- for (ace = as->as_ce_mask_entries; ace; ace = ace1) {
- ace1 = ace->ace_next;
- (*pfi)(ace, arg1);
- }
-}
-
-/*
- * Send a copy of interesting packets to the corresponding IP instance.
- * The corresponding IP instance is the ARP-IP-DEV instance for this
- * DEV (i.e. ARL).
- */
-static void
-ar_client_notify(const arl_t *arl, mblk_t *mp, int code)
-{
- ar_t *ar = ((ar_t *)arl->arl_rq->q_ptr)->ar_arl_ip_assoc;
- arcn_t *arcn;
- mblk_t *mp1;
- int arl_namelen = strlen(arl->arl_name) + 1;
-
- /* Looks like the association disappeared */
- if (ar == NULL) {
- freemsg(mp);
- return;
- }
-
- /* ar is the corresponding ARP-IP instance for this ARL */
- ASSERT(ar->ar_arl == NULL && ar->ar_wq->q_next != NULL);
-
- mp1 = allocb(sizeof (arcn_t) + arl_namelen, BPRI_MED);
- if (mp1 == NULL) {
- freemsg(mp);
- return;
- }
- DB_TYPE(mp1) = M_CTL;
- mp1->b_cont = mp;
- arcn = (arcn_t *)mp1->b_rptr;
- mp1->b_wptr = (uchar_t *)&arcn[1] + arl_namelen;
- arcn->arcn_cmd = AR_CLIENT_NOTIFY;
- arcn->arcn_name_offset = sizeof (arcn_t);
- arcn->arcn_name_length = arl_namelen;
- arcn->arcn_code = code;
- bcopy(arl->arl_name, &arcn[1], arl_namelen);
-
- putnext(ar->ar_wq, mp1);
-}
-
-/*
- * Send a delete-notify message down to IP. We've determined that IP doesn't
- * have a cache entry for the IP address itself, but it may have other cache
- * entries with the same hardware address, and we don't want to see those grow
- * stale. (The alternative is sending down updates for every ARP message we
- * get that doesn't match an existing ace. That's much more expensive than an
- * occasional delete and reload.)
- */
-static void
-ar_delete_notify(const ace_t *ace)
-{
- const arl_t *arl = ace->ace_arl;
- const arlphy_t *ap = ace->ace_xmit_arl->arl_phy;
- mblk_t *mp;
- size_t len;
- arh_t *arh;
-
- len = sizeof (*arh) + 2 * ace->ace_proto_addr_length;
- mp = allocb(len, BPRI_MED);
- if (mp == NULL)
- return;
- arh = (arh_t *)mp->b_rptr;
- mp->b_wptr = (uchar_t *)arh + len;
- U16_TO_BE16(ap->ap_arp_hw_type, arh->arh_hardware);
- U16_TO_BE16(ace->ace_proto, arh->arh_proto);
- arh->arh_hlen = 0;
- arh->arh_plen = ace->ace_proto_addr_length;
- U16_TO_BE16(ARP_RESPONSE, arh->arh_operation);
- bcopy(ace->ace_proto_addr, arh + 1, ace->ace_proto_addr_length);
- bcopy(ace->ace_proto_addr, (uchar_t *)(arh + 1) +
- ace->ace_proto_addr_length, ace->ace_proto_addr_length);
- ar_client_notify(arl, mp, AR_CN_ANNOUNCE);
-}
-
-/* ARP module close routine. */
-static int
-ar_close(queue_t *q)
-{
- ar_t *ar = (ar_t *)q->q_ptr;
- char name[LIFNAMSIZ];
- arl_t *arl, *xarl;
- arl_t **arlp;
- cred_t *cr;
- arc_t *arc;
- mblk_t *mp1;
- int index;
- arp_stack_t *as = ar->ar_as;
-
- TRACE_1(TR_FAC_ARP, TR_ARP_CLOSE,
- "arp_close: q %p", q);
-
- arl = ar->ar_arl;
- if (arl == NULL) {
- index = 0;
- /*
- * If this is the <ARP-IP-Driver> stream send down
- * a closing message to IP and wait for IP to send
- * an ack. This helps to make sure that messages
- * that are currently being sent up by IP are not lost.
- */
- if (ar->ar_on_ill_stream) {
- mp1 = allocb(sizeof (arc_t), BPRI_MED);
- if (mp1 != NULL) {
- DB_TYPE(mp1) = M_CTL;
- arc = (arc_t *)mp1->b_rptr;
- mp1->b_wptr = mp1->b_rptr + sizeof (arc_t);
- arc->arc_cmd = AR_ARP_CLOSING;
- putnext(WR(q), mp1);
- while (!ar->ar_ip_acked_close)
- /* If we are interrupted break out */
- if (qwait_sig(q) == 0)
- break;
- }
- }
- /* Delete all our pending queries, 'arl' is not dereferenced */
- ar_ce_walk(as, ar_query_delete, ar);
- /*
- * The request could be pending on some arl_queue also. This
- * happens if the arl is not yet bound, and bind is pending.
- */
- ar_ll_cleanup_arl_queue(q);
- } else {
- index = arl->arl_index;
- (void) strcpy(name, arl->arl_name);
- arl->arl_closing = 1;
- while (arl->arl_queue != NULL)
- qwait(arl->arl_rq);
-
- if (arl->arl_state == ARL_S_UP)
- ar_ll_down(arl);
-
- while (arl->arl_state != ARL_S_DOWN)
- qwait(arl->arl_rq);
-
- if (arl->arl_flags & ARL_F_IPMP) {
- /*
- * Though rude, someone could force the IPMP arl
- * closed without removing the underlying interfaces.
- * In that case, force the ARLs out of the group.
- */
- xarl = as->as_arl_head;
- for (; xarl != NULL; xarl = xarl->arl_next) {
- if (xarl->arl_ipmp_arl != arl || xarl == arl)
- continue;
- ar_ce_walk(as, ar_ce_ipmp_deactivate, xarl);
- xarl->arl_ipmp_arl = NULL;
- }
- }
-
- ar_ll_clear_defaults(arl);
- /*
- * If this is the control stream for an arl, delete anything
- * hanging off our arl.
- */
- ar_ce_walk(as, ar_ce_delete_per_arl, arl);
- /* Free any messages waiting for a bind_ack */
- /* Get the arl out of the chain. */
- rw_enter(&as->as_arl_lock, RW_WRITER);
- for (arlp = &as->as_arl_head; *arlp;
- arlp = &(*arlp)->arl_next) {
- if (*arlp == arl) {
- *arlp = arl->arl_next;
- break;
- }
- }
-
- ASSERT(arl->arl_dlpi_deferred == NULL);
- ar->ar_arl = NULL;
- rw_exit(&as->as_arl_lock);
-
- mi_free((char *)arl);
- }
- /* Let's break the association between an ARL and IP instance */
- if (ar->ar_arl_ip_assoc != NULL) {
- ASSERT(ar->ar_arl_ip_assoc->ar_arl_ip_assoc != NULL &&
- ar->ar_arl_ip_assoc->ar_arl_ip_assoc == ar);
- ar->ar_arl_ip_assoc->ar_arl_ip_assoc = NULL;
- ar->ar_arl_ip_assoc = NULL;
- }
- cr = ar->ar_credp;
- /* mi_close_comm frees the instance data. */
- (void) mi_close_comm(&as->as_head, q);
- qprocsoff(q);
- crfree(cr);
-
- if (index != 0) {
- hook_nic_event_t info;
-
- info.hne_nic = index;
- info.hne_lif = 0;
- info.hne_event = NE_UNPLUMB;
- info.hne_data = name;
- info.hne_datalen = strlen(name);
- (void) hook_run(as->as_net_data->netd_hooks,
- as->as_arpnicevents, (hook_data_t)&info);
- }
- netstack_rele(as->as_netstack);
- return (0);
-}
-
-/*
- * Dispatch routine for ARP commands. This routine can be called out of
- * either ar_wput or ar_rput, in response to IOCTLs or M_PROTO messages.
- */
-/* TODO: error reporting for M_PROTO case */
-static int
-ar_cmd_dispatch(queue_t *q, mblk_t *mp_orig, boolean_t from_wput)
-{
- arct_t *arct;
- uint32_t cmd;
- ssize_t len;
- mblk_t *mp = mp_orig;
- cred_t *cr = NULL;
-
- if (!mp)
- return (ENOENT);
-
- /* We get both M_PROTO and M_IOCTL messages, so watch out! */
- if (DB_TYPE(mp) == M_IOCTL) {
- struct iocblk *ioc;
- ioc = (struct iocblk *)mp->b_rptr;
- cmd = ioc->ioc_cmd;
- cr = ioc->ioc_cr;
- mp = mp->b_cont;
- if (!mp)
- return (ENOENT);
- } else {
- cr = msg_getcred(mp, NULL);
- /* For initial messages beteen IP and ARP, cr can be NULL */
- if (cr == NULL)
- cr = ((ar_t *)q->q_ptr)->ar_credp;
- }
- len = MBLKL(mp);
- if (len < sizeof (uint32_t) || !OK_32PTR(mp->b_rptr))
- return (ENOENT);
- if (mp_orig == mp)
- cmd = *(uint32_t *)mp->b_rptr;
- for (arct = ar_cmd_tbl; ; arct++) {
- if (arct >= A_END(ar_cmd_tbl))
- return (ENOENT);
- if (arct->arct_cmd == cmd)
- break;
- }
- if (len < arct->arct_min_len) {
- /*
- * If the command is exclusive to ARP, we return EINVAL,
- * else we need to pass the command downstream, so return
- * ENOENT
- */
- return ((arct->arct_flags & ARF_ONLY_CMD) ? EINVAL : ENOENT);
- }
- if (arct->arct_priv_req != OP_NP) {
- int error;
-
- if ((error = secpolicy_ip(cr, arct->arct_priv_req,
- B_FALSE)) != 0)
- return (error);
- }
- /* Disallow many commands except if from rput i.e. from IP */
- if (from_wput && !(arct->arct_flags & ARF_WPUT_OK)) {
- return (EINVAL);
- }
-
- if (arct->arct_flags & ARF_IOCTL_AWARE)
- mp = mp_orig;
-
- DTRACE_PROBE3(cmd_dispatch, queue_t *, q, mblk_t *, mp,
- arct_t *, arct);
- return (*arct->arct_pfi)(q, mp);
-}
-
-/* Allocate and do common initializations for DLPI messages. */
-static mblk_t *
-ar_dlpi_comm(t_uscalar_t prim, size_t size)
-{
- mblk_t *mp;
-
- if ((mp = allocb(size, BPRI_HI)) == NULL)
- return (NULL);
-
- /*
- * DLPIv2 says that DL_INFO_REQ and DL_TOKEN_REQ (the latter
- * of which we don't seem to use) are sent with M_PCPROTO, and
- * that other DLPI are M_PROTO.
- */
- DB_TYPE(mp) = (prim == DL_INFO_REQ) ? M_PCPROTO : M_PROTO;
-
- mp->b_wptr = mp->b_rptr + size;
- bzero(mp->b_rptr, size);
- ((union DL_primitives *)mp->b_rptr)->dl_primitive = prim;
-
- return (mp);
-}
-
-static void
-ar_dlpi_dispatch(arl_t *arl)
-{
- mblk_t *mp;
- t_uscalar_t primitive = DL_PRIM_INVAL;
-
- while (((mp = arl->arl_dlpi_deferred) != NULL) &&
- (arl->arl_dlpi_pending == DL_PRIM_INVAL)) {
- union DL_primitives *dlp = (union DL_primitives *)mp->b_rptr;
-
- DTRACE_PROBE2(dlpi_dispatch, arl_t *, arl, mblk_t *, mp);
-
- ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
- arl->arl_dlpi_deferred = mp->b_next;
- mp->b_next = NULL;
-
- /*
- * If this is a DL_NOTIFY_CONF, no ack is expected.
- */
- if ((primitive = dlp->dl_primitive) != DL_NOTIFY_CONF)
- arl->arl_dlpi_pending = dlp->dl_primitive;
- putnext(arl->arl_wq, mp);
- }
-
- if (arl->arl_dlpi_pending == DL_PRIM_INVAL) {
- /*
- * No pending DLPI operation.
- */
- ASSERT(mp == NULL);
- DTRACE_PROBE1(dlpi_idle, arl_t *, arl);
-
- /*
- * If the last DLPI message dispatched is DL_NOTIFY_CONF,
- * it is not assoicated with any pending cmd request, drain
- * the rest of pending cmd requests, otherwise call
- * ar_cmd_done() to finish up the current pending cmd
- * operation.
- */
- if (primitive == DL_NOTIFY_CONF)
- ar_cmd_drain(arl);
- else
- ar_cmd_done(arl);
- } else if (mp != NULL) {
- DTRACE_PROBE2(dlpi_defer, arl_t *, arl, mblk_t *, mp);
- }
-}
-
-/*
- * The following two functions serialize DLPI messages to the driver, much
- * along the lines of ill_dlpi_send and ill_dlpi_done in IP. Basically,
- * we wait for a DLPI message, sent downstream, to be acked before sending
- * the next. If there are DLPI messages that have not yet been sent, queue
- * this message (mp), else send it downstream.
- */
-static void
-ar_dlpi_send(arl_t *arl, mblk_t *mp)
-{
- mblk_t **mpp;
-
- ASSERT(arl != NULL);
- ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
-
- /* Always queue the message. Tail insertion */
- mpp = &arl->arl_dlpi_deferred;
- while (*mpp != NULL)
- mpp = &((*mpp)->b_next);
- *mpp = mp;
-
- ar_dlpi_dispatch(arl);
-}
-
-/*
- * Called when an DLPI control message has been acked; send down the next
- * queued message (if any).
- * The DLPI messages of interest being bind, attach, unbind and detach since
- * these are the only ones sent by ARP via ar_dlpi_send.
- */
-static void
-ar_dlpi_done(arl_t *arl, t_uscalar_t prim)
-{
- if (arl->arl_dlpi_pending != prim) {
- DTRACE_PROBE2(dlpi_done_unexpected, arl_t *, arl,
- t_uscalar_t, prim);
- return;
- }
-
- DTRACE_PROBE2(dlpi_done, arl_t *, arl, t_uscalar_t, prim);
- arl->arl_dlpi_pending = DL_PRIM_INVAL;
- ar_dlpi_dispatch(arl);
-}
-
-/*
- * Send a DL_NOTE_REPLUMB_DONE message down to the driver to indicate
- * the replumb process has already been done. Note that mp is either a
- * DL_NOTIFY_IND message or an AR_INTERFACE_DOWN message (comes from IP).
- */
-static void
-arp_replumb_done(arl_t *arl, mblk_t *mp)
-{
- ASSERT(arl->arl_state == ARL_S_DOWN && arl->arl_replumbing);
-
- mp = mexchange(NULL, mp, sizeof (dl_notify_conf_t), M_PROTO,
- DL_NOTIFY_CONF);
- ((dl_notify_conf_t *)(mp->b_rptr))->dl_notification =
- DL_NOTE_REPLUMB_DONE;
- arl->arl_replumbing = B_FALSE;
- ar_dlpi_send(arl, mp);
-}
-
-static void
-ar_cmd_drain(arl_t *arl)
-{
- mblk_t *mp;
- queue_t *q;
-
- /*
- * Run the commands that have been enqueued while we were waiting
- * for the last command (AR_INTERFACE_UP or AR_INTERFACE_DOWN)
- * to complete.
- */
- while ((mp = arl->arl_queue) != NULL) {
- if (((uintptr_t)mp->b_prev & CMD_IN_PROGRESS) != 0) {
- /*
- * The current command is an AR_INTERFACE_UP or
- * AR_INTERFACE_DOWN and is waiting for a DLPI ack
- * from the driver. Return. We can't make progress now.
- */
- break;
- }
-
- mp = ar_cmd_dequeue(arl);
- mp->b_prev = AR_DRAINING;
- q = mp->b_queue;
- mp->b_queue = NULL;
-
- /*
- * Don't call put(q, mp) since it can lead to reorder of
- * messages by sending the current messages to the end of
- * arp's syncq
- */
- if (q->q_flag & QREADR)
- ar_rput(q, mp);
- else
- ar_wput(q, mp);
- }
-}
-
-static void
-ar_cmd_done(arl_t *arl)
-{
- mblk_t *mp;
- int cmd;
- int err;
- mblk_t *mp1;
- mblk_t *dlpi_op_done_mp = NULL;
- queue_t *dlpi_op_done_q;
- ar_t *ar_arl;
- ar_t *ar_ip;
-
- ASSERT(arl->arl_state == ARL_S_UP || arl->arl_state == ARL_S_DOWN);
-
- /*
- * If the current operation was initiated by IP there must be
- * an op enqueued in arl_queue. But if ar_close has sent down
- * a detach/unbind, there is no command enqueued. Also if the IP-ARP
- * stream has closed the cleanup would be done and there won't be any mp
- */
- if ((mp = arl->arl_queue) == NULL)
- return;
-
- if ((cmd = (uintptr_t)mp->b_prev) & CMD_IN_PROGRESS) {
- mp1 = ar_cmd_dequeue(arl);
- ASSERT(mp == mp1);
-
- cmd &= ~CMD_IN_PROGRESS;
- if (cmd == AR_INTERFACE_UP) {
- /*
- * There is an ioctl waiting for us...
- */
- if (arl->arl_state == ARL_S_UP)
- err = 0;
- else
- err = EINVAL;
-
- dlpi_op_done_mp = ar_alloc(AR_DLPIOP_DONE, err);
- if (dlpi_op_done_mp != NULL) {
- /*
- * Better performance if we send the response
- * after the potential MAPPING_ADDs command
- * that are likely to follow. (Do it below the
- * while loop, instead of putnext right now)
- */
- dlpi_op_done_q = WR(mp->b_queue);
- }
-
- if (err == 0) {
- /*
- * Now that we have the ARL instance
- * corresponding to the IP instance let's make
- * the association here.
- */
- ar_ip = (ar_t *)mp->b_queue->q_ptr;
- ar_arl = (ar_t *)arl->arl_rq->q_ptr;
- ar_arl->ar_arl_ip_assoc = ar_ip;
- ar_ip->ar_arl_ip_assoc = ar_arl;
- }
-
- inet_freemsg(mp);
- } else if (cmd == AR_INTERFACE_DOWN && arl->arl_replumbing) {
- /*
- * The arl is successfully brought down and this is
- * a result of the DL_NOTE_REPLUMB process. Reset
- * mp->b_prev first (it keeps the 'cmd' information
- * at this point).
- */
- mp->b_prev = NULL;
- arp_replumb_done(arl, mp);
- } else {
- inet_freemsg(mp);
- }
- }
-
- ar_cmd_drain(arl);
-
- if (dlpi_op_done_mp != NULL) {
- DTRACE_PROBE3(cmd_done_next, arl_t *, arl,
- queue_t *, dlpi_op_done_q, mblk_t *, dlpi_op_done_mp);
- putnext(dlpi_op_done_q, dlpi_op_done_mp);
- }
-}
-
-/*
- * Queue all arp commands coming from clients. Typically these commands
- * come from IP, but could also come from other clients. The commands
- * are serviced in FIFO order. Some commands need to wait and restart
- * after the DLPI response from the driver is received. Typically
- * AR_INTERFACE_UP and AR_INTERFACE_DOWN. ar_dlpi_done restarts
- * the command and then dequeues the queue at arl_queue and calls ar_rput
- * or ar_wput for each enqueued command. AR_DRAINING is used to signify
- * that the command is being executed thru a drain from ar_dlpi_done.
- * Functions handling the individual commands such as ar_entry_add
- * check for this flag in b_prev to determine whether the command has
- * to be enqueued for later processing or must be processed now.
- *
- * b_next used to thread the enqueued command mblks
- * b_queue used to identify the queue of the originating request(client)
- * b_prev used to store the command itself for easy parsing.
- */
-static void
-ar_cmd_enqueue(arl_t *arl, mblk_t *mp, queue_t *q, ushort_t cmd,
- boolean_t tail_insert)
-{
- mp->b_queue = q;
- if (arl->arl_queue == NULL) {
- ASSERT(arl->arl_queue_tail == NULL);
- mp->b_prev = (void *)((uintptr_t)(cmd | CMD_IN_PROGRESS));
- mp->b_next = NULL;
- arl->arl_queue = mp;
- arl->arl_queue_tail = mp;
- } else if (tail_insert) {
- mp->b_prev = (void *)((uintptr_t)cmd);
- mp->b_next = NULL;
- arl->arl_queue_tail->b_next = mp;
- arl->arl_queue_tail = mp;
- } else {
- /* head insert */
- mp->b_prev = (void *)((uintptr_t)cmd | CMD_IN_PROGRESS);
- mp->b_next = arl->arl_queue;
- arl->arl_queue = mp;
- }
-}
-
-static mblk_t *
-ar_cmd_dequeue(arl_t *arl)
-{
- mblk_t *mp;
-
- if (arl->arl_queue == NULL) {
- ASSERT(arl->arl_queue_tail == NULL);
- return (NULL);
- }
- mp = arl->arl_queue;
- arl->arl_queue = mp->b_next;
- if (arl->arl_queue == NULL)
- arl->arl_queue_tail = NULL;
- mp->b_next = NULL;
- return (mp);
-}
-
-/*
- * Standard ACE timer handling: compute 'fuzz' around a central value or from 0
- * up to a value, and then set the timer. The randomization is necessary to
- * prevent groups of systems from falling into synchronization on the network
- * and producing ARP packet storms.
- */
-static void
-ace_set_timer(ace_t *ace, boolean_t initial_time)
-{
- clock_t intv, rnd, frac;
-
- (void) random_get_pseudo_bytes((uint8_t *)&rnd, sizeof (rnd));
- /* Note that clock_t is signed; must chop off bits */
- rnd &= (1ul << (NBBY * sizeof (rnd) - 1)) - 1;
- intv = ace->ace_xmit_interval;
- if (initial_time) {
- /* Set intv to be anywhere in the [1 .. intv] range */
- if (intv <= 0)
- intv = 1;
- else
- intv = (rnd % intv) + 1;
- } else {
- /* Compute 'frac' as 20% of the configured interval */
- if ((frac = intv / 5) <= 1)
- frac = 2;
- /* Set intv randomly in the range [intv-frac .. intv+frac] */
- if ((intv = intv - frac + rnd % (2 * frac + 1)) <= 0)
- intv = 1;
- }
- mi_timer(ace->ace_arl->arl_wq, ace->ace_mp, intv);
-}
-
-/*
- * Process entry add requests from external messages.
- * It is also called by ip_rput_dlpi_writer() through
- * ipif_resolver_up() to change hardware address when
- * an asynchronous hardware address change notification
- * arrives from the driver.
- */
-static int
-ar_entry_add(queue_t *q, mblk_t *mp_orig)
-{
- area_t *area;
- ace_t *ace;
- uchar_t *hw_addr;
- uint32_t hw_addr_len;
- uchar_t *proto_addr;
- uint32_t proto_addr_len;
- uchar_t *proto_mask;
- arl_t *arl;
- mblk_t *mp = mp_orig;
- int err;
- uint_t aflags;
- boolean_t unverified;
- arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as;
-
- /* We handle both M_IOCTL and M_PROTO messages. */
- if (DB_TYPE(mp) == M_IOCTL)
- mp = mp->b_cont;
- arl = ar_ll_lookup_from_mp(as, mp);
- if (arl == NULL)
- return (EINVAL);
- /*
- * Newly received commands from clients go to the tail of the queue.
- */
- if (CMD_NEEDS_QUEUEING(mp_orig, arl)) {
- DTRACE_PROBE3(eadd_enqueued, queue_t *, q, mblk_t *, mp_orig,
- arl_t *, arl);
- ar_cmd_enqueue(arl, mp_orig, q, AR_ENTRY_ADD, B_TRUE);
- return (EINPROGRESS);
- }
- mp_orig->b_prev = NULL;
-
- area = (area_t *)mp->b_rptr;
- aflags = area->area_flags;
-
- /*
- * If the previous entry wasn't published and we are now going
- * to publish, then we need to do address verification. The previous
- * entry may have been a local unpublished address or even an external
- * address. If the entry we find was in an unverified state we retain
- * this.
- * If it's a new published entry, then we're obligated to do
- * duplicate address detection now.
- */
- ace = ar_ce_lookup_from_area(as, mp, ar_ce_lookup_entry);
- if (ace != NULL) {
- unverified = !(ace->ace_flags & ACE_F_PUBLISH) &&
- (aflags & ACE_F_PUBLISH);
- if (ace->ace_flags & ACE_F_UNVERIFIED)
- unverified = B_TRUE;
- ar_ce_delete(ace);
- } else {
- unverified = (aflags & ACE_F_PUBLISH) != 0;
- }
-
- /* Allow client to request DAD restart */
- if (aflags & ACE_F_UNVERIFIED)
- unverified = B_TRUE;
-
- /* Extract parameters from the message. */
- hw_addr_len = area->area_hw_addr_length;
- hw_addr = mi_offset_paramc(mp, area->area_hw_addr_offset, hw_addr_len);
- proto_addr_len = area->area_proto_addr_length;
- proto_addr = mi_offset_paramc(mp, area->area_proto_addr_offset,
- proto_addr_len);
- proto_mask = mi_offset_paramc(mp, area->area_proto_mask_offset,
- proto_addr_len);
- if (proto_mask == NULL) {
- DTRACE_PROBE2(eadd_bad_mask, arl_t *, arl, area_t *, area);
- return (EINVAL);
- }
- err = ar_ce_create(
- arl,
- area->area_proto,
- hw_addr,
- hw_addr_len,
- proto_addr,
- proto_addr_len,
- proto_mask,
- NULL,
- (uint32_t)0,
- NULL,
- aflags & ~ACE_F_MAPPING & ~ACE_F_UNVERIFIED & ~ACE_F_DEFEND);
- if (err != 0) {
- DTRACE_PROBE3(eadd_create_failed, arl_t *, arl, area_t *, area,
- int, err);
- return (err);
- }
-
- if (aflags & ACE_F_PUBLISH) {
- arlphy_t *ap;
-
- ace = ar_ce_lookup(arl, area->area_proto, proto_addr,
- proto_addr_len);
- ASSERT(ace != NULL);
-
- ap = ace->ace_xmit_arl->arl_phy;
-
- if (hw_addr == NULL || hw_addr_len == 0) {
- hw_addr = ap->ap_hw_addr;
- } else if (aflags & ACE_F_MYADDR) {
- /*
- * If hardware address changes, then make sure
- * that the hardware address and hardware
- * address length fields in arlphy_t get updated
- * too. Otherwise, they will continue carrying
- * the old hardware address information.
- */
- ASSERT((hw_addr != NULL) && (hw_addr_len != 0));
- bcopy(hw_addr, ap->ap_hw_addr, hw_addr_len);
- ap->ap_hw_addrlen = hw_addr_len;
- }
-
- if (ace->ace_flags & ACE_F_FAST) {
- ace->ace_xmit_count = as->as_fastprobe_count;
- ace->ace_xmit_interval = as->as_fastprobe_delay;
- } else {
- ace->ace_xmit_count = as->as_probe_count;
- ace->ace_xmit_interval = as->as_probe_delay;
- }
-
- /*
- * If the user has disabled duplicate address detection for
- * this kind of interface (fast or slow) by setting the probe
- * count to zero, then pretend as if we've verified the
- * address, and go right to address defense mode.
- */
- if (ace->ace_xmit_count == 0)
- unverified = B_FALSE;
-
- /*
- * If we need to do duplicate address detection, then kick that
- * off. Otherwise, send out a gratuitous ARP message in order
- * to update everyone's caches with the new hardware address.
- */
- if (unverified) {
- ace->ace_flags |= ACE_F_UNVERIFIED;
- if (ace->ace_xmit_interval == 0) {
- /*
- * User has configured us to send the first
- * probe right away. Do so, and set up for
- * the subsequent probes.
- */
- DTRACE_PROBE2(eadd_probe, ace_t *, ace,
- area_t *, area);
- ar_xmit(ace->ace_xmit_arl, ARP_REQUEST,
- area->area_proto, proto_addr_len,
- hw_addr, NULL, NULL, proto_addr, NULL, as);
- ace->ace_xmit_count--;
- ace->ace_xmit_interval =
- (ace->ace_flags & ACE_F_FAST) ?
- as->as_fastprobe_interval :
- as->as_probe_interval;
- ace_set_timer(ace, B_FALSE);
- } else {
- DTRACE_PROBE2(eadd_delay, ace_t *, ace,
- area_t *, area);
- /* Regular delay before initial probe */
- ace_set_timer(ace, B_TRUE);
- }
- } else {
- DTRACE_PROBE2(eadd_announce, ace_t *, ace,
- area_t *, area);
- ar_xmit(ace->ace_xmit_arl, ARP_REQUEST,
- area->area_proto, proto_addr_len, hw_addr,
- proto_addr, ap->ap_arp_addr, proto_addr, NULL, as);
- ace->ace_last_bcast = ddi_get_lbolt();
-
- /*
- * If AUTHORITY is set, it is not just a proxy arp
- * entry; we believe we're the authority for this
- * entry. In that case, and if we're not just doing
- * one-off defense of the address, we send more than
- * one copy, so we'll still have a good chance of
- * updating everyone even when there's a packet loss
- * or two.
- */
- if ((aflags & ACE_F_AUTHORITY) &&
- !(aflags & ACE_F_DEFEND) &&
- as->as_publish_count > 0) {
- /* Account for the xmit we just did */
- ace->ace_xmit_count = as->as_publish_count - 1;
- ace->ace_xmit_interval =
- as->as_publish_interval;
- if (ace->ace_xmit_count > 0)
- ace_set_timer(ace, B_FALSE);
- }
- }
- }
- return (0);
-}
-
-/* Process entry delete requests from external messages. */
-static int
-ar_entry_delete(queue_t *q, mblk_t *mp_orig)
-{
- ace_t *ace;
- arl_t *arl;
- mblk_t *mp = mp_orig;
- arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as;
-
- /* We handle both M_IOCTL and M_PROTO messages. */
- if (DB_TYPE(mp) == M_IOCTL)
- mp = mp->b_cont;
- arl = ar_ll_lookup_from_mp(as, mp);
- if (arl == NULL)
- return (EINVAL);
- /*
- * Newly received commands from clients go to the tail of the queue.
- */
- if (CMD_NEEDS_QUEUEING(mp_orig, arl)) {
- DTRACE_PROBE3(edel_enqueued, queue_t *, q, mblk_t *, mp_orig,
- arl_t *, arl);
- ar_cmd_enqueue(arl, mp_orig, q, AR_ENTRY_DELETE, B_TRUE);
- return (EINPROGRESS);
- }
- mp_orig->b_prev = NULL;
-
- /*
- * Need to know if it is a mapping or an exact match. Check exact
- * match first.
- */
- ace = ar_ce_lookup_from_area(as, mp, ar_ce_lookup);
- if (ace != NULL) {
- ared_t *ared = (ared_t *)mp->b_rptr;
-
- /*
- * If it's a permanent entry, then the client is the one who
- * told us to delete it, so there's no reason to notify.
- */
- if (ACE_NONPERM(ace))
- ar_delete_notify(ace);
- /*
- * Only delete the ARP entry if it is non-permanent, or
- * ARED_F_PRESERVE_PERM flags is not set.
- */
- if (ACE_NONPERM(ace) ||
- !(ared->ared_flags & ARED_F_PRESERVE_PERM)) {
- ar_ce_delete(ace);
- }
- return (0);
- }
- return (ENXIO);
-}
-
-/*
- * Process entry query requests from external messages.
- * Bump up the ire_stats_freed for all errors except
- * EINPROGRESS - which means the packet has been queued.
- * For all other errors the packet is going to be freed
- * and hence we account for ire being freed if it
- * is a M_PROTO message.
- */
-static int
-ar_entry_query(queue_t *q, mblk_t *mp_orig)
-{
- ace_t *ace;
- areq_t *areq;
- arl_t *arl;
- int err;
- mblk_t *mp = mp_orig;
- uchar_t *proto_addr;
- uchar_t *sender_addr;
- uint32_t proto_addr_len;
- clock_t ms;
- boolean_t is_mproto = B_TRUE;
- arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as;
-
- /* We handle both M_IOCTL and M_PROTO messages. */
- if (DB_TYPE(mp) == M_IOCTL) {
- is_mproto = B_FALSE;
- mp = mp->b_cont;
- }
- arl = ar_ll_lookup_from_mp(as, mp);
- if (arl == NULL) {
- DTRACE_PROBE2(query_no_arl, queue_t *, q, mblk_t *, mp);
- err = EINVAL;
- goto err_ret;
- }
- /*
- * Newly received commands from clients go to the tail of the queue.
- */
- if (CMD_NEEDS_QUEUEING(mp_orig, arl)) {
- DTRACE_PROBE3(query_enqueued, queue_t *, q, mblk_t *, mp_orig,
- arl_t *, arl);
- ar_cmd_enqueue(arl, mp_orig, q, AR_ENTRY_QUERY, B_TRUE);
- return (EINPROGRESS);
- }
- mp_orig->b_prev = NULL;
-
- areq = (areq_t *)mp->b_rptr;
- proto_addr_len = areq->areq_target_addr_length;
- proto_addr = mi_offset_paramc(mp, areq->areq_target_addr_offset,
- proto_addr_len);
- if (proto_addr == NULL) {
- DTRACE_PROBE1(query_illegal_address, areq_t *, areq);
- err = EINVAL;
- goto err_ret;
- }
- /* Stash the reply queue pointer for later use. */
- mp->b_prev = (mblk_t *)OTHERQ(q);
- mp->b_next = NULL;
- if (areq->areq_xmit_interval == 0)
- areq->areq_xmit_interval = AR_DEF_XMIT_INTERVAL;
- ace = ar_ce_lookup(arl, areq->areq_proto, proto_addr, proto_addr_len);
- if (ace != NULL && (ace->ace_flags & ACE_F_OLD)) {
- /*
- * This is a potentially stale entry that IP's asking about.
- * Since IP is asking, it must not have an answer anymore,
- * either due to periodic ARP flush or due to SO_DONTROUTE.
- * Rather than go forward with what we've got, restart
- * resolution.
- */
- DTRACE_PROBE2(query_stale_ace, ace_t *, ace, areq_t *, areq);
- ar_ce_delete(ace);
- ace = NULL;
- }
- if (ace != NULL) {
- mblk_t **mpp;
- uint32_t count = 0;
-
- /*
- * There is already a cache entry. This means there is either
- * a permanent entry, or address resolution is in progress.
- * If the latter, there should be one or more queries queued
- * up. We link the current one in at the end, if there aren't
- * too many outstanding.
- */
- for (mpp = &ace->ace_query_mp; mpp[0]; mpp = &mpp[0]->b_next) {
- if (++count > areq->areq_max_buffered) {
- DTRACE_PROBE2(query_overflow, ace_t *, ace,
- areq_t *, areq);
- mp->b_prev = NULL;
- err = EALREADY;
- goto err_ret;
- }
- }
- /* Put us on the list. */
- mpp[0] = mp;
- if (count != 0) {
- /*
- * If a query was already queued up, then we must not
- * have an answer yet.
- */
- DTRACE_PROBE2(query_in_progress, ace_t *, ace,
- areq_t *, areq);
- return (EINPROGRESS);
- }
- if (ACE_RESOLVED(ace)) {
- /*
- * We have an answer already.
- * Keep a dup of mp since proto_addr points to it
- * and mp has been placed on the ace_query_mp list.
- */
- mblk_t *mp1;
-
- DTRACE_PROBE2(query_resolved, ace_t *, ace,
- areq_t *, areq);
- mp1 = dupmsg(mp);
- ar_query_reply(ace, 0, proto_addr, proto_addr_len);
- freemsg(mp1);
- return (EINPROGRESS);
- }
- if (ace->ace_flags & ACE_F_MAPPING) {
- /* Should never happen */
- DTRACE_PROBE2(query_unresolved_mapping, ace_t *, ace,
- areq_t *, areq);
- mpp[0] = mp->b_next;
- err = ENXIO;
- goto err_ret;
- }
- DTRACE_PROBE2(query_unresolved, ace_t, ace, areq_t *, areq);
- } else {
- /* No ace yet. Make one now. (This is the common case.) */
- if (areq->areq_xmit_count == 0) {
- DTRACE_PROBE2(query_template, arl_t *, arl,
- areq_t *, areq);
- mp->b_prev = NULL;
- err = ENXIO;
- goto err_ret;
- }
- /*
- * Check for sender addr being NULL or not before
- * we create the ace. It is easy to cleanup later.
- */
- sender_addr = mi_offset_paramc(mp,
- areq->areq_sender_addr_offset,
- areq->areq_sender_addr_length);
- if (sender_addr == NULL) {
- DTRACE_PROBE2(query_no_sender, arl_t *, arl,
- areq_t *, areq);
- mp->b_prev = NULL;
- err = EINVAL;
- goto err_ret;
- }
- err = ar_ce_create(OWNING_ARL(arl), areq->areq_proto, NULL, 0,
- proto_addr, proto_addr_len, NULL,
- NULL, (uint32_t)0, sender_addr,
- areq->areq_flags);
- if (err != 0) {
- DTRACE_PROBE3(query_create_failed, arl_t *, arl,
- areq_t *, areq, int, err);
- mp->b_prev = NULL;
- goto err_ret;
- }
- ace = ar_ce_lookup(arl, areq->areq_proto, proto_addr,
- proto_addr_len);
- if (ace == NULL || ace->ace_query_mp != NULL) {
- /* Shouldn't happen! */
- DTRACE_PROBE3(query_lookup_failed, arl_t *, arl,
- areq_t *, areq, ace_t *, ace);
- mp->b_prev = NULL;
- err = ENXIO;
- goto err_ret;
- }
- ace->ace_query_mp = mp;
- }
- ms = ar_query_xmit(as, ace);
- if (ms == 0) {
- /* Immediate reply requested. */
- ar_query_reply(ace, ENXIO, NULL, (uint32_t)0);
- } else {
- mi_timer(ace->ace_arl->arl_wq, ace->ace_mp, ms);
- }
- return (EINPROGRESS);
-err_ret:
- if (is_mproto) {
- ip_stack_t *ipst = as->as_netstack->netstack_ip;
-
- BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_freed);
- }
- return (err);
-}
-
-/* Handle simple query requests. */
-static int
-ar_entry_squery(queue_t *q, mblk_t *mp_orig)
-{
- ace_t *ace;
- area_t *area;
- arl_t *arl;
- uchar_t *hw_addr;
- uint32_t hw_addr_len;
- mblk_t *mp = mp_orig;
- uchar_t *proto_addr;
- int proto_addr_len;
- arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as;
-
- if (DB_TYPE(mp) == M_IOCTL)
- mp = mp->b_cont;
- arl = ar_ll_lookup_from_mp(as, mp);
- if (arl == NULL)
- return (EINVAL);
- /*
- * Newly received commands from clients go to the tail of the queue.
- */
- if (CMD_NEEDS_QUEUEING(mp_orig, arl)) {
- DTRACE_PROBE3(squery_enqueued, queue_t *, q, mblk_t *, mp_orig,
- arl_t *, arl);
- ar_cmd_enqueue(arl, mp_orig, q, AR_ENTRY_SQUERY, B_TRUE);
- return (EINPROGRESS);
- }
- mp_orig->b_prev = NULL;
-
- /* Extract parameters from the request message. */
- area = (area_t *)mp->b_rptr;
- proto_addr_len = area->area_proto_addr_length;
- proto_addr = mi_offset_paramc(mp, area->area_proto_addr_offset,
- proto_addr_len);
- hw_addr_len = area->area_hw_addr_length;
- hw_addr = mi_offset_paramc(mp, area->area_hw_addr_offset, hw_addr_len);
- if (proto_addr == NULL || hw_addr == NULL) {
- DTRACE_PROBE1(squery_illegal_address, area_t *, area);
- return (EINVAL);
- }
- ace = ar_ce_lookup(arl, area->area_proto, proto_addr, proto_addr_len);
- if (ace == NULL) {
- return (ENXIO);
- }
- if (hw_addr_len < ace->ace_hw_addr_length) {
- return (EINVAL);
- }
- if (ACE_RESOLVED(ace)) {
- /* Got it, prepare the response. */
- ASSERT(area->area_hw_addr_length == ace->ace_hw_addr_length);
- ar_set_address(ace, hw_addr, proto_addr, proto_addr_len);
- } else {
- /*
- * We have an incomplete entry. Set the length to zero and
- * just return out the flags.
- */
- area->area_hw_addr_length = 0;
- }
- area->area_flags = ace->ace_flags;
- if (mp == mp_orig) {
- /* Non-ioctl case */
- /* TODO: change message type? */
- DB_TYPE(mp) = M_CTL; /* Caught by ip_wput */
- DTRACE_PROBE3(squery_reply, queue_t *, q, mblk_t *, mp,
- arl_t *, arl);
- qreply(q, mp);
- return (EINPROGRESS);
- }
- return (0);
-}
-
-/* Process an interface down causing us to detach and unbind. */
-/* ARGSUSED */
-static int
-ar_interface_down(queue_t *q, mblk_t *mp)
-{
- arl_t *arl;
- arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as;
-
- arl = ar_ll_lookup_from_mp(as, mp);
- if (arl == NULL || arl->arl_closing) {
- DTRACE_PROBE2(down_no_arl, queue_t *, q, mblk_t *, mp);
- return (EINVAL);
- }
-
- /*
- * Newly received commands from clients go to the tail of the queue.
- */
- if (CMD_NEEDS_QUEUEING(mp, arl)) {
- DTRACE_PROBE3(down_enqueued, queue_t *, q, mblk_t *, mp,
- arl_t *, arl);
- ar_cmd_enqueue(arl, mp, q, AR_INTERFACE_DOWN, B_TRUE);
- return (EINPROGRESS);
- }
- mp->b_prev = NULL;
- /*
- * The arl is already down, no work to do.
- */
- if (arl->arl_state == ARL_S_DOWN) {
- if (arl->arl_replumbing) {
- /*
- * The arl is already down and this is a result of
- * the DL_NOTE_REPLUMB process. Return EINPROGRESS
- * so this mp won't be freed by ar_rput().
- */
- arp_replumb_done(arl, mp);
- return (EINPROGRESS);
- } else {
- /* ar_rput frees the mp */
- return (0);
- }
- }
-
- /*
- * This command cannot complete in a single shot now itself.
- * It has to be restarted after the receipt of the ack from
- * the driver. So we need to enqueue the command (at the head).
- */
- ar_cmd_enqueue(arl, mp, q, AR_INTERFACE_DOWN, B_FALSE);
-
- ASSERT(arl->arl_state == ARL_S_UP);
-
- /* Free all arp entries for this interface */
- ar_ce_walk(as, ar_ce_delete_per_arl, arl);
-
- ar_ll_down(arl);
- /* Return EINPROGRESS so that ar_rput does not free the 'mp' */
- return (EINPROGRESS);
-}
-
-
-/* Process an interface up causing the info req sequence to start. */
-/* ARGSUSED */
-static int
-ar_interface_up(queue_t *q, mblk_t *mp)
-{
- arl_t *arl;
- int err;
- mblk_t *mp1;
- arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as;
-
- arl = ar_ll_lookup_from_mp(as, mp);
- if (arl == NULL || arl->arl_closing) {
- DTRACE_PROBE2(up_no_arl, queue_t *, q, mblk_t *, mp);
- err = EINVAL;
- goto done;
- }
-
- /*
- * Newly received commands from clients go to the tail of the queue.
- */
- if (CMD_NEEDS_QUEUEING(mp, arl)) {
- DTRACE_PROBE3(up_enqueued, queue_t *, q, mblk_t *, mp,
- arl_t *, arl);
- ar_cmd_enqueue(arl, mp, q, AR_INTERFACE_UP, B_TRUE);
- return (EINPROGRESS);
- }
- mp->b_prev = NULL;
-
- /*
- * The arl is already up. No work to do.
- */
- if (arl->arl_state == ARL_S_UP) {
- err = 0;
- goto done;
- }
-
- /*
- * This command cannot complete in a single shot now itself.
- * It has to be restarted after the receipt of the ack from
- * the driver. So we need to enqueue the command (at the head).
- */
- ar_cmd_enqueue(arl, mp, q, AR_INTERFACE_UP, B_FALSE);
-
- err = ar_ll_up(arl);
-
- /* Return EINPROGRESS so that ar_rput does not free the 'mp' */
- return (EINPROGRESS);
-
-done:
- /* caller frees 'mp' */
-
- mp1 = ar_alloc(AR_DLPIOP_DONE, err);
- if (mp1 != NULL) {
- q = WR(q);
- DTRACE_PROBE3(up_send_err, queue_t *, q, mblk_t *, mp1,
- int, err);
- putnext(q, mp1);
- }
- return (err);
-}
-
-/*
- * Given an arie_t `mp', find the arl_t's that it names and return them
- * in `*arlp' and `*ipmp_arlp'. If they cannot be found, return B_FALSE.
- */
-static boolean_t
-ar_ipmp_lookup(arp_stack_t *as, mblk_t *mp, arl_t **arlp, arl_t **ipmp_arlp)
-{
- arie_t *arie = (arie_t *)mp->b_rptr;
-
- *arlp = ar_ll_lookup_from_mp(as, mp);
- if (*arlp == NULL) {
- DTRACE_PROBE1(ipmp_lookup_no_arl, mblk_t *, mp);
- return (B_FALSE);
- }
-
- arie->arie_grifname[LIFNAMSIZ - 1] = '\0';
- *ipmp_arlp = ar_ll_lookup_by_name(as, arie->arie_grifname);
- if (*ipmp_arlp == NULL) {
- DTRACE_PROBE1(ipmp_lookup_no_ipmp_arl, mblk_t *, mp);
- return (B_FALSE);
- }
-
- DTRACE_PROBE2(ipmp_lookup, arl_t *, *arlp, arl_t *, *ipmp_arlp);
- return (B_TRUE);
-}
-
-/*
- * Bind an arl_t to an IPMP group arl_t.
- */
-static int
-ar_ipmp_activate(queue_t *q, mblk_t *mp)
-{
- arl_t *arl, *ipmp_arl;
- arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as;
-
- if (!ar_ipmp_lookup(as, mp, &arl, &ipmp_arl))
- return (EINVAL);
-
- if (arl->arl_ipmp_arl != NULL) {
- DTRACE_PROBE1(ipmp_activated_already, arl_t *, arl);
- return (EALREADY);
- }
-
- DTRACE_PROBE2(ipmp_activate, arl_t *, arl, arl_t *, ipmp_arl);
- arl->arl_ipmp_arl = ipmp_arl;
- return (0);
-}
-
-/*
- * Unbind an arl_t from an IPMP group arl_t and update the ace_t's so
- * that it is no longer part of the group.
- */
-static int
-ar_ipmp_deactivate(queue_t *q, mblk_t *mp)
-{
- arl_t *arl, *ipmp_arl;
- arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as;
-
- if (!ar_ipmp_lookup(as, mp, &arl, &ipmp_arl))
- return (EINVAL);
-
- if (ipmp_arl != arl->arl_ipmp_arl) {
- DTRACE_PROBE2(ipmp_deactivate_notactive, arl_t *, arl, arl_t *,
- ipmp_arl);
- return (EINVAL);
- }
-
- DTRACE_PROBE2(ipmp_deactivate, arl_t *, arl, arl_t *,
- arl->arl_ipmp_arl);
- ar_ce_walk(as, ar_ce_ipmp_deactivate, arl);
- arl->arl_ipmp_arl = NULL;
- return (0);
-}
-
-/*
- * Enable an interface to process ARP_REQUEST and ARP_RESPONSE messages.
- */
-/* ARGSUSED */
-static int
-ar_interface_on(queue_t *q, mblk_t *mp)
-{
- arl_t *arl;
- arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as;
-
- arl = ar_ll_lookup_from_mp(as, mp);
- if (arl == NULL) {
- DTRACE_PROBE2(on_no_arl, queue_t *, q, mblk_t *, mp);
- return (EINVAL);
- }
-
- DTRACE_PROBE3(on_intf, queue_t *, q, mblk_t *, mp, arl_t *, arl);
- arl->arl_flags &= ~ARL_F_NOARP;
- return (0);
-}
-
-/*
- * Disable an interface from processing
- * ARP_REQUEST and ARP_RESPONSE messages
- */
-/* ARGSUSED */
-static int
-ar_interface_off(queue_t *q, mblk_t *mp)
-{
- arl_t *arl;
- arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as;
-
- arl = ar_ll_lookup_from_mp(as, mp);
- if (arl == NULL) {
- DTRACE_PROBE2(off_no_arl, queue_t *, q, mblk_t *, mp);
- return (EINVAL);
- }
-
- DTRACE_PROBE3(off_intf, queue_t *, q, mblk_t *, mp, arl_t *, arl);
- arl->arl_flags |= ARL_F_NOARP;
- return (0);
-}
-
-/*
- * The queue 'q' is closing. Walk all the arl's and free any message
- * pending in the arl_queue if it originated from the closing q.
- * Also cleanup the ip_pending_queue, if the arp-IP stream is closing.
- */
-static void
-ar_ll_cleanup_arl_queue(queue_t *q)
-{
- arl_t *arl;
- mblk_t *mp;
- mblk_t *mpnext;
- mblk_t *prev;
- arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as;
- ip_stack_t *ipst = as->as_netstack->netstack_ip;
-
- for (arl = as->as_arl_head; arl != NULL; arl = arl->arl_next) {
- for (prev = NULL, mp = arl->arl_queue; mp != NULL;
- mp = mpnext) {
- mpnext = mp->b_next;
- if ((void *)mp->b_queue == (void *)q ||
- (void *)mp->b_queue == (void *)OTHERQ(q)) {
- if (prev == NULL)
- arl->arl_queue = mp->b_next;
- else
- prev->b_next = mp->b_next;
- if (arl->arl_queue_tail == mp)
- arl->arl_queue_tail = prev;
- if (DB_TYPE(mp) == M_PROTO &&
- *(uint32_t *)mp->b_rptr == AR_ENTRY_QUERY) {
- BUMP_IRE_STATS(ipst->ips_ire_stats_v4,
- ire_stats_freed);
- }
- inet_freemsg(mp);
- } else {
- prev = mp;
- }
- }
- }
-}
-
-/*
- * Look up a lower level tap by name.
- */
-static arl_t *
-ar_ll_lookup_by_name(arp_stack_t *as, const char *name)
-{
- arl_t *arl;
-
- for (arl = as->as_arl_head; arl; arl = arl->arl_next) {
- if (strcmp(arl->arl_name, name) == 0) {
- return (arl);
- }
- }
- return (NULL);
-}
-
-/*
- * Look up a lower level tap using parameters extracted from the common
- * portion of the ARP command.
- */
-static arl_t *
-ar_ll_lookup_from_mp(arp_stack_t *as, mblk_t *mp)
-{
- arc_t *arc = (arc_t *)mp->b_rptr;
- uint8_t *name;
- size_t namelen = arc->arc_name_length;
-
- name = mi_offset_param(mp, arc->arc_name_offset, namelen);
- if (name == NULL || name[namelen - 1] != '\0')
- return (NULL);
- return (ar_ll_lookup_by_name(as, (char *)name));
-}
-
-static void
-ar_ll_init(arp_stack_t *as, ar_t *ar, mblk_t *mp)
-{
- arl_t *arl;
- dl_info_ack_t *dlia = (dl_info_ack_t *)mp->b_rptr;
-
- ASSERT(ar->ar_arl == NULL);
-
- if ((arl = (arl_t *)mi_zalloc(sizeof (arl_t))) == NULL)
- return;
-
- if (dlia->dl_mac_type == SUNW_DL_IPMP) {
- arl->arl_flags |= ARL_F_IPMP;
- arl->arl_ipmp_arl = arl;
- }
-
- arl->arl_provider_style = dlia->dl_provider_style;
- arl->arl_rq = ar->ar_rq;
- arl->arl_wq = ar->ar_wq;
-
- arl->arl_dlpi_pending = DL_PRIM_INVAL;
-
- ar->ar_arl = arl;
-
- /*
- * If/when ARP gets pushed into the IP module then this code to make
- * a number uniquely identify an ARP instance can be removed and the
- * ifindex from IP used. Rather than try and reinvent or copy the
- * code used by IP for the purpose of allocating an index number
- * (and trying to keep the number small), just allocate it in an
- * ever increasing manner. This index number isn't ever exposed to
- * users directly, its only use is for providing the pfhooks interface
- * with a number it can use to uniquely identify an interface in time.
- *
- * Using a 32bit counter, over 136 plumbs would need to be done every
- * second of every day (non-leap year) for it to wrap around and the
- * for() loop below to kick in as a performance concern.
- */
- if (as->as_arp_counter_wrapped) {
- arl_t *arl1;
-
- do {
- for (arl1 = as->as_arl_head; arl1 != NULL;
- arl1 = arl1->arl_next)
- if (arl1->arl_index ==
- as->as_arp_index_counter) {
- as->as_arp_index_counter++;
- if (as->as_arp_index_counter == 0) {
- as->as_arp_counter_wrapped++;
- as->as_arp_index_counter = 1;
- }
- break;
- }
- } while (arl1 != NULL);
- } else {
- arl->arl_index = as->as_arp_index_counter;
- }
- as->as_arp_index_counter++;
- if (as->as_arp_index_counter == 0) {
- as->as_arp_counter_wrapped++;
- as->as_arp_index_counter = 1;
- }
-}
-
-/*
- * This routine is called during module initialization when the DL_INFO_ACK
- * comes back from the device. We set up defaults for all the device dependent
- * doo-dads we are going to need. This will leave us ready to roll if we are
- * attempting auto-configuration. Alternatively, these defaults can be
- * overridden by initialization procedures possessing higher intelligence.
- */
-static void
-ar_ll_set_defaults(arl_t *arl, mblk_t *mp)
-{
- ar_m_t *arm;
- dl_info_ack_t *dlia = (dl_info_ack_t *)mp->b_rptr;
- dl_unitdata_req_t *dlur;
- uchar_t *up;
- arlphy_t *ap;
-
- ASSERT(arl != NULL);
-
- /*
- * Clear any stale defaults that might exist.
- */
- ar_ll_clear_defaults(arl);
-
- if (arl->arl_flags & ARL_F_IPMP) {
- /*
- * If this is an IPMP arl_t, we have nothing to do,
- * since we will never transmit or receive.
- */
- return;
- }
-
- ap = kmem_zalloc(sizeof (arlphy_t), KM_NOSLEEP);
- if (ap == NULL)
- goto bad;
- arl->arl_phy = ap;
-
- if ((arm = ar_m_lookup(dlia->dl_mac_type)) == NULL)
- arm = ar_m_lookup(DL_OTHER);
- ASSERT(arm != NULL);
-
- /*
- * We initialize based on parameters in the (currently) not too
- * exhaustive ar_m_tbl.
- */
- if (dlia->dl_version == DL_VERSION_2) {
- /* XXX DLPI spec allows dl_sap_length of 0 before binding. */
- ap->ap_saplen = dlia->dl_sap_length;
- ap->ap_hw_addrlen = dlia->dl_brdcst_addr_length;
- } else {
- ap->ap_saplen = arm->ar_mac_sap_length;
- ap->ap_hw_addrlen = arm->ar_mac_hw_addr_length;
- }
- ap->ap_arp_hw_type = arm->ar_mac_arp_hw_type;
-
- /*
- * Allocate the hardware and ARP addresses; note that the hardware
- * address cannot be filled in until we see the DL_BIND_ACK.
- */
- ap->ap_hw_addr = kmem_zalloc(ap->ap_hw_addrlen, KM_NOSLEEP);
- ap->ap_arp_addr = kmem_alloc(ap->ap_hw_addrlen, KM_NOSLEEP);
- if (ap->ap_hw_addr == NULL || ap->ap_arp_addr == NULL)
- goto bad;
-
- if (dlia->dl_version == DL_VERSION_2) {
- if ((up = mi_offset_param(mp, dlia->dl_brdcst_addr_offset,
- ap->ap_hw_addrlen)) == NULL)
- goto bad;
- bcopy(up, ap->ap_arp_addr, ap->ap_hw_addrlen);
- } else {
- /*
- * No choice but to assume a broadcast address of all ones,
- * known to work on some popular networks.
- */
- (void) memset(ap->ap_arp_addr, ~0, ap->ap_hw_addrlen);
- }
-
- /*
- * Make us a template DL_UNITDATA_REQ message which we will use for
- * broadcasting resolution requests, and which we will clone to hand
- * back as responses to the protocols.
- */
- ap->ap_xmit_mp = ar_dlpi_comm(DL_UNITDATA_REQ, ap->ap_hw_addrlen +
- ABS(ap->ap_saplen) + sizeof (dl_unitdata_req_t));
- if (ap->ap_xmit_mp == NULL)
- goto bad;
-
- dlur = (dl_unitdata_req_t *)ap->ap_xmit_mp->b_rptr;
- dlur->dl_priority.dl_min = 0;
- dlur->dl_priority.dl_max = 0;
- dlur->dl_dest_addr_length = ap->ap_hw_addrlen + ABS(ap->ap_saplen);
- dlur->dl_dest_addr_offset = sizeof (dl_unitdata_req_t);
-
- /* NOTE: the destination address and sap offsets are permanently set */
- ap->ap_xmit_sapoff = dlur->dl_dest_addr_offset;
- ap->ap_xmit_addroff = dlur->dl_dest_addr_offset;
- if (ap->ap_saplen < 0)
- ap->ap_xmit_sapoff += ap->ap_hw_addrlen; /* sap last */
- else
- ap->ap_xmit_addroff += ap->ap_saplen; /* addr last */
-
- *(uint16_t *)((caddr_t)dlur + ap->ap_xmit_sapoff) = ETHERTYPE_ARP;
- return;
-bad:
- ar_ll_clear_defaults(arl);
-}
-
-static void
-ar_ll_clear_defaults(arl_t *arl)
-{
- arlphy_t *ap = arl->arl_phy;
-
- if (ap != NULL) {
- arl->arl_phy = NULL;
- if (ap->ap_hw_addr != NULL)
- kmem_free(ap->ap_hw_addr, ap->ap_hw_addrlen);
- if (ap->ap_arp_addr != NULL)
- kmem_free(ap->ap_arp_addr, ap->ap_hw_addrlen);
- freemsg(ap->ap_xmit_mp);
- kmem_free(ap, sizeof (arlphy_t));
- }
-}
-
-static void
-ar_ll_down(arl_t *arl)
-{
- mblk_t *mp;
- ar_t *ar;
-
- ASSERT(arl->arl_state == ARL_S_UP);
-
- /* Let's break the association between an ARL and IP instance */
- ar = (ar_t *)arl->arl_rq->q_ptr;
- if (ar->ar_arl_ip_assoc != NULL) {
- ASSERT(ar->ar_arl_ip_assoc->ar_arl_ip_assoc != NULL &&
- ar->ar_arl_ip_assoc->ar_arl_ip_assoc == ar);
- ar->ar_arl_ip_assoc->ar_arl_ip_assoc = NULL;
- ar->ar_arl_ip_assoc = NULL;
- }
-
- arl->arl_state = ARL_S_PENDING;
-
- mp = arl->arl_unbind_mp;
- ASSERT(mp != NULL);
- ar_dlpi_send(arl, mp);
- arl->arl_unbind_mp = NULL;
-
- if (arl->arl_provider_style == DL_STYLE2) {
- mp = arl->arl_detach_mp;
- ASSERT(mp != NULL);
- ar_dlpi_send(arl, mp);
- arl->arl_detach_mp = NULL;
- }
-}
-
-static int
-ar_ll_up(arl_t *arl)
-{
- mblk_t *attach_mp = NULL;
- mblk_t *bind_mp = NULL;
- mblk_t *detach_mp = NULL;
- mblk_t *unbind_mp = NULL;
- mblk_t *info_mp = NULL;
- mblk_t *notify_mp = NULL;
-
- ASSERT(arl->arl_state == ARL_S_DOWN);
-
- if (arl->arl_provider_style == DL_STYLE2) {
- attach_mp =
- ar_dlpi_comm(DL_ATTACH_REQ, sizeof (dl_attach_req_t));
- if (attach_mp == NULL)
- goto bad;
- ((dl_attach_req_t *)attach_mp->b_rptr)->dl_ppa =
- arl->arl_ppa;
-
- detach_mp =
- ar_dlpi_comm(DL_DETACH_REQ, sizeof (dl_detach_req_t));
- if (detach_mp == NULL)
- goto bad;
- }
-
- info_mp = ar_dlpi_comm(DL_INFO_REQ, sizeof (dl_info_req_t));
- if (info_mp == NULL)
- goto bad;
-
- /* Allocate and initialize a bind message. */
- bind_mp = ar_dlpi_comm(DL_BIND_REQ, sizeof (dl_bind_req_t));
- if (bind_mp == NULL)
- goto bad;
- ((dl_bind_req_t *)bind_mp->b_rptr)->dl_sap = ETHERTYPE_ARP;
- ((dl_bind_req_t *)bind_mp->b_rptr)->dl_service_mode = DL_CLDLS;
-
- unbind_mp = ar_dlpi_comm(DL_UNBIND_REQ, sizeof (dl_unbind_req_t));
- if (unbind_mp == NULL)
- goto bad;
-
- notify_mp = ar_dlpi_comm(DL_NOTIFY_REQ, sizeof (dl_notify_req_t));
- if (notify_mp == NULL)
- goto bad;
- ((dl_notify_req_t *)notify_mp->b_rptr)->dl_notifications =
- DL_NOTE_LINK_UP | DL_NOTE_LINK_DOWN | DL_NOTE_REPLUMB;
-
- arl->arl_state = ARL_S_PENDING;
- if (arl->arl_provider_style == DL_STYLE2) {
- ar_dlpi_send(arl, attach_mp);
- ASSERT(detach_mp != NULL);
- arl->arl_detach_mp = detach_mp;
- }
- ar_dlpi_send(arl, info_mp);
- ar_dlpi_send(arl, bind_mp);
- arl->arl_unbind_mp = unbind_mp;
- ar_dlpi_send(arl, notify_mp);
- return (0);
-
-bad:
- freemsg(attach_mp);
- freemsg(bind_mp);
- freemsg(detach_mp);
- freemsg(unbind_mp);
- freemsg(info_mp);
- freemsg(notify_mp);
- return (ENOMEM);
-}
-
-/* Process mapping add requests from external messages. */
-static int
-ar_mapping_add(queue_t *q, mblk_t *mp_orig)
-{
- arma_t *arma;
- mblk_t *mp = mp_orig;
- ace_t *ace;
- uchar_t *hw_addr;
- uint32_t hw_addr_len;
- uchar_t *proto_addr;
- uint32_t proto_addr_len;
- uchar_t *proto_mask;
- uchar_t *proto_extract_mask;
- uint32_t hw_extract_start;
- arl_t *arl;
- arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as;
-
- /* We handle both M_IOCTL and M_PROTO messages. */
- if (DB_TYPE(mp) == M_IOCTL)
- mp = mp->b_cont;
- arl = ar_ll_lookup_from_mp(as, mp);
- if (arl == NULL)
- return (EINVAL);
- /*
- * Newly received commands from clients go to the tail of the queue.
- */
- if (CMD_NEEDS_QUEUEING(mp_orig, arl)) {
- DTRACE_PROBE3(madd_enqueued, queue_t *, q, mblk_t *, mp_orig,
- arl_t *, arl);
- ar_cmd_enqueue(arl, mp_orig, q, AR_MAPPING_ADD, B_TRUE);
- return (EINPROGRESS);
- }
- mp_orig->b_prev = NULL;
-
- arma = (arma_t *)mp->b_rptr;
- ace = ar_ce_lookup_from_area(as, mp, ar_ce_lookup_mapping);
- if (ace != NULL)
- ar_ce_delete(ace);
- hw_addr_len = arma->arma_hw_addr_length;
- hw_addr = mi_offset_paramc(mp, arma->arma_hw_addr_offset, hw_addr_len);
- proto_addr_len = arma->arma_proto_addr_length;
- proto_addr = mi_offset_paramc(mp, arma->arma_proto_addr_offset,
- proto_addr_len);
- proto_mask = mi_offset_paramc(mp, arma->arma_proto_mask_offset,
- proto_addr_len);
- proto_extract_mask = mi_offset_paramc(mp,
- arma->arma_proto_extract_mask_offset, proto_addr_len);
- hw_extract_start = arma->arma_hw_mapping_start;
- if (proto_mask == NULL || proto_extract_mask == NULL) {
- DTRACE_PROBE2(madd_illegal_mask, arl_t *, arl, arpa_t *, arma);
- return (EINVAL);
- }
- return (ar_ce_create(
- arl,
- arma->arma_proto,
- hw_addr,
- hw_addr_len,
- proto_addr,
- proto_addr_len,
- proto_mask,
- proto_extract_mask,
- hw_extract_start,
- NULL,
- arma->arma_flags | ACE_F_MAPPING));
-}
-
-static boolean_t
-ar_mask_all_ones(uchar_t *mask, uint32_t mask_len)
-{
- if (mask == NULL)
- return (B_TRUE);
-
- while (mask_len-- > 0) {
- if (*mask++ != 0xFF) {
- return (B_FALSE);
- }
- }
- return (B_TRUE);
-}
-
-/* Find an entry for a particular MAC type in the ar_m_tbl. */
-static ar_m_t *
-ar_m_lookup(t_uscalar_t mac_type)
-{
- ar_m_t *arm;
-
- for (arm = ar_m_tbl; arm < A_END(ar_m_tbl); arm++) {
- if (arm->ar_mac_type == mac_type)
- return (arm);
- }
- return (NULL);
-}
-
-/* Respond to Named Dispatch requests. */
-static int
-ar_nd_ioctl(queue_t *q, mblk_t *mp)
-{
- ar_t *ar = (ar_t *)q->q_ptr;
- arp_stack_t *as = ar->ar_as;
-
- if (DB_TYPE(mp) == M_IOCTL && nd_getset(q, as->as_nd, mp))
- return (0);
- return (ENOENT);
-}
-
-/* ARP module open routine. */
-static int
-ar_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
-{
- ar_t *ar;
- int err;
- queue_t *tmp_q;
- mblk_t *mp;
- netstack_t *ns;
- arp_stack_t *as;
-
- TRACE_1(TR_FAC_ARP, TR_ARP_OPEN,
- "arp_open: q %p", q);
- /* Allow a reopen. */
- if (q->q_ptr != NULL) {
- return (0);
- }
-
- ns = netstack_find_by_cred(credp);
- ASSERT(ns != NULL);
- as = ns->netstack_arp;
- ASSERT(as != NULL);
-
- /* mi_open_comm allocates the instance data structure, etc. */
- err = mi_open_comm(&as->as_head, sizeof (ar_t), q, devp, flag, sflag,
- credp);
- if (err) {
- netstack_rele(as->as_netstack);
- return (err);
- }
-
- /*
- * We are D_MTPERMOD so it is safe to do qprocson before
- * the instance data has been initialized.
- */
- qprocson(q);
-
- ar = (ar_t *)q->q_ptr;
- ar->ar_rq = q;
- q = WR(q);
- ar->ar_wq = q;
- crhold(credp);
- ar->ar_credp = credp;
- ar->ar_as = as;
-
- /*
- * Probe for the DLPI info if we are not pushed on IP or UDP. Wait for
- * the reply. In case of error call ar_close() which will take
- * care of doing everything required to close this instance, such
- * as freeing the arl, restarting the timer on a different queue etc.
- */
- if (strcmp(q->q_next->q_qinfo->qi_minfo->mi_idname, "ip") == 0 ||
- strcmp(q->q_next->q_qinfo->qi_minfo->mi_idname, "udp") == 0) {
- arc_t *arc;
-
- /*
- * We are pushed directly on top of IP or UDP. There is no need
- * to send down a DL_INFO_REQ. Return success. This could
- * either be an ill stream (i.e. <arp-IP-Driver> stream)
- * or a stream corresponding to an open of /dev/arp
- * (i.e. <arp-IP> stream). Note that we don't support
- * pushing some module in between arp and IP.
- *
- * Tell IP, though, that we're an extended implementation, so
- * it knows to expect a DAD response after bringing an
- * interface up. Old ATM drivers won't do this, and IP will
- * just bring the interface up immediately.
- */
- ar->ar_on_ill_stream = (q->q_next->q_next != NULL);
- if (!ar->ar_on_ill_stream || arp_no_defense)
- return (0);
- mp = allocb(sizeof (arc_t), BPRI_MED);
- if (mp == NULL) {
- (void) ar_close(RD(q));
- return (ENOMEM);
- }
- DB_TYPE(mp) = M_CTL;
- arc = (arc_t *)mp->b_rptr;
- mp->b_wptr = mp->b_rptr + sizeof (arc_t);
- arc->arc_cmd = AR_ARP_EXTEND;
- putnext(q, mp);
- return (0);
- }
- tmp_q = q;
- /* Get the driver's queue */
- while (tmp_q->q_next != NULL)
- tmp_q = tmp_q->q_next;
-
- ASSERT(tmp_q->q_qinfo->qi_minfo != NULL);
-
- if (strcmp(tmp_q->q_qinfo->qi_minfo->mi_idname, "ip") == 0 ||
- strcmp(tmp_q->q_qinfo->qi_minfo->mi_idname, "udp") == 0) {
- /*
- * We don't support pushing ARP arbitrarily on an IP or UDP
- * driver stream. ARP has to be pushed directly above IP or
- * UDP.
- */
- (void) ar_close(RD(q));
- return (ENOTSUP);
- } else {
- /*
- * Send down a DL_INFO_REQ so we can find out what we are
- * talking to.
- */
- mp = ar_dlpi_comm(DL_INFO_REQ, sizeof (dl_info_req_t));
- if (mp == NULL) {
- (void) ar_close(RD(q));
- return (ENOMEM);
- }
- putnext(ar->ar_wq, mp);
- while (ar->ar_arl == NULL) {
- if (!qwait_sig(ar->ar_rq)) {
- (void) ar_close(RD(q));
- return (EINTR);
- }
- }
- }
- return (0);
-}
-
-/* Get current value of Named Dispatch item. */
-/* ARGSUSED */
-static int
-ar_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
-{
- arpparam_t *arppa = (arpparam_t *)cp;
-
- (void) mi_mpprintf(mp, "%d", arppa->arp_param_value);
- return (0);
-}
-
-/*
- * Walk through the param array specified registering each element with the
- * named dispatch handler.
- */
-static boolean_t
-ar_param_register(IDP *ndp, arpparam_t *arppa, int cnt)
-{
- for (; cnt-- > 0; arppa++) {
- if (arppa->arp_param_name && arppa->arp_param_name[0]) {
- if (!nd_load(ndp, arppa->arp_param_name,
- ar_param_get, ar_param_set,
- (caddr_t)arppa)) {
- nd_free(ndp);
- return (B_FALSE);
- }
- }
- }
- return (B_TRUE);
-}
-
-/* Set new value of Named Dispatch item. */
-/* ARGSUSED */
-static int
-ar_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr)
-{
- long new_value;
- arpparam_t *arppa = (arpparam_t *)cp;
-
- if (ddi_strtol(value, NULL, 10, &new_value) != 0 ||
- new_value < arppa->arp_param_min ||
- new_value > arppa->arp_param_max) {
- return (EINVAL);
- }
- arppa->arp_param_value = new_value;
- return (0);
-}
-
-/*
- * Process an I_PLINK ioctl. If the lower stream is an arp device stream,
- * append another mblk to the chain, that will carry the device name,
- * and the muxid. IP uses this info to lookup the corresponding ill, and
- * set the ill_arp_muxid atomically, as part of the I_PLINK, instead of
- * waiting for the SIOCSLIFMUXID. (which may never happen if ifconfig is
- * killed, and this has the bad effect of not being able to unplumb
- * subsequently)
- */
-static int
-ar_plink_send(queue_t *q, mblk_t *mp)
-{
- char *name;
- mblk_t *muxmp;
- mblk_t *mp1;
- ar_t *ar = (ar_t *)q->q_ptr;
- arp_stack_t *as = ar->ar_as;
- struct linkblk *li;
- struct ipmx_s *ipmxp;
- queue_t *arpwq;
-
- mp1 = mp->b_cont;
- ASSERT((mp1 != NULL) && (mp1->b_cont == NULL));
- li = (struct linkblk *)mp1->b_rptr;
- arpwq = li->l_qbot;
-
- /*
- * Allocate a new mblk which will hold an ipmx_s and chain it to
- * the M_IOCTL chain. The final chain will consist of 3 mblks,
- * namely the M_IOCTL, followed by the linkblk, followed by the ipmx_s
- */
- muxmp = allocb(sizeof (struct ipmx_s), BPRI_MED);
- if (muxmp == NULL)
- return (ENOMEM);
- ipmxp = (struct ipmx_s *)muxmp->b_wptr;
- ipmxp->ipmx_arpdev_stream = 0;
- muxmp->b_wptr += sizeof (struct ipmx_s);
- mp1->b_cont = muxmp;
-
- /*
- * The l_qbot represents the uppermost write queue of the
- * lower stream. Walk down this stream till we hit ARP.
- * We can safely walk, since STREAMS has made sure the stream
- * cannot close till the IOCACK goes up, and is not interruptible.
- */
- while (arpwq != NULL) {
- /*
- * Beware of broken modules like logsubr.c that
- * may not have a q_qinfo or qi_minfo.
- */
- if ((q->q_qinfo != NULL) && (q->q_qinfo->qi_minfo != NULL)) {
- name = arpwq->q_qinfo->qi_minfo->mi_idname;
- if (name != NULL && name[0] != NULL &&
- (strcmp(name, arp_mod_info.mi_idname) == 0))
- break;
- }
- arpwq = arpwq->q_next;
- }
-
- /*
- * Check if arpwq corresponds to an arp device stream, by walking
- * the mi list. If it does, then add the muxid and device name info
- * for use by IP. IP will send the M_IOCACK.
- */
- if (arpwq != NULL) {
- for (ar = (ar_t *)mi_first_ptr(&as->as_head); ar != NULL;
- ar = (ar_t *)mi_next_ptr(&as->as_head, (void *)ar)) {
- if ((ar->ar_wq == arpwq) && (ar->ar_arl != NULL)) {
- ipmxp->ipmx_arpdev_stream = 1;
- (void) strcpy((char *)ipmxp->ipmx_name,
- ar->ar_arl->arl_name);
- break;
- }
- }
- }
-
- putnext(q, mp);
- return (0);
-}
-
-/*
- * ar_ce_walk routine to delete any outstanding queries for an ar that is
- * going away.
- */
-static void
-ar_query_delete(ace_t *ace, void *arg)
-{
- ar_t *ar = arg;
- mblk_t **mpp = &ace->ace_query_mp;
- mblk_t *mp;
- arp_stack_t *as = ar->ar_as;
- ip_stack_t *ipst = as->as_netstack->netstack_ip;
-
- while ((mp = *mpp) != NULL) {
- /* The response queue was stored in the query b_prev. */
- if ((queue_t *)mp->b_prev == ar->ar_wq ||
- (queue_t *)mp->b_prev == ar->ar_rq) {
- *mpp = mp->b_next;
- if (DB_TYPE(mp) == M_PROTO &&
- *(uint32_t *)mp->b_rptr == AR_ENTRY_QUERY) {
- BUMP_IRE_STATS(ipst->ips_ire_stats_v4,
- ire_stats_freed);
- }
- inet_freemsg(mp);
- } else {
- mpp = &mp->b_next;
- }
- }
-}
-
-/*
- * This routine is called either when an address resolution has just been
- * found, or when it is time to give, or in some other error situation.
- * If a non-zero ret_val is provided, any outstanding queries for the
- * specified ace will be completed using that error value. Otherwise,
- * the completion status will depend on whether the address has been
- * resolved.
- */
-static void
-ar_query_reply(ace_t *ace, int ret_val, uchar_t *proto_addr,
- uint32_t proto_addr_len)
-{
- mblk_t *areq_mp;
- mblk_t *mp;
- mblk_t *xmit_mp;
- queue_t *arl_wq = ace->ace_arl->arl_wq;
- arp_stack_t *as = ARL_TO_ARPSTACK(ace->ace_arl);
- ip_stack_t *ipst = as->as_netstack->netstack_ip;
- arlphy_t *ap = ace->ace_xmit_arl->arl_phy;
-
- /*
- * On error or completion for a query, we need to shut down the timer.
- * However, the timer must not be stopped for an interface doing
- * Duplicate Address Detection, or it will never finish that phase.
- */
- if (!(ace->ace_flags & (ACE_F_UNVERIFIED | ACE_F_AUTHORITY)))
- mi_timer(arl_wq, ace->ace_mp, -1L);
-
- /* Establish the return value appropriate. */
- if (ret_val == 0) {
- if (!ACE_RESOLVED(ace) || ap == NULL)
- ret_val = ENXIO;
- }
- /* Terminate all outstanding queries. */
- while ((mp = ace->ace_query_mp) != 0) {
- /* The response queue was saved in b_prev. */
- queue_t *q = (queue_t *)mp->b_prev;
- mp->b_prev = NULL;
- ace->ace_query_mp = mp->b_next;
- mp->b_next = NULL;
- /*
- * If we have the answer, attempt to get a copy of the xmit
- * template to prepare for the client.
- */
- if (ret_val == 0 &&
- (xmit_mp = copyb(ap->ap_xmit_mp)) == NULL) {
- /* Too bad, buy more memory. */
- ret_val = ENOMEM;
- }
- /* Complete the response based on how the request arrived. */
- if (DB_TYPE(mp) == M_IOCTL) {
- struct iocblk *ioc = (struct iocblk *)mp->b_rptr;
-
- ioc->ioc_error = ret_val;
- if (ret_val != 0) {
- DB_TYPE(mp) = M_IOCNAK;
- ioc->ioc_count = 0;
- putnext(q, mp);
- continue;
- }
- /*
- * Return the xmit mp out with the successful IOCTL.
- */
- DB_TYPE(mp) = M_IOCACK;
- ioc->ioc_count = MBLKL(xmit_mp);
- /* Remove the areq mblk from the IOCTL. */
- areq_mp = mp->b_cont;
- mp->b_cont = areq_mp->b_cont;
- } else {
- if (ret_val != 0) {
- /* TODO: find some way to let the guy know? */
- inet_freemsg(mp);
- BUMP_IRE_STATS(ipst->ips_ire_stats_v4,
- ire_stats_freed);
- continue;
- }
- /*
- * In the M_PROTO case, the areq message is followed by
- * a message chain to be returned to the protocol. ARP
- * doesn't know (or care) what is in this chain, but in
- * the event that the reader is pondering the
- * relationship between ARP and IP (for example), the
- * areq is followed by an incipient IRE, and then the
- * original outbound packet. Here we detach the areq.
- */
- areq_mp = mp;
- mp = mp->b_cont;
- }
- ASSERT(ret_val == 0 && ap != NULL);
- if (ap->ap_saplen != 0) {
- /*
- * Copy the SAP type specified in the request into
- * the xmit mp.
- */
- areq_t *areq = (areq_t *)areq_mp->b_rptr;
- bcopy(areq->areq_sap, xmit_mp->b_rptr +
- ap->ap_xmit_sapoff, ABS(ap->ap_saplen));
- }
- /* Done with the areq message. */
- freeb(areq_mp);
- /*
- * Copy the resolved hardware address into the xmit mp
- * or perform the mapping operation.
- */
- ar_set_address(ace, xmit_mp->b_rptr + ap->ap_xmit_addroff,
- proto_addr, proto_addr_len);
- /*
- * Now insert the xmit mp after the response message. In
- * the M_IOCTL case, it will be the returned data block. In
- * the M_PROTO case, (again using IP as an example) it will
- * appear after the IRE and before the outbound packet.
- */
- xmit_mp->b_cont = mp->b_cont;
- mp->b_cont = xmit_mp;
- putnext(q, mp);
- }
-
- /*
- * Unless we are responding from a permanent cache entry, start the
- * cleanup timer or (on error) delete the entry.
- */
- if (!(ace->ace_flags & (ACE_F_PERMANENT | ACE_F_DYING))) {
- if (!ACE_RESOLVED(ace) || ap == NULL) {
- /*
- * No need to notify IP here, because the entry was
- * never resolved, so IP can't have any cached copies
- * of the address.
- */
- ar_ce_delete(ace);
- } else {
- mi_timer(arl_wq, ace->ace_mp, as->as_cleanup_interval);
- }
- }
-}
-
-/*
- * Returns number of milliseconds after which we should either rexmit or abort.
- * Return of zero means we should abort.
- */
-static clock_t
-ar_query_xmit(arp_stack_t *as, ace_t *ace)
-{
- areq_t *areq;
- mblk_t *mp;
- uchar_t *proto_addr;
- uchar_t *sender_addr;
- ace_t *src_ace;
- arl_t *xmit_arl = ace->ace_xmit_arl;
-
- mp = ace->ace_query_mp;
- /*
- * ar_query_delete may have just blown off the outstanding
- * ace_query_mp entries because the client who sent the query
- * went away. If this happens just before the ace_mp timer
- * goes off, we'd find a null ace_query_mp which is not an error.
- * The unresolved ace itself, and the timer, will be removed
- * when the arl stream goes away.
- */
- if (!mp)
- return (0);
- if (DB_TYPE(mp) == M_IOCTL)
- mp = mp->b_cont;
- areq = (areq_t *)mp->b_rptr;
- if (areq->areq_xmit_count == 0)
- return (0);
- areq->areq_xmit_count--;
- proto_addr = mi_offset_paramc(mp, areq->areq_target_addr_offset,
- areq->areq_target_addr_length);
- sender_addr = mi_offset_paramc(mp, areq->areq_sender_addr_offset,
- areq->areq_sender_addr_length);
-
- /*
- * Get the ace for the sender address, so that we can verify that
- * we have one and that DAD has completed.
- */
- src_ace = ar_ce_lookup(xmit_arl, areq->areq_proto, sender_addr,
- areq->areq_sender_addr_length);
- if (src_ace == NULL) {
- DTRACE_PROBE3(xmit_no_source, ace_t *, ace, areq_t *, areq,
- uchar_t *, sender_addr);
- return (0);
- }
-
- /*
- * If we haven't yet finished duplicate address checking on this source
- * address, then do *not* use it on the wire. Doing so will corrupt
- * the world's caches. Just allow the timer to restart. Note that
- * duplicate address checking will eventually complete one way or the
- * other, so this cannot go on "forever."
- */
- if (src_ace->ace_flags & ACE_F_UNVERIFIED) {
- DTRACE_PROBE2(xmit_source_unverified, ace_t *, ace,
- ace_t *, src_ace);
- areq->areq_xmit_count++;
- return (areq->areq_xmit_interval);
- }
-
- DTRACE_PROBE3(xmit_send, ace_t *, ace, ace_t *, src_ace,
- areq_t *, areq);
-
- ar_xmit(xmit_arl, ARP_REQUEST, areq->areq_proto,
- areq->areq_sender_addr_length, xmit_arl->arl_phy->ap_hw_addr,
- sender_addr, xmit_arl->arl_phy->ap_arp_addr, proto_addr, NULL, as);
- src_ace->ace_last_bcast = ddi_get_lbolt();
- return (areq->areq_xmit_interval);
-}
-
-/* Our read side put procedure. */
-static void
-ar_rput(queue_t *q, mblk_t *mp)
-{
- arh_t *arh;
- arl_t *arl;
- arl_t *client_arl;
- ace_t *dst_ace;
- uchar_t *dst_paddr;
- int err;
- uint32_t hlen;
- struct iocblk *ioc;
- mblk_t *mp1;
- int op;
- uint32_t plen;
- uint32_t proto;
- uchar_t *src_haddr;
- uchar_t *src_paddr;
- uchar_t *dst_haddr;
- boolean_t is_probe;
- boolean_t is_unicast = B_FALSE;
- dl_unitdata_ind_t *dlindp;
- int i;
- arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as;
-
- TRACE_1(TR_FAC_ARP, TR_ARP_RPUT_START,
- "arp_rput_start: q %p", q);
-
- /*
- * We handle ARP commands from below both in M_IOCTL and M_PROTO
- * messages. Actual ARP requests and responses will show up as
- * M_PROTO messages containing DL_UNITDATA_IND blocks.
- */
- switch (DB_TYPE(mp)) {
- case M_IOCTL:
- err = ar_cmd_dispatch(q, mp, B_FALSE);
- switch (err) {
- case ENOENT:
- DB_TYPE(mp) = M_IOCNAK;
- if ((mp1 = mp->b_cont) != 0) {
- /*
- * Collapse the data as a note to the
- * originator.
- */
- mp1->b_wptr = mp1->b_rptr;
- }
- break;
- case EINPROGRESS:
- TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END,
- "arp_rput_end: q %p (%S)", q, "ioctl/inprogress");
- return;
- default:
- DB_TYPE(mp) = M_IOCACK;
- break;
- }
- ioc = (struct iocblk *)mp->b_rptr;
- ioc->ioc_error = err;
- if ((mp1 = mp->b_cont) != 0)
- ioc->ioc_count = MBLKL(mp1);
- else
- ioc->ioc_count = 0;
- qreply(q, mp);
- TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END,
- "arp_rput_end: q %p (%S)", q, "ioctl");
- return;
- case M_CTL:
- /*
- * IP is acking the AR_ARP_CLOSING message that we sent
- * in ar_close.
- */
- if (MBLKL(mp) == sizeof (arc_t)) {
- if (((arc_t *)mp->b_rptr)->arc_cmd == AR_ARP_CLOSING)
- ((ar_t *)q->q_ptr)->ar_ip_acked_close = 1;
- }
- freemsg(mp);
- return;
- case M_PCPROTO:
- case M_PROTO:
- dlindp = (dl_unitdata_ind_t *)mp->b_rptr;
- if (MBLKL(mp) >= sizeof (dl_unitdata_ind_t) &&
- dlindp->dl_primitive == DL_UNITDATA_IND) {
- is_unicast = (dlindp->dl_group_address == 0);
- arl = ((ar_t *)q->q_ptr)->ar_arl;
- if (arl != NULL && arl->arl_phy != NULL) {
- /* Real messages from the wire! */
- break;
- }
- putnext(q, mp);
- TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END,
- "arp_rput_end: q %p (%S)", q, "default");
- return;
- }
- err = ar_cmd_dispatch(q, mp, B_FALSE);
- switch (err) {
- case ENOENT:
- /* Miscellaneous DLPI messages get shuffled off. */
- ar_rput_dlpi(q, mp);
- TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END,
- "arp_rput_end: q %p (%S)", q, "proto/dlpi");
- break;
- case EINPROGRESS:
- TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END,
- "arp_rput_end: q %p (%S)", q, "proto");
- break;
- default:
- inet_freemsg(mp);
- break;
- }
- return;
- default:
- putnext(q, mp);
- TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END,
- "arp_rput_end: q %p (%S)", q, "default");
- return;
- }
- /*
- * If the IFF_NOARP flag is on, then do not process any
- * incoming ARP_REQUEST or incoming ARP_RESPONSE.
- */
- if (arl->arl_flags & ARL_F_NOARP) {
- freemsg(mp);
- TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END,
- "arp_rput_end: q %p (%S)", q, "interface has IFF_NOARP set");
- return;
- }
-
- /*
- * What we should have at this point is a DL_UNITDATA_IND message
- * followed by an ARP packet. We do some initial checks and then
- * get to work.
- */
- mp1 = mp->b_cont;
- if (mp1 == NULL) {
- freemsg(mp);
- TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END,
- "arp_rput_end: q %p (%S)", q, "baddlpi");
- return;
- }
- if (mp1->b_cont != NULL) {
- /* No fooling around with funny messages. */
- if (!pullupmsg(mp1, -1)) {
- freemsg(mp);
- TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END,
- "arp_rput_end: q %p (%S)", q, "pullupmsgfail");
- return;
- }
- }
- arh = (arh_t *)mp1->b_rptr;
- hlen = arh->arh_hlen;
- plen = arh->arh_plen;
- if (MBLKL(mp1) < ARH_FIXED_LEN + 2 * hlen + 2 * plen) {
- freemsg(mp);
- TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END,
- "arp_rput_end: q %p (%S)", q, "short");
- return;
- }
- /*
- * hlen 0 is used for RFC 1868 UnARP.
- *
- * Note that the rest of the code checks that hlen is what we expect
- * for this hardware address type, so might as well discard packets
- * here that don't match.
- */
- if ((hlen > 0 && hlen != arl->arl_phy->ap_hw_addrlen) || plen == 0) {
- DTRACE_PROBE2(rput_bogus, arl_t *, arl, mblk_t *, mp1);
- freemsg(mp);
- TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END,
- "arp_rput_end: q %p (%S)", q, "hlenzero/plenzero");
- return;
- }
- /*
- * Historically, Solaris has been lenient about hardware type numbers.
- * We should check here, but don't.
- */
- DTRACE_PROBE2(rput_normal, arl_t *, arl, arh_t *, arh);
-
- DTRACE_PROBE3(arp__physical__in__start,
- arl_t *, arl, arh_t *, arh, mblk_t *, mp);
-
- ARP_HOOK_IN(as->as_arp_physical_in_event, as->as_arp_physical_in,
- arl->arl_index, arh, mp, mp1, as);
-
- DTRACE_PROBE1(arp__physical__in__end, mblk_t *, mp);
-
- if (mp == NULL)
- return;
-
- proto = (uint32_t)BE16_TO_U16(arh->arh_proto);
- src_haddr = (uchar_t *)arh;
- src_haddr = &src_haddr[ARH_FIXED_LEN];
- src_paddr = &src_haddr[hlen];
- dst_haddr = &src_haddr[hlen + plen];
- dst_paddr = &src_haddr[hlen + plen + hlen];
- op = BE16_TO_U16(arh->arh_operation);
-
- /* Determine if this is just a probe */
- for (i = 0; i < plen; i++)
- if (src_paddr[i] != 0)
- break;
- is_probe = i >= plen;
-
- /*
- * RFC 826: first check if the <protocol, sender protocol address> is
- * in the cache, if there is a sender protocol address. Note that this
- * step also handles resolutions based on source.
- *
- * Note that IP expects that each notification it receives will be
- * tied to the ill it received it on. Thus, we must talk to it over
- * the arl tied to the resolved IP address (if any), hence client_arl.
- */
- if (is_probe)
- err = AR_NOTFOUND;
- else
- err = ar_ce_resolve_all(arl, proto, src_haddr, hlen, src_paddr,
- plen, &client_arl);
-
- switch (err) {
- case AR_BOGON:
- ar_client_notify(client_arl, mp1, AR_CN_BOGON);
- mp1 = NULL;
- break;
- case AR_FAILED:
- ar_client_notify(client_arl, mp1, AR_CN_FAILED);
- mp1 = NULL;
- break;
- case AR_LOOPBACK:
- DTRACE_PROBE2(rput_loopback, arl_t *, arl, arh_t *, arh);
- freemsg(mp1);
- mp1 = NULL;
- break;
- }
- if (mp1 == NULL) {
- freeb(mp);
- TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END,
- "arp_rput_end: q %p (%S)", q, "unneeded");
- return;
- }
-
- /*
- * Now look up the destination address. By RFC 826, we ignore the
- * packet at this step if the target isn't one of our addresses. This
- * is true even if the target is something we're trying to resolve and
- * the packet is a response. To avoid duplicate responses, we also
- * ignore the packet if it was multicast/broadcast to an arl that's in
- * an IPMP group but was not the designated xmit_arl for the ACE.
- *
- * Note that in order to do this correctly, we need to know when to
- * notify IP of a change implied by the source address of the ARP
- * message. That implies that the local ARP table has entries for all
- * of the resolved entries cached in the client. This is why we must
- * notify IP when we delete a resolved entry and we know that IP may
- * have cached answers.
- */
- dst_ace = ar_ce_lookup_entry(arl, proto, dst_paddr, plen);
- if (dst_ace == NULL || !ACE_RESOLVED(dst_ace) ||
- (dst_ace->ace_xmit_arl != arl && !is_unicast) ||
- !(dst_ace->ace_flags & ACE_F_PUBLISH)) {
- /*
- * Let the client know if the source mapping has changed, even
- * if the destination provides no useful information for the
- * client.
- */
- if (err == AR_CHANGED)
- ar_client_notify(client_arl, mp1, AR_CN_ANNOUNCE);
- else
- freemsg(mp1);
- freeb(mp);
- TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END,
- "arp_rput_end: q %p (%S)", q, "nottarget");
- return;
- }
-
- /*
- * If the target is unverified by DAD, then one of two things is true:
- * either it's someone else claiming this address (on a probe or an
- * announcement) or it's just a regular request. The former is
- * failure, but a regular request is not.
- */
- if (dst_ace->ace_flags & ACE_F_UNVERIFIED) {
- /*
- * Check for a reflection. Some misbehaving bridges will
- * reflect our own transmitted packets back to us.
- */
- if (hlen == dst_ace->ace_hw_addr_length &&
- bcmp(src_haddr, dst_ace->ace_hw_addr, hlen) == 0) {
- DTRACE_PROBE3(rput_probe_reflected, arl_t *, arl,
- arh_t *, arh, ace_t *, dst_ace);
- freeb(mp);
- freemsg(mp1);
- TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END,
- "arp_rput_end: q %p (%S)", q, "reflection");
- return;
- }
-
- /*
- * Conflicts seen via the wrong interface may be bogus.
- * Multiple interfaces on the same segment imply any conflict
- * will also be seen via the correct interface, so we can ignore
- * anything not matching the arl from the ace.
- */
- if (arl != dst_ace->ace_arl) {
- DTRACE_PROBE3(rput_probe_misdirect, arl_t *, arl,
- arh_t *, arh, ace_t *, dst_ace);
- freeb(mp);
- freemsg(mp1);
- return;
- }
- /*
- * Responses targeting our HW address that are not responses to
- * our DAD probe must be ignored as they are related to requests
- * sent before DAD was restarted. Note: response to our DAD
- * probe will have been handled by ar_ce_resolve_all() above.
- */
- if (op == ARP_RESPONSE &&
- (bcmp(dst_haddr, dst_ace->ace_hw_addr, hlen) == 0)) {
- DTRACE_PROBE3(rput_probe_stale, arl_t *, arl,
- arh_t *, arh, ace_t *, dst_ace);
- freeb(mp);
- freemsg(mp1);
- return;
- }
- /*
- * Responses targeted to HW addresses which are not ours but
- * sent to our unverified proto address are also conflicts.
- * These may be reported by a proxy rather than the interface
- * with the conflicting address, dst_paddr is in conflict
- * rather than src_paddr. To ensure IP can locate the correct
- * ipif to take down, it is necessary to copy dst_paddr to
- * the src_paddr field before sending it to IP. The same is
- * required for probes, where src_paddr will be INADDR_ANY.
- */
- if (is_probe) {
- /*
- * In this case, client_arl will be invalid (e.g.,
- * since probes don't have a valid sender address).
- * But dst_ace has the appropriate arl.
- */
- bcopy(dst_paddr, src_paddr, plen);
- ar_client_notify(dst_ace->ace_arl, mp1, AR_CN_FAILED);
- ar_ce_delete(dst_ace);
- } else if (op == ARP_RESPONSE) {
- bcopy(dst_paddr, src_paddr, plen);
- ar_client_notify(client_arl, mp1, AR_CN_FAILED);
- ar_ce_delete(dst_ace);
- } else if (err == AR_CHANGED) {
- ar_client_notify(client_arl, mp1, AR_CN_ANNOUNCE);
- } else {
- DTRACE_PROBE3(rput_request_unverified, arl_t *, arl,
- arh_t *, arh, ace_t *, dst_ace);
- freemsg(mp1);
- }
- freeb(mp);
- TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END,
- "arp_rput_end: q %p (%S)", q, "unverified");
- return;
- }
-
- /*
- * If it's a request, then we reply to this, and if we think the
- * sender's unknown, then we create an entry to avoid unnecessary ARPs.
- * The design assumption is that someone ARPing us is likely to send us
- * a packet soon, and that we'll want to reply to it.
- */
- if (op == ARP_REQUEST) {
- const uchar_t *dstaddr = src_haddr;
- clock_t now;
-
- /*
- * This implements periodic address defense based on a modified
- * version of the RFC 3927 requirements. Instead of sending a
- * broadcasted reply every time, as demanded by the RFC, we
- * send at most one broadcast reply per arp_broadcast_interval.
- */
- now = ddi_get_lbolt();
- if ((now - dst_ace->ace_last_bcast) >
- MSEC_TO_TICK(as->as_broadcast_interval)) {
- DTRACE_PROBE3(rput_bcast_reply, arl_t *, arl,
- arh_t *, arh, ace_t *, dst_ace);
- dst_ace->ace_last_bcast = now;
- dstaddr = arl->arl_phy->ap_arp_addr;
- /*
- * If this is one of the long-suffering entries, then
- * pull it out now. It no longer needs separate
- * defense, because we're doing now that with this
- * broadcasted reply.
- */
- dst_ace->ace_flags &= ~ACE_F_DELAYED;
- }
-
- ar_xmit(arl, ARP_RESPONSE, dst_ace->ace_proto, plen,
- dst_ace->ace_hw_addr, dst_ace->ace_proto_addr,
- src_haddr, src_paddr, dstaddr, as);
- if (!is_probe && err == AR_NOTFOUND &&
- ar_ce_create(OWNING_ARL(arl), proto, src_haddr, hlen,
- src_paddr, plen, NULL, NULL, 0, NULL, 0) == 0) {
- ace_t *ace;
-
- ace = ar_ce_lookup(arl, proto, src_paddr, plen);
- ASSERT(ace != NULL);
- mi_timer(ace->ace_arl->arl_wq, ace->ace_mp,
- as->as_cleanup_interval);
- }
- }
- if (err == AR_CHANGED) {
- freeb(mp);
- ar_client_notify(client_arl, mp1, AR_CN_ANNOUNCE);
- TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END,
- "arp_rput_end: q %p (%S)", q, "reqchange");
- } else {
- freemsg(mp);
- TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END,
- "arp_rput_end: q %p (%S)", q, "end");
- }
-}
-
-static void
-ar_ce_restart_dad(ace_t *ace, void *arl_arg)
-{
- arl_t *arl = arl_arg;
- arp_stack_t *as = ARL_TO_ARPSTACK(arl);
-
- if ((ace->ace_xmit_arl == arl) &&
- (ace->ace_flags & (ACE_F_UNVERIFIED|ACE_F_DAD_ABORTED)) ==
- (ACE_F_UNVERIFIED|ACE_F_DAD_ABORTED)) {
- /*
- * Slight cheat here: we don't use the initial probe delay
- * in this obscure case.
- */
- if (ace->ace_flags & ACE_F_FAST) {
- ace->ace_xmit_count = as->as_fastprobe_count;
- ace->ace_xmit_interval = as->as_fastprobe_interval;
- } else {
- ace->ace_xmit_count = as->as_probe_count;
- ace->ace_xmit_interval = as->as_probe_interval;
- }
- ace->ace_flags &= ~ACE_F_DAD_ABORTED;
- ace_set_timer(ace, B_FALSE);
- }
-}
-
-/* DLPI messages, other than DL_UNITDATA_IND are handled here. */
-static void
-ar_rput_dlpi(queue_t *q, mblk_t *mp)
-{
- ar_t *ar = q->q_ptr;
- arl_t *arl = ar->ar_arl;
- arlphy_t *ap = NULL;
- union DL_primitives *dlp;
- const char *err_str;
- arp_stack_t *as = ar->ar_as;
-
- if (arl != NULL)
- ap = arl->arl_phy;
-
- if (MBLKL(mp) < sizeof (dlp->dl_primitive)) {
- putnext(q, mp);
- return;
- }
- dlp = (union DL_primitives *)mp->b_rptr;
- switch (dlp->dl_primitive) {
- case DL_ERROR_ACK:
- /*
- * ce is confused about how DLPI works, so we have to interpret
- * an "error" on DL_NOTIFY_ACK (which we never could have sent)
- * as really meaning an error on DL_NOTIFY_REQ.
- *
- * Note that supporting DL_NOTIFY_REQ is optional, so printing
- * out an error message on the console isn't warranted except
- * for debug.
- */
- if (dlp->error_ack.dl_error_primitive == DL_NOTIFY_ACK ||
- dlp->error_ack.dl_error_primitive == DL_NOTIFY_REQ) {
- ar_dlpi_done(arl, DL_NOTIFY_REQ);
- freemsg(mp);
- return;
- }
- err_str = dl_primstr(dlp->error_ack.dl_error_primitive);
- DTRACE_PROBE2(rput_dl_error, arl_t *, arl,
- dl_error_ack_t *, &dlp->error_ack);
- switch (dlp->error_ack.dl_error_primitive) {
- case DL_UNBIND_REQ:
- if (arl->arl_provider_style == DL_STYLE1)
- arl->arl_state = ARL_S_DOWN;
- break;
- case DL_DETACH_REQ:
- case DL_BIND_REQ:
- arl->arl_state = ARL_S_DOWN;
- break;
- case DL_ATTACH_REQ:
- break;
- default:
- /* If it's anything else, we didn't send it. */
- putnext(q, mp);
- return;
- }
- ar_dlpi_done(arl, dlp->error_ack.dl_error_primitive);
- (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
- "ar_rput_dlpi: %s failed, dl_errno %d, dl_unix_errno %d",
- err_str, dlp->error_ack.dl_errno,
- dlp->error_ack.dl_unix_errno);
- break;
- case DL_INFO_ACK:
- DTRACE_PROBE2(rput_dl_info, arl_t *, arl,
- dl_info_ack_t *, &dlp->info_ack);
- if (arl != NULL && arl->arl_dlpi_pending == DL_INFO_REQ) {
- /*
- * We have a response back from the driver. Go set up
- * transmit defaults.
- */
- ar_ll_set_defaults(arl, mp);
- ar_dlpi_done(arl, DL_INFO_REQ);
- } else if (arl == NULL) {
- ar_ll_init(as, ar, mp);
- }
- /* Kick off any awaiting messages */
- qenable(WR(q));
- break;
- case DL_OK_ACK:
- DTRACE_PROBE2(rput_dl_ok, arl_t *, arl,
- dl_ok_ack_t *, &dlp->ok_ack);
- switch (dlp->ok_ack.dl_correct_primitive) {
- case DL_UNBIND_REQ:
- if (arl->arl_provider_style == DL_STYLE1)
- arl->arl_state = ARL_S_DOWN;
- break;
- case DL_DETACH_REQ:
- arl->arl_state = ARL_S_DOWN;
- break;
- case DL_ATTACH_REQ:
- break;
- default:
- putnext(q, mp);
- return;
- }
- ar_dlpi_done(arl, dlp->ok_ack.dl_correct_primitive);
- break;
- case DL_NOTIFY_ACK:
- DTRACE_PROBE2(rput_dl_notify, arl_t *, arl,
- dl_notify_ack_t *, &dlp->notify_ack);
- /*
- * We mostly care about interface-up transitions, as this is
- * when we need to redo duplicate address detection.
- */
- if (ap != NULL) {
- ap->ap_notifies = (dlp->notify_ack.dl_notifications &
- DL_NOTE_LINK_UP) != 0;
- }
- ar_dlpi_done(arl, DL_NOTIFY_REQ);
- break;
- case DL_BIND_ACK:
- DTRACE_PROBE2(rput_dl_bind, arl_t *, arl,
- dl_bind_ack_t *, &dlp->bind_ack);
- if (ap != NULL) {
- caddr_t hw_addr;
-
- hw_addr = (caddr_t)dlp + dlp->bind_ack.dl_addr_offset;
- if (ap->ap_saplen > 0)
- hw_addr += ap->ap_saplen;
- bcopy(hw_addr, ap->ap_hw_addr, ap->ap_hw_addrlen);
- }
- arl->arl_state = ARL_S_UP;
- ar_dlpi_done(arl, DL_BIND_REQ);
- break;
- case DL_NOTIFY_IND:
- DTRACE_PROBE2(rput_dl_notify_ind, arl_t *, arl,
- dl_notify_ind_t *, &dlp->notify_ind);
-
- if (dlp->notify_ind.dl_notification == DL_NOTE_REPLUMB) {
- arl->arl_replumbing = B_TRUE;
- if (arl->arl_state == ARL_S_DOWN) {
- arp_replumb_done(arl, mp);
- return;
- }
- break;
- }
-
- if (ap != NULL) {
- switch (dlp->notify_ind.dl_notification) {
- case DL_NOTE_LINK_UP:
- ap->ap_link_down = B_FALSE;
- ar_ce_walk(as, ar_ce_restart_dad, arl);
- break;
- case DL_NOTE_LINK_DOWN:
- ap->ap_link_down = B_TRUE;
- break;
- }
- }
- break;
- case DL_UDERROR_IND:
- DTRACE_PROBE2(rput_dl_uderror, arl_t *, arl,
- dl_uderror_ind_t *, &dlp->uderror_ind);
- (void) mi_strlog(q, 1, SL_ERROR | SL_TRACE,
- "ar_rput_dlpi: "
- "DL_UDERROR_IND, dl_dest_addr_length %d dl_errno %d",
- dlp->uderror_ind.dl_dest_addr_length,
- dlp->uderror_ind.dl_errno);
- putnext(q, mp);
- return;
- default:
- DTRACE_PROBE2(rput_dl_badprim, arl_t *, arl,
- union DL_primitives *, dlp);
- putnext(q, mp);
- return;
- }
- freemsg(mp);
-}
-
-static void
-ar_set_address(ace_t *ace, uchar_t *addrpos, uchar_t *proto_addr,
- uint32_t proto_addr_len)
-{
- uchar_t *mask, *to;
- int len;
-
- ASSERT(ace->ace_hw_addr != NULL);
-
- bcopy(ace->ace_hw_addr, addrpos, ace->ace_hw_addr_length);
- if (ace->ace_flags & ACE_F_MAPPING &&
- proto_addr != NULL &&
- ace->ace_proto_extract_mask) { /* careful */
- len = MIN((int)ace->ace_hw_addr_length
- - ace->ace_hw_extract_start,
- proto_addr_len);
- mask = ace->ace_proto_extract_mask;
- to = addrpos + ace->ace_hw_extract_start;
- while (len-- > 0)
- *to++ |= *mask++ & *proto_addr++;
- }
-}
-
-static int
-ar_slifname(queue_t *q, mblk_t *mp_orig)
-{
- ar_t *ar = q->q_ptr;
- arl_t *arl = ar->ar_arl;
- struct lifreq *lifr;
- mblk_t *mp = mp_orig;
- arl_t *old_arl;
- mblk_t *ioccpy;
- struct iocblk *iocp;
- hook_nic_event_t info;
- arp_stack_t *as = ar->ar_as;
-
- if (ar->ar_on_ill_stream) {
- /*
- * This command is for IP, since it is coming down
- * the <arp-IP-driver> stream. Return ENOENT so that
- * it will be sent downstream by the caller
- */
- return (ENOENT);
- }
- /* We handle both M_IOCTL and M_PROTO messages */
- if (DB_TYPE(mp) == M_IOCTL)
- mp = mp->b_cont;
- if (q->q_next == NULL || arl == NULL) {
- /*
- * If the interface was just opened and
- * the info ack has not yet come back from the driver
- */
- DTRACE_PROBE2(slifname_no_arl, queue_t *, q,
- mblk_t *, mp_orig);
- (void) putq(q, mp_orig);
- return (EINPROGRESS);
- }
-
- if (MBLKL(mp) < sizeof (struct lifreq)) {
- DTRACE_PROBE2(slifname_malformed, queue_t *, q,
- mblk_t *, mp);
- }
-
- if (arl->arl_name[0] != '\0') {
- DTRACE_PROBE1(slifname_already, arl_t *, arl);
- return (EALREADY);
- }
-
- lifr = (struct lifreq *)mp->b_rptr;
-
- if (strlen(lifr->lifr_name) >= LIFNAMSIZ) {
- DTRACE_PROBE2(slifname_bad_name, arl_t *, arl,
- struct lifreq *, lifr);
- return (ENXIO);
- }
-
- /* Check whether the name is already in use. */
-
- old_arl = ar_ll_lookup_by_name(as, lifr->lifr_name);
- if (old_arl != NULL) {
- DTRACE_PROBE2(slifname_exists, arl_t *, arl, arl_t *, old_arl);
- return (EEXIST);
- }
-
- /* Make a copy of the message so we can send it downstream. */
- if ((ioccpy = allocb(sizeof (struct iocblk), BPRI_MED)) == NULL ||
- (ioccpy->b_cont = copymsg(mp)) == NULL) {
- if (ioccpy != NULL)
- freeb(ioccpy);
- return (ENOMEM);
- }
-
- (void) strlcpy(arl->arl_name, lifr->lifr_name, sizeof (arl->arl_name));
-
- /* The ppa is sent down by ifconfig */
- arl->arl_ppa = lifr->lifr_ppa;
-
- /*
- * A network device is not considered to be fully plumb'd until
- * its name has been set using SIOCSLIFNAME. Once it has
- * been set, it cannot be set again (see code above), so there
- * is currently no danger in this function causing two NE_PLUMB
- * events without an intervening NE_UNPLUMB.
- */
- info.hne_nic = arl->arl_index;
- info.hne_lif = 0;
- info.hne_event = NE_PLUMB;
- info.hne_data = arl->arl_name;
- info.hne_datalen = strlen(arl->arl_name);
- (void) hook_run(as->as_net_data->netd_hooks, as->as_arpnicevents,
- (hook_data_t)&info);
-
- /* Chain in the new arl. */
- rw_enter(&as->as_arl_lock, RW_WRITER);
- arl->arl_next = as->as_arl_head;
- as->as_arl_head = arl;
- rw_exit(&as->as_arl_lock);
- DTRACE_PROBE1(slifname_set, arl_t *, arl);
-
- /*
- * Send along a copy of the ioctl; this is just for hitbox. Use
- * M_CTL to avoid confusing anyone else who might be listening.
- */
- DB_TYPE(ioccpy) = M_CTL;
- iocp = (struct iocblk *)ioccpy->b_rptr;
- bzero(iocp, sizeof (*iocp));
- iocp->ioc_cmd = SIOCSLIFNAME;
- iocp->ioc_count = msgsize(ioccpy->b_cont);
- ioccpy->b_wptr = (uchar_t *)(iocp + 1);
- putnext(arl->arl_wq, ioccpy);
-
- return (0);
-}
-
-static int
-ar_set_ppa(queue_t *q, mblk_t *mp_orig)
-{
- ar_t *ar = (ar_t *)q->q_ptr;
- arl_t *arl = ar->ar_arl;
- int ppa;
- char *cp;
- mblk_t *mp = mp_orig;
- arl_t *old_arl;
- arp_stack_t *as = ar->ar_as;
-
- if (ar->ar_on_ill_stream) {
- /*
- * This command is for IP, since it is coming down
- * the <arp-IP-driver> stream. Return ENOENT so that
- * it will be sent downstream by the caller
- */
- return (ENOENT);
- }
-
- /* We handle both M_IOCTL and M_PROTO messages. */
- if (DB_TYPE(mp) == M_IOCTL)
- mp = mp->b_cont;
- if (q->q_next == NULL || arl == NULL) {
- /*
- * If the interface was just opened and
- * the info ack has not yet come back from the driver.
- */
- DTRACE_PROBE2(setppa_no_arl, queue_t *, q,
- mblk_t *, mp_orig);
- (void) putq(q, mp_orig);
- return (EINPROGRESS);
- }
-
- if (arl->arl_name[0] != '\0') {
- DTRACE_PROBE1(setppa_already, arl_t *, arl);
- return (EALREADY);
- }
-
- do {
- q = q->q_next;
- } while (q->q_next != NULL);
- cp = q->q_qinfo->qi_minfo->mi_idname;
-
- ppa = *(int *)(mp->b_rptr);
- (void) snprintf(arl->arl_name, sizeof (arl->arl_name), "%s%d", cp, ppa);
-
- old_arl = ar_ll_lookup_by_name(as, arl->arl_name);
- if (old_arl != NULL) {
- DTRACE_PROBE2(setppa_exists, arl_t *, arl, arl_t *, old_arl);
- /* Make it a null string again */
- arl->arl_name[0] = '\0';
- return (EBUSY);
- }
-
- arl->arl_ppa = ppa;
- DTRACE_PROBE1(setppa_done, arl_t *, arl);
- /* Chain in the new arl. */
- rw_enter(&as->as_arl_lock, RW_WRITER);
- arl->arl_next = as->as_arl_head;
- as->as_arl_head = arl;
- rw_exit(&as->as_arl_lock);
-
- return (0);
-}
-
-static int
-ar_snmp_msg(queue_t *q, mblk_t *mp_orig)
-{
- mblk_t *mpdata, *mp = mp_orig;
- struct opthdr *optp;
- msg2_args_t args;
- arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as;
-
- if (mp == NULL)
- return (0);
- /*
- * ar_cmd_dispatch() already checked for us that "mp->b_cont" is valid
- * in case of an M_IOCTL message.
- */
- if (DB_TYPE(mp) == M_IOCTL)
- mp = mp->b_cont;
-
- optp = (struct opthdr *)(&mp->b_rptr[sizeof (struct T_optmgmt_ack)]);
- if (optp->level == MIB2_IP && optp->name == MIB2_IP_MEDIA) {
- /*
- * Put our ARP cache entries in the ipNetToMediaTable mp from
- * IP. Due to a historical side effect of IP's MIB code, it
- * always passes us a b_cont, but the b_cont should be empty.
- */
- if ((mpdata = mp->b_cont) == NULL || MBLKL(mpdata) != 0)
- return (EINVAL);
-
- args.m2a_mpdata = mpdata;
- args.m2a_mptail = NULL;
- ar_ce_walk(as, ar_snmp_msg2, &args);
- optp->len = msgdsize(mpdata);
- }
- putnext(q, mp_orig);
- return (EINPROGRESS); /* so that rput() exits doing nothing... */
-}
-
-static void
-ar_snmp_msg2(ace_t *ace, void *arg)
-{
- const char *name = "unknown";
- mib2_ipNetToMediaEntry_t ntme;
- msg2_args_t *m2ap = arg;
-
- ASSERT(ace != NULL && ace->ace_arl != NULL);
- if (ace->ace_arl != NULL)
- name = ace->ace_arl->arl_name;
-
- /*
- * Fill in ntme using the information in the ACE.
- */
- ntme.ipNetToMediaType = (ace->ace_flags & ACE_F_PERMANENT) ? 4 : 3;
- ntme.ipNetToMediaIfIndex.o_length = MIN(OCTET_LENGTH, strlen(name));
- bcopy(name, ntme.ipNetToMediaIfIndex.o_bytes,
- ntme.ipNetToMediaIfIndex.o_length);
-
- bcopy(ace->ace_proto_addr, &ntme.ipNetToMediaNetAddress,
- MIN(sizeof (uint32_t), ace->ace_proto_addr_length));
-
- ntme.ipNetToMediaInfo.ntm_mask.o_length =
- MIN(OCTET_LENGTH, ace->ace_proto_addr_length);
- bcopy(ace->ace_proto_mask, ntme.ipNetToMediaInfo.ntm_mask.o_bytes,
- ntme.ipNetToMediaInfo.ntm_mask.o_length);
- ntme.ipNetToMediaInfo.ntm_flags = ace->ace_flags;
-
- ntme.ipNetToMediaPhysAddress.o_length =
- MIN(OCTET_LENGTH, ace->ace_hw_addr_length);
- if ((ace->ace_flags & ACE_F_RESOLVED) == 0)
- ntme.ipNetToMediaPhysAddress.o_length = 0;
- bcopy(ace->ace_hw_addr, ntme.ipNetToMediaPhysAddress.o_bytes,
- ntme.ipNetToMediaPhysAddress.o_length);
-
- /*
- * All entries within the ARP cache are unique, and there are no
- * preexisting entries in the ipNetToMediaTable mp, so just add 'em.
- */
- (void) snmp_append_data2(m2ap->m2a_mpdata, &m2ap->m2a_mptail,
- (char *)&ntme, sizeof (ntme));
-}
-
-/* Write side put procedure. */
-static void
-ar_wput(queue_t *q, mblk_t *mp)
-{
- int err;
- struct iocblk *ioc;
- mblk_t *mp1;
-
- TRACE_1(TR_FAC_ARP, TR_ARP_WPUT_START,
- "arp_wput_start: q %p", q);
-
- /*
- * Here we handle ARP commands coming from controlling processes
- * either in the form of M_IOCTL messages, or M_PROTO messages.
- */
- switch (DB_TYPE(mp)) {
- case M_IOCTL:
- switch (err = ar_cmd_dispatch(q, mp, B_TRUE)) {
- case ENOENT:
- /*
- * If it is an I_PLINK, process it. Otherwise
- * we don't recognize it, so pass it down.
- * Since ARP is a module there is always someone
- * below.
- */
- ASSERT(q->q_next != NULL);
- ioc = (struct iocblk *)mp->b_rptr;
- if ((ioc->ioc_cmd != I_PLINK) &&
- (ioc->ioc_cmd != I_PUNLINK)) {
- putnext(q, mp);
- TRACE_2(TR_FAC_ARP, TR_ARP_WPUT_END,
- "arp_wput_end: q %p (%S)",
- q, "ioctl/enoent");
- return;
- }
- err = ar_plink_send(q, mp);
- if (err == 0) {
- return;
- }
- if ((mp1 = mp->b_cont) != 0)
- mp1->b_wptr = mp1->b_rptr;
- break;
- case EINPROGRESS:
- /*
- * If the request resulted in an attempt to resolve
- * an address, we return out here. The IOCTL will
- * be completed in ar_rput if something comes back,
- * or as a result of the timer expiring.
- */
- TRACE_2(TR_FAC_ARP, TR_ARP_WPUT_END,
- "arp_wput_end: q %p (%S)", q, "inprog");
- return;
- default:
- DB_TYPE(mp) = M_IOCACK;
- break;
- }
- ioc = (struct iocblk *)mp->b_rptr;
- if (err != 0)
- ioc->ioc_error = err;
- if (ioc->ioc_error != 0) {
- /*
- * Don't free b_cont as IP/IB needs
- * it to identify the request.
- */
- DB_TYPE(mp) = M_IOCNAK;
- }
- ioc->ioc_count = msgdsize(mp->b_cont);
- qreply(q, mp);
- TRACE_2(TR_FAC_ARP, TR_ARP_WPUT_END,
- "arp_wput_end: q %p (%S)", q, "ioctl");
- return;
- case M_FLUSH:
- if (*mp->b_rptr & FLUSHW)
- flushq(q, FLUSHDATA);
- if (*mp->b_rptr & FLUSHR) {
- flushq(RD(q), FLUSHDATA);
- *mp->b_rptr &= ~FLUSHW;
- qreply(q, mp);
- TRACE_2(TR_FAC_ARP, TR_ARP_WPUT_END,
- "arp_wput_end: q %p (%S)", q, "flush");
- return;
- }
- /*
- * The normal behavior of a STREAMS module should be
- * to pass down M_FLUSH messages. However there is a
- * complex sequence of events during plumb/unplumb that
- * can cause DLPI messages in the driver's queue to be
- * flushed. So we don't send down M_FLUSH. This has been
- * reported for some drivers (Eg. le) that send up an M_FLUSH
- * in response to unbind request which will eventually be
- * looped back at the mux head and sent down. Since IP
- * does not queue messages in a module instance queue
- * of IP, nothing is lost by not sending down the flush.
- */
- freemsg(mp);
- return;
- case M_PROTO:
- case M_PCPROTO:
- /*
- * Commands in the form of PROTO messages are handled very
- * much the same as IOCTLs, but no response is returned.
- */
- switch (err = ar_cmd_dispatch(q, mp, B_TRUE)) {
- case ENOENT:
- if (q->q_next) {
- putnext(q, mp);
- TRACE_2(TR_FAC_ARP, TR_ARP_WPUT_END,
- "arp_wput_end: q %p (%S)", q,
- "proto/enoent");
- return;
- }
- break;
- case EINPROGRESS:
- TRACE_2(TR_FAC_ARP, TR_ARP_WPUT_END,
- "arp_wput_end: q %p (%S)", q, "proto/einprog");
- return;
- default:
- break;
- }
- break;
- case M_IOCDATA:
- /*
- * We pass M_IOCDATA downstream because it could be as a
- * result of a previous M_COPYIN/M_COPYOUT message sent
- * upstream.
- */
- /* FALLTHRU */
- case M_CTL:
- /*
- * We also send any M_CTL downstream as it could
- * contain control information for a module downstream.
- */
- putnext(q, mp);
- return;
- default:
- break;
- }
- /* Free any message we don't understand */
- freemsg(mp);
- TRACE_2(TR_FAC_ARP, TR_ARP_WPUT_END,
- "arp_wput_end: q %p (%S)", q, "end");
-}
-
-static boolean_t
-arp_say_ready(ace_t *ace)
-{
- mblk_t *mp;
- arl_t *arl = ace->ace_arl;
- arlphy_t *ap = ace->ace_xmit_arl->arl_phy;
- arh_t *arh;
- uchar_t *cp;
-
- mp = allocb(sizeof (*arh) + 2 * (ace->ace_hw_addr_length +
- ace->ace_proto_addr_length), BPRI_MED);
- if (mp == NULL) {
- /* skip a beat on allocation trouble */
- ace->ace_xmit_count = 1;
- ace_set_timer(ace, B_FALSE);
- return (B_FALSE);
- }
- /* Tell IP address is now usable */
- arh = (arh_t *)mp->b_rptr;
- U16_TO_BE16(ap->ap_arp_hw_type, arh->arh_hardware);
- U16_TO_BE16(ace->ace_proto, arh->arh_proto);
- arh->arh_hlen = ace->ace_hw_addr_length;
- arh->arh_plen = ace->ace_proto_addr_length;
- U16_TO_BE16(ARP_REQUEST, arh->arh_operation);
- cp = (uchar_t *)(arh + 1);
- bcopy(ace->ace_hw_addr, cp, ace->ace_hw_addr_length);
- cp += ace->ace_hw_addr_length;
- bcopy(ace->ace_proto_addr, cp, ace->ace_proto_addr_length);
- cp += ace->ace_proto_addr_length;
- bcopy(ace->ace_hw_addr, cp, ace->ace_hw_addr_length);
- cp += ace->ace_hw_addr_length;
- bcopy(ace->ace_proto_addr, cp, ace->ace_proto_addr_length);
- cp += ace->ace_proto_addr_length;
- mp->b_wptr = cp;
- ar_client_notify(arl, mp, AR_CN_READY);
- DTRACE_PROBE1(ready, ace_t *, ace);
- return (B_TRUE);
-}
-
-/*
- * Pick the longest-waiting aces for defense.
- */
-static void
-ace_reschedule(ace_t *ace, void *arg)
-{
- ace_resched_t *art = arg;
- ace_t **aces;
- ace_t **acemax;
- ace_t *atemp;
-
- if (ace->ace_xmit_arl != art->art_arl)
- return;
- /*
- * Only published entries that are ready for announcement are eligible.
- */
- if ((ace->ace_flags & (ACE_F_PUBLISH | ACE_F_UNVERIFIED | ACE_F_DYING |
- ACE_F_DELAYED)) != ACE_F_PUBLISH) {
- return;
- }
- if (art->art_naces < ACE_RESCHED_LIST_LEN) {
- art->art_aces[art->art_naces++] = ace;
- } else {
- aces = art->art_aces;
- acemax = aces + ACE_RESCHED_LIST_LEN;
- for (; aces < acemax; aces++) {
- if ((*aces)->ace_last_bcast > ace->ace_last_bcast) {
- atemp = *aces;
- *aces = ace;
- ace = atemp;
- }
- }
- }
-}
-
-/*
- * Reschedule the ARP defense of any long-waiting ACEs. It's assumed that this
- * doesn't happen very often (if at all), and thus it needn't be highly
- * optimized. (Note, though, that it's actually O(N) complexity, because the
- * outer loop is bounded by a constant rather than by the length of the list.)
- */
-static void
-arl_reschedule(arl_t *arl)
-{
- arlphy_t *ap = arl->arl_phy;
- ace_resched_t art;
- int i;
- ace_t *ace;
- arp_stack_t *as = ARL_TO_ARPSTACK(arl);
-
- i = ap->ap_defend_count;
- ap->ap_defend_count = 0;
- /* If none could be sitting around, then don't reschedule */
- if (i < as->as_defend_rate) {
- DTRACE_PROBE1(reschedule_none, arl_t *, arl);
- return;
- }
- art.art_arl = arl;
- while (ap->ap_defend_count < as->as_defend_rate) {
- art.art_naces = 0;
- ar_ce_walk(as, ace_reschedule, &art);
- for (i = 0; i < art.art_naces; i++) {
- ace = art.art_aces[i];
- ace->ace_flags |= ACE_F_DELAYED;
- ace_set_timer(ace, B_FALSE);
- if (++ap->ap_defend_count >= as->as_defend_rate)
- break;
- }
- if (art.art_naces < ACE_RESCHED_LIST_LEN)
- break;
- }
- DTRACE_PROBE1(reschedule, arl_t *, arl);
-}
-
-/*
- * Write side service routine. The only action here is delivery of transmit
- * timer events and delayed messages while waiting for the info_ack (ar_arl
- * not yet set).
- */
-static void
-ar_wsrv(queue_t *q)
-{
- ace_t *ace;
- arlphy_t *ap;
- mblk_t *mp;
- clock_t ms;
- arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as;
-
- TRACE_1(TR_FAC_ARP, TR_ARP_WSRV_START,
- "arp_wsrv_start: q %p", q);
-
- while ((mp = getq(q)) != NULL) {
- switch (DB_TYPE(mp)) {
- case M_PCSIG:
- if (!mi_timer_valid(mp))
- continue;
- ace = (ace_t *)mp->b_rptr;
- if (ace->ace_flags & ACE_F_DYING)
- continue;
- ap = ace->ace_xmit_arl->arl_phy;
- if (ace->ace_flags & ACE_F_UNVERIFIED) {
- ASSERT(ace->ace_flags & ACE_F_PUBLISH);
- ASSERT(ace->ace_query_mp == NULL);
- /*
- * If the link is down, give up for now. IP
- * will give us the go-ahead to try again when
- * the link restarts.
- */
- if (ap->ap_link_down) {
- DTRACE_PROBE1(timer_link_down,
- ace_t *, ace);
- ace->ace_flags |= ACE_F_DAD_ABORTED;
- continue;
- }
- if (ace->ace_xmit_count > 0) {
- DTRACE_PROBE1(timer_probe,
- ace_t *, ace);
- ace->ace_xmit_count--;
- ar_xmit(ace->ace_xmit_arl, ARP_REQUEST,
- ace->ace_proto,
- ace->ace_proto_addr_length,
- ace->ace_hw_addr, NULL, NULL,
- ace->ace_proto_addr, NULL, as);
- ace_set_timer(ace, B_FALSE);
- continue;
- }
- if (!arp_say_ready(ace))
- continue;
- DTRACE_PROBE1(timer_ready, ace_t *, ace);
- ace->ace_xmit_interval =
- as->as_publish_interval;
- ace->ace_xmit_count = as->as_publish_count;
- if (ace->ace_xmit_count == 0)
- ace->ace_xmit_count++;
- ace->ace_flags &= ~ACE_F_UNVERIFIED;
- }
- if (ace->ace_flags & ACE_F_PUBLISH) {
- clock_t now;
-
- /*
- * If an hour has passed, then free up the
- * entries that need defense by rescheduling
- * them.
- */
- now = ddi_get_lbolt();
- if (as->as_defend_rate > 0 &&
- now - ap->ap_defend_start >
- SEC_TO_TICK(as->as_defend_period)) {
- ap->ap_defend_start = now;
- arl_reschedule(ace->ace_xmit_arl);
- }
- /*
- * Finish the job that we started in
- * ar_entry_add. When we get to zero
- * announcement retransmits left, switch to
- * address defense.
- */
- ASSERT(ace->ace_query_mp == NULL);
- if (ace->ace_xmit_count > 0) {
- ace->ace_xmit_count--;
- DTRACE_PROBE1(timer_announce,
- ace_t *, ace);
- } else if (ace->ace_flags & ACE_F_DELAYED) {
- /*
- * This guy was rescheduled as one of
- * the really old entries needing
- * on-going defense. Let him through
- * now.
- */
- DTRACE_PROBE1(timer_send_delayed,
- ace_t *, ace);
- ace->ace_flags &= ~ACE_F_DELAYED;
- } else if (as->as_defend_rate > 0 &&
- (ap->ap_defend_count >=
- as->as_defend_rate ||
- ++ap->ap_defend_count >=
- as->as_defend_rate)) {
- /*
- * If we're no longer allowed to send
- * unbidden defense messages, then just
- * wait for rescheduling.
- */
- DTRACE_PROBE1(timer_excess_defense,
- ace_t *, ace);
- ace_set_timer(ace, B_FALSE);
- continue;
- } else {
- DTRACE_PROBE1(timer_defend,
- ace_t *, ace);
- }
- ar_xmit(ace->ace_xmit_arl, ARP_REQUEST,
- ace->ace_proto,
- ace->ace_proto_addr_length,
- ace->ace_hw_addr,
- ace->ace_proto_addr,
- ace->ace_xmit_arl->arl_phy->ap_arp_addr,
- ace->ace_proto_addr, NULL, as);
- ace->ace_last_bcast = now;
- if (ace->ace_xmit_count == 0)
- ace->ace_xmit_interval =
- as->as_defend_interval;
- if (ace->ace_xmit_interval != 0)
- ace_set_timer(ace, B_FALSE);
- continue;
- }
-
- /*
- * If this is a non-permanent (regular) resolved ARP
- * entry, then it's now time to check if it can be
- * retired. As an optimization, we check with IP
- * first, and just restart the timer if the address is
- * still in use.
- */
- if (ACE_NONPERM(ace)) {
- if (ace->ace_proto == IP_ARP_PROTO_TYPE &&
- ndp_lookup_ipaddr(*(ipaddr_t *)
- ace->ace_proto_addr, as->as_netstack)) {
- ace->ace_flags |= ACE_F_OLD;
- mi_timer(ace->ace_arl->arl_wq,
- ace->ace_mp,
- as->as_cleanup_interval);
- } else {
- ar_delete_notify(ace);
- ar_ce_delete(ace);
- }
- continue;
- }
-
- /*
- * ar_query_xmit returns the number of milliseconds to
- * wait following this transmit. If the number of
- * allowed transmissions has been exhausted, it will
- * return zero without transmitting. If that happens
- * we complete the operation with a failure indication.
- * Otherwise, we restart the timer.
- */
- ms = ar_query_xmit(as, ace);
- if (ms == 0)
- ar_query_reply(ace, ENXIO, NULL, (uint32_t)0);
- else
- mi_timer(q, mp, ms);
- continue;
- default:
- put(q, mp);
- continue;
- }
- }
- TRACE_1(TR_FAC_ARP, TR_ARP_WSRV_END,
- "arp_wsrv_end: q %p", q);
-}
-
-/* ar_xmit is called to transmit an ARP Request or Response. */
-static void
-ar_xmit(arl_t *arl, uint32_t operation, uint32_t proto, uint32_t plen,
- const uchar_t *haddr1, const uchar_t *paddr1, const uchar_t *haddr2,
- const uchar_t *paddr2, const uchar_t *dstaddr, arp_stack_t *as)
-{
- arh_t *arh;
- uint8_t *cp;
- uint_t hlen;
- mblk_t *mp;
- arlphy_t *ap = arl->arl_phy;
-
- ASSERT(!(arl->arl_flags & ARL_F_IPMP));
-
- if (ap == NULL) {
- DTRACE_PROBE1(xmit_no_arl_phy, arl_t *, arl);
- return;
- }
-
- /* IFF_NOARP flag is set or link down: do not send arp messages */
- if ((arl->arl_flags & ARL_F_NOARP) || ap->ap_link_down)
- return;
-
- hlen = ap->ap_hw_addrlen;
- if ((mp = copyb(ap->ap_xmit_mp)) == NULL)
- return;
-
- mp->b_cont = allocb(AR_LL_HDR_SLACK + ARH_FIXED_LEN + (hlen * 4) +
- plen + plen, BPRI_MED);
- if (mp->b_cont == NULL) {
- freeb(mp);
- return;
- }
-
- /* Get the L2 destination address for the message */
- if (haddr2 == NULL)
- dstaddr = ap->ap_arp_addr;
- else if (dstaddr == NULL)
- dstaddr = haddr2;
-
- /*
- * Figure out where the target hardware address goes in the
- * DL_UNITDATA_REQ header, and copy it in.
- */
- cp = mi_offset_param(mp, ap->ap_xmit_addroff, hlen);
- ASSERT(cp != NULL);
- if (cp == NULL) {
- freemsg(mp);
- return;
- }
- bcopy(dstaddr, cp, hlen);
-
- /* Fill in the ARP header. */
- cp = mp->b_cont->b_rptr + (AR_LL_HDR_SLACK + hlen + hlen);
- mp->b_cont->b_rptr = cp;
- arh = (arh_t *)cp;
- U16_TO_BE16(ap->ap_arp_hw_type, arh->arh_hardware);
- U16_TO_BE16(proto, arh->arh_proto);
- arh->arh_hlen = (uint8_t)hlen;
- arh->arh_plen = (uint8_t)plen;
- U16_TO_BE16(operation, arh->arh_operation);
- cp += ARH_FIXED_LEN;
- bcopy(haddr1, cp, hlen);
- cp += hlen;
- if (paddr1 == NULL)
- bzero(cp, plen);
- else
- bcopy(paddr1, cp, plen);
- cp += plen;
- if (haddr2 == NULL)
- bzero(cp, hlen);
- else
- bcopy(haddr2, cp, hlen);
- cp += hlen;
- bcopy(paddr2, cp, plen);
- cp += plen;
- mp->b_cont->b_wptr = cp;
-
- DTRACE_PROBE3(arp__physical__out__start,
- arl_t *, arl, arh_t *, arh, mblk_t *, mp);
-
- ARP_HOOK_OUT(as->as_arp_physical_out_event, as->as_arp_physical_out,
- arl->arl_index, arh, mp, mp->b_cont, as);
-
- DTRACE_PROBE1(arp__physical__out__end, mblk_t *, mp);
-
- if (mp == NULL)
- return;
-
- /* Ship it out. */
- if (canputnext(arl->arl_wq))
- putnext(arl->arl_wq, mp);
- else
- freemsg(mp);
-}
-
-static mblk_t *
-ar_alloc(uint32_t cmd, int err)
-{
- uint32_t len;
- mblk_t *mp;
- mblk_t *mp1;
- char *cp;
- arc_t *arc;
-
- /* For now only one type of command is accepted */
- if (cmd != AR_DLPIOP_DONE)
- return (NULL);
- len = sizeof (arc_t);
- mp = allocb(len, BPRI_HI);
- if (!mp)
- return (NULL);
-
- DB_TYPE(mp) = M_CTL;
- cp = (char *)mp->b_rptr;
- arc = (arc_t *)(mp->b_rptr);
- arc->arc_cmd = cmd;
- mp->b_wptr = (uchar_t *)&cp[len];
- len = sizeof (int);
- mp1 = allocb(len, BPRI_HI);
- if (!mp1) {
- freeb(mp);
- return (NULL);
- }
- cp = (char *)mp->b_rptr;
- /* Initialize the error code */
- *((int *)mp1->b_rptr) = err;
- mp1->b_wptr = (uchar_t *)&cp[len];
- linkb(mp, mp1);
- return (mp);
-}
-
-void
-arp_ddi_init(void)
-{
- /*
- * We want to be informed each time a stack is created or
- * destroyed in the kernel, so we can maintain the
- * set of arp_stack_t's.
- */
- netstack_register(NS_ARP, arp_stack_init, arp_stack_shutdown,
- arp_stack_fini);
-}
-
-void
-arp_ddi_destroy(void)
-{
- netstack_unregister(NS_ARP);
-}
-
-/*
- * Initialize the ARP stack instance.
- */
-/* ARGSUSED */
-static void *
-arp_stack_init(netstackid_t stackid, netstack_t *ns)
-{
- arp_stack_t *as;
- arpparam_t *pa;
-
- as = (arp_stack_t *)kmem_zalloc(sizeof (*as), KM_SLEEP);
- as->as_netstack = ns;
-
- pa = (arpparam_t *)kmem_alloc(sizeof (arp_param_arr), KM_SLEEP);
- as->as_param_arr = pa;
- bcopy(arp_param_arr, as->as_param_arr, sizeof (arp_param_arr));
-
- (void) ar_param_register(&as->as_nd,
- as->as_param_arr, A_CNT(arp_param_arr));
-
- as->as_arp_index_counter = 1;
- as->as_arp_counter_wrapped = 0;
-
- rw_init(&as->as_arl_lock, NULL, RW_DRIVER, NULL);
- arp_net_init(as, stackid);
- arp_hook_init(as);
-
- return (as);
-}
-
-/* ARGSUSED */
-static void
-arp_stack_shutdown(netstackid_t stackid, void *arg)
-{
- arp_stack_t *as = (arp_stack_t *)arg;
-
- arp_net_shutdown(as);
-}
-
-/*
- * Free the ARP stack instance.
- */
-/* ARGSUSED */
-static void
-arp_stack_fini(netstackid_t stackid, void *arg)
-{
- arp_stack_t *as = (arp_stack_t *)arg;
-
- arp_hook_destroy(as);
- arp_net_destroy(as);
- rw_destroy(&as->as_arl_lock);
- nd_free(&as->as_nd);
- kmem_free(as->as_param_arr, sizeof (arp_param_arr));
- as->as_param_arr = NULL;
- kmem_free(as, sizeof (*as));
-}
diff --git a/usr/src/uts/common/inet/arp/arp_netinfo.c b/usr/src/uts/common/inet/arp/arp_netinfo.c
deleted file mode 100644
index 9d9c6a5bbe..0000000000
--- a/usr/src/uts/common/inet/arp/arp_netinfo.c
+++ /dev/null
@@ -1,376 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#include <sys/param.h>
-#include <sys/types.h>
-#include <sys/systm.h>
-#include <sys/cmn_err.h>
-#include <sys/stream.h>
-#include <sys/sunddi.h>
-#include <sys/hook.h>
-#include <sys/hook_impl.h>
-#include <sys/netstack.h>
-#include <net/if.h>
-
-#include <sys/neti.h>
-#include <sys/hook_event.h>
-#include <inet/arp_impl.h>
-
-/*
- * ARP netinfo entry point declarations.
- */
-static int arp_getifname(net_handle_t, phy_if_t, char *, const size_t);
-static int arp_getmtu(net_handle_t, phy_if_t, lif_if_t);
-static int arp_getpmtuenabled(net_handle_t);
-static int arp_getlifaddr(net_handle_t, phy_if_t, lif_if_t, size_t,
- net_ifaddr_t [], void *);
-static int arp_getlifzone(net_handle_t, phy_if_t, lif_if_t, zoneid_t *);
-static int arp_getlifflags(net_handle_t, phy_if_t, lif_if_t, uint64_t *);
-static phy_if_t arp_phygetnext(net_handle_t, phy_if_t);
-static phy_if_t arp_phylookup(net_handle_t, const char *);
-static lif_if_t arp_lifgetnext(net_handle_t, phy_if_t, lif_if_t);
-static int arp_inject(net_handle_t, inject_t, net_inject_t *);
-static phy_if_t arp_routeto(net_handle_t, struct sockaddr *, struct sockaddr *);
-static int arp_ispartialchecksum(net_handle_t, mblk_t *);
-static int arp_isvalidchecksum(net_handle_t, mblk_t *);
-
-static net_protocol_t arp_netinfo = {
- NETINFO_VERSION,
- NHF_ARP,
- arp_getifname,
- arp_getmtu,
- arp_getpmtuenabled,
- arp_getlifaddr,
- arp_getlifzone,
- arp_getlifflags,
- arp_phygetnext,
- arp_phylookup,
- arp_lifgetnext,
- arp_inject,
- arp_routeto,
- arp_ispartialchecksum,
- arp_isvalidchecksum
-};
-
-/*
- * Register ARP netinfo functions.
- */
-void
-arp_net_init(arp_stack_t *as, netstackid_t stackid)
-{
- netid_t id;
-
- id = net_getnetidbynetstackid(stackid);
- ASSERT(id != -1);
-
- as->as_net_data = net_protocol_register(id, &arp_netinfo);
- ASSERT(as->as_net_data != NULL);
-}
-
-void
-arp_net_shutdown(arp_stack_t *as)
-{
- if (as->as_arpnicevents != NULL) {
- (void) net_event_shutdown(as->as_net_data,
- &as->as_arp_nic_events);
- }
-
- if (as->as_arp_physical_out != NULL) {
- (void) net_event_shutdown(as->as_net_data,
- &as->as_arp_physical_out_event);
- }
-
- if (as->as_arp_physical_in != NULL) {
- (void) net_event_shutdown(as->as_net_data,
- &as->as_arp_physical_in_event);
- }
-
- (void) net_family_shutdown(as->as_net_data, &as->as_arproot);
-}
-
-/*
- * Unregister ARP netinfo functions.
- */
-void
-arp_net_destroy(arp_stack_t *as)
-{
- if (net_protocol_unregister(as->as_net_data) == 0)
- as->as_net_data = NULL;
-}
-
-/*
- * Initialize ARP hook family and events
- */
-void
-arp_hook_init(arp_stack_t *as)
-{
- HOOK_FAMILY_INIT(&as->as_arproot, Hn_ARP);
- if (net_family_register(as->as_net_data, &as->as_arproot) != 0) {
- cmn_err(CE_NOTE, "arp_hook_init: "
- "net_family_register failed for arp");
- }
-
- HOOK_EVENT_INIT(&as->as_arp_physical_in_event, NH_PHYSICAL_IN);
- as->as_arp_physical_in = net_event_register(as->as_net_data,
- &as->as_arp_physical_in_event);
- if (as->as_arp_physical_in == NULL) {
- cmn_err(CE_NOTE, "arp_hook_init: "
- "net_event_register failed for arp/physical_in");
- }
-
- HOOK_EVENT_INIT(&as->as_arp_physical_out_event, NH_PHYSICAL_OUT);
- as->as_arp_physical_out = net_event_register(as->as_net_data,
- &as->as_arp_physical_out_event);
- if (as->as_arp_physical_out == NULL) {
- cmn_err(CE_NOTE, "arp_hook_init: "
- "net_event_register failed for arp/physical_out");
- }
-
- HOOK_EVENT_INIT(&as->as_arp_nic_events, NH_NIC_EVENTS);
- as->as_arpnicevents = net_event_register(as->as_net_data,
- &as->as_arp_nic_events);
- if (as->as_arpnicevents == NULL) {
- cmn_err(CE_NOTE, "arp_hook_init: "
- "net_event_register failed for arp/nic_events");
- }
-}
-
-void
-arp_hook_destroy(arp_stack_t *as)
-{
- if (as->as_arpnicevents != NULL) {
- if (net_event_unregister(as->as_net_data,
- &as->as_arp_nic_events) == 0)
- as->as_arpnicevents = NULL;
- }
-
- if (as->as_arp_physical_out != NULL) {
- if (net_event_unregister(as->as_net_data,
- &as->as_arp_physical_out_event) == 0)
- as->as_arp_physical_out = NULL;
- }
-
- if (as->as_arp_physical_in != NULL) {
- if (net_event_unregister(as->as_net_data,
- &as->as_arp_physical_in_event) == 0)
- as->as_arp_physical_in = NULL;
- }
-
- (void) net_family_unregister(as->as_net_data, &as->as_arproot);
-}
-
-/*
- * Determine the name of the lower level interface
- */
-static int
-arp_getifname(net_handle_t net, phy_if_t phy_ifdata, char *buffer,
- const size_t buflen)
-{
- arl_t *arl;
- arp_stack_t *as;
- netstack_t *ns = net->netd_stack->nts_netstack;
-
- ASSERT(buffer != NULL);
- ASSERT(ns != NULL);
-
- as = ns->netstack_arp;
- rw_enter(&as->as_arl_lock, RW_READER);
- for (arl = as->as_arl_head; arl != NULL; arl = arl->arl_next) {
- if (arl->arl_index == phy_ifdata) {
- (void) strlcpy(buffer, arl->arl_name, buflen);
- rw_exit(&as->as_arl_lock);
- return (0);
- }
- }
- rw_exit(&as->as_arl_lock);
-
- return (1);
-}
-
-/*
- * Unsupported with ARP.
- */
-/*ARGSUSED*/
-static int
-arp_getmtu(net_handle_t net, phy_if_t phy_ifdata, lif_if_t ifdata)
-{
- return (-1);
-}
-
-/*
- * Unsupported with ARP.
- */
-/*ARGSUSED*/
-static int
-arp_getpmtuenabled(net_handle_t net)
-{
- return (-1);
-}
-
-/*
- * Unsupported with ARP.
- */
-/*ARGSUSED*/
-static int
-arp_getlifaddr(net_handle_t net, phy_if_t phy_ifdata, lif_if_t ifdata,
- size_t nelem, net_ifaddr_t type[], void *storage)
-{
- return (-1);
-}
-
-/*
- * Determine the instance number of the next lower level interface
- */
-static phy_if_t
-arp_phygetnext(net_handle_t net, phy_if_t phy_ifdata)
-{
- arl_t *arl;
- int index;
- arp_stack_t *as;
- netstack_t *ns = net->netd_stack->nts_netstack;
-
- ASSERT(ns != NULL);
-
- as = ns->netstack_arp;
- rw_enter(&as->as_arl_lock, RW_READER);
- if (phy_ifdata == 0) {
- arl = as->as_arl_head;
- } else {
- for (arl = as->as_arl_head; arl != NULL;
- arl = arl->arl_next) {
- if (arl->arl_index == phy_ifdata) {
- arl = arl->arl_next;
- break;
- }
- }
- }
-
- index = (arl != NULL) ? arl->arl_index : 0;
-
- rw_exit(&as->as_arl_lock);
-
- return (index);
-}
-
-/*
- * Given a network interface name, find its ARP layer instance number.
- */
-static phy_if_t
-arp_phylookup(net_handle_t net, const char *name)
-{
- arl_t *arl;
- int index;
- arp_stack_t *as;
- netstack_t *ns = net->netd_stack->nts_netstack;
-
- ASSERT(name != NULL);
- ASSERT(ns != NULL);
-
- index = 0;
- as = ns->netstack_arp;
- rw_enter(&as->as_arl_lock, RW_READER);
- for (arl = as->as_arl_head; arl != NULL; arl = arl->arl_next) {
- if (strcmp(name, arl->arl_name) == 0) {
- index = arl->arl_index;
- break;
- }
- }
- rw_exit(&as->as_arl_lock);
-
- return (index);
-
-}
-
-/*
- * Unsupported with ARP.
- */
-/*ARGSUSED*/
-static lif_if_t
-arp_lifgetnext(net_handle_t net, phy_if_t ifp, lif_if_t lif)
-{
- return ((lif_if_t)-1);
-}
-
-/*
- * Unsupported with ARP.
- */
-/*ARGSUSED*/
-static int
-arp_inject(net_handle_t net, inject_t injection, net_inject_t *neti)
-{
- return (-1);
-}
-
-/*
- * Unsupported with ARP.
- */
-/*ARGSUSED*/
-static phy_if_t
-arp_routeto(net_handle_t net, struct sockaddr *addr, struct sockaddr *next)
-{
- return ((phy_if_t)-1);
-}
-
-/*
- * Unsupported with ARP.
- */
-/*ARGSUSED*/
-int
-arp_ispartialchecksum(net_handle_t net, mblk_t *mb)
-{
- return (-1);
-}
-
-/*
- * Unsupported with ARP.
- */
-/*ARGSUSED*/
-static int
-arp_isvalidchecksum(net_handle_t net, mblk_t *mb)
-{
- return (-1);
-}
-
-/*
- * Unsupported with ARP.
- */
-/*ARGSUSED*/
-static int
-arp_getlifzone(net_handle_t net, phy_if_t phy_ifdata, lif_if_t ifdata,
- zoneid_t *zoneid)
-{
- return (-1);
-}
-
-/*
- * Unsupported with ARP.
- */
-/*ARGSUSED*/
-static int
-arp_getlifflags(net_handle_t net, phy_if_t phy_ifdata, lif_if_t ifdata,
- uint64_t *flags)
-{
- return (-1);
-}
diff --git a/usr/src/uts/common/inet/arp/arpddi.c b/usr/src/uts/common/inet/arp/arpddi.c
index 2cc56b77fd..de8333295b 100644
--- a/usr/src/uts/common/inet/arp/arpddi.c
+++ b/usr/src/uts/common/inet/arp/arpddi.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -27,10 +27,8 @@
#include <sys/types.h>
#include <sys/conf.h>
#include <sys/modctl.h>
-#include <sys/ksynch.h>
#include <inet/common.h>
#include <inet/ip.h>
-#include <inet/arp_impl.h>
#define INET_NAME "arp"
#define INET_MODDESC "ARP STREAMS module"
@@ -39,28 +37,16 @@
#define INET_DEVSTRTAB ipinfov4
#define INET_MODSTRTAB arpinfo
#define INET_DEVMTFLAGS IP_DEVMTFLAGS /* since as a driver we're ip */
-#define INET_MODMTFLAGS (D_MP | D_MTPERMOD)
+#define INET_MODMTFLAGS D_MP
#include "../inetddi.c"
-extern void arp_ddi_init(void);
-extern void arp_ddi_destroy(void);
-
int
_init(void)
{
int error;
- /*
- * Note: After mod_install succeeds, another thread can enter
- * therefore all initialization is done before it and any
- * de-initialization needed done if it fails.
- */
- arp_ddi_init();
error = mod_install(&modlinkage);
- if (error != 0)
- arp_ddi_destroy();
-
return (error);
}
@@ -70,8 +56,6 @@ _fini(void)
int error;
error = mod_remove(&modlinkage);
- if (error == 0)
- arp_ddi_destroy();
return (error);
}
diff --git a/usr/src/uts/common/inet/arp_impl.h b/usr/src/uts/common/inet/arp_impl.h
deleted file mode 100644
index 38d0d1ab65..0000000000
--- a/usr/src/uts/common/inet/arp_impl.h
+++ /dev/null
@@ -1,253 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _ARP_IMPL_H
-#define _ARP_IMPL_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifdef _KERNEL
-
-#include <sys/types.h>
-#include <sys/stream.h>
-#include <net/if.h>
-#include <sys/netstack.h>
-
-/* ARP kernel hash size; used for mdb support */
-#define ARP_HASH_SIZE 256
-
-/* Named Dispatch Parameter Management Structure */
-typedef struct arpparam_s {
- uint32_t arp_param_min;
- uint32_t arp_param_max;
- uint32_t arp_param_value;
- char *arp_param_name;
-} arpparam_t;
-
-/* ARL Structure, one per link level device */
-typedef struct arl_s {
- struct arl_s *arl_next; /* ARL chain at arl_g_head */
- queue_t *arl_rq; /* Read queue pointer */
- queue_t *arl_wq; /* Write queue pointer */
- t_uscalar_t arl_ppa; /* DL_ATTACH parameter */
- char arl_name[LIFNAMSIZ]; /* Lower level name */
- mblk_t *arl_unbind_mp;
- mblk_t *arl_detach_mp;
- t_uscalar_t arl_provider_style; /* From DL_INFO_ACK */
- mblk_t *arl_queue; /* Queued commands head */
- mblk_t *arl_queue_tail; /* Queued commands tail */
- uint32_t arl_flags; /* ARL_F_* values below */
- t_uscalar_t arl_dlpi_pending; /* pending DLPI request */
- mblk_t *arl_dlpi_deferred; /* Deferred DLPI messages */
- uint_t arl_state; /* lower interface state */
- uint_t arl_closing : 1, /* stream is closing */
- arl_replumbing : 1; /* Wait for IP to bring down */
- uint32_t arl_index; /* instance number */
- struct arlphy_s *arl_phy; /* physical info, if any */
- struct arl_s *arl_ipmp_arl; /* pointer to group arl_t */
-} arl_t;
-
-/*
- * There is no field to get from an arl_t to an arp_stack_t, but this
- * macro does it.
- */
-#define ARL_TO_ARPSTACK(_arl) (((ar_t *)(_arl)->arl_rq->q_ptr)->ar_as)
-
-/* ARL physical info structure, one per physical link level device */
-typedef struct arlphy_s {
- uint32_t ap_arp_hw_type; /* hardware type */
- uchar_t *ap_arp_addr; /* multicast address to use */
- uchar_t *ap_hw_addr; /* hardware address */
- uint32_t ap_hw_addrlen; /* hardware address length */
- mblk_t *ap_xmit_mp; /* DL_UNITDATA_REQ template */
- t_uscalar_t ap_xmit_addroff; /* address offset in xmit_mp */
- t_uscalar_t ap_xmit_sapoff; /* sap offset in xmit_mp */
- t_scalar_t ap_saplen; /* sap length */
- clock_t ap_defend_start; /* start of 1-hour period */
- uint_t ap_defend_count; /* # of unbidden broadcasts */
- uint_t ap_notifies : 1, /* handles DL_NOTE_LINK */
- ap_link_down : 1; /* DL_NOTE status */
-} arlphy_t;
-
-/* ARP Cache Entry */
-typedef struct ace_s {
- struct ace_s *ace_next; /* Hash chain next pointer */
- struct ace_s **ace_ptpn; /* Pointer to previous next */
- struct arl_s *ace_arl; /* Associated arl */
- uint32_t ace_proto; /* Protocol for this ace */
- uint32_t ace_flags;
- uchar_t *ace_proto_addr;
- uint32_t ace_proto_addr_length;
- uchar_t *ace_proto_mask; /* Mask for matching addr */
- uchar_t *ace_proto_extract_mask; /* For mappings */
- uchar_t *ace_hw_addr;
- uint32_t ace_hw_addr_length;
- uint32_t ace_hw_extract_start; /* For mappings */
- mblk_t *ace_mp; /* mblk we are in */
- mblk_t *ace_query_mp; /* outstanding query chain */
- clock_t ace_last_bcast; /* last broadcast Response */
- clock_t ace_xmit_interval;
- int ace_xmit_count;
- arl_t *ace_xmit_arl; /* xmit on this arl */
-} ace_t;
-
-#define ARPHOOK_INTERESTED_PHYSICAL_IN(as) \
- (as->as_arp_physical_in_event.he_interested)
-#define ARPHOOK_INTERESTED_PHYSICAL_OUT(as) \
- (as->as_arp_physical_out_event.he_interested)
-
-#define ARP_HOOK_IN(_hook, _event, _ilp, _hdr, _fm, _m, as) \
- \
- if ((_hook).he_interested) { \
- hook_pkt_event_t info; \
- \
- info.hpe_protocol = as->as_net_data; \
- info.hpe_ifp = _ilp; \
- info.hpe_ofp = 0; \
- info.hpe_hdr = _hdr; \
- info.hpe_mp = &(_fm); \
- info.hpe_mb = _m; \
- if (hook_run(as->as_net_data->netd_hooks, \
- _event, (hook_data_t)&info) != 0) { \
- if (_fm != NULL) { \
- freemsg(_fm); \
- _fm = NULL; \
- } \
- _hdr = NULL; \
- _m = NULL; \
- } else { \
- _hdr = info.hpe_hdr; \
- _m = info.hpe_mb; \
- } \
- }
-
-#define ARP_HOOK_OUT(_hook, _event, _olp, _hdr, _fm, _m, as) \
- \
- if ((_hook).he_interested) { \
- hook_pkt_event_t info; \
- \
- info.hpe_protocol = as->as_net_data; \
- info.hpe_ifp = 0; \
- info.hpe_ofp = _olp; \
- info.hpe_hdr = _hdr; \
- info.hpe_mp = &(_fm); \
- info.hpe_mb = _m; \
- if (hook_run(as->as_net_data->netd_hooks, \
- _event, (hook_data_t)&info) != 0) { \
- if (_fm != NULL) { \
- freemsg(_fm); \
- _fm = NULL; \
- } \
- _hdr = NULL; \
- _m = NULL; \
- } else { \
- _hdr = info.hpe_hdr; \
- _m = info.hpe_mb; \
- } \
- }
-
-#define ACE_EXTERNAL_FLAGS_MASK \
- (ACE_F_PERMANENT | ACE_F_PUBLISH | ACE_F_MAPPING | ACE_F_MYADDR | \
- ACE_F_AUTHORITY)
-
-/*
- * ARP stack instances
- */
-struct arp_stack {
- netstack_t *as_netstack; /* Common netstack */
- void *as_head; /* AR Instance Data List Head */
- caddr_t as_nd; /* AR Named Dispatch Head */
- struct arl_s *as_arl_head; /* ARL List Head */
- arpparam_t *as_param_arr; /* ndd variable table */
-
- /* ARP Cache Entry Hash Table */
- ace_t *as_ce_hash_tbl[ARP_HASH_SIZE];
- ace_t *as_ce_mask_entries;
-
- /*
- * With the introduction of netinfo (neti kernel module),
- * it is now possible to access data structures in the ARP module
- * without the code being executed in the context of the IP module,
- * thus there is no locking being enforced through the use of STREAMS.
- * as_arl_lock is used to protect as_arl_head list.
- */
- krwlock_t as_arl_lock;
-
- uint32_t as_arp_index_counter;
- uint32_t as_arp_counter_wrapped;
-
- /* arp_neti.c */
- hook_family_t as_arproot;
-
- /*
- * Hooks for ARP
- */
- hook_event_t as_arp_physical_in_event;
- hook_event_t as_arp_physical_out_event;
- hook_event_t as_arp_nic_events;
-
- hook_event_token_t as_arp_physical_in;
- hook_event_token_t as_arp_physical_out;
- hook_event_token_t as_arpnicevents;
-
- net_handle_t as_net_data;
-};
-typedef struct arp_stack arp_stack_t;
-
-#define ARL_F_NOARP 0x01
-#define ARL_F_IPMP 0x02
-
-#define ARL_S_DOWN 0x00
-#define ARL_S_PENDING 0x01
-#define ARL_S_UP 0x02
-
-/* AR Structure, one per upper stream */
-typedef struct ar_s {
- queue_t *ar_rq; /* Read queue pointer */
- queue_t *ar_wq; /* Write queue pointer */
- arl_t *ar_arl; /* Associated arl */
- cred_t *ar_credp; /* Credentials associated w/ open */
- struct ar_s *ar_arl_ip_assoc; /* ARL - IP association */
- uint32_t
- ar_ip_acked_close : 1, /* IP has acked the close */
- ar_on_ill_stream : 1; /* Module below is IP */
- arp_stack_t *ar_as;
-} ar_t;
-
-extern void arp_hook_init(arp_stack_t *);
-extern void arp_hook_destroy(arp_stack_t *);
-extern void arp_net_init(arp_stack_t *, netstackid_t);
-extern void arp_net_shutdown(arp_stack_t *);
-extern void arp_net_destroy(arp_stack_t *);
-
-#endif /* _KERNEL */
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _ARP_IMPL_H */
diff --git a/usr/src/uts/common/inet/ip.h b/usr/src/uts/common/inet/ip.h
index 5a7e05b210..88a14068bb 100644
--- a/usr/src/uts/common/inet/ip.h
+++ b/usr/src/uts/common/inet/ip.h
@@ -55,8 +55,6 @@ extern "C" {
#include <sys/squeue.h>
#include <net/route.h>
#include <sys/systm.h>
-#include <sys/multidata.h>
-#include <sys/list.h>
#include <net/radix.h>
#include <sys/modhash.h>
@@ -94,6 +92,7 @@ typedef uint32_t ipaddr_t;
/* Number of bits in an address */
#define IP_ABITS 32
+#define IPV4_ABITS IP_ABITS
#define IPV6_ABITS 128
#define IP_HOST_MASK (ipaddr_t)0xffffffffU
@@ -101,14 +100,6 @@ typedef uint32_t ipaddr_t;
#define IP_CSUM(mp, off, sum) (~ip_cksum(mp, off, sum) & 0xFFFF)
#define IP_CSUM_PARTIAL(mp, off, sum) ip_cksum(mp, off, sum)
#define IP_BCSUM_PARTIAL(bp, len, sum) bcksum(bp, len, sum)
-#define IP_MD_CSUM(pd, off, sum) (~ip_md_cksum(pd, off, sum) & 0xffff)
-#define IP_MD_CSUM_PARTIAL(pd, off, sum) ip_md_cksum(pd, off, sum)
-
-/*
- * Flag to IP write side to indicate that the appln has sent in a pre-built
- * IP header. Stored in ipha_ident (which is otherwise zero).
- */
-#define IP_HDR_INCLUDED 0xFFFF
#define ILL_FRAG_HASH_TBL_COUNT ((unsigned int)64)
#define ILL_FRAG_HASH_TBL_SIZE (ILL_FRAG_HASH_TBL_COUNT * sizeof (ipfb_t))
@@ -137,17 +128,12 @@ typedef uint32_t ipaddr_t;
#define UDPH_SIZE 8
-/* Leave room for ip_newroute to tack on the src and target addresses */
-#define OK_RESOLVER_MP(mp) \
- ((mp) && ((mp)->b_wptr - (mp)->b_rptr) >= (2 * IP_ADDR_LEN))
-
/*
* Constants and type definitions to support IP IOCTL commands
*/
#define IP_IOCTL (('i'<<8)|'p')
#define IP_IOC_IRE_DELETE 4
#define IP_IOC_IRE_DELETE_NO_REPLY 5
-#define IP_IOC_IRE_ADVISE_NO_REPLY 6
#define IP_IOC_RTS_REQUEST 7
/* Common definitions used by IP IOCTL data structures */
@@ -157,31 +143,6 @@ typedef struct ipllcmd_s {
uint_t ipllc_name_length;
} ipllc_t;
-/* IP IRE Change Command Structure. */
-typedef struct ipic_s {
- ipllc_t ipic_ipllc;
- uint_t ipic_ire_type;
- uint_t ipic_max_frag;
- uint_t ipic_addr_offset;
- uint_t ipic_addr_length;
- uint_t ipic_mask_offset;
- uint_t ipic_mask_length;
- uint_t ipic_src_addr_offset;
- uint_t ipic_src_addr_length;
- uint_t ipic_ll_hdr_offset;
- uint_t ipic_ll_hdr_length;
- uint_t ipic_gateway_addr_offset;
- uint_t ipic_gateway_addr_length;
- clock_t ipic_rtt;
- uint32_t ipic_ssthresh;
- clock_t ipic_rtt_sd;
- uchar_t ipic_ire_marks;
-} ipic_t;
-
-#define ipic_cmd ipic_ipllc.ipllc_cmd
-#define ipic_ll_name_length ipic_ipllc.ipllc_name_length
-#define ipic_ll_name_offset ipic_ipllc.ipllc_name_offset
-
/* IP IRE Delete Command Structure. */
typedef struct ipid_s {
ipllc_t ipid_ipllc;
@@ -257,16 +218,8 @@ typedef struct ipoptp_s
#define Q_TO_ICMP(q) (Q_TO_CONN((q))->conn_icmp)
#define Q_TO_RTS(q) (Q_TO_CONN((q))->conn_rts)
-/*
- * The following two macros are used by IP to get the appropriate
- * wq and rq for a conn. If it is a TCP conn, then we need
- * tcp_wq/tcp_rq else, conn_wq/conn_rq. IP can use conn_wq and conn_rq
- * from a conn directly if it knows that the conn is not TCP.
- */
-#define CONNP_TO_WQ(connp) \
- (IPCL_IS_TCP(connp) ? (connp)->conn_tcp->tcp_wq : (connp)->conn_wq)
-
-#define CONNP_TO_RQ(connp) RD(CONNP_TO_WQ(connp))
+#define CONNP_TO_WQ(connp) ((connp)->conn_wq)
+#define CONNP_TO_RQ(connp) ((connp)->conn_rq)
#define GRAB_CONN_LOCK(q) { \
if (q != NULL && CONN_Q(q)) \
@@ -278,9 +231,6 @@ typedef struct ipoptp_s
mutex_exit(&(Q_TO_CONN(q))->conn_lock); \
}
-/* "Congestion controlled" protocol */
-#define IP_FLOW_CONTROLLED_ULP(p) ((p) == IPPROTO_TCP || (p) == IPPROTO_SCTP)
-
/*
* Complete the pending operation. Usually an ioctl. Can also
* be a bind or option management request that got enqueued
@@ -295,63 +245,13 @@ typedef struct ipoptp_s
}
/*
- * Flags for the various ip_fanout_* routines.
- */
-#define IP_FF_SEND_ICMP 0x01 /* Send an ICMP error */
-#define IP_FF_HDR_COMPLETE 0x02 /* Call ip_hdr_complete if error */
-#define IP_FF_CKSUM 0x04 /* Recompute ipha_cksum if error */
-#define IP_FF_RAWIP 0x08 /* Use rawip mib variable */
-#define IP_FF_SRC_QUENCH 0x10 /* OK to send ICMP_SOURCE_QUENCH */
-#define IP_FF_SYN_ADDIRE 0x20 /* Add IRE if TCP syn packet */
-#define IP_FF_IPINFO 0x80 /* Used for both V4 and V6 */
-#define IP_FF_SEND_SLLA 0x100 /* Send source link layer info ? */
-#define IPV6_REACHABILITY_CONFIRMATION 0x200 /* Flags for ip_xmit_v6 */
-#define IP_FF_NO_MCAST_LOOP 0x400 /* No multicasts for sending zone */
-
-/*
- * Following flags are used by IPQoS to determine if policy processing is
- * required.
- */
-#define IP6_NO_IPPOLICY 0x800 /* Don't do IPQoS processing */
-#define IP6_IN_LLMCAST 0x1000 /* Multicast */
-
-#define IP_FF_LOOPBACK 0x2000 /* Loopback fanout */
-#define IP_FF_SCTP_CSUM_ERR 0x4000 /* sctp pkt has failed chksum */
-
-#ifndef IRE_DB_TYPE
-#define IRE_DB_TYPE M_SIG
-#endif
-
-#ifndef IRE_DB_REQ_TYPE
-#define IRE_DB_REQ_TYPE M_PCSIG
-#endif
-
-#ifndef IRE_ARPRESOLVE_TYPE
-#define IRE_ARPRESOLVE_TYPE M_EVENT
-#endif
-
-/*
* Values for squeue switch:
*/
-
#define IP_SQUEUE_ENTER_NODRAIN 1
#define IP_SQUEUE_ENTER 2
-/*
- * This is part of the interface between Transport provider and
- * IP which can be used to set policy information. This is usually
- * accompanied with O_T_BIND_REQ/T_BIND_REQ.ip_bind assumes that
- * only IPSEC_POLICY_SET is there when it is found in the chain.
- * The information contained is an struct ipsec_req_t. On success
- * or failure, either the T_BIND_ACK or the T_ERROR_ACK is returned.
- * IPSEC_POLICY_SET is never returned.
- */
-#define IPSEC_POLICY_SET M_SETOPTS
+#define IP_SQUEUE_FILL 3
-#define IRE_IS_LOCAL(ire) ((ire != NULL) && \
- ((ire)->ire_type & (IRE_LOCAL | IRE_LOOPBACK)))
-
-#define IRE_IS_TARGET(ire) ((ire != NULL) && \
- ((ire)->ire_type != IRE_BROADCAST))
+extern int ip_squeue_flag;
/* IP Fragmentation Reassembly Header */
typedef struct ipf_s {
@@ -387,71 +287,6 @@ typedef struct ipf_s {
#define ipf_src V4_PART_OF_V6(ipf_v6src)
#define ipf_dst V4_PART_OF_V6(ipf_v6dst)
-typedef enum {
- IB_PKT = 0x01,
- OB_PKT = 0x02
-} ip_pkt_t;
-
-#define UPDATE_IB_PKT_COUNT(ire)\
- { \
- (ire)->ire_ib_pkt_count++; \
- if ((ire)->ire_ipif != NULL) { \
- /* \
- * forwarding packet \
- */ \
- if ((ire)->ire_type & (IRE_LOCAL|IRE_BROADCAST)) \
- atomic_add_32(&(ire)->ire_ipif->ipif_ib_pkt_count, 1);\
- else \
- atomic_add_32(&(ire)->ire_ipif->ipif_fo_pkt_count, 1);\
- } \
- }
-
-#define UPDATE_OB_PKT_COUNT(ire)\
- { \
- (ire)->ire_ob_pkt_count++;\
- if ((ire)->ire_ipif != NULL) { \
- atomic_add_32(&(ire)->ire_ipif->ipif_ob_pkt_count, 1); \
- } \
- }
-
-#define IP_RPUT_LOCAL(q, mp, ipha, ire, recv_ill) \
-{ \
- switch (ipha->ipha_protocol) { \
- case IPPROTO_UDP: \
- ip_udp_input(q, mp, ipha, ire, recv_ill); \
- break; \
- default: \
- ip_proto_input(q, mp, ipha, ire, recv_ill, 0); \
- break; \
- } \
-}
-
-/*
- * NCE_EXPIRED is TRUE when we have a non-permanent nce that was
- * found to be REACHABLE more than ip_ire_arp_interval ms ago.
- * This macro is used to age existing nce_t entries. The
- * nce's will get cleaned up in the following circumstances:
- * - ip_ire_trash_reclaim will free nce's using ndp_cache_reclaim
- * when memory is low,
- * - ip_arp_news, when updates are received.
- * - if the nce is NCE_EXPIRED(), it will deleted, so that a new
- * arp request will need to be triggered from an ND_INITIAL nce.
- *
- * Note that the nce state transition follows the pattern:
- * ND_INITIAL -> ND_INCOMPLETE -> ND_REACHABLE
- * after which the nce is deleted when it has expired.
- *
- * nce_last is the timestamp that indicates when the nce_res_mp in the
- * nce_t was last updated to a valid link-layer address. nce_last gets
- * modified/updated :
- * - when the nce is created
- * - every time we get a sane arp response for the nce.
- */
-#define NCE_EXPIRED(nce, ipst) (nce->nce_last > 0 && \
- ((nce->nce_flags & NCE_F_PERMANENT) == 0) && \
- ((TICK_TO_MSEC(lbolt64) - nce->nce_last) > \
- (ipst)->ips_ip_ire_arp_interval))
-
#endif /* _KERNEL */
/* ICMP types */
@@ -560,7 +395,17 @@ typedef struct ipha_s {
#define IPH_DF 0x4000 /* Don't fragment */
#define IPH_MF 0x2000 /* More fragments to come */
#define IPH_OFFSET 0x1FFF /* Where the offset lives */
-#define IPH_FRAG_HDR 0x8000 /* IPv6 don't fragment bit */
+
+/* Byte-order specific values */
+#ifdef _BIG_ENDIAN
+#define IPH_DF_HTONS 0x4000 /* Don't fragment */
+#define IPH_MF_HTONS 0x2000 /* More fragments to come */
+#define IPH_OFFSET_HTONS 0x1FFF /* Where the offset lives */
+#else
+#define IPH_DF_HTONS 0x0040 /* Don't fragment */
+#define IPH_MF_HTONS 0x0020 /* More fragments to come */
+#define IPH_OFFSET_HTONS 0xFF1F /* Where the offset lives */
+#endif
/* ECN code points for IPv4 TOS byte and IPv6 traffic class octet. */
#define IPH_ECN_NECT 0x0 /* Not ECN-Capable Transport */
@@ -571,10 +416,8 @@ typedef struct ipha_s {
struct ill_s;
typedef void ip_v6intfid_func_t(struct ill_s *, in6_addr_t *);
-typedef boolean_t ip_v6mapinfo_func_t(uint_t, uint8_t *, uint8_t *, uint32_t *,
- in6_addr_t *);
-typedef boolean_t ip_v4mapinfo_func_t(uint_t, uint8_t *, uint8_t *, uint32_t *,
- ipaddr_t *);
+typedef void ip_v6mapinfo_func_t(struct ill_s *, uchar_t *, uchar_t *);
+typedef void ip_v4mapinfo_func_t(struct ill_s *, uchar_t *, uchar_t *);
/* IP Mac info structure */
typedef struct ip_m_s {
@@ -582,8 +425,8 @@ typedef struct ip_m_s {
int ip_m_type; /* From <net/if_types.h> */
t_uscalar_t ip_m_ipv4sap;
t_uscalar_t ip_m_ipv6sap;
- ip_v4mapinfo_func_t *ip_m_v4mapinfo;
- ip_v6mapinfo_func_t *ip_m_v6mapinfo;
+ ip_v4mapinfo_func_t *ip_m_v4mapping;
+ ip_v6mapinfo_func_t *ip_m_v6mapping;
ip_v6intfid_func_t *ip_m_v6intfid;
ip_v6intfid_func_t *ip_m_v6destintfid;
} ip_m_t;
@@ -591,20 +434,14 @@ typedef struct ip_m_s {
/*
* The following functions attempt to reduce the link layer dependency
* of the IP stack. The current set of link specific operations are:
- * a. map from IPv4 class D (224.0/4) multicast address range to the link
- * layer multicast address range.
- * b. map from IPv6 multicast address range (ff00::/8) to the link
- * layer multicast address range.
- * c. derive the default IPv6 interface identifier from the interface.
- * d. derive the default IPv6 destination interface identifier from
+ * a. map from IPv4 class D (224.0/4) multicast address range or the
+ * IPv6 multicast address range (ff00::/8) to the link layer multicast
+ * address.
+ * b. derive the default IPv6 interface identifier from the interface.
+ * c. derive the default IPv6 destination interface identifier from
* the interface (point-to-point only).
*/
-#define MEDIA_V4MINFO(ip_m, plen, bphys, maddr, hwxp, v4ptr) \
- (((ip_m)->ip_m_v4mapinfo != NULL) && \
- (*(ip_m)->ip_m_v4mapinfo)(plen, bphys, maddr, hwxp, v4ptr))
-#define MEDIA_V6MINFO(ip_m, plen, bphys, maddr, hwxp, v6ptr) \
- (((ip_m)->ip_m_v6mapinfo != NULL) && \
- (*(ip_m)->ip_m_v6mapinfo)(plen, bphys, maddr, hwxp, v6ptr))
+extern void ip_mcast_mapping(struct ill_s *, uchar_t *, uchar_t *);
/* ip_m_v6*intfid return void and are never NULL */
#define MEDIA_V6INTFID(ip_m, ill, v6ptr) (ip_m)->ip_m_v6intfid(ill, v6ptr)
#define MEDIA_V6DESTINTFID(ip_m, ill, v6ptr) \
@@ -616,107 +453,38 @@ typedef struct ip_m_s {
#define IRE_LOCAL 0x0004 /* Route entry for local address */
#define IRE_LOOPBACK 0x0008 /* Route entry for loopback address */
#define IRE_PREFIX 0x0010 /* Route entry for prefix routes */
+#ifndef _KERNEL
+/* Keep so user-level still compiles */
#define IRE_CACHE 0x0020 /* Cached Route entry */
+#endif
#define IRE_IF_NORESOLVER 0x0040 /* Route entry for local interface */
/* net without any address mapping. */
#define IRE_IF_RESOLVER 0x0080 /* Route entry for local interface */
/* net with resolver. */
#define IRE_HOST 0x0100 /* Host route entry */
+/* Keep so user-level still compiles */
#define IRE_HOST_REDIRECT 0x0200 /* only used for T_SVR4_OPTMGMT_REQ */
+#define IRE_IF_CLONE 0x0400 /* Per host clone of IRE_IF */
+#define IRE_MULTICAST 0x0800 /* Special - not in table */
+#define IRE_NOROUTE 0x1000 /* Special - not in table */
#define IRE_INTERFACE (IRE_IF_NORESOLVER | IRE_IF_RESOLVER)
-#define IRE_OFFSUBNET (IRE_DEFAULT | IRE_PREFIX | IRE_HOST)
-#define IRE_CACHETABLE (IRE_CACHE | IRE_BROADCAST | IRE_LOCAL | \
- IRE_LOOPBACK)
-#define IRE_FORWARDTABLE (IRE_INTERFACE | IRE_OFFSUBNET)
-
-/*
- * If an IRE is marked with IRE_MARK_CONDEMNED, the last walker of
- * the bucket should delete this IRE from this bucket.
- */
-#define IRE_MARK_CONDEMNED 0x0001
-
-/*
- * An IRE with IRE_MARK_PMTU has ire_max_frag set from an ICMP error.
- */
-#define IRE_MARK_PMTU 0x0002
-
-/*
- * An IRE with IRE_MARK_TESTHIDDEN is used by in.mpathd for test traffic. It
- * can only be looked up by requesting MATCH_IRE_MARK_TESTHIDDEN.
- */
-#define IRE_MARK_TESTHIDDEN 0x0004
-
-/*
- * An IRE with IRE_MARK_NOADD is created in ip_newroute_ipif when the outgoing
- * interface is specified by e.g. IP_PKTINFO. The IRE is not added to the IRE
- * cache table.
- */
-#define IRE_MARK_NOADD 0x0008 /* Mark not to add ire in cache */
-
-/*
- * IRE marked with IRE_MARK_TEMPORARY means that this IRE has been used
- * either for forwarding a packet or has not been used for sending
- * traffic on TCP connections terminated on this system. In both
- * cases, this IRE is the first to go when IRE is being cleaned up.
- */
-#define IRE_MARK_TEMPORARY 0x0010
-
-/*
- * IRE marked with IRE_MARK_USESRC_CHECK means that while adding an IRE with
- * this mark, additional atomic checks need to be performed. For eg: by the
- * time an IRE_CACHE is created, sent up to ARP and then comes back to IP; the
- * usesrc grouping could have changed in which case we want to fail adding
- * the IRE_CACHE entry
- */
-#define IRE_MARK_USESRC_CHECK 0x0020
-
-/*
- * IRE_MARK_PRIVATE_ADDR is used for IP_NEXTHOP. When IP_NEXTHOP is set, the
- * routing table lookup for the destination is bypassed and the packet is
- * sent directly to the specified nexthop. The associated IRE_CACHE entries
- * should be marked with IRE_MARK_PRIVATE_ADDR flag so that they don't show up
- * in regular ire cache lookups.
- */
-#define IRE_MARK_PRIVATE_ADDR 0x0040
+#define IRE_IF_ALL (IRE_IF_NORESOLVER | IRE_IF_RESOLVER | \
+ IRE_IF_CLONE)
+#define IRE_OFFSUBNET (IRE_DEFAULT | IRE_PREFIX | IRE_HOST)
+#define IRE_OFFLINK IRE_OFFSUBNET
/*
- * When we send an ARP resolution query for the nexthop gateway's ire,
- * we use esballoc to create the ire_t in the AR_ENTRY_QUERY mblk
- * chain, and mark its ire_marks with IRE_MARK_UNCACHED. This flag
- * indicates that information from ARP has not been transferred to a
- * permanent IRE_CACHE entry. The flag is reset only when the
- * information is successfully transferred to an ire_cache entry (in
- * ire_add()). Attempting to free the AR_ENTRY_QUERY mblk chain prior
- * to ire_add (e.g., from arp, or from ip`ip_wput_nondata) will
- * require that the resources (incomplete ire_cache and/or nce) must
- * be cleaned up. The free callback routine (ire_freemblk()) checks
- * for IRE_MARK_UNCACHED to see if any resources that are pinned down
- * will need to be cleaned up or not.
+ * Note that we view IRE_NOROUTE as ONLINK since we can "send" to them without
+ * going through a router; the result of sending will be an error/icmp error.
*/
-
-#define IRE_MARK_UNCACHED 0x0080
-
-/*
- * The comment below (and for other netstack_t references) refers
- * to the fact that we only do netstack_hold in particular cases,
- * such as the references from open streams (ill_t and conn_t's
- * pointers). Internally within IP we rely on IP's ability to cleanup e.g.
- * ire_t's when an ill goes away.
- */
-typedef struct ire_expire_arg_s {
- int iea_flush_flag;
- ip_stack_t *iea_ipst; /* Does not have a netstack_hold */
-} ire_expire_arg_t;
-
-/* Flags with ire_expire routine */
-#define FLUSH_ARP_TIME 0x0001 /* ARP info potentially stale timer */
-#define FLUSH_REDIRECT_TIME 0x0002 /* Redirects potentially stale */
-#define FLUSH_MTU_TIME 0x0004 /* Include path MTU per RFC 1191 */
+#define IRE_ONLINK (IRE_IF_ALL|IRE_LOCAL|IRE_LOOPBACK| \
+ IRE_BROADCAST|IRE_MULTICAST|IRE_NOROUTE)
/* Arguments to ire_flush_cache() */
#define IRE_FLUSH_DELETE 0
#define IRE_FLUSH_ADD 1
+#define IRE_FLUSH_GWCHANGE 2
/*
* Open/close synchronization flags.
@@ -724,31 +492,21 @@ typedef struct ire_expire_arg_s {
* depends on the atomic 32 bit access to that field.
*/
#define CONN_CLOSING 0x01 /* ip_close waiting for ip_wsrv */
-#define CONN_IPSEC_LOAD_WAIT 0x02 /* waiting for load */
-#define CONN_CONDEMNED 0x04 /* conn is closing, no more refs */
-#define CONN_INCIPIENT 0x08 /* conn not yet visible, no refs */
-#define CONN_QUIESCED 0x10 /* conn is now quiescent */
-
-/* Used to check connection state flags before caching the IRE */
-#define CONN_CACHE_IRE(connp) \
- (!((connp)->conn_state_flags & (CONN_CLOSING|CONN_CONDEMNED)))
-
-/*
- * Parameter to ip_output giving the identity of the caller.
- * IP_WSRV means the packet was enqueued in the STREAMS queue
- * due to flow control and is now being reprocessed in the context of
- * the STREAMS service procedure, consequent to flow control relief.
- * IRE_SEND means the packet is being reprocessed consequent to an
- * ire cache creation and addition and this may or may not be happening
- * in the service procedure context. Anything other than the above 2
- * cases is identified as IP_WPUT. Most commonly this is the case of
- * packets coming down from the application.
+#define CONN_CONDEMNED 0x02 /* conn is closing, no more refs */
+#define CONN_INCIPIENT 0x04 /* conn not yet visible, no refs */
+#define CONN_QUIESCED 0x08 /* conn is now quiescent */
+#define CONN_UPDATE_ILL 0x10 /* conn_update_ill in progress */
+
+/*
+ * Flags for dce_flags field. Specifies which information has been set.
+ * dce_ident is always present, but the other ones are identified by the flags.
*/
-#ifdef _KERNEL
-#define IP_WSRV 1 /* Called from ip_wsrv */
-#define IP_WPUT 2 /* Called from ip_wput */
-#define IRE_SEND 3 /* Called from ire_send */
+#define DCEF_DEFAULT 0x0001 /* Default DCE - no pmtu or uinfo */
+#define DCEF_PMTU 0x0002 /* Different than interface MTU */
+#define DCEF_UINFO 0x0004 /* dce_uinfo set */
+#define DCEF_TOO_SMALL_PMTU 0x0008 /* Smaller than IPv4/IPv6 MIN */
+#ifdef _KERNEL
/*
* Extra structures need for per-src-addr filtering (IGMPv3/MLDv2)
*/
@@ -786,90 +544,80 @@ typedef struct mrec_s {
} mrec_t;
/* Group membership list per upper conn */
+
/*
- * XXX add ilg info for ifaddr/ifindex.
- * XXX can we make ilg survive an ifconfig unplumb + plumb
- * by setting the ipif/ill to NULL and recover that later?
+ * We record the multicast information from the socket option in
+ * ilg_ifaddr/ilg_ifindex. This allows rejoining the group in the case when
+ * the ifaddr (or ifindex) disappears and later reappears, potentially on
+ * a different ill. The IPv6 multicast socket options and ioctls all specify
+ * the interface using an ifindex. For IPv4 some socket options/ioctls use
+ * the interface address and others use the index. We record here the method
+ * that was actually used (and leave the other of ilg_ifaddr or ilg_ifindex)
+ * at zero so that we can rejoin the way the application intended.
*
- * ilg_ipif is used by IPv4 as multicast groups are joined using an interface
- * address (ipif).
- * ilg_ill is used by IPv6 as multicast groups are joined using an interface
- * index (phyint->phyint_ifindex).
- * ilg_ill is NULL for IPv4 and ilg_ipif is NULL for IPv6.
+ * We track the ill on which we will or already have joined an ilm using
+ * ilg_ill. When we have succeeded joining the ilm and have a refhold on it
+ * then we set ilg_ilm. Thus intentionally there is a window where ilg_ill is
+ * set and ilg_ilm is not set. This allows clearing ilg_ill as a signal that
+ * the ill is being unplumbed and the ilm should be discarded.
*
* ilg records the state of multicast memberships of a socket end point.
* ilm records the state of multicast memberships with the driver and is
* maintained per interface.
*
- * There is no direct link between a given ilg and ilm. If the
- * application has joined a group G with ifindex I, we will have
- * an ilg with ilg_v6group and ilg_ill. There will be a corresponding
- * ilm with ilm_ill/ilm_v6addr recording the multicast membership.
- * To delete the membership:
- *
- * a) Search for ilg matching on G and I with ilg_v6group
- * and ilg_ill. Delete ilg_ill.
- * b) Search the corresponding ilm matching on G and I with
- * ilm_v6addr and ilm_ill. Delete ilm.
- *
- * For IPv4 the only difference is that we look using ipifs, not ills.
+ * The ilg state is protected by conn_ilg_lock.
+ * The ilg will not be freed until ilg_refcnt drops to zero.
*/
-
-/*
- * The ilg_t and ilm_t members are protected by ipsq. They can be changed only
- * by a thread executing in the ipsq. In other words add/delete of a
- * multicast group has to execute in the ipsq.
- */
-#define ILG_DELETED 0x1 /* ilg_flags */
typedef struct ilg_s {
+ struct ilg_s *ilg_next;
+ struct ilg_s **ilg_ptpn;
+ struct conn_s *ilg_connp; /* Back pointer to get lock */
in6_addr_t ilg_v6group;
- struct ipif_s *ilg_ipif; /* Logical interface we are member on */
- struct ill_s *ilg_ill; /* Used by IPv6 */
- uint_t ilg_flags;
+ ipaddr_t ilg_ifaddr; /* For some IPv4 cases */
+ uint_t ilg_ifindex; /* IPv6 and some other IPv4 cases */
+ struct ill_s *ilg_ill; /* Where ilm is joined. No refhold */
+ struct ilm_s *ilg_ilm; /* With ilm_refhold */
+ uint_t ilg_refcnt;
mcast_record_t ilg_fmode; /* MODE_IS_INCLUDE/MODE_IS_EXCLUDE */
slist_t *ilg_filter;
+ boolean_t ilg_condemned; /* Conceptually deleted */
} ilg_t;
/*
* Multicast address list entry for ill.
- * ilm_ipif is used by IPv4 as multicast groups are joined using ipif.
- * ilm_ill is used by IPv6 as multicast groups are joined using ill.
- * ilm_ill is NULL for IPv4 and ilm_ipif is NULL for IPv6.
+ * ilm_ill is used by IPv4 and IPv6
+ *
+ * The ilm state (and other multicast state on the ill) is protected by
+ * ill_mcast_lock. Operations that change state on both an ilg and ilm
+ * in addition use ill_mcast_serializer to ensure that we can't have
+ * interleaving between e.g., add and delete operations for the same conn_t,
+ * group, and ill.
*
* The comment below (and for other netstack_t references) refers
* to the fact that we only do netstack_hold in particular cases,
- * such as the references from open streams (ill_t and conn_t's
+ * such as the references from open endpoints (ill_t and conn_t's
* pointers). Internally within IP we rely on IP's ability to cleanup e.g.
* ire_t's when an ill goes away.
*/
-#define ILM_DELETED 0x1 /* ilm_flags */
typedef struct ilm_s {
in6_addr_t ilm_v6addr;
int ilm_refcnt;
uint_t ilm_timer; /* IGMP/MLD query resp timer, in msec */
- struct ipif_s *ilm_ipif; /* Back pointer to ipif for IPv4 */
struct ilm_s *ilm_next; /* Linked list for each ill */
uint_t ilm_state; /* state of the membership */
- struct ill_s *ilm_ill; /* Back pointer to ill for IPv6 */
- uint_t ilm_flags;
- boolean_t ilm_notify_driver; /* Need to notify the driver */
+ struct ill_s *ilm_ill; /* Back pointer to ill - ill_ilm_cnt */
zoneid_t ilm_zoneid;
int ilm_no_ilg_cnt; /* number of joins w/ no ilg */
mcast_record_t ilm_fmode; /* MODE_IS_INCLUDE/MODE_IS_EXCLUDE */
slist_t *ilm_filter; /* source filter list */
slist_t *ilm_pendsrcs; /* relevant src addrs for pending req */
rtx_state_t ilm_rtx; /* SCR retransmission state */
+ ipaddr_t ilm_ifaddr; /* For IPv4 netstat */
ip_stack_t *ilm_ipst; /* Does not have a netstack_hold */
} ilm_t;
#define ilm_addr V4_PART_OF_V6(ilm_v6addr)
-typedef struct ilm_walker {
- struct ill_s *ilw_ill; /* associated ill */
- struct ill_s *ilw_ipmp_ill; /* associated ipmp ill (if any) */
- struct ill_s *ilw_walk_ill; /* current ill being walked */
-} ilm_walker_t;
-
/*
* Soft reference to an IPsec SA.
*
@@ -898,40 +646,28 @@ typedef struct ipsa_ref_s
* In the presence of IPsec policy, fully-bound conn's bind a connection
* to more than just the 5-tuple, but also a specific IPsec action and
* identity-pair.
- *
- * As an optimization, we also cache soft references to IPsec SA's
- * here so that we can fast-path around most of the work needed for
+ * The identity pair is accessed from both the receive and transmit side
+ * hence it is maintained in the ipsec_latch_t structure. conn_latch and
+ * ixa_ipsec_latch points to it.
+ * The policy and actions are stored in conn_latch_in_policy and
+ * conn_latch_in_action for the inbound side, and in ixa_ipsec_policy and
+ * ixa_ipsec_action for the transmit side.
+ *
+ * As an optimization, we also cache soft references to IPsec SA's in
+ * ip_xmit_attr_t so that we can fast-path around most of the work needed for
* outbound IPsec SA selection.
- *
- * Were it not for TCP's detached connections, this state would be
- * in-line in conn_t; instead, this is in a separate structure so it
- * can be handed off to TCP when a connection is detached.
*/
typedef struct ipsec_latch_s
{
kmutex_t ipl_lock;
uint32_t ipl_refcnt;
- uint64_t ipl_unique;
- struct ipsec_policy_s *ipl_in_policy; /* latched policy (in) */
- struct ipsec_policy_s *ipl_out_policy; /* latched policy (out) */
- struct ipsec_action_s *ipl_in_action; /* latched action (in) */
- struct ipsec_action_s *ipl_out_action; /* latched action (out) */
- cred_t *ipl_local_id;
struct ipsid_s *ipl_local_cid;
struct ipsid_s *ipl_remote_cid;
unsigned int
- ipl_out_action_latched : 1,
- ipl_in_action_latched : 1,
- ipl_out_policy_latched : 1,
- ipl_in_policy_latched : 1,
-
ipl_ids_latched : 1,
- ipl_pad_to_bit_31 : 27;
-
- ipsa_ref_t ipl_ref[2]; /* 0: ESP, 1: AH */
-
+ ipl_pad_to_bit_31 : 31;
} ipsec_latch_t;
#define IPLATCH_REFHOLD(ipl) { \
@@ -939,97 +675,19 @@ typedef struct ipsec_latch_s
ASSERT((ipl)->ipl_refcnt != 0); \
}
-#define IPLATCH_REFRELE(ipl, ns) { \
+#define IPLATCH_REFRELE(ipl) { \
ASSERT((ipl)->ipl_refcnt != 0); \
membar_exit(); \
if (atomic_add_32_nv(&(ipl)->ipl_refcnt, -1) == 0) \
- iplatch_free(ipl, ns); \
+ iplatch_free(ipl); \
}
/*
* peer identity structure.
*/
-
typedef struct conn_s conn_t;
/*
- * The old IP client structure "ipc_t" is gone. All the data is stored in the
- * connection structure "conn_t" now. The mapping of old and new fields looks
- * like this:
- *
- * ipc_ulp conn_ulp
- * ipc_rq conn_rq
- * ipc_wq conn_wq
- *
- * ipc_laddr conn_src
- * ipc_faddr conn_rem
- * ipc_v6laddr conn_srcv6
- * ipc_v6faddr conn_remv6
- *
- * ipc_lport conn_lport
- * ipc_fport conn_fport
- * ipc_ports conn_ports
- *
- * ipc_policy conn_policy
- * ipc_latch conn_latch
- *
- * ipc_irc_lock conn_lock
- * ipc_ire_cache conn_ire_cache
- *
- * ipc_state_flags conn_state_flags
- * ipc_outgoing_ill conn_outgoing_ill
- *
- * ipc_dontroute conn_dontroute
- * ipc_loopback conn_loopback
- * ipc_broadcast conn_broadcast
- * ipc_reuseaddr conn_reuseaddr
- *
- * ipc_multicast_loop conn_multicast_loop
- * ipc_multi_router conn_multi_router
- * ipc_draining conn_draining
- *
- * ipc_did_putbq conn_did_putbq
- * ipc_unspec_src conn_unspec_src
- * ipc_policy_cached conn_policy_cached
- *
- * ipc_in_enforce_policy conn_in_enforce_policy
- * ipc_out_enforce_policy conn_out_enforce_policy
- * ipc_af_isv6 conn_af_isv6
- * ipc_pkt_isv6 conn_pkt_isv6
- *
- * ipc_ipv6_recvpktinfo conn_ipv6_recvpktinfo
- *
- * ipc_ipv6_recvhoplimit conn_ipv6_recvhoplimit
- * ipc_ipv6_recvhopopts conn_ipv6_recvhopopts
- * ipc_ipv6_recvdstopts conn_ipv6_recvdstopts
- *
- * ipc_ipv6_recvrthdr conn_ipv6_recvrthdr
- * ipc_ipv6_recvrtdstopts conn_ipv6_recvrtdstopts
- * ipc_fully_bound conn_fully_bound
- *
- * ipc_recvif conn_recvif
- *
- * ipc_recvslla conn_recvslla
- * ipc_acking_unbind conn_acking_unbind
- * ipc_pad_to_bit_31 conn_pad_to_bit_31
- *
- * ipc_proto conn_proto
- * ipc_incoming_ill conn_incoming_ill
- * ipc_pending_ill conn_pending_ill
- * ipc_unbind_mp conn_unbind_mp
- * ipc_ilg conn_ilg
- * ipc_ilg_allocated conn_ilg_allocated
- * ipc_ilg_inuse conn_ilg_inuse
- * ipc_ilg_walker_cnt conn_ilg_walker_cnt
- * ipc_refcv conn_refcv
- * ipc_multicast_ipif conn_multicast_ipif
- * ipc_multicast_ill conn_multicast_ill
- * ipc_drain_next conn_drain_next
- * ipc_drain_prev conn_drain_prev
- * ipc_idl conn_idl
- */
-
-/*
* This is used to match an inbound/outbound datagram with policy.
*/
typedef struct ipsec_selector {
@@ -1069,22 +727,6 @@ typedef struct ipsec_selector {
#define IPSEC_POLICY_MAX 5 /* Always max + 1. */
/*
- * Folowing macro is used whenever the code does not know whether there
- * is a M_CTL present in the front and it needs to examine the actual mp
- * i.e the IP header. As a M_CTL message could be in the front, this
- * extracts the packet into mp and the M_CTL mp into first_mp. If M_CTL
- * mp is not present, both first_mp and mp point to the same message.
- */
-#define EXTRACT_PKT_MP(mp, first_mp, mctl_present) \
- (first_mp) = (mp); \
- if ((mp)->b_datap->db_type == M_CTL) { \
- (mp) = (mp)->b_cont; \
- (mctl_present) = B_TRUE; \
- } else { \
- (mctl_present) = B_FALSE; \
- }
-
-/*
* Check with IPSEC inbound policy if
*
* 1) per-socket policy is present - indicated by conn_in_enforce_policy.
@@ -1113,11 +755,6 @@ typedef struct ipsec_selector {
/*
* Information cached in IRE for upper layer protocol (ULP).
- *
- * Notice that ire_max_frag is not included in the iulp_t structure, which
- * it may seem that it should. But ire_max_frag cannot really be cached. It
- * is fixed for each interface. For MTU found by PMTUd, we may want to cache
- * it. But currently, we do not do that.
*/
typedef struct iulp_s {
boolean_t iulp_set; /* Is any metric set? */
@@ -1128,17 +765,21 @@ typedef struct iulp_s {
uint32_t iulp_rpipe; /* Receive pipe size. */
uint32_t iulp_rtomax; /* Max round trip timeout. */
uint32_t iulp_sack; /* Use SACK option (TCP)? */
+ uint32_t iulp_mtu; /* Setable with routing sockets */
+
uint32_t
iulp_tstamp_ok : 1, /* Use timestamp option (TCP)? */
iulp_wscale_ok : 1, /* Use window scale option (TCP)? */
iulp_ecn_ok : 1, /* Enable ECN (for TCP)? */
iulp_pmtud_ok : 1, /* Enable PMTUd? */
- iulp_not_used : 28;
-} iulp_t;
+ /* These three are passed out by ip_set_destination */
+ iulp_localnet: 1, /* IRE_ONLINK */
+ iulp_loopback: 1, /* IRE_LOOPBACK */
+ iulp_local: 1, /* IRE_LOCAL */
-/* Zero iulp_t. */
-extern const iulp_t ire_uinfo_null;
+ iulp_not_used : 25;
+} iulp_t;
/*
* The conn drain list structure (idl_t).
@@ -1173,7 +814,6 @@ struct idl_tx_list_s {
struct idl_s {
conn_t *idl_conn; /* Head of drain list */
kmutex_t idl_lock; /* Lock for this list */
- conn_t *idl_conn_draining; /* conn that is draining */
uint32_t
idl_repeat : 1, /* Last conn must re-enable */
/* drain list again */
@@ -1182,36 +822,38 @@ struct idl_s {
};
#define CONN_DRAIN_LIST_LOCK(connp) (&((connp)->conn_idl->idl_lock))
+
/*
* Interface route structure which holds the necessary information to recreate
- * routes that are tied to an interface (namely where ire_ipif != NULL).
+ * routes that are tied to an interface i.e. have ire_ill set.
+ *
* These routes which were initially created via a routing socket or via the
* SIOCADDRT ioctl may be gateway routes (RTF_GATEWAY being set) or may be
- * traditional interface routes. When an interface comes back up after being
- * marked down, this information will be used to recreate the routes. These
- * are part of an mblk_t chain that hangs off of the IPIF (ipif_saved_ire_mp).
+ * traditional interface routes. When an ill comes back up after being
+ * down, this information will be used to recreate the routes. These
+ * are part of an mblk_t chain that hangs off of the ILL (ill_saved_ire_mp).
*/
typedef struct ifrt_s {
ushort_t ifrt_type; /* Type of IRE */
in6_addr_t ifrt_v6addr; /* Address IRE represents. */
- in6_addr_t ifrt_v6gateway_addr; /* Gateway if IRE_OFFSUBNET */
- in6_addr_t ifrt_v6src_addr; /* Src addr if RTF_SETSRC */
+ in6_addr_t ifrt_v6gateway_addr; /* Gateway if IRE_OFFLINK */
+ in6_addr_t ifrt_v6setsrc_addr; /* Src addr if RTF_SETSRC */
in6_addr_t ifrt_v6mask; /* Mask for matching IRE. */
uint32_t ifrt_flags; /* flags related to route */
- uint_t ifrt_max_frag; /* MTU (next hop or path). */
- iulp_t ifrt_iulp_info; /* Cached IRE ULP info. */
+ iulp_t ifrt_metrics; /* Routing socket metrics */
+ zoneid_t ifrt_zoneid; /* zoneid for route */
} ifrt_t;
#define ifrt_addr V4_PART_OF_V6(ifrt_v6addr)
#define ifrt_gateway_addr V4_PART_OF_V6(ifrt_v6gateway_addr)
-#define ifrt_src_addr V4_PART_OF_V6(ifrt_v6src_addr)
#define ifrt_mask V4_PART_OF_V6(ifrt_v6mask)
+#define ifrt_setsrc_addr V4_PART_OF_V6(ifrt_v6setsrc_addr)
/* Number of IP addresses that can be hosted on a physical interface */
#define MAX_ADDRS_PER_IF 8192
/*
* Number of Source addresses to be considered for source address
- * selection. Used by ipif_select_source[_v6].
+ * selection. Used by ipif_select_source_v4/v6.
*/
#define MAX_IPIF_SELECT_SOURCE 50
@@ -1245,16 +887,13 @@ typedef struct th_hash_s {
#define IPIF_CONDEMNED 0x1 /* The ipif is being removed */
#define IPIF_CHANGING 0x2 /* A critcal ipif field is changing */
#define IPIF_SET_LINKLOCAL 0x10 /* transient flag during bringup */
-#define IPIF_ZERO_SOURCE 0x20 /* transient flag during bringup */
/* IP interface structure, one per local address */
typedef struct ipif_s {
struct ipif_s *ipif_next;
struct ill_s *ipif_ill; /* Back pointer to our ill */
int ipif_id; /* Logical unit number */
- uint_t ipif_mtu; /* Starts at ipif_ill->ill_max_frag */
in6_addr_t ipif_v6lcl_addr; /* Local IP address for this if. */
- in6_addr_t ipif_v6src_addr; /* Source IP address for this if. */
in6_addr_t ipif_v6subnet; /* Subnet prefix for this if. */
in6_addr_t ipif_v6net_mask; /* Net mask for this interface. */
in6_addr_t ipif_v6brd_addr; /* Broadcast addr for this interface. */
@@ -1262,47 +901,29 @@ typedef struct ipif_s {
uint64_t ipif_flags; /* Interface flags. */
uint_t ipif_metric; /* BSD if metric, for compatibility. */
uint_t ipif_ire_type; /* IRE_LOCAL or IRE_LOOPBACK */
- mblk_t *ipif_arp_del_mp; /* Allocated at time arp comes up, to */
- /* prevent awkward out of mem */
- /* condition later */
- mblk_t *ipif_saved_ire_mp; /* Allocated for each extra */
- /* IRE_IF_NORESOLVER/IRE_IF_RESOLVER */
- /* on this interface so that they */
- /* can survive ifconfig down. */
- kmutex_t ipif_saved_ire_lock; /* Protects ipif_saved_ire_mp */
-
- mrec_t *ipif_igmp_rpt; /* List of group memberships which */
- /* will be reported on. Used when */
- /* handling an igmp timeout. */
/*
- * The packet counts in the ipif contain the sum of the
- * packet counts in dead IREs that were affiliated with
- * this ipif.
+ * The packet count in the ipif contain the sum of the
+ * packet counts in dead IRE_LOCAL/LOOPBACK for this ipif.
*/
- uint_t ipif_fo_pkt_count; /* Forwarded thru our dead IREs */
uint_t ipif_ib_pkt_count; /* Inbound packets for our dead IREs */
- uint_t ipif_ob_pkt_count; /* Outbound packets to our dead IREs */
+
/* Exclusive bit fields, protected by ipsq_t */
unsigned int
- ipif_multicast_up : 1, /* ipif_multicast_up() successful */
ipif_was_up : 1, /* ipif was up before */
ipif_addr_ready : 1, /* DAD is done */
ipif_was_dup : 1, /* DAD had failed */
-
- ipif_joined_allhosts : 1, /* allhosts joined */
ipif_added_nce : 1, /* nce added for local address */
- ipif_pad_to_31 : 26;
+
+ ipif_pad_to_31 : 28;
+
+ ilm_t *ipif_allhosts_ilm; /* For all-nodes join */
+ ilm_t *ipif_solmulti_ilm; /* For IPv6 solicited multicast join */
uint_t ipif_seqid; /* unique index across all ills */
uint_t ipif_state_flags; /* See IPIF_* flag defs above */
uint_t ipif_refcnt; /* active consistent reader cnt */
- /* Number of ire's and ilm's referencing this ipif */
- uint_t ipif_ire_cnt;
- uint_t ipif_ilm_cnt;
-
- uint_t ipif_saved_ire_cnt;
zoneid_t ipif_zoneid; /* zone ID number */
timeout_id_t ipif_recovery_id; /* Timer for DAD recovery */
boolean_t ipif_trace_disable; /* True when alloc fails */
@@ -1313,40 +934,12 @@ typedef struct ipif_s {
* part of a group will be pointed to, and an ill cannot disappear
* while it's in a group.
*/
- struct ill_s *ipif_bound_ill;
- struct ipif_s *ipif_bound_next; /* bound ipif chain */
- boolean_t ipif_bound; /* B_TRUE if we successfully bound */
-} ipif_t;
+ struct ill_s *ipif_bound_ill;
+ struct ipif_s *ipif_bound_next; /* bound ipif chain */
+ boolean_t ipif_bound; /* B_TRUE if we successfully bound */
-/*
- * IPIF_FREE_OK() means that there are no incoming references
- * to the ipif. Incoming refs would prevent the ipif from being freed.
- */
-#define IPIF_FREE_OK(ipif) \
- ((ipif)->ipif_ire_cnt == 0 && (ipif)->ipif_ilm_cnt == 0)
-/*
- * IPIF_DOWN_OK() determines whether the incoming pointer reference counts
- * would permit the ipif to be considered quiescent. In order for
- * an ipif or ill to be considered quiescent, the ire and nce references
- * to that ipif/ill must be zero.
- *
- * We do not require the ilm references to go to zero for quiescence
- * because the quiescence checks are done to ensure that
- * outgoing packets do not use addresses from the ipif/ill after it
- * has been marked down, and incoming packets to addresses on a
- * queiscent interface are rejected. This implies that all the
- * ire/nce's using that source address need to be deleted and future
- * creation of any ires using that source address must be prevented.
- * Similarly incoming unicast packets destined to the 'down' address
- * will not be accepted once that ire is gone. However incoming
- * multicast packets are not destined to the downed address.
- * They are only related to the ill in question. Furthermore
- * the current API behavior allows applications to join or leave
- * multicast groups, i.e., IP_ADD_MEMBERSHIP / LEAVE_MEMBERSHIP, using a
- * down address. Therefore the ilm references are not included in
- * the _DOWN_OK macros.
- */
-#define IPIF_DOWN_OK(ipif) ((ipif)->ipif_ire_cnt == 0)
+ struct ire_s *ipif_ire_local; /* Our IRE_LOCAL or LOOPBACK */
+} ipif_t;
/*
* The following table lists the protection levels of the various members
@@ -1371,9 +964,7 @@ typedef struct ipif_s {
* ill_g_lock ill_g_lock
* ipif_ill ipsq + down ipif write once
* ipif_id ipsq + down ipif write once
- * ipif_mtu ipsq
* ipif_v6lcl_addr ipsq + down ipif up ipif
- * ipif_v6src_addr ipsq + down ipif up ipif
* ipif_v6subnet ipsq + down ipif up ipif
* ipif_v6net_mask ipsq + down ipif up ipif
*
@@ -1383,28 +974,30 @@ typedef struct ipif_s {
* ipif_metric
* ipif_ire_type ipsq + down ill up ill
*
- * ipif_arp_del_mp ipsq ipsq
- * ipif_saved_ire_mp ipif_saved_ire_lock ipif_saved_ire_lock
- * ipif_igmp_rpt ipsq ipsq
- *
- * ipif_fo_pkt_count Approx
* ipif_ib_pkt_count Approx
- * ipif_ob_pkt_count Approx
*
* bit fields ill_lock ill_lock
*
+ * ipif_allhosts_ilm ipsq ipsq
+ * ipif_solmulti_ilm ipsq ipsq
+ *
* ipif_seqid ipsq Write once
*
* ipif_state_flags ill_lock ill_lock
* ipif_refcnt ill_lock ill_lock
- * ipif_ire_cnt ill_lock ill_lock
- * ipif_ilm_cnt ill_lock ill_lock
- * ipif_saved_ire_cnt
- *
* ipif_bound_ill ipsq + ipmp_lock ipsq OR ipmp_lock
* ipif_bound_next ipsq ipsq
* ipif_bound ipsq ipsq
+ *
+ * ipif_ire_local ipsq + ips_ill_g_lock ipsq OR ips_ill_g_lock
+ */
+
+/*
+ * Return values from ip_laddr_verify_{v4,v6}
*/
+typedef enum { IPVL_UNICAST_UP, IPVL_UNICAST_DOWN, IPVL_MCAST, IPVL_BCAST,
+ IPVL_BAD} ip_laddr_t;
+
#define IP_TR_HASH(tid) ((((uintptr_t)tid) >> 6) & (IP_TR_HASH_MAX - 1))
@@ -1422,18 +1015,12 @@ typedef struct ipif_s {
/* IPv4 compatibility macros */
#define ipif_lcl_addr V4_PART_OF_V6(ipif_v6lcl_addr)
-#define ipif_src_addr V4_PART_OF_V6(ipif_v6src_addr)
#define ipif_subnet V4_PART_OF_V6(ipif_v6subnet)
#define ipif_net_mask V4_PART_OF_V6(ipif_v6net_mask)
#define ipif_brd_addr V4_PART_OF_V6(ipif_v6brd_addr)
#define ipif_pp_dst_addr V4_PART_OF_V6(ipif_v6pp_dst_addr)
/* Macros for easy backreferences to the ill. */
-#define ipif_wq ipif_ill->ill_wq
-#define ipif_rq ipif_ill->ill_rq
-#define ipif_net_type ipif_ill->ill_net_type
-#define ipif_ipif_up_count ipif_ill->ill_ipif_up_count
-#define ipif_type ipif_ill->ill_type
#define ipif_isv6 ipif_ill->ill_isv6
#define SIOCLIFADDR_NDX 112 /* ndx of SIOCLIFADDR in the ndx ioctl table */
@@ -1524,7 +1111,7 @@ typedef struct ipxop_s {
boolean_t ipx_current_done; /* is the current operation done? */
int ipx_current_ioctl; /* current ioctl, or 0 if no ioctl */
ipif_t *ipx_current_ipif; /* ipif for current op */
- ipif_t *ipx_pending_ipif; /* ipif for ipsq_pending_mp */
+ ipif_t *ipx_pending_ipif; /* ipif for ipx_pending_mp */
mblk_t *ipx_pending_mp; /* current ioctl mp while waiting */
boolean_t ipx_forced; /* debugging aid */
#ifdef DEBUG
@@ -1642,24 +1229,62 @@ typedef struct irb {
krwlock_t irb_lock; /* Protect this bucket */
uint_t irb_refcnt; /* Protected by irb_lock */
uchar_t irb_marks; /* CONDEMNED ires in this bucket ? */
-#define IRB_MARK_CONDEMNED 0x0001
-#define IRB_MARK_FTABLE 0x0002
+#define IRB_MARK_CONDEMNED 0x0001 /* Contains some IRE_IS_CONDEMNED */
+#define IRB_MARK_DYNAMIC 0x0002 /* Dynamically allocated */
+ /* Once IPv6 uses radix then IRB_MARK_DYNAMIC will be always be set */
uint_t irb_ire_cnt; /* Num of active IRE in this bucket */
- uint_t irb_tmp_ire_cnt; /* Num of temporary IRE */
- struct ire_s *irb_rr_origin; /* origin for round-robin */
int irb_nire; /* Num of ftable ire's that ref irb */
ip_stack_t *irb_ipst; /* Does not have a netstack_hold */
} irb_t;
#define IRB2RT(irb) (rt_t *)((caddr_t)(irb) - offsetof(rt_t, rt_irb))
-/* The following are return values of ip_xmit_v4() */
-typedef enum {
- SEND_PASSED = 0, /* sent packet out on wire */
- SEND_FAILED, /* sending of packet failed */
- LOOKUP_IN_PROGRESS, /* ire cache found, ARP resolution in progress */
- LLHDR_RESLV_FAILED /* macaddr resl of onlink dst or nexthop failed */
-} ipxmit_state_t;
+/* Forward declarations */
+struct dce_s;
+typedef struct dce_s dce_t;
+struct ire_s;
+typedef struct ire_s ire_t;
+struct ncec_s;
+typedef struct ncec_s ncec_t;
+struct nce_s;
+typedef struct nce_s nce_t;
+struct ip_recv_attr_s;
+typedef struct ip_recv_attr_s ip_recv_attr_t;
+struct ip_xmit_attr_s;
+typedef struct ip_xmit_attr_s ip_xmit_attr_t;
+
+struct tsol_ire_gw_secattr_s;
+typedef struct tsol_ire_gw_secattr_s tsol_ire_gw_secattr_t;
+
+/*
+ * This is a structure for a one-element route cache that is passed
+ * by reference between ip_input and ill_inputfn.
+ */
+typedef struct {
+ ire_t *rtc_ire;
+ ipaddr_t rtc_ipaddr;
+ in6_addr_t rtc_ip6addr;
+} rtc_t;
+
+/*
+ * Note: Temporarily use 64 bits, and will probably go back to 32 bits after
+ * more cleanup work is done.
+ */
+typedef uint64_t iaflags_t;
+
+/* The ill input function pointer type */
+typedef void (*pfillinput_t)(mblk_t *, void *, void *, ip_recv_attr_t *,
+ rtc_t *);
+
+/* The ire receive function pointer type */
+typedef void (*pfirerecv_t)(ire_t *, mblk_t *, void *, ip_recv_attr_t *);
+
+/* The ire send and postfrag function pointer types */
+typedef int (*pfiresend_t)(ire_t *, mblk_t *, void *,
+ ip_xmit_attr_t *, uint32_t *);
+typedef int (*pfirepostfrag_t)(mblk_t *, nce_t *, iaflags_t, uint_t, uint32_t,
+ zoneid_t, zoneid_t, uintptr_t *);
+
#define IP_V4_G_HEAD 0
#define IP_V6_G_HEAD 1
@@ -1733,26 +1358,12 @@ typedef union ill_g_head_u {
/*
* Capabilities, possible flags for ill_capabilities.
*/
-
-#define ILL_CAPAB_AH 0x01 /* IPsec AH acceleration */
-#define ILL_CAPAB_ESP 0x02 /* IPsec ESP acceleration */
-#define ILL_CAPAB_MDT 0x04 /* Multidata Transmit */
+#define ILL_CAPAB_LSO 0x04 /* Large Send Offload */
#define ILL_CAPAB_HCKSUM 0x08 /* Hardware checksumming */
#define ILL_CAPAB_ZEROCOPY 0x10 /* Zero-copy */
#define ILL_CAPAB_DLD 0x20 /* DLD capabilities */
#define ILL_CAPAB_DLD_POLL 0x40 /* Polling */
#define ILL_CAPAB_DLD_DIRECT 0x80 /* Direct function call */
-#define ILL_CAPAB_DLD_LSO 0x100 /* Large Segment Offload */
-
-/*
- * Per-ill Multidata Transmit capabilities.
- */
-typedef struct ill_mdt_capab_s ill_mdt_capab_t;
-
-/*
- * Per-ill IPsec capabilities.
- */
-typedef struct ill_ipsec_capab_s ill_ipsec_capab_t;
/*
* Per-ill Hardware Checksumming capbilities.
@@ -1775,15 +1386,18 @@ typedef struct ill_dld_capab_s ill_dld_capab_t;
typedef struct ill_rx_ring ill_rx_ring_t;
/*
- * Per-ill Large Segment Offload capabilities.
+ * Per-ill Large Send Offload capabilities.
*/
typedef struct ill_lso_capab_s ill_lso_capab_t;
/* The following are ill_state_flags */
#define ILL_LL_SUBNET_PENDING 0x01 /* Waiting for DL_INFO_ACK from drv */
#define ILL_CONDEMNED 0x02 /* No more new ref's to the ILL */
-#define ILL_CHANGING 0x04 /* ILL not globally visible */
-#define ILL_DL_UNBIND_IN_PROGRESS 0x08 /* UNBIND_REQ is sent */
+#define ILL_DL_UNBIND_IN_PROGRESS 0x04 /* UNBIND_REQ is sent */
+#define ILL_DOWN_IN_PROGRESS 0x08 /* ILL is going down - no new nce's */
+#define ILL_LL_BIND_PENDING 0x0020 /* XXX Reuse ILL_LL_SUBNET_PENDING ? */
+#define ILL_LL_UP 0x0040
+#define ILL_LL_DOWN 0x0080
/* Is this an ILL whose source address is used by other ILL's ? */
#define IS_USESRC_ILL(ill) \
@@ -1796,10 +1410,9 @@ typedef struct ill_lso_capab_s ill_lso_capab_t;
((ill)->ill_usesrc_grp_next != NULL))
/* Is this an virtual network interface (vni) ILL ? */
-#define IS_VNI(ill) \
- (((ill) != NULL) && \
+#define IS_VNI(ill) \
(((ill)->ill_phyint->phyint_flags & (PHYI_LOOPBACK|PHYI_VIRTUAL)) == \
- PHYI_VIRTUAL))
+ PHYI_VIRTUAL)
/* Is this a loopback ILL? */
#define IS_LOOPBACK(ill) \
@@ -1900,18 +1513,41 @@ typedef struct ipmp_grp_s {
* ARP up-to-date as the active set of interfaces in the group changes.
*/
typedef struct ipmp_arpent_s {
- mblk_t *ia_area_mp; /* AR_ENTRY_ADD pointer */
ipaddr_t ia_ipaddr; /* IP address for this entry */
boolean_t ia_proxyarp; /* proxy ARP entry? */
boolean_t ia_notified; /* ARP notified about this entry? */
list_node_t ia_node; /* next ARP entry in list */
+ uint16_t ia_flags; /* nce_flags for the address */
+ size_t ia_lladdr_len;
+ uchar_t *ia_lladdr;
} ipmp_arpent_t;
+struct arl_s;
+
+/*
+ * Per-ill capabilities.
+ */
+struct ill_hcksum_capab_s {
+ uint_t ill_hcksum_version; /* interface version */
+ uint_t ill_hcksum_txflags; /* capabilities on transmit */
+};
+
+struct ill_zerocopy_capab_s {
+ uint_t ill_zerocopy_version; /* interface version */
+ uint_t ill_zerocopy_flags; /* capabilities */
+};
+
+struct ill_lso_capab_s {
+ uint_t ill_lso_flags; /* capabilities */
+ uint_t ill_lso_max; /* maximum size of payload */
+};
+
/*
* IP Lower level Structure.
* Instance data structure in ip_open when there is a device below us.
*/
typedef struct ill_s {
+ pfillinput_t ill_inputfn; /* Fast input function selector */
ill_if_t *ill_ifptr; /* pointer to interface type */
queue_t *ill_rq; /* Read queue. */
queue_t *ill_wq; /* Write queue. */
@@ -1922,6 +1558,8 @@ typedef struct ill_s {
uint_t ill_ipif_up_count; /* Number of IPIFs currently up. */
uint_t ill_max_frag; /* Max IDU from DLPI. */
+ uint_t ill_current_frag; /* Current IDU from DLPI. */
+ uint_t ill_mtu; /* User-specified MTU; SIOCSLIFMTU */
char *ill_name; /* Our name. */
uint_t ill_ipif_dup_count; /* Number of duplicate addresses. */
uint_t ill_name_length; /* Name length, incl. terminator. */
@@ -1941,8 +1579,9 @@ typedef struct ill_s {
uint8_t *ill_frag_ptr; /* Reassembly state. */
timeout_id_t ill_frag_timer_id; /* timeout id for the frag timer */
ipfb_t *ill_frag_hash_tbl; /* Fragment hash list head. */
- ipif_t *ill_pending_ipif; /* IPIF waiting for DL operation. */
+ krwlock_t ill_mcast_lock; /* Protects multicast state */
+ kmutex_t ill_mcast_serializer; /* Serialize across ilg and ilm state */
ilm_t *ill_ilm; /* Multicast membership for ill */
uint_t ill_global_timer; /* for IGMPv3/MLDv2 general queries */
int ill_mcast_type; /* type of router which is querier */
@@ -1955,22 +1594,20 @@ typedef struct ill_s {
uint8_t ill_mcast_rv; /* IGMPv3/MLDv2 robustness variable */
int ill_mcast_qi; /* IGMPv3/MLDv2 query interval var */
- mblk_t *ill_pending_mp; /* IOCTL/DLPI awaiting completion. */
/*
* All non-NULL cells between 'ill_first_mp_to_free' and
* 'ill_last_mp_to_free' are freed in ill_delete.
*/
#define ill_first_mp_to_free ill_bcast_mp
mblk_t *ill_bcast_mp; /* DLPI header for broadcasts. */
- mblk_t *ill_resolver_mp; /* Resolver template. */
mblk_t *ill_unbind_mp; /* unbind mp from ill_dl_up() */
mblk_t *ill_promiscoff_mp; /* for ill_leave_allmulti() */
mblk_t *ill_dlpi_deferred; /* b_next chain of control messages */
- mblk_t *ill_ardeact_mp; /* deact mp from ipmp_ill_activate() */
mblk_t *ill_dest_addr_mp; /* mblk which holds ill_dest_addr */
mblk_t *ill_replumb_mp; /* replumb mp from ill_replumb() */
mblk_t *ill_phys_addr_mp; /* mblk which holds ill_phys_addr */
-#define ill_last_mp_to_free ill_phys_addr_mp
+ mblk_t *ill_mcast_deferred; /* b_next chain of IGMP/MLD packets */
+#define ill_last_mp_to_free ill_mcast_deferred
cred_t *ill_credp; /* opener's credentials */
uint8_t *ill_phys_addr; /* ill_phys_addr_mp->b_rptr + off */
@@ -1986,37 +1623,33 @@ typedef struct ill_s {
ill_dlpi_style_set : 1,
ill_ifname_pending : 1,
- ill_join_allmulti : 1,
ill_logical_down : 1,
ill_dl_up : 1,
-
ill_up_ipifs : 1,
+
ill_note_link : 1, /* supports link-up notification */
ill_capab_reneg : 1, /* capability renegotiation to be done */
ill_dld_capab_inprog : 1, /* direct dld capab call in prog */
-
ill_need_recover_multicast : 1,
- ill_pad_to_bit_31 : 19;
+
+ ill_replumbing : 1,
+ ill_arl_dlpi_pending : 1,
+
+ ill_pad_to_bit_31 : 18;
/* Following bit fields protected by ill_lock */
uint_t
ill_fragtimer_executing : 1,
ill_fragtimer_needrestart : 1,
- ill_ilm_cleanup_reqd : 1,
- ill_arp_closing : 1,
-
- ill_arp_bringup_pending : 1,
- ill_arp_extend : 1, /* ARP has DAD extensions */
ill_manual_token : 1, /* system won't override ill_token */
ill_manual_linklocal : 1, /* system won't auto-conf linklocal */
- ill_pad_bit_31 : 24;
+ ill_pad_bit_31 : 28;
/*
* Used in SIOCSIFMUXID and SIOCGIFMUXID for 'ifconfig unplumb'.
*/
- int ill_arp_muxid; /* muxid returned from plink for arp */
- int ill_ip_muxid; /* muxid returned from plink for ip */
+ int ill_muxid; /* muxid returned from plink */
/* Used for IP frag reassembly throttling on a per ILL basis. */
uint_t ill_ipf_gen; /* Generation of next fragment queue */
@@ -2033,20 +1666,13 @@ typedef struct ill_s {
uint_t ill_dlpi_capab_state; /* State of capability query, IDCS_* */
uint_t ill_capab_pending_cnt;
uint64_t ill_capabilities; /* Enabled capabilities, ILL_CAPAB_* */
- ill_mdt_capab_t *ill_mdt_capab; /* Multidata Transmit capabilities */
- ill_ipsec_capab_t *ill_ipsec_capab_ah; /* IPsec AH capabilities */
- ill_ipsec_capab_t *ill_ipsec_capab_esp; /* IPsec ESP capabilities */
ill_hcksum_capab_t *ill_hcksum_capab; /* H/W cksumming capabilities */
ill_zerocopy_capab_t *ill_zerocopy_capab; /* Zero-copy capabilities */
ill_dld_capab_t *ill_dld_capab; /* DLD capabilities */
ill_lso_capab_t *ill_lso_capab; /* Large Segment Offload capabilities */
mblk_t *ill_capab_reset_mp; /* Preallocated mblk for capab reset */
- /*
- * Fields for IPv6
- */
uint8_t ill_max_hops; /* Maximum hops for any logical interface */
- uint_t ill_max_mtu; /* Maximum MTU for any logical interface */
uint_t ill_user_mtu; /* User-specified MTU via SIOCSLIFLNKINFO */
uint32_t ill_reachable_time; /* Value for ND algorithm in msec */
uint32_t ill_reachable_retrans_time; /* Value for ND algorithm msec */
@@ -2057,20 +1683,6 @@ typedef struct ill_s {
uint32_t ill_xmit_count; /* ndp max multicast xmits */
mib2_ipIfStatsEntry_t *ill_ip_mib; /* ver indep. interface mib */
mib2_ipv6IfIcmpEntry_t *ill_icmp6_mib; /* Per interface mib */
- /*
- * Following two mblks are allocated common to all
- * the ipifs when the first interface is coming up.
- * It is sent up to arp when the last ipif is coming
- * down.
- */
- mblk_t *ill_arp_down_mp;
- mblk_t *ill_arp_del_mapping_mp;
- /*
- * Used for implementing IFF_NOARP. As IFF_NOARP is used
- * to turn off for all the logicals, it is here instead
- * of the ipif.
- */
- mblk_t *ill_arp_on_mp;
phyint_t *ill_phyint;
uint64_t ill_flags;
@@ -2094,11 +1706,11 @@ typedef struct ill_s {
*/
uint_t ill_ifname_pending_err;
avl_node_t ill_avl_byppa; /* avl node based on ppa */
- void *ill_fastpath_list; /* both ire and nce hang off this */
+ list_t ill_nce; /* pointer to nce_s list */
uint_t ill_refcnt; /* active refcnt by threads */
uint_t ill_ire_cnt; /* ires associated with this ill */
kcondvar_t ill_cv;
- uint_t ill_ilm_walker_cnt; /* snmp ilm walkers */
+ uint_t ill_ncec_cnt; /* ncecs associated with this ill */
uint_t ill_nce_cnt; /* nces associated with this ill */
uint_t ill_waiters; /* threads waiting in ipsq_enter */
/*
@@ -2119,6 +1731,17 @@ typedef struct ill_s {
void *ill_flownotify_mh; /* Tx flow ctl, mac cb handle */
uint_t ill_ilm_cnt; /* ilms referencing this ill */
uint_t ill_ipallmulti_cnt; /* ip_join_allmulti() calls */
+ ilm_t *ill_ipallmulti_ilm;
+
+ mblk_t *ill_saved_ire_mp; /* Allocated for each extra IRE */
+ /* with ire_ill set so they can */
+ /* survive the ill going down and up. */
+ kmutex_t ill_saved_ire_lock; /* Protects ill_saved_ire_mp, cnt */
+ uint_t ill_saved_ire_cnt; /* # entries */
+ struct arl_ill_common_s *ill_common;
+ ire_t *ill_ire_multicast; /* IRE_MULTICAST for ill */
+ clock_t ill_defend_start; /* start of 1 hour period */
+ uint_t ill_defend_count; /* # of announce/defends per ill */
/*
* IPMP fields.
*/
@@ -2131,6 +1754,8 @@ typedef struct ill_s {
uint_t ill_bound_cnt; /* # of data addresses bound to ill */
ipif_t *ill_bound_ipif; /* ipif chain bound to ill */
timeout_id_t ill_refresh_tid; /* ill refresh retry timeout id */
+
+ uint32_t ill_mrouter_cnt; /* mrouter allmulti joins */
} ill_t;
/*
@@ -2139,15 +1764,17 @@ typedef struct ill_s {
*/
#define ILL_FREE_OK(ill) \
((ill)->ill_ire_cnt == 0 && (ill)->ill_ilm_cnt == 0 && \
- (ill)->ill_nce_cnt == 0)
+ (ill)->ill_ncec_cnt == 0 && (ill)->ill_nce_cnt == 0)
/*
- * An ipif/ill can be marked down only when the ire and nce references
+ * An ipif/ill can be marked down only when the ire and ncec references
* to that ipif/ill goes to zero. ILL_DOWN_OK() is a necessary condition
* quiescence checks. See comments above IPIF_DOWN_OK for details
* on why ires and nces are selectively considered for this macro.
*/
-#define ILL_DOWN_OK(ill) (ill->ill_ire_cnt == 0 && ill->ill_nce_cnt == 0)
+#define ILL_DOWN_OK(ill) \
+ (ill->ill_ire_cnt == 0 && ill->ill_ncec_cnt == 0 && \
+ ill->ill_nce_cnt == 0)
/*
* The following table lists the protection levels of the various members
@@ -2162,7 +1789,8 @@ typedef struct ill_s {
* ill_error ipsq None
* ill_ipif ill_g_lock + ipsq ill_g_lock OR ipsq
* ill_ipif_up_count ill_lock + ipsq ill_lock OR ipsq
- * ill_max_frag ipsq Write once
+ * ill_max_frag ill_lock ill_lock
+ * ill_current_frag ill_lock ill_lock
*
* ill_name ill_g_lock + ipsq Write once
* ill_name_length ill_g_lock + ipsq Write once
@@ -2179,23 +1807,22 @@ typedef struct ill_s {
*
* ill_frag_timer_id ill_lock ill_lock
* ill_frag_hash_tbl ipsq up ill
- * ill_ilm ipsq + ill_lock ill_lock
- * ill_mcast_type ill_lock ill_lock
- * ill_mcast_v1_time ill_lock ill_lock
- * ill_mcast_v2_time ill_lock ill_lock
- * ill_mcast_v1_tset ill_lock ill_lock
- * ill_mcast_v2_tset ill_lock ill_lock
- * ill_mcast_rv ill_lock ill_lock
- * ill_mcast_qi ill_lock ill_lock
- * ill_pending_mp ill_lock ill_lock
- *
- * ill_bcast_mp ipsq ipsq
- * ill_resolver_mp ipsq only when ill is up
+ * ill_ilm ill_mcast_lock(WRITER) ill_mcast_lock(READER)
+ * ill_global_timer ill_mcast_lock(WRITER) ill_mcast_lock(READER)
+ * ill_mcast_type ill_mcast_lock(WRITER) ill_mcast_lock(READER)
+ * ill_mcast_v1_time ill_mcast_lock(WRITER) ill_mcast_lock(READER)
+ * ill_mcast_v2_time ill_mcast_lock(WRITER) ill_mcast_lock(READER)
+ * ill_mcast_v1_tset ill_mcast_lock(WRITER) ill_mcast_lock(READER)
+ * ill_mcast_v2_tset ill_mcast_lock(WRITER) ill_mcast_lock(READER)
+ * ill_mcast_rv ill_mcast_lock(WRITER) ill_mcast_lock(READER)
+ * ill_mcast_qi ill_mcast_lock(WRITER) ill_mcast_lock(READER)
+ *
* ill_down_mp ipsq ipsq
* ill_dlpi_deferred ill_lock ill_lock
* ill_dlpi_pending ipsq + ill_lock ipsq or ill_lock or
* absence of ipsq writer.
* ill_phys_addr_mp ipsq + down ill only when ill is up
+ * ill_mcast_deferred ill_lock ill_lock
* ill_phys_addr ipsq + down ill only when ill is up
* ill_dest_addr_mp ipsq + down ill only when ill is up
* ill_dest_addr ipsq + down ill only when ill is up
@@ -2204,8 +1831,7 @@ typedef struct ill_s {
* exclusive bit flags ipsq_t ipsq_t
* shared bit flags ill_lock ill_lock
*
- * ill_arp_muxid ipsq Not atomic
- * ill_ip_muxid ipsq Not atomic
+ * ill_muxid ipsq Not atomic
*
* ill_ipf_gen Not atomic
* ill_frag_count atomics atomics
@@ -2215,7 +1841,7 @@ typedef struct ill_s {
* ill_dlpi_capab_state ipsq ipsq
* ill_max_hops ipsq Not atomic
*
- * ill_max_mtu
+ * ill_mtu ill_lock None
*
* ill_user_mtu ipsq + ill_lock ill_lock
* ill_reachable_time ipsq + ill_lock ill_lock
@@ -2230,9 +1856,6 @@ typedef struct ill_s {
* ill_xmit_count ipsq + down ill write once
* ill_ip6_mib ipsq + down ill only when ill is up
* ill_icmp6_mib ipsq + down ill only when ill is up
- * ill_arp_down_mp ipsq ipsq
- * ill_arp_del_mapping_mp ipsq ipsq
- * ill_arp_on_mp ipsq ipsq
*
* ill_phyint ipsq, ill_g_lock, ill_lock Any of them
* ill_flags ill_lock ill_lock
@@ -2247,7 +1870,7 @@ typedef struct ill_s {
* ill_refcnt ill_lock ill_lock
* ill_ire_cnt ill_lock ill_lock
* ill_cv ill_lock ill_lock
- * ill_ilm_walker_cnt ill_lock ill_lock
+ * ill_ncec_cnt ill_lock ill_lock
* ill_nce_cnt ill_lock ill_lock
* ill_ilm_cnt ill_lock ill_lock
* ill_src_ipif ill_g_lock ill_g_lock
@@ -2256,8 +1879,12 @@ typedef struct ill_s {
* ill_dhcpinit atomics atomics
* ill_flownotify_mh write once write once
* ill_capab_pending_cnt ipsq ipsq
- *
- * ill_bound_cnt ipsq ipsq
+ * ill_ipallmulti_cnt ill_lock ill_lock
+ * ill_ipallmulti_ilm ill_lock ill_lock
+ * ill_saved_ire_mp ill_saved_ire_lock ill_saved_ire_lock
+ * ill_saved_ire_cnt ill_saved_ire_lock ill_saved_ire_lock
+ * ill_arl ??? ???
+ * ill_ire_multicast ipsq + quiescent none
* ill_bound_ipif ipsq ipsq
* ill_actnode ipsq + ipmp_lock ipsq OR ipmp_lock
* ill_grpnode ipsq + ill_g_lock ipsq OR ill_g_lock
@@ -2267,6 +1894,7 @@ typedef struct ill_s {
* ill_refresh_tid ill_lock ill_lock
* ill_grp (for IPMP ill) write once write once
* ill_grp (for underlying ill) ipsq + ill_g_lock ipsq OR ill_g_lock
+ * ill_mrouter_cnt atomics atomics
*
* NOTE: It's OK to make heuristic decisions on an underlying interface
* by using IS_UNDER_IPMP() or comparing ill_grp's raw pointer value.
@@ -2311,7 +1939,6 @@ enum { IF_CMD = 1, LIF_CMD, ARP_CMD, XARP_CMD, MSFILT_CMD, MISC_CMD };
#define IPI_GET_CMD 0x8 /* branch to mi_copyout on success */
/* unused 0x10 */
#define IPI_NULL_BCONT 0x20 /* ioctl has not data and hence no b_cont */
-#define IPI_PASS_DOWN 0x40 /* pass this ioctl down when a module only */
extern ip_ioctl_cmd_t ip_ndx_ioctl_table[];
extern ip_ioctl_cmd_t ip_misc_ioctl_table[];
@@ -2362,6 +1989,430 @@ typedef struct ipndp_s {
char *ip_ndp_name;
} ipndp_t;
+/* IXA Notification types */
+typedef enum {
+ IXAN_LSO, /* LSO capability change */
+ IXAN_PMTU, /* PMTU change */
+ IXAN_ZCOPY /* ZEROCOPY capability change */
+} ixa_notify_type_t;
+
+typedef uint_t ixa_notify_arg_t;
+
+typedef void (*ixa_notify_t)(void *, ip_xmit_attr_t *ixa, ixa_notify_type_t,
+ ixa_notify_arg_t);
+
+/*
+ * Attribute flags that are common to the transmit and receive attributes
+ */
+#define IAF_IS_IPV4 0x80000000 /* ipsec_*_v4 */
+#define IAF_TRUSTED_ICMP 0x40000000 /* ipsec_*_icmp_loopback */
+#define IAF_NO_LOOP_ZONEID_SET 0x20000000 /* Zone that shouldn't have */
+ /* a copy */
+#define IAF_LOOPBACK_COPY 0x10000000 /* For multi and broadcast */
+
+#define IAF_MASK 0xf0000000 /* Flags that are common */
+
+/*
+ * Transmit side attributes used between the transport protocols and IP as
+ * well as inside IP. It is also used to cache information in the conn_t i.e.
+ * replaces conn_ire and the IPsec caching in the conn_t.
+ */
+struct ip_xmit_attr_s {
+ iaflags_t ixa_flags; /* IXAF_*. See below */
+
+ uint32_t ixa_free_flags; /* IXA_FREE_*. See below */
+ uint32_t ixa_refcnt; /* Using atomics */
+
+ /*
+ * Always initialized independently of ixa_flags settings.
+ * Used by ip_xmit so we keep them up front for cache locality.
+ */
+ uint32_t ixa_xmit_hint; /* For ECMP and GLD TX ring fanout */
+ uint_t ixa_pktlen; /* Always set. For frag and stats */
+ zoneid_t ixa_zoneid; /* Assumed always set */
+
+ /* Always set for conn_ip_output(); might be stale */
+ /*
+ * Since TCP keeps the conn_t around past the process going away
+ * we need to use the "notr" (e.g, ire_refhold_notr) for ixa_ire,
+ * ixa_nce, and ixa_dce.
+ */
+ ire_t *ixa_ire; /* Forwarding table entry */
+ uint_t ixa_ire_generation;
+ nce_t *ixa_nce; /* Neighbor cache entry */
+ dce_t *ixa_dce; /* Destination cache entry */
+ uint_t ixa_dce_generation;
+ uint_t ixa_src_generation; /* If IXAF_VERIFY_SOURCE */
+
+ uint32_t ixa_src_preferences; /* prefs for src addr select */
+ uint32_t ixa_pmtu; /* IXAF_VERIFY_PMTU */
+
+ /* Set by ULP if IXAF_VERIFY_PMTU; otherwise set by IP */
+ uint32_t ixa_fragsize;
+
+ int8_t ixa_use_min_mtu; /* IXAF_USE_MIN_MTU values */
+
+ pfirepostfrag_t ixa_postfragfn; /* Set internally in IP */
+
+ in6_addr_t ixa_nexthop_v6; /* IXAF_NEXTHOP_SET */
+#define ixa_nexthop_v4 V4_PART_OF_V6(ixa_nexthop_v6)
+
+ zoneid_t ixa_no_loop_zoneid; /* IXAF_NO_LOOP_ZONEID_SET */
+
+ uint_t ixa_scopeid; /* For IPv6 link-locals */
+
+	uint_t		ixa_broadcast_ttl;	/* IXAF_BROADCAST_TTL_SET */
+
+ uint_t ixa_multicast_ttl; /* Assumed set for multicast */
+ uint_t ixa_multicast_ifindex; /* Assumed set for multicast */
+ ipaddr_t ixa_multicast_ifaddr; /* Assumed set for multicast */
+
+ int ixa_raw_cksum_offset; /* If IXAF_SET_RAW_CKSUM */
+
+ uint32_t ixa_ident; /* For IPv6 fragment header */
+
+ /*
+ * Cached LSO information.
+ */
+ ill_lso_capab_t ixa_lso_capab; /* Valid when IXAF_LSO_CAPAB */
+
+ uint64_t ixa_ipsec_policy_gen; /* Generation from iph_gen */
+ /*
+ * The following IPsec fields are only initialized when
+ * IXAF_IPSEC_SECURE is set. Otherwise they contain garbage.
+ */
+ ipsec_latch_t *ixa_ipsec_latch; /* Just the ids */
+ struct ipsa_s *ixa_ipsec_ah_sa; /* Hard reference SA for AH */
+ struct ipsa_s *ixa_ipsec_esp_sa; /* Hard reference SA for ESP */
+ struct ipsec_policy_s *ixa_ipsec_policy; /* why are we here? */
+ struct ipsec_action_s *ixa_ipsec_action; /* For reflected packets */
+ ipsa_ref_t ixa_ipsec_ref[2]; /* Soft reference to SA */
+ /* 0: ESP, 1: AH */
+
+ /*
+ * The selectors here are potentially different than the SPD rule's
+ * selectors, and we need to have both available for IKEv2.
+ *
+ * NOTE: "Source" and "Dest" are w.r.t. outbound datagrams. Ports can
+ * be zero, and the protocol number is needed to make the ports
+ * significant.
+ */
+ uint16_t ixa_ipsec_src_port; /* Source port number of d-gram. */
+ uint16_t ixa_ipsec_dst_port; /* Destination port number of d-gram. */
+ uint8_t ixa_ipsec_icmp_type; /* ICMP type of d-gram */
+ uint8_t ixa_ipsec_icmp_code; /* ICMP code of d-gram */
+
+ sa_family_t ixa_ipsec_inaf; /* Inner address family */
+#define IXA_MAX_ADDRLEN 4 /* Max addr len. (in 32-bit words) */
+ uint32_t ixa_ipsec_insrc[IXA_MAX_ADDRLEN]; /* Inner src address */
+ uint32_t ixa_ipsec_indst[IXA_MAX_ADDRLEN]; /* Inner dest address */
+ uint8_t ixa_ipsec_insrcpfx; /* Inner source prefix */
+ uint8_t ixa_ipsec_indstpfx; /* Inner destination prefix */
+
+ uint8_t ixa_ipsec_proto; /* IP protocol number for d-gram. */
+
+ /* Always initialized independently of ixa_flags settings */
+ uint_t ixa_ifindex; /* Assumed always set */
+ uint16_t ixa_ip_hdr_length; /* Points to ULP header */
+ uint8_t ixa_protocol; /* Protocol number for ULP cksum */
+ ts_label_t *ixa_tsl; /* Always set. NULL if not TX */
+ ip_stack_t *ixa_ipst; /* Always set */
+ uint32_t ixa_extra_ident; /* Set if LSO */
+ cred_t *ixa_cred; /* For getpeerucred */
+ pid_t ixa_cpid; /* For getpeerucred */
+
+#ifdef DEBUG
+ kthread_t *ixa_curthread; /* For serialization assert */
+#endif
+ squeue_t *ixa_sqp; /* Set from conn_sqp as a hint */
+ uintptr_t ixa_cookie; /* cookie to use for tx flow control */
+
+ /*
+ * Must be set by ULP if any of IXAF_VERIFY_LSO, IXAF_VERIFY_PMTU,
+ * or IXAF_VERIFY_ZCOPY is set.
+ */
+ ixa_notify_t ixa_notify; /* Registered upcall notify function */
+ void *ixa_notify_cookie; /* ULP cookie for ixa_notify */
+};
+
+/*
+ * Flags to indicate which transmit attributes are set.
+ * Split into "xxx_SET" ones which indicate that the "xxx" field is set, and
+ * single flags.
+ */
+#define IXAF_REACH_CONF 0x00000001 /* Reachability confirmation */
+#define IXAF_BROADCAST_TTL_SET 0x00000002 /* ixa_broadcast_ttl valid */
+#define IXAF_SET_SOURCE 0x00000004 /* Replace if broadcast */
+#define IXAF_USE_MIN_MTU 0x00000008 /* IPV6_USE_MIN_MTU */
+
+#define IXAF_DONTFRAG 0x00000010 /* IP*_DONTFRAG */
+#define IXAF_VERIFY_PMTU 0x00000020 /* ixa_pmtu/ixa_fragsize set */
+#define IXAF_PMTU_DISCOVERY 0x00000040 /* Create/use PMTU state */
+#define IXAF_MULTICAST_LOOP 0x00000080 /* IP_MULTICAST_LOOP */
+
+#define IXAF_IPSEC_SECURE 0x00000100 /* Need IPsec processing */
+#define IXAF_UCRED_TSL 0x00000200 /* ixa_tsl from SCM_UCRED */
+#define IXAF_DONTROUTE 0x00000400 /* SO_DONTROUTE */
+#define IXAF_NO_IPSEC 0x00000800 /* Ignore policy */
+
+#define IXAF_PMTU_TOO_SMALL 0x00001000 /* PMTU too small */
+#define IXAF_SET_ULP_CKSUM 0x00002000 /* Calculate ULP checksum */
+#define IXAF_VERIFY_SOURCE 0x00004000 /* Check that source is ok */
+#define IXAF_NEXTHOP_SET 0x00008000 /* ixa_nexthop set */
+
+#define IXAF_PMTU_IPV4_DF 0x00010000 /* Set IPv4 DF */
+#define IXAF_NO_DEV_FLOW_CTL 0x00020000 /* Protocol needs no flow ctl */
+#define IXAF_NO_TTL_CHANGE 0x00040000 /* Internal to IP */
+#define IXAF_IPV6_ADD_FRAGHDR 0x00080000 /* Add fragment header */
+
+#define IXAF_IPSEC_TUNNEL 0x00100000 /* Tunnel mode */
+#define IXAF_NO_PFHOOK 0x00200000 /* Skip xmit pfhook */
+#define IXAF_NO_TRACE 0x00400000 /* When back from ARP/ND */
+#define IXAF_SCOPEID_SET 0x00800000 /* ixa_scopeid set */
+
+#define IXAF_MULTIRT_MULTICAST 0x01000000 /* MULTIRT for multicast */
+#define IXAF_NO_HW_CKSUM 0x02000000 /* Force software cksum */
+#define IXAF_SET_RAW_CKSUM 0x04000000 /* Use ixa_raw_cksum_offset */
+#define IXAF_IPSEC_GLOBAL_POLICY 0x08000000 /* Policy came from global */
+
+/* Note the following uses bits 0x10000000 through 0x80000000 */
+#define IXAF_IS_IPV4 IAF_IS_IPV4
+#define IXAF_TRUSTED_ICMP IAF_TRUSTED_ICMP
+#define IXAF_NO_LOOP_ZONEID_SET IAF_NO_LOOP_ZONEID_SET
+#define IXAF_LOOPBACK_COPY IAF_LOOPBACK_COPY
+
+/* Note: use the upper 32 bits */
+#define IXAF_VERIFY_LSO 0x100000000 /* Check LSO capability */
+#define IXAF_LSO_CAPAB 0x200000000 /* Capable of LSO */
+#define IXAF_VERIFY_ZCOPY 0x400000000 /* Check Zero Copy capability */
+#define IXAF_ZCOPY_CAPAB 0x800000000 /* Capable of ZEROCOPY */
+
+/*
+ * The normal flags for sending packets e.g., icmp errors
+ */
+#define IXAF_BASIC_SIMPLE_V4 (IXAF_SET_ULP_CKSUM | IXAF_IS_IPV4)
+#define IXAF_BASIC_SIMPLE_V6 (IXAF_SET_ULP_CKSUM)
+
+/*
+ * Normally these fields do not have a hold. But in some cases they do, for
+ * instance when we've gone through ip_*_attr_to/from_mblk.
+ * We use ixa_free_flags to indicate that they have a hold and need to be
+ * released on cleanup.
+ */
+#define IXA_FREE_CRED 0x00000001 /* ixa_cred needs to be rele */
+#define IXA_FREE_TSL 0x00000002 /* ixa_tsl needs to be rele */
+
+/*
+ * Simplistic way to set the ixa_xmit_hint for locally generated traffic
+ * and forwarded traffic. The shift amount are based on the size of the
+ * structs to discard the low order bits which don't have much if any variation
+ * (coloring in kmem_cache_alloc might provide some variation).
+ *
+ * Basing the locally generated hint on the address of the conn_t means that
+ * the packets from the same socket/connection do not get reordered.
+ * Basing the hint for forwarded traffic on the ill_ring_t means that
+ * packets from the same NIC+ring are likely to use the same outbound ring
+ * hence we get low contention on the ring in the transmitting driver.
+ */
+#define CONN_TO_XMIT_HINT(connp) ((uint32_t)(((uintptr_t)connp) >> 11))
+#define ILL_RING_TO_XMIT_HINT(ring) ((uint32_t)(((uintptr_t)ring) >> 7))
+
+/*
+ * IP set Destination Flags used by function ip_set_destination,
+ * ip_attr_connect, and conn_connect.
+ */
+#define IPDF_ALLOW_MCBC 0x1 /* Allow multi/broadcast */
+#define IPDF_VERIFY_DST 0x2 /* Verify destination addr */
+#define IPDF_SELECT_SRC 0x4 /* Select source address */
+#define IPDF_LSO 0x8 /* Try LSO */
+#define IPDF_IPSEC 0x10 /* Set IPsec policy */
+#define IPDF_ZONE_IS_GLOBAL 0x20 /* From conn_zone_is_global */
+#define IPDF_ZCOPY 0x40 /* Try ZEROCOPY */
+#define IPDF_UNIQUE_DCE 0x80 /* Get a per-destination DCE */
+
+/*
+ * Receive side attributes used between the transport protocols and IP as
+ * well as inside IP.
+ */
+struct ip_recv_attr_s {
+ iaflags_t ira_flags; /* See below */
+
+ uint32_t ira_free_flags; /* IRA_FREE_*. See below */
+
+ /*
+ * This is a hint for TCP SYN packets.
+ * Always initialized independently of ira_flags settings
+ */
+ squeue_t *ira_sqp;
+ ill_rx_ring_t *ira_ring; /* Internal to IP */
+
+ /* For ip_accept_tcp when IRAF_TARGET_SQP is set */
+ squeue_t *ira_target_sqp;
+ mblk_t *ira_target_sqp_mp;
+
+ /* Always initialized independently of ira_flags settings */
+ uint32_t ira_xmit_hint; /* For ECMP and GLD TX ring fanout */
+ zoneid_t ira_zoneid; /* ALL_ZONES unless local delivery */
+ uint_t ira_pktlen; /* Always set. For frag and stats */
+ uint16_t ira_ip_hdr_length; /* Points to ULP header */
+ uint8_t ira_protocol; /* Protocol number for ULP cksum */
+ uint_t ira_rifindex; /* Received ifindex */
+ uint_t ira_ruifindex; /* Received upper ifindex */
+ ts_label_t *ira_tsl; /* Always set. NULL if not TX */
+ /*
+ * ira_rill and ira_ill is set inside IP, but not when conn_recv is
+ * called; ULPs should use ira_ruifindex instead.
+ */
+ ill_t *ira_rill; /* ill where packet came */
+ ill_t *ira_ill; /* ill where IP address hosted */
+ cred_t *ira_cred; /* For getpeerucred */
+ pid_t ira_cpid; /* For getpeerucred */
+
+ /* Used when IRAF_VERIFIED_SRC is set; this source was ok */
+ ipaddr_t ira_verified_src;
+
+ /*
+ * The following IPsec fields are only initialized when
+ * IRAF_IPSEC_SECURE is set. Otherwise they contain garbage.
+ */
+ struct ipsec_action_s *ira_ipsec_action; /* how we made it in.. */
+ struct ipsa_s *ira_ipsec_ah_sa; /* SA for AH */
+ struct ipsa_s *ira_ipsec_esp_sa; /* SA for ESP */
+
+ ipaddr_t ira_mroute_tunnel; /* IRAF_MROUTE_TUNNEL_SET */
+
+ zoneid_t ira_no_loop_zoneid; /* IRAF_NO_LOOP_ZONEID_SET */
+
+ uint32_t ira_esp_udp_ports; /* IRAF_ESP_UDP_PORTS */
+
+ /*
+ * For IP_RECVSLLA and ip_ndp_conflict/find_solicitation.
+ * Same size as max for sockaddr_dl
+ */
+#define IRA_L2SRC_SIZE 244
+ uint8_t ira_l2src[IRA_L2SRC_SIZE]; /* If IRAF_L2SRC_SET */
+
+ /*
+ * Local handle that we use to do lazy setting of ira_l2src.
+ * We defer setting l2src until needed but we do so before any
+ * ip_input pullupmsg or copymsg.
+ */
+ struct mac_header_info_s *ira_mhip; /* Could be NULL */
+};
+
+/*
+ * Flags to indicate which receive attributes are set.
+ */
+#define IRAF_SYSTEM_LABELED 0x00000001 /* is_system_labeled() */
+#define IRAF_IPV4_OPTIONS 0x00000002 /* Performance */
+#define IRAF_MULTICAST 0x00000004 /* Was multicast at L3 */
+#define IRAF_BROADCAST 0x00000008 /* Was broadcast at L3 */
+#define IRAF_MULTIBROADCAST (IRAF_MULTICAST|IRAF_BROADCAST)
+
+#define IRAF_LOOPBACK 0x00000010 /* Looped back by IP */
+#define IRAF_VERIFY_IP_CKSUM 0x00000020 /* Need to verify IP */
+#define IRAF_VERIFY_ULP_CKSUM 0x00000040 /* Need to verify TCP,UDP,etc */
+#define IRAF_SCTP_CSUM_ERR 0x00000080 /* sctp pkt has failed chksum */
+
+#define IRAF_IPSEC_SECURE 0x00000100 /* Passed AH and/or ESP */
+#define IRAF_DHCP_UNICAST 0x00000200
+#define IRAF_IPSEC_DECAPS 0x00000400 /* Was packet decapsulated */
+ /* from a matching inner packet? */
+#define IRAF_TARGET_SQP 0x00000800 /* ira_target_sqp is set */
+#define IRAF_VERIFIED_SRC 0x00001000 /* ira_verified_src set */
+#define IRAF_RSVP 0x00002000 /* RSVP packet for rsvpd */
+#define IRAF_MROUTE_TUNNEL_SET 0x00004000 /* From ip_mroute_decap */
+#define IRAF_PIM_REGISTER 0x00008000 /* From register_mforward */
+
+#define IRAF_TX_MAC_EXEMPTABLE 0x00010000 /* Allow MAC_EXEMPT readdown */
+#define IRAF_TX_SHARED_ADDR 0x00020000 /* Arrived on ALL_ZONES addr */
+#define IRAF_ESP_UDP_PORTS 0x00040000 /* NAT-traversal packet */
+#define IRAF_NO_HW_CKSUM 0x00080000 /* Force software cksum */
+
+#define IRAF_ICMP_ERROR 0x00100000 /* Send to conn_recvicmp */
+#define IRAF_ROUTER_ALERT 0x00200000 /* IPv6 router alert */
+#define IRAF_L2SRC_SET 0x00400000 /* ira_l2src has been set */
+#define IRAF_L2SRC_LOOPBACK 0x00800000 /* Came from us */
+
+#define IRAF_L2DST_MULTICAST 0x01000000 /* Multicast at L2 */
+#define IRAF_L2DST_BROADCAST 0x02000000 /* Broadcast at L2 */
+/* Unused 0x04000000 */
+/* Unused 0x08000000 */
+
+/* Below starts with 0x10000000 */
+#define IRAF_IS_IPV4 IAF_IS_IPV4
+#define IRAF_TRUSTED_ICMP IAF_TRUSTED_ICMP
+#define IRAF_NO_LOOP_ZONEID_SET IAF_NO_LOOP_ZONEID_SET
+#define IRAF_LOOPBACK_COPY IAF_LOOPBACK_COPY
+
+/*
+ * Normally these fields do not have a hold. But in some cases they do, for
+ * instance when we've gone through ip_*_attr_to/from_mblk.
+ * We use ira_free_flags to indicate that they have a hold and need to be
+ * released on cleanup.
+ */
+#define IRA_FREE_CRED 0x00000001 /* ira_cred needs to be rele */
+#define IRA_FREE_TSL 0x00000002 /* ira_tsl needs to be rele */
+
+/*
+ * Optional destination cache entry for path MTU information,
+ * and ULP metrics.
+ */
+struct dce_s {
+ uint_t dce_generation; /* Changed since cached? */
+ uint_t dce_flags; /* See below */
+ uint_t dce_ipversion; /* IPv4/IPv6 version */
+ uint32_t dce_pmtu; /* Path MTU if DCEF_PMTU */
+ uint32_t dce_ident; /* Per destination IP ident. */
+ iulp_t dce_uinfo; /* Metrics if DCEF_UINFO */
+
+ struct dce_s *dce_next;
+ struct dce_s **dce_ptpn;
+ struct dcb_s *dce_bucket;
+
+ union {
+ in6_addr_t dceu_v6addr;
+ ipaddr_t dceu_v4addr;
+ } dce_u;
+#define dce_v4addr dce_u.dceu_v4addr
+#define dce_v6addr dce_u.dceu_v6addr
+ /* Note that for IPv6+IPMP we use the ifindex for the upper interface */
+ uint_t dce_ifindex; /* For IPv6 link-locals */
+
+ kmutex_t dce_lock;
+ uint_t dce_refcnt;
+ uint64_t dce_last_change_time; /* Path MTU. In seconds */
+
+ ip_stack_t *dce_ipst; /* Does not have a netstack_hold */
+};
+
+/*
+ * Values for dce_generation.
+ *
+ * If a DCE has DCE_GENERATION_CONDEMNED, the last dce_refrele should delete
+ * it.
+ *
+ * DCE_GENERATION_VERIFY is never stored in dce_generation but it is
+ * stored in places that cache DCE (such as ixa_dce_generation).
+ * It is used as a signal that the cache is stale and needs to be reverified.
+ */
+#define DCE_GENERATION_CONDEMNED 0
+#define DCE_GENERATION_VERIFY 1
+#define DCE_GENERATION_INITIAL 2
+#define DCE_IS_CONDEMNED(dce) \
+ ((dce)->dce_generation == DCE_GENERATION_CONDEMNED)
+
+
+/*
+ * Values for ips_src_generation.
+ *
+ * SRC_GENERATION_VERIFY is never stored in ips_src_generation but it is
+ * stored in places that cache IREs (ixa_src_generation). It is used as a
+ * signal that the cache is stale and needs to be reverified.
+ */
+#define SRC_GENERATION_VERIFY 0
+#define SRC_GENERATION_INITIAL 1
+
/*
* The kernel stores security attributes of all gateways in a database made
* up of one or more tsol_gcdb_t elements. Each tsol_gcdb_t contains the
@@ -2453,183 +2504,28 @@ extern kmutex_t gcgrp_lock;
*/
struct tsol_tnrhc;
-typedef struct tsol_ire_gw_secattr_s {
+struct tsol_ire_gw_secattr_s {
kmutex_t igsa_lock; /* lock to protect following */
struct tsol_tnrhc *igsa_rhc; /* host entry for gateway */
tsol_gc_t *igsa_gc; /* for prefix IREs */
- tsol_gcgrp_t *igsa_gcgrp; /* for cache IREs */
-} tsol_ire_gw_secattr_t;
-
-/*
- * Following are the macros to increment/decrement the reference
- * count of the IREs and IRBs (ire bucket).
- *
- * 1) We bump up the reference count of an IRE to make sure that
- * it does not get deleted and freed while we are using it.
- * Typically all the lookup functions hold the bucket lock,
- * and look for the IRE. If it finds an IRE, it bumps up the
- * reference count before dropping the lock. Sometimes we *may* want
- * to bump up the reference count after we *looked* up i.e without
- * holding the bucket lock. So, the IRE_REFHOLD macro does not assert
- * on the bucket lock being held. Any thread trying to delete from
- * the hash bucket can still do so but cannot free the IRE if
- * ire_refcnt is not 0.
- *
- * 2) We bump up the reference count on the bucket where the IRE resides
- * (IRB), when we want to prevent the IREs getting deleted from a given
- * hash bucket. This makes life easier for ire_walk type functions which
- * wants to walk the IRE list, call a function, but needs to drop
- * the bucket lock to prevent recursive rw_enters. While the
- * lock is dropped, the list could be changed by other threads or
- * the same thread could end up deleting the ire or the ire pointed by
- * ire_next. IRE_REFHOLDing the ire or ire_next is not sufficient as
- * a delete will still remove the ire from the bucket while we have
- * dropped the lock and hence the ire_next would be NULL. Thus, we
- * need a mechanism to prevent deletions from a given bucket.
- *
- * To prevent deletions, we bump up the reference count on the
- * bucket. If the bucket is held, ire_delete just marks IRE_MARK_CONDEMNED
- * both on the ire's ire_marks and the bucket's irb_marks. When the
- * reference count on the bucket drops to zero, all the CONDEMNED ires
- * are deleted. We don't have to bump up the reference count on the
- * bucket if we are walking the bucket and never have to drop the bucket
- * lock. Note that IRB_REFHOLD does not prevent addition of new ires
- * in the list. It is okay because addition of new ires will not cause
- * ire_next to point to freed memory. We do IRB_REFHOLD only when
- * all of the 3 conditions are true :
- *
- * 1) The code needs to walk the IRE bucket from start to end.
- * 2) It may have to drop the bucket lock sometimes while doing (1)
- * 3) It does not want any ires to be deleted meanwhile.
- */
-
-/*
- * Bump up the reference count on the IRE. We cannot assert that the
- * bucket lock is being held as it is legal to bump up the reference
- * count after the first lookup has returned the IRE without
- * holding the lock. Currently ip_wput does this for caching IRE_CACHEs.
- */
-
-#ifdef DEBUG
-#define IRE_UNTRACE_REF(ire) ire_untrace_ref(ire);
-#define IRE_TRACE_REF(ire) ire_trace_ref(ire);
-#else
-#define IRE_UNTRACE_REF(ire)
-#define IRE_TRACE_REF(ire)
-#endif
-
-#define IRE_REFHOLD_NOTR(ire) { \
- atomic_add_32(&(ire)->ire_refcnt, 1); \
- ASSERT((ire)->ire_refcnt != 0); \
-}
-
-#define IRE_REFHOLD(ire) { \
- IRE_REFHOLD_NOTR(ire); \
- IRE_TRACE_REF(ire); \
-}
-
-#define IRE_REFHOLD_LOCKED(ire) { \
- IRE_TRACE_REF(ire); \
- (ire)->ire_refcnt++; \
-}
-
-/*
- * Decrement the reference count on the IRE.
- * In architectures e.g sun4u, where atomic_add_32_nv is just
- * a cas, we need to maintain the right memory barrier semantics
- * as that of mutex_exit i.e all the loads and stores should complete
- * before the cas is executed. membar_exit() does that here.
- *
- * NOTE : This macro is used only in places where we want performance.
- * To avoid bloating the code, we use the function "ire_refrele"
- * which essentially calls the macro.
- */
-#define IRE_REFRELE_NOTR(ire) { \
- ASSERT((ire)->ire_refcnt != 0); \
- membar_exit(); \
- if (atomic_add_32_nv(&(ire)->ire_refcnt, -1) == 0) \
- ire_inactive(ire); \
-}
-
-#define IRE_REFRELE(ire) { \
- if (ire->ire_bucket != NULL) { \
- IRE_UNTRACE_REF(ire); \
- } \
- IRE_REFRELE_NOTR(ire); \
-}
-
-/*
- * Bump up the reference count on the hash bucket - IRB to
- * prevent ires from being deleted in this bucket.
- */
-#define IRB_REFHOLD(irb) { \
- rw_enter(&(irb)->irb_lock, RW_WRITER); \
- (irb)->irb_refcnt++; \
- ASSERT((irb)->irb_refcnt != 0); \
- rw_exit(&(irb)->irb_lock); \
-}
-#define IRB_REFHOLD_LOCKED(irb) { \
- ASSERT(RW_WRITE_HELD(&(irb)->irb_lock)); \
- (irb)->irb_refcnt++; \
- ASSERT((irb)->irb_refcnt != 0); \
-}
+};
void irb_refrele_ftable(irb_t *);
-/*
- * Note: when IRB_MARK_FTABLE (i.e., IRE_CACHETABLE entry), the irb_t
- * is statically allocated, so that when the irb_refcnt goes to 0,
- * we simply clean up the ire list and continue.
- */
-#define IRB_REFRELE(irb) { \
- if ((irb)->irb_marks & IRB_MARK_FTABLE) { \
- irb_refrele_ftable((irb)); \
- } else { \
- rw_enter(&(irb)->irb_lock, RW_WRITER); \
- ASSERT((irb)->irb_refcnt != 0); \
- if (--(irb)->irb_refcnt == 0 && \
- ((irb)->irb_marks & IRE_MARK_CONDEMNED)) { \
- ire_t *ire_list; \
- \
- ire_list = ire_unlink(irb); \
- rw_exit(&(irb)->irb_lock); \
- ASSERT(ire_list != NULL); \
- ire_cleanup(ire_list); \
- } else { \
- rw_exit(&(irb)->irb_lock); \
- } \
- } \
-}
extern struct kmem_cache *rt_entry_cache;
-/*
- * Lock the fast path mp for access, since the fp_mp can be deleted
- * due a DL_NOTE_FASTPATH_FLUSH in the case of IRE_BROADCAST
- */
-
-#define LOCK_IRE_FP_MP(ire) { \
- if ((ire)->ire_type == IRE_BROADCAST) \
- mutex_enter(&ire->ire_nce->nce_lock); \
- }
-#define UNLOCK_IRE_FP_MP(ire) { \
- if ((ire)->ire_type == IRE_BROADCAST) \
- mutex_exit(&ire->ire_nce->nce_lock); \
- }
-
typedef struct ire4 {
- ipaddr_t ire4_src_addr; /* Source address to use. */
ipaddr_t ire4_mask; /* Mask for matching this IRE. */
ipaddr_t ire4_addr; /* Address this IRE represents. */
- ipaddr_t ire4_gateway_addr; /* Gateway if IRE_CACHE/IRE_OFFSUBNET */
- ipaddr_t ire4_cmask; /* Mask from parent prefix route */
+ ipaddr_t ire4_gateway_addr; /* Gateway including for IRE_ONLINK */
+ ipaddr_t ire4_setsrc_addr; /* RTF_SETSRC */
} ire4_t;
typedef struct ire6 {
- in6_addr_t ire6_src_addr; /* Source address to use. */
in6_addr_t ire6_mask; /* Mask for matching this IRE. */
in6_addr_t ire6_addr; /* Address this IRE represents. */
- in6_addr_t ire6_gateway_addr; /* Gateway if IRE_CACHE/IRE_OFFSUBNET */
- in6_addr_t ire6_cmask; /* Mask from parent prefix route */
+ in6_addr_t ire6_gateway_addr; /* Gateway including for IRE_ONLINK */
+ in6_addr_t ire6_setsrc_addr; /* RTF_SETSRC */
} ire6_t;
typedef union ire_addr {
@@ -2637,115 +2533,131 @@ typedef union ire_addr {
ire4_t ire4_u;
} ire_addr_u_t;
-/* Internet Routing Entry */
-typedef struct ire_s {
+/*
+ * Internet Routing Entry
+ * When we have multiple identical IREs we logically add them by manipulating
+ * ire_identical_ref and ire_delete first decrements
+ * that and when it reaches 1 we know it is the last IRE.
+ * "identical" is defined as being the same for:
+ * ire_addr, ire_netmask, ire_gateway, ire_ill, ire_zoneid, and ire_type
+ * For instance, multiple IRE_BROADCASTs for the same subnet number are
+ * viewed as identical, and so are the IRE_INTERFACEs when there are
+ * multiple logical interfaces (on the same ill) with the same subnet prefix.
+ */
+struct ire_s {
struct ire_s *ire_next; /* The hash chain must be first. */
struct ire_s **ire_ptpn; /* Pointer to previous next. */
uint32_t ire_refcnt; /* Number of references */
- mblk_t *ire_mp; /* Non-null if allocated as mblk */
- queue_t *ire_rfq; /* recv from this queue */
- queue_t *ire_stq; /* send to this queue */
- union {
- uint_t *max_fragp; /* Used only during ire creation */
- uint_t max_frag; /* MTU (next hop or path). */
- } imf_u;
-#define ire_max_frag imf_u.max_frag
-#define ire_max_fragp imf_u.max_fragp
- uint32_t ire_frag_flag; /* IPH_DF or zero. */
- uint32_t ire_ident; /* Per IRE IP ident. */
- uint32_t ire_tire_mark; /* Used for reclaim of unused. */
+ ill_t *ire_ill;
+ uint32_t ire_identical_ref; /* IRE_INTERFACE, IRE_BROADCAST */
uchar_t ire_ipversion; /* IPv4/IPv6 version */
- uchar_t ire_marks; /* IRE_MARK_CONDEMNED etc. */
ushort_t ire_type; /* Type of IRE */
+ uint_t ire_generation; /* Generation including CONDEMNED */
uint_t ire_ib_pkt_count; /* Inbound packets for ire_addr */
uint_t ire_ob_pkt_count; /* Outbound packets to ire_addr */
- uint_t ire_ll_hdr_length; /* Non-zero if we do M_DATA prepends */
time_t ire_create_time; /* Time (in secs) IRE was created. */
- uint32_t ire_phandle; /* Associate prefix IREs to cache */
- uint32_t ire_ihandle; /* Associate interface IREs to cache */
- ipif_t *ire_ipif; /* the interface that this ire uses */
uint32_t ire_flags; /* flags related to route (RTF_*) */
/*
- * Neighbor Cache Entry for IPv6; arp info for IPv4
+ * ire_testhidden is TRUE for INTERFACE IREs of IS_UNDER_IPMP(ill)
+ * interfaces
*/
- struct nce_s *ire_nce;
+ boolean_t ire_testhidden;
+ pfirerecv_t ire_recvfn; /* Receive side handling */
+ pfiresend_t ire_sendfn; /* Send side handling */
+ pfirepostfrag_t ire_postfragfn; /* Bottom end of send handling */
+
uint_t ire_masklen; /* # bits in ire_mask{,_v6} */
ire_addr_u_t ire_u; /* IPv4/IPv6 address info. */
irb_t *ire_bucket; /* Hash bucket when ire_ptphn is set */
- iulp_t ire_uinfo; /* Upper layer protocol info. */
- /*
- * Protects ire_uinfo, ire_max_frag, and ire_frag_flag.
- */
kmutex_t ire_lock;
- uint_t ire_ipif_seqid; /* ipif_seqid of ire_ipif */
- uint_t ire_ipif_ifindex; /* ifindex associated with ipif */
- clock_t ire_last_used_time; /* Last used time */
+ clock_t ire_last_used_time; /* For IRE_LOCAL reception */
tsol_ire_gw_secattr_t *ire_gw_secattr; /* gateway security attributes */
- zoneid_t ire_zoneid; /* for local address discrimination */
+ zoneid_t ire_zoneid;
+
+ /*
+ * Cached information of where to send packets that match this route.
+ * The ire_dep_* information is used to determine when ire_nce_cache
+ * needs to be updated.
+ * ire_nce_cache is the fastpath for the Neighbor Cache Entry
+ * for IPv6; arp info for IPv4
+ * Since this is a cache setup and torn down independently of
+ * applications we need to use nce_ref{rele,hold}_notr for it.
+ */
+ nce_t *ire_nce_cache;
+
+ /*
+ * Quick check whether the ire_type and ire_masklen indicates
+ * that the IRE can have ire_nce_cache set i.e., whether it is
+ * IRE_ONLINK and for a single destination.
+ */
+ boolean_t ire_nce_capable;
+
/*
- * ire's that are embedded inside mblk_t and sent to the external
- * resolver use the ire_stq_ifindex to track the ifindex of the
- * ire_stq, so that the ill (if it exists) can be correctly recovered
- * for cleanup in the esbfree routine when arp failure occurs.
- * Similarly, the ire_stackid is used to recover the ip_stack_t.
+ * Dependency tracking so we can safely cache IRE and NCE pointers
+ * in offlink and onlink IREs.
+ * These are locked under the ips_ire_dep_lock rwlock. Write held
+ * when modifying the linkage.
+ * ire_dep_parent (Also chain towards IRE for nexthop)
+ * ire_dep_parent_generation: ire_generation of ire_dep_parent
+ * ire_dep_children (From parent to first child)
+ * ire_dep_sib_next (linked list of siblings)
+ * ire_dep_sib_ptpn (linked list of siblings)
+ *
+ * The parent has a ire_refhold on each child, and each child has
+ * an ire_refhold on its parent.
+ * Since ire_dep_parent is a cache setup and torn down independently of
+ * applications we need to use ire_ref{rele,hold}_notr for it.
*/
- uint_t ire_stq_ifindex;
- netstackid_t ire_stackid;
+ ire_t *ire_dep_parent;
+ ire_t *ire_dep_children;
+ ire_t *ire_dep_sib_next;
+ ire_t **ire_dep_sib_ptpn; /* Pointer to previous next */
+ uint_t ire_dep_parent_generation;
+
+ uint_t ire_badcnt; /* Number of times ND_UNREACHABLE */
+ uint64_t ire_last_badcnt; /* In seconds */
+
+ /* ire_defense* and ire_last_used_time are only used on IRE_LOCALs */
uint_t ire_defense_count; /* number of ARP conflicts */
uint_t ire_defense_time; /* last time defended (secs) */
+
boolean_t ire_trace_disable; /* True when alloc fails */
ip_stack_t *ire_ipst; /* Does not have a netstack_hold */
-} ire_t;
+ iulp_t ire_metrics;
+};
/* IPv4 compatibility macros */
-#define ire_src_addr ire_u.ire4_u.ire4_src_addr
#define ire_mask ire_u.ire4_u.ire4_mask
#define ire_addr ire_u.ire4_u.ire4_addr
#define ire_gateway_addr ire_u.ire4_u.ire4_gateway_addr
-#define ire_cmask ire_u.ire4_u.ire4_cmask
+#define ire_setsrc_addr ire_u.ire4_u.ire4_setsrc_addr
-#define ire_src_addr_v6 ire_u.ire6_u.ire6_src_addr
#define ire_mask_v6 ire_u.ire6_u.ire6_mask
#define ire_addr_v6 ire_u.ire6_u.ire6_addr
#define ire_gateway_addr_v6 ire_u.ire6_u.ire6_gateway_addr
-#define ire_cmask_v6 ire_u.ire6_u.ire6_cmask
-
-/* Convenient typedefs for sockaddrs */
-typedef struct sockaddr_in sin_t;
-typedef struct sockaddr_in6 sin6_t;
-
-/* Address structure used for internal bind with IP */
-typedef struct ipa_conn_s {
- ipaddr_t ac_laddr;
- ipaddr_t ac_faddr;
- uint16_t ac_fport;
- uint16_t ac_lport;
-} ipa_conn_t;
-
-typedef struct ipa6_conn_s {
- in6_addr_t ac6_laddr;
- in6_addr_t ac6_faddr;
- uint16_t ac6_fport;
- uint16_t ac6_lport;
-} ipa6_conn_t;
+#define ire_setsrc_addr_v6 ire_u.ire6_u.ire6_setsrc_addr
/*
- * Using ipa_conn_x_t or ipa6_conn_x_t allows us to modify the behavior of IP's
- * bind handler.
+ * Values for ire_generation.
+ *
+ * If an IRE is marked with IRE_IS_CONDEMNED, the last walker of
+ * the bucket should delete this IRE from this bucket.
+ *
+ * IRE_GENERATION_VERIFY is never stored in ire_generation but it is
+ * stored in places that cache IREs (such as ixa_ire_generation and
+ * ire_dep_parent_generation). It is used as a signal that the cache is
+ * stale and needs to be reverified.
*/
-typedef struct ipa_conn_extended_s {
- uint64_t acx_flags;
- ipa_conn_t acx_conn;
-} ipa_conn_x_t;
+#define IRE_GENERATION_CONDEMNED 0
+#define IRE_GENERATION_VERIFY 1
+#define IRE_GENERATION_INITIAL 2
+#define IRE_IS_CONDEMNED(ire) \
+ ((ire)->ire_generation == IRE_GENERATION_CONDEMNED)
-typedef struct ipa6_conn_extended_s {
- uint64_t ac6x_flags;
- ipa6_conn_t ac6x_conn;
-} ipa6_conn_x_t;
-
-/* flag values for ipa_conn_x_t and ipa6_conn_x_t. */
-#define ACX_VERIFY_DST 0x1ULL /* verify destination address is reachable */
+/* Convenient typedefs for sockaddrs */
+typedef struct sockaddr_in sin_t;
+typedef struct sockaddr_in6 sin6_t;
/* Name/Value Descriptor. */
typedef struct nv_s {
@@ -2784,110 +2696,83 @@ extern uint_t ip_max_frag_dups;
* to support the needs of such tools and private definitions moved to
* private headers.
*/
-struct ip6_pkt_s {
+struct ip_pkt_s {
uint_t ipp_fields; /* Which fields are valid */
- uint_t ipp_sticky_ignored; /* sticky fields to ignore */
- uint_t ipp_ifindex; /* pktinfo ifindex */
in6_addr_t ipp_addr; /* pktinfo src/dst addr */
- uint_t ipp_unicast_hops; /* IPV6_UNICAST_HOPS */
- uint_t ipp_multicast_hops; /* IPV6_MULTICAST_HOPS */
+#define ipp_addr_v4 V4_PART_OF_V6(ipp_addr)
+ uint_t ipp_unicast_hops; /* IPV6_UNICAST_HOPS, IP_TTL */
uint_t ipp_hoplimit; /* IPV6_HOPLIMIT */
uint_t ipp_hopoptslen;
- uint_t ipp_rtdstoptslen;
+ uint_t ipp_rthdrdstoptslen;
uint_t ipp_rthdrlen;
uint_t ipp_dstoptslen;
- uint_t ipp_pathmtulen;
uint_t ipp_fraghdrlen;
ip6_hbh_t *ipp_hopopts;
- ip6_dest_t *ipp_rtdstopts;
+ ip6_dest_t *ipp_rthdrdstopts;
ip6_rthdr_t *ipp_rthdr;
ip6_dest_t *ipp_dstopts;
ip6_frag_t *ipp_fraghdr;
- struct ip6_mtuinfo *ipp_pathmtu;
- in6_addr_t ipp_nexthop; /* Transmit only */
- uint8_t ipp_tclass;
- int8_t ipp_use_min_mtu;
+ uint8_t ipp_tclass; /* IPV6_TCLASS */
+ uint8_t ipp_type_of_service; /* IP_TOS */
+ uint_t ipp_ipv4_options_len; /* Len of IPv4 options */
+ uint8_t *ipp_ipv4_options; /* Ptr to IPv4 options */
+ uint_t ipp_label_len_v4; /* Len of TX label for IPv4 */
+ uint8_t *ipp_label_v4; /* TX label for IPv4 */
+ uint_t ipp_label_len_v6; /* Len of TX label for IPv6 */
+ uint8_t *ipp_label_v6; /* TX label for IPv6 */
};
-typedef struct ip6_pkt_s ip6_pkt_t;
-
-extern void ip6_pkt_free(ip6_pkt_t *); /* free storage inside ip6_pkt_t */
-
-/*
- * This struct is used by ULP_opt_set() functions to return value of IPv4
- * ancillary options. Currently this is only used by udp and icmp and only
- * IP_PKTINFO option is supported.
- */
-typedef struct ip4_pkt_s {
- uint_t ip4_ill_index; /* interface index */
- ipaddr_t ip4_addr; /* source address */
-} ip4_pkt_t;
-
-/*
- * Used by ULP's to pass options info to ip_output
- * currently only IP_PKTINFO is supported.
- */
-typedef struct ip_opt_info_s {
- uint_t ip_opt_ill_index;
- uint_t ip_opt_flags;
-} ip_opt_info_t;
-
-/*
- * value for ip_opt_flags
- */
-#define IP_VERIFY_SRC 0x1
+typedef struct ip_pkt_s ip_pkt_t;
-/*
- * This structure is used to convey information from IP and the ULP.
- * Currently used for the IP_RECVSLLA, IP_RECVIF and IP_RECVPKTINFO options.
- * The type of information field is set to IN_PKTINFO (i.e inbound pkt info)
- */
-typedef struct ip_pktinfo {
- uint32_t ip_pkt_ulp_type; /* type of info sent */
- uint32_t ip_pkt_flags; /* what is sent up by IP */
- uint32_t ip_pkt_ifindex; /* inbound interface index */
- struct sockaddr_dl ip_pkt_slla; /* has source link layer addr */
- struct in_addr ip_pkt_match_addr; /* matched address */
-} ip_pktinfo_t;
-
-/*
- * flags to tell UDP what IP is sending; in_pkt_flags
- */
-#define IPF_RECVIF 0x01 /* inbound interface index */
-#define IPF_RECVSLLA 0x02 /* source link layer address */
-/*
- * Inbound interface index + matched address.
- * Used only by IPV4.
- */
-#define IPF_RECVADDR 0x04
+extern void ip_pkt_free(ip_pkt_t *); /* free storage inside ip_pkt_t */
+extern ipaddr_t ip_pkt_source_route_v4(const ip_pkt_t *);
+extern in6_addr_t *ip_pkt_source_route_v6(const ip_pkt_t *);
+extern int ip_pkt_copy(ip_pkt_t *, ip_pkt_t *, int);
+extern void ip_pkt_source_route_reverse_v4(ip_pkt_t *);
/* ipp_fields values */
-#define IPPF_IFINDEX 0x0001 /* Part of in6_pktinfo: ifindex */
-#define IPPF_ADDR 0x0002 /* Part of in6_pktinfo: src/dst addr */
-#define IPPF_SCOPE_ID 0x0004 /* Add xmit ip6i_t for sin6_scope_id */
-#define IPPF_NO_CKSUM 0x0008 /* Add xmit ip6i_t for IP6I_NO_*_CKSUM */
-
-#define IPPF_RAW_CKSUM 0x0010 /* Add xmit ip6i_t for IP6I_RAW_CHECKSUM */
-#define IPPF_HOPLIMIT 0x0020
-#define IPPF_HOPOPTS 0x0040
-#define IPPF_RTHDR 0x0080
-
-#define IPPF_RTDSTOPTS 0x0100
-#define IPPF_DSTOPTS 0x0200
-#define IPPF_NEXTHOP 0x0400
-#define IPPF_PATHMTU 0x0800
-
-#define IPPF_TCLASS 0x1000
-#define IPPF_DONTFRAG 0x2000
-#define IPPF_USE_MIN_MTU 0x04000
-#define IPPF_MULTICAST_HOPS 0x08000
-
-#define IPPF_UNICAST_HOPS 0x10000
-#define IPPF_FRAGHDR 0x20000
-
-#define IPPF_HAS_IP6I \
- (IPPF_IFINDEX|IPPF_ADDR|IPPF_NEXTHOP|IPPF_SCOPE_ID| \
- IPPF_NO_CKSUM|IPPF_RAW_CKSUM|IPPF_HOPLIMIT|IPPF_DONTFRAG| \
- IPPF_USE_MIN_MTU|IPPF_MULTICAST_HOPS|IPPF_UNICAST_HOPS)
+#define IPPF_ADDR 0x0001 /* Part of in6_pktinfo: src/dst addr */
+#define IPPF_HOPLIMIT 0x0002 /* Overrides unicast and multicast */
+#define IPPF_TCLASS 0x0004 /* Overrides class in sin6_flowinfo */
+
+#define IPPF_HOPOPTS 0x0010 /* ipp_hopopts set */
+#define IPPF_RTHDR 0x0020 /* ipp_rthdr set */
+#define IPPF_RTHDRDSTOPTS 0x0040 /* ipp_rthdrdstopts set */
+#define IPPF_DSTOPTS 0x0080 /* ipp_dstopts set */
+
+#define IPPF_IPV4_OPTIONS 0x0100 /* ipp_ipv4_options set */
+#define IPPF_LABEL_V4 0x0200 /* ipp_label_v4 set */
+#define IPPF_LABEL_V6 0x0400 /* ipp_label_v6 set */
+
+#define IPPF_FRAGHDR 0x0800 /* Used for IPsec receive side */
+
+/*
+ * Data structure which is passed to conn_opt_get/set.
+ * The conn_t is included even though it can be inferred from queue_t.
+ * setsockopt and getsockopt use conn_ixa and conn_xmit_ipp. However,
+ * when handling ancillary data we use separate ixa and ipps.
+ */
+typedef struct conn_opt_arg_s {
+ conn_t *coa_connp;
+ ip_xmit_attr_t *coa_ixa;
+ ip_pkt_t *coa_ipp;
+ boolean_t coa_ancillary; /* Ancillary data and not setsockopt */
+ uint_t coa_changed; /* See below */
+} conn_opt_arg_t;
+
+/*
+ * Flags for what changed.
+ * If we want to be more efficient in the future we can have more fine
+ * grained flags e.g., a flag for just IP_TOS changing.
+ * For now we either call ip_set_destination (for "route changed")
+ * and/or conn_build_hdr_template/conn_prepend_hdr (for "header changed").
+ */
+#define COA_HEADER_CHANGED 0x0001
+#define COA_ROUTE_CHANGED 0x0002
+#define COA_RCVBUF_CHANGED 0x0004 /* SO_RCVBUF */
+#define COA_SNDBUF_CHANGED 0x0008 /* SO_SNDBUF */
+#define COA_WROFF_CHANGED 0x0010 /* Header size changed */
+#define COA_ICMP_BIND_NEEDED 0x0020
+#define COA_OOBINLINE_CHANGED 0x0040
#define TCP_PORTS_OFFSET 0
#define UDP_PORTS_OFFSET 0
@@ -2902,32 +2787,21 @@ typedef struct ip_pktinfo {
#define IPIF_LOOKUP_FAILED 2 /* Used as error code */
#define ILL_CAN_LOOKUP(ill) \
- (!((ill)->ill_state_flags & (ILL_CONDEMNED | ILL_CHANGING)) || \
+ (!((ill)->ill_state_flags & ILL_CONDEMNED) || \
IAM_WRITER_ILL(ill))
-#define ILL_CAN_WAIT(ill, q) \
- (((q) != NULL) && !((ill)->ill_state_flags & (ILL_CONDEMNED)))
+#define ILL_IS_CONDEMNED(ill) \
+ ((ill)->ill_state_flags & ILL_CONDEMNED)
#define IPIF_CAN_LOOKUP(ipif) \
- (!((ipif)->ipif_state_flags & (IPIF_CONDEMNED | IPIF_CHANGING)) || \
+ (!((ipif)->ipif_state_flags & IPIF_CONDEMNED) || \
IAM_WRITER_IPIF(ipif))
-/*
- * If the parameter 'q' is NULL, the caller is not interested in wait and
- * restart of the operation if the ILL or IPIF cannot be looked up when it is
- * marked as 'CHANGING'. Typically a thread that tries to send out data will
- * end up passing NULLs as the last 4 parameters to ill_lookup_on_ifindex and
- * in this case 'q' is NULL
- */
-#define IPIF_CAN_WAIT(ipif, q) \
- (((q) != NULL) && !((ipif)->ipif_state_flags & (IPIF_CONDEMNED)))
-
-#define IPIF_CAN_LOOKUP_WALKER(ipif) \
- (!((ipif)->ipif_state_flags & (IPIF_CONDEMNED)) || \
- IAM_WRITER_IPIF(ipif))
+#define IPIF_IS_CONDEMNED(ipif) \
+ ((ipif)->ipif_state_flags & IPIF_CONDEMNED)
-#define ILL_UNMARK_CHANGING(ill) \
- (ill)->ill_state_flags &= ~ILL_CHANGING;
+#define IPIF_IS_CHANGING(ipif) \
+ ((ipif)->ipif_state_flags & IPIF_CHANGING)
/* Macros used to assert that this thread is a writer */
#define IAM_WRITER_IPSQ(ipsq) ((ipsq)->ipsq_xop->ipx_writer == curthread)
@@ -2956,9 +2830,9 @@ typedef struct ip_pktinfo {
#define RELEASE_ILL_LOCKS(ill_1, ill_2) \
{ \
if (ill_1 != NULL) \
- mutex_exit(&(ill_1)->ill_lock); \
+ mutex_exit(&(ill_1)->ill_lock); \
if (ill_2 != NULL && ill_2 != ill_1) \
- mutex_exit(&(ill_2)->ill_lock); \
+ mutex_exit(&(ill_2)->ill_lock); \
}
/* Get the other protocol instance ill */
@@ -2975,20 +2849,13 @@ typedef struct cmd_info_s
struct lifreq *ci_lifr; /* the lifreq struct passed down */
} cmd_info_t;
-/*
- * List of AH and ESP IPsec acceleration capable ills
- */
-typedef struct ipsec_capab_ill_s {
- uint_t ill_index;
- boolean_t ill_isv6;
- struct ipsec_capab_ill_s *next;
-} ipsec_capab_ill_t;
-
extern struct kmem_cache *ire_cache;
extern ipaddr_t ip_g_all_ones;
-extern uint_t ip_loopback_mtu; /* /etc/system */
+extern uint_t ip_loopback_mtu; /* /etc/system */
+extern uint_t ip_loopback_mtuplus;
+extern uint_t ip_loopback_mtu_v6plus;
extern vmem_t *ip_minor_arena_sa;
extern vmem_t *ip_minor_arena_la;
@@ -3014,18 +2881,18 @@ extern vmem_t *ip_minor_arena_la;
#define ips_ip_g_send_redirects ips_param_arr[5].ip_param_value
#define ips_ip_g_forward_directed_bcast ips_param_arr[6].ip_param_value
#define ips_ip_mrtdebug ips_param_arr[7].ip_param_value
-#define ips_ip_timer_interval ips_param_arr[8].ip_param_value
-#define ips_ip_ire_arp_interval ips_param_arr[9].ip_param_value
-#define ips_ip_ire_redir_interval ips_param_arr[10].ip_param_value
+#define ips_ip_ire_reclaim_fraction ips_param_arr[8].ip_param_value
+#define ips_ip_nce_reclaim_fraction ips_param_arr[9].ip_param_value
+#define ips_ip_dce_reclaim_fraction ips_param_arr[10].ip_param_value
#define ips_ip_def_ttl ips_param_arr[11].ip_param_value
#define ips_ip_forward_src_routed ips_param_arr[12].ip_param_value
#define ips_ip_wroff_extra ips_param_arr[13].ip_param_value
-#define ips_ip_ire_pathmtu_interval ips_param_arr[14].ip_param_value
+#define ips_ip_pathmtu_interval ips_param_arr[14].ip_param_value
#define ips_ip_icmp_return ips_param_arr[15].ip_param_value
#define ips_ip_path_mtu_discovery ips_param_arr[16].ip_param_value
-#define ips_ip_ignore_delete_time ips_param_arr[17].ip_param_value
+#define ips_ip_pmtu_min ips_param_arr[17].ip_param_value
#define ips_ip_ignore_redirect ips_param_arr[18].ip_param_value
-#define ips_ip_output_queue ips_param_arr[19].ip_param_value
+#define ips_ip_arp_icmp_error ips_param_arr[19].ip_param_value
#define ips_ip_broadcast_ttl ips_param_arr[20].ip_param_value
#define ips_ip_icmp_err_interval ips_param_arr[21].ip_param_value
#define ips_ip_icmp_err_burst ips_param_arr[22].ip_param_value
@@ -3046,7 +2913,7 @@ extern vmem_t *ip_minor_arena_la;
#define ips_ipv6_send_redirects ips_param_arr[35].ip_param_value
#define ips_ipv6_ignore_redirect ips_param_arr[36].ip_param_value
#define ips_ipv6_strict_dst_multihoming ips_param_arr[37].ip_param_value
-#define ips_ip_ire_reclaim_fraction ips_param_arr[38].ip_param_value
+#define ips_src_check ips_param_arr[38].ip_param_value
#define ips_ipsec_policy_log_interval ips_param_arr[39].ip_param_value
#define ips_pim_accept_clear_messages ips_param_arr[40].ip_param_value
#define ips_ip_ndp_unsolicit_interval ips_param_arr[41].ip_param_value
@@ -3055,21 +2922,37 @@ extern vmem_t *ip_minor_arena_la;
/* Misc IP configuration knobs */
#define ips_ip_policy_mask ips_param_arr[44].ip_param_value
-#define ips_ip_multirt_resolution_interval ips_param_arr[45].ip_param_value
+#define ips_ip_ecmp_behavior ips_param_arr[45].ip_param_value
#define ips_ip_multirt_ttl ips_param_arr[46].ip_param_value
-#define ips_ip_multidata_outbound ips_param_arr[47].ip_param_value
-#define ips_ip_ndp_defense_interval ips_param_arr[48].ip_param_value
-#define ips_ip_max_temp_idle ips_param_arr[49].ip_param_value
-#define ips_ip_max_temp_defend ips_param_arr[50].ip_param_value
-#define ips_ip_max_defend ips_param_arr[51].ip_param_value
-#define ips_ip_defend_interval ips_param_arr[52].ip_param_value
-#define ips_ip_dup_recovery ips_param_arr[53].ip_param_value
-#define ips_ip_restrict_interzone_loopback ips_param_arr[54].ip_param_value
-#define ips_ip_lso_outbound ips_param_arr[55].ip_param_value
-#define ips_igmp_max_version ips_param_arr[56].ip_param_value
-#define ips_mld_max_version ips_param_arr[57].ip_param_value
-#define ips_ip_pmtu_min ips_param_arr[58].ip_param_value
-#define ips_ipv6_drop_inbound_icmpv6 ips_param_arr[59].ip_param_value
+#define ips_ip_ire_badcnt_lifetime ips_param_arr[47].ip_param_value
+#define ips_ip_max_temp_idle ips_param_arr[48].ip_param_value
+#define ips_ip_max_temp_defend ips_param_arr[49].ip_param_value
+#define ips_ip_max_defend ips_param_arr[50].ip_param_value
+#define ips_ip_defend_interval ips_param_arr[51].ip_param_value
+#define ips_ip_dup_recovery ips_param_arr[52].ip_param_value
+#define ips_ip_restrict_interzone_loopback ips_param_arr[53].ip_param_value
+#define ips_ip_lso_outbound ips_param_arr[54].ip_param_value
+#define ips_igmp_max_version ips_param_arr[55].ip_param_value
+#define ips_mld_max_version ips_param_arr[56].ip_param_value
+#define ips_ipv6_drop_inbound_icmpv6 ips_param_arr[57].ip_param_value
+#define ips_arp_probe_delay ips_param_arr[58].ip_param_value
+#define ips_arp_fastprobe_delay ips_param_arr[59].ip_param_value
+#define ips_arp_probe_interval ips_param_arr[60].ip_param_value
+#define ips_arp_fastprobe_interval ips_param_arr[61].ip_param_value
+#define ips_arp_probe_count ips_param_arr[62].ip_param_value
+#define ips_arp_fastprobe_count ips_param_arr[63].ip_param_value
+#define ips_ipv4_dad_announce_interval ips_param_arr[64].ip_param_value
+#define ips_ipv6_dad_announce_interval ips_param_arr[65].ip_param_value
+#define ips_arp_defend_interval ips_param_arr[66].ip_param_value
+#define ips_arp_defend_rate ips_param_arr[67].ip_param_value
+#define ips_ndp_defend_interval ips_param_arr[68].ip_param_value
+#define ips_ndp_defend_rate ips_param_arr[69].ip_param_value
+#define ips_arp_defend_period ips_param_arr[70].ip_param_value
+#define ips_ndp_defend_period ips_param_arr[71].ip_param_value
+#define ips_ipv4_icmp_return_pmtu ips_param_arr[72].ip_param_value
+#define ips_ipv6_icmp_return_pmtu ips_param_arr[73].ip_param_value
+#define ips_ip_arp_publish_count ips_param_arr[74].ip_param_value
+#define ips_ip_arp_publish_interval ips_param_arr[75].ip_param_value
extern int dohwcksum; /* use h/w cksum if supported by the h/w */
#ifdef ZC_TEST
@@ -3102,13 +2985,13 @@ extern struct module_info ip_mod_info;
((ipst)->ips_ip4_loopback_out_event.he_interested)
#define HOOKS6_INTERESTED_LOOPBACK_OUT(ipst) \
((ipst)->ips_ip6_loopback_out_event.he_interested)
-
/*
- * Hooks macros used inside of ip
+ * Hooks marcos used inside of ip
+ * The callers use the above INTERESTED macros first, hence
+ * the he_interested check is superflous.
*/
-#define FW_HOOKS(_hook, _event, _ilp, _olp, _iph, _fm, _m, _llm, ipst) \
- \
- if ((_hook).he_interested) { \
+#define FW_HOOKS(_hook, _event, _ilp, _olp, _iph, _fm, _m, _llm, ipst, _err) \
+ if ((_hook).he_interested) { \
hook_pkt_event_t info; \
\
_NOTE(CONSTCOND) \
@@ -3121,12 +3004,15 @@ extern struct module_info ip_mod_info;
info.hpe_mp = &(_fm); \
info.hpe_mb = _m; \
info.hpe_flags = _llm; \
- if (hook_run(ipst->ips_ipv4_net_data->netd_hooks, \
- _event, (hook_data_t)&info) != 0) { \
+ _err = hook_run(ipst->ips_ipv4_net_data->netd_hooks, \
+ _event, (hook_data_t)&info); \
+ if (_err != 0) { \
ip2dbg(("%s hook dropped mblk chain %p hdr %p\n",\
(_hook).he_name, (void *)_fm, (void *)_m)); \
- freemsg(_fm); \
- _fm = NULL; \
+ if (_fm != NULL) { \
+ freemsg(_fm); \
+ _fm = NULL; \
+ } \
_iph = NULL; \
_m = NULL; \
} else { \
@@ -3135,9 +3021,8 @@ extern struct module_info ip_mod_info;
} \
}
-#define FW_HOOKS6(_hook, _event, _ilp, _olp, _iph, _fm, _m, _llm, ipst) \
- \
- if ((_hook).he_interested) { \
+#define FW_HOOKS6(_hook, _event, _ilp, _olp, _iph, _fm, _m, _llm, ipst, _err) \
+ if ((_hook).he_interested) { \
hook_pkt_event_t info; \
\
_NOTE(CONSTCOND) \
@@ -3150,12 +3035,15 @@ extern struct module_info ip_mod_info;
info.hpe_mp = &(_fm); \
info.hpe_mb = _m; \
info.hpe_flags = _llm; \
- if (hook_run(ipst->ips_ipv6_net_data->netd_hooks, \
- _event, (hook_data_t)&info) != 0) { \
+ _err = hook_run(ipst->ips_ipv6_net_data->netd_hooks, \
+ _event, (hook_data_t)&info); \
+ if (_err != 0) { \
ip2dbg(("%s hook dropped mblk chain %p hdr %p\n",\
(_hook).he_name, (void *)_fm, (void *)_m)); \
- freemsg(_fm); \
- _fm = NULL; \
+ if (_fm != NULL) { \
+ freemsg(_fm); \
+ _fm = NULL; \
+ } \
_iph = NULL; \
_m = NULL; \
} else { \
@@ -3194,24 +3082,6 @@ extern struct module_info ip_mod_info;
#define IP_LOOPBACK_ADDR(addr) \
(((addr) & N_IN_CLASSA_NET == N_IN_LOOPBACK_NET))
-#ifdef DEBUG
-/* IPsec HW acceleration debugging support */
-
-#define IPSECHW_CAPAB 0x0001 /* capability negotiation */
-#define IPSECHW_SADB 0x0002 /* SADB exchange */
-#define IPSECHW_PKT 0x0004 /* general packet flow */
-#define IPSECHW_PKTIN 0x0008 /* driver in pkt processing details */
-#define IPSECHW_PKTOUT 0x0010 /* driver out pkt processing details */
-
-#define IPSECHW_DEBUG(f, x) if (ipsechw_debug & (f)) { (void) printf x; }
-#define IPSECHW_CALL(f, r, x) if (ipsechw_debug & (f)) { (void) r x; }
-
-extern uint32_t ipsechw_debug;
-#else
-#define IPSECHW_DEBUG(f, x) {}
-#define IPSECHW_CALL(f, r, x) {}
-#endif
-
extern int ip_debug;
extern uint_t ip_thread_data;
extern krwlock_t ip_thread_rwlock;
@@ -3235,8 +3105,6 @@ extern list_t ip_thread_list;
/* Default MAC-layer address string length for mac_colon_addr */
#define MAC_STR_LEN 128
-struct ipsec_out_s;
-
struct mac_header_info_s;
extern void ill_frag_timer(void *);
@@ -3252,86 +3120,173 @@ extern char *ip_dot_addr(ipaddr_t, char *);
extern const char *mac_colon_addr(const uint8_t *, size_t, char *, size_t);
extern void ip_lwput(queue_t *, mblk_t *);
extern boolean_t icmp_err_rate_limit(ip_stack_t *);
-extern void icmp_time_exceeded(queue_t *, mblk_t *, uint8_t, zoneid_t,
- ip_stack_t *);
-extern void icmp_unreachable(queue_t *, mblk_t *, uint8_t, zoneid_t,
- ip_stack_t *);
-extern mblk_t *ip_add_info(mblk_t *, ill_t *, uint_t, zoneid_t, ip_stack_t *);
-cred_t *ip_best_cred(mblk_t *, conn_t *, pid_t *);
-extern mblk_t *ip_bind_v4(queue_t *, mblk_t *, conn_t *);
-extern boolean_t ip_bind_ipsec_policy_set(conn_t *, mblk_t *);
-extern int ip_bind_laddr_v4(conn_t *, mblk_t **, uint8_t, ipaddr_t,
- uint16_t, boolean_t);
-extern int ip_proto_bind_laddr_v4(conn_t *, mblk_t **, uint8_t, ipaddr_t,
- uint16_t, boolean_t);
-extern int ip_proto_bind_connected_v4(conn_t *, mblk_t **,
- uint8_t, ipaddr_t *, uint16_t, ipaddr_t, uint16_t, boolean_t, boolean_t,
- cred_t *);
-extern int ip_bind_connected_v4(conn_t *, mblk_t **, uint8_t, ipaddr_t *,
- uint16_t, ipaddr_t, uint16_t, boolean_t, boolean_t, cred_t *);
+extern void icmp_frag_needed(mblk_t *, int, ip_recv_attr_t *);
+extern mblk_t *icmp_inbound_v4(mblk_t *, ip_recv_attr_t *);
+extern void icmp_time_exceeded(mblk_t *, uint8_t, ip_recv_attr_t *);
+extern void icmp_unreachable(mblk_t *, uint8_t, ip_recv_attr_t *);
+extern boolean_t ip_ipsec_policy_inherit(conn_t *, conn_t *, ip_recv_attr_t *);
+extern void *ip_pullup(mblk_t *, ssize_t, ip_recv_attr_t *);
+extern void ip_setl2src(mblk_t *, ip_recv_attr_t *, ill_t *);
+extern mblk_t *ip_check_and_align_header(mblk_t *, uint_t, ip_recv_attr_t *);
+extern mblk_t *ip_check_length(mblk_t *, uchar_t *, ssize_t, uint_t, uint_t,
+ ip_recv_attr_t *);
+extern mblk_t *ip_check_optlen(mblk_t *, ipha_t *, uint_t, uint_t,
+ ip_recv_attr_t *);
+extern mblk_t *ip_fix_dbref(mblk_t *, ip_recv_attr_t *);
extern uint_t ip_cksum(mblk_t *, int, uint32_t);
extern int ip_close(queue_t *, int);
extern uint16_t ip_csum_hdr(ipha_t *);
-extern void ip_proto_not_sup(queue_t *, mblk_t *, uint_t, zoneid_t,
- ip_stack_t *);
+extern void ip_forward_xmit_v4(nce_t *, ill_t *, mblk_t *, ipha_t *,
+ ip_recv_attr_t *, uint32_t, uint32_t);
+extern boolean_t ip_forward_options(mblk_t *, ipha_t *, ill_t *,
+ ip_recv_attr_t *);
+extern int ip_fragment_v4(mblk_t *, nce_t *, iaflags_t, uint_t, uint32_t,
+ uint32_t, zoneid_t, zoneid_t, pfirepostfrag_t postfragfn,
+ uintptr_t *cookie);
+extern void ip_proto_not_sup(mblk_t *, ip_recv_attr_t *);
extern void ip_ire_g_fini(void);
extern void ip_ire_g_init(void);
extern void ip_ire_fini(ip_stack_t *);
extern void ip_ire_init(ip_stack_t *);
+extern void ip_mdata_to_mhi(ill_t *, mblk_t *, struct mac_header_info_s *);
extern int ip_openv4(queue_t *q, dev_t *devp, int flag, int sflag,
cred_t *credp);
extern int ip_openv6(queue_t *q, dev_t *devp, int flag, int sflag,
cred_t *credp);
extern int ip_reassemble(mblk_t *, ipf_t *, uint_t, boolean_t, ill_t *,
size_t);
-extern int ip_opt_set_ill(conn_t *, int, boolean_t, boolean_t,
- int, int, mblk_t *);
extern void ip_rput(queue_t *, mblk_t *);
extern void ip_input(ill_t *, ill_rx_ring_t *, mblk_t *,
struct mac_header_info_s *);
+extern void ip_input_v6(ill_t *, ill_rx_ring_t *, mblk_t *,
+ struct mac_header_info_s *);
+extern mblk_t *ip_input_common_v4(ill_t *, ill_rx_ring_t *, mblk_t *,
+ struct mac_header_info_s *, squeue_t *, mblk_t **, uint_t *);
+extern mblk_t *ip_input_common_v6(ill_t *, ill_rx_ring_t *, mblk_t *,
+ struct mac_header_info_s *, squeue_t *, mblk_t **, uint_t *);
+extern void ill_input_full_v4(mblk_t *, void *, void *,
+ ip_recv_attr_t *, rtc_t *);
+extern void ill_input_short_v4(mblk_t *, void *, void *,
+ ip_recv_attr_t *, rtc_t *);
+extern void ill_input_full_v6(mblk_t *, void *, void *,
+ ip_recv_attr_t *, rtc_t *);
+extern void ill_input_short_v6(mblk_t *, void *, void *,
+ ip_recv_attr_t *, rtc_t *);
+extern ipaddr_t ip_input_options(ipha_t *, ipaddr_t, mblk_t *,
+ ip_recv_attr_t *, int *);
+extern boolean_t ip_input_local_options(mblk_t *, ipha_t *, ip_recv_attr_t *);
+extern mblk_t *ip_input_fragment(mblk_t *, ipha_t *, ip_recv_attr_t *);
+extern mblk_t *ip_input_fragment_v6(mblk_t *, ip6_t *, ip6_frag_t *, uint_t,
+ ip_recv_attr_t *);
+extern void ip_input_post_ipsec(mblk_t *, ip_recv_attr_t *);
+extern void ip_fanout_v4(mblk_t *, ipha_t *, ip_recv_attr_t *);
+extern void ip_fanout_v6(mblk_t *, ip6_t *, ip_recv_attr_t *);
+extern void ip_fanout_proto_conn(conn_t *, mblk_t *, ipha_t *, ip6_t *,
+ ip_recv_attr_t *);
+extern void ip_fanout_proto_v4(mblk_t *, ipha_t *, ip_recv_attr_t *);
+extern void ip_fanout_send_icmp_v4(mblk_t *, uint_t, uint_t,
+ ip_recv_attr_t *);
+extern void ip_fanout_udp_conn(conn_t *, mblk_t *, ipha_t *, ip6_t *,
+ ip_recv_attr_t *);
+extern void ip_fanout_udp_multi_v4(mblk_t *, ipha_t *, uint16_t, uint16_t,
+ ip_recv_attr_t *);
+extern mblk_t *zero_spi_check(mblk_t *, ip_recv_attr_t *);
+extern void ip_build_hdrs_v4(uchar_t *, uint_t, const ip_pkt_t *, uint8_t);
+extern int ip_find_hdr_v4(ipha_t *, ip_pkt_t *, boolean_t);
+extern int ip_total_hdrs_len_v4(const ip_pkt_t *);
+
extern mblk_t *ip_accept_tcp(ill_t *, ill_rx_ring_t *, squeue_t *,
mblk_t *, mblk_t **, uint_t *cnt);
-extern void ip_rput_dlpi(queue_t *, mblk_t *);
-extern void ip_rput_forward(ire_t *, ipha_t *, mblk_t *, ill_t *);
-extern void ip_rput_forward_multicast(ipaddr_t, mblk_t *, ipif_t *);
+extern void ip_rput_dlpi(ill_t *, mblk_t *);
+extern void ip_rput_notdata(ill_t *, mblk_t *);
extern void ip_mib2_add_ip_stats(mib2_ipIfStatsEntry_t *,
mib2_ipIfStatsEntry_t *);
extern void ip_mib2_add_icmp6_stats(mib2_ipv6IfIcmpEntry_t *,
mib2_ipv6IfIcmpEntry_t *);
-extern void ip_udp_input(queue_t *, mblk_t *, ipha_t *, ire_t *, ill_t *);
-extern void ip_proto_input(queue_t *, mblk_t *, ipha_t *, ire_t *, ill_t *,
- uint32_t);
extern void ip_rput_other(ipsq_t *, queue_t *, mblk_t *, void *);
extern ire_t *ip_check_multihome(void *, ire_t *, ill_t *);
-extern void ip_setpktversion(conn_t *, boolean_t, boolean_t, ip_stack_t *);
-extern void ip_trash_ire_reclaim(void *);
-extern void ip_trash_timer_expire(void *);
-extern void ip_wput(queue_t *, mblk_t *);
-extern void ip_output(void *, mblk_t *, void *, int);
-extern void ip_output_options(void *, mblk_t *, void *, int,
- ip_opt_info_t *);
-
-extern void ip_wput_ire(queue_t *, mblk_t *, ire_t *, conn_t *, int,
- zoneid_t);
-extern void ip_wput_local(queue_t *, ill_t *, ipha_t *, mblk_t *, ire_t *,
- int, zoneid_t);
-extern void ip_wput_multicast(queue_t *, mblk_t *, ipif_t *, zoneid_t);
-extern void ip_wput_nondata(ipsq_t *, queue_t *, mblk_t *, void *);
+extern void ip_send_potential_redirect_v4(mblk_t *, ipha_t *, ire_t *,
+ ip_recv_attr_t *);
+extern int ip_set_destination_v4(ipaddr_t *, ipaddr_t, ipaddr_t,
+ ip_xmit_attr_t *, iulp_t *, uint32_t, uint_t);
+extern int ip_set_destination_v6(in6_addr_t *, const in6_addr_t *,
+ const in6_addr_t *, ip_xmit_attr_t *, iulp_t *, uint32_t, uint_t);
+
+extern int ip_output_simple(mblk_t *, ip_xmit_attr_t *);
+extern int ip_output_simple_v4(mblk_t *, ip_xmit_attr_t *);
+extern int ip_output_simple_v6(mblk_t *, ip_xmit_attr_t *);
+extern int ip_output_options(mblk_t *, ipha_t *, ip_xmit_attr_t *,
+ ill_t *);
+extern void ip_output_local_options(ipha_t *, ip_stack_t *);
+
+extern ip_xmit_attr_t *conn_get_ixa(conn_t *, boolean_t);
+extern ip_xmit_attr_t *conn_get_ixa_tryhard(conn_t *, boolean_t);
+extern ip_xmit_attr_t *conn_replace_ixa(conn_t *, ip_xmit_attr_t *);
+extern ip_xmit_attr_t *conn_get_ixa_exclusive(conn_t *);
+extern ip_xmit_attr_t *ip_xmit_attr_duplicate(ip_xmit_attr_t *);
+extern void ip_xmit_attr_replace_tsl(ip_xmit_attr_t *, ts_label_t *);
+extern void ip_xmit_attr_restore_tsl(ip_xmit_attr_t *, cred_t *);
+boolean_t ip_recv_attr_replace_label(ip_recv_attr_t *, ts_label_t *);
+extern void ixa_inactive(ip_xmit_attr_t *);
+extern void ixa_refrele(ip_xmit_attr_t *);
+extern boolean_t ixa_check_drain_insert(conn_t *, ip_xmit_attr_t *);
+extern void ixa_cleanup(ip_xmit_attr_t *);
+extern void ira_cleanup(ip_recv_attr_t *, boolean_t);
+extern void ixa_safe_copy(ip_xmit_attr_t *, ip_xmit_attr_t *);
+
+extern int conn_ip_output(mblk_t *, ip_xmit_attr_t *);
+extern boolean_t ip_output_verify_local(ip_xmit_attr_t *);
+extern mblk_t *ip_output_process_local(mblk_t *, ip_xmit_attr_t *, boolean_t,
+ boolean_t, conn_t *);
+
+extern int conn_opt_get(conn_opt_arg_t *, t_scalar_t, t_scalar_t,
+ uchar_t *);
+extern int conn_opt_set(conn_opt_arg_t *, t_scalar_t, t_scalar_t, uint_t,
+ uchar_t *, boolean_t, cred_t *);
+extern boolean_t conn_same_as_last_v4(conn_t *, sin_t *);
+extern boolean_t conn_same_as_last_v6(conn_t *, sin6_t *);
+extern int conn_update_label(const conn_t *, const ip_xmit_attr_t *,
+ const in6_addr_t *, ip_pkt_t *);
+
+extern int ip_opt_set_multicast_group(conn_t *, t_scalar_t,
+ uchar_t *, boolean_t, boolean_t);
+extern int ip_opt_set_multicast_sources(conn_t *, t_scalar_t,
+ uchar_t *, boolean_t, boolean_t);
+extern int conn_getsockname(conn_t *, struct sockaddr *, uint_t *);
+extern int conn_getpeername(conn_t *, struct sockaddr *, uint_t *);
+
+extern int conn_build_hdr_template(conn_t *, uint_t, uint_t,
+ const in6_addr_t *, const in6_addr_t *, uint32_t);
+extern mblk_t *conn_prepend_hdr(ip_xmit_attr_t *, const ip_pkt_t *,
+ const in6_addr_t *, const in6_addr_t *, uint8_t, uint32_t, uint_t,
+ mblk_t *, uint_t, uint_t, uint32_t *, int *);
+extern void ip_attr_newdst(ip_xmit_attr_t *);
+extern void ip_attr_nexthop(const ip_pkt_t *, const ip_xmit_attr_t *,
+ const in6_addr_t *, in6_addr_t *);
+extern int conn_connect(conn_t *, iulp_t *, uint32_t);
+extern int ip_attr_connect(const conn_t *, ip_xmit_attr_t *,
+ const in6_addr_t *, const in6_addr_t *, const in6_addr_t *, in_port_t,
+ in6_addr_t *, iulp_t *, uint32_t);
+extern int conn_inherit_parent(conn_t *, conn_t *);
+
+extern void conn_ixa_cleanup(conn_t *connp, void *arg);
+
+extern boolean_t conn_wantpacket(conn_t *, ip_recv_attr_t *, ipha_t *);
+extern uint_t ip_type_v4(ipaddr_t, ip_stack_t *);
+extern uint_t ip_type_v6(const in6_addr_t *, ip_stack_t *);
+
+extern void ip_wput_nondata(queue_t *, mblk_t *);
extern void ip_wsrv(queue_t *);
extern char *ip_nv_lookup(nv_t *, int);
extern boolean_t ip_local_addr_ok_v6(const in6_addr_t *, const in6_addr_t *);
extern boolean_t ip_remote_addr_ok_v6(const in6_addr_t *, const in6_addr_t *);
extern ipaddr_t ip_massage_options(ipha_t *, netstack_t *);
extern ipaddr_t ip_net_mask(ipaddr_t);
-extern void ip_newroute(queue_t *, mblk_t *, ipaddr_t, conn_t *, zoneid_t,
- ip_stack_t *);
-extern ipxmit_state_t ip_xmit_v4(mblk_t *, ire_t *, struct ipsec_out_s *,
- boolean_t, conn_t *);
-extern int ip_hdr_complete(ipha_t *, zoneid_t, ip_stack_t *);
+extern void arp_bringup_done(ill_t *, int);
+extern void arp_replumb_done(ill_t *, int);
extern struct qinit iprinitv6;
-extern struct qinit ipwinitv6;
extern void ipmp_init(ip_stack_t *);
extern void ipmp_destroy(ip_stack_t *);
@@ -3347,12 +3302,11 @@ extern ill_t *ipmp_illgrp_add_ipif(ipmp_illgrp_t *, ipif_t *);
extern void ipmp_illgrp_del_ipif(ipmp_illgrp_t *, ipif_t *);
extern ill_t *ipmp_illgrp_next_ill(ipmp_illgrp_t *);
extern ill_t *ipmp_illgrp_hold_next_ill(ipmp_illgrp_t *);
-extern ill_t *ipmp_illgrp_cast_ill(ipmp_illgrp_t *);
extern ill_t *ipmp_illgrp_hold_cast_ill(ipmp_illgrp_t *);
extern ill_t *ipmp_illgrp_ipmp_ill(ipmp_illgrp_t *);
extern void ipmp_illgrp_refresh_mtu(ipmp_illgrp_t *);
-extern ipmp_arpent_t *ipmp_illgrp_create_arpent(ipmp_illgrp_t *, mblk_t *,
- boolean_t);
+extern ipmp_arpent_t *ipmp_illgrp_create_arpent(ipmp_illgrp_t *,
+ boolean_t, ipaddr_t, uchar_t *, size_t, uint16_t);
extern void ipmp_illgrp_destroy_arpent(ipmp_illgrp_t *, ipmp_arpent_t *);
extern ipmp_arpent_t *ipmp_illgrp_lookup_arpent(ipmp_illgrp_t *, ipaddr_t *);
extern void ipmp_illgrp_refresh_arpent(ipmp_illgrp_t *);
@@ -3373,19 +3327,25 @@ extern ill_t *ipmp_ipif_bound_ill(const ipif_t *);
extern ill_t *ipmp_ipif_hold_bound_ill(const ipif_t *);
extern boolean_t ipmp_ipif_is_dataaddr(const ipif_t *);
extern boolean_t ipmp_ipif_is_stubaddr(const ipif_t *);
+extern boolean_t ipmp_packet_is_probe(mblk_t *, ill_t *);
+extern ill_t *ipmp_ill_get_xmit_ill(ill_t *, boolean_t);
+extern void ipmp_ncec_flush_nce(ncec_t *);
+extern void ipmp_ncec_fastpath(ncec_t *, ill_t *);
extern void conn_drain_insert(conn_t *, idl_tx_list_t *);
+extern void conn_setqfull(conn_t *, boolean_t *);
+extern void conn_clrqfull(conn_t *, boolean_t *);
extern int conn_ipsec_length(conn_t *);
-extern void ip_wput_ipsec_out(queue_t *, mblk_t *, ipha_t *, ill_t *,
- ire_t *);
extern ipaddr_t ip_get_dst(ipha_t *);
-extern int ipsec_out_extra_length(mblk_t *);
-extern int ipsec_in_extra_length(mblk_t *);
-extern mblk_t *ipsec_in_alloc(boolean_t, netstack_t *);
-extern boolean_t ipsec_in_is_secure(mblk_t *);
-extern void ipsec_out_process(queue_t *, mblk_t *, ire_t *, uint_t);
-extern void ipsec_out_to_in(mblk_t *);
-extern void ip_fanout_proto_again(mblk_t *, ill_t *, ill_t *, ire_t *);
+extern uint_t ip_get_pmtu(ip_xmit_attr_t *);
+extern uint_t ip_get_base_mtu(ill_t *, ire_t *);
+extern mblk_t *ip_output_attach_policy(mblk_t *, ipha_t *, ip6_t *,
+ const conn_t *, ip_xmit_attr_t *);
+extern int ipsec_out_extra_length(ip_xmit_attr_t *);
+extern int ipsec_out_process(mblk_t *, ip_xmit_attr_t *);
+extern int ip_output_post_ipsec(mblk_t *, ip_xmit_attr_t *);
+extern void ipsec_out_to_in(ip_xmit_attr_t *, ill_t *ill,
+ ip_recv_attr_t *);
extern void ire_cleanup(ire_t *);
extern void ire_inactive(ire_t *);
@@ -3407,14 +3367,13 @@ extern uint_t ip_srcid_find_addr(const in6_addr_t *, zoneid_t, netstack_t *);
extern uint8_t ipoptp_next(ipoptp_t *);
extern uint8_t ipoptp_first(ipoptp_t *, ipha_t *);
-extern int ip_opt_get_user(const ipha_t *, uchar_t *);
+extern int ip_opt_get_user(conn_t *, uchar_t *);
extern int ipsec_req_from_conn(conn_t *, ipsec_req_t *, int);
extern int ip_snmp_get(queue_t *q, mblk_t *mctl, int level);
extern int ip_snmp_set(queue_t *q, int, int, uchar_t *, int);
extern void ip_process_ioctl(ipsq_t *, queue_t *, mblk_t *, void *);
extern void ip_quiesce_conn(conn_t *);
extern void ip_reprocess_ioctl(ipsq_t *, queue_t *, mblk_t *, void *);
-extern void ip_restart_optmgmt(ipsq_t *, queue_t *, mblk_t *, void *);
extern void ip_ioctl_finish(queue_t *, mblk_t *, int, int, ipsq_t *);
extern boolean_t ip_cmpbuf(const void *, uint_t, boolean_t, const void *,
@@ -3425,32 +3384,36 @@ extern void ip_savebuf(void **, uint_t *, boolean_t, const void *, uint_t);
extern boolean_t ipsq_pending_mp_cleanup(ill_t *, conn_t *);
extern void conn_ioctl_cleanup(conn_t *);
-extern ill_t *conn_get_held_ill(conn_t *, ill_t **, int *);
-
-struct tcp_stack;
-extern void ip_xmit_reset_serialize(mblk_t *, int, zoneid_t, struct tcp_stack *,
- conn_t *);
-
-struct multidata_s;
-struct pdesc_s;
-
-extern mblk_t *ip_mdinfo_alloc(ill_mdt_capab_t *);
-extern mblk_t *ip_mdinfo_return(ire_t *, conn_t *, char *, ill_mdt_capab_t *);
-extern mblk_t *ip_lsoinfo_alloc(ill_lso_capab_t *);
-extern mblk_t *ip_lsoinfo_return(ire_t *, conn_t *, char *,
- ill_lso_capab_t *);
-extern uint_t ip_md_cksum(struct pdesc_s *, int, uint_t);
-extern boolean_t ip_md_addr_attr(struct multidata_s *, struct pdesc_s *,
- const mblk_t *);
-extern boolean_t ip_md_hcksum_attr(struct multidata_s *, struct pdesc_s *,
- uint32_t, uint32_t, uint32_t, uint32_t);
-extern boolean_t ip_md_zcopy_attr(struct multidata_s *, struct pdesc_s *,
- uint_t);
+
extern void ip_unbind(conn_t *);
extern void tnet_init(void);
extern void tnet_fini(void);
+/*
+ * Hook functions to enable cluster networking
+ * On non-clustered systems these vectors must always be NULL.
+ */
+extern int (*cl_inet_isclusterwide)(netstackid_t stack_id, uint8_t protocol,
+    sa_family_t addr_family, uint8_t *laddrp, void *args);
+extern uint32_t (*cl_inet_ipident)(netstackid_t stack_id, uint8_t protocol,
+    sa_family_t addr_family, uint8_t *laddrp, uint8_t *faddrp,
+    void *args);
+extern int (*cl_inet_connect2)(netstackid_t stack_id, uint8_t protocol,
+    boolean_t is_outgoing, sa_family_t addr_family, uint8_t *laddrp,
+    in_port_t lport, uint8_t *faddrp, in_port_t fport, void *args);
+extern void (*cl_inet_getspi)(netstackid_t stack_id, uint8_t protocol,
+    uint8_t *ptr, size_t len, void *args);
+extern int (*cl_inet_checkspi)(netstackid_t stack_id, uint8_t protocol,
+    uint32_t spi, void *args);
+extern void (*cl_inet_deletespi)(netstackid_t stack_id, uint8_t protocol,
+    uint32_t spi, void *args);
+extern void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t,
+    sa_family_t, in6_addr_t, in6_addr_t, void *);
+
+
/* Hooks for CGTP (multirt routes) filtering module */
#define CGTP_FILTER_REV_1 1
#define CGTP_FILTER_REV_2 2
@@ -3491,73 +3454,6 @@ extern int ip_cgtp_filter_register(netstackid_t, cgtp_filter_ops_t *);
extern int ip_cgtp_filter_unregister(netstackid_t);
extern int ip_cgtp_filter_is_registered(netstackid_t);
-/* Flags for ire_multirt_lookup() */
-
-#define MULTIRT_USESTAMP 0x0001
-#define MULTIRT_SETSTAMP 0x0002
-#define MULTIRT_CACHEGW 0x0004
-
-/* Debug stuff for multirt route resolution. */
-#if defined(DEBUG) && !defined(__lint)
-/* Our "don't send, rather drop" flag. */
-#define MULTIRT_DEBUG_FLAG 0x8000
-
-#define MULTIRT_TRACE(x) ip2dbg(x)
-
-#define MULTIRT_DEBUG_TAG(mblk) \
- do { \
- ASSERT(mblk != NULL); \
- MULTIRT_TRACE(("%s[%d]: tagging mblk %p, tag was %d\n", \
- __FILE__, __LINE__, \
- (void *)(mblk), (mblk)->b_flag & MULTIRT_DEBUG_FLAG)); \
- (mblk)->b_flag |= MULTIRT_DEBUG_FLAG; \
- } while (0)
-
-#define MULTIRT_DEBUG_UNTAG(mblk) \
- do { \
- ASSERT(mblk != NULL); \
- MULTIRT_TRACE(("%s[%d]: untagging mblk %p, tag was %d\n", \
- __FILE__, __LINE__, \
- (void *)(mblk), (mblk)->b_flag & MULTIRT_DEBUG_FLAG)); \
- (mblk)->b_flag &= ~MULTIRT_DEBUG_FLAG; \
- } while (0)
-
-#define MULTIRT_DEBUG_TAGGED(mblk) \
- (((mblk)->b_flag & MULTIRT_DEBUG_FLAG) ? B_TRUE : B_FALSE)
-#else
-#define MULTIRT_DEBUG_TAG(mblk) ASSERT(mblk != NULL)
-#define MULTIRT_DEBUG_UNTAG(mblk) ASSERT(mblk != NULL)
-#define MULTIRT_DEBUG_TAGGED(mblk) B_FALSE
-#endif
-
-/*
- * Per-ILL Multidata Transmit capabilities.
- */
-struct ill_mdt_capab_s {
- uint_t ill_mdt_version; /* interface version */
- uint_t ill_mdt_on; /* on/off switch for MDT on this ILL */
- uint_t ill_mdt_hdr_head; /* leading header fragment extra space */
- uint_t ill_mdt_hdr_tail; /* trailing header fragment extra space */
- uint_t ill_mdt_max_pld; /* maximum payload buffers per Multidata */
- uint_t ill_mdt_span_limit; /* maximum payload span per packet */
-};
-
-struct ill_hcksum_capab_s {
- uint_t ill_hcksum_version; /* interface version */
- uint_t ill_hcksum_txflags; /* capabilities on transmit */
-};
-
-struct ill_zerocopy_capab_s {
- uint_t ill_zerocopy_version; /* interface version */
- uint_t ill_zerocopy_flags; /* capabilities */
-};
-
-struct ill_lso_capab_s {
- uint_t ill_lso_on; /* on/off switch for LSO on this ILL */
- uint_t ill_lso_flags; /* capabilities */
- uint_t ill_lso_max; /* maximum size of payload */
-};
-
/*
* rr_ring_state cycles in the order shown below from RR_FREE through
* RR_FREE_IN_PROG and back to RR_FREE.
@@ -3669,18 +3565,61 @@ extern void ip_squeue_clean_ring(ill_t *, ill_rx_ring_t *);
extern void ip_squeue_quiesce_ring(ill_t *, ill_rx_ring_t *);
extern void ip_squeue_restart_ring(ill_t *, ill_rx_ring_t *);
extern void ip_squeue_clean_all(ill_t *);
+extern boolean_t ip_source_routed(ipha_t *, ip_stack_t *);
extern void tcp_wput(queue_t *, mblk_t *);
-extern int ip_fill_mtuinfo(struct in6_addr *, in_port_t,
- struct ip6_mtuinfo *, netstack_t *);
-extern ipif_t *conn_get_held_ipif(conn_t *, ipif_t **, int *);
+extern int ip_fill_mtuinfo(conn_t *, ip_xmit_attr_t *,
+ struct ip6_mtuinfo *);
extern hook_t *ipobs_register_hook(netstack_t *, pfv_t);
extern void ipobs_unregister_hook(netstack_t *, hook_t *);
extern void ipobs_hook(mblk_t *, int, zoneid_t, zoneid_t, const ill_t *,
ip_stack_t *);
typedef void (*ipsq_func_t)(ipsq_t *, queue_t *, mblk_t *, void *);
+extern void dce_g_init(void);
+extern void dce_g_destroy(void);
+extern void dce_stack_init(ip_stack_t *);
+extern void dce_stack_destroy(ip_stack_t *);
+extern void dce_cleanup(uint_t, ip_stack_t *);
+extern dce_t *dce_get_default(ip_stack_t *);
+extern dce_t *dce_lookup_pkt(mblk_t *, ip_xmit_attr_t *, uint_t *);
+extern dce_t *dce_lookup_v4(ipaddr_t, ip_stack_t *, uint_t *);
+extern dce_t *dce_lookup_v6(const in6_addr_t *, uint_t, ip_stack_t *,
+ uint_t *);
+extern dce_t *dce_lookup_and_add_v4(ipaddr_t, ip_stack_t *);
+extern dce_t *dce_lookup_and_add_v6(const in6_addr_t *, uint_t,
+ ip_stack_t *);
+extern int dce_update_uinfo_v4(ipaddr_t, iulp_t *, ip_stack_t *);
+extern int dce_update_uinfo_v6(const in6_addr_t *, uint_t, iulp_t *,
+ ip_stack_t *);
+extern int dce_update_uinfo(const in6_addr_t *, uint_t, iulp_t *,
+ ip_stack_t *);
+extern void dce_increment_generation(dce_t *);
+extern void dce_increment_all_generations(boolean_t, ip_stack_t *);
+extern void dce_refrele(dce_t *);
+extern void dce_refhold(dce_t *);
+extern void dce_refrele_notr(dce_t *);
+extern void dce_refhold_notr(dce_t *);
+mblk_t *ip_snmp_get_mib2_ip_dce(queue_t *, mblk_t *, ip_stack_t *ipst);
+
+extern ip_laddr_t ip_laddr_verify_v4(ipaddr_t, zoneid_t,
+ ip_stack_t *, boolean_t);
+extern ip_laddr_t ip_laddr_verify_v6(const in6_addr_t *, zoneid_t,
+ ip_stack_t *, boolean_t, uint_t);
+extern int ip_laddr_fanout_insert(conn_t *);
+
+extern boolean_t ip_verify_src(mblk_t *, ip_xmit_attr_t *, uint_t *);
+extern int ip_verify_ire(mblk_t *, ip_xmit_attr_t *);
+
+extern mblk_t *ip_xmit_attr_to_mblk(ip_xmit_attr_t *);
+extern boolean_t ip_xmit_attr_from_mblk(mblk_t *, ip_xmit_attr_t *);
+extern mblk_t *ip_xmit_attr_free_mblk(mblk_t *);
+extern mblk_t *ip_recv_attr_to_mblk(ip_recv_attr_t *);
+extern boolean_t ip_recv_attr_from_mblk(mblk_t *, ip_recv_attr_t *);
+extern mblk_t *ip_recv_attr_free_mblk(mblk_t *);
+extern boolean_t ip_recv_attr_is_mblk(mblk_t *);
+
/*
* Squeue tags. Tags only need to be unique when the callback function is the
* same to distinguish between different calls, but we use unique tags for
@@ -3729,16 +3668,8 @@ typedef void (*ipsq_func_t)(ipsq_t *, queue_t *, mblk_t *, void *);
#define SQTAG_CONNECT_FINISH 41
#define SQTAG_SYNCHRONOUS_OP 42
#define SQTAG_TCP_SHUTDOWN_OUTPUT 43
-#define SQTAG_XMIT_EARLY_RESET 44
-
-#define NOT_OVER_IP(ip_wq) \
- (ip_wq->q_next != NULL || \
- (ip_wq->q_qinfo->qi_minfo->mi_idname) == NULL || \
- strcmp(ip_wq->q_qinfo->qi_minfo->mi_idname, \
- IP_MOD_NAME) != 0 || \
- ip_wq->q_qinfo->qi_minfo->mi_idnum != IP_MOD_ID)
+#define SQTAG_TCP_IXA_CLEANUP 44
-#define PROTO_FLOW_CNTRLD(connp) (connp->conn_flow_cntrld)
#endif /* _KERNEL */
#ifdef __cplusplus
diff --git a/usr/src/uts/common/inet/ip/conn_opt.c b/usr/src/uts/common/inet/ip/conn_opt.c
new file mode 100644
index 0000000000..a46d7c4cd0
--- /dev/null
+++ b/usr/src/uts/common/inet/ip/conn_opt.c
@@ -0,0 +1,2933 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/* Copyright (c) 1990 Mentat Inc. */
+
+#include <sys/types.h>
+#include <sys/stream.h>
+#include <sys/strsun.h>
+#define _SUN_TPI_VERSION 2
+#include <sys/tihdr.h>
+#include <sys/xti_inet.h>
+#include <sys/ucred.h>
+#include <sys/zone.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/cmn_err.h>
+#include <sys/debug.h>
+#include <sys/atomic.h>
+#include <sys/policy.h>
+
+#include <sys/systm.h>
+#include <sys/param.h>
+#include <sys/kmem.h>
+#include <sys/sdt.h>
+#include <sys/socket.h>
+#include <sys/ethernet.h>
+#include <sys/mac.h>
+#include <net/if.h>
+#include <net/if_types.h>
+#include <net/if_arp.h>
+#include <net/route.h>
+#include <sys/sockio.h>
+#include <netinet/in.h>
+#include <net/if_dl.h>
+
+#include <inet/common.h>
+#include <inet/mi.h>
+#include <inet/mib2.h>
+#include <inet/nd.h>
+#include <inet/arp.h>
+#include <inet/snmpcom.h>
+#include <inet/kstatcom.h>
+
+#include <netinet/igmp_var.h>
+#include <netinet/ip6.h>
+#include <netinet/icmp6.h>
+#include <netinet/sctp.h>
+
+#include <inet/ip.h>
+#include <inet/ip_impl.h>
+#include <inet/ip6.h>
+#include <inet/ip6_asp.h>
+#include <inet/tcp.h>
+#include <inet/ip_multi.h>
+#include <inet/ip_if.h>
+#include <inet/ip_ire.h>
+#include <inet/ip_ftable.h>
+#include <inet/ip_rts.h>
+#include <inet/optcom.h>
+#include <inet/ip_ndp.h>
+#include <inet/ip_listutils.h>
+#include <netinet/igmp.h>
+#include <netinet/ip_mroute.h>
+#include <netinet/udp.h>
+#include <inet/ipp_common.h>
+
+#include <net/pfkeyv2.h>
+#include <inet/sadb.h>
+#include <inet/ipsec_impl.h>
+#include <inet/ipdrop.h>
+#include <inet/ip_netinfo.h>
+
+#include <inet/ipclassifier.h>
+#include <inet/sctp_ip.h>
+#include <inet/sctp/sctp_impl.h>
+#include <inet/udp_impl.h>
+#include <sys/sunddi.h>
+
+#include <sys/tsol/label.h>
+#include <sys/tsol/tnet.h>
+
+static sin_t sin_null; /* Zero address for quick clears */
+static sin6_t sin6_null; /* Zero address for quick clears */
+
+/*
+ * Return how much buffer space is needed for the ancillary data items
+ * that are enabled in `recv_ancillary' and applicable to this packet
+ * (described by `ira' and `ipp').
+ *
+ * Each enabled item is accounted as a struct T_opthdr plus its payload.
+ * The set of checks here must mirror conn_recvancillary_add exactly,
+ * since that function ASSERTs that the buffer sized here is consumed
+ * completely.
+ *
+ * Side effect: for IP_RECVSLLA this may set up ira->ira_l2src (from `mp')
+ * so that conn_recvancillary_add can copy the link-layer source address.
+ */
+uint_t
+conn_recvancillary_size(conn_t *connp, crb_t recv_ancillary,
+    ip_recv_attr_t *ira, mblk_t *mp, ip_pkt_t *ipp)
+{
+	uint_t ancil_size;
+	ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
+
+	/*
+	 * If IP_RECVDSTADDR is set we include the destination IP
+	 * address as an option. With IP_RECVOPTS we include all
+	 * the IP options.
+	 */
+	ancil_size = 0;
+	if (recv_ancillary.crb_recvdstaddr &&
+	    (ira->ira_flags & IRAF_IS_IPV4)) {
+		ancil_size += sizeof (struct T_opthdr) +
+		    sizeof (struct in_addr);
+		IP_STAT(ipst, conn_in_recvdstaddr);
+	}
+
+	/*
+	 * ip_recvpktinfo is used for both AF_INET and AF_INET6 but
+	 * are different
+	 */
+	if (recv_ancillary.crb_ip_recvpktinfo &&
+	    connp->conn_family == AF_INET) {
+		ancil_size += sizeof (struct T_opthdr) +
+		    sizeof (struct in_pktinfo);
+		IP_STAT(ipst, conn_in_recvpktinfo);
+	}
+
+	if ((recv_ancillary.crb_recvopts) &&
+	    (ipp->ipp_fields & IPPF_IPV4_OPTIONS)) {
+		ancil_size += sizeof (struct T_opthdr) +
+		    ipp->ipp_ipv4_options_len;
+		IP_STAT(ipst, conn_in_recvopts);
+	}
+
+	if (recv_ancillary.crb_recvslla) {
+		ill_t *ill;
+
+		/* Make sure ira_l2src is setup if not already */
+		if (!(ira->ira_flags & IRAF_L2SRC_SET)) {
+			ill = ill_lookup_on_ifindex(ira->ira_rifindex, B_FALSE,
+			    ipst);
+			if (ill != NULL) {
+				ip_setl2src(mp, ira, ill);
+				ill_refrele(ill);
+			}
+		}
+		ancil_size += sizeof (struct T_opthdr) +
+		    sizeof (struct sockaddr_dl);
+		IP_STAT(ipst, conn_in_recvslla);
+	}
+
+	if (recv_ancillary.crb_recvif) {
+		ancil_size += sizeof (struct T_opthdr) + sizeof (uint_t);
+		IP_STAT(ipst, conn_in_recvif);
+	}
+
+	/*
+	 * ip_recvpktinfo is used for both AF_INET and AF_INET6 but
+	 * are different
+	 */
+	if (recv_ancillary.crb_ip_recvpktinfo &&
+	    connp->conn_family == AF_INET6) {
+		ancil_size += sizeof (struct T_opthdr) +
+		    sizeof (struct in6_pktinfo);
+		IP_STAT(ipst, conn_in_recvpktinfo);
+	}
+
+	if (recv_ancillary.crb_ipv6_recvhoplimit) {
+		ancil_size += sizeof (struct T_opthdr) + sizeof (int);
+		IP_STAT(ipst, conn_in_recvhoplimit);
+	}
+
+	if (recv_ancillary.crb_ipv6_recvtclass) {
+		ancil_size += sizeof (struct T_opthdr) + sizeof (int);
+		IP_STAT(ipst, conn_in_recvtclass);
+	}
+
+	if (recv_ancillary.crb_ipv6_recvhopopts &&
+	    (ipp->ipp_fields & IPPF_HOPOPTS)) {
+		ancil_size += sizeof (struct T_opthdr) + ipp->ipp_hopoptslen;
+		IP_STAT(ipst, conn_in_recvhopopts);
+	}
+	/*
+	 * To honor RFC3542 when an application asks for both IPV6_RECVDSTOPTS
+	 * and IPV6_RECVRTHDR, we pass up the item rthdrdstopts (the destination
+	 * options that appear before a routing header).
+	 * We also pass them up if IPV6_RECVRTHDRDSTOPTS is set.
+	 */
+	if (ipp->ipp_fields & IPPF_RTHDRDSTOPTS) {
+		if (recv_ancillary.crb_ipv6_recvrthdrdstopts ||
+		    (recv_ancillary.crb_ipv6_recvdstopts &&
+		    recv_ancillary.crb_ipv6_recvrthdr)) {
+			ancil_size += sizeof (struct T_opthdr) +
+			    ipp->ipp_rthdrdstoptslen;
+			IP_STAT(ipst, conn_in_recvrthdrdstopts);
+		}
+	}
+	if ((recv_ancillary.crb_ipv6_recvrthdr) &&
+	    (ipp->ipp_fields & IPPF_RTHDR)) {
+		ancil_size += sizeof (struct T_opthdr) + ipp->ipp_rthdrlen;
+		IP_STAT(ipst, conn_in_recvrthdr);
+	}
+	if ((recv_ancillary.crb_ipv6_recvdstopts ||
+	    recv_ancillary.crb_old_ipv6_recvdstopts) &&
+	    (ipp->ipp_fields & IPPF_DSTOPTS)) {
+		ancil_size += sizeof (struct T_opthdr) + ipp->ipp_dstoptslen;
+		IP_STAT(ipst, conn_in_recvdstopts);
+	}
+	if (recv_ancillary.crb_recvucred && ira->ira_cred != NULL) {
+		ancil_size += sizeof (struct T_opthdr) + ucredsize;
+		IP_STAT(ipst, conn_in_recvucred);
+	}
+
+	/*
+	 * If SO_TIMESTAMP is set allocate the appropriate sized
+	 * buffer. Since gethrestime() expects a pointer aligned
+	 * argument, we allocate space necessary for extra
+	 * alignment (even though it might not be used).
+	 */
+	if (recv_ancillary.crb_timestamp) {
+		ancil_size += sizeof (struct T_opthdr) +
+		    sizeof (timestruc_t) + _POINTER_ALIGNMENT;
+		IP_STAT(ipst, conn_in_timestamp);
+	}
+
+	/*
+	 * If IP_RECVTTL is set allocate the appropriate sized buffer
+	 */
+	if (recv_ancillary.crb_recvttl &&
+	    (ira->ira_flags & IRAF_IS_IPV4)) {
+		ancil_size += sizeof (struct T_opthdr) + sizeof (uint8_t);
+		IP_STAT(ipst, conn_in_recvttl);
+	}
+
+	return (ancil_size);
+}
+
+/*
+ * Lay down the ancillary data items at "ancil_buf".
+ * Assumes caller has used conn_recvancillary_size to allocate a sufficiently
+ * large buffer - ancil_size.
+ *
+ * Each enabled item is emitted as a struct T_opthdr immediately followed by
+ * its payload.  The set of conditions here must mirror
+ * conn_recvancillary_size exactly; the ASSERT at the end verifies that the
+ * buffer sized there is consumed completely.
+ */
+void
+conn_recvancillary_add(conn_t *connp, crb_t recv_ancillary,
+    ip_recv_attr_t *ira, ip_pkt_t *ipp, uchar_t *ancil_buf, uint_t ancil_size)
+{
+	/*
+	 * Copy in destination address before options to avoid
+	 * any padding issues.
+	 */
+	if (recv_ancillary.crb_recvdstaddr &&
+	    (ira->ira_flags & IRAF_IS_IPV4)) {
+		struct T_opthdr *toh;
+		ipaddr_t *dstptr;
+
+		toh = (struct T_opthdr *)ancil_buf;
+		toh->level = IPPROTO_IP;
+		toh->name = IP_RECVDSTADDR;
+		toh->len = sizeof (struct T_opthdr) + sizeof (ipaddr_t);
+		toh->status = 0;
+		ancil_buf += sizeof (struct T_opthdr);
+		dstptr = (ipaddr_t *)ancil_buf;
+		*dstptr = ipp->ipp_addr_v4;
+		ancil_buf += sizeof (ipaddr_t);
+		ancil_size -= toh->len;
+	}
+
+	/*
+	 * ip_recvpktinfo is used for both AF_INET and AF_INET6 but
+	 * are different
+	 */
+	if (recv_ancillary.crb_ip_recvpktinfo &&
+	    connp->conn_family == AF_INET) {
+		ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
+		struct T_opthdr *toh;
+		struct in_pktinfo *pktinfop;
+		ill_t *ill;
+		ipif_t *ipif;
+
+		toh = (struct T_opthdr *)ancil_buf;
+		toh->level = IPPROTO_IP;
+		toh->name = IP_PKTINFO;
+		toh->len = sizeof (struct T_opthdr) + sizeof (*pktinfop);
+		toh->status = 0;
+		ancil_buf += sizeof (struct T_opthdr);
+		pktinfop = (struct in_pktinfo *)ancil_buf;
+
+		pktinfop->ipi_ifindex = ira->ira_ruifindex;
+		pktinfop->ipi_spec_dst.s_addr = INADDR_ANY;
+
+		/* Find a good address to report */
+		ill = ill_lookup_on_ifindex(ira->ira_ruifindex, B_FALSE, ipst);
+		if (ill != NULL) {
+			ipif = ipif_good_addr(ill, IPCL_ZONEID(connp));
+			if (ipif != NULL) {
+				pktinfop->ipi_spec_dst.s_addr =
+				    ipif->ipif_lcl_addr;
+				ipif_refrele(ipif);
+			}
+			ill_refrele(ill);
+		}
+		/* ipi_spec_dst stays INADDR_ANY if no usable ipif was found */
+		pktinfop->ipi_addr.s_addr = ipp->ipp_addr_v4;
+		ancil_buf += sizeof (struct in_pktinfo);
+		ancil_size -= toh->len;
+	}
+
+	if ((recv_ancillary.crb_recvopts) &&
+	    (ipp->ipp_fields & IPPF_IPV4_OPTIONS)) {
+		struct T_opthdr *toh;
+
+		toh = (struct T_opthdr *)ancil_buf;
+		toh->level = IPPROTO_IP;
+		toh->name = IP_RECVOPTS;
+		toh->len = sizeof (struct T_opthdr) + ipp->ipp_ipv4_options_len;
+		toh->status = 0;
+		ancil_buf += sizeof (struct T_opthdr);
+		bcopy(ipp->ipp_ipv4_options, ancil_buf,
+		    ipp->ipp_ipv4_options_len);
+		ancil_buf += ipp->ipp_ipv4_options_len;
+		ancil_size -= toh->len;
+	}
+
+	if (recv_ancillary.crb_recvslla) {
+		ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
+		struct T_opthdr *toh;
+		struct sockaddr_dl *dstptr;
+		ill_t *ill;
+		int alen = 0;
+
+		ill = ill_lookup_on_ifindex(ira->ira_rifindex, B_FALSE, ipst);
+		if (ill != NULL)
+			alen = ill->ill_phys_addr_length;
+
+		/*
+		 * For loopback multicast and broadcast the packet arrives
+		 * with ira_ruifdex being the physical interface, but
+		 * ira_l2src is all zero since ip_postfrag_loopback doesn't
+		 * know our l2src. We don't report the address in that case.
+		 */
+		if (ira->ira_flags & IRAF_LOOPBACK)
+			alen = 0;
+
+		toh = (struct T_opthdr *)ancil_buf;
+		toh->level = IPPROTO_IP;
+		toh->name = IP_RECVSLLA;
+		toh->len = sizeof (struct T_opthdr) +
+		    sizeof (struct sockaddr_dl);
+		toh->status = 0;
+		ancil_buf += sizeof (struct T_opthdr);
+		dstptr = (struct sockaddr_dl *)ancil_buf;
+		dstptr->sdl_family = AF_LINK;
+		dstptr->sdl_index = ira->ira_ruifindex;
+		if (ill != NULL)
+			dstptr->sdl_type = ill->ill_type;
+		else
+			dstptr->sdl_type = 0;
+		dstptr->sdl_nlen = 0;
+		dstptr->sdl_alen = alen;
+		dstptr->sdl_slen = 0;
+		/* ira_l2src was set up by conn_recvancillary_size */
+		bcopy(ira->ira_l2src, dstptr->sdl_data, alen);
+		ancil_buf += sizeof (struct sockaddr_dl);
+		ancil_size -= toh->len;
+		if (ill != NULL)
+			ill_refrele(ill);
+	}
+
+	if (recv_ancillary.crb_recvif) {
+		struct T_opthdr *toh;
+		uint_t *dstptr;
+
+		toh = (struct T_opthdr *)ancil_buf;
+		toh->level = IPPROTO_IP;
+		toh->name = IP_RECVIF;
+		toh->len = sizeof (struct T_opthdr) + sizeof (uint_t);
+		toh->status = 0;
+		ancil_buf += sizeof (struct T_opthdr);
+		dstptr = (uint_t *)ancil_buf;
+		*dstptr = ira->ira_ruifindex;
+		ancil_buf += sizeof (uint_t);
+		ancil_size -= toh->len;
+	}
+
+	/*
+	 * ip_recvpktinfo is used for both AF_INET and AF_INET6 but
+	 * are different
+	 */
+	if (recv_ancillary.crb_ip_recvpktinfo &&
+	    connp->conn_family == AF_INET6) {
+		struct T_opthdr *toh;
+		struct in6_pktinfo *pkti;
+
+		toh = (struct T_opthdr *)ancil_buf;
+		toh->level = IPPROTO_IPV6;
+		toh->name = IPV6_PKTINFO;
+		toh->len = sizeof (struct T_opthdr) + sizeof (*pkti);
+		toh->status = 0;
+		ancil_buf += sizeof (struct T_opthdr);
+		pkti = (struct in6_pktinfo *)ancil_buf;
+		/* A v4 packet on an AF_INET6 socket is reported v4-mapped */
+		if (ira->ira_flags & IRAF_IS_IPV4) {
+			IN6_IPADDR_TO_V4MAPPED(ipp->ipp_addr_v4,
+			    &pkti->ipi6_addr);
+		} else {
+			pkti->ipi6_addr = ipp->ipp_addr;
+		}
+		pkti->ipi6_ifindex = ira->ira_ruifindex;
+
+		ancil_buf += sizeof (*pkti);
+		ancil_size -= toh->len;
+	}
+	if (recv_ancillary.crb_ipv6_recvhoplimit) {
+		struct T_opthdr *toh;
+
+		toh = (struct T_opthdr *)ancil_buf;
+		toh->level = IPPROTO_IPV6;
+		toh->name = IPV6_HOPLIMIT;
+		toh->len = sizeof (struct T_opthdr) + sizeof (uint_t);
+		toh->status = 0;
+		ancil_buf += sizeof (struct T_opthdr);
+		*(uint_t *)ancil_buf = ipp->ipp_hoplimit;
+		ancil_buf += sizeof (uint_t);
+		ancil_size -= toh->len;
+	}
+	if (recv_ancillary.crb_ipv6_recvtclass) {
+		struct T_opthdr *toh;
+
+		toh = (struct T_opthdr *)ancil_buf;
+		toh->level = IPPROTO_IPV6;
+		toh->name = IPV6_TCLASS;
+		toh->len = sizeof (struct T_opthdr) + sizeof (uint_t);
+		toh->status = 0;
+		ancil_buf += sizeof (struct T_opthdr);
+
+		/* For a v4 packet the IPv4 TOS is reported as the tclass */
+		if (ira->ira_flags & IRAF_IS_IPV4)
+			*(uint_t *)ancil_buf = ipp->ipp_type_of_service;
+		else
+			*(uint_t *)ancil_buf = ipp->ipp_tclass;
+		ancil_buf += sizeof (uint_t);
+		ancil_size -= toh->len;
+	}
+	if (recv_ancillary.crb_ipv6_recvhopopts &&
+	    (ipp->ipp_fields & IPPF_HOPOPTS)) {
+		struct T_opthdr *toh;
+
+		toh = (struct T_opthdr *)ancil_buf;
+		toh->level = IPPROTO_IPV6;
+		toh->name = IPV6_HOPOPTS;
+		toh->len = sizeof (struct T_opthdr) + ipp->ipp_hopoptslen;
+		toh->status = 0;
+		ancil_buf += sizeof (struct T_opthdr);
+		bcopy(ipp->ipp_hopopts, ancil_buf, ipp->ipp_hopoptslen);
+		ancil_buf += ipp->ipp_hopoptslen;
+		ancil_size -= toh->len;
+	}
+	/*
+	 * To honor RFC3542 when an application asks for both IPV6_RECVDSTOPTS
+	 * and IPV6_RECVRTHDR, we pass up the item rthdrdstopts (the destination
+	 * options that appear before a routing header).
+	 * We also pass them up if IPV6_RECVRTHDRDSTOPTS is set.
+	 */
+	if (ipp->ipp_fields & IPPF_RTHDRDSTOPTS) {
+		if (recv_ancillary.crb_ipv6_recvrthdrdstopts ||
+		    (recv_ancillary.crb_ipv6_recvdstopts &&
+		    recv_ancillary.crb_ipv6_recvrthdr)) {
+			struct T_opthdr *toh;
+
+			toh = (struct T_opthdr *)ancil_buf;
+			toh->level = IPPROTO_IPV6;
+			toh->name = IPV6_DSTOPTS;
+			toh->len = sizeof (struct T_opthdr) +
+			    ipp->ipp_rthdrdstoptslen;
+			toh->status = 0;
+			ancil_buf += sizeof (struct T_opthdr);
+			bcopy(ipp->ipp_rthdrdstopts, ancil_buf,
+			    ipp->ipp_rthdrdstoptslen);
+			ancil_buf += ipp->ipp_rthdrdstoptslen;
+			ancil_size -= toh->len;
+		}
+	}
+	if (recv_ancillary.crb_ipv6_recvrthdr &&
+	    (ipp->ipp_fields & IPPF_RTHDR)) {
+		struct T_opthdr *toh;
+
+		toh = (struct T_opthdr *)ancil_buf;
+		toh->level = IPPROTO_IPV6;
+		toh->name = IPV6_RTHDR;
+		toh->len = sizeof (struct T_opthdr) + ipp->ipp_rthdrlen;
+		toh->status = 0;
+		ancil_buf += sizeof (struct T_opthdr);
+		bcopy(ipp->ipp_rthdr, ancil_buf, ipp->ipp_rthdrlen);
+		ancil_buf += ipp->ipp_rthdrlen;
+		ancil_size -= toh->len;
+	}
+	if ((recv_ancillary.crb_ipv6_recvdstopts ||
+	    recv_ancillary.crb_old_ipv6_recvdstopts) &&
+	    (ipp->ipp_fields & IPPF_DSTOPTS)) {
+		struct T_opthdr *toh;
+
+		toh = (struct T_opthdr *)ancil_buf;
+		toh->level = IPPROTO_IPV6;
+		toh->name = IPV6_DSTOPTS;
+		toh->len = sizeof (struct T_opthdr) + ipp->ipp_dstoptslen;
+		toh->status = 0;
+		ancil_buf += sizeof (struct T_opthdr);
+		bcopy(ipp->ipp_dstopts, ancil_buf, ipp->ipp_dstoptslen);
+		ancil_buf += ipp->ipp_dstoptslen;
+		ancil_size -= toh->len;
+	}
+
+	if (recv_ancillary.crb_recvucred && ira->ira_cred != NULL) {
+		struct T_opthdr *toh;
+		cred_t *rcr = connp->conn_cred;
+
+		toh = (struct T_opthdr *)ancil_buf;
+		toh->level = SOL_SOCKET;
+		toh->name = SCM_UCRED;
+		toh->len = sizeof (struct T_opthdr) + ucredsize;
+		toh->status = 0;
+		/* The ucred payload starts immediately after the T_opthdr */
+		(void) cred2ucred(ira->ira_cred, ira->ira_cpid, &toh[1], rcr);
+		ancil_buf += toh->len;
+		ancil_size -= toh->len;
+	}
+	if (recv_ancillary.crb_timestamp) {
+		struct T_opthdr *toh;
+
+		toh = (struct T_opthdr *)ancil_buf;
+		toh->level = SOL_SOCKET;
+		toh->name = SCM_TIMESTAMP;
+		/* Extra _POINTER_ALIGNMENT bytes were reserved for alignment */
+		toh->len = sizeof (struct T_opthdr) +
+		    sizeof (timestruc_t) + _POINTER_ALIGNMENT;
+		toh->status = 0;
+		ancil_buf += sizeof (struct T_opthdr);
+		/* Align for gethrestime() */
+		ancil_buf = (uchar_t *)P2ROUNDUP((intptr_t)ancil_buf,
+		    sizeof (intptr_t));
+		gethrestime((timestruc_t *)ancil_buf);
+		ancil_buf = (uchar_t *)toh + toh->len;
+		ancil_size -= toh->len;
+	}
+
+	/*
+	 * CAUTION:
+	 * Due to aligment issues
+	 * Processing of IP_RECVTTL option
+	 * should always be the last. Adding
+	 * any option processing after this will
+	 * cause alignment panic.
+	 */
+	if (recv_ancillary.crb_recvttl &&
+	    (ira->ira_flags & IRAF_IS_IPV4)) {
+		struct T_opthdr *toh;
+		uint8_t *dstptr;
+
+		toh = (struct T_opthdr *)ancil_buf;
+		toh->level = IPPROTO_IP;
+		toh->name = IP_RECVTTL;
+		toh->len = sizeof (struct T_opthdr) + sizeof (uint8_t);
+		toh->status = 0;
+		ancil_buf += sizeof (struct T_opthdr);
+		dstptr = (uint8_t *)ancil_buf;
+		/* For IPv4 packets ipp_hoplimit carries the received TTL */
+		*dstptr = ipp->ipp_hoplimit;
+		ancil_buf += sizeof (uint8_t);
+		ancil_size -= toh->len;
+	}
+
+	/* Consumed all of allocated space */
+	ASSERT(ancil_size == 0);
+
+}
+
+/*
+ * This routine retrieves the current status of socket options.
+ * It returns the size of the option retrieved, or -1.
+ */
+int
+conn_opt_get(conn_opt_arg_t *coa, t_scalar_t level, t_scalar_t name,
+ uchar_t *ptr)
+{
+ int *i1 = (int *)ptr;
+ conn_t *connp = coa->coa_connp;
+ ip_xmit_attr_t *ixa = coa->coa_ixa;
+ ip_pkt_t *ipp = coa->coa_ipp;
+ ip_stack_t *ipst = ixa->ixa_ipst;
+ uint_t len;
+
+ ASSERT(MUTEX_HELD(&coa->coa_connp->conn_lock));
+
+ switch (level) {
+ case SOL_SOCKET:
+ switch (name) {
+ case SO_DEBUG:
+ *i1 = connp->conn_debug ? SO_DEBUG : 0;
+ break; /* goto sizeof (int) option return */
+ case SO_KEEPALIVE:
+ *i1 = connp->conn_keepalive ? SO_KEEPALIVE : 0;
+ break;
+ case SO_LINGER: {
+ struct linger *lgr = (struct linger *)ptr;
+
+ lgr->l_onoff = connp->conn_linger ? SO_LINGER : 0;
+ lgr->l_linger = connp->conn_lingertime;
+ }
+ return (sizeof (struct linger));
+
+ case SO_OOBINLINE:
+ *i1 = connp->conn_oobinline ? SO_OOBINLINE : 0;
+ break;
+ case SO_REUSEADDR:
+ *i1 = connp->conn_reuseaddr ? SO_REUSEADDR : 0;
+ break; /* goto sizeof (int) option return */
+ case SO_TYPE:
+ *i1 = connp->conn_so_type;
+ break; /* goto sizeof (int) option return */
+ case SO_DONTROUTE:
+ *i1 = (ixa->ixa_flags & IXAF_DONTROUTE) ?
+ SO_DONTROUTE : 0;
+ break; /* goto sizeof (int) option return */
+ case SO_USELOOPBACK:
+ *i1 = connp->conn_useloopback ? SO_USELOOPBACK : 0;
+ break; /* goto sizeof (int) option return */
+ case SO_BROADCAST:
+ *i1 = connp->conn_broadcast ? SO_BROADCAST : 0;
+ break; /* goto sizeof (int) option return */
+
+ case SO_SNDBUF:
+ *i1 = connp->conn_sndbuf;
+ break; /* goto sizeof (int) option return */
+ case SO_RCVBUF:
+ *i1 = connp->conn_rcvbuf;
+ break; /* goto sizeof (int) option return */
+ case SO_RCVTIMEO:
+ case SO_SNDTIMEO:
+ /*
+ * Pass these two options in order for third part
+ * protocol usage. Here just return directly.
+ */
+ *i1 = 0;
+ break;
+ case SO_DGRAM_ERRIND:
+ *i1 = connp->conn_dgram_errind ? SO_DGRAM_ERRIND : 0;
+ break; /* goto sizeof (int) option return */
+ case SO_RECVUCRED:
+ *i1 = connp->conn_recv_ancillary.crb_recvucred;
+ break; /* goto sizeof (int) option return */
+ case SO_TIMESTAMP:
+ *i1 = connp->conn_recv_ancillary.crb_timestamp;
+ break; /* goto sizeof (int) option return */
+#ifdef SO_VRRP
+ case SO_VRRP:
+ *i1 = connp->conn_isvrrp;
+ break; /* goto sizeof (int) option return */
+#endif
+ case SO_ANON_MLP:
+ *i1 = connp->conn_anon_mlp;
+ break; /* goto sizeof (int) option return */
+ case SO_MAC_EXEMPT:
+ *i1 = (connp->conn_mac_mode == CONN_MAC_AWARE);
+ break; /* goto sizeof (int) option return */
+ case SO_MAC_IMPLICIT:
+ *i1 = (connp->conn_mac_mode == CONN_MAC_IMPLICIT);
+ break; /* goto sizeof (int) option return */
+ case SO_ALLZONES:
+ *i1 = connp->conn_allzones;
+ break; /* goto sizeof (int) option return */
+ case SO_EXCLBIND:
+ *i1 = connp->conn_exclbind ? SO_EXCLBIND : 0;
+ break;
+ case SO_PROTOTYPE:
+ *i1 = connp->conn_proto;
+ break;
+
+ case SO_DOMAIN:
+ *i1 = connp->conn_family;
+ break;
+ default:
+ return (-1);
+ }
+ break;
+ case IPPROTO_IP:
+ if (connp->conn_family != AF_INET)
+ return (-1);
+ switch (name) {
+ case IP_OPTIONS:
+ case T_IP_OPTIONS:
+ if (!(ipp->ipp_fields & IPPF_IPV4_OPTIONS))
+ return (0);
+
+ len = ipp->ipp_ipv4_options_len;
+ if (len > 0) {
+ bcopy(ipp->ipp_ipv4_options, ptr, len);
+ }
+ return (len);
+
+ case IP_PKTINFO: {
+ /*
+ * This also handles IP_RECVPKTINFO.
+ * IP_PKTINFO and IP_RECVPKTINFO have same value.
+ * Differentiation is based on the size of the
+ * argument passed in.
+ */
+ struct in_pktinfo *pktinfo;
+
+#ifdef notdef
+ /* optcom doesn't provide a length with "get" */
+ if (inlen == sizeof (int)) {
+ /* This is IP_RECVPKTINFO option. */
+ *i1 = connp->conn_recv_ancillary.
+ crb_ip_recvpktinfo;
+ return (sizeof (int));
+ }
+#endif
+ /* XXX assumes that caller has room for max size! */
+
+ pktinfo = (struct in_pktinfo *)ptr;
+ pktinfo->ipi_ifindex = ixa->ixa_ifindex;
+ if (ipp->ipp_fields & IPPF_ADDR)
+ pktinfo->ipi_spec_dst.s_addr = ipp->ipp_addr_v4;
+ else
+ pktinfo->ipi_spec_dst.s_addr = INADDR_ANY;
+ return (sizeof (struct in_pktinfo));
+ }
+ case IP_DONTFRAG:
+ *i1 = (ixa->ixa_flags & IXAF_DONTFRAG) != 0;
+ return (sizeof (int));
+ case IP_TOS:
+ case T_IP_TOS:
+ *i1 = (int)ipp->ipp_type_of_service;
+ break; /* goto sizeof (int) option return */
+ case IP_TTL:
+ *i1 = (int)ipp->ipp_unicast_hops;
+ break; /* goto sizeof (int) option return */
+ case IP_DHCPINIT_IF:
+ return (-1);
+ case IP_NEXTHOP:
+ if (ixa->ixa_flags & IXAF_NEXTHOP_SET) {
+ *(ipaddr_t *)ptr = ixa->ixa_nexthop_v4;
+ return (sizeof (ipaddr_t));
+ } else {
+ return (0);
+ }
+
+ case IP_MULTICAST_IF:
+ /* 0 address if not set */
+ *(ipaddr_t *)ptr = ixa->ixa_multicast_ifaddr;
+ return (sizeof (ipaddr_t));
+ case IP_MULTICAST_TTL:
+ *(uchar_t *)ptr = ixa->ixa_multicast_ttl;
+ return (sizeof (uchar_t));
+ case IP_MULTICAST_LOOP:
+ *ptr = (ixa->ixa_flags & IXAF_MULTICAST_LOOP) ? 1 : 0;
+ return (sizeof (uint8_t));
+ case IP_RECVOPTS:
+ *i1 = connp->conn_recv_ancillary.crb_recvopts;
+ break; /* goto sizeof (int) option return */
+ case IP_RECVDSTADDR:
+ *i1 = connp->conn_recv_ancillary.crb_recvdstaddr;
+ break; /* goto sizeof (int) option return */
+ case IP_RECVIF:
+ *i1 = connp->conn_recv_ancillary.crb_recvif;
+ break; /* goto sizeof (int) option return */
+ case IP_RECVSLLA:
+ *i1 = connp->conn_recv_ancillary.crb_recvslla;
+ break; /* goto sizeof (int) option return */
+ case IP_RECVTTL:
+ *i1 = connp->conn_recv_ancillary.crb_recvttl;
+ break; /* goto sizeof (int) option return */
+ case IP_ADD_MEMBERSHIP:
+ case IP_DROP_MEMBERSHIP:
+ case MCAST_JOIN_GROUP:
+ case MCAST_LEAVE_GROUP:
+ case IP_BLOCK_SOURCE:
+ case IP_UNBLOCK_SOURCE:
+ case IP_ADD_SOURCE_MEMBERSHIP:
+ case IP_DROP_SOURCE_MEMBERSHIP:
+ case MCAST_BLOCK_SOURCE:
+ case MCAST_UNBLOCK_SOURCE:
+ case MCAST_JOIN_SOURCE_GROUP:
+ case MCAST_LEAVE_SOURCE_GROUP:
+ case MRT_INIT:
+ case MRT_DONE:
+ case MRT_ADD_VIF:
+ case MRT_DEL_VIF:
+ case MRT_ADD_MFC:
+ case MRT_DEL_MFC:
+ /* cannot "get" the value for these */
+ return (-1);
+ case MRT_VERSION:
+ case MRT_ASSERT:
+ (void) ip_mrouter_get(name, connp, ptr);
+ return (sizeof (int));
+ case IP_SEC_OPT:
+ return (ipsec_req_from_conn(connp, (ipsec_req_t *)ptr,
+ IPSEC_AF_V4));
+ case IP_BOUND_IF:
+ /* Zero if not set */
+ *i1 = connp->conn_bound_if;
+ break; /* goto sizeof (int) option return */
+ case IP_UNSPEC_SRC:
+ *i1 = connp->conn_unspec_src;
+ break; /* goto sizeof (int) option return */
+ case IP_BROADCAST_TTL:
+ if (ixa->ixa_flags & IXAF_BROADCAST_TTL_SET)
+ *(uchar_t *)ptr = ixa->ixa_broadcast_ttl;
+ else
+ *(uchar_t *)ptr = ipst->ips_ip_broadcast_ttl;
+ return (sizeof (uchar_t));
+ default:
+ return (-1);
+ }
+ break;
+ case IPPROTO_IPV6:
+ if (connp->conn_family != AF_INET6)
+ return (-1);
+ switch (name) {
+ case IPV6_UNICAST_HOPS:
+ *i1 = (int)ipp->ipp_unicast_hops;
+ break; /* goto sizeof (int) option return */
+ case IPV6_MULTICAST_IF:
+ /* 0 index if not set */
+ *i1 = ixa->ixa_multicast_ifindex;
+ break; /* goto sizeof (int) option return */
+ case IPV6_MULTICAST_HOPS:
+ *i1 = ixa->ixa_multicast_ttl;
+ break; /* goto sizeof (int) option return */
+ case IPV6_MULTICAST_LOOP:
+ *i1 = (ixa->ixa_flags & IXAF_MULTICAST_LOOP) ? 1 : 0;
+ break; /* goto sizeof (int) option return */
+ case IPV6_JOIN_GROUP:
+ case IPV6_LEAVE_GROUP:
+ case MCAST_JOIN_GROUP:
+ case MCAST_LEAVE_GROUP:
+ case MCAST_BLOCK_SOURCE:
+ case MCAST_UNBLOCK_SOURCE:
+ case MCAST_JOIN_SOURCE_GROUP:
+ case MCAST_LEAVE_SOURCE_GROUP:
+ /* cannot "get" the value for these */
+ return (-1);
+ case IPV6_BOUND_IF:
+ /* Zero if not set */
+ *i1 = connp->conn_bound_if;
+ break; /* goto sizeof (int) option return */
+ case IPV6_UNSPEC_SRC:
+ *i1 = connp->conn_unspec_src;
+ break; /* goto sizeof (int) option return */
+ case IPV6_RECVPKTINFO:
+ *i1 = connp->conn_recv_ancillary.crb_ip_recvpktinfo;
+ break; /* goto sizeof (int) option return */
+ case IPV6_RECVTCLASS:
+ *i1 = connp->conn_recv_ancillary.crb_ipv6_recvtclass;
+ break; /* goto sizeof (int) option return */
+ case IPV6_RECVPATHMTU:
+ *i1 = connp->conn_ipv6_recvpathmtu;
+ break; /* goto sizeof (int) option return */
+ case IPV6_RECVHOPLIMIT:
+ *i1 = connp->conn_recv_ancillary.crb_ipv6_recvhoplimit;
+ break; /* goto sizeof (int) option return */
+ case IPV6_RECVHOPOPTS:
+ *i1 = connp->conn_recv_ancillary.crb_ipv6_recvhopopts;
+ break; /* goto sizeof (int) option return */
+ case IPV6_RECVDSTOPTS:
+ *i1 = connp->conn_recv_ancillary.crb_ipv6_recvdstopts;
+ break; /* goto sizeof (int) option return */
+ case _OLD_IPV6_RECVDSTOPTS:
+ *i1 =
+ connp->conn_recv_ancillary.crb_old_ipv6_recvdstopts;
+ break; /* goto sizeof (int) option return */
+ case IPV6_RECVRTHDRDSTOPTS:
+ *i1 = connp->conn_recv_ancillary.
+ crb_ipv6_recvrthdrdstopts;
+ break; /* goto sizeof (int) option return */
+ case IPV6_RECVRTHDR:
+ *i1 = connp->conn_recv_ancillary.crb_ipv6_recvrthdr;
+ break; /* goto sizeof (int) option return */
+ case IPV6_PKTINFO: {
+ /* XXX assumes that caller has room for max size! */
+ struct in6_pktinfo *pkti;
+
+ pkti = (struct in6_pktinfo *)ptr;
+ pkti->ipi6_ifindex = ixa->ixa_ifindex;
+ if (ipp->ipp_fields & IPPF_ADDR)
+ pkti->ipi6_addr = ipp->ipp_addr;
+ else
+ pkti->ipi6_addr = ipv6_all_zeros;
+ return (sizeof (struct in6_pktinfo));
+ }
+ case IPV6_TCLASS:
+ *i1 = ipp->ipp_tclass;
+ break; /* goto sizeof (int) option return */
+ case IPV6_NEXTHOP: {
+ sin6_t *sin6 = (sin6_t *)ptr;
+
+ if (ixa->ixa_flags & IXAF_NEXTHOP_SET)
+ return (0);
+
+ *sin6 = sin6_null;
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_addr = ixa->ixa_nexthop_v6;
+
+ return (sizeof (sin6_t));
+ }
+ case IPV6_HOPOPTS:
+ if (!(ipp->ipp_fields & IPPF_HOPOPTS))
+ return (0);
+ bcopy(ipp->ipp_hopopts, ptr,
+ ipp->ipp_hopoptslen);
+ return (ipp->ipp_hopoptslen);
+ case IPV6_RTHDRDSTOPTS:
+ if (!(ipp->ipp_fields & IPPF_RTHDRDSTOPTS))
+ return (0);
+ bcopy(ipp->ipp_rthdrdstopts, ptr,
+ ipp->ipp_rthdrdstoptslen);
+ return (ipp->ipp_rthdrdstoptslen);
+ case IPV6_RTHDR:
+ if (!(ipp->ipp_fields & IPPF_RTHDR))
+ return (0);
+ bcopy(ipp->ipp_rthdr, ptr, ipp->ipp_rthdrlen);
+ return (ipp->ipp_rthdrlen);
+ case IPV6_DSTOPTS:
+ if (!(ipp->ipp_fields & IPPF_DSTOPTS))
+ return (0);
+ bcopy(ipp->ipp_dstopts, ptr, ipp->ipp_dstoptslen);
+ return (ipp->ipp_dstoptslen);
+ case IPV6_PATHMTU:
+ return (ip_fill_mtuinfo(connp, ixa,
+ (struct ip6_mtuinfo *)ptr));
+ case IPV6_SEC_OPT:
+ return (ipsec_req_from_conn(connp, (ipsec_req_t *)ptr,
+ IPSEC_AF_V6));
+ case IPV6_SRC_PREFERENCES:
+ return (ip6_get_src_preferences(ixa, (uint32_t *)ptr));
+ case IPV6_DONTFRAG:
+ *i1 = (ixa->ixa_flags & IXAF_DONTFRAG) != 0;
+ return (sizeof (int));
+ case IPV6_USE_MIN_MTU:
+ if (ixa->ixa_flags & IXAF_USE_MIN_MTU)
+ *i1 = ixa->ixa_use_min_mtu;
+ else
+ *i1 = IPV6_USE_MIN_MTU_MULTICAST;
+ break;
+ case IPV6_V6ONLY:
+ *i1 = connp->conn_ipv6_v6only;
+ return (sizeof (int));
+ default:
+ return (-1);
+ }
+ break;
+ case IPPROTO_UDP:
+ switch (name) {
+ case UDP_ANONPRIVBIND:
+ *i1 = connp->conn_anon_priv_bind;
+ break;
+ case UDP_EXCLBIND:
+ *i1 = connp->conn_exclbind ? UDP_EXCLBIND : 0;
+ break;
+ default:
+ return (-1);
+ }
+ break;
+ case IPPROTO_TCP:
+ switch (name) {
+ case TCP_RECVDSTADDR:
+ *i1 = connp->conn_recv_ancillary.crb_recvdstaddr;
+ break;
+ case TCP_ANONPRIVBIND:
+ *i1 = connp->conn_anon_priv_bind;
+ break;
+ case TCP_EXCLBIND:
+ *i1 = connp->conn_exclbind ? TCP_EXCLBIND : 0;
+ break;
+ default:
+ return (-1);
+ }
+ break;
+ default:
+ return (-1);
+ }
+ return (sizeof (int));
+}
+
+/*
+ * Per-level helpers for conn_opt_set() below.  Each one validates the
+ * option (and any required privilege) for its level and, unless
+ * `checkonly' is set, applies the value.  All return zero or an errno
+ * value.
+ */
+static int conn_opt_set_socket(conn_opt_arg_t *coa, t_scalar_t name,
+    uint_t inlen, uchar_t *invalp, boolean_t checkonly, cred_t *cr);
+static int conn_opt_set_ip(conn_opt_arg_t *coa, t_scalar_t name,
+    uint_t inlen, uchar_t *invalp, boolean_t checkonly, cred_t *cr);
+static int conn_opt_set_ipv6(conn_opt_arg_t *coa, t_scalar_t name,
+    uint_t inlen, uchar_t *invalp, boolean_t checkonly, cred_t *cr);
+static int conn_opt_set_udp(conn_opt_arg_t *coa, t_scalar_t name,
+    uint_t inlen, uchar_t *invalp, boolean_t checkonly, cred_t *cr);
+static int conn_opt_set_tcp(conn_opt_arg_t *coa, t_scalar_t name,
+    uint_t inlen, uchar_t *invalp, boolean_t checkonly, cred_t *cr);
+
+/*
+ * Set one of the most common socket options, including some that are
+ * transport/ULP specific, by dispatching on `level' to the matching
+ * per-level helper.  Returns zero or an errno value.
+ *
+ * For fixed length options no sanity check of the passed in length is
+ * done here; the *_optcom_req() routines are assumed to do the right
+ * thing.
+ */
+int
+conn_opt_set(conn_opt_arg_t *coa, t_scalar_t level, t_scalar_t name,
+    uint_t inlen, uchar_t *invalp, boolean_t checkonly, cred_t *cr)
+{
+	int (*setfn)(conn_opt_arg_t *, t_scalar_t, uint_t, uchar_t *,
+	    boolean_t, cred_t *);
+
+	ASSERT(MUTEX_NOT_HELD(&coa->coa_connp->conn_lock));
+
+	/* Pick the handler for this level. */
+	switch (level) {
+	case SOL_SOCKET:
+		setfn = conn_opt_set_socket;
+		break;
+	case IPPROTO_IP:
+		setfn = conn_opt_set_ip;
+		break;
+	case IPPROTO_IPV6:
+		setfn = conn_opt_set_ipv6;
+		break;
+	case IPPROTO_UDP:
+		setfn = conn_opt_set_udp;
+		break;
+	case IPPROTO_TCP:
+		setfn = conn_opt_set_tcp;
+		break;
+	default:
+		/* Options at other levels are handled entirely by the ULP */
+		return (0);
+	}
+	return (setfn(coa, name, inlen, invalp, checkonly, cr));
+}
+
+/*
+ * Handle SOL_SOCKET level options.
+ * Note that we do not handle SO_PROTOTYPE here. The ULPs that support
+ * it implement their own checks and setting of conn_proto.
+ *
+ * Structure: the first switch performs the privilege/state checks that
+ * can fail; once past the `checkonly' short-circuit, the second switch
+ * applies the value under conn_lock, which protects the conn_t
+ * bitfields.  Option names not recognized at this level fall through
+ * both switches and return zero (presumably handled by the ULP or
+ * elsewhere -- NOTE(review): confirm against callers).
+ * Returns zero or an errno value.
+ */
+/* ARGSUSED1 */
+static int
+conn_opt_set_socket(conn_opt_arg_t *coa, t_scalar_t name, uint_t inlen,
+    uchar_t *invalp, boolean_t checkonly, cred_t *cr)
+{
+	conn_t *connp = coa->coa_connp;
+	ip_xmit_attr_t *ixa = coa->coa_ixa;
+	int *i1 = (int *)invalp;
+	/* Normalize any nonzero option value to 1 for the boolean options */
+	boolean_t onoff = (*i1 == 0) ? 0 : 1;
+
+	/* Checks that can fail; performed even when checkonly is set */
+	switch (name) {
+	case SO_ALLZONES:
+		/* The zone scope of a conn can not change once it is bound */
+		if (IPCL_IS_BOUND(connp))
+			return (EINVAL);
+		break;
+#ifdef SO_VRRP
+	case SO_VRRP:
+		if (secpolicy_ip_config(cr, checkonly) != 0)
+			return (EACCES);
+		break;
+#endif
+	case SO_MAC_EXEMPT:
+		if (secpolicy_net_mac_aware(cr) != 0)
+			return (EACCES);
+		if (IPCL_IS_BOUND(connp))
+			return (EINVAL);
+		break;
+	case SO_MAC_IMPLICIT:
+		if (secpolicy_net_mac_implicit(cr) != 0)
+			return (EACCES);
+		break;
+	}
+	if (checkonly)
+		return (0);
+
+	mutex_enter(&connp->conn_lock);
+	/* Here we set the actual option value */
+	switch (name) {
+	case SO_DEBUG:
+		connp->conn_debug = onoff;
+		break;
+	case SO_KEEPALIVE:
+		connp->conn_keepalive = onoff;
+		break;
+	case SO_LINGER: {
+		/*
+		 * Length assumed validated by the *_optcom_req() caller
+		 * per the conn_opt_set() contract.
+		 */
+		struct linger *lgr = (struct linger *)invalp;
+
+		if (lgr->l_onoff) {
+			connp->conn_linger = 1;
+			connp->conn_lingertime = lgr->l_linger;
+		} else {
+			connp->conn_linger = 0;
+			connp->conn_lingertime = 0;
+		}
+		break;
+	}
+	case SO_OOBINLINE:
+		connp->conn_oobinline = onoff;
+		coa->coa_changed |= COA_OOBINLINE_CHANGED;
+		break;
+	case SO_REUSEADDR:
+		connp->conn_reuseaddr = onoff;
+		break;
+	case SO_DONTROUTE:
+		/* ixa needs no lock; the caller has an exclusive copy */
+		if (onoff)
+			ixa->ixa_flags |= IXAF_DONTROUTE;
+		else
+			ixa->ixa_flags &= ~IXAF_DONTROUTE;
+		coa->coa_changed |= COA_ROUTE_CHANGED;
+		break;
+	case SO_USELOOPBACK:
+		connp->conn_useloopback = onoff;
+		break;
+	case SO_BROADCAST:
+		connp->conn_broadcast = onoff;
+		break;
+	case SO_SNDBUF:
+		/* ULP has range checked the value */
+		connp->conn_sndbuf = *i1;
+		coa->coa_changed |= COA_SNDBUF_CHANGED;
+		break;
+	case SO_RCVBUF:
+		/* ULP has range checked the value */
+		connp->conn_rcvbuf = *i1;
+		coa->coa_changed |= COA_RCVBUF_CHANGED;
+		break;
+	case SO_RCVTIMEO:
+	case SO_SNDTIMEO:
+		/*
+		 * Pass these two options in order for third part
+		 * protocol usage.
+		 */
+		break;
+	case SO_DGRAM_ERRIND:
+		connp->conn_dgram_errind = onoff;
+		break;
+	case SO_RECVUCRED:
+		connp->conn_recv_ancillary.crb_recvucred = onoff;
+		break;
+	case SO_ALLZONES:
+		connp->conn_allzones = onoff;
+		coa->coa_changed |= COA_ROUTE_CHANGED;
+		/* Sends from this conn now carry the all-zones zoneid */
+		if (onoff)
+			ixa->ixa_zoneid = ALL_ZONES;
+		else
+			ixa->ixa_zoneid = connp->conn_zoneid;
+		break;
+	case SO_TIMESTAMP:
+		connp->conn_recv_ancillary.crb_timestamp = onoff;
+		break;
+#ifdef SO_VRRP
+	case SO_VRRP:
+		connp->conn_isvrrp = onoff;
+		break;
+#endif
+	case SO_ANON_MLP:
+		connp->conn_anon_mlp = onoff;
+		break;
+	case SO_MAC_EXEMPT:
+		connp->conn_mac_mode = onoff ?
+		    CONN_MAC_AWARE : CONN_MAC_DEFAULT;
+		break;
+	case SO_MAC_IMPLICIT:
+		connp->conn_mac_mode = onoff ?
+		    CONN_MAC_IMPLICIT : CONN_MAC_DEFAULT;
+		break;
+	case SO_EXCLBIND:
+		connp->conn_exclbind = onoff;
+		break;
+	}
+	mutex_exit(&connp->conn_lock);
+	return (0);
+}
+
+/*
+ * Handle IPPROTO_IP level options.
+ *
+ * The first switch validates the option value (and looks up any
+ * interface index); if `checkonly' we stop after validation.  The
+ * second switch applies the value.  conn_lock protects the conn_t
+ * bitfields and the ip_pkt_t and is used to set those fields
+ * atomically; it is not needed for ixa settings since the caller has
+ * an exclusive copy of the ixa.  We can not hold conn_lock across the
+ * multicast options.
+ * Returns zero or an errno value.
+ */
+static int
+conn_opt_set_ip(conn_opt_arg_t *coa, t_scalar_t name, uint_t inlen,
+    uchar_t *invalp, boolean_t checkonly, cred_t *cr)
+{
+	conn_t *connp = coa->coa_connp;
+	ip_xmit_attr_t *ixa = coa->coa_ixa;
+	ip_pkt_t *ipp = coa->coa_ipp;
+	int *i1 = (int *)invalp;
+	/* Normalize any nonzero option value to 1 for the boolean options */
+	boolean_t onoff = (*i1 == 0) ? 0 : 1;
+	ipaddr_t addr = (ipaddr_t)*i1;
+	uint_t ifindex;
+	zoneid_t zoneid = IPCL_ZONEID(connp);
+	ipif_t *ipif;
+	ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
+	int error;
+
+	if (connp->conn_family != AF_INET)
+		return (EINVAL);
+
+	/* Validation pass; performed even when checkonly is set */
+	switch (name) {
+	case IP_TTL:
+		/* Don't allow zero */
+		if (*i1 < 1 || *i1 > 255)
+			return (EINVAL);
+		break;
+	case IP_MULTICAST_IF:
+		if (addr == INADDR_ANY) {
+			/* Clear */
+			ifindex = 0;
+			break;
+		}
+		ipif = ipif_lookup_addr(addr, NULL, zoneid, ipst);
+		if (ipif == NULL)
+			return (EHOSTUNREACH);
+		/* not supported by the virtual network iface */
+		if (IS_VNI(ipif->ipif_ill)) {
+			ipif_refrele(ipif);
+			return (EINVAL);
+		}
+		ifindex = ipif->ipif_ill->ill_phyint->phyint_ifindex;
+		ipif_refrele(ipif);
+		break;
+	case IP_NEXTHOP: {
+		ire_t *ire;
+
+		if (addr == INADDR_ANY) {
+			/* Clear */
+			break;
+		}
+		/* Verify that the next-hop is on-link */
+		ire = ire_ftable_lookup_v4(addr, 0, 0, IRE_ONLINK, NULL, zoneid,
+		    NULL, MATCH_IRE_TYPE, 0, ipst, NULL);
+		if (ire == NULL)
+			return (EHOSTUNREACH);
+		ire_refrele(ire);
+		break;
+	}
+	case IP_OPTIONS:
+	case T_IP_OPTIONS: {
+		uint_t newlen;
+
+		/*
+		 * Account for a security label prepended to the options;
+		 * its length is rounded up to a 4-byte multiple.  The
+		 * inner parentheses are required: `&' binds more loosely
+		 * than `+', and without them the rounding would be applied
+		 * to the whole sum (benign only because inlen must itself
+		 * be 4-aligned, checked below).
+		 */
+		if (ipp->ipp_fields & IPPF_LABEL_V4)
+			newlen = inlen + ((ipp->ipp_label_len_v4 + 3) & ~3);
+		else
+			newlen = inlen;
+		if ((inlen & 0x3) || newlen > IP_MAX_OPT_LENGTH) {
+			return (EINVAL);
+		}
+		break;
+	}
+	case IP_PKTINFO: {
+		struct in_pktinfo *pktinfo;
+
+		/*
+		 * Two different valid lengths: sizeof (int) means
+		 * IP_RECVPKTINFO (same option value); the full struct
+		 * means IP_PKTINFO proper.
+		 */
+		if (inlen != sizeof (int) &&
+		    inlen != sizeof (struct in_pktinfo))
+			return (EINVAL);
+		if (inlen == sizeof (int))
+			break;
+
+		/* Verify the supplied source address and ifindex */
+		pktinfo = (struct in_pktinfo *)invalp;
+		if (pktinfo->ipi_spec_dst.s_addr != INADDR_ANY) {
+			switch (ip_laddr_verify_v4(pktinfo->ipi_spec_dst.s_addr,
+			    zoneid, ipst, B_FALSE)) {
+			case IPVL_UNICAST_UP:
+			case IPVL_UNICAST_DOWN:
+				break;
+			default:
+				return (EADDRNOTAVAIL);
+			}
+		}
+		if (!ip_ifindex_valid(pktinfo->ipi_ifindex, B_FALSE, ipst))
+			return (ENXIO);
+		break;
+	}
+	case IP_BOUND_IF:
+		ifindex = *(uint_t *)i1;
+
+		/* Just check it is ok. */
+		if (!ip_ifindex_valid(ifindex, B_FALSE, ipst))
+			return (ENXIO);
+		break;
+	}
+	if (checkonly)
+		return (0);
+
+	/* Here we set the actual option value */
+	/*
+	 * conn_lock protects the bitfields, and is used to
+	 * set the fields atomically. Not needed for ixa settings since
+	 * the caller has an exclusive copy of the ixa.
+	 * We can not hold conn_lock across the multicast options though.
+	 */
+	switch (name) {
+	case IP_OPTIONS:
+	case T_IP_OPTIONS:
+		/* Save options for use by IP. */
+		mutex_enter(&connp->conn_lock);
+		error = optcom_pkt_set(invalp, inlen,
+		    (uchar_t **)&ipp->ipp_ipv4_options,
+		    &ipp->ipp_ipv4_options_len);
+		if (error != 0) {
+			mutex_exit(&connp->conn_lock);
+			return (error);
+		}
+		if (ipp->ipp_ipv4_options_len == 0) {
+			ipp->ipp_fields &= ~IPPF_IPV4_OPTIONS;
+		} else {
+			ipp->ipp_fields |= IPPF_IPV4_OPTIONS;
+		}
+		mutex_exit(&connp->conn_lock);
+		coa->coa_changed |= COA_HEADER_CHANGED;
+		coa->coa_changed |= COA_WROFF_CHANGED;
+		break;
+
+	case IP_TTL:
+		mutex_enter(&connp->conn_lock);
+		ipp->ipp_unicast_hops = *i1;
+		mutex_exit(&connp->conn_lock);
+		coa->coa_changed |= COA_HEADER_CHANGED;
+		break;
+	case IP_TOS:
+	case T_IP_TOS:
+		/* -1 means revert to default */
+		mutex_enter(&connp->conn_lock);
+		if (*i1 == -1) {
+			ipp->ipp_type_of_service = 0;
+		} else {
+			ipp->ipp_type_of_service = *i1;
+		}
+		mutex_exit(&connp->conn_lock);
+		coa->coa_changed |= COA_HEADER_CHANGED;
+		break;
+	case IP_MULTICAST_IF:
+		/* ifindex was looked up in the validation pass above */
+		ixa->ixa_multicast_ifindex = ifindex;
+		ixa->ixa_multicast_ifaddr = addr;
+		coa->coa_changed |= COA_ROUTE_CHANGED;
+		break;
+	case IP_MULTICAST_TTL:
+		ixa->ixa_multicast_ttl = *invalp;
+		/* Handled automatically by ip_output */
+		break;
+	case IP_MULTICAST_LOOP:
+		if (*invalp != 0)
+			ixa->ixa_flags |= IXAF_MULTICAST_LOOP;
+		else
+			ixa->ixa_flags &= ~IXAF_MULTICAST_LOOP;
+		/* Handled automatically by ip_output */
+		break;
+	case IP_RECVOPTS:
+		mutex_enter(&connp->conn_lock);
+		connp->conn_recv_ancillary.crb_recvopts = onoff;
+		mutex_exit(&connp->conn_lock);
+		break;
+	case IP_RECVDSTADDR:
+		mutex_enter(&connp->conn_lock);
+		connp->conn_recv_ancillary.crb_recvdstaddr = onoff;
+		mutex_exit(&connp->conn_lock);
+		break;
+	case IP_RECVIF:
+		mutex_enter(&connp->conn_lock);
+		connp->conn_recv_ancillary.crb_recvif = onoff;
+		mutex_exit(&connp->conn_lock);
+		break;
+	case IP_RECVSLLA:
+		mutex_enter(&connp->conn_lock);
+		connp->conn_recv_ancillary.crb_recvslla = onoff;
+		mutex_exit(&connp->conn_lock);
+		break;
+	case IP_RECVTTL:
+		mutex_enter(&connp->conn_lock);
+		connp->conn_recv_ancillary.crb_recvttl = onoff;
+		mutex_exit(&connp->conn_lock);
+		break;
+	case IP_PKTINFO: {
+		/*
+		 * This also handles IP_RECVPKTINFO.
+		 * IP_PKTINFO and IP_RECVPKTINFO have same value.
+		 * Differentiation is based on the size of the
+		 * argument passed in.
+		 */
+		struct in_pktinfo *pktinfo;
+
+		if (inlen == sizeof (int)) {
+			/* This is IP_RECVPKTINFO option. */
+			mutex_enter(&connp->conn_lock);
+			connp->conn_recv_ancillary.crb_ip_recvpktinfo =
+			    onoff;
+			mutex_exit(&connp->conn_lock);
+			break;
+		}
+
+		/* This is IP_PKTINFO option. */
+		mutex_enter(&connp->conn_lock);
+		pktinfo = (struct in_pktinfo *)invalp;
+		/*
+		 * Decide based on the address the caller supplied, not
+		 * the previously stored value: an unspecified address
+		 * clears the preferred source, anything else sets it.
+		 */
+		if (pktinfo->ipi_spec_dst.s_addr != INADDR_ANY) {
+			ipp->ipp_fields |= IPPF_ADDR;
+			IN6_INADDR_TO_V4MAPPED(&pktinfo->ipi_spec_dst,
+			    &ipp->ipp_addr);
+		} else {
+			ipp->ipp_fields &= ~IPPF_ADDR;
+			ipp->ipp_addr = ipv6_all_zeros;
+		}
+		mutex_exit(&connp->conn_lock);
+		ixa->ixa_ifindex = pktinfo->ipi_ifindex;
+		coa->coa_changed |= COA_ROUTE_CHANGED;
+		coa->coa_changed |= COA_HEADER_CHANGED;
+		break;
+	}
+	case IP_DONTFRAG:
+		if (onoff) {
+			ixa->ixa_flags |= (IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF);
+			ixa->ixa_flags &= ~IXAF_PMTU_DISCOVERY;
+		} else {
+			ixa->ixa_flags &= ~(IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF);
+			ixa->ixa_flags |= IXAF_PMTU_DISCOVERY;
+		}
+		/* Need to redo ip_attr_connect */
+		coa->coa_changed |= COA_ROUTE_CHANGED;
+		break;
+	case IP_ADD_MEMBERSHIP:
+	case IP_DROP_MEMBERSHIP:
+	case MCAST_JOIN_GROUP:
+	case MCAST_LEAVE_GROUP:
+		return (ip_opt_set_multicast_group(connp, name,
+		    invalp, B_FALSE, checkonly));
+
+	case IP_BLOCK_SOURCE:
+	case IP_UNBLOCK_SOURCE:
+	case IP_ADD_SOURCE_MEMBERSHIP:
+	case IP_DROP_SOURCE_MEMBERSHIP:
+	case MCAST_BLOCK_SOURCE:
+	case MCAST_UNBLOCK_SOURCE:
+	case MCAST_JOIN_SOURCE_GROUP:
+	case MCAST_LEAVE_SOURCE_GROUP:
+		return (ip_opt_set_multicast_sources(connp, name,
+		    invalp, B_FALSE, checkonly));
+
+	case IP_SEC_OPT:
+		mutex_enter(&connp->conn_lock);
+		error = ipsec_set_req(cr, connp, (ipsec_req_t *)invalp);
+		mutex_exit(&connp->conn_lock);
+		if (error != 0) {
+			return (error);
+		}
+		/* This is an IPsec policy change - redo ip_attr_connect */
+		coa->coa_changed |= COA_ROUTE_CHANGED;
+		break;
+	case IP_NEXTHOP:
+		ixa->ixa_nexthop_v4 = addr;
+		if (addr != INADDR_ANY)
+			ixa->ixa_flags |= IXAF_NEXTHOP_SET;
+		else
+			ixa->ixa_flags &= ~IXAF_NEXTHOP_SET;
+		coa->coa_changed |= COA_ROUTE_CHANGED;
+		break;
+
+	case IP_BOUND_IF:
+		ixa->ixa_ifindex = ifindex;		/* Send */
+		mutex_enter(&connp->conn_lock);
+		connp->conn_incoming_ifindex = ifindex;	/* Receive */
+		connp->conn_bound_if = ifindex;		/* getsockopt */
+		mutex_exit(&connp->conn_lock);
+		coa->coa_changed |= COA_ROUTE_CHANGED;
+		break;
+	case IP_UNSPEC_SRC:
+		mutex_enter(&connp->conn_lock);
+		connp->conn_unspec_src = onoff;
+		if (onoff)
+			ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE;
+		else
+			ixa->ixa_flags |= IXAF_VERIFY_SOURCE;
+
+		mutex_exit(&connp->conn_lock);
+		break;
+	case IP_BROADCAST_TTL:
+		ixa->ixa_broadcast_ttl = *invalp;
+		ixa->ixa_flags |= IXAF_BROADCAST_TTL_SET;
+		/* Handled automatically by ip_output */
+		break;
+	case MRT_INIT:
+	case MRT_DONE:
+	case MRT_ADD_VIF:
+	case MRT_DEL_VIF:
+	case MRT_ADD_MFC:
+	case MRT_DEL_MFC:
+	case MRT_ASSERT:
+		/*
+		 * checkonly is always B_FALSE here (handled above), so
+		 * the privilege check is only performed when actually
+		 * setting.
+		 */
+		if ((error = secpolicy_ip_config(cr, B_FALSE)) != 0) {
+			return (error);
+		}
+		error = ip_mrouter_set((int)name, connp, checkonly,
+		    (uchar_t *)invalp, inlen);
+		if (error) {
+			return (error);
+		}
+		return (0);
+
+	}
+	return (0);
+}
+
+/* Handle IPPROTO_IPV6 */
+static int
+conn_opt_set_ipv6(conn_opt_arg_t *coa, t_scalar_t name, uint_t inlen,
+ uchar_t *invalp, boolean_t checkonly, cred_t *cr)
+{
+ conn_t *connp = coa->coa_connp;
+ ip_xmit_attr_t *ixa = coa->coa_ixa;
+ ip_pkt_t *ipp = coa->coa_ipp;
+ int *i1 = (int *)invalp;
+ boolean_t onoff = (*i1 == 0) ? 0 : 1;
+ uint_t ifindex;
+ zoneid_t zoneid = IPCL_ZONEID(connp);
+ ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
+ int error;
+
+ if (connp->conn_family != AF_INET6)
+ return (EINVAL);
+
+ switch (name) {
+ case IPV6_MULTICAST_IF:
+ /*
+ * The only possible error is EINVAL.
+ * We call this option on both V4 and V6
+ * If both fail, then this call returns
+ * EINVAL. If at least one of them succeeds we
+ * return success.
+ */
+ ifindex = *(uint_t *)i1;
+
+ if (!ip_ifindex_valid(ifindex, B_TRUE, ipst) &&
+ !ip_ifindex_valid(ifindex, B_FALSE, ipst))
+ return (EINVAL);
+ break;
+ case IPV6_UNICAST_HOPS:
+ /* Don't allow zero. -1 means to use default */
+ if (*i1 < -1 || *i1 == 0 || *i1 > IPV6_MAX_HOPS)
+ return (EINVAL);
+ break;
+ case IPV6_MULTICAST_HOPS:
+ /* -1 means use default */
+ if (*i1 < -1 || *i1 > IPV6_MAX_HOPS)
+ return (EINVAL);
+ break;
+ case IPV6_MULTICAST_LOOP:
+ if (*i1 != 0 && *i1 != 1)
+ return (EINVAL);
+ break;
+ case IPV6_BOUND_IF:
+ ifindex = *(uint_t *)i1;
+
+ if (!ip_ifindex_valid(ifindex, B_TRUE, ipst))
+ return (ENXIO);
+ break;
+ case IPV6_PKTINFO: {
+ struct in6_pktinfo *pkti;
+ boolean_t isv6;
+
+ if (inlen != 0 && inlen != sizeof (struct in6_pktinfo))
+ return (EINVAL);
+ if (inlen == 0)
+ break; /* Clear values below */
+
+ /*
+ * Verify the source address and ifindex. Privileged users
+ * can use any source address.
+ */
+ pkti = (struct in6_pktinfo *)invalp;
+
+ /*
+ * For link-local addresses we use the ipi6_ifindex when
+ * we verify the local address.
+ * If net_rawaccess then any source address can be used.
+ */
+ if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr) &&
+ secpolicy_net_rawaccess(cr) != 0) {
+ uint_t scopeid = 0;
+ in6_addr_t *v6src = &pkti->ipi6_addr;
+ ipaddr_t v4src;
+ ip_laddr_t laddr_type = IPVL_UNICAST_UP;
+
+ if (IN6_IS_ADDR_V4MAPPED(v6src)) {
+ IN6_V4MAPPED_TO_IPADDR(v6src, v4src);
+ if (v4src != INADDR_ANY) {
+ laddr_type = ip_laddr_verify_v4(v4src,
+ zoneid, ipst, B_FALSE);
+ }
+ } else {
+ if (IN6_IS_ADDR_LINKSCOPE(v6src))
+ scopeid = pkti->ipi6_ifindex;
+
+ laddr_type = ip_laddr_verify_v6(v6src, zoneid,
+ ipst, B_FALSE, scopeid);
+ }
+ switch (laddr_type) {
+ case IPVL_UNICAST_UP:
+ case IPVL_UNICAST_DOWN:
+ break;
+ default:
+ return (EADDRNOTAVAIL);
+ }
+ ixa->ixa_flags |= IXAF_VERIFY_SOURCE;
+ } else if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr)) {
+ /* Allow any source */
+ ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE;
+ }
+ isv6 = !(IN6_IS_ADDR_V4MAPPED(&pkti->ipi6_addr));
+ if (!ip_ifindex_valid(pkti->ipi6_ifindex, isv6, ipst))
+ return (ENXIO);
+ break;
+ }
+ case IPV6_HOPLIMIT:
+ /* It is only allowed as ancilary data */
+ if (!coa->coa_ancillary)
+ return (EINVAL);
+
+ if (inlen != 0 && inlen != sizeof (int))
+ return (EINVAL);
+ if (inlen == sizeof (int)) {
+ if (*i1 > 255 || *i1 < -1 || *i1 == 0)
+ return (EINVAL);
+ }
+ break;
+ case IPV6_TCLASS:
+ if (inlen != 0 && inlen != sizeof (int))
+ return (EINVAL);
+ if (inlen == sizeof (int)) {
+ if (*i1 > 255 || *i1 < -1)
+ return (EINVAL);
+ }
+ break;
+ case IPV6_NEXTHOP:
+ if (inlen != 0 && inlen != sizeof (sin6_t))
+ return (EINVAL);
+ if (inlen == sizeof (sin6_t)) {
+ sin6_t *sin6 = (sin6_t *)invalp;
+ ire_t *ire;
+
+ if (sin6->sin6_family != AF_INET6)
+ return (EAFNOSUPPORT);
+ if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr))
+ return (EADDRNOTAVAIL);
+
+ /* Verify that the next-hop is on-link */
+ ire = ire_ftable_lookup_v6(&sin6->sin6_addr,
+ 0, 0, IRE_ONLINK, NULL, zoneid,
+ NULL, MATCH_IRE_TYPE, 0, ipst, NULL);
+ if (ire == NULL)
+ return (EHOSTUNREACH);
+ ire_refrele(ire);
+ break;
+ }
+ break;
+ case IPV6_RTHDR:
+ case IPV6_DSTOPTS:
+ case IPV6_RTHDRDSTOPTS:
+ case IPV6_HOPOPTS: {
+ /* All have the length field in the same place */
+ ip6_hbh_t *hopts = (ip6_hbh_t *)invalp;
+ /*
+ * Sanity checks - minimum size, size a multiple of
+ * eight bytes, and matching size passed in.
+ */
+ if (inlen != 0 &&
+ inlen != (8 * (hopts->ip6h_len + 1)))
+ return (EINVAL);
+ break;
+ }
+ case IPV6_PATHMTU:
+ /* Can't be set */
+ return (EINVAL);
+
+ case IPV6_USE_MIN_MTU:
+ if (inlen != sizeof (int))
+ return (EINVAL);
+ if (*i1 < -1 || *i1 > 1)
+ return (EINVAL);
+ break;
+ case IPV6_SRC_PREFERENCES:
+ if (inlen != sizeof (uint32_t))
+ return (EINVAL);
+ break;
+ case IPV6_V6ONLY:
+ if (*i1 < 0 || *i1 > 1) {
+ return (EINVAL);
+ }
+ break;
+ }
+ if (checkonly)
+ return (0);
+
+ /* Here we set the actual option value */
+ /*
+ * conn_lock protects the bitfields, and is used to
+ * set the fields atomically. Not needed for ixa settings since
+ * the caller has an exclusive copy of the ixa.
+ * We can not hold conn_lock across the multicast options though.
+ */
+ ASSERT(MUTEX_NOT_HELD(&coa->coa_connp->conn_lock));
+ switch (name) {
+ case IPV6_MULTICAST_IF:
+ ixa->ixa_multicast_ifindex = ifindex;
+ /* Need to redo ip_attr_connect */
+ coa->coa_changed |= COA_ROUTE_CHANGED;
+ break;
+ case IPV6_UNICAST_HOPS:
+ /* -1 means use default */
+ mutex_enter(&connp->conn_lock);
+ if (*i1 == -1) {
+ ipp->ipp_unicast_hops = connp->conn_default_ttl;
+ } else {
+ ipp->ipp_unicast_hops = (uint8_t)*i1;
+ }
+ mutex_exit(&connp->conn_lock);
+ coa->coa_changed |= COA_HEADER_CHANGED;
+ break;
+ case IPV6_MULTICAST_HOPS:
+ /* -1 means use default */
+ if (*i1 == -1) {
+ ixa->ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
+ } else {
+ ixa->ixa_multicast_ttl = (uint8_t)*i1;
+ }
+ /* Handled automatically by ip_output */
+ break;
+ case IPV6_MULTICAST_LOOP:
+ if (*i1 != 0)
+ ixa->ixa_flags |= IXAF_MULTICAST_LOOP;
+ else
+ ixa->ixa_flags &= ~IXAF_MULTICAST_LOOP;
+ /* Handled automatically by ip_output */
+ break;
+ case IPV6_JOIN_GROUP:
+ case IPV6_LEAVE_GROUP:
+ case MCAST_JOIN_GROUP:
+ case MCAST_LEAVE_GROUP:
+ return (ip_opt_set_multicast_group(connp, name,
+ invalp, B_TRUE, checkonly));
+
+ case MCAST_BLOCK_SOURCE:
+ case MCAST_UNBLOCK_SOURCE:
+ case MCAST_JOIN_SOURCE_GROUP:
+ case MCAST_LEAVE_SOURCE_GROUP:
+ return (ip_opt_set_multicast_sources(connp, name,
+ invalp, B_TRUE, checkonly));
+
+ case IPV6_BOUND_IF:
+ ixa->ixa_ifindex = ifindex; /* Send */
+ mutex_enter(&connp->conn_lock);
+ connp->conn_incoming_ifindex = ifindex; /* Receive */
+ connp->conn_bound_if = ifindex; /* getsockopt */
+ mutex_exit(&connp->conn_lock);
+ coa->coa_changed |= COA_ROUTE_CHANGED;
+ break;
+ case IPV6_UNSPEC_SRC:
+ mutex_enter(&connp->conn_lock);
+ connp->conn_unspec_src = onoff;
+ if (onoff)
+ ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE;
+ else
+ ixa->ixa_flags |= IXAF_VERIFY_SOURCE;
+ mutex_exit(&connp->conn_lock);
+ break;
+ case IPV6_RECVPKTINFO:
+ mutex_enter(&connp->conn_lock);
+ connp->conn_recv_ancillary.crb_ip_recvpktinfo = onoff;
+ mutex_exit(&connp->conn_lock);
+ break;
+ case IPV6_RECVTCLASS:
+ mutex_enter(&connp->conn_lock);
+ connp->conn_recv_ancillary.crb_ipv6_recvtclass = onoff;
+ mutex_exit(&connp->conn_lock);
+ break;
+ case IPV6_RECVPATHMTU:
+ mutex_enter(&connp->conn_lock);
+ connp->conn_ipv6_recvpathmtu = onoff;
+ mutex_exit(&connp->conn_lock);
+ break;
+ case IPV6_RECVHOPLIMIT:
+ mutex_enter(&connp->conn_lock);
+ connp->conn_recv_ancillary.crb_ipv6_recvhoplimit =
+ onoff;
+ mutex_exit(&connp->conn_lock);
+ break;
+ case IPV6_RECVHOPOPTS:
+ mutex_enter(&connp->conn_lock);
+ connp->conn_recv_ancillary.crb_ipv6_recvhopopts = onoff;
+ mutex_exit(&connp->conn_lock);
+ break;
+ case IPV6_RECVDSTOPTS:
+ mutex_enter(&connp->conn_lock);
+ connp->conn_recv_ancillary.crb_ipv6_recvdstopts = onoff;
+ mutex_exit(&connp->conn_lock);
+ break;
+ case _OLD_IPV6_RECVDSTOPTS:
+ mutex_enter(&connp->conn_lock);
+ connp->conn_recv_ancillary.crb_old_ipv6_recvdstopts =
+ onoff;
+ mutex_exit(&connp->conn_lock);
+ break;
+ case IPV6_RECVRTHDRDSTOPTS:
+ mutex_enter(&connp->conn_lock);
+ connp->conn_recv_ancillary.crb_ipv6_recvrthdrdstopts =
+ onoff;
+ mutex_exit(&connp->conn_lock);
+ break;
+ case IPV6_RECVRTHDR:
+ mutex_enter(&connp->conn_lock);
+ connp->conn_recv_ancillary.crb_ipv6_recvrthdr = onoff;
+ mutex_exit(&connp->conn_lock);
+ break;
+ case IPV6_PKTINFO:
+ mutex_enter(&connp->conn_lock);
+ if (inlen == 0) {
+ ipp->ipp_fields &= ~IPPF_ADDR;
+ ipp->ipp_addr = ipv6_all_zeros;
+ ixa->ixa_ifindex = 0;
+ } else {
+ struct in6_pktinfo *pkti;
+
+ pkti = (struct in6_pktinfo *)invalp;
+ ipp->ipp_addr = pkti->ipi6_addr;
+ if (!IN6_IS_ADDR_UNSPECIFIED(&ipp->ipp_addr))
+ ipp->ipp_fields |= IPPF_ADDR;
+ else
+ ipp->ipp_fields &= ~IPPF_ADDR;
+ ixa->ixa_ifindex = pkti->ipi6_ifindex;
+ }
+ mutex_exit(&connp->conn_lock);
+ /* Source and ifindex might have changed */
+ coa->coa_changed |= COA_HEADER_CHANGED;
+ coa->coa_changed |= COA_ROUTE_CHANGED;
+ break;
+ case IPV6_HOPLIMIT:
+ mutex_enter(&connp->conn_lock);
+ if (inlen == 0 || *i1 == -1) {
+ /* Revert to default */
+ ipp->ipp_fields &= ~IPPF_HOPLIMIT;
+ ixa->ixa_flags &= ~IXAF_NO_TTL_CHANGE;
+ } else {
+ ipp->ipp_hoplimit = *i1;
+ ipp->ipp_fields |= IPPF_HOPLIMIT;
+ /* Ensure that it sticks for multicast packets */
+ ixa->ixa_flags |= IXAF_NO_TTL_CHANGE;
+ }
+ mutex_exit(&connp->conn_lock);
+ coa->coa_changed |= COA_HEADER_CHANGED;
+ break;
+ case IPV6_TCLASS:
+ /*
+ * IPV6_TCLASS accepts -1 as use kernel default
+ * and [0, 255] as the actualy traffic class.
+ */
+ mutex_enter(&connp->conn_lock);
+ if (inlen == 0 || *i1 == -1) {
+ ipp->ipp_tclass = 0;
+ ipp->ipp_fields &= ~IPPF_TCLASS;
+ } else {
+ ipp->ipp_tclass = *i1;
+ ipp->ipp_fields |= IPPF_TCLASS;
+ }
+ mutex_exit(&connp->conn_lock);
+ coa->coa_changed |= COA_HEADER_CHANGED;
+ break;
+ case IPV6_NEXTHOP:
+ if (inlen == 0) {
+ ixa->ixa_flags &= ~IXAF_NEXTHOP_SET;
+ } else {
+ sin6_t *sin6 = (sin6_t *)invalp;
+
+ ixa->ixa_nexthop_v6 = sin6->sin6_addr;
+ if (!IN6_IS_ADDR_UNSPECIFIED(&ixa->ixa_nexthop_v6))
+ ixa->ixa_flags |= IXAF_NEXTHOP_SET;
+ else
+ ixa->ixa_flags &= ~IXAF_NEXTHOP_SET;
+ }
+ coa->coa_changed |= COA_ROUTE_CHANGED;
+ break;
+ case IPV6_HOPOPTS:
+ mutex_enter(&connp->conn_lock);
+ error = optcom_pkt_set(invalp, inlen,
+ (uchar_t **)&ipp->ipp_hopopts, &ipp->ipp_hopoptslen);
+ if (error != 0) {
+ mutex_exit(&connp->conn_lock);
+ return (error);
+ }
+ if (ipp->ipp_hopoptslen == 0) {
+ ipp->ipp_fields &= ~IPPF_HOPOPTS;
+ } else {
+ ipp->ipp_fields |= IPPF_HOPOPTS;
+ }
+ mutex_exit(&connp->conn_lock);
+ coa->coa_changed |= COA_HEADER_CHANGED;
+ coa->coa_changed |= COA_WROFF_CHANGED;
+ break;
+ case IPV6_RTHDRDSTOPTS:
+ mutex_enter(&connp->conn_lock);
+ error = optcom_pkt_set(invalp, inlen,
+ (uchar_t **)&ipp->ipp_rthdrdstopts,
+ &ipp->ipp_rthdrdstoptslen);
+ if (error != 0) {
+ mutex_exit(&connp->conn_lock);
+ return (error);
+ }
+ if (ipp->ipp_rthdrdstoptslen == 0) {
+ ipp->ipp_fields &= ~IPPF_RTHDRDSTOPTS;
+ } else {
+ ipp->ipp_fields |= IPPF_RTHDRDSTOPTS;
+ }
+ mutex_exit(&connp->conn_lock);
+ coa->coa_changed |= COA_HEADER_CHANGED;
+ coa->coa_changed |= COA_WROFF_CHANGED;
+ break;
+ case IPV6_DSTOPTS:
+ mutex_enter(&connp->conn_lock);
+ error = optcom_pkt_set(invalp, inlen,
+ (uchar_t **)&ipp->ipp_dstopts, &ipp->ipp_dstoptslen);
+ if (error != 0) {
+ mutex_exit(&connp->conn_lock);
+ return (error);
+ }
+ if (ipp->ipp_dstoptslen == 0) {
+ ipp->ipp_fields &= ~IPPF_DSTOPTS;
+ } else {
+ ipp->ipp_fields |= IPPF_DSTOPTS;
+ }
+ mutex_exit(&connp->conn_lock);
+ coa->coa_changed |= COA_HEADER_CHANGED;
+ coa->coa_changed |= COA_WROFF_CHANGED;
+ break;
+ case IPV6_RTHDR:
+ mutex_enter(&connp->conn_lock);
+ error = optcom_pkt_set(invalp, inlen,
+ (uchar_t **)&ipp->ipp_rthdr, &ipp->ipp_rthdrlen);
+ if (error != 0) {
+ mutex_exit(&connp->conn_lock);
+ return (error);
+ }
+ if (ipp->ipp_rthdrlen == 0) {
+ ipp->ipp_fields &= ~IPPF_RTHDR;
+ } else {
+ ipp->ipp_fields |= IPPF_RTHDR;
+ }
+ mutex_exit(&connp->conn_lock);
+ coa->coa_changed |= COA_HEADER_CHANGED;
+ coa->coa_changed |= COA_WROFF_CHANGED;
+ break;
+
+ case IPV6_DONTFRAG:
+ if (onoff) {
+ ixa->ixa_flags |= IXAF_DONTFRAG;
+ ixa->ixa_flags &= ~IXAF_PMTU_DISCOVERY;
+ } else {
+ ixa->ixa_flags &= ~IXAF_DONTFRAG;
+ ixa->ixa_flags |= IXAF_PMTU_DISCOVERY;
+ }
+ /* Need to redo ip_attr_connect */
+ coa->coa_changed |= COA_ROUTE_CHANGED;
+ break;
+
+ case IPV6_USE_MIN_MTU:
+ ixa->ixa_flags |= IXAF_USE_MIN_MTU;
+ ixa->ixa_use_min_mtu = *i1;
+ /* Need to redo ip_attr_connect */
+ coa->coa_changed |= COA_ROUTE_CHANGED;
+ break;
+
+ case IPV6_SEC_OPT:
+ mutex_enter(&connp->conn_lock);
+ error = ipsec_set_req(cr, connp, (ipsec_req_t *)invalp);
+ mutex_exit(&connp->conn_lock);
+ if (error != 0) {
+ return (error);
+ }
+ /* This is an IPsec policy change - redo ip_attr_connect */
+ coa->coa_changed |= COA_ROUTE_CHANGED;
+ break;
+ case IPV6_SRC_PREFERENCES:
+ /*
+ * This socket option only affects connected
+ * sockets that haven't already bound to a specific
+ * IPv6 address. In other words, sockets that
+ * don't call bind() with an address other than the
+ * unspecified address and that call connect().
+ * ip_set_destination_v6() passes these preferences
+ * to the ipif_select_source_v6() function.
+ */
+ mutex_enter(&connp->conn_lock);
+ error = ip6_set_src_preferences(ixa, *(uint32_t *)invalp);
+ mutex_exit(&connp->conn_lock);
+ if (error != 0) {
+ return (error);
+ }
+ break;
+ case IPV6_V6ONLY:
+ mutex_enter(&connp->conn_lock);
+ connp->conn_ipv6_v6only = onoff;
+ mutex_exit(&connp->conn_lock);
+ break;
+ }
+ return (0);
+}
+
+/* Handle IPPROTO_UDP */
+/* ARGSUSED1 */
+static int
+conn_opt_set_udp(conn_opt_arg_t *coa, t_scalar_t name, uint_t inlen,
+ uchar_t *invalp, boolean_t checkonly, cred_t *cr)
+{
+ conn_t *connp = coa->coa_connp;
+ int *i1 = (int *)invalp;
+ boolean_t onoff = (*i1 == 0) ? 0 : 1;
+ int error;
+
+ switch (name) {
+ case UDP_ANONPRIVBIND:
+ if ((error = secpolicy_net_privaddr(cr, 0, IPPROTO_UDP)) != 0) {
+ return (error);
+ }
+ break;
+ }
+ if (checkonly)
+ return (0);
+
+ /* Here we set the actual option value */
+ mutex_enter(&connp->conn_lock);
+ switch (name) {
+ case UDP_ANONPRIVBIND:
+ connp->conn_anon_priv_bind = onoff;
+ break;
+ case UDP_EXCLBIND:
+ connp->conn_exclbind = onoff;
+ break;
+ }
+ mutex_exit(&connp->conn_lock);
+ return (0);
+}
+
+/* Handle IPPROTO_TCP */
+/* ARGSUSED1 */
+static int
+conn_opt_set_tcp(conn_opt_arg_t *coa, t_scalar_t name, uint_t inlen,
+ uchar_t *invalp, boolean_t checkonly, cred_t *cr)
+{
+ conn_t *connp = coa->coa_connp;
+ int *i1 = (int *)invalp;
+ boolean_t onoff = (*i1 == 0) ? 0 : 1;
+ int error;
+
+ switch (name) {
+ case TCP_ANONPRIVBIND:
+ if ((error = secpolicy_net_privaddr(cr, 0, IPPROTO_TCP)) != 0) {
+ return (error);
+ }
+ break;
+ }
+ if (checkonly)
+ return (0);
+
+ /* Here we set the actual option value */
+ mutex_enter(&connp->conn_lock);
+ switch (name) {
+ case TCP_ANONPRIVBIND:
+ connp->conn_anon_priv_bind = onoff;
+ break;
+ case TCP_EXCLBIND:
+ connp->conn_exclbind = onoff;
+ break;
+ case TCP_RECVDSTADDR:
+ connp->conn_recv_ancillary.crb_recvdstaddr = onoff;
+ break;
+ }
+ mutex_exit(&connp->conn_lock);
+ return (0);
+}
+
+int
+conn_getsockname(conn_t *connp, struct sockaddr *sa, uint_t *salenp)
+{
+ sin_t *sin;
+ sin6_t *sin6;
+
+ if (connp->conn_family == AF_INET) {
+ if (*salenp < sizeof (sin_t))
+ return (EINVAL);
+
+ *salenp = sizeof (sin_t);
+ /* Fill zeroes and then initialize non-zero fields */
+ sin = (sin_t *)sa;
+ *sin = sin_null;
+ sin->sin_family = AF_INET;
+ if (!IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_saddr_v6) &&
+ !IN6_IS_ADDR_UNSPECIFIED(&connp->conn_saddr_v6)) {
+ sin->sin_addr.s_addr = connp->conn_saddr_v4;
+ } else {
+ /*
+ * INADDR_ANY
+ * conn_saddr is not set, we might be bound to
+ * broadcast/multicast. Use conn_bound_addr as
+ * local address instead (that could
+ * also still be INADDR_ANY)
+ */
+ sin->sin_addr.s_addr = connp->conn_bound_addr_v4;
+ }
+ sin->sin_port = connp->conn_lport;
+ } else {
+ if (*salenp < sizeof (sin6_t))
+ return (EINVAL);
+
+ *salenp = sizeof (sin6_t);
+ /* Fill zeroes and then initialize non-zero fields */
+ sin6 = (sin6_t *)sa;
+ *sin6 = sin6_null;
+ sin6->sin6_family = AF_INET6;
+ if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_saddr_v6)) {
+ sin6->sin6_addr = connp->conn_saddr_v6;
+ } else {
+ /*
+ * conn_saddr is not set, we might be bound to
+ * broadcast/multicast. Use conn_bound_addr as
+ * local address instead (which could
+ * also still be unspecified)
+ */
+ sin6->sin6_addr = connp->conn_bound_addr_v6;
+ }
+ sin6->sin6_port = connp->conn_lport;
+ if (IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr) &&
+ (connp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET))
+ sin6->sin6_scope_id = connp->conn_ixa->ixa_scopeid;
+ }
+ return (0);
+}
+
+int
+conn_getpeername(conn_t *connp, struct sockaddr *sa, uint_t *salenp)
+{
+ struct sockaddr_in *sin;
+ struct sockaddr_in6 *sin6;
+
+ if (connp->conn_family == AF_INET) {
+ if (*salenp < sizeof (sin_t))
+ return (EINVAL);
+
+ *salenp = sizeof (sin_t);
+ /* initialize */
+ sin = (sin_t *)sa;
+ *sin = sin_null;
+ sin->sin_family = AF_INET;
+ sin->sin_addr.s_addr = connp->conn_faddr_v4;
+ sin->sin_port = connp->conn_fport;
+ } else {
+ if (*salenp < sizeof (sin6_t))
+ return (EINVAL);
+
+ *salenp = sizeof (sin6_t);
+ /* initialize */
+ sin6 = (sin6_t *)sa;
+ *sin6 = sin6_null;
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_addr = connp->conn_faddr_v6;
+ sin6->sin6_port = connp->conn_fport;
+ sin6->sin6_flowinfo = connp->conn_flowinfo;
+ if (IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr) &&
+ (connp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET))
+ sin6->sin6_scope_id = connp->conn_ixa->ixa_scopeid;
+ }
+ return (0);
+}
+
+static uint32_t cksum_massage_options_v4(ipha_t *, netstack_t *);
+static uint32_t cksum_massage_options_v6(ip6_t *, uint_t, netstack_t *);
+
+/*
+ * Allocate and fill in conn_ht_iphc based on the current information
+ * in the conn.
+ * Normally used when we bind() and connect().
+ * Returns failure if can't allocate memory, or if there is a problem
+ * with a routing header/option.
+ *
+ * We allocate space for the transport header (ulp_hdr_len + extra) and
+ * indicate the offset of the ulp header by setting ixa_ip_hdr_length.
+ * The extra is there for transports that want some spare room for future
+ * options. conn_ht_iphc_allocated is what was allocated; conn_ht_iphc_len
+ * excludes the extra part.
+ *
+ * We massage a routing option/header and store the checksum difference
+ * in conn_sum.
+ *
+ * Caller needs to update conn_wroff if desired.
+ */
+int
+conn_build_hdr_template(conn_t *connp, uint_t ulp_hdr_length, uint_t extra,
+ const in6_addr_t *v6src, const in6_addr_t *v6dst, uint32_t flowinfo)
+{
+ ip_xmit_attr_t *ixa = connp->conn_ixa;
+ ip_pkt_t *ipp = &connp->conn_xmit_ipp;
+ uint_t ip_hdr_length;
+ uchar_t *hdrs;
+ uint_t hdrs_len;
+
+ ASSERT(MUTEX_HELD(&connp->conn_lock));
+
+ if (ixa->ixa_flags & IXAF_IS_IPV4) {
+ ip_hdr_length = ip_total_hdrs_len_v4(ipp);
+ /* In case of TX label and IP options it can be too much */
+ if (ip_hdr_length > IP_MAX_HDR_LENGTH) {
+ /* Preserves existing TX errno for this */
+ return (EHOSTUNREACH);
+ }
+ } else {
+ ip_hdr_length = ip_total_hdrs_len_v6(ipp);
+ }
+ ixa->ixa_ip_hdr_length = ip_hdr_length;
+ hdrs_len = ip_hdr_length + ulp_hdr_length + extra;
+ ASSERT(hdrs_len != 0);
+
+ if (hdrs_len != connp->conn_ht_iphc_allocated) {
+ /* Allocate new before we free any old */
+ hdrs = kmem_alloc(hdrs_len, KM_NOSLEEP);
+ if (hdrs == NULL)
+ return (ENOMEM);
+
+ if (connp->conn_ht_iphc != NULL) {
+ kmem_free(connp->conn_ht_iphc,
+ connp->conn_ht_iphc_allocated);
+ }
+ connp->conn_ht_iphc = hdrs;
+ connp->conn_ht_iphc_allocated = hdrs_len;
+ } else {
+ hdrs = connp->conn_ht_iphc;
+ }
+ hdrs_len -= extra;
+ connp->conn_ht_iphc_len = hdrs_len;
+
+ connp->conn_ht_ulp = hdrs + ip_hdr_length;
+ connp->conn_ht_ulp_len = ulp_hdr_length;
+
+ if (ixa->ixa_flags & IXAF_IS_IPV4) {
+ ipha_t *ipha = (ipha_t *)hdrs;
+
+ IN6_V4MAPPED_TO_IPADDR(v6src, ipha->ipha_src);
+ IN6_V4MAPPED_TO_IPADDR(v6dst, ipha->ipha_dst);
+ ip_build_hdrs_v4(hdrs, ip_hdr_length, ipp, connp->conn_proto);
+ ipha->ipha_length = htons(hdrs_len);
+ if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF)
+ ipha->ipha_fragment_offset_and_flags |= IPH_DF_HTONS;
+ else
+ ipha->ipha_fragment_offset_and_flags &= ~IPH_DF_HTONS;
+
+ if (ipp->ipp_fields & IPPF_IPV4_OPTIONS) {
+ connp->conn_sum = cksum_massage_options_v4(ipha,
+ connp->conn_netstack);
+ } else {
+ connp->conn_sum = 0;
+ }
+ } else {
+ ip6_t *ip6h = (ip6_t *)hdrs;
+
+ ip6h->ip6_src = *v6src;
+ ip6h->ip6_dst = *v6dst;
+ ip_build_hdrs_v6(hdrs, ip_hdr_length, ipp, connp->conn_proto,
+ flowinfo);
+ ip6h->ip6_plen = htons(hdrs_len - IPV6_HDR_LEN);
+
+ if (ipp->ipp_fields & IPPF_RTHDR) {
+ connp->conn_sum = cksum_massage_options_v6(ip6h,
+ ip_hdr_length, connp->conn_netstack);
+
+ /*
+ * Verify that the first hop isn't a mapped address.
+ * Routers along the path need to do this verification
+ * for subsequent hops.
+ */
+ if (IN6_IS_ADDR_V4MAPPED(&ip6h->ip6_dst))
+ return (EADDRNOTAVAIL);
+
+ } else {
+ connp->conn_sum = 0;
+ }
+ }
+ return (0);
+}
+
+/*
+ * Prepend a header template to data_mp based on the ip_pkt_t
+ * and the passed in source, destination and protocol.
+ *
+ * Returns failure if can't allocate memory, in which case data_mp is freed.
+ * We allocate space for the transport header (ulp_hdr_len) and
+ * indicate the offset of the ulp header by setting ixa_ip_hdr_length.
+ *
+ * We massage a routing option/header and return the checksum difference
+ * in *sump. This is in host byte order.
+ *
+ * Caller needs to update conn_wroff if desired.
+ */
+mblk_t *
+conn_prepend_hdr(ip_xmit_attr_t *ixa, const ip_pkt_t *ipp,
+ const in6_addr_t *v6src, const in6_addr_t *v6dst,
+ uint8_t protocol, uint32_t flowinfo, uint_t ulp_hdr_length, mblk_t *data_mp,
+ uint_t data_length, uint_t wroff_extra, uint32_t *sump, int *errorp)
+{
+ uint_t ip_hdr_length;
+ uchar_t *hdrs;
+ uint_t hdrs_len;
+ mblk_t *mp;
+
+ if (ixa->ixa_flags & IXAF_IS_IPV4) {
+ ip_hdr_length = ip_total_hdrs_len_v4(ipp);
+ ASSERT(ip_hdr_length <= IP_MAX_HDR_LENGTH);
+ } else {
+ ip_hdr_length = ip_total_hdrs_len_v6(ipp);
+ }
+ hdrs_len = ip_hdr_length + ulp_hdr_length;
+ ASSERT(hdrs_len != 0);
+
+ ixa->ixa_ip_hdr_length = ip_hdr_length;
+
+ /* Can we prepend to data_mp? */
+ if (data_mp != NULL &&
+ data_mp->b_rptr - data_mp->b_datap->db_base >= hdrs_len &&
+ data_mp->b_datap->db_ref == 1) {
+ hdrs = data_mp->b_rptr - hdrs_len;
+ data_mp->b_rptr = hdrs;
+ mp = data_mp;
+ } else {
+ mp = allocb(hdrs_len + wroff_extra, BPRI_MED);
+ if (mp == NULL) {
+ freemsg(data_mp);
+ *errorp = ENOMEM;
+ return (NULL);
+ }
+ mp->b_wptr = mp->b_datap->db_lim;
+ hdrs = mp->b_rptr = mp->b_wptr - hdrs_len;
+ mp->b_cont = data_mp;
+ }
+
+ /*
+ * Set the source in the header. ip_build_hdrs_v4/v6 will overwrite it
+ * if PKTINFO (aka IPPF_ADDR) was set.
+ */
+ if (ixa->ixa_flags & IXAF_IS_IPV4) {
+ ipha_t *ipha = (ipha_t *)hdrs;
+
+ ASSERT(IN6_IS_ADDR_V4MAPPED(v6dst));
+ IN6_V4MAPPED_TO_IPADDR(v6src, ipha->ipha_src);
+ IN6_V4MAPPED_TO_IPADDR(v6dst, ipha->ipha_dst);
+ ip_build_hdrs_v4(hdrs, ip_hdr_length, ipp, protocol);
+ ipha->ipha_length = htons(hdrs_len + data_length);
+ if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF)
+ ipha->ipha_fragment_offset_and_flags |= IPH_DF_HTONS;
+ else
+ ipha->ipha_fragment_offset_and_flags &= ~IPH_DF_HTONS;
+
+ if (ipp->ipp_fields & IPPF_IPV4_OPTIONS) {
+ *sump = cksum_massage_options_v4(ipha,
+ ixa->ixa_ipst->ips_netstack);
+ } else {
+ *sump = 0;
+ }
+ } else {
+ ip6_t *ip6h = (ip6_t *)hdrs;
+
+ ip6h->ip6_src = *v6src;
+ ip6h->ip6_dst = *v6dst;
+ ip_build_hdrs_v6(hdrs, ip_hdr_length, ipp, protocol, flowinfo);
+ ip6h->ip6_plen = htons(hdrs_len + data_length - IPV6_HDR_LEN);
+
+ if (ipp->ipp_fields & IPPF_RTHDR) {
+ *sump = cksum_massage_options_v6(ip6h,
+ ip_hdr_length, ixa->ixa_ipst->ips_netstack);
+
+ /*
+ * Verify that the first hop isn't a mapped address.
+ * Routers along the path need to do this verification
+ * for subsequent hops.
+ */
+ if (IN6_IS_ADDR_V4MAPPED(&ip6h->ip6_dst)) {
+ *errorp = EADDRNOTAVAIL;
+ freemsg(mp);
+ return (NULL);
+ }
+ } else {
+ *sump = 0;
+ }
+ }
+ return (mp);
+}
+
+/*
+ * Massage a source route if any, putting the first hop
+ * in ipha_dst. Compute a starting value for the checksum which
+ * takes into account that the original ipha_dst should be
+ * included in the checksum but that IP will include the
+ * first hop from the source route in the tcp checksum.
+ */
+static uint32_t
+cksum_massage_options_v4(ipha_t *ipha, netstack_t *ns)
+{
+ in_addr_t dst;
+ uint32_t cksum;
+
+ /* Get last hop then diff against first hop */
+ cksum = ip_massage_options(ipha, ns);
+ cksum = (cksum & 0xFFFF) + (cksum >> 16);
+ dst = ipha->ipha_dst;
+ cksum -= ((dst >> 16) + (dst & 0xffff));
+ if ((int)cksum < 0)
+ cksum--;
+ cksum = (cksum & 0xFFFF) + (cksum >> 16);
+ cksum = (cksum & 0xFFFF) + (cksum >> 16);
+ ASSERT(cksum < 0x10000);
+ return (ntohs(cksum));
+}
+
+static uint32_t
+cksum_massage_options_v6(ip6_t *ip6h, uint_t ip_hdr_len, netstack_t *ns)
+{
+ uint8_t *end;
+ ip6_rthdr_t *rth;
+ uint32_t cksum;
+
+ end = (uint8_t *)ip6h + ip_hdr_len;
+ rth = ip_find_rthdr_v6(ip6h, end);
+ if (rth == NULL)
+ return (0);
+
+ cksum = ip_massage_options_v6(ip6h, rth, ns);
+ cksum = (cksum & 0xFFFF) + (cksum >> 16);
+ ASSERT(cksum < 0x10000);
+ return (ntohs(cksum));
+}
+
+/*
+ * ULPs that change the destination address need to call this for each
+ * change to discard any state about a previous destination that might
+ * have been multicast or multirt.
+ */
+void
+ip_attr_newdst(ip_xmit_attr_t *ixa)
+{
+ ixa->ixa_flags &= ~(IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM |
+ IXAF_NO_TTL_CHANGE | IXAF_IPV6_ADD_FRAGHDR |
+ IXAF_NO_LOOP_ZONEID_SET);
+}
+
+/*
+ * Determine the nexthop which will be used.
+ * Normally this is just the destination, but if a IPv4 source route, or
+ * IPv6 routing header, is in the ip_pkt_t then we extract the nexthop from
+ * there.
+ */
+void
+ip_attr_nexthop(const ip_pkt_t *ipp, const ip_xmit_attr_t *ixa,
+ const in6_addr_t *dst, in6_addr_t *nexthop)
+{
+ if (ixa->ixa_flags & IXAF_IS_IPV4) {
+ ipaddr_t v4dst;
+ ipaddr_t v4nexthop;
+
+ IN6_V4MAPPED_TO_IPADDR(dst, v4dst);
+ v4nexthop = ip_pkt_source_route_v4(ipp);
+ if (v4nexthop == INADDR_ANY)
+ v4nexthop = v4dst;
+
+ IN6_IPADDR_TO_V4MAPPED(v4nexthop, nexthop);
+ } else {
+ const in6_addr_t *v6nexthop;
+
+ v6nexthop = ip_pkt_source_route_v6(ipp);
+ if (v6nexthop == NULL)
+ v6nexthop = dst;
+
+ *nexthop = *v6nexthop;
+ }
+}
+
+/*
+ * Update the ip_xmit_attr_t based the addresses, conn_xmit_ipp and conn_ixa.
+ * If IPDF_IPSEC is set we cache the IPsec policy to handle the unconnected
+ * case (connected latching is done in conn_connect).
+ * Note that IPsec policy lookup requires conn_proto and conn_laddr to be
+ * set, but doesn't otherwise use the conn_t.
+ *
+ * Caller must set/clear IXAF_IS_IPV4 as appropriately.
+ * Caller must use ip_attr_nexthop() to determine the nexthop argument.
+ *
+ * The caller must NOT hold conn_lock (to avoid problems with ill_refrele
+ * causing the squeue to run doing ipcl_walk grabbing conn_lock.)
+ *
+ * Updates laddrp and uinfo if they are non-NULL.
+ *
+ * TSOL notes: The callers of ip_attr_connect must check if the destination
+ * is different than before and in that case redo conn_update_label.
+ * The callers of conn_connect do not need that since conn_connect
+ * performs the conn_update_label.
+ */
+int
+ip_attr_connect(const conn_t *connp, ip_xmit_attr_t *ixa,
+ const in6_addr_t *v6src, const in6_addr_t *v6dst,
+ const in6_addr_t *v6nexthop, in_port_t dstport, in6_addr_t *laddrp,
+ iulp_t *uinfo, uint32_t flags)
+{
+ in6_addr_t laddr = *v6src;
+ int error;
+
+ ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
+
+ if (connp->conn_zone_is_global)
+ flags |= IPDF_ZONE_IS_GLOBAL;
+ else
+ flags &= ~IPDF_ZONE_IS_GLOBAL;
+
+ /*
+ * Lookup the route to determine a source address and the uinfo.
+ * If the ULP has a source route option then the caller will
+ * have set v6nexthop to be the first hop.
+ */
+ if (ixa->ixa_flags & IXAF_IS_IPV4) {
+ ipaddr_t v4dst;
+ ipaddr_t v4src, v4nexthop;
+
+ IN6_V4MAPPED_TO_IPADDR(v6dst, v4dst);
+ IN6_V4MAPPED_TO_IPADDR(v6nexthop, v4nexthop);
+ IN6_V4MAPPED_TO_IPADDR(v6src, v4src);
+
+ if (connp->conn_unspec_src || v4src != INADDR_ANY)
+ flags &= ~IPDF_SELECT_SRC;
+ else
+ flags |= IPDF_SELECT_SRC;
+
+ error = ip_set_destination_v4(&v4src, v4dst, v4nexthop, ixa,
+ uinfo, flags, connp->conn_mac_mode);
+ IN6_IPADDR_TO_V4MAPPED(v4src, &laddr);
+ } else {
+ if (connp->conn_unspec_src || !IN6_IS_ADDR_UNSPECIFIED(v6src))
+ flags &= ~IPDF_SELECT_SRC;
+ else
+ flags |= IPDF_SELECT_SRC;
+
+ error = ip_set_destination_v6(&laddr, v6dst, v6nexthop, ixa,
+ uinfo, flags, connp->conn_mac_mode);
+ }
+ /* Pass out some address even if we hit a RTF_REJECT etc */
+ if (laddrp != NULL)
+ *laddrp = laddr;
+
+ if (error != 0)
+ return (error);
+
+ if (flags & IPDF_IPSEC) {
+ /*
+ * Set any IPsec policy in ixa. Routine also looks at ULP
+ * ports.
+ */
+ ipsec_cache_outbound_policy(connp, v6src, v6dst, dstport, ixa);
+ }
+ return (0);
+}
+
+/*
+ * Connect the conn based on the addresses, conn_xmit_ipp and conn_ixa.
+ * Assumes that conn_faddr and conn_fport are already set. As such it is not
+ * usable for SCTP, since SCTP has multiple faddrs.
+ *
+ * Caller must hold conn_lock to provide atomic consistency between the
+ * conn_t's addresses and the ixa.
+ * NOTE: this function drops and reacquires conn_lock since it can't be
+ * held across ip_attr_connect/ip_set_destination.
+ *
+ * The caller needs to handle inserting in the receive-side fanout when
+ * appropriate after conn_connect returns.
+ */
+int
+conn_connect(conn_t *connp, iulp_t *uinfo, uint32_t flags)
+{
+ ip_xmit_attr_t *ixa = connp->conn_ixa;
+ in6_addr_t nexthop;
+ in6_addr_t saddr, faddr;
+ in_port_t fport;
+ int error;
+
+ ASSERT(MUTEX_HELD(&connp->conn_lock));
+
+ if (connp->conn_ipversion == IPV4_VERSION)
+ ixa->ixa_flags |= IXAF_IS_IPV4;
+ else
+ ixa->ixa_flags &= ~IXAF_IS_IPV4;
+
+ /* We do IPsec latching below - hence no caching in ip_attr_connect */
+ flags &= ~IPDF_IPSEC;
+
+ /* In case we had previously done an ip_attr_connect */
+ ip_attr_newdst(ixa);
+
+ /*
+ * Determine the nexthop and copy the addresses before dropping
+ * conn_lock.
+ */
+ ip_attr_nexthop(&connp->conn_xmit_ipp, connp->conn_ixa,
+ &connp->conn_faddr_v6, &nexthop);
+ saddr = connp->conn_saddr_v6;
+ faddr = connp->conn_faddr_v6;
+ fport = connp->conn_fport;
+
+ mutex_exit(&connp->conn_lock);
+ error = ip_attr_connect(connp, ixa, &saddr, &faddr, &nexthop, fport,
+ &saddr, uinfo, flags | IPDF_VERIFY_DST);
+ mutex_enter(&connp->conn_lock);
+
+ /* Could have changed even if an error */
+ connp->conn_saddr_v6 = saddr;
+ if (error != 0)
+ return (error);
+
+ /*
+ * Check whether Trusted Solaris policy allows communication with this
+ * host, and pretend that the destination is unreachable if not.
+ * Compute any needed label and place it in ipp_label_v4/v6.
+ *
+ * Later conn_build_hdr_template() takes ipp_label_v4/v6 to form
+ * the packet.
+ *
+ * TSOL Note: Any concurrent threads would pick a different ixa
+ * (and ipp if they are to change the ipp) so we
+ * don't have to worry about concurrent threads.
+ */
+ if (is_system_labeled()) {
+ if (connp->conn_mlp_type != mlptSingle)
+ return (ECONNREFUSED);
+
+ /*
+ * conn_update_label will set ipp_label* which will later
+ * be used by conn_build_hdr_template.
+ */
+ error = conn_update_label(connp, ixa,
+ &connp->conn_faddr_v6, &connp->conn_xmit_ipp);
+ if (error != 0)
+ return (error);
+ }
+
+ /*
+ * Ensure that we match on the selected local address.
+ * This overrides conn_laddr in the case we had earlier bound to a
+ * multicast or broadcast address.
+ */
+ connp->conn_laddr_v6 = connp->conn_saddr_v6;
+
+ /*
+ * Allow setting new policies.
+ * The addresses/ports are already set, thus the IPsec policy calls
+ * can handle their passed-in conn's.
+ */
+ connp->conn_policy_cached = B_FALSE;
+
+ /*
+ * Cache IPsec policy in this conn. If we have per-socket policy,
+ * we'll cache that. If we don't, we'll inherit global policy.
+ *
+ * This is done before the caller inserts in the receive-side fanout.
+ * Note that conn_policy_cached is set by ipsec_conn_cache_policy() even
+ * for connections where we don't have a policy. This is to prevent
+ * global policy lookups in the inbound path.
+ *
+ * If we insert before we set conn_policy_cached,
+ * CONN_INBOUND_POLICY_PRESENT() check can still evaluate true
+ * because global policy could be non-empty. We normally call
+ * ipsec_check_policy() for conn_policy_cached connections only if
+ * conn_in_enforce_policy is set. But in this case,
+ * conn_policy_cached can get set anytime since we made the
+ * CONN_INBOUND_POLICY_PRESENT() check and ipsec_check_policy() is
+ * called, which will make the above assumption false. Thus, we
+ * need to insert after we set conn_policy_cached.
+ */
+ error = ipsec_conn_cache_policy(connp,
+ connp->conn_ipversion == IPV4_VERSION);
+ if (error != 0)
+ return (error);
+
+ /*
+ * We defer to do LSO check until here since now we have better idea
+ * whether IPsec is present. If the underlying ill is LSO capable,
+ * copy its capability in so the ULP can decide whether to enable LSO
+ * on this connection. So far, only TCP/IPv4 is implemented, so won't
+ * claim LSO for IPv6.
+ *
+ * Currently, won't enable LSO for IRE_LOOPBACK or IRE_LOCAL, because
+ * the receiver can not handle it. Also not to enable LSO for MULTIRT.
+ */
+ ixa->ixa_flags &= ~IXAF_LSO_CAPAB;
+
+ ASSERT(ixa->ixa_ire != NULL);
+ if (ixa->ixa_ipst->ips_ip_lso_outbound && (flags & IPDF_LSO) &&
+ !(ixa->ixa_flags & IXAF_IPSEC_SECURE) &&
+ !(ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) &&
+ !(ixa->ixa_ire->ire_flags & RTF_MULTIRT) &&
+ (ixa->ixa_nce != NULL) &&
+ ((ixa->ixa_flags & IXAF_IS_IPV4) ?
+ ILL_LSO_TCP_IPV4_USABLE(ixa->ixa_nce->nce_ill) :
+ ILL_LSO_TCP_IPV6_USABLE(ixa->ixa_nce->nce_ill))) {
+ ixa->ixa_lso_capab = *ixa->ixa_nce->nce_ill->ill_lso_capab;
+ ixa->ixa_flags |= IXAF_LSO_CAPAB;
+ }
+
+ /* Check whether ZEROCOPY capability is usable for this connection. */
+ ixa->ixa_flags &= ~IXAF_ZCOPY_CAPAB;
+
+ if ((flags & IPDF_ZCOPY) &&
+ !(ixa->ixa_flags & IXAF_IPSEC_SECURE) &&
+ !(ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) &&
+ !(ixa->ixa_ire->ire_flags & RTF_MULTIRT) &&
+ (ixa->ixa_nce != NULL) &&
+ ILL_ZCOPY_USABLE(ixa->ixa_nce->nce_ill)) {
+ ixa->ixa_flags |= IXAF_ZCOPY_CAPAB;
+ }
+ return (0);
+}
+
+/*
+ * Predicates to check if the addresses match conn_last*
+ */
+
+/*
+ * Compare the conn against an address.
+ * If using mapped addresses on AF_INET6 sockets, use the _v6 function
+ */
+boolean_t
+conn_same_as_last_v4(conn_t *connp, sin_t *sin)
+{
+ ASSERT(connp->conn_family == AF_INET);
+ return (sin->sin_addr.s_addr == connp->conn_v4lastdst &&
+ sin->sin_port == connp->conn_lastdstport);
+}
+
+/*
+ * Compare, including for mapped addresses
+ */
+boolean_t
+conn_same_as_last_v6(conn_t *connp, sin6_t *sin6)
+{
+ return (IN6_ARE_ADDR_EQUAL(&connp->conn_v6lastdst, &sin6->sin6_addr) &&
+ sin6->sin6_port == connp->conn_lastdstport &&
+ sin6->sin6_flowinfo == connp->conn_lastflowinfo &&
+ sin6->sin6_scope_id == connp->conn_lastscopeid);
+}
+
+/*
+ * Compute a label and place it in the ip_packet_t.
+ * Handles IPv4 and IPv6.
+ * The caller should have a correct ixa_tsl and ixa_zoneid and have
+ * already called conn_connect or ip_attr_connect to ensure that tsol_check_dest
+ * has been called.
+ */
+int
+conn_update_label(const conn_t *connp, const ip_xmit_attr_t *ixa,
+ const in6_addr_t *v6dst, ip_pkt_t *ipp)
+{
+ int err;
+ ipaddr_t v4dst;
+
+ if (IN6_IS_ADDR_V4MAPPED(v6dst)) {
+ uchar_t opt_storage[IP_MAX_OPT_LENGTH];
+
+ IN6_V4MAPPED_TO_IPADDR(v6dst, v4dst);
+
+ err = tsol_compute_label_v4(ixa->ixa_tsl, ixa->ixa_zoneid,
+ v4dst, opt_storage, ixa->ixa_ipst);
+ if (err == 0) {
+ /* Length contained in opt_storage[IPOPT_OLEN] */
+ err = optcom_pkt_set(opt_storage,
+ opt_storage[IPOPT_OLEN],
+ (uchar_t **)&ipp->ipp_label_v4,
+ &ipp->ipp_label_len_v4);
+ }
+ if (err != 0) {
+ DTRACE_PROBE4(tx__ip__log__info__updatelabel,
+ char *, "conn(1) failed to update options(2) "
+ "on ixa(3)",
+ conn_t *, connp, char *, opt_storage,
+ ip_xmit_attr_t *, ixa);
+ }
+ if (ipp->ipp_label_len_v4 != 0)
+ ipp->ipp_fields |= IPPF_LABEL_V4;
+ else
+ ipp->ipp_fields &= ~IPPF_LABEL_V4;
+ } else {
+ uchar_t opt_storage[TSOL_MAX_IPV6_OPTION];
+ uint_t optlen;
+
+ err = tsol_compute_label_v6(ixa->ixa_tsl, ixa->ixa_zoneid,
+ v6dst, opt_storage, ixa->ixa_ipst);
+ if (err == 0) {
+ /*
+ * Note that ipp_label_v6 is just the option - not
+ * the hopopts extension header.
+ *
+ * Length contained in opt_storage[IPOPT_OLEN], but
+ * that doesn't include the two byte options header.
+ */
+ optlen = opt_storage[IPOPT_OLEN];
+ if (optlen != 0)
+ optlen += 2;
+
+ err = optcom_pkt_set(opt_storage, optlen,
+ (uchar_t **)&ipp->ipp_label_v6,
+ &ipp->ipp_label_len_v6);
+ }
+ if (err != 0) {
+ DTRACE_PROBE4(tx__ip__log__info__updatelabel,
+ char *, "conn(1) failed to update options(2) "
+ "on ixa(3)",
+ conn_t *, connp, char *, opt_storage,
+ ip_xmit_attr_t *, ixa);
+ }
+ if (ipp->ipp_label_len_v6 != 0)
+ ipp->ipp_fields |= IPPF_LABEL_V6;
+ else
+ ipp->ipp_fields &= ~IPPF_LABEL_V6;
+ }
+ return (err);
+}
+
+/*
+ * Inherit all options settings from the parent/listener to the eager.
+ * Returns zero on success; ENOMEM if memory allocation failed.
+ *
+ * We assume that the eager has not had any work done i.e., the conn_ixa
+ * and conn_xmit_ipp are all zero.
+ * Furthermore we assume that no other thread can access the eager (because
+ * it isn't inserted in any fanout list).
+ */
+int
+conn_inherit_parent(conn_t *lconnp, conn_t *econnp)
+{
+ cred_t *credp;
+ int err;
+ void *notify_cookie;
+
+ econnp->conn_family = lconnp->conn_family;
+ econnp->conn_ipv6_v6only = lconnp->conn_ipv6_v6only;
+ econnp->conn_wq = lconnp->conn_wq;
+ econnp->conn_rq = lconnp->conn_rq;
+
+ /*
+ * Make a safe copy of the transmit attributes.
+ * conn_connect will later be used by the caller to setup the ire etc.
+ */
+ ASSERT(econnp->conn_ixa->ixa_refcnt == 1);
+ ASSERT(econnp->conn_ixa->ixa_ire == NULL);
+ ASSERT(econnp->conn_ixa->ixa_dce == NULL);
+ ASSERT(econnp->conn_ixa->ixa_nce == NULL);
+
+ /* Preserve ixa_notify_cookie */
+ notify_cookie = econnp->conn_ixa->ixa_notify_cookie;
+ ixa_safe_copy(lconnp->conn_ixa, econnp->conn_ixa);
+ econnp->conn_ixa->ixa_notify_cookie = notify_cookie;
+
+ econnp->conn_bound_if = lconnp->conn_bound_if;
+ econnp->conn_incoming_ifindex = lconnp->conn_incoming_ifindex;
+
+ /* Inherit all RECV options */
+ econnp->conn_recv_ancillary = lconnp->conn_recv_ancillary;
+
+ err = ip_pkt_copy(&lconnp->conn_xmit_ipp, &econnp->conn_xmit_ipp,
+ KM_NOSLEEP);
+ if (err != 0)
+ return (err);
+
+ econnp->conn_zoneid = lconnp->conn_zoneid;
+ econnp->conn_allzones = lconnp->conn_allzones;
+
+ /* This is odd. Pick a flowlabel for each connection instead? */
+ econnp->conn_flowinfo = lconnp->conn_flowinfo;
+
+ econnp->conn_default_ttl = lconnp->conn_default_ttl;
+
+ /*
+ * TSOL: tsol_input_proc() needs the eager's cred before the
+ * eager is accepted
+ */
+ ASSERT(lconnp->conn_cred != NULL);
+ econnp->conn_cred = credp = lconnp->conn_cred;
+ crhold(credp);
+ econnp->conn_cpid = lconnp->conn_cpid;
+ econnp->conn_open_time = lbolt64;
+
+ /*
+ * Cache things in the ixa without any refhold.
+ * Listener might not have set up ixa_cred
+ */
+ econnp->conn_ixa->ixa_cred = econnp->conn_cred;
+ econnp->conn_ixa->ixa_cpid = econnp->conn_cpid;
+ if (is_system_labeled())
+ econnp->conn_ixa->ixa_tsl = crgetlabel(econnp->conn_cred);
+
+ /*
+ * If the caller has the process-wide flag set, then default to MAC
+ * exempt mode. This allows read-down to unlabeled hosts.
+ */
+ if (getpflags(NET_MAC_AWARE, credp) != 0)
+ econnp->conn_mac_mode = CONN_MAC_AWARE;
+
+ econnp->conn_zone_is_global = lconnp->conn_zone_is_global;
+
+ /*
+ * We eliminate the need for sockfs to send down a T_SVR4_OPTMGMT_REQ
+ * via soaccept()->soinheritoptions() which essentially applies
+ * all the listener options to the new connection. The options that we
+ * need to take care of are:
+ * SO_DEBUG, SO_REUSEADDR, SO_KEEPALIVE, SO_DONTROUTE, SO_BROADCAST,
+ * SO_USELOOPBACK, SO_OOBINLINE, SO_DGRAM_ERRIND, SO_LINGER,
+ * SO_SNDBUF, SO_RCVBUF.
+ *
+ * SO_RCVBUF: conn_rcvbuf is set.
+ * SO_SNDBUF: conn_sndbuf is set.
+ */
+
+ econnp->conn_sndbuf = lconnp->conn_sndbuf;
+ econnp->conn_rcvbuf = lconnp->conn_rcvbuf;
+ econnp->conn_sndlowat = lconnp->conn_sndlowat;
+ econnp->conn_rcvlowat = lconnp->conn_rcvlowat;
+ econnp->conn_dgram_errind = lconnp->conn_dgram_errind;
+ econnp->conn_oobinline = lconnp->conn_oobinline;
+ econnp->conn_debug = lconnp->conn_debug;
+ econnp->conn_keepalive = lconnp->conn_keepalive;
+ econnp->conn_linger = lconnp->conn_linger;
+ econnp->conn_lingertime = lconnp->conn_lingertime;
+
+ /* Set the IP options */
+ econnp->conn_broadcast = lconnp->conn_broadcast;
+ econnp->conn_useloopback = lconnp->conn_useloopback;
+ econnp->conn_reuseaddr = lconnp->conn_reuseaddr;
+ return (0);
+}
diff --git a/usr/src/uts/common/inet/ip/icmp.c b/usr/src/uts/common/inet/ip/icmp.c
index 7f6d4b621f..8222c866d0 100644
--- a/usr/src/uts/common/inet/ip/icmp.c
+++ b/usr/src/uts/common/inet/ip/icmp.c
@@ -35,65 +35,58 @@
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/strsubr.h>
+#include <sys/suntpi.h>
+#include <sys/xti_inet.h>
#include <sys/cmn_err.h>
-#include <sys/debug.h>
#include <sys/kmem.h>
+#include <sys/cred_impl.h>
#include <sys/policy.h>
#include <sys/priv.h>
+#include <sys/ucred.h>
#include <sys/zone.h>
-#include <sys/time.h>
#include <sys/sockio.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
+#include <sys/vtrace.h>
+#include <sys/sdt.h>
+#include <sys/debug.h>
#include <sys/isa_defs.h>
-#include <sys/suntpi.h>
-#include <sys/xti_inet.h>
-#include <sys/netstack.h>
-
-#include <net/route.h>
-#include <net/if.h>
-
+#include <sys/random.h>
#include <netinet/in.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
+#include <netinet/udp.h>
+
#include <inet/common.h>
#include <inet/ip.h>
+#include <inet/ip_impl.h>
+#include <inet/ipsec_impl.h>
#include <inet/ip6.h>
+#include <inet/ip_ire.h>
+#include <inet/ip_if.h>
+#include <inet/ip_multi.h>
+#include <inet/ip_ndp.h>
#include <inet/proto_set.h>
+#include <inet/mib2.h>
#include <inet/nd.h>
#include <inet/optcom.h>
#include <inet/snmpcom.h>
#include <inet/kstatcom.h>
-#include <inet/rawip_impl.h>
-
-#include <netinet/ip_mroute.h>
-#include <inet/tcp.h>
-#include <net/pfkeyv2.h>
-#include <inet/ipsec_info.h>
#include <inet/ipclassifier.h>
#include <sys/tsol/label.h>
#include <sys/tsol/tnet.h>
-#include <inet/ip_ire.h>
-#include <inet/ip_if.h>
+#include <inet/rawip_impl.h>
-#include <inet/ip_impl.h>
#include <sys/disp.h>
/*
* Synchronization notes:
*
- * RAWIP is MT and uses the usual kernel synchronization primitives. There is
- * locks, which is icmp_rwlock. We also use conn_lock when updating things
- * which affect the IP classifier lookup.
- * The lock order is icmp_rwlock -> conn_lock.
- *
- * The icmp_rwlock:
- * This protects most of the other fields in the icmp_t. The exact list of
- * fields which are protected by each of the above locks is documented in
- * the icmp_t structure definition.
+ * RAWIP is MT and uses the usual kernel synchronization primitives. We use
+ * conn_lock to protect the icmp_t.
*
* Plumbing notes:
* ICMP is always a device driver. For compatibility with mibopen() code
@@ -103,27 +96,29 @@
static void icmp_addr_req(queue_t *q, mblk_t *mp);
static void icmp_tpi_bind(queue_t *q, mblk_t *mp);
-static int icmp_bind_proto(conn_t *connp);
-static int icmp_build_hdrs(icmp_t *icmp);
+static void icmp_bind_proto(icmp_t *icmp);
+static int icmp_build_hdr_template(conn_t *, const in6_addr_t *,
+ const in6_addr_t *, uint32_t);
static void icmp_capability_req(queue_t *q, mblk_t *mp);
static int icmp_close(queue_t *q, int flags);
+static void icmp_close_free(conn_t *);
static void icmp_tpi_connect(queue_t *q, mblk_t *mp);
static void icmp_tpi_disconnect(queue_t *q, mblk_t *mp);
static void icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error,
- int sys_error);
+ int sys_error);
static void icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive,
- t_scalar_t t_error, int sys_error);
-static void icmp_icmp_error(conn_t *connp, mblk_t *mp);
-static void icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp);
+ t_scalar_t tlierr, int sys_error);
+static void icmp_icmp_input(void *arg1, mblk_t *mp, void *arg2,
+ ip_recv_attr_t *);
+static void icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp,
+ ip_recv_attr_t *);
static void icmp_info_req(queue_t *q, mblk_t *mp);
-static void icmp_input(void *, mblk_t *, void *);
+static void icmp_input(void *, mblk_t *, void *, ip_recv_attr_t *);
static conn_t *icmp_open(int family, cred_t *credp, int *err, int flags);
static int icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag,
cred_t *credp);
static int icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag,
cred_t *credp);
-static int icmp_unitdata_opt_process(queue_t *q, mblk_t *mp,
- int *errorp, void *thisdg_attrs);
static boolean_t icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name);
int icmp_opt_set(conn_t *connp, uint_t optset_context,
int level, int name, uint_t inlen,
@@ -131,25 +126,26 @@ int icmp_opt_set(conn_t *connp, uint_t optset_context,
void *thisdg_attrs, cred_t *cr);
int icmp_opt_get(conn_t *connp, int level, int name,
uchar_t *ptr);
+static int icmp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin,
+ sin6_t *sin6, cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa);
static int icmp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr);
static boolean_t icmp_param_register(IDP *ndp, icmpparam_t *icmppa, int cnt);
static int icmp_param_set(queue_t *q, mblk_t *mp, char *value,
caddr_t cp, cred_t *cr);
+static mblk_t *icmp_prepend_hdr(conn_t *, ip_xmit_attr_t *, const ip_pkt_t *,
+ const in6_addr_t *, const in6_addr_t *, uint32_t, mblk_t *, int *);
+static mblk_t *icmp_prepend_header_template(conn_t *, ip_xmit_attr_t *,
+ mblk_t *, const in6_addr_t *, uint32_t, int *);
static int icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name,
uchar_t *ptr, int len);
static void icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err);
static void icmp_tpi_unbind(queue_t *q, mblk_t *mp);
-static int icmp_update_label(icmp_t *icmp, mblk_t *mp, ipaddr_t dst);
static void icmp_wput(queue_t *q, mblk_t *mp);
static void icmp_wput_fallback(queue_t *q, mblk_t *mp);
-static int raw_ip_send_data_v6(queue_t *q, conn_t *connp, mblk_t *mp,
- sin6_t *sin6, ip6_pkt_t *ipp);
-static int raw_ip_send_data_v4(queue_t *q, conn_t *connp, mblk_t *mp,
- ipaddr_t v4dst, ip4_pkt_t *pktinfop);
static void icmp_wput_other(queue_t *q, mblk_t *mp);
static void icmp_wput_iocdata(queue_t *q, mblk_t *mp);
static void icmp_wput_restricted(queue_t *q, mblk_t *mp);
-static void icmp_ulp_recv(conn_t *, mblk_t *);
+static void icmp_ulp_recv(conn_t *, mblk_t *, uint_t);
static void *rawip_stack_init(netstackid_t stackid, netstack_t *ns);
static void rawip_stack_fini(netstackid_t stackid, void *arg);
@@ -158,10 +154,14 @@ static void *rawip_kstat_init(netstackid_t stackid);
static void rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp);
static int rawip_kstat_update(kstat_t *kp, int rw);
static void rawip_stack_shutdown(netstackid_t stackid, void *arg);
-static int rawip_do_getsockname(icmp_t *icmp, struct sockaddr *sa,
- uint_t *salenp);
-static int rawip_do_getpeername(icmp_t *icmp, struct sockaddr *sa,
- uint_t *salenp);
+
+/* Common routines for TPI and socket module */
+static conn_t *rawip_do_open(int, cred_t *, int *, int);
+static void rawip_do_close(conn_t *);
+static int rawip_do_bind(conn_t *, struct sockaddr *, socklen_t);
+static int rawip_do_unbind(conn_t *);
+static int rawip_do_connect(conn_t *, const struct sockaddr *, socklen_t,
+ cred_t *, pid_t);
int rawip_getsockname(sock_lower_handle_t, struct sockaddr *,
socklen_t *, cred_t *);
@@ -185,7 +185,7 @@ static struct qinit icmprinitv6 = {
};
static struct qinit icmpwinit = {
- (pfi_t)icmp_wput, NULL, NULL, NULL, NULL, &icmp_mod_info
+ (pfi_t)icmp_wput, (pfi_t)ip_wsrv, NULL, NULL, NULL, &icmp_mod_info
};
/* ICMP entry point during fallback */
@@ -236,6 +236,8 @@ static icmpparam_t icmp_param_arr[] = {
{ 0, 65536, 1024, "icmp_xmit_lowat"},
{ 4096, 65536, 8192, "icmp_recv_hiwat"},
{ 65536, 1024*1024*1024, 256*1024, "icmp_max_buf"},
+ { 0, 1, 0, "icmp_pmtu_discovery" },
+ { 0, 1, 0, "icmp_sendto_ignerr" },
};
#define is_wroff_extra is_param_arr[0].icmp_param_value
#define is_ipv4_ttl is_param_arr[1].icmp_param_value
@@ -245,18 +247,17 @@ static icmpparam_t icmp_param_arr[] = {
#define is_xmit_lowat is_param_arr[5].icmp_param_value
#define is_recv_hiwat is_param_arr[6].icmp_param_value
#define is_max_buf is_param_arr[7].icmp_param_value
+#define is_pmtu_discovery is_param_arr[8].icmp_param_value
+#define is_sendto_ignerr is_param_arr[9].icmp_param_value
-static int rawip_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len);
-static int rawip_do_connect(conn_t *connp, const struct sockaddr *sa,
- socklen_t len, cred_t *cr);
-static void rawip_post_ip_bind_connect(icmp_t *icmp, mblk_t *ire_mp, int error);
+typedef union T_primitives *t_primp_t;
/*
* This routine is called to handle each O_T_BIND_REQ/T_BIND_REQ message
* passed to icmp_wput.
- * The O_T_BIND_REQ/T_BIND_REQ is passed downstream to ip with the ICMP
- * protocol type placed in the message following the address. A T_BIND_ACK
- * message is returned by ip_bind_v4/v6.
+ * It calls IP to verify the local IP address, and calls IP to insert
+ * the conn_t in the fanout table.
+ * If everything is ok it then sends the T_BIND_ACK back up.
*/
static void
icmp_tpi_bind(queue_t *q, mblk_t *mp)
@@ -297,17 +298,17 @@ icmp_tpi_bind(queue_t *q, mblk_t *mp)
if (icmp->icmp_state != TS_UNBND) {
(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
- "icmp_bind: bad state, %d", icmp->icmp_state);
+ "icmp_bind: bad state, %u", icmp->icmp_state);
icmp_err_ack(q, mp, TOUTSTATE, 0);
return;
}
/*
* Reallocate the message to make sure we have enough room for an
- * address and the protocol type.
+ * address.
*/
- mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t) + 1, 1);
- if (!mp1) {
+ mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1);
+ if (mp1 == NULL) {
icmp_err_ack(q, mp, TSYSERR, ENOMEM);
return;
}
@@ -320,7 +321,7 @@ icmp_tpi_bind(queue_t *q, mblk_t *mp)
switch (len) {
case 0: /* request for a generic port */
tbr->ADDR_offset = sizeof (struct T_bind_req);
- if (icmp->icmp_family == AF_INET) {
+ if (connp->conn_family == AF_INET) {
tbr->ADDR_length = sizeof (sin_t);
sin = (sin_t *)&tbr[1];
*sin = sin_null;
@@ -329,7 +330,7 @@ icmp_tpi_bind(queue_t *q, mblk_t *mp)
sa = (struct sockaddr *)sin;
len = sizeof (sin_t);
} else {
- ASSERT(icmp->icmp_family == AF_INET6);
+ ASSERT(connp->conn_family == AF_INET6);
tbr->ADDR_length = sizeof (sin6_t);
sin6 = (sin6_t *)&tbr[1];
*sin6 = sin6_null;
@@ -352,14 +353,12 @@ icmp_tpi_bind(queue_t *q, mblk_t *mp)
default:
(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
- "icmp_bind: bad ADDR_length %d", tbr->ADDR_length);
+ "icmp_bind: bad ADDR_length %u", tbr->ADDR_length);
icmp_err_ack(q, mp, TBADADDR, 0);
return;
}
error = rawip_do_bind(connp, sa, len);
-done:
- ASSERT(mp->b_cont == NULL);
if (error != 0) {
if (error > 0) {
icmp_err_ack(q, mp, TSYSERR, error);
@@ -377,225 +376,208 @@ rawip_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len)
{
sin_t *sin;
sin6_t *sin6;
- icmp_t *icmp;
+ icmp_t *icmp = connp->conn_icmp;
int error = 0;
- mblk_t *ire_mp;
-
-
- icmp = connp->conn_icmp;
+ ip_laddr_t laddr_type = IPVL_UNICAST_UP; /* INADDR_ANY */
+ in_port_t lport; /* Network byte order */
+ ipaddr_t v4src; /* Set if AF_INET */
+ in6_addr_t v6src;
+ uint_t scopeid = 0;
+ zoneid_t zoneid = IPCL_ZONEID(connp);
+ ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
if (sa == NULL || !OK_32PTR((char *)sa)) {
return (EINVAL);
}
- /*
- * The state must be TS_UNBND. TPI mandates that users must send
- * TPI primitives only 1 at a time and wait for the response before
- * sending the next primitive.
- */
- rw_enter(&icmp->icmp_rwlock, RW_WRITER);
- if (icmp->icmp_state != TS_UNBND || icmp->icmp_pending_op != -1) {
- error = -TOUTSTATE;
- goto done;
- }
-
- ASSERT(len != 0);
switch (len) {
case sizeof (sin_t): /* Complete IPv4 address */
sin = (sin_t *)sa;
if (sin->sin_family != AF_INET ||
- icmp->icmp_family != AF_INET) {
+ connp->conn_family != AF_INET) {
/* TSYSERR, EAFNOSUPPORT */
- error = EAFNOSUPPORT;
- goto done;
+ return (EAFNOSUPPORT);
}
+ v4src = sin->sin_addr.s_addr;
+ IN6_IPADDR_TO_V4MAPPED(v4src, &v6src);
+ if (v4src != INADDR_ANY) {
+ laddr_type = ip_laddr_verify_v4(v4src, zoneid, ipst,
+ B_TRUE);
+ }
+ lport = sin->sin_port;
break;
case sizeof (sin6_t): /* Complete IPv6 address */
sin6 = (sin6_t *)sa;
if (sin6->sin6_family != AF_INET6 ||
- icmp->icmp_family != AF_INET6) {
+ connp->conn_family != AF_INET6) {
/* TSYSERR, EAFNOSUPPORT */
- error = EAFNOSUPPORT;
- goto done;
+ return (EAFNOSUPPORT);
}
/* No support for mapped addresses on raw sockets */
if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
/* TSYSERR, EADDRNOTAVAIL */
- error = EADDRNOTAVAIL;
- goto done;
+ return (EADDRNOTAVAIL);
}
+ v6src = sin6->sin6_addr;
+ if (!IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
+ if (IN6_IS_ADDR_LINKSCOPE(&v6src))
+ scopeid = sin6->sin6_scope_id;
+ laddr_type = ip_laddr_verify_v6(&v6src, zoneid, ipst,
+ B_TRUE, scopeid);
+ }
+ lport = sin6->sin6_port;
break;
default:
/* TBADADDR */
- error = EADDRNOTAVAIL;
- goto done;
+ return (EADDRNOTAVAIL);
}
- icmp->icmp_pending_op = T_BIND_REQ;
- icmp->icmp_state = TS_IDLE;
+ /* Is the local address a valid unicast, multicast, or broadcast? */
+ if (laddr_type == IPVL_BAD)
+ return (EADDRNOTAVAIL);
+
+ /*
+ * The state must be TS_UNBND.
+ */
+ mutex_enter(&connp->conn_lock);
+ if (icmp->icmp_state != TS_UNBND) {
+ mutex_exit(&connp->conn_lock);
+ return (-TOUTSTATE);
+ }
/*
* Copy the source address into our icmp structure. This address
* may still be zero; if so, ip will fill in the correct address
* each time an outbound packet is passed to it.
* If we are binding to a broadcast or multicast address then
- * rawip_post_ip_bind_connect will clear the source address.
+ * we just set the conn_bound_addr since we don't want to use
+ * that as the source address when sending.
*/
-
- if (icmp->icmp_family == AF_INET) {
- ASSERT(sin != NULL);
- ASSERT(icmp->icmp_ipversion == IPV4_VERSION);
- IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr,
- &icmp->icmp_v6src);
- icmp->icmp_max_hdr_len = IP_SIMPLE_HDR_LENGTH +
- icmp->icmp_ip_snd_options_len;
- icmp->icmp_bound_v6src = icmp->icmp_v6src;
+ connp->conn_bound_addr_v6 = v6src;
+ connp->conn_laddr_v6 = v6src;
+ if (scopeid != 0) {
+ connp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET;
+ connp->conn_ixa->ixa_scopeid = scopeid;
+ connp->conn_incoming_ifindex = scopeid;
} else {
- int error;
-
- ASSERT(sin6 != NULL);
- ASSERT(icmp->icmp_ipversion == IPV6_VERSION);
- icmp->icmp_v6src = sin6->sin6_addr;
- icmp->icmp_max_hdr_len = icmp->icmp_sticky_hdrs_len;
- icmp->icmp_bound_v6src = icmp->icmp_v6src;
-
- /* Rebuild the header template */
- error = icmp_build_hdrs(icmp);
- if (error != 0) {
- icmp->icmp_pending_op = -1;
- /*
- * TSYSERR
- */
- goto done;
- }
+ connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
+ connp->conn_incoming_ifindex = connp->conn_bound_if;
}
- ire_mp = NULL;
- if (!(V6_OR_V4_INADDR_ANY(icmp->icmp_v6src))) {
- /*
- * request an IRE if src not 0 (INADDR_ANY)
- */
- ire_mp = allocb(sizeof (ire_t), BPRI_HI);
- if (ire_mp == NULL) {
- icmp->icmp_pending_op = -1;
- error = ENOMEM;
- goto done;
- }
- DB_TYPE(ire_mp) = IRE_DB_REQ_TYPE;
+ switch (laddr_type) {
+ case IPVL_UNICAST_UP:
+ case IPVL_UNICAST_DOWN:
+ connp->conn_saddr_v6 = v6src;
+ connp->conn_mcbc_bind = B_FALSE;
+ break;
+ case IPVL_MCAST:
+ case IPVL_BCAST:
+ /* ip_set_destination will pick a source address later */
+ connp->conn_saddr_v6 = ipv6_all_zeros;
+ connp->conn_mcbc_bind = B_TRUE;
+ break;
}
-done:
- rw_exit(&icmp->icmp_rwlock);
- if (error != 0)
- return (error);
- if (icmp->icmp_family == AF_INET6) {
- error = ip_proto_bind_laddr_v6(connp, &ire_mp, icmp->icmp_proto,
- &sin6->sin6_addr, sin6->sin6_port, B_TRUE);
+ /* Any errors after this point should use late_error */
+
+ /*
+ * Use sin_port/sin6_port since applications like psh use SOCK_RAW
+ * with IPPROTO_TCP.
+ */
+ connp->conn_lport = lport;
+ connp->conn_fport = 0;
+
+ if (connp->conn_family == AF_INET) {
+ ASSERT(connp->conn_ipversion == IPV4_VERSION);
} else {
- error = ip_proto_bind_laddr_v4(connp, &ire_mp, icmp->icmp_proto,
- sin->sin_addr.s_addr, sin->sin_port, B_TRUE);
+ ASSERT(connp->conn_ipversion == IPV6_VERSION);
}
- rawip_post_ip_bind_connect(icmp, ire_mp, error);
- return (error);
-}
-static void
-rawip_post_ip_bind_connect(icmp_t *icmp, mblk_t *ire_mp, int error)
-{
- rw_enter(&icmp->icmp_rwlock, RW_WRITER);
- if (icmp->icmp_state == TS_UNBND) {
- /*
- * not yet bound - bind sent by icmp_bind_proto.
- */
- rw_exit(&icmp->icmp_rwlock);
- return;
- }
- ASSERT(icmp->icmp_pending_op != -1);
- icmp->icmp_pending_op = -1;
+ icmp->icmp_state = TS_IDLE;
+ /*
+ * We create an initial header template here to make a subsequent
+ * sendto have a starting point. Since conn_last_dst is zero the
+ * first sendto will always follow the 'dst changed' code path.
+ * Note that we defer massaging options and the related checksum
+ * adjustment until we have a destination address.
+ */
+ error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
+ &connp->conn_faddr_v6, connp->conn_flowinfo);
if (error != 0) {
- if (icmp->icmp_state == TS_DATA_XFER) {
- /* Connect failed */
- /* Revert back to the bound source */
- icmp->icmp_v6src = icmp->icmp_bound_v6src;
- icmp->icmp_state = TS_IDLE;
- if (icmp->icmp_family == AF_INET6)
- (void) icmp_build_hdrs(icmp);
- } else {
- V6_SET_ZERO(icmp->icmp_v6src);
- V6_SET_ZERO(icmp->icmp_bound_v6src);
- icmp->icmp_state = TS_UNBND;
- if (icmp->icmp_family == AF_INET6)
- (void) icmp_build_hdrs(icmp);
- }
- } else {
- if (ire_mp != NULL && ire_mp->b_datap->db_type == IRE_DB_TYPE) {
- ire_t *ire;
-
- ire = (ire_t *)ire_mp->b_rptr;
- /*
- * If a broadcast/multicast address was bound set
- * the source address to 0.
- * This ensures no datagrams with broadcast address
- * as source address are emitted (which would violate
- * RFC1122 - Hosts requirements)
- * Note: we get IRE_BROADCAST for IPv6
- * to "mark" a multicast local address.
- */
+ mutex_exit(&connp->conn_lock);
+ goto late_error;
+ }
+ /* Just in case */
+ connp->conn_faddr_v6 = ipv6_all_zeros;
+ connp->conn_v6lastdst = ipv6_all_zeros;
+ mutex_exit(&connp->conn_lock);
+ error = ip_laddr_fanout_insert(connp);
+ if (error != 0)
+ goto late_error;
- if (ire->ire_type == IRE_BROADCAST &&
- icmp->icmp_state != TS_DATA_XFER) {
- /*
- * This was just a local bind to a
- * MC/broadcast addr
- */
- V6_SET_ZERO(icmp->icmp_v6src);
- if (icmp->icmp_family == AF_INET6)
- (void) icmp_build_hdrs(icmp);
- }
- }
+ /* Bind succeeded */
+ return (0);
+late_error:
+ mutex_enter(&connp->conn_lock);
+ connp->conn_saddr_v6 = ipv6_all_zeros;
+ connp->conn_bound_addr_v6 = ipv6_all_zeros;
+ connp->conn_laddr_v6 = ipv6_all_zeros;
+ if (scopeid != 0) {
+ connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
+ connp->conn_incoming_ifindex = connp->conn_bound_if;
}
- rw_exit(&icmp->icmp_rwlock);
- if (ire_mp != NULL)
- freeb(ire_mp);
+ icmp->icmp_state = TS_UNBND;
+ connp->conn_v6lastdst = ipv6_all_zeros;
+ connp->conn_lport = 0;
+
+ /* Restore the header that was built above - different source address */
+ (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
+ &connp->conn_faddr_v6, connp->conn_flowinfo);
+ mutex_exit(&connp->conn_lock);
+ return (error);
}
/*
- * Send message to IP to just bind to the protocol.
+ * Tell IP to just bind to the protocol.
*/
-static int
-icmp_bind_proto(conn_t *connp)
+static void
+icmp_bind_proto(icmp_t *icmp)
{
- icmp_t *icmp;
- int error;
-
- icmp = connp->conn_icmp;
+ conn_t *connp = icmp->icmp_connp;
- if (icmp->icmp_family == AF_INET6)
- error = ip_proto_bind_laddr_v6(connp, NULL, icmp->icmp_proto,
- &sin6_null.sin6_addr, 0, B_TRUE);
- else
- error = ip_proto_bind_laddr_v4(connp, NULL, icmp->icmp_proto,
- sin_null.sin_addr.s_addr, 0, B_TRUE);
+ mutex_enter(&connp->conn_lock);
+ connp->conn_saddr_v6 = ipv6_all_zeros;
+ connp->conn_laddr_v6 = ipv6_all_zeros;
+ connp->conn_faddr_v6 = ipv6_all_zeros;
+ connp->conn_v6lastdst = ipv6_all_zeros;
+ mutex_exit(&connp->conn_lock);
- rawip_post_ip_bind_connect(icmp, NULL, error);
- return (error);
+ (void) ip_laddr_fanout_insert(connp);
}
+/*
+ * This routine handles each T_CONN_REQ message passed to icmp. It
+ * associates a default destination address with the stream.
+ *
+ * After various error checks are completed, icmp_connect() lays
+ * the target address and port into the composite header template.
+ * Then we ask IP for information, including a source address if we didn't
+ * already have one. Finally we send up the T_OK_ACK reply message.
+ */
static void
icmp_tpi_connect(queue_t *q, mblk_t *mp)
{
conn_t *connp = Q_TO_CONN(q);
struct T_conn_req *tcr;
- icmp_t *icmp;
struct sockaddr *sa;
socklen_t len;
int error;
cred_t *cr;
-
+ pid_t pid;
/*
* All Solaris components should pass a db_credp
* for this TPI message, hence we ASSERT.
@@ -603,14 +585,13 @@ icmp_tpi_connect(queue_t *q, mblk_t *mp)
* like a TPI message sent by some other kernel
* component, we check and return an error.
*/
- cr = msg_getcred(mp, NULL);
+ cr = msg_getcred(mp, &pid);
ASSERT(cr != NULL);
if (cr == NULL) {
icmp_err_ack(q, mp, TSYSERR, EINVAL);
return;
}
- icmp = connp->conn_icmp;
tcr = (struct T_conn_req *)mp->b_rptr;
/* Sanity checks */
if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_req)) {
@@ -639,13 +620,13 @@ icmp_tpi_connect(queue_t *q, mblk_t *mp)
break;
}
- error = proto_verify_ip_addr(icmp->icmp_family, sa, len);
+ error = proto_verify_ip_addr(connp->conn_family, sa, len);
if (error != 0) {
icmp_err_ack(q, mp, TSYSERR, error);
return;
}
- error = rawip_do_connect(connp, sa, len, cr);
+ error = rawip_do_connect(connp, sa, len, cr, pid);
if (error != 0) {
if (error < 0) {
icmp_err_ack(q, mp, -error, 0);
@@ -659,11 +640,11 @@ icmp_tpi_connect(queue_t *q, mblk_t *mp)
* We have to send a connection confirmation to
* keep TLI happy.
*/
- if (icmp->icmp_family == AF_INET) {
+ if (connp->conn_family == AF_INET) {
mp1 = mi_tpi_conn_con(NULL, (char *)sa,
sizeof (sin_t), NULL, 0);
} else {
- ASSERT(icmp->icmp_family == AF_INET6);
+ ASSERT(connp->conn_family == AF_INET6);
mp1 = mi_tpi_conn_con(NULL, (char *)sa,
sizeof (sin6_t), NULL, 0);
}
@@ -688,15 +669,20 @@ icmp_tpi_connect(queue_t *q, mblk_t *mp)
static int
rawip_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len,
- cred_t *cr)
+ cred_t *cr, pid_t pid)
{
- icmp_t *icmp;
- sin_t *sin;
- sin6_t *sin6;
- mblk_t *ire_mp;
- int error;
+ icmp_t *icmp;
+ sin_t *sin;
+ sin6_t *sin6;
+ int error;
+ uint16_t dstport;
ipaddr_t v4dst;
in6_addr_t v6dst;
+ uint32_t flowinfo;
+ ip_xmit_attr_t *ixa;
+ uint_t scopeid = 0;
+ uint_t srcid = 0;
+ in6_addr_t v6src = connp->conn_saddr_v6;
icmp = connp->conn_icmp;
@@ -704,170 +690,199 @@ rawip_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len,
return (EINVAL);
}
- ire_mp = allocb(sizeof (ire_t), BPRI_HI);
- if (ire_mp == NULL)
- return (ENOMEM);
- DB_TYPE(ire_mp) = IRE_DB_REQ_TYPE;
-
-
ASSERT(sa != NULL && len != 0);
- rw_enter(&icmp->icmp_rwlock, RW_WRITER);
- if (icmp->icmp_state == TS_UNBND || icmp->icmp_pending_op != -1) {
- rw_exit(&icmp->icmp_rwlock);
- freeb(ire_mp);
- return (-TOUTSTATE);
- }
-
+ /*
+ * Determine packet type based on type of address passed in
+ * the request should contain an IPv4 or IPv6 address.
+ * Make sure that address family matches the type of
+ * family of the address passed down.
+ */
switch (len) {
case sizeof (sin_t):
sin = (sin_t *)sa;
- ASSERT(icmp->icmp_family == AF_INET);
- ASSERT(icmp->icmp_ipversion == IPV4_VERSION);
-
v4dst = sin->sin_addr.s_addr;
- /*
- * Interpret a zero destination to mean loopback.
- * Update the T_CONN_REQ (sin/sin6) since it is used to
- * generate the T_CONN_CON.
- */
- if (v4dst == INADDR_ANY) {
- v4dst = htonl(INADDR_LOOPBACK);
- }
-
+ dstport = sin->sin_port;
IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst);
- ASSERT(icmp->icmp_ipversion == IPV4_VERSION);
- icmp->icmp_max_hdr_len = IP_SIMPLE_HDR_LENGTH +
- icmp->icmp_ip_snd_options_len;
- icmp->icmp_v6dst.sin6_addr = v6dst;
- icmp->icmp_v6dst.sin6_family = AF_INET6;
- icmp->icmp_v6dst.sin6_flowinfo = 0;
- icmp->icmp_v6dst.sin6_port = 0;
-
- /*
- * If the destination address is multicast and
- * an outgoing multicast interface has been set,
- * use the address of that interface as our
- * source address if no source address has been set.
- */
- if (V4_PART_OF_V6(icmp->icmp_v6src) == INADDR_ANY &&
- CLASSD(v4dst) &&
- icmp->icmp_multicast_if_addr != INADDR_ANY) {
- IN6_IPADDR_TO_V4MAPPED(icmp->icmp_multicast_if_addr,
- &icmp->icmp_v6src);
- }
+ ASSERT(connp->conn_ipversion == IPV4_VERSION);
break;
+
case sizeof (sin6_t):
sin6 = (sin6_t *)sa;
/* No support for mapped addresses on raw sockets */
if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
- rw_exit(&icmp->icmp_rwlock);
- freeb(ire_mp);
return (EADDRNOTAVAIL);
}
+ v6dst = sin6->sin6_addr;
+ dstport = sin6->sin6_port;
+ ASSERT(connp->conn_ipversion == IPV6_VERSION);
+ flowinfo = sin6->sin6_flowinfo;
+ if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr))
+ scopeid = sin6->sin6_scope_id;
+ srcid = sin6->__sin6_src_id;
+ if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
+ ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
+ connp->conn_netstack);
+ }
+ break;
+ }
+
+ /*
+ * If there is a different thread using conn_ixa then we get a new
+ * copy and cut the old one loose from conn_ixa. Otherwise we use
+ * conn_ixa and prevent any other thread from using/changing it.
+ * Once connect() is done other threads can use conn_ixa since the
+ * refcnt will be back at one.
+ */
+ ixa = conn_get_ixa(connp, B_TRUE);
+ if (ixa == NULL)
+ return (ENOMEM);
- ASSERT(icmp->icmp_ipversion == IPV6_VERSION);
- ASSERT(icmp->icmp_family == AF_INET6);
+ ASSERT(ixa->ixa_refcnt >= 2);
+ ASSERT(ixa == connp->conn_ixa);
- icmp->icmp_max_hdr_len = icmp->icmp_sticky_hdrs_len;
+ mutex_enter(&connp->conn_lock);
+ /*
+ * This icmp_t must have bound already before doing a connect.
+ * Reject if a connect is in progress (we drop conn_lock during
+ * rawip_do_connect).
+ */
+ if (icmp->icmp_state == TS_UNBND || icmp->icmp_state == TS_WCON_CREQ) {
+ mutex_exit(&connp->conn_lock);
+ ixa_refrele(ixa);
+ return (-TOUTSTATE);
+ }
- icmp->icmp_v6dst = *sin6;
- icmp->icmp_v6dst.sin6_port = 0;
+ if (icmp->icmp_state == TS_DATA_XFER) {
+ /* Already connected - clear out state */
+ if (connp->conn_mcbc_bind)
+ connp->conn_saddr_v6 = ipv6_all_zeros;
+ else
+ connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
+ connp->conn_laddr_v6 = connp->conn_bound_addr_v6;
+ connp->conn_faddr_v6 = ipv6_all_zeros;
+ icmp->icmp_state = TS_IDLE;
+ }
+ /*
+ * Use sin_port/sin6_port since applications like psh use SOCK_RAW
+ * with IPPROTO_TCP.
+ */
+ connp->conn_fport = dstport;
+ if (connp->conn_ipversion == IPV4_VERSION) {
/*
* Interpret a zero destination to mean loopback.
* Update the T_CONN_REQ (sin/sin6) since it is used to
* generate the T_CONN_CON.
*/
- if (IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6dst.sin6_addr)) {
- icmp->icmp_v6dst.sin6_addr = ipv6_loopback;
+ if (v4dst == INADDR_ANY) {
+ v4dst = htonl(INADDR_LOOPBACK);
+ IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst);
+ ASSERT(connp->conn_family == AF_INET);
+ sin->sin_addr.s_addr = v4dst;
}
+ connp->conn_faddr_v6 = v6dst;
+ connp->conn_flowinfo = 0;
+ } else {
+ ASSERT(connp->conn_ipversion == IPV6_VERSION);
/*
- * If the destination address is multicast and
- * an outgoing multicast interface has been set,
- * then the ip bind logic will pick the correct source
- * address (i.e. matching the outgoing multicast interface).
+ * Interpret a zero destination to mean loopback.
+ * Update the T_CONN_REQ (sin/sin6) since it is used to
+ * generate the T_CONN_CON.
*/
- break;
+ if (IN6_IS_ADDR_UNSPECIFIED(&v6dst)) {
+ v6dst = ipv6_loopback;
+ sin6->sin6_addr = v6dst;
+ }
+ connp->conn_faddr_v6 = v6dst;
+ connp->conn_flowinfo = flowinfo;
}
- icmp->icmp_pending_op = T_CONN_REQ;
-
- if (icmp->icmp_state == TS_DATA_XFER) {
- /* Already connected - clear out state */
- icmp->icmp_v6src = icmp->icmp_bound_v6src;
- icmp->icmp_state = TS_IDLE;
+ ixa->ixa_cred = cr;
+ ixa->ixa_cpid = pid;
+ if (is_system_labeled()) {
+ /* We need to restart with a label based on the cred */
+ ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
}
- icmp->icmp_state = TS_DATA_XFER;
- rw_exit(&icmp->icmp_rwlock);
-
- if (icmp->icmp_family == AF_INET6) {
- error = ip_proto_bind_connected_v6(connp, &ire_mp,
- icmp->icmp_proto, &icmp->icmp_v6src, 0,
- &icmp->icmp_v6dst.sin6_addr,
- NULL, sin6->sin6_port, B_TRUE, B_TRUE, cr);
+ if (scopeid != 0) {
+ ixa->ixa_flags |= IXAF_SCOPEID_SET;
+ ixa->ixa_scopeid = scopeid;
+ connp->conn_incoming_ifindex = scopeid;
} else {
- error = ip_proto_bind_connected_v4(connp, &ire_mp,
- icmp->icmp_proto, &V4_PART_OF_V6(icmp->icmp_v6src), 0,
- V4_PART_OF_V6(icmp->icmp_v6dst.sin6_addr), sin->sin_port,
- B_TRUE, B_TRUE, cr);
+ ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
+ connp->conn_incoming_ifindex = connp->conn_bound_if;
}
- rawip_post_ip_bind_connect(icmp, ire_mp, error);
- return (error);
-}
-static void
-icmp_close_free(conn_t *connp)
-{
- icmp_t *icmp = connp->conn_icmp;
-
- /* If there are any options associated with the stream, free them. */
- if (icmp->icmp_ip_snd_options != NULL) {
- mi_free((char *)icmp->icmp_ip_snd_options);
- icmp->icmp_ip_snd_options = NULL;
- icmp->icmp_ip_snd_options_len = 0;
- }
+ /*
+ * conn_connect will drop conn_lock and reacquire it.
+ * To prevent a send* from messing with this icmp_t while the lock
+ * is dropped we set icmp_state and clear conn_v6lastdst.
+ * That will make all send* fail with EISCONN.
+ */
+ connp->conn_v6lastdst = ipv6_all_zeros;
+ icmp->icmp_state = TS_WCON_CREQ;
- if (icmp->icmp_filter != NULL) {
- kmem_free(icmp->icmp_filter, sizeof (icmp6_filter_t));
- icmp->icmp_filter = NULL;
- }
+ error = conn_connect(connp, NULL, IPDF_ALLOW_MCBC);
+ mutex_exit(&connp->conn_lock);
+ if (error != 0)
+ goto connect_failed;
- /* Free memory associated with sticky options */
- if (icmp->icmp_sticky_hdrs_len != 0) {
- kmem_free(icmp->icmp_sticky_hdrs,
- icmp->icmp_sticky_hdrs_len);
- icmp->icmp_sticky_hdrs = NULL;
- icmp->icmp_sticky_hdrs_len = 0;
- }
+ /*
+ * The addresses have been verified. Time to insert in
+ * the correct fanout list.
+ */
+ error = ipcl_conn_insert(connp);
+ if (error != 0)
+ goto connect_failed;
- if (icmp->icmp_last_cred != NULL) {
- crfree(icmp->icmp_last_cred);
- icmp->icmp_last_cred = NULL;
+ mutex_enter(&connp->conn_lock);
+ error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
+ &connp->conn_faddr_v6, connp->conn_flowinfo);
+ if (error != 0) {
+ mutex_exit(&connp->conn_lock);
+ goto connect_failed;
}
- if (icmp->icmp_effective_cred != NULL) {
- crfree(icmp->icmp_effective_cred);
- icmp->icmp_effective_cred = NULL;
- }
+ icmp->icmp_state = TS_DATA_XFER;
+ /* Record this as the "last" send even though we haven't sent any */
+ connp->conn_v6lastdst = connp->conn_faddr_v6;
+ connp->conn_lastipversion = connp->conn_ipversion;
+ connp->conn_lastdstport = connp->conn_fport;
+ connp->conn_lastflowinfo = connp->conn_flowinfo;
+ connp->conn_lastscopeid = scopeid;
+ connp->conn_lastsrcid = srcid;
+ /* Also remember a source to use together with lastdst */
+ connp->conn_v6lastsrc = v6src;
+ mutex_exit(&connp->conn_lock);
- ip6_pkt_free(&icmp->icmp_sticky_ipp);
+ ixa_refrele(ixa);
+ return (0);
- /*
- * Clear any fields which the kmem_cache constructor clears.
- * Only icmp_connp needs to be preserved.
- * TBD: We should make this more efficient to avoid clearing
- * everything.
- */
- ASSERT(icmp->icmp_connp == connp);
- bzero(icmp, sizeof (icmp_t));
- icmp->icmp_connp = connp;
+connect_failed:
+ if (ixa != NULL)
+ ixa_refrele(ixa);
+ mutex_enter(&connp->conn_lock);
+ icmp->icmp_state = TS_IDLE;
+ /* In case the source address was set above */
+ if (connp->conn_mcbc_bind)
+ connp->conn_saddr_v6 = ipv6_all_zeros;
+ else
+ connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
+ connp->conn_laddr_v6 = connp->conn_bound_addr_v6;
+ connp->conn_faddr_v6 = ipv6_all_zeros;
+ connp->conn_v6lastdst = ipv6_all_zeros;
+ connp->conn_flowinfo = 0;
+
+ (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
+ &connp->conn_faddr_v6, connp->conn_flowinfo);
+ mutex_exit(&connp->conn_lock);
+ return (error);
}
-static int
+static void
rawip_do_close(conn_t *connp)
{
ASSERT(connp != NULL && IPCL_IS_RAWIP(connp));
@@ -878,8 +893,6 @@ rawip_do_close(conn_t *connp)
qprocsoff(connp->conn_rq);
}
- ASSERT(connp->conn_icmp->icmp_fallback_queue_head == NULL &&
- connp->conn_icmp->icmp_fallback_queue_tail == NULL);
icmp_close_free(connp);
/*
@@ -902,8 +915,6 @@ rawip_do_close(conn_t *connp)
connp->conn_ref--;
ipcl_conn_destroy(connp);
-
- return (0);
}
static int
@@ -928,60 +939,63 @@ done:
return (0);
}
+static void
+icmp_close_free(conn_t *connp)
+{
+ icmp_t *icmp = connp->conn_icmp;
+
+ if (icmp->icmp_filter != NULL) {
+ kmem_free(icmp->icmp_filter, sizeof (icmp6_filter_t));
+ icmp->icmp_filter = NULL;
+ }
+
+ /*
+ * Clear any fields which the kmem_cache constructor clears.
+ * Only icmp_connp needs to be preserved.
+ * TBD: We should make this more efficient to avoid clearing
+ * everything.
+ */
+ ASSERT(icmp->icmp_connp == connp);
+ bzero(icmp, sizeof (icmp_t));
+ icmp->icmp_connp = connp;
+}
+
/*
* This routine handles each T_DISCON_REQ message passed to icmp
* as an indicating that ICMP is no longer connected. This results
- * in sending a T_BIND_REQ to IP to restore the binding to just
- * the local address.
- *
- * The disconnect completes in rawip_post_ip_bind_connect.
+ * in telling IP to restore the binding to just the local address.
*/
static int
icmp_do_disconnect(conn_t *connp)
{
- icmp_t *icmp;
- mblk_t *ire_mp;
- int error;
+ icmp_t *icmp = connp->conn_icmp;
+ int error;
- icmp = connp->conn_icmp;
- rw_enter(&icmp->icmp_rwlock, RW_WRITER);
- if (icmp->icmp_state != TS_DATA_XFER || icmp->icmp_pending_op != -1) {
- rw_exit(&icmp->icmp_rwlock);
+ mutex_enter(&connp->conn_lock);
+ if (icmp->icmp_state != TS_DATA_XFER) {
+ mutex_exit(&connp->conn_lock);
return (-TOUTSTATE);
}
- icmp->icmp_pending_op = T_DISCON_REQ;
- icmp->icmp_v6src = icmp->icmp_bound_v6src;
+ if (connp->conn_mcbc_bind)
+ connp->conn_saddr_v6 = ipv6_all_zeros;
+ else
+ connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
+ connp->conn_laddr_v6 = connp->conn_bound_addr_v6;
+ connp->conn_faddr_v6 = ipv6_all_zeros;
icmp->icmp_state = TS_IDLE;
+ connp->conn_v6lastdst = ipv6_all_zeros;
+ error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
+ &connp->conn_faddr_v6, connp->conn_flowinfo);
+ mutex_exit(&connp->conn_lock);
+ if (error != 0)
+ return (error);
- if (icmp->icmp_family == AF_INET6) {
- /* Rebuild the header template */
- error = icmp_build_hdrs(icmp);
- if (error != 0) {
- icmp->icmp_pending_op = -1;
- rw_exit(&icmp->icmp_rwlock);
- return (error);
- }
- }
-
- rw_exit(&icmp->icmp_rwlock);
- ire_mp = allocb(sizeof (ire_t), BPRI_HI);
- if (ire_mp == NULL) {
- return (ENOMEM);
- }
-
- if (icmp->icmp_family == AF_INET6) {
- error = ip_proto_bind_laddr_v6(connp, &ire_mp, icmp->icmp_proto,
- &icmp->icmp_bound_v6src, 0, B_TRUE);
- } else {
-
- error = ip_proto_bind_laddr_v4(connp, &ire_mp, icmp->icmp_proto,
- V4_PART_OF_V6(icmp->icmp_bound_v6src), 0, B_TRUE);
- }
-
- rawip_post_ip_bind_connect(icmp, ire_mp, error);
-
- return (error);
+ /*
+ * Tell IP to remove the full binding and revert
+ * to the local address binding.
+ */
+ return (ip_laddr_fanout_insert(connp));
}
static void
@@ -1014,16 +1028,14 @@ icmp_tpi_disconnect(queue_t *q, mblk_t *mp)
ASSERT(mp != NULL);
qreply(q, mp);
}
-
}
static int
icmp_disconnect(conn_t *connp)
{
int error;
- icmp_t *icmp = connp->conn_icmp;
- icmp->icmp_dgram_errind = B_FALSE;
+ connp->conn_dgram_errind = B_FALSE;
error = icmp_do_disconnect(connp);
@@ -1058,22 +1070,22 @@ icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive,
}
/*
- * icmp_icmp_error is called by icmp_input to process ICMP
- * messages passed up by IP.
- * Generates the appropriate permanent (non-transient) errors.
- * Assumes that IP has pulled up everything up to and including
- * the ICMP header.
+ * icmp_icmp_input is called as conn_recvicmp to process ICMP messages.
+ * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors.
+ * Assumes that IP has pulled up everything up to and including the ICMP header.
*/
+/* ARGSUSED2 */
static void
-icmp_icmp_error(conn_t *connp, mblk_t *mp)
+icmp_icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
{
- icmph_t *icmph;
- ipha_t *ipha;
- int iph_hdr_length;
- sin_t sin;
- mblk_t *mp1;
- int error = 0;
- icmp_t *icmp = connp->conn_icmp;
+ conn_t *connp = (conn_t *)arg1;
+ icmp_t *icmp = connp->conn_icmp;
+ icmph_t *icmph;
+ ipha_t *ipha;
+ int iph_hdr_length;
+ sin_t sin;
+ mblk_t *mp1;
+ int error = 0;
ipha = (ipha_t *)mp->b_rptr;
@@ -1081,34 +1093,57 @@ icmp_icmp_error(conn_t *connp, mblk_t *mp)
if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) {
ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION);
- icmp_icmp_error_ipv6(connp, mp);
+ icmp_icmp_error_ipv6(connp, mp, ira);
return;
}
-
- /*
- * icmp does not support v4 mapped addresses
- * so we can never be here for a V6 socket
- * i.e. icmp_family == AF_INET6
- */
- ASSERT((IPH_HDR_VERSION(ipha) == IPV4_VERSION) &&
- (icmp->icmp_family == AF_INET));
-
- ASSERT(icmp->icmp_family == AF_INET);
+ ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);
/* Skip past the outer IP and ICMP headers */
- iph_hdr_length = IPH_HDR_LENGTH(ipha);
- icmph = (icmph_t *)(&mp->b_rptr[iph_hdr_length]);
- ipha = (ipha_t *)&icmph[1];
+ ASSERT(IPH_HDR_LENGTH(ipha) == ira->ira_ip_hdr_length);
+ iph_hdr_length = ira->ira_ip_hdr_length;
+ icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
+ ipha = (ipha_t *)&icmph[1]; /* Inner IP header */
+
iph_hdr_length = IPH_HDR_LENGTH(ipha);
switch (icmph->icmph_type) {
case ICMP_DEST_UNREACHABLE:
switch (icmph->icmph_code) {
- case ICMP_FRAGMENTATION_NEEDED:
+ case ICMP_FRAGMENTATION_NEEDED: {
+ ipha_t *ipha;
+ ip_xmit_attr_t *ixa;
/*
* IP has already adjusted the path MTU.
+ * But we need to adjust DF for IPv4.
*/
+ if (connp->conn_ipversion != IPV4_VERSION)
+ break;
+
+ ixa = conn_get_ixa(connp, B_FALSE);
+ if (ixa == NULL || ixa->ixa_ire == NULL) {
+ /*
+ * Some other thread holds conn_ixa. We will
+ * redo this on the next ICMP too big.
+ */
+ if (ixa != NULL)
+ ixa_refrele(ixa);
+ break;
+ }
+ (void) ip_get_pmtu(ixa);
+
+ mutex_enter(&connp->conn_lock);
+ ipha = (ipha_t *)connp->conn_ht_iphc;
+ if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) {
+ ipha->ipha_fragment_offset_and_flags |=
+ IPH_DF_HTONS;
+ } else {
+ ipha->ipha_fragment_offset_and_flags &=
+ ~IPH_DF_HTONS;
+ }
+ mutex_exit(&connp->conn_lock);
+ ixa_refrele(ixa);
break;
+ }
case ICMP_PORT_UNREACHABLE:
case ICMP_PROTOCOL_UNREACHABLE:
error = ECONNREFUSED;
@@ -1131,7 +1166,7 @@ icmp_icmp_error(conn_t *connp, mblk_t *mp)
* Deliver T_UDERROR_IND when the application has asked for it.
* The socket layer enables this automatically when connected.
*/
- if (!icmp->icmp_dgram_errind) {
+ if (!connp->conn_dgram_errind) {
freemsg(mp);
return;
}
@@ -1141,11 +1176,10 @@ icmp_icmp_error(conn_t *connp, mblk_t *mp)
sin.sin_addr.s_addr = ipha->ipha_dst;
if (IPCL_IS_NONSTR(connp)) {
- rw_enter(&icmp->icmp_rwlock, RW_WRITER);
+ mutex_enter(&connp->conn_lock);
if (icmp->icmp_state == TS_DATA_XFER) {
- if (sin.sin_addr.s_addr ==
- V4_PART_OF_V6(icmp->icmp_v6dst.sin6_addr)) {
- rw_exit(&icmp->icmp_rwlock);
+ if (sin.sin_addr.s_addr == connp->conn_faddr_v4) {
+ mutex_exit(&connp->conn_lock);
(*connp->conn_upcalls->su_set_error)
(connp->conn_upper_handle, error);
goto done;
@@ -1154,27 +1188,25 @@ icmp_icmp_error(conn_t *connp, mblk_t *mp)
icmp->icmp_delayed_error = error;
*((sin_t *)&icmp->icmp_delayed_addr) = sin;
}
- rw_exit(&icmp->icmp_rwlock);
+ mutex_exit(&connp->conn_lock);
} else {
- mp1 = mi_tpi_uderror_ind((char *)&sin, sizeof (sin_t), NULL,
- 0, error);
+ mp1 = mi_tpi_uderror_ind((char *)&sin, sizeof (sin_t), NULL, 0,
+ error);
if (mp1 != NULL)
putnext(connp->conn_rq, mp1);
}
done:
- ASSERT(!RW_ISWRITER(&icmp->icmp_rwlock));
freemsg(mp);
}
/*
- * icmp_icmp_error_ipv6 is called by icmp_icmp_error to process ICMPv6
- * for IPv6 packets.
- * Send permanent (non-transient) errors upstream.
- * Assumes that IP has pulled up all the extension headers as well
- * as the ICMPv6 header.
+ * icmp_icmp_error_ipv6 is called by icmp_icmp_input to process ICMP for IPv6.
+ * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors.
+ * Assumes that IP has pulled up all the extension headers as well as the
+ * ICMPv6 header.
*/
static void
-icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp)
+icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp, ip_recv_attr_t *ira)
{
icmp6_t *icmp6;
ip6_t *ip6h, *outer_ip6h;
@@ -1186,13 +1218,18 @@ icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp)
icmp_t *icmp = connp->conn_icmp;
outer_ip6h = (ip6_t *)mp->b_rptr;
+#ifdef DEBUG
if (outer_ip6h->ip6_nxt != IPPROTO_ICMPV6)
iph_hdr_length = ip_hdr_length_v6(mp, outer_ip6h);
else
iph_hdr_length = IPV6_HDR_LEN;
-
+ ASSERT(iph_hdr_length == ira->ira_ip_hdr_length);
+#endif
+ /* Skip past the outer IP and ICMP headers */
+ iph_hdr_length = ira->ira_ip_hdr_length;
icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length];
- ip6h = (ip6_t *)&icmp6[1];
+
+ ip6h = (ip6_t *)&icmp6[1]; /* Inner IP header */
if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp)) {
freemsg(mp);
return;
@@ -1229,7 +1266,7 @@ icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp)
* information, send up an empty message containing an
* IPV6_PATHMTU ancillary data item.
*/
- if (!icmp->icmp_ipv6_recvpathmtu)
+ if (!connp->conn_ipv6_recvpathmtu)
break;
udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t) +
@@ -1255,7 +1292,7 @@ icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp)
sin6 = (sin6_t *)&tudi[1];
bzero(sin6, sizeof (sin6_t));
sin6->sin6_family = AF_INET6;
- sin6->sin6_addr = icmp->icmp_v6dst.sin6_addr;
+ sin6->sin6_addr = connp->conn_faddr_v6;
toh = (struct T_opthdr *)&sin6[1];
toh->level = IPPROTO_IPV6;
@@ -1273,8 +1310,7 @@ icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp)
* message. Free it, then send our empty message.
*/
freemsg(mp);
- icmp_ulp_recv(connp, newmp);
-
+ icmp_ulp_recv(connp, newmp, msgdsize(newmp));
return;
}
case ICMP6_TIME_EXCEEDED:
@@ -1299,7 +1335,7 @@ icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp)
* Deliver T_UDERROR_IND when the application has asked for it.
* The socket layer enables this automatically when connected.
*/
- if (!icmp->icmp_dgram_errind) {
+ if (!connp->conn_dgram_errind) {
freemsg(mp);
return;
}
@@ -1308,13 +1344,12 @@ icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp)
sin6.sin6_family = AF_INET6;
sin6.sin6_addr = ip6h->ip6_dst;
sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
-
if (IPCL_IS_NONSTR(connp)) {
- rw_enter(&icmp->icmp_rwlock, RW_WRITER);
+ mutex_enter(&connp->conn_lock);
if (icmp->icmp_state == TS_DATA_XFER) {
if (IN6_ARE_ADDR_EQUAL(&sin6.sin6_addr,
- &icmp->icmp_v6dst.sin6_addr)) {
- rw_exit(&icmp->icmp_rwlock);
+ &connp->conn_faddr_v6)) {
+ mutex_exit(&connp->conn_lock);
(*connp->conn_upcalls->su_set_error)
(connp->conn_upper_handle, error);
goto done;
@@ -1323,7 +1358,7 @@ icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp)
icmp->icmp_delayed_error = error;
*((sin6_t *)&icmp->icmp_delayed_addr) = sin6;
}
- rw_exit(&icmp->icmp_rwlock);
+ mutex_exit(&connp->conn_lock);
} else {
mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t),
NULL, 0, error);
@@ -1331,7 +1366,6 @@ icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp)
putnext(connp->conn_rq, mp1);
}
done:
- ASSERT(!RW_ISWRITER(&icmp->icmp_rwlock));
freemsg(mp);
}
@@ -1345,9 +1379,12 @@ done:
static void
icmp_addr_req(queue_t *q, mblk_t *mp)
{
- icmp_t *icmp = Q_TO_ICMP(q);
+ struct sockaddr *sa;
mblk_t *ackmp;
struct T_addr_ack *taa;
+ icmp_t *icmp = Q_TO_ICMP(q);
+ conn_t *connp = icmp->icmp_connp;
+ uint_t addrlen;
/* Make it large enough for worst case */
ackmp = reallocb(mp, sizeof (struct T_addr_ack) +
@@ -1363,65 +1400,39 @@ icmp_addr_req(queue_t *q, mblk_t *mp)
taa->PRIM_type = T_ADDR_ACK;
ackmp->b_datap->db_type = M_PCPROTO;
- rw_enter(&icmp->icmp_rwlock, RW_READER);
+
+ if (connp->conn_family == AF_INET)
+ addrlen = sizeof (sin_t);
+ else
+ addrlen = sizeof (sin6_t);
+
+ mutex_enter(&connp->conn_lock);
/*
* Note: Following code assumes 32 bit alignment of basic
* data structures like sin_t and struct T_addr_ack.
*/
if (icmp->icmp_state != TS_UNBND) {
/*
- * Fill in local address
+ * Fill in local address first
*/
taa->LOCADDR_offset = sizeof (*taa);
- if (icmp->icmp_family == AF_INET) {
- sin_t *sin;
-
- taa->LOCADDR_length = sizeof (sin_t);
- sin = (sin_t *)&taa[1];
- /* Fill zeroes and then intialize non-zero fields */
- *sin = sin_null;
- sin->sin_family = AF_INET;
- if (!IN6_IS_ADDR_V4MAPPED_ANY(&icmp->icmp_v6src) &&
- !IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6src)) {
- IN6_V4MAPPED_TO_IPADDR(&icmp->icmp_v6src,
- sin->sin_addr.s_addr);
- } else {
- /*
- * INADDR_ANY
- * icmp_v6src is not set, we might be bound to
- * broadcast/multicast. Use icmp_bound_v6src as
- * local address instead (that could
- * also still be INADDR_ANY)
- */
- IN6_V4MAPPED_TO_IPADDR(&icmp->icmp_bound_v6src,
- sin->sin_addr.s_addr);
- }
- ackmp->b_wptr = (uchar_t *)&sin[1];
- } else {
- sin6_t *sin6;
-
- ASSERT(icmp->icmp_family == AF_INET6);
- taa->LOCADDR_length = sizeof (sin6_t);
- sin6 = (sin6_t *)&taa[1];
- /* Fill zeroes and then intialize non-zero fields */
- *sin6 = sin6_null;
- sin6->sin6_family = AF_INET6;
- if (!IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6src)) {
- sin6->sin6_addr = icmp->icmp_v6src;
- } else {
- /*
- * UNSPECIFIED
- * icmp_v6src is not set, we might be bound to
- * broadcast/multicast. Use icmp_bound_v6src as
- * local address instead (that could
- * also still be UNSPECIFIED)
- */
- sin6->sin6_addr = icmp->icmp_bound_v6src;
- }
- ackmp->b_wptr = (uchar_t *)&sin6[1];
- }
+ taa->LOCADDR_length = addrlen;
+ sa = (struct sockaddr *)&taa[1];
+ (void) conn_getsockname(connp, sa, &addrlen);
+ ackmp->b_wptr += addrlen;
+ }
+ if (icmp->icmp_state == TS_DATA_XFER) {
+ /*
+ * connected, fill remote address too
+ */
+ taa->REMADDR_length = addrlen;
+ /* assumed 32-bit alignment */
+ taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length;
+ sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset);
+ (void) conn_getpeername(connp, sa, &addrlen);
+ ackmp->b_wptr += addrlen;
}
- rw_exit(&icmp->icmp_rwlock);
+ mutex_exit(&connp->conn_lock);
ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim);
qreply(q, ackmp);
}
@@ -1429,9 +1440,11 @@ icmp_addr_req(queue_t *q, mblk_t *mp)
static void
icmp_copy_info(struct T_info_ack *tap, icmp_t *icmp)
{
+ conn_t *connp = icmp->icmp_connp;
+
*tap = icmp_g_t_info_ack;
- if (icmp->icmp_family == AF_INET6)
+ if (connp->conn_family == AF_INET6)
tap->ADDR_size = sizeof (sin6_t);
else
tap->ADDR_size = sizeof (sin_t);
@@ -1488,6 +1501,7 @@ icmp_info_req(queue_t *q, mblk_t *mp)
{
icmp_t *icmp = Q_TO_ICMP(q);
+ /* Create a T_INFO_ACK message. */
mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO,
T_INFO_ACK);
if (!mp)
@@ -1496,18 +1510,14 @@ icmp_info_req(queue_t *q, mblk_t *mp)
qreply(q, mp);
}
-/* For /dev/icmp aka AF_INET open */
static int
icmp_tpi_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
int family)
{
conn_t *connp;
dev_t conn_dev;
- icmp_stack_t *is;
int error;
- conn_dev = NULL;
-
/* If the stream is already open, return immediately. */
if (q->q_ptr != NULL)
return (0);
@@ -1534,9 +1544,9 @@ icmp_tpi_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
return (0);
}
- connp = icmp_open(family, credp, &error, KM_SLEEP);
+ connp = rawip_do_open(family, credp, &error, KM_SLEEP);
if (connp == NULL) {
- ASSERT(error != NULL);
+ ASSERT(error != 0);
inet_minor_free(ip_minor_arena_sa, connp->conn_dev);
return (error);
}
@@ -1545,8 +1555,6 @@ icmp_tpi_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
connp->conn_dev = conn_dev;
connp->conn_minor_arena = ip_minor_arena_sa;
- is = connp->conn_icmp->icmp_is;
-
/*
* Initialize the icmp_t structure for this stream.
*/
@@ -1555,38 +1563,25 @@ icmp_tpi_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
connp->conn_rq = q;
connp->conn_wq = WR(q);
- if (connp->conn_icmp->icmp_family == AF_INET6) {
- /* Build initial header template for transmit */
- rw_enter(&connp->conn_icmp->icmp_rwlock, RW_WRITER);
- if ((error = icmp_build_hdrs(connp->conn_icmp)) != 0) {
- rw_exit(&connp->conn_icmp->icmp_rwlock);
- inet_minor_free(ip_minor_arena_sa, connp->conn_dev);
- ipcl_conn_destroy(connp);
- return (error);
- }
- rw_exit(&connp->conn_icmp->icmp_rwlock);
- }
-
-
- q->q_hiwat = is->is_recv_hiwat;
- WR(q)->q_hiwat = is->is_xmit_hiwat;
- WR(q)->q_lowat = is->is_xmit_lowat;
+ WR(q)->q_hiwat = connp->conn_sndbuf;
+ WR(q)->q_lowat = connp->conn_sndlowat;
qprocson(q);
/* Set the Stream head write offset. */
- (void) proto_set_tx_wroff(q, connp,
- connp->conn_icmp->icmp_max_hdr_len + is->is_wroff_extra);
- (void) proto_set_rx_hiwat(connp->conn_rq, connp, q->q_hiwat);
+ (void) proto_set_tx_wroff(q, connp, connp->conn_wroff);
+ (void) proto_set_rx_hiwat(connp->conn_rq, connp, connp->conn_rcvbuf);
mutex_enter(&connp->conn_lock);
connp->conn_state_flags &= ~CONN_INCIPIENT;
mutex_exit(&connp->conn_lock);
+ icmp_bind_proto(connp->conn_icmp);
+
return (0);
}
-/* For /dev/icmp4 aka AF_INET open */
+/* For /dev/icmp aka AF_INET open */
static int
icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
{
@@ -1604,15 +1599,15 @@ icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
* This is the open routine for icmp. It allocates a icmp_t structure for
* the stream and, on the first open of the module, creates an ND table.
*/
-/* ARGSUSED */
static conn_t *
-icmp_open(int family, cred_t *credp, int *err, int flags)
+rawip_do_open(int family, cred_t *credp, int *err, int flags)
{
icmp_t *icmp;
conn_t *connp;
zoneid_t zoneid;
netstack_t *ns;
icmp_stack_t *is;
+ int len;
boolean_t isv6 = B_FALSE;
*err = secpolicy_net_icmpaccess(credp);
@@ -1621,6 +1616,7 @@ icmp_open(int family, cred_t *credp, int *err, int flags)
if (family == AF_INET6)
isv6 = B_TRUE;
+
ns = netstack_find_by_cred(credp);
ASSERT(ns != NULL);
is = ns->netstack_icmp;
@@ -1639,7 +1635,6 @@ icmp_open(int family, cred_t *credp, int *err, int flags)
connp = ipcl_conn_create(IPCL_RAWIPCONN, flags, ns);
icmp = connp->conn_icmp;
- icmp->icmp_v6dst = sin6_null;
/*
* ipcl_conn_create did a netstack_hold. Undo the hold that was
@@ -1647,35 +1642,52 @@ icmp_open(int family, cred_t *credp, int *err, int flags)
*/
netstack_rele(ns);
- rw_enter(&icmp->icmp_rwlock, RW_WRITER);
- ASSERT(connp->conn_ulp == IPPROTO_ICMP);
+ /*
+ * Since this conn_t/icmp_t is not yet visible to anybody else we don't
+ * need to lock anything.
+ */
+ ASSERT(connp->conn_proto == IPPROTO_ICMP);
ASSERT(connp->conn_icmp == icmp);
ASSERT(icmp->icmp_connp == connp);
/* Set the initial state of the stream and the privilege status. */
icmp->icmp_state = TS_UNBND;
+ connp->conn_ixa->ixa_flags |= IXAF_VERIFY_SOURCE;
if (isv6) {
- icmp->icmp_ipversion = IPV6_VERSION;
- icmp->icmp_family = AF_INET6;
- connp->conn_ulp = IPPROTO_ICMPV6;
+ connp->conn_family = AF_INET6;
+ connp->conn_ipversion = IPV6_VERSION;
+ connp->conn_ixa->ixa_flags &= ~IXAF_IS_IPV4;
+ connp->conn_proto = IPPROTO_ICMPV6;
/* May be changed by a SO_PROTOTYPE socket option. */
- icmp->icmp_proto = IPPROTO_ICMPV6;
- icmp->icmp_checksum_off = 2; /* Offset for icmp6_cksum */
- icmp->icmp_max_hdr_len = IPV6_HDR_LEN;
- icmp->icmp_ttl = (uint8_t)is->is_ipv6_hoplimit;
- connp->conn_af_isv6 = B_TRUE;
+ connp->conn_proto = IPPROTO_ICMPV6;
+ connp->conn_ixa->ixa_protocol = connp->conn_proto;
+ connp->conn_ixa->ixa_raw_cksum_offset = 2;
+ connp->conn_default_ttl = is->is_ipv6_hoplimit;
+ len = sizeof (ip6_t);
} else {
- icmp->icmp_ipversion = IPV4_VERSION;
- icmp->icmp_family = AF_INET;
+ connp->conn_family = AF_INET;
+ connp->conn_ipversion = IPV4_VERSION;
+ connp->conn_ixa->ixa_flags |= IXAF_IS_IPV4;
/* May be changed by a SO_PROTOTYPE socket option. */
- icmp->icmp_proto = IPPROTO_ICMP;
- icmp->icmp_max_hdr_len = IP_SIMPLE_HDR_LENGTH;
- icmp->icmp_ttl = (uint8_t)is->is_ipv4_ttl;
- connp->conn_af_isv6 = B_FALSE;
- }
- icmp->icmp_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
- icmp->icmp_pending_op = -1;
- connp->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
+ connp->conn_proto = IPPROTO_ICMP;
+ connp->conn_ixa->ixa_protocol = connp->conn_proto;
+ connp->conn_default_ttl = is->is_ipv4_ttl;
+ len = sizeof (ipha_t);
+ }
+ connp->conn_xmit_ipp.ipp_unicast_hops = connp->conn_default_ttl;
+
+ connp->conn_ixa->ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
+
+	/*
+	 * Set IXAF_SET_ULP_CKSUM so that IP computes the transport
+	 * checksum by default.  For a socket of protocol IPPROTO_RAW,
+	 * or when IP_HDRINCL is set, the application supplies a complete
+	 * IP header (with any checksum already filled in), and
+	 * IXAF_SET_ULP_CKSUM is cleared at that point to tell IP not to
+	 * compute the transport checksum.
+	 */
+ connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_ULP_CKSUM;
+ /* conn_allzones can not be set this early, hence no IPCL_ZONEID */
+ connp->conn_ixa->ixa_zoneid = zoneid;
+
connp->conn_zoneid = zoneid;
/*
@@ -1685,17 +1697,35 @@ icmp_open(int family, cred_t *credp, int *err, int flags)
if (getpflags(NET_MAC_AWARE, credp) != 0)
connp->conn_mac_mode = CONN_MAC_AWARE;
- connp->conn_ulp_labeled = is_system_labeled();
+ connp->conn_zone_is_global = (crgetzoneid(credp) == GLOBAL_ZONEID);
icmp->icmp_is = is;
+ connp->conn_rcvbuf = is->is_recv_hiwat;
+ connp->conn_sndbuf = is->is_xmit_hiwat;
+ connp->conn_sndlowat = is->is_xmit_lowat;
+ connp->conn_rcvlowat = icmp_mod_info.mi_lowat;
+
+ connp->conn_wroff = len + is->is_wroff_extra;
+ connp->conn_so_type = SOCK_RAW;
+
connp->conn_recv = icmp_input;
+ connp->conn_recvicmp = icmp_icmp_input;
crhold(credp);
connp->conn_cred = credp;
-
- rw_exit(&icmp->icmp_rwlock);
+ connp->conn_cpid = curproc->p_pid;
+ connp->conn_open_time = lbolt64;
+ /* Cache things in ixa without an extra refhold */
+ connp->conn_ixa->ixa_cred = connp->conn_cred;
+ connp->conn_ixa->ixa_cpid = connp->conn_cpid;
+ if (is_system_labeled())
+ connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred);
connp->conn_flow_cntrld = B_FALSE;
+
+ if (is->is_pmtu_discovery)
+ connp->conn_ixa->ixa_flags |= IXAF_PMTU_DISCOVERY;
+
return (connp);
}
@@ -1713,9 +1743,8 @@ icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name)
* This routine gets default values of certain options whose default
* values are maintained by protcol specific code
*/
-/* ARGSUSED */
int
-icmp_opt_default(queue_t *q, int level, int name, uchar_t *ptr)
+icmp_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
{
icmp_t *icmp = Q_TO_ICMP(q);
icmp_stack_t *is = icmp->icmp_is;
@@ -1759,366 +1788,88 @@ icmp_opt_default(queue_t *q, int level, int name, uchar_t *ptr)
/*
* This routine retrieves the current status of socket options.
- * It returns the size of the option retrieved.
+ * It returns the size of the option retrieved, or -1.
*/
int
icmp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
{
icmp_t *icmp = connp->conn_icmp;
- icmp_stack_t *is = icmp->icmp_is;
int *i1 = (int *)ptr;
- ip6_pkt_t *ipp = &icmp->icmp_sticky_ipp;
- int ret = 0;
+ conn_opt_arg_t coas;
+ int retval;
- ASSERT(RW_READ_HELD(&icmp->icmp_rwlock));
- switch (level) {
- case SOL_SOCKET:
- switch (name) {
- case SO_DEBUG:
- *i1 = icmp->icmp_debug;
- break;
- case SO_TYPE:
- *i1 = SOCK_RAW;
- break;
- case SO_PROTOTYPE:
- *i1 = icmp->icmp_proto;
- break;
- case SO_REUSEADDR:
- *i1 = icmp->icmp_reuseaddr;
- break;
-
- /*
- * The following three items are available here,
- * but are only meaningful to IP.
- */
- case SO_DONTROUTE:
- *i1 = icmp->icmp_dontroute;
- break;
- case SO_USELOOPBACK:
- *i1 = icmp->icmp_useloopback;
- break;
- case SO_BROADCAST:
- *i1 = icmp->icmp_broadcast;
- break;
-
- case SO_SNDBUF:
- ASSERT(icmp->icmp_xmit_hiwat <= INT_MAX);
- *i1 = icmp->icmp_xmit_hiwat;
- break;
- case SO_RCVBUF:
- ASSERT(icmp->icmp_recv_hiwat <= INT_MAX);
- *i1 = icmp->icmp_recv_hiwat;
- break;
- case SO_DGRAM_ERRIND:
- *i1 = icmp->icmp_dgram_errind;
- break;
- case SO_TIMESTAMP:
- *i1 = icmp->icmp_timestamp;
- break;
- case SO_MAC_EXEMPT:
- *i1 = (connp->conn_mac_mode == CONN_MAC_AWARE);
- break;
- case SO_MAC_IMPLICIT:
- *i1 = (connp->conn_mac_mode == CONN_MAC_IMPLICIT);
- break;
- case SO_DOMAIN:
- *i1 = icmp->icmp_family;
- break;
+ coas.coa_connp = connp;
+ coas.coa_ixa = connp->conn_ixa;
+ coas.coa_ipp = &connp->conn_xmit_ipp;
+ coas.coa_ancillary = B_FALSE;
+ coas.coa_changed = 0;
- /*
- * Following four not meaningful for icmp
- * Action is same as "default" to which we fallthrough
- * so we keep them in comments.
- * case SO_LINGER:
- * case SO_KEEPALIVE:
- * case SO_OOBINLINE:
- * case SO_ALLZONES:
- */
- default:
- ret = -1;
- goto done;
- }
- break;
+ /*
+ * We assume that the optcom framework has checked for the set
+ * of levels and names that are supported, hence we don't worry
+ * about rejecting based on that.
+ * First check for ICMP specific handling, then pass to common routine.
+ */
+ switch (level) {
case IPPROTO_IP:
/*
* Only allow IPv4 option processing on IPv4 sockets.
*/
- if (icmp->icmp_family != AF_INET) {
- ret = -1;
- goto done;
- }
+ if (connp->conn_family != AF_INET)
+ return (-1);
switch (name) {
case IP_OPTIONS:
case T_IP_OPTIONS:
/* Options are passed up with each packet */
- ret = 0;
- goto done;
+ return (0);
case IP_HDRINCL:
+ mutex_enter(&connp->conn_lock);
*i1 = (int)icmp->icmp_hdrincl;
- break;
- case IP_TOS:
- case T_IP_TOS:
- *i1 = (int)icmp->icmp_type_of_service;
- break;
- case IP_TTL:
- *i1 = (int)icmp->icmp_ttl;
- break;
- case IP_MULTICAST_IF:
- /* 0 address if not set */
- *(ipaddr_t *)ptr = icmp->icmp_multicast_if_addr;
- ret = sizeof (ipaddr_t);
- goto done;
- case IP_MULTICAST_TTL:
- *(uchar_t *)ptr = icmp->icmp_multicast_ttl;
- ret = sizeof (uchar_t);
- goto done;
- case IP_MULTICAST_LOOP:
- *ptr = connp->conn_multicast_loop;
- ret = sizeof (uint8_t);
- goto done;
- case IP_BOUND_IF:
- /* Zero if not set */
- *i1 = icmp->icmp_bound_if;
- break; /* goto sizeof (int) option return */
- case IP_UNSPEC_SRC:
- *ptr = icmp->icmp_unspec_source;
- break; /* goto sizeof (int) option return */
- case IP_RECVIF:
- *ptr = icmp->icmp_recvif;
- break; /* goto sizeof (int) option return */
- case IP_BROADCAST_TTL:
- *(uchar_t *)ptr = connp->conn_broadcast_ttl;
- return (sizeof (uchar_t));
- case IP_RECVPKTINFO:
- /*
- * This also handles IP_PKTINFO.
- * IP_PKTINFO and IP_RECVPKTINFO have the same value.
- * Differentiation is based on the size of the argument
- * passed in.
- * This option is handled in IP which will return an
- * error for IP_PKTINFO as it's not supported as a
- * sticky option.
- */
- ret = -EINVAL;
- goto done;
- /*
- * Cannot "get" the value of following options
- * at this level. Action is same as "default" to
- * which we fallthrough so we keep them in comments.
- *
- * case IP_ADD_MEMBERSHIP:
- * case IP_DROP_MEMBERSHIP:
- * case IP_BLOCK_SOURCE:
- * case IP_UNBLOCK_SOURCE:
- * case IP_ADD_SOURCE_MEMBERSHIP:
- * case IP_DROP_SOURCE_MEMBERSHIP:
- * case MCAST_JOIN_GROUP:
- * case MCAST_LEAVE_GROUP:
- * case MCAST_BLOCK_SOURCE:
- * case MCAST_UNBLOCK_SOURCE:
- * case MCAST_JOIN_SOURCE_GROUP:
- * case MCAST_LEAVE_SOURCE_GROUP:
- * case MRT_INIT:
- * case MRT_DONE:
- * case MRT_ADD_VIF:
- * case MRT_DEL_VIF:
- * case MRT_ADD_MFC:
- * case MRT_DEL_MFC:
- * case MRT_VERSION:
- * case MRT_ASSERT:
- * case IP_SEC_OPT:
- * case IP_NEXTHOP:
- */
- default:
- ret = -1;
- goto done;
+ mutex_exit(&connp->conn_lock);
+ return (sizeof (int));
}
break;
+
case IPPROTO_IPV6:
/*
* Only allow IPv6 option processing on native IPv6 sockets.
*/
- if (icmp->icmp_family != AF_INET6) {
- ret = -1;
- goto done;
- }
+ if (connp->conn_family != AF_INET6)
+ return (-1);
+
switch (name) {
- case IPV6_UNICAST_HOPS:
- *i1 = (unsigned int)icmp->icmp_ttl;
- break;
- case IPV6_MULTICAST_IF:
- /* 0 index if not set */
- *i1 = icmp->icmp_multicast_if_index;
- break;
- case IPV6_MULTICAST_HOPS:
- *i1 = icmp->icmp_multicast_ttl;
- break;
- case IPV6_MULTICAST_LOOP:
- *i1 = connp->conn_multicast_loop;
- break;
- case IPV6_BOUND_IF:
- /* Zero if not set */
- *i1 = icmp->icmp_bound_if;
- break;
- case IPV6_UNSPEC_SRC:
- *i1 = icmp->icmp_unspec_source;
- break;
case IPV6_CHECKSUM:
/*
* Return offset or -1 if no checksum offset.
* Does not apply to IPPROTO_ICMPV6
*/
- if (icmp->icmp_proto == IPPROTO_ICMPV6) {
- ret = -1;
- goto done;
- }
+ if (connp->conn_proto == IPPROTO_ICMPV6)
+ return (-1);
- if (icmp->icmp_raw_checksum) {
- *i1 = icmp->icmp_checksum_off;
- } else {
- *i1 = -1;
- }
- break;
- case IPV6_JOIN_GROUP:
- case IPV6_LEAVE_GROUP:
- case MCAST_JOIN_GROUP:
- case MCAST_LEAVE_GROUP:
- case MCAST_BLOCK_SOURCE:
- case MCAST_UNBLOCK_SOURCE:
- case MCAST_JOIN_SOURCE_GROUP:
- case MCAST_LEAVE_SOURCE_GROUP:
- /* cannot "get" the value for these */
- ret = -1;
- goto done;
- case IPV6_RECVPKTINFO:
- *i1 = icmp->icmp_ip_recvpktinfo;
- break;
- case IPV6_RECVTCLASS:
- *i1 = icmp->icmp_ipv6_recvtclass;
- break;
- case IPV6_RECVPATHMTU:
- *i1 = icmp->icmp_ipv6_recvpathmtu;
- break;
- case IPV6_V6ONLY:
- *i1 = 1;
- break;
- case IPV6_RECVHOPLIMIT:
- *i1 = icmp->icmp_ipv6_recvhoplimit;
- break;
- case IPV6_RECVHOPOPTS:
- *i1 = icmp->icmp_ipv6_recvhopopts;
- break;
- case IPV6_RECVDSTOPTS:
- *i1 = icmp->icmp_ipv6_recvdstopts;
- break;
- case _OLD_IPV6_RECVDSTOPTS:
- *i1 = icmp->icmp_old_ipv6_recvdstopts;
- break;
- case IPV6_RECVRTHDRDSTOPTS:
- *i1 = icmp->icmp_ipv6_recvrtdstopts;
- break;
- case IPV6_RECVRTHDR:
- *i1 = icmp->icmp_ipv6_recvrthdr;
- break;
- case IPV6_PKTINFO: {
- /* XXX assumes that caller has room for max size! */
- struct in6_pktinfo *pkti;
-
- pkti = (struct in6_pktinfo *)ptr;
- if (ipp->ipp_fields & IPPF_IFINDEX)
- pkti->ipi6_ifindex = ipp->ipp_ifindex;
+ mutex_enter(&connp->conn_lock);
+ if (connp->conn_ixa->ixa_flags & IXAF_SET_RAW_CKSUM)
+ *i1 = connp->conn_ixa->ixa_raw_cksum_offset;
else
- pkti->ipi6_ifindex = 0;
- if (ipp->ipp_fields & IPPF_ADDR)
- pkti->ipi6_addr = ipp->ipp_addr;
- else
- pkti->ipi6_addr = ipv6_all_zeros;
- ret = sizeof (struct in6_pktinfo);
- goto done;
- }
- case IPV6_NEXTHOP: {
- sin6_t *sin6 = (sin6_t *)ptr;
-
- if (!(ipp->ipp_fields & IPPF_NEXTHOP))
- return (0);
- *sin6 = sin6_null;
- sin6->sin6_family = AF_INET6;
- sin6->sin6_addr = ipp->ipp_nexthop;
- ret = (sizeof (sin6_t));
- goto done;
- }
- case IPV6_HOPOPTS:
- if (!(ipp->ipp_fields & IPPF_HOPOPTS))
- return (0);
- if (ipp->ipp_hopoptslen <= icmp->icmp_label_len_v6)
- return (0);
- bcopy((char *)ipp->ipp_hopopts +
- icmp->icmp_label_len_v6, ptr,
- ipp->ipp_hopoptslen - icmp->icmp_label_len_v6);
- if (icmp->icmp_label_len_v6 > 0) {
- ptr[0] = ((char *)ipp->ipp_hopopts)[0];
- ptr[1] = (ipp->ipp_hopoptslen -
- icmp->icmp_label_len_v6 + 7) / 8 - 1;
- }
- ret = (ipp->ipp_hopoptslen - icmp->icmp_label_len_v6);
- goto done;
- case IPV6_RTHDRDSTOPTS:
- if (!(ipp->ipp_fields & IPPF_RTDSTOPTS))
- return (0);
- bcopy(ipp->ipp_rtdstopts, ptr, ipp->ipp_rtdstoptslen);
- ret = ipp->ipp_rtdstoptslen;
- goto done;
- case IPV6_RTHDR:
- if (!(ipp->ipp_fields & IPPF_RTHDR))
- return (0);
- bcopy(ipp->ipp_rthdr, ptr, ipp->ipp_rthdrlen);
- ret = ipp->ipp_rthdrlen;
- goto done;
- case IPV6_DSTOPTS:
- if (!(ipp->ipp_fields & IPPF_DSTOPTS)) {
- ret = 0;
- goto done;
- }
- bcopy(ipp->ipp_dstopts, ptr, ipp->ipp_dstoptslen);
- ret = ipp->ipp_dstoptslen;
- goto done;
- case IPV6_PATHMTU:
- if (!(ipp->ipp_fields & IPPF_PATHMTU)) {
- ret = 0;
- } else {
- ret = ip_fill_mtuinfo(
- &icmp->icmp_v6dst.sin6_addr, 0,
- (struct ip6_mtuinfo *)ptr,
- is->is_netstack);
- }
- goto done;
- case IPV6_TCLASS:
- if (ipp->ipp_fields & IPPF_TCLASS)
- *i1 = ipp->ipp_tclass;
- else
- *i1 = IPV6_FLOW_TCLASS(
- IPV6_DEFAULT_VERS_AND_FLOW);
- break;
- default:
- ret = -1;
- goto done;
+ *i1 = -1;
+ mutex_exit(&connp->conn_lock);
+ return (sizeof (int));
}
break;
+
case IPPROTO_ICMPV6:
/*
* Only allow IPv6 option processing on native IPv6 sockets.
*/
- if (icmp->icmp_family != AF_INET6) {
- ret = -1;
- }
+ if (connp->conn_family != AF_INET6)
+ return (-1);
- if (icmp->icmp_proto != IPPROTO_ICMPV6) {
- ret = -1;
- }
+ if (connp->conn_proto != IPPROTO_ICMPV6)
+ return (-1);
switch (name) {
case ICMP6_FILTER:
+ mutex_enter(&connp->conn_lock);
if (icmp->icmp_filter == NULL) {
/* Make it look like "pass all" */
ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr);
@@ -2126,501 +1877,149 @@ icmp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
(void) bcopy(icmp->icmp_filter, ptr,
sizeof (icmp6_filter_t));
}
- ret = sizeof (icmp6_filter_t);
- goto done;
- default:
- ret = -1;
- goto done;
+ mutex_exit(&connp->conn_lock);
+ return (sizeof (icmp6_filter_t));
}
- default:
- ret = -1;
- goto done;
}
- ret = sizeof (int);
-done:
- return (ret);
+ mutex_enter(&connp->conn_lock);
+ retval = conn_opt_get(&coas, level, name, ptr);
+ mutex_exit(&connp->conn_lock);
+ return (retval);
}
/*
* This routine retrieves the current status of socket options.
- * It returns the size of the option retrieved.
+ * It returns the size of the option retrieved, or -1.
*/
int
icmp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
{
- conn_t *connp = Q_TO_CONN(q);
- icmp_t *icmp = connp->conn_icmp;
- int err;
+ conn_t *connp = Q_TO_CONN(q);
+ int err;
- rw_enter(&icmp->icmp_rwlock, RW_READER);
err = icmp_opt_get(connp, level, name, ptr);
- rw_exit(&icmp->icmp_rwlock);
return (err);
}
+/*
+ * This routine sets socket options.
+ */
int
-icmp_do_opt_set(conn_t *connp, int level, int name, uint_t inlen,
- uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, cred_t *cr,
- void *thisdg_attrs, boolean_t checkonly)
+icmp_do_opt_set(conn_opt_arg_t *coa, int level, int name,
+ uint_t inlen, uchar_t *invalp, cred_t *cr, boolean_t checkonly)
{
+ conn_t *connp = coa->coa_connp;
+ ip_xmit_attr_t *ixa = coa->coa_ixa;
+ icmp_t *icmp = connp->conn_icmp;
+ icmp_stack_t *is = icmp->icmp_is;
+ int *i1 = (int *)invalp;
+ boolean_t onoff = (*i1 == 0) ? 0 : 1;
+ int error;
- int *i1 = (int *)invalp;
- boolean_t onoff = (*i1 == 0) ? 0 : 1;
- icmp_t *icmp = connp->conn_icmp;
- icmp_stack_t *is = icmp->icmp_is;
- int error;
+ ASSERT(MUTEX_NOT_HELD(&coa->coa_connp->conn_lock));
- ASSERT(RW_WRITE_HELD(&icmp->icmp_rwlock));
/*
* For fixed length options, no sanity check
* of passed in length is done. It is assumed *_optcom_req()
* routines do the right thing.
*/
+
switch (level) {
case SOL_SOCKET:
switch (name) {
- case SO_DEBUG:
- if (!checkonly)
- icmp->icmp_debug = onoff;
- break;
case SO_PROTOTYPE:
if ((*i1 & 0xFF) != IPPROTO_ICMP &&
(*i1 & 0xFF) != IPPROTO_ICMPV6 &&
secpolicy_net_rawaccess(cr) != 0) {
- *outlenp = 0;
return (EACCES);
}
- /* Can't use IPPROTO_RAW with IPv6 */
- if ((*i1 & 0xFF) == IPPROTO_RAW &&
- icmp->icmp_family == AF_INET6) {
- *outlenp = 0;
- return (EPROTONOSUPPORT);
- }
- if (checkonly) {
- /* T_CHECK case */
- *(int *)outvalp = (*i1 & 0xFF);
+ if (checkonly)
break;
- }
- icmp->icmp_proto = *i1 & 0xFF;
- if ((icmp->icmp_proto == IPPROTO_RAW ||
- icmp->icmp_proto == IPPROTO_IGMP) &&
- icmp->icmp_family == AF_INET)
+
+ mutex_enter(&connp->conn_lock);
+ connp->conn_proto = *i1 & 0xFF;
+ ixa->ixa_protocol = connp->conn_proto;
+ if ((connp->conn_proto == IPPROTO_RAW ||
+ connp->conn_proto == IPPROTO_IGMP) &&
+ connp->conn_family == AF_INET) {
icmp->icmp_hdrincl = 1;
- else
+ ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
+ } else if (connp->conn_proto == IPPROTO_UDP ||
+ connp->conn_proto == IPPROTO_TCP ||
+ connp->conn_proto == IPPROTO_SCTP) {
+ /* Used by test applications like psh */
icmp->icmp_hdrincl = 0;
-
- if (icmp->icmp_family == AF_INET6 &&
- icmp->icmp_proto == IPPROTO_ICMPV6) {
- /* Set offset for icmp6_cksum */
- icmp->icmp_raw_checksum = 0;
- icmp->icmp_checksum_off = 2;
- }
- if (icmp->icmp_proto == IPPROTO_UDP ||
- icmp->icmp_proto == IPPROTO_TCP ||
- icmp->icmp_proto == IPPROTO_SCTP) {
- icmp->icmp_no_tp_cksum = 1;
- icmp->icmp_sticky_ipp.ipp_fields |=
- IPPF_NO_CKSUM;
+ ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
} else {
- icmp->icmp_no_tp_cksum = 0;
- icmp->icmp_sticky_ipp.ipp_fields &=
- ~IPPF_NO_CKSUM;
+ icmp->icmp_hdrincl = 0;
+ ixa->ixa_flags |= IXAF_SET_ULP_CKSUM;
}
+ if (connp->conn_family == AF_INET6 &&
+ connp->conn_proto == IPPROTO_ICMPV6) {
+ /* Set offset for icmp6_cksum */
+ ixa->ixa_flags &= ~IXAF_SET_RAW_CKSUM;
+ ixa->ixa_raw_cksum_offset = 2;
+ }
if (icmp->icmp_filter != NULL &&
- icmp->icmp_proto != IPPROTO_ICMPV6) {
+ connp->conn_proto != IPPROTO_ICMPV6) {
kmem_free(icmp->icmp_filter,
sizeof (icmp6_filter_t));
icmp->icmp_filter = NULL;
}
+ mutex_exit(&connp->conn_lock);
- /* Rebuild the header template */
- error = icmp_build_hdrs(icmp);
- if (error != 0) {
- *outlenp = 0;
- return (error);
- }
-
+ coa->coa_changed |= COA_HEADER_CHANGED;
/*
* For SCTP, we don't use icmp_bind_proto() for
- * raw socket binding. Note that we do not need
- * to set *outlenp.
- * FIXME: how does SCTP work?
+ * raw socket binding.
*/
- if (icmp->icmp_proto == IPPROTO_SCTP)
+ if (connp->conn_proto == IPPROTO_SCTP)
return (0);
- *outlenp = sizeof (int);
- *(int *)outvalp = *i1 & 0xFF;
-
- /* Drop lock across the bind operation */
- rw_exit(&icmp->icmp_rwlock);
- (void) icmp_bind_proto(connp);
- rw_enter(&icmp->icmp_rwlock, RW_WRITER);
+ coa->coa_changed |= COA_ICMP_BIND_NEEDED;
return (0);
- case SO_REUSEADDR:
- if (!checkonly) {
- icmp->icmp_reuseaddr = onoff;
- PASS_OPT_TO_IP(connp);
- }
- break;
-
- /*
- * The following three items are available here,
- * but are only meaningful to IP.
- */
- case SO_DONTROUTE:
- if (!checkonly) {
- icmp->icmp_dontroute = onoff;
- PASS_OPT_TO_IP(connp);
- }
- break;
- case SO_USELOOPBACK:
- if (!checkonly) {
- icmp->icmp_useloopback = onoff;
- PASS_OPT_TO_IP(connp);
- }
- break;
- case SO_BROADCAST:
- if (!checkonly) {
- icmp->icmp_broadcast = onoff;
- PASS_OPT_TO_IP(connp);
- }
- break;
case SO_SNDBUF:
if (*i1 > is->is_max_buf) {
- *outlenp = 0;
return (ENOBUFS);
}
- if (!checkonly) {
- if (!IPCL_IS_NONSTR(connp)) {
- connp->conn_wq->q_hiwat = *i1;
- }
- icmp->icmp_xmit_hiwat = *i1;
- }
break;
case SO_RCVBUF:
if (*i1 > is->is_max_buf) {
- *outlenp = 0;
return (ENOBUFS);
}
- if (!checkonly) {
- icmp->icmp_recv_hiwat = *i1;
- rw_exit(&icmp->icmp_rwlock);
- (void) proto_set_rx_hiwat(connp->conn_rq, connp,
- *i1);
- rw_enter(&icmp->icmp_rwlock, RW_WRITER);
- }
- break;
- case SO_DGRAM_ERRIND:
- if (!checkonly)
- icmp->icmp_dgram_errind = onoff;
break;
- case SO_ALLZONES:
- /*
- * "soft" error (negative)
- * option not handled at this level
- * Note: Do not modify *outlenp
- */
- return (-EINVAL);
- case SO_TIMESTAMP:
- if (!checkonly) {
- icmp->icmp_timestamp = onoff;
- }
- break;
- case SO_MAC_EXEMPT:
- /*
- * "soft" error (negative)
- * option not handled at this level
- * Note: Do not modify *outlenp
- */
- return (-EINVAL);
- case SO_RCVTIMEO:
- case SO_SNDTIMEO:
- /*
- * Pass these two options in order for third part
- * protocol usage. Here just return directly.
- */
- return (0);
- /*
- * Following three not meaningful for icmp
- * Action is same as "default" so we keep them
- * in comments.
- * case SO_LINGER:
- * case SO_KEEPALIVE:
- * case SO_OOBINLINE:
- */
- default:
- *outlenp = 0;
- return (EINVAL);
}
break;
+
case IPPROTO_IP:
/*
* Only allow IPv4 option processing on IPv4 sockets.
*/
- if (icmp->icmp_family != AF_INET) {
- *outlenp = 0;
- return (ENOPROTOOPT);
- }
- switch (name) {
- case IP_OPTIONS:
- case T_IP_OPTIONS:
- /* Save options for use by IP. */
- if ((inlen & 0x3) ||
- inlen + icmp->icmp_label_len > IP_MAX_OPT_LENGTH) {
- *outlenp = 0;
- return (EINVAL);
- }
- if (checkonly)
- break;
-
- if (!tsol_option_set(&icmp->icmp_ip_snd_options,
- &icmp->icmp_ip_snd_options_len,
- icmp->icmp_label_len, invalp, inlen)) {
- *outlenp = 0;
- return (ENOMEM);
- }
+ if (connp->conn_family != AF_INET)
+ return (EINVAL);
- icmp->icmp_max_hdr_len = IP_SIMPLE_HDR_LENGTH +
- icmp->icmp_ip_snd_options_len;
- rw_exit(&icmp->icmp_rwlock);
- (void) proto_set_tx_wroff(connp->conn_rq == NULL ? NULL:
- RD(connp->conn_rq), connp,
- icmp->icmp_max_hdr_len + is->is_wroff_extra);
- rw_enter(&icmp->icmp_rwlock, RW_WRITER);
- break;
+ switch (name) {
case IP_HDRINCL:
- if (!checkonly)
- icmp->icmp_hdrincl = onoff;
- break;
- case IP_TOS:
- case T_IP_TOS:
- if (!checkonly) {
- icmp->icmp_type_of_service = (uint8_t)*i1;
- }
- break;
- case IP_TTL:
if (!checkonly) {
- icmp->icmp_ttl = (uint8_t)*i1;
- }
- break;
- case IP_MULTICAST_IF:
- /*
- * TODO should check OPTMGMT reply and undo this if
- * there is an error.
- */
- if (!checkonly) {
- icmp->icmp_multicast_if_addr = *i1;
- PASS_OPT_TO_IP(connp);
- }
- break;
- case IP_MULTICAST_TTL:
- if (!checkonly)
- icmp->icmp_multicast_ttl = *invalp;
- break;
- case IP_MULTICAST_LOOP:
- if (!checkonly) {
- connp->conn_multicast_loop =
- (*invalp == 0) ? 0 : 1;
- PASS_OPT_TO_IP(connp);
- }
- break;
- case IP_BOUND_IF:
- if (!checkonly) {
- icmp->icmp_bound_if = *i1;
- PASS_OPT_TO_IP(connp);
- }
- break;
- case IP_UNSPEC_SRC:
- if (!checkonly) {
- icmp->icmp_unspec_source = onoff;
- PASS_OPT_TO_IP(connp);
- }
- break;
- case IP_BROADCAST_TTL:
- if (!checkonly)
- connp->conn_broadcast_ttl = *invalp;
- break;
- case IP_RECVIF:
- if (!checkonly) {
- icmp->icmp_recvif = onoff;
- }
- /*
- * pass to ip
- */
- return (-EINVAL);
- case IP_PKTINFO: {
- /*
- * This also handles IP_RECVPKTINFO.
- * IP_PKTINFO and IP_RECVPKTINFO have the same value.
- * Differentiation is based on the size of the argument
- * passed in.
- */
- struct in_pktinfo *pktinfop;
- ip4_pkt_t *attr_pktinfop;
-
- if (checkonly)
- break;
-
- if (inlen == sizeof (int)) {
- /*
- * This is IP_RECVPKTINFO option.
- * Keep a local copy of wether this option is
- * set or not and pass it down to IP for
- * processing.
- */
- icmp->icmp_ip_recvpktinfo = onoff;
- return (-EINVAL);
- }
-
-
- if (inlen != sizeof (struct in_pktinfo)) {
- return (EINVAL);
- }
-
- if ((attr_pktinfop = (ip4_pkt_t *)thisdg_attrs)
- == NULL) {
- /*
- * sticky option is not supported
- */
- return (EINVAL);
- }
-
- pktinfop = (struct in_pktinfo *)invalp;
-
- /*
- * Atleast one of the values should be specified
- */
- if (pktinfop->ipi_ifindex == 0 &&
- pktinfop->ipi_spec_dst.s_addr == INADDR_ANY) {
- return (EINVAL);
+ mutex_enter(&connp->conn_lock);
+ icmp->icmp_hdrincl = onoff;
+ if (onoff)
+ ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
+ else
+ ixa->ixa_flags |= IXAF_SET_ULP_CKSUM;
+ mutex_exit(&connp->conn_lock);
}
-
- attr_pktinfop->ip4_addr = pktinfop->ipi_spec_dst.s_addr;
- attr_pktinfop->ip4_ill_index = pktinfop->ipi_ifindex;
- }
break;
- case IP_ADD_MEMBERSHIP:
- case IP_DROP_MEMBERSHIP:
- case IP_BLOCK_SOURCE:
- case IP_UNBLOCK_SOURCE:
- case IP_ADD_SOURCE_MEMBERSHIP:
- case IP_DROP_SOURCE_MEMBERSHIP:
- case MCAST_JOIN_GROUP:
- case MCAST_LEAVE_GROUP:
- case MCAST_BLOCK_SOURCE:
- case MCAST_UNBLOCK_SOURCE:
- case MCAST_JOIN_SOURCE_GROUP:
- case MCAST_LEAVE_SOURCE_GROUP:
- case MRT_INIT:
- case MRT_DONE:
- case MRT_ADD_VIF:
- case MRT_DEL_VIF:
- case MRT_ADD_MFC:
- case MRT_DEL_MFC:
- case MRT_VERSION:
- case MRT_ASSERT:
- case IP_SEC_OPT:
- case IP_NEXTHOP:
- /*
- * "soft" error (negative)
- * option not handled at this level
- * Note: Do not modify *outlenp
- */
- return (-EINVAL);
- default:
- *outlenp = 0;
- return (EINVAL);
}
break;
- case IPPROTO_IPV6: {
- ip6_pkt_t *ipp;
- boolean_t sticky;
- if (icmp->icmp_family != AF_INET6) {
- *outlenp = 0;
- return (ENOPROTOOPT);
- }
- /*
- * Deal with both sticky options and ancillary data
- */
- if (thisdg_attrs == NULL) {
- /* sticky options, or none */
- ipp = &icmp->icmp_sticky_ipp;
- sticky = B_TRUE;
- } else {
- /* ancillary data */
- ipp = (ip6_pkt_t *)thisdg_attrs;
- sticky = B_FALSE;
- }
+ case IPPROTO_IPV6:
+ if (connp->conn_family != AF_INET6)
+ return (EINVAL);
switch (name) {
- case IPV6_MULTICAST_IF:
- if (!checkonly) {
- icmp->icmp_multicast_if_index = *i1;
- PASS_OPT_TO_IP(connp);
- }
- break;
- case IPV6_UNICAST_HOPS:
- /* -1 means use default */
- if (*i1 < -1 || *i1 > IPV6_MAX_HOPS) {
- *outlenp = 0;
- return (EINVAL);
- }
- if (!checkonly) {
- if (*i1 == -1) {
- icmp->icmp_ttl = ipp->ipp_unicast_hops =
- is->is_ipv6_hoplimit;
- ipp->ipp_fields &= ~IPPF_UNICAST_HOPS;
- /* Pass modified value to IP. */
- *i1 = ipp->ipp_hoplimit;
- } else {
- icmp->icmp_ttl = ipp->ipp_unicast_hops =
- (uint8_t)*i1;
- ipp->ipp_fields |= IPPF_UNICAST_HOPS;
- }
- /* Rebuild the header template */
- error = icmp_build_hdrs(icmp);
- if (error != 0) {
- *outlenp = 0;
- return (error);
- }
- }
- break;
- case IPV6_MULTICAST_HOPS:
- /* -1 means use default */
- if (*i1 < -1 || *i1 > IPV6_MAX_HOPS) {
- *outlenp = 0;
- return (EINVAL);
- }
- if (!checkonly) {
- if (*i1 == -1) {
- icmp->icmp_multicast_ttl =
- ipp->ipp_multicast_hops =
- IP_DEFAULT_MULTICAST_TTL;
- ipp->ipp_fields &= ~IPPF_MULTICAST_HOPS;
- /* Pass modified value to IP. */
- *i1 = icmp->icmp_multicast_ttl;
- } else {
- icmp->icmp_multicast_ttl =
- ipp->ipp_multicast_hops =
- (uint8_t)*i1;
- ipp->ipp_fields |= IPPF_MULTICAST_HOPS;
- }
- }
- break;
- case IPV6_MULTICAST_LOOP:
- if (*i1 != 0 && *i1 != 1) {
- *outlenp = 0;
- return (EINVAL);
- }
- if (!checkonly) {
- connp->conn_multicast_loop = *i1;
- PASS_OPT_TO_IP(connp);
- }
- break;
case IPV6_CHECKSUM:
/*
* Integer offset into the user data of where the
@@ -2628,517 +2027,93 @@ icmp_do_opt_set(conn_t *connp, int level, int name, uint_t inlen,
* Offset of -1 disables option.
* Does not apply to IPPROTO_ICMPV6.
*/
- if (icmp->icmp_proto == IPPROTO_ICMPV6 || !sticky) {
- *outlenp = 0;
+ if (connp->conn_proto == IPPROTO_ICMPV6 ||
+ coa->coa_ancillary) {
return (EINVAL);
}
if ((*i1 != -1) && ((*i1 < 0) || (*i1 & 0x1) != 0)) {
/* Negative or not 16 bit aligned offset */
- *outlenp = 0;
return (EINVAL);
}
if (checkonly)
break;
+ mutex_enter(&connp->conn_lock);
if (*i1 == -1) {
- icmp->icmp_raw_checksum = 0;
- ipp->ipp_fields &= ~IPPF_RAW_CKSUM;
+ ixa->ixa_flags &= ~IXAF_SET_RAW_CKSUM;
+ ixa->ixa_raw_cksum_offset = 0;
+ ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
} else {
- icmp->icmp_raw_checksum = 1;
- icmp->icmp_checksum_off = *i1;
- ipp->ipp_fields |= IPPF_RAW_CKSUM;
- }
- /* Rebuild the header template */
- error = icmp_build_hdrs(icmp);
- if (error != 0) {
- *outlenp = 0;
- return (error);
- }
- break;
- case IPV6_JOIN_GROUP:
- case IPV6_LEAVE_GROUP:
- case MCAST_JOIN_GROUP:
- case MCAST_LEAVE_GROUP:
- case MCAST_BLOCK_SOURCE:
- case MCAST_UNBLOCK_SOURCE:
- case MCAST_JOIN_SOURCE_GROUP:
- case MCAST_LEAVE_SOURCE_GROUP:
- /*
- * "soft" error (negative)
- * option not handled at this level
- * Note: Do not modify *outlenp
- */
- return (-EINVAL);
- case IPV6_BOUND_IF:
- if (!checkonly) {
- icmp->icmp_bound_if = *i1;
- PASS_OPT_TO_IP(connp);
- }
- break;
- case IPV6_UNSPEC_SRC:
- if (!checkonly) {
- icmp->icmp_unspec_source = onoff;
- PASS_OPT_TO_IP(connp);
- }
- break;
- case IPV6_RECVTCLASS:
- if (!checkonly) {
- icmp->icmp_ipv6_recvtclass = onoff;
- PASS_OPT_TO_IP(connp);
- }
- break;
- /*
- * Set boolean switches for ancillary data delivery
- */
- case IPV6_RECVPKTINFO:
- if (!checkonly) {
- icmp->icmp_ip_recvpktinfo = onoff;
- PASS_OPT_TO_IP(connp);
- }
- break;
- case IPV6_RECVPATHMTU:
- if (!checkonly) {
- icmp->icmp_ipv6_recvpathmtu = onoff;
- PASS_OPT_TO_IP(connp);
- }
- break;
- case IPV6_RECVHOPLIMIT:
- if (!checkonly) {
- icmp->icmp_ipv6_recvhoplimit = onoff;
- PASS_OPT_TO_IP(connp);
- }
- break;
- case IPV6_RECVHOPOPTS:
- if (!checkonly) {
- icmp->icmp_ipv6_recvhopopts = onoff;
- PASS_OPT_TO_IP(connp);
- }
- break;
- case IPV6_RECVDSTOPTS:
- if (!checkonly) {
- icmp->icmp_ipv6_recvdstopts = onoff;
- PASS_OPT_TO_IP(connp);
- }
- break;
- case _OLD_IPV6_RECVDSTOPTS:
- if (!checkonly)
- icmp->icmp_old_ipv6_recvdstopts = onoff;
- break;
- case IPV6_RECVRTHDRDSTOPTS:
- if (!checkonly) {
- icmp->icmp_ipv6_recvrtdstopts = onoff;
- PASS_OPT_TO_IP(connp);
- }
- break;
- case IPV6_RECVRTHDR:
- if (!checkonly) {
- icmp->icmp_ipv6_recvrthdr = onoff;
- PASS_OPT_TO_IP(connp);
- }
- break;
- /*
- * Set sticky options or ancillary data.
- * If sticky options, (re)build any extension headers
- * that might be needed as a result.
- */
- case IPV6_PKTINFO:
- /*
- * The source address and ifindex are verified
- * in ip_opt_set(). For ancillary data the
- * source address is checked in ip_wput_v6.
- */
- if (inlen != 0 && inlen !=
- sizeof (struct in6_pktinfo)) {
- return (EINVAL);
- }
- if (checkonly)
- break;
-
- if (inlen == 0) {
- ipp->ipp_fields &= ~(IPPF_IFINDEX|IPPF_ADDR);
- ipp->ipp_sticky_ignored |=
- (IPPF_IFINDEX|IPPF_ADDR);
- } else {
- struct in6_pktinfo *pkti;
-
- pkti = (struct in6_pktinfo *)invalp;
- ipp->ipp_ifindex = pkti->ipi6_ifindex;
- ipp->ipp_addr = pkti->ipi6_addr;
- if (ipp->ipp_ifindex != 0)
- ipp->ipp_fields |= IPPF_IFINDEX;
- else
- ipp->ipp_fields &= ~IPPF_IFINDEX;
- if (!IN6_IS_ADDR_UNSPECIFIED(
- &ipp->ipp_addr))
- ipp->ipp_fields |= IPPF_ADDR;
- else
- ipp->ipp_fields &= ~IPPF_ADDR;
- }
- if (sticky) {
- error = icmp_build_hdrs(icmp);
- if (error != 0)
- return (error);
- PASS_OPT_TO_IP(connp);
- }
- break;
- case IPV6_HOPLIMIT:
- /* This option can only be used as ancillary data. */
- if (sticky)
- return (EINVAL);
- if (inlen != 0 && inlen != sizeof (int))
- return (EINVAL);
- if (checkonly)
- break;
-
- if (inlen == 0) {
- ipp->ipp_fields &= ~IPPF_HOPLIMIT;
- ipp->ipp_sticky_ignored |= IPPF_HOPLIMIT;
- } else {
- if (*i1 > 255 || *i1 < -1)
- return (EINVAL);
- if (*i1 == -1)
- ipp->ipp_hoplimit =
- is->is_ipv6_hoplimit;
- else
- ipp->ipp_hoplimit = *i1;
- ipp->ipp_fields |= IPPF_HOPLIMIT;
- }
- break;
- case IPV6_TCLASS:
- /*
- * IPV6_RECVTCLASS accepts -1 as use kernel default
- * and [0, 255] as the actualy traffic class.
- */
- if (inlen != 0 && inlen != sizeof (int)) {
- return (EINVAL);
- }
- if (checkonly)
- break;
-
- if (inlen == 0) {
- ipp->ipp_fields &= ~IPPF_TCLASS;
- ipp->ipp_sticky_ignored |= IPPF_TCLASS;
- } else {
- if (*i1 >= 256 || *i1 < -1)
- return (EINVAL);
- if (*i1 == -1) {
- ipp->ipp_tclass =
- IPV6_FLOW_TCLASS(
- IPV6_DEFAULT_VERS_AND_FLOW);
- } else {
- ipp->ipp_tclass = *i1;
- }
- ipp->ipp_fields |= IPPF_TCLASS;
- }
- if (sticky) {
- error = icmp_build_hdrs(icmp);
- if (error != 0)
- return (error);
- }
- break;
- case IPV6_NEXTHOP:
- /*
- * IP will verify that the nexthop is reachable
- * and fail for sticky options.
- */
- if (inlen != 0 && inlen != sizeof (sin6_t)) {
- return (EINVAL);
- }
- if (checkonly)
- break;
-
- if (inlen == 0) {
- ipp->ipp_fields &= ~IPPF_NEXTHOP;
- ipp->ipp_sticky_ignored |= IPPF_NEXTHOP;
- } else {
- sin6_t *sin6 = (sin6_t *)invalp;
-
- if (sin6->sin6_family != AF_INET6) {
- return (EAFNOSUPPORT);
- }
- if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
- return (EADDRNOTAVAIL);
- }
- ipp->ipp_nexthop = sin6->sin6_addr;
- if (!IN6_IS_ADDR_UNSPECIFIED(
- &ipp->ipp_nexthop))
- ipp->ipp_fields |= IPPF_NEXTHOP;
- else
- ipp->ipp_fields &= ~IPPF_NEXTHOP;
- }
- if (sticky) {
- error = icmp_build_hdrs(icmp);
- if (error != 0)
- return (error);
- PASS_OPT_TO_IP(connp);
- }
- break;
- case IPV6_HOPOPTS: {
- ip6_hbh_t *hopts = (ip6_hbh_t *)invalp;
- /*
- * Sanity checks - minimum size, size a multiple of
- * eight bytes, and matching size passed in.
- */
- if (inlen != 0 &&
- inlen != (8 * (hopts->ip6h_len + 1))) {
- return (EINVAL);
- }
-
- if (checkonly)
- break;
- error = optcom_pkt_set(invalp, inlen, sticky,
- (uchar_t **)&ipp->ipp_hopopts,
- &ipp->ipp_hopoptslen,
- sticky ? icmp->icmp_label_len_v6 : 0);
- if (error != 0)
- return (error);
- if (ipp->ipp_hopoptslen == 0) {
- ipp->ipp_fields &= ~IPPF_HOPOPTS;
- ipp->ipp_sticky_ignored |= IPPF_HOPOPTS;
- } else {
- ipp->ipp_fields |= IPPF_HOPOPTS;
- }
- if (sticky) {
- error = icmp_build_hdrs(icmp);
- if (error != 0)
- return (error);
+ ixa->ixa_flags |= IXAF_SET_RAW_CKSUM;
+ ixa->ixa_raw_cksum_offset = *i1;
+ ixa->ixa_flags |= IXAF_SET_ULP_CKSUM;
}
+ mutex_exit(&connp->conn_lock);
break;
}
- case IPV6_RTHDRDSTOPTS: {
- ip6_dest_t *dopts = (ip6_dest_t *)invalp;
-
- /*
- * Sanity checks - minimum size, size a multiple of
- * eight bytes, and matching size passed in.
- */
- if (inlen != 0 &&
- inlen != (8 * (dopts->ip6d_len + 1)))
- return (EINVAL);
-
- if (checkonly)
- break;
-
- if (inlen == 0) {
- if (sticky &&
- (ipp->ipp_fields & IPPF_RTDSTOPTS) != 0) {
- kmem_free(ipp->ipp_rtdstopts,
- ipp->ipp_rtdstoptslen);
- ipp->ipp_rtdstopts = NULL;
- ipp->ipp_rtdstoptslen = 0;
- }
- ipp->ipp_fields &= ~IPPF_RTDSTOPTS;
- ipp->ipp_sticky_ignored |= IPPF_RTDSTOPTS;
- } else {
- error = optcom_pkt_set(invalp, inlen, sticky,
- (uchar_t **)&ipp->ipp_rtdstopts,
- &ipp->ipp_rtdstoptslen, 0);
- if (error != 0)
- return (error);
- ipp->ipp_fields |= IPPF_RTDSTOPTS;
- }
- if (sticky) {
- error = icmp_build_hdrs(icmp);
- if (error != 0)
- return (error);
- }
- break;
- }
- case IPV6_DSTOPTS: {
- ip6_dest_t *dopts = (ip6_dest_t *)invalp;
+ break;
- /*
- * Sanity checks - minimum size, size a multiple of
- * eight bytes, and matching size passed in.
- */
- if (inlen != 0 &&
- inlen != (8 * (dopts->ip6d_len + 1)))
- return (EINVAL);
+ case IPPROTO_ICMPV6:
+ /*
+ * Only allow IPv6 option processing on IPv6 sockets.
+ */
+ if (connp->conn_family != AF_INET6)
+ return (EINVAL);
+ if (connp->conn_proto != IPPROTO_ICMPV6)
+ return (EINVAL);
+ switch (name) {
+ case ICMP6_FILTER:
if (checkonly)
break;
- if (inlen == 0) {
- if (sticky &&
- (ipp->ipp_fields & IPPF_DSTOPTS) != 0) {
- kmem_free(ipp->ipp_dstopts,
- ipp->ipp_dstoptslen);
- ipp->ipp_dstopts = NULL;
- ipp->ipp_dstoptslen = 0;
- }
- ipp->ipp_fields &= ~IPPF_DSTOPTS;
- ipp->ipp_sticky_ignored |= IPPF_DSTOPTS;
- } else {
- error = optcom_pkt_set(invalp, inlen, sticky,
- (uchar_t **)&ipp->ipp_dstopts,
- &ipp->ipp_dstoptslen, 0);
- if (error != 0)
- return (error);
- ipp->ipp_fields |= IPPF_DSTOPTS;
- }
- if (sticky) {
- error = icmp_build_hdrs(icmp);
- if (error != 0)
- return (error);
- }
- break;
- }
- case IPV6_RTHDR: {
- ip6_rthdr_t *rt = (ip6_rthdr_t *)invalp;
-
- /*
- * Sanity checks - minimum size, size a multiple of
- * eight bytes, and matching size passed in.
- */
- if (inlen != 0 &&
- inlen != (8 * (rt->ip6r_len + 1)))
+ if ((inlen != 0) &&
+ (inlen != sizeof (icmp6_filter_t)))
return (EINVAL);
- if (checkonly)
- break;
-
+ mutex_enter(&connp->conn_lock);
if (inlen == 0) {
- if (sticky &&
- (ipp->ipp_fields & IPPF_RTHDR) != 0) {
- kmem_free(ipp->ipp_rthdr,
- ipp->ipp_rthdrlen);
- ipp->ipp_rthdr = NULL;
- ipp->ipp_rthdrlen = 0;
+ if (icmp->icmp_filter != NULL) {
+ kmem_free(icmp->icmp_filter,
+ sizeof (icmp6_filter_t));
+ icmp->icmp_filter = NULL;
}
- ipp->ipp_fields &= ~IPPF_RTHDR;
- ipp->ipp_sticky_ignored |= IPPF_RTHDR;
} else {
- error = optcom_pkt_set(invalp, inlen, sticky,
- (uchar_t **)&ipp->ipp_rthdr,
- &ipp->ipp_rthdrlen, 0);
- if (error != 0)
- return (error);
- ipp->ipp_fields |= IPPF_RTHDR;
- }
- if (sticky) {
- error = icmp_build_hdrs(icmp);
- if (error != 0)
- return (error);
- }
- break;
- }
-
- case IPV6_DONTFRAG:
- if (checkonly)
- break;
-
- if (onoff) {
- ipp->ipp_fields |= IPPF_DONTFRAG;
- } else {
- ipp->ipp_fields &= ~IPPF_DONTFRAG;
- }
- break;
-
- case IPV6_USE_MIN_MTU:
- if (inlen != sizeof (int))
- return (EINVAL);
-
- if (*i1 < -1 || *i1 > 1)
- return (EINVAL);
-
- if (checkonly)
- break;
-
- ipp->ipp_fields |= IPPF_USE_MIN_MTU;
- ipp->ipp_use_min_mtu = *i1;
- break;
-
- /*
- * This option can't be set. Its only returned via
- * getsockopt() or ancillary data.
- */
- case IPV6_PATHMTU:
- return (EINVAL);
-
- case IPV6_SEC_OPT:
- case IPV6_SRC_PREFERENCES:
- case IPV6_V6ONLY:
- /* Handled at IP level */
- return (-EINVAL);
- default:
- *outlenp = 0;
- return (EINVAL);
- }
- break;
- } /* end IPPROTO_IPV6 */
-
- case IPPROTO_ICMPV6:
- /*
- * Only allow IPv6 option processing on IPv6 sockets.
- */
- if (icmp->icmp_family != AF_INET6) {
- *outlenp = 0;
- return (ENOPROTOOPT);
- }
- if (icmp->icmp_proto != IPPROTO_ICMPV6) {
- *outlenp = 0;
- return (ENOPROTOOPT);
- }
- switch (name) {
- case ICMP6_FILTER:
- if (!checkonly) {
- if ((inlen != 0) &&
- (inlen != sizeof (icmp6_filter_t)))
- return (EINVAL);
-
- if (inlen == 0) {
- if (icmp->icmp_filter != NULL) {
- kmem_free(icmp->icmp_filter,
- sizeof (icmp6_filter_t));
- icmp->icmp_filter = NULL;
- }
- } else {
+ if (icmp->icmp_filter == NULL) {
+ icmp->icmp_filter = kmem_alloc(
+ sizeof (icmp6_filter_t),
+ KM_NOSLEEP);
if (icmp->icmp_filter == NULL) {
- icmp->icmp_filter = kmem_alloc(
- sizeof (icmp6_filter_t),
- KM_NOSLEEP);
- if (icmp->icmp_filter == NULL) {
- *outlenp = 0;
- return (ENOBUFS);
- }
+ mutex_exit(&connp->conn_lock);
+ return (ENOBUFS);
}
- (void) bcopy(invalp, icmp->icmp_filter,
- inlen);
}
+ (void) bcopy(invalp, icmp->icmp_filter, inlen);
}
+ mutex_exit(&connp->conn_lock);
break;
-
- default:
- *outlenp = 0;
- return (EINVAL);
}
break;
- default:
- *outlenp = 0;
- return (EINVAL);
- }
- /*
- * Common case of OK return with outval same as inval.
- */
- if (invalp != outvalp) {
- /* don't trust bcopy for identical src/dst */
- (void) bcopy(invalp, outvalp, inlen);
}
- *outlenp = inlen;
- return (0);
+ error = conn_opt_set(coa, level, name, inlen, invalp,
+ checkonly, cr);
+ return (error);
}
-/* This routine sets socket options. */
-/* ARGSUSED */
+/*
+ * This routine sets socket options.
+ */
int
icmp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
void *thisdg_attrs, cred_t *cr)
{
- boolean_t checkonly;
- int error;
+ icmp_t *icmp = connp->conn_icmp;
+ int err;
+ conn_opt_arg_t coas, *coa;
+ boolean_t checkonly;
+ icmp_stack_t *is = icmp->icmp_is;
- error = 0;
switch (optset_context) {
case SETFN_OPTCOM_CHECKONLY:
checkonly = B_TRUE;
@@ -3152,8 +2127,7 @@ icmp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
*/
if (inlen == 0) {
*outlenp = 0;
- error = 0;
- goto done;
+ return (0);
}
break;
case SETFN_OPTCOM_NEGOTIATE:
@@ -3171,8 +2145,7 @@ icmp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
*/
if (!icmp_opt_allow_udr_set(level, name)) {
*outlenp = 0;
- error = EINVAL;
- goto done;
+ return (EINVAL);
}
break;
default:
@@ -3180,105 +2153,265 @@ icmp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
* We should never get here
*/
*outlenp = 0;
- error = EINVAL;
- goto done;
+ return (EINVAL);
}
ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
(optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
- error = icmp_do_opt_set(connp, level, name, inlen, invalp, outlenp,
- outvalp, cr, thisdg_attrs, checkonly);
-done:
- return (error);
+ if (thisdg_attrs != NULL) {
+ /* Options from T_UNITDATA_REQ */
+ coa = (conn_opt_arg_t *)thisdg_attrs;
+ ASSERT(coa->coa_connp == connp);
+ ASSERT(coa->coa_ixa != NULL);
+ ASSERT(coa->coa_ipp != NULL);
+ ASSERT(coa->coa_ancillary);
+ } else {
+ coa = &coas;
+ coas.coa_connp = connp;
+ /* Get a reference on conn_ixa to prevent concurrent mods */
+ coas.coa_ixa = conn_get_ixa(connp, B_TRUE);
+ if (coas.coa_ixa == NULL) {
+ *outlenp = 0;
+ return (ENOMEM);
+ }
+ coas.coa_ipp = &connp->conn_xmit_ipp;
+ coas.coa_ancillary = B_FALSE;
+ coas.coa_changed = 0;
+ }
+
+ err = icmp_do_opt_set(coa, level, name, inlen, invalp,
+ cr, checkonly);
+ if (err != 0) {
+errout:
+ if (!coa->coa_ancillary)
+ ixa_refrele(coa->coa_ixa);
+ *outlenp = 0;
+ return (err);
+ }
+
+ /*
+ * Common case of OK return with outval same as inval.
+ */
+ if (invalp != outvalp) {
+ /* don't trust bcopy for identical src/dst */
+ (void) bcopy(invalp, outvalp, inlen);
+ }
+ *outlenp = inlen;
+
+ /*
+ * If this was not ancillary data, then we rebuild the headers,
+ * update the IRE/NCE, and IPsec as needed.
+ * Since the label depends on the destination we go through
+ * ip_set_destination first.
+ */
+ if (coa->coa_ancillary) {
+ return (0);
+ }
+
+ if (coa->coa_changed & COA_ROUTE_CHANGED) {
+ in6_addr_t saddr, faddr, nexthop;
+ in_port_t fport;
+
+ /*
+ * We clear lastdst to make sure we pick up the change
+ * next time sending.
+ * If we are connected we re-cache the information.
+ * We ignore errors to preserve BSD behavior.
+ * Note that we don't redo IPsec policy lookup here
+ * since the final destination (or source) didn't change.
+ */
+ mutex_enter(&connp->conn_lock);
+ connp->conn_v6lastdst = ipv6_all_zeros;
+
+ ip_attr_nexthop(coa->coa_ipp, coa->coa_ixa,
+ &connp->conn_faddr_v6, &nexthop);
+ saddr = connp->conn_saddr_v6;
+ faddr = connp->conn_faddr_v6;
+ fport = connp->conn_fport;
+ mutex_exit(&connp->conn_lock);
+
+ if (!IN6_IS_ADDR_UNSPECIFIED(&faddr) &&
+ !IN6_IS_ADDR_V4MAPPED_ANY(&faddr)) {
+ (void) ip_attr_connect(connp, coa->coa_ixa,
+ &saddr, &faddr, &nexthop, fport, NULL, NULL,
+ IPDF_ALLOW_MCBC | IPDF_VERIFY_DST);
+ }
+ }
+
+ ixa_refrele(coa->coa_ixa);
+
+ if (coa->coa_changed & COA_HEADER_CHANGED) {
+ /*
+ * Rebuild the header template if we are connected.
+ * Otherwise clear conn_v6lastdst so we rebuild the header
+ * in the data path.
+ */
+ mutex_enter(&connp->conn_lock);
+ if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
+ !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
+ err = icmp_build_hdr_template(connp,
+ &connp->conn_saddr_v6, &connp->conn_faddr_v6,
+ connp->conn_flowinfo);
+ if (err != 0) {
+ mutex_exit(&connp->conn_lock);
+ return (err);
+ }
+ } else {
+ connp->conn_v6lastdst = ipv6_all_zeros;
+ }
+ mutex_exit(&connp->conn_lock);
+ }
+ if (coa->coa_changed & COA_RCVBUF_CHANGED) {
+ (void) proto_set_rx_hiwat(connp->conn_rq, connp,
+ connp->conn_rcvbuf);
+ }
+ if ((coa->coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) {
+ connp->conn_wq->q_hiwat = connp->conn_sndbuf;
+ }
+ if (coa->coa_changed & COA_WROFF_CHANGED) {
+ /* Increase wroff if needed */
+ uint_t wroff;
+
+ mutex_enter(&connp->conn_lock);
+ wroff = connp->conn_ht_iphc_allocated + is->is_wroff_extra;
+ if (wroff > connp->conn_wroff) {
+ connp->conn_wroff = wroff;
+ mutex_exit(&connp->conn_lock);
+ (void) proto_set_tx_wroff(connp->conn_rq, connp, wroff);
+ } else {
+ mutex_exit(&connp->conn_lock);
+ }
+ }
+ if (coa->coa_changed & COA_ICMP_BIND_NEEDED) {
+ icmp_bind_proto(icmp);
+ }
+ return (err);
}
/* This routine sets socket options. */
-/* ARGSUSED */
int
icmp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name,
uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
- void *thisdg_attrs, cred_t *cr, mblk_t *mblk)
+ void *thisdg_attrs, cred_t *cr)
{
- conn_t *connp = Q_TO_CONN(q);
- icmp_t *icmp;
+ conn_t *connp = Q_TO_CONN(q);
int error;
- icmp = connp->conn_icmp;
- rw_enter(&icmp->icmp_rwlock, RW_WRITER);
error = icmp_opt_set(connp, optset_context, level, name, inlen, invalp,
outlenp, outvalp, thisdg_attrs, cr);
- rw_exit(&icmp->icmp_rwlock);
return (error);
}
/*
- * Update icmp_sticky_hdrs based on icmp_sticky_ipp, icmp_v6src, icmp_ttl,
- * icmp_proto, icmp_raw_checksum and icmp_no_tp_cksum.
- * The headers include ip6i_t (if needed), ip6_t, and any sticky extension
- * headers.
- * Returns failure if can't allocate memory.
+ * Setup IP headers.
+ *
+ * Note that IP_HDRINCL has ipha_protocol that is different than conn_proto,
+ * but icmp_output_hdrincl restores ipha_protocol once we return.
*/
-static int
-icmp_build_hdrs(icmp_t *icmp)
+mblk_t *
+icmp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp,
+ const in6_addr_t *v6src, const in6_addr_t *v6dst, uint32_t flowinfo,
+ mblk_t *data_mp, int *errorp)
{
- icmp_stack_t *is = icmp->icmp_is;
- uchar_t *hdrs;
- uint_t hdrs_len;
- ip6_t *ip6h;
- ip6i_t *ip6i;
- ip6_pkt_t *ipp = &icmp->icmp_sticky_ipp;
-
- ASSERT(RW_WRITE_HELD(&icmp->icmp_rwlock));
- hdrs_len = ip_total_hdrs_len_v6(ipp);
- ASSERT(hdrs_len != 0);
- if (hdrs_len != icmp->icmp_sticky_hdrs_len) {
- /* Need to reallocate */
- if (hdrs_len != 0) {
- hdrs = kmem_alloc(hdrs_len, KM_NOSLEEP);
- if (hdrs == NULL)
- return (ENOMEM);
- } else {
- hdrs = NULL;
- }
- if (icmp->icmp_sticky_hdrs_len != 0) {
- kmem_free(icmp->icmp_sticky_hdrs,
- icmp->icmp_sticky_hdrs_len);
- }
- icmp->icmp_sticky_hdrs = hdrs;
- icmp->icmp_sticky_hdrs_len = hdrs_len;
+ mblk_t *mp;
+ icmp_stack_t *is = connp->conn_netstack->netstack_icmp;
+ uint_t data_len;
+ uint32_t cksum;
+
+ data_len = msgdsize(data_mp);
+ mp = conn_prepend_hdr(ixa, ipp, v6src, v6dst, connp->conn_proto,
+ flowinfo, 0, data_mp, data_len, is->is_wroff_extra, &cksum, errorp);
+ if (mp == NULL) {
+ ASSERT(*errorp != 0);
+ return (NULL);
}
- ip_build_hdrs_v6(icmp->icmp_sticky_hdrs,
- icmp->icmp_sticky_hdrs_len, ipp, icmp->icmp_proto);
- /* Set header fields not in ipp */
- if (ipp->ipp_fields & IPPF_HAS_IP6I) {
- ip6i = (ip6i_t *)icmp->icmp_sticky_hdrs;
- ip6h = (ip6_t *)&ip6i[1];
+ ixa->ixa_pktlen = data_len + ixa->ixa_ip_hdr_length;
- if (ipp->ipp_fields & IPPF_RAW_CKSUM) {
- ip6i->ip6i_flags |= IP6I_RAW_CHECKSUM;
- ip6i->ip6i_checksum_off = icmp->icmp_checksum_off;
+ /*
+ * If there was a routing option/header then conn_prepend_hdr
+ * has massaged it and placed the pseudo-header checksum difference
+ * in the cksum argument.
+ *
+ * Prepare for ICMPv6 checksum done in IP.
+ *
+ * We make it easy for IP to include our pseudo header
+ * by putting our length (and any routing header adjustment)
+ * in the ICMPv6 checksum field.
+ * The IP source, destination, and length have already been set by
+ * conn_prepend_hdr.
+ */
+ cksum += data_len;
+ cksum = (cksum >> 16) + (cksum & 0xFFFF);
+ ASSERT(cksum < 0x10000);
+
+ if (ixa->ixa_flags & IXAF_IS_IPV4) {
+ ipha_t *ipha = (ipha_t *)mp->b_rptr;
+
+ ASSERT(ntohs(ipha->ipha_length) == ixa->ixa_pktlen);
+ } else {
+ ip6_t *ip6h = (ip6_t *)mp->b_rptr;
+ uint_t cksum_offset = 0;
+
+ ASSERT(ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN == ixa->ixa_pktlen);
+
+ if (ixa->ixa_flags & IXAF_SET_ULP_CKSUM) {
+ if (connp->conn_proto == IPPROTO_ICMPV6) {
+ cksum_offset = ixa->ixa_ip_hdr_length +
+ offsetof(icmp6_t, icmp6_cksum);
+ } else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
+ cksum_offset = ixa->ixa_ip_hdr_length +
+ ixa->ixa_raw_cksum_offset;
+ }
}
- if (ipp->ipp_fields & IPPF_NO_CKSUM) {
- ip6i->ip6i_flags |= IP6I_NO_ULP_CKSUM;
+ if (cksum_offset != 0) {
+ uint16_t *ptr;
+
+ /* Make sure the checksum fits in the first mblk */
+ if (cksum_offset + sizeof (short) > MBLKL(mp)) {
+ mblk_t *mp1;
+
+ mp1 = msgpullup(mp,
+ cksum_offset + sizeof (short));
+ freemsg(mp);
+ if (mp1 == NULL) {
+ *errorp = ENOMEM;
+ return (NULL);
+ }
+ mp = mp1;
+ ip6h = (ip6_t *)mp->b_rptr;
+ }
+ ptr = (uint16_t *)(mp->b_rptr + cksum_offset);
+ *ptr = htons(cksum);
}
- } else {
- ip6h = (ip6_t *)icmp->icmp_sticky_hdrs;
}
- if (!(ipp->ipp_fields & IPPF_ADDR))
- ip6h->ip6_src = icmp->icmp_v6src;
+ /* Note that we don't try to update wroff due to ancillary data */
+ return (mp);
+}
+
+static int
+icmp_build_hdr_template(conn_t *connp, const in6_addr_t *v6src,
+ const in6_addr_t *v6dst, uint32_t flowinfo)
+{
+ int error;
- /* Try to get everything in a single mblk */
- if (hdrs_len > icmp->icmp_max_hdr_len) {
- icmp->icmp_max_hdr_len = hdrs_len;
- rw_exit(&icmp->icmp_rwlock);
- (void) proto_set_tx_wroff(icmp->icmp_connp->conn_rq,
- icmp->icmp_connp,
- icmp->icmp_max_hdr_len + is->is_wroff_extra);
- rw_enter(&icmp->icmp_rwlock, RW_WRITER);
- }
+ ASSERT(MUTEX_HELD(&connp->conn_lock));
+ /*
+ * We clear lastdst to make sure we don't use the lastdst path
+ * next time sending since we might not have set v6dst yet.
+ */
+ connp->conn_v6lastdst = ipv6_all_zeros;
+
+ error = conn_build_hdr_template(connp, 0, 0, v6src, v6dst, flowinfo);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Any routing header/option has been massaged. The checksum difference
+ * is stored in conn_sum.
+ */
return (0);
}
@@ -3370,16 +2503,15 @@ icmp_queue_fallback(icmp_t *icmp, mblk_t *mp)
* TPI, then we'll queue the mp for later processing.
*/
static void
-icmp_ulp_recv(conn_t *connp, mblk_t *mp)
+icmp_ulp_recv(conn_t *connp, mblk_t *mp, uint_t len)
{
-
if (IPCL_IS_NONSTR(connp)) {
icmp_t *icmp = connp->conn_icmp;
int error;
+ ASSERT(len == msgdsize(mp));
if ((*connp->conn_upcalls->su_recv)
- (connp->conn_upper_handle, mp, msgdsize(mp), 0, &error,
- NULL) < 0) {
+ (connp->conn_upper_handle, mp, len, 0, &error, NULL) < 0) {
mutex_enter(&icmp->icmp_recv_lock);
if (error == ENOSPC) {
/*
@@ -3409,115 +2541,74 @@ icmp_ulp_recv(conn_t *connp, mblk_t *mp)
}
}
-/*ARGSUSED2*/
+/*
+ * This is the inbound data path.
+ * IP has already pulled up the IP headers and verified alignment
+ * etc.
+ */
+/* ARGSUSED2 */
static void
-icmp_input(void *arg1, mblk_t *mp, void *arg2)
+icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
{
- conn_t *connp = (conn_t *)arg1;
+ conn_t *connp = (conn_t *)arg1;
struct T_unitdata_ind *tudi;
- uchar_t *rptr;
+ uchar_t *rptr; /* Pointer to IP header */
+ int ip_hdr_length;
+ int udi_size; /* Size of T_unitdata_ind */
+ int pkt_len;
icmp_t *icmp;
+ ip_pkt_t ipps;
+ ip6_t *ip6h;
+ mblk_t *mp1;
+ crb_t recv_ancillary;
icmp_stack_t *is;
sin_t *sin;
sin6_t *sin6;
- ip6_t *ip6h;
- ip6i_t *ip6i;
- mblk_t *mp1;
- int hdr_len;
ipha_t *ipha;
- int udi_size; /* Size of T_unitdata_ind */
- uint_t ipvers;
- ip6_pkt_t ipp;
- uint8_t nexthdr;
- ip_pktinfo_t *pinfo = NULL;
- mblk_t *options_mp = NULL;
- uint_t icmp_opt = 0;
- boolean_t icmp_ipv6_recvhoplimit = B_FALSE;
- uint_t hopstrip;
ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
icmp = connp->conn_icmp;
is = icmp->icmp_is;
rptr = mp->b_rptr;
- ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_CTL);
+
+ ASSERT(DB_TYPE(mp) == M_DATA);
ASSERT(OK_32PTR(rptr));
+ ASSERT(ira->ira_pktlen == msgdsize(mp));
+ pkt_len = ira->ira_pktlen;
/*
- * IP should have prepended the options data in an M_CTL
- * Check M_CTL "type" to make sure are not here bcos of
- * a valid ICMP message
+ * Get a snapshot of these and allow other threads to change
+ * them after that. We need the same recv_ancillary when determining
+ * the size as when adding the ancillary data items.
*/
- if (DB_TYPE(mp) == M_CTL) {
- /*
- * FIXME: does IP still do this?
- * IP sends up the IPSEC_IN message for handling IPSEC
- * policy at the TCP level. We don't need it here.
- */
- if (*(uint32_t *)(mp->b_rptr) == IPSEC_IN) {
- mp1 = mp->b_cont;
- freeb(mp);
- mp = mp1;
- rptr = mp->b_rptr;
- } else if (MBLKL(mp) == sizeof (ip_pktinfo_t) &&
- ((ip_pktinfo_t *)mp->b_rptr)->ip_pkt_ulp_type ==
- IN_PKTINFO) {
- /*
- * IP_RECVIF or IP_RECVSLLA or IPF_RECVADDR information
- * has been prepended to the packet by IP. We need to
- * extract the mblk and adjust the rptr
- */
- pinfo = (ip_pktinfo_t *)mp->b_rptr;
- options_mp = mp;
- mp = mp->b_cont;
- rptr = mp->b_rptr;
- } else {
- /*
- * ICMP messages.
- */
- icmp_icmp_error(connp, mp);
- return;
- }
- }
+ mutex_enter(&connp->conn_lock);
+ recv_ancillary = connp->conn_recv_ancillary;
+ mutex_exit(&connp->conn_lock);
- /*
- * Discard message if it is misaligned or smaller than the IP header.
- */
- if (!OK_32PTR(rptr) || (mp->b_wptr - rptr) < sizeof (ipha_t)) {
- freemsg(mp);
- if (options_mp != NULL)
- freeb(options_mp);
- BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
- return;
- }
- ipvers = IPH_HDR_VERSION((ipha_t *)rptr);
+ ip_hdr_length = ira->ira_ip_hdr_length;
+ ASSERT(MBLKL(mp) >= ip_hdr_length); /* IP did a pullup */
+
+ /* Initialize regardless of IP version */
+ ipps.ipp_fields = 0;
+
+ if (ira->ira_flags & IRAF_IS_IPV4) {
+ ASSERT(IPH_HDR_VERSION(rptr) == IPV4_VERSION);
+ ASSERT(MBLKL(mp) >= sizeof (ipha_t));
+ ASSERT(ira->ira_ip_hdr_length == IPH_HDR_LENGTH(rptr));
+
+ ipha = (ipha_t *)mp->b_rptr;
+ if (recv_ancillary.crb_all != 0)
+ (void) ip_find_hdr_v4(ipha, &ipps, B_FALSE);
- /* Handle M_DATA messages containing IP packets messages */
- if (ipvers == IPV4_VERSION) {
/*
- * Special case where IP attaches
- * the IRE needs to be handled so that we don't send up
- * IRE to the user land.
+ * BSD for some reason adjusts ipha_length to exclude the
+ * IP header length. We do the same.
*/
- ipha = (ipha_t *)rptr;
- hdr_len = IPH_HDR_LENGTH(ipha);
-
- if (ipha->ipha_protocol == IPPROTO_TCP) {
- tcph_t *tcph = (tcph_t *)&mp->b_rptr[hdr_len];
-
- if (((tcph->th_flags[0] & (TH_SYN|TH_ACK)) ==
- TH_SYN) && mp->b_cont != NULL) {
- mp1 = mp->b_cont;
- if (mp1->b_datap->db_type == IRE_DB_TYPE) {
- freeb(mp1);
- mp->b_cont = NULL;
- }
- }
- }
if (is->is_bsd_compat) {
ushort_t len;
- len = ntohs(ipha->ipha_length);
+ len = ntohs(ipha->ipha_length);
if (mp->b_datap->db_ref > 1) {
/*
* Allocate a new IP header so that we can
@@ -3525,70 +2616,58 @@ icmp_input(void *arg1, mblk_t *mp, void *arg2)
*/
mblk_t *mp1;
- mp1 = allocb(hdr_len, BPRI_MED);
- if (!mp1) {
+ mp1 = allocb(ip_hdr_length, BPRI_MED);
+ if (mp1 == NULL) {
freemsg(mp);
- if (options_mp != NULL)
- freeb(options_mp);
BUMP_MIB(&is->is_rawip_mib,
rawipInErrors);
return;
}
- bcopy(rptr, mp1->b_rptr, hdr_len);
- mp->b_rptr = rptr + hdr_len;
+ bcopy(rptr, mp1->b_rptr, ip_hdr_length);
+ mp->b_rptr = rptr + ip_hdr_length;
rptr = mp1->b_rptr;
ipha = (ipha_t *)rptr;
mp1->b_cont = mp;
- mp1->b_wptr = rptr + hdr_len;
+ mp1->b_wptr = rptr + ip_hdr_length;
mp = mp1;
}
- len -= hdr_len;
+ len -= ip_hdr_length;
ipha->ipha_length = htons(len);
}
- }
- /*
- * This is the inbound data path. Packets are passed upstream as
- * T_UNITDATA_IND messages with full IP headers still attached.
- */
- if (icmp->icmp_family == AF_INET) {
- ASSERT(ipvers == IPV4_VERSION);
- udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin_t);
- if (icmp->icmp_recvif && (pinfo != NULL) &&
- (pinfo->ip_pkt_flags & IPF_RECVIF)) {
- udi_size += sizeof (struct T_opthdr) +
- sizeof (uint_t);
- }
+ /*
+	 * For RAW sockets we do not pass ICMP/IPv4 packets to AF_INET6
+ * sockets. This is ensured by icmp_bind and the IP fanout code.
+ */
+ ASSERT(connp->conn_family == AF_INET);
- if (icmp->icmp_ip_recvpktinfo && (pinfo != NULL) &&
- (pinfo->ip_pkt_flags & IPF_RECVADDR)) {
- udi_size += sizeof (struct T_opthdr) +
- sizeof (struct in_pktinfo);
- }
+ /*
+ * This is the inbound data path. Packets are passed upstream
+ * as T_UNITDATA_IND messages with full IPv4 headers still
+ * attached.
+ */
/*
- * If SO_TIMESTAMP is set allocate the appropriate sized
- * buffer. Since gethrestime() expects a pointer aligned
- * argument, we allocate space necessary for extra
- * alignment (even though it might not be used).
+ * Normally only send up the source address.
+ * If any ancillary data items are wanted we add those.
*/
- if (icmp->icmp_timestamp) {
- udi_size += sizeof (struct T_opthdr) +
- sizeof (timestruc_t) + _POINTER_ALIGNMENT;
+ udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin_t);
+ if (recv_ancillary.crb_all != 0) {
+ udi_size += conn_recvancillary_size(connp,
+ recv_ancillary, ira, mp, &ipps);
}
+
+ /* Allocate a message block for the T_UNITDATA_IND structure. */
mp1 = allocb(udi_size, BPRI_MED);
if (mp1 == NULL) {
freemsg(mp);
- if (options_mp != NULL)
- freeb(options_mp);
BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
return;
}
mp1->b_cont = mp;
- mp = mp1;
- tudi = (struct T_unitdata_ind *)mp->b_rptr;
- mp->b_datap->db_type = M_PROTO;
- mp->b_wptr = (uchar_t *)tudi + udi_size;
+ tudi = (struct T_unitdata_ind *)mp1->b_rptr;
+ mp1->b_datap->db_type = M_PROTO;
+ mp1->b_wptr = (uchar_t *)tudi + udi_size;
tudi->PRIM_type = T_UNITDATA_IND;
tudi->SRC_length = sizeof (sin_t);
tudi->SRC_offset = sizeof (struct T_unitdata_ind);
@@ -3596,316 +2675,110 @@ icmp_input(void *arg1, mblk_t *mp, void *arg2)
*sin = sin_null;
sin->sin_family = AF_INET;
sin->sin_addr.s_addr = ipha->ipha_src;
+ *(uint32_t *)&sin->sin_zero[0] = 0;
+ *(uint32_t *)&sin->sin_zero[4] = 0;
tudi->OPT_offset = sizeof (struct T_unitdata_ind) +
sizeof (sin_t);
udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin_t));
tudi->OPT_length = udi_size;
/*
- * Add options if IP_RECVIF is set
+ * Add options if IP_RECVIF etc is set
*/
if (udi_size != 0) {
- char *dstopt;
-
- dstopt = (char *)&sin[1];
- if (icmp->icmp_recvif && (pinfo != NULL) &&
- (pinfo->ip_pkt_flags & IPF_RECVIF)) {
-
- struct T_opthdr *toh;
- uint_t *dstptr;
-
- toh = (struct T_opthdr *)dstopt;
- toh->level = IPPROTO_IP;
- toh->name = IP_RECVIF;
- toh->len = sizeof (struct T_opthdr) +
- sizeof (uint_t);
- toh->status = 0;
- dstopt += sizeof (struct T_opthdr);
- dstptr = (uint_t *)dstopt;
- *dstptr = pinfo->ip_pkt_ifindex;
- dstopt += sizeof (uint_t);
- udi_size -= toh->len;
- }
- if (icmp->icmp_timestamp) {
- struct T_opthdr *toh;
-
- toh = (struct T_opthdr *)dstopt;
- toh->level = SOL_SOCKET;
- toh->name = SCM_TIMESTAMP;
- toh->len = sizeof (struct T_opthdr) +
- sizeof (timestruc_t) + _POINTER_ALIGNMENT;
- toh->status = 0;
- dstopt += sizeof (struct T_opthdr);
- /* Align for gethrestime() */
- dstopt = (char *)P2ROUNDUP((intptr_t)dstopt,
- sizeof (intptr_t));
- gethrestime((timestruc_t *)dstopt);
- dstopt = (char *)toh + toh->len;
- udi_size -= toh->len;
- }
- if (icmp->icmp_ip_recvpktinfo && (pinfo != NULL) &&
- (pinfo->ip_pkt_flags & IPF_RECVADDR)) {
- struct T_opthdr *toh;
- struct in_pktinfo *pktinfop;
-
- toh = (struct T_opthdr *)dstopt;
- toh->level = IPPROTO_IP;
- toh->name = IP_PKTINFO;
- toh->len = sizeof (struct T_opthdr) +
- sizeof (in_pktinfo_t);
- toh->status = 0;
- dstopt += sizeof (struct T_opthdr);
- pktinfop = (struct in_pktinfo *)dstopt;
- pktinfop->ipi_ifindex = pinfo->ip_pkt_ifindex;
- pktinfop->ipi_spec_dst =
- pinfo->ip_pkt_match_addr;
-
- pktinfop->ipi_addr.s_addr = ipha->ipha_dst;
-
- dstopt += sizeof (struct in_pktinfo);
- udi_size -= toh->len;
- }
-
- /* Consumed all of allocated space */
- ASSERT(udi_size == 0);
+ conn_recvancillary_add(connp, recv_ancillary, ira,
+ &ipps, (uchar_t *)&sin[1], udi_size);
}
-
- if (options_mp != NULL)
- freeb(options_mp);
-
- BUMP_MIB(&is->is_rawip_mib, rawipInDatagrams);
goto deliver;
}
+ ASSERT(IPH_HDR_VERSION(rptr) == IPV6_VERSION);
/*
- * We don't need options_mp in the IPv6 path.
+ * IPv6 packets can only be received by applications
+ * that are prepared to receive IPv6 addresses.
+ * The IP fanout must ensure this.
*/
- if (options_mp != NULL) {
- freeb(options_mp);
- options_mp = NULL;
- }
+ ASSERT(connp->conn_family == AF_INET6);
/*
- * Discard message if it is smaller than the IPv6 header
- * or if the header is malformed.
+ * Handle IPv6 packets. We don't pass up the IP headers with the
+ * payload for IPv6.
*/
- if ((mp->b_wptr - rptr) < sizeof (ip6_t) ||
- IPH_HDR_VERSION((ipha_t *)rptr) != IPV6_VERSION ||
- icmp->icmp_family != AF_INET6) {
- freemsg(mp);
- BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
- return;
- }
-
- /* Initialize */
- ipp.ipp_fields = 0;
- hopstrip = 0;
ip6h = (ip6_t *)rptr;
- /*
- * Call on ip_find_hdr_v6 which gets the total hdr len
- * as well as individual lenghts of ext hdrs (and ptrs to
- * them).
- */
- if (ip6h->ip6_nxt != icmp->icmp_proto) {
- /* Look for ifindex information */
- if (ip6h->ip6_nxt == IPPROTO_RAW) {
- ip6i = (ip6i_t *)ip6h;
- if (ip6i->ip6i_flags & IP6I_IFINDEX) {
- ASSERT(ip6i->ip6i_ifindex != 0);
- ipp.ipp_fields |= IPPF_IFINDEX;
- ipp.ipp_ifindex = ip6i->ip6i_ifindex;
- }
- rptr = (uchar_t *)&ip6i[1];
- mp->b_rptr = rptr;
- if (rptr == mp->b_wptr) {
- mp1 = mp->b_cont;
- freeb(mp);
- mp = mp1;
- rptr = mp->b_rptr;
- }
- ASSERT(mp->b_wptr - rptr >= IPV6_HDR_LEN);
- ip6h = (ip6_t *)rptr;
- }
- hdr_len = ip_find_hdr_v6(mp, ip6h, &ipp, &nexthdr);
+ if (recv_ancillary.crb_all != 0) {
+ /*
+		 * Call on ip_find_hdr_v6 which gets individual lengths of
+ * extension headers (and pointers to them).
+ */
+ uint8_t nexthdr;
+
+ /* We don't care about the length or nextheader. */
+ (void) ip_find_hdr_v6(mp, ip6h, B_TRUE, &ipps, &nexthdr);
/*
- * We need to lie a bit to the user because users inside
- * labeled compartments should not see their own labels. We
- * assume that in all other respects IP has checked the label,
- * and that the label is always first among the options. (If
- * it's not first, then this code won't see it, and the option
- * will be passed along to the user.)
+ * We do not pass up hop-by-hop options or any other
+ * extension header as part of the packet. Applications
+ * that want to see them have to specify IPV6_RECV* socket
+ * options. And conn_recvancillary_size/add explicitly
+ * drops the TX option from IPV6_HOPOPTS as it does for UDP.
*
- * If we had multilevel ICMP sockets, then the following code
- * should be skipped for them to allow the user to see the
- * label.
- *
- * Alignment restrictions in the definition of IP options
- * (namely, the requirement that the 4-octet DOI goes on a
- * 4-octet boundary) mean that we know exactly where the option
- * should start, but we're lenient for other hosts.
- *
- * Note that there are no multilevel ICMP or raw IP sockets
- * yet, thus nobody ever sees the IP6OPT_LS option.
+ * If we had multilevel ICMP sockets, then we'd want to
+ * modify conn_recvancillary_size/add to
+ * allow the user to see the label.
*/
- if ((ipp.ipp_fields & IPPF_HOPOPTS) &&
- ipp.ipp_hopoptslen > 5 && is_system_labeled()) {
- const uchar_t *ucp =
- (const uchar_t *)ipp.ipp_hopopts + 2;
- int remlen = ipp.ipp_hopoptslen - 2;
-
- while (remlen > 0) {
- if (*ucp == IP6OPT_PAD1) {
- remlen--;
- ucp++;
- } else if (*ucp == IP6OPT_PADN) {
- remlen -= ucp[1] + 2;
- ucp += ucp[1] + 2;
- } else if (*ucp == ip6opt_ls) {
- hopstrip = (ucp -
- (const uchar_t *)ipp.ipp_hopopts) +
- ucp[1] + 2;
- hopstrip = (hopstrip + 7) & ~7;
- break;
- } else {
- /* label option must be first */
- break;
- }
- }
- }
- } else {
- hdr_len = IPV6_HDR_LEN;
- ip6i = NULL;
- nexthdr = ip6h->ip6_nxt;
- }
- /*
- * One special case where IP attaches the IRE needs to
- * be handled so that we don't send up IRE to the user land.
- */
- if (nexthdr == IPPROTO_TCP) {
- tcph_t *tcph = (tcph_t *)&mp->b_rptr[hdr_len];
-
- if (((tcph->th_flags[0] & (TH_SYN|TH_ACK)) == TH_SYN) &&
- mp->b_cont != NULL) {
- mp1 = mp->b_cont;
- if (mp1->b_datap->db_type == IRE_DB_TYPE) {
- freeb(mp1);
- mp->b_cont = NULL;
- }
- }
}
+
/*
* Check a filter for ICMPv6 types if needed.
* Verify raw checksums if needed.
*/
- if (icmp->icmp_filter != NULL || icmp->icmp_raw_checksum) {
- if (icmp->icmp_filter != NULL) {
- int type;
+ mutex_enter(&connp->conn_lock);
+ if (icmp->icmp_filter != NULL) {
+ int type;
- /* Assumes that IP has done the pullupmsg */
- type = mp->b_rptr[hdr_len];
+ /* Assumes that IP has done the pullupmsg */
+ type = mp->b_rptr[ip_hdr_length];
- ASSERT(mp->b_rptr + hdr_len <= mp->b_wptr);
- if (ICMP6_FILTER_WILLBLOCK(type, icmp->icmp_filter)) {
- freemsg(mp);
- return;
- }
- } else {
- /* Checksum */
- uint16_t *up;
- uint32_t sum;
- int remlen;
-
- up = (uint16_t *)&ip6h->ip6_src;
-
- remlen = msgdsize(mp) - hdr_len;
- sum = htons(icmp->icmp_proto + remlen)
- + up[0] + up[1] + up[2] + up[3]
- + up[4] + up[5] + up[6] + up[7]
- + up[8] + up[9] + up[10] + up[11]
- + up[12] + up[13] + up[14] + up[15];
- sum = (sum & 0xffff) + (sum >> 16);
- sum = IP_CSUM(mp, hdr_len, sum);
- if (sum != 0) {
- /* IPv6 RAW checksum failed */
- ip0dbg(("icmp_rput: RAW checksum "
- "failed %x\n", sum));
- freemsg(mp);
- BUMP_MIB(&is->is_rawip_mib,
- rawipInCksumErrs);
- return;
- }
+ ASSERT(mp->b_rptr + ip_hdr_length <= mp->b_wptr);
+ if (ICMP6_FILTER_WILLBLOCK(type, icmp->icmp_filter)) {
+ mutex_exit(&connp->conn_lock);
+ freemsg(mp);
+ return;
}
}
- /* Skip all the IPv6 headers per API */
- mp->b_rptr += hdr_len;
-
- udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t);
-
- /*
- * We use local variables icmp_opt and icmp_ipv6_recvhoplimit to
- * maintain state information, instead of relying on icmp_t
- * structure, since there arent any locks protecting these members
- * and there is a window where there might be a race between a
- * thread setting options on the write side and a thread reading
- * these options on the read size.
- */
- if (ipp.ipp_fields & (IPPF_HOPOPTS|IPPF_DSTOPTS|IPPF_RTDSTOPTS|
- IPPF_RTHDR|IPPF_IFINDEX)) {
- if (icmp->icmp_ipv6_recvhopopts &&
- (ipp.ipp_fields & IPPF_HOPOPTS) &&
- ipp.ipp_hopoptslen > hopstrip) {
- udi_size += sizeof (struct T_opthdr) +
- ipp.ipp_hopoptslen - hopstrip;
- icmp_opt |= IPPF_HOPOPTS;
- }
- if ((icmp->icmp_ipv6_recvdstopts ||
- icmp->icmp_old_ipv6_recvdstopts) &&
- (ipp.ipp_fields & IPPF_DSTOPTS)) {
- udi_size += sizeof (struct T_opthdr) +
- ipp.ipp_dstoptslen;
- icmp_opt |= IPPF_DSTOPTS;
- }
- if (((icmp->icmp_ipv6_recvdstopts &&
- icmp->icmp_ipv6_recvrthdr &&
- (ipp.ipp_fields & IPPF_RTHDR)) ||
- icmp->icmp_ipv6_recvrtdstopts) &&
- (ipp.ipp_fields & IPPF_RTDSTOPTS)) {
- udi_size += sizeof (struct T_opthdr) +
- ipp.ipp_rtdstoptslen;
- icmp_opt |= IPPF_RTDSTOPTS;
- }
- if (icmp->icmp_ipv6_recvrthdr &&
- (ipp.ipp_fields & IPPF_RTHDR)) {
- udi_size += sizeof (struct T_opthdr) +
- ipp.ipp_rthdrlen;
- icmp_opt |= IPPF_RTHDR;
- }
- if (icmp->icmp_ip_recvpktinfo &&
- (ipp.ipp_fields & IPPF_IFINDEX)) {
- udi_size += sizeof (struct T_opthdr) +
- sizeof (struct in6_pktinfo);
- icmp_opt |= IPPF_IFINDEX;
+ if (connp->conn_ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
+ /* Checksum */
+ uint16_t *up;
+ uint32_t sum;
+ int remlen;
+
+ up = (uint16_t *)&ip6h->ip6_src;
+
+ remlen = msgdsize(mp) - ip_hdr_length;
+ sum = htons(connp->conn_proto + remlen)
+ + up[0] + up[1] + up[2] + up[3]
+ + up[4] + up[5] + up[6] + up[7]
+ + up[8] + up[9] + up[10] + up[11]
+ + up[12] + up[13] + up[14] + up[15];
+ sum = (sum & 0xffff) + (sum >> 16);
+ sum = IP_CSUM(mp, ip_hdr_length, sum);
+ if (sum != 0) {
+ /* IPv6 RAW checksum failed */
+ ip0dbg(("icmp_rput: RAW checksum failed %x\n", sum));
+ mutex_exit(&connp->conn_lock);
+ freemsg(mp);
+ BUMP_MIB(&is->is_rawip_mib, rawipInCksumErrs);
+ return;
}
}
- if (icmp->icmp_ipv6_recvhoplimit) {
- udi_size += sizeof (struct T_opthdr) + sizeof (int);
- icmp_ipv6_recvhoplimit = B_TRUE;
- }
+ mutex_exit(&connp->conn_lock);
- if (icmp->icmp_ipv6_recvtclass)
- udi_size += sizeof (struct T_opthdr) + sizeof (int);
+ udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t);
- /*
- * If SO_TIMESTAMP is set allocate the appropriate sized
- * buffer. Since gethrestime() expects a pointer aligned
- * argument, we allocate space necessary for extra
- * alignment (even though it might not be used).
- */
- if (icmp->icmp_timestamp) {
- udi_size += sizeof (struct T_opthdr) +
- sizeof (timestruc_t) + _POINTER_ALIGNMENT;
+ if (recv_ancillary.crb_all != 0) {
+ udi_size += conn_recvancillary_size(connp,
+ recv_ancillary, ira, mp, &ipps);
}
mp1 = allocb(udi_size, BPRI_MED);
@@ -3915,10 +2788,9 @@ icmp_input(void *arg1, mblk_t *mp, void *arg2)
return;
}
mp1->b_cont = mp;
- mp = mp1;
- mp->b_datap->db_type = M_PROTO;
- tudi = (struct T_unitdata_ind *)mp->b_rptr;
- mp->b_wptr = (uchar_t *)tudi + udi_size;
+ mp1->b_datap->db_type = M_PROTO;
+ tudi = (struct T_unitdata_ind *)mp1->b_rptr;
+ mp1->b_wptr = (uchar_t *)tudi + udi_size;
tudi->PRIM_type = T_UNITDATA_IND;
tudi->SRC_length = sizeof (sin6_t);
tudi->SRC_offset = sizeof (struct T_unitdata_ind);
@@ -3926,166 +2798,38 @@ icmp_input(void *arg1, mblk_t *mp, void *arg2)
udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin6_t));
tudi->OPT_length = udi_size;
sin6 = (sin6_t *)&tudi[1];
+ *sin6 = sin6_null;
sin6->sin6_port = 0;
sin6->sin6_family = AF_INET6;
sin6->sin6_addr = ip6h->ip6_src;
/* No sin6_flowinfo per API */
sin6->sin6_flowinfo = 0;
- /* For link-scope source pass up scope id */
- if ((ipp.ipp_fields & IPPF_IFINDEX) &&
- IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src))
- sin6->sin6_scope_id = ipp.ipp_ifindex;
+ /* For link-scope pass up scope id */
+ if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src))
+ sin6->sin6_scope_id = ira->ira_ruifindex;
else
sin6->sin6_scope_id = 0;
-
sin6->__sin6_src_id = ip_srcid_find_addr(&ip6h->ip6_dst,
- icmp->icmp_zoneid, is->is_netstack);
+ IPCL_ZONEID(connp), is->is_netstack);
if (udi_size != 0) {
- uchar_t *dstopt;
-
- dstopt = (uchar_t *)&sin6[1];
- if (icmp_opt & IPPF_IFINDEX) {
- struct T_opthdr *toh;
- struct in6_pktinfo *pkti;
-
- toh = (struct T_opthdr *)dstopt;
- toh->level = IPPROTO_IPV6;
- toh->name = IPV6_PKTINFO;
- toh->len = sizeof (struct T_opthdr) +
- sizeof (*pkti);
- toh->status = 0;
- dstopt += sizeof (struct T_opthdr);
- pkti = (struct in6_pktinfo *)dstopt;
- pkti->ipi6_addr = ip6h->ip6_dst;
- pkti->ipi6_ifindex = ipp.ipp_ifindex;
- dstopt += sizeof (*pkti);
- udi_size -= toh->len;
- }
- if (icmp_ipv6_recvhoplimit) {
- struct T_opthdr *toh;
-
- toh = (struct T_opthdr *)dstopt;
- toh->level = IPPROTO_IPV6;
- toh->name = IPV6_HOPLIMIT;
- toh->len = sizeof (struct T_opthdr) +
- sizeof (uint_t);
- toh->status = 0;
- dstopt += sizeof (struct T_opthdr);
- *(uint_t *)dstopt = ip6h->ip6_hops;
- dstopt += sizeof (uint_t);
- udi_size -= toh->len;
- }
- if (icmp->icmp_ipv6_recvtclass) {
- struct T_opthdr *toh;
-
- toh = (struct T_opthdr *)dstopt;
- toh->level = IPPROTO_IPV6;
- toh->name = IPV6_TCLASS;
- toh->len = sizeof (struct T_opthdr) +
- sizeof (uint_t);
- toh->status = 0;
- dstopt += sizeof (struct T_opthdr);
- *(uint_t *)dstopt = IPV6_FLOW_TCLASS(ip6h->ip6_flow);
- dstopt += sizeof (uint_t);
- udi_size -= toh->len;
- }
- if (icmp->icmp_timestamp) {
- struct T_opthdr *toh;
-
- toh = (struct T_opthdr *)dstopt;
- toh->level = SOL_SOCKET;
- toh->name = SCM_TIMESTAMP;
- toh->len = sizeof (struct T_opthdr) +
- sizeof (timestruc_t) + _POINTER_ALIGNMENT;
- toh->status = 0;
- dstopt += sizeof (struct T_opthdr);
- /* Align for gethrestime() */
- dstopt = (uchar_t *)P2ROUNDUP((intptr_t)dstopt,
- sizeof (intptr_t));
- gethrestime((timestruc_t *)dstopt);
- dstopt = (uchar_t *)toh + toh->len;
- udi_size -= toh->len;
- }
-
- if (icmp_opt & IPPF_HOPOPTS) {
- struct T_opthdr *toh;
-
- toh = (struct T_opthdr *)dstopt;
- toh->level = IPPROTO_IPV6;
- toh->name = IPV6_HOPOPTS;
- toh->len = sizeof (struct T_opthdr) +
- ipp.ipp_hopoptslen - hopstrip;
- toh->status = 0;
- dstopt += sizeof (struct T_opthdr);
- bcopy((char *)ipp.ipp_hopopts + hopstrip, dstopt,
- ipp.ipp_hopoptslen - hopstrip);
- if (hopstrip > 0) {
- /* copy next header value and fake length */
- dstopt[0] = ((uchar_t *)ipp.ipp_hopopts)[0];
- dstopt[1] = ((uchar_t *)ipp.ipp_hopopts)[1] -
- hopstrip / 8;
- }
- dstopt += ipp.ipp_hopoptslen - hopstrip;
- udi_size -= toh->len;
- }
- if (icmp_opt & IPPF_RTDSTOPTS) {
- struct T_opthdr *toh;
-
- toh = (struct T_opthdr *)dstopt;
- toh->level = IPPROTO_IPV6;
- toh->name = IPV6_DSTOPTS;
- toh->len = sizeof (struct T_opthdr) +
- ipp.ipp_rtdstoptslen;
- toh->status = 0;
- dstopt += sizeof (struct T_opthdr);
- bcopy(ipp.ipp_rtdstopts, dstopt,
- ipp.ipp_rtdstoptslen);
- dstopt += ipp.ipp_rtdstoptslen;
- udi_size -= toh->len;
- }
- if (icmp_opt & IPPF_RTHDR) {
- struct T_opthdr *toh;
-
- toh = (struct T_opthdr *)dstopt;
- toh->level = IPPROTO_IPV6;
- toh->name = IPV6_RTHDR;
- toh->len = sizeof (struct T_opthdr) +
- ipp.ipp_rthdrlen;
- toh->status = 0;
- dstopt += sizeof (struct T_opthdr);
- bcopy(ipp.ipp_rthdr, dstopt, ipp.ipp_rthdrlen);
- dstopt += ipp.ipp_rthdrlen;
- udi_size -= toh->len;
- }
- if (icmp_opt & IPPF_DSTOPTS) {
- struct T_opthdr *toh;
-
- toh = (struct T_opthdr *)dstopt;
- toh->level = IPPROTO_IPV6;
- toh->name = IPV6_DSTOPTS;
- toh->len = sizeof (struct T_opthdr) +
- ipp.ipp_dstoptslen;
- toh->status = 0;
- dstopt += sizeof (struct T_opthdr);
- bcopy(ipp.ipp_dstopts, dstopt,
- ipp.ipp_dstoptslen);
- dstopt += ipp.ipp_dstoptslen;
- udi_size -= toh->len;
- }
- /* Consumed all of allocated space */
- ASSERT(udi_size == 0);
+ conn_recvancillary_add(connp, recv_ancillary, ira,
+ &ipps, (uchar_t *)&sin6[1], udi_size);
}
- BUMP_MIB(&is->is_rawip_mib, rawipInDatagrams);
-deliver:
- icmp_ulp_recv(connp, mp);
+ /* Skip all the IPv6 headers per API */
+ mp->b_rptr += ip_hdr_length;
+ pkt_len -= ip_hdr_length;
+deliver:
+ BUMP_MIB(&is->is_rawip_mib, rawipInDatagrams);
+ icmp_ulp_recv(connp, mp1, pkt_len);
}
/*
- * return SNMP stuff in buffer in mpdata
+ * return SNMP stuff in buffer in mpdata. We don't hold any lock and report
+ * information that can be changing beneath us.
*/
mblk_t *
icmp_snmp_get(queue_t *q, mblk_t *mpctl)
@@ -4146,51 +2890,70 @@ icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name,
static void
icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err)
{
+ struct T_unitdata_req *tudr;
mblk_t *mp1;
- uchar_t *rptr = mp->b_rptr;
- struct T_unitdata_req *tudr = (struct T_unitdata_req *)rptr;
+ uchar_t *destaddr;
+ t_scalar_t destlen;
+ uchar_t *optaddr;
+ t_scalar_t optlen;
+
+ if ((mp->b_wptr < mp->b_rptr) ||
+ (MBLKL(mp)) < sizeof (struct T_unitdata_req)) {
+ goto done;
+ }
+ tudr = (struct T_unitdata_req *)mp->b_rptr;
+ destaddr = mp->b_rptr + tudr->DEST_offset;
+ if (destaddr < mp->b_rptr || destaddr >= mp->b_wptr ||
+ destaddr + tudr->DEST_length < mp->b_rptr ||
+ destaddr + tudr->DEST_length > mp->b_wptr) {
+ goto done;
+ }
+ optaddr = mp->b_rptr + tudr->OPT_offset;
+ if (optaddr < mp->b_rptr || optaddr >= mp->b_wptr ||
+ optaddr + tudr->OPT_length < mp->b_rptr ||
+ optaddr + tudr->OPT_length > mp->b_wptr) {
+ goto done;
+ }
+ destlen = tudr->DEST_length;
+ optlen = tudr->OPT_length;
- mp1 = mi_tpi_uderror_ind((char *)&rptr[tudr->DEST_offset],
- tudr->DEST_length, (char *)&rptr[tudr->OPT_offset],
- tudr->OPT_length, err);
- if (mp1)
+ mp1 = mi_tpi_uderror_ind((char *)destaddr, destlen,
+ (char *)optaddr, optlen, err);
+ if (mp1 != NULL)
qreply(q, mp1);
+
+done:
freemsg(mp);
}
-
static int
rawip_do_unbind(conn_t *connp)
{
- icmp_t *icmp = connp->conn_icmp;
+ icmp_t *icmp = connp->conn_icmp;
- rw_enter(&icmp->icmp_rwlock, RW_WRITER);
+ mutex_enter(&connp->conn_lock);
/* If a bind has not been done, we can't unbind. */
- if (icmp->icmp_state == TS_UNBND || icmp->icmp_pending_op != -1) {
- rw_exit(&icmp->icmp_rwlock);
+ if (icmp->icmp_state == TS_UNBND) {
+ mutex_exit(&connp->conn_lock);
return (-TOUTSTATE);
}
- icmp->icmp_pending_op = T_UNBIND_REQ;
- rw_exit(&icmp->icmp_rwlock);
+ connp->conn_saddr_v6 = ipv6_all_zeros;
+ connp->conn_bound_addr_v6 = ipv6_all_zeros;
+ connp->conn_laddr_v6 = ipv6_all_zeros;
+ connp->conn_mcbc_bind = B_FALSE;
+ connp->conn_lport = 0;
+ connp->conn_fport = 0;
+ /* In case we were also connected */
+ connp->conn_faddr_v6 = ipv6_all_zeros;
+ connp->conn_v6lastdst = ipv6_all_zeros;
- /*
- * Call ip to unbind
- */
+ icmp->icmp_state = TS_UNBND;
- ip_unbind(connp);
+ (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
+ &connp->conn_faddr_v6, connp->conn_flowinfo);
+ mutex_exit(&connp->conn_lock);
- /*
- * Once we're unbound from IP, the pending operation may be cleared
- * here.
- */
- rw_enter(&icmp->icmp_rwlock, RW_WRITER);
- V6_SET_ZERO(icmp->icmp_v6src);
- V6_SET_ZERO(icmp->icmp_bound_v6src);
- icmp->icmp_pending_op = -1;
- icmp->icmp_state = TS_UNBND;
- if (icmp->icmp_family == AF_INET6)
- (void) icmp_build_hdrs(icmp);
- rw_exit(&icmp->icmp_rwlock);
+ ip_unbind(connp);
return (0);
}
@@ -4230,42 +2993,86 @@ icmp_tpi_unbind(queue_t *q, mblk_t *mp)
qreply(q, mp);
}
-
/*
* Process IPv4 packets that already include an IP header.
* Used when IP_HDRINCL has been set (implicit for IPPROTO_RAW and
* IPPROTO_IGMP).
+ * In this case we ignore the address and any options in the T_UNITDATA_REQ.
+ *
+ * The packet is assumed to have a base (20 byte) IP header followed
+ * by the upper-layer protocol. We include any IP_OPTIONS including a
+ * CIPSO label but otherwise preserve the base IP header.
*/
static int
-icmp_wput_hdrincl(queue_t *q, conn_t *connp, mblk_t *mp, icmp_t *icmp,
- ip4_pkt_t *pktinfop)
+icmp_output_hdrincl(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid)
{
- icmp_stack_t *is = icmp->icmp_is;
- ipha_t *ipha;
- int ip_hdr_length;
- int tp_hdr_len;
- int error;
- uchar_t ip_snd_opt[IP_MAX_OPT_LENGTH];
- uint32_t ip_snd_opt_len = 0;
- mblk_t *mp1;
- uint_t pkt_len;
- ip_opt_info_t optinfo;
- pid_t cpid;
- cred_t *cr;
+ icmp_t *icmp = connp->conn_icmp;
+ icmp_stack_t *is = icmp->icmp_is;
+ ipha_t iphas;
+ ipha_t *ipha;
+ int ip_hdr_length;
+ int tp_hdr_len;
+ ip_xmit_attr_t *ixa;
+ ip_pkt_t *ipp;
+ in6_addr_t v6src;
+ in6_addr_t v6dst;
+ in6_addr_t v6nexthop;
+ int error;
+ boolean_t do_ipsec;
- rw_enter(&icmp->icmp_rwlock, RW_READER);
+ /*
+ * We need an exclusive copy of conn_ixa since the included IP
+ * header could have any destination.
+ * That copy has no pointers hence we
+ * need to set them up once we've parsed the ancillary data.
+ */
+ ixa = conn_get_ixa_exclusive(connp);
+ if (ixa == NULL) {
+ BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
+ freemsg(mp);
+ return (ENOMEM);
+ }
+ ASSERT(cr != NULL);
+ /*
+ * Caller has a reference on cr; from db_credp or because we
+ * are running in process context.
+ */
+ ixa->ixa_cred = cr;
+ ixa->ixa_cpid = pid;
+ if (is_system_labeled()) {
+ /* We need to restart with a label based on the cred */
+ ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
+ }
+
+ /* In case previous destination was multicast or multirt */
+ ip_attr_newdst(ixa);
- optinfo.ip_opt_flags = 0;
- optinfo.ip_opt_ill_index = 0;
+ /* Get a copy of conn_xmit_ipp since the TX label might change it */
+ ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP);
+ if (ipp == NULL) {
+ ixa_refrele(ixa);
+ BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
+ freemsg(mp);
+ return (ENOMEM);
+ }
+ mutex_enter(&connp->conn_lock);
+ error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP);
+ mutex_exit(&connp->conn_lock);
+ if (error != 0) {
+ BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
+ freemsg(mp);
+ goto done;
+ }
+
+ /* Sanity check length of packet */
ipha = (ipha_t *)mp->b_rptr;
- ip_hdr_length = IP_SIMPLE_HDR_LENGTH + icmp->icmp_ip_snd_options_len;
+
+ ip_hdr_length = IP_SIMPLE_HDR_LENGTH;
if ((mp->b_wptr - mp->b_rptr) < IP_SIMPLE_HDR_LENGTH) {
if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) {
- ASSERT(icmp != NULL);
BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
freemsg(mp);
- rw_exit(&icmp->icmp_rwlock);
- return (0);
+ goto done;
}
ipha = (ipha_t *)mp->b_rptr;
}
@@ -4273,1285 +3080,1541 @@ icmp_wput_hdrincl(queue_t *q, conn_t *connp, mblk_t *mp, icmp_t *icmp,
(IP_VERSION<<4) | (ip_hdr_length>>2);
/*
- * Check if our saved options are valid; update if not.
- * TSOL Note: Since we are not in WRITER mode, ICMP packets
- * to different destination may require different labels,
- * or worse, ICMP packets to same IP address may require
- * different labels due to use of shared all-zones address.
- * We use conn_lock to ensure that lastdst, ip_snd_options,
- * and ip_snd_options_len are consistent for the current
- * destination and are updated atomically.
- */
- mutex_enter(&connp->conn_lock);
- if (is_system_labeled()) {
- /*
- * Recompute the Trusted Extensions security label if
- * we're not going to the same destination as last
- * time or the cred attached to the received mblk
- * changed.
- */
- cr = msg_getcred(mp, &cpid);
- if (!IN6_IS_ADDR_V4MAPPED(&icmp->icmp_v6lastdst) ||
- V4_PART_OF_V6(icmp->icmp_v6lastdst) != ipha->ipha_dst ||
- cr != icmp->icmp_last_cred) {
- error = icmp_update_label(icmp, mp, ipha->ipha_dst);
- if (error != 0) {
- mutex_exit(&connp->conn_lock);
- rw_exit(&icmp->icmp_rwlock);
- return (error);
- }
- }
- /*
- * Apply credentials with modified security label if they
- * exist. icmp_update_label() may have generated these
- * credentials for packets to unlabeled remote nodes.
- */
- if (icmp->icmp_effective_cred != NULL)
- mblk_setcred(mp, icmp->icmp_effective_cred, cpid);
- }
-
- if (icmp->icmp_ip_snd_options_len > 0) {
- ip_snd_opt_len = icmp->icmp_ip_snd_options_len;
- bcopy(icmp->icmp_ip_snd_options, ip_snd_opt, ip_snd_opt_len);
- }
- mutex_exit(&connp->conn_lock);
-
- /*
- * For the socket of SOCK_RAW type, the checksum is provided in the
- * pre-built packet. We set the ipha_ident field to IP_HDR_INCLUDED to
- * tell IP that the application has sent a complete IP header and not
- * to compute the transport checksum nor change the DF flag.
+ * We set IXAF_DONTFRAG if the application set DF which makes
+ * IP not fragment.
*/
- ipha->ipha_ident = IP_HDR_INCLUDED;
- ipha->ipha_hdr_checksum = 0;
ipha->ipha_fragment_offset_and_flags &= htons(IPH_DF);
- /* Insert options if any */
- if (ip_hdr_length > IP_SIMPLE_HDR_LENGTH) {
- /*
- * Put the IP header plus any transport header that is
- * checksumed by ip_wput into the first mblk. (ip_wput assumes
- * that at least the checksum field is in the first mblk.)
- */
- switch (ipha->ipha_protocol) {
- case IPPROTO_UDP:
- tp_hdr_len = 8;
- break;
- case IPPROTO_TCP:
- tp_hdr_len = 20;
- break;
- default:
- tp_hdr_len = 0;
- break;
- }
- /*
- * The code below assumes that IP_SIMPLE_HDR_LENGTH plus
- * tp_hdr_len bytes will be in a single mblk.
- */
- if ((mp->b_wptr - mp->b_rptr) < (IP_SIMPLE_HDR_LENGTH +
- tp_hdr_len)) {
- if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH +
- tp_hdr_len)) {
- BUMP_MIB(&is->is_rawip_mib,
- rawipOutErrors);
- freemsg(mp);
- rw_exit(&icmp->icmp_rwlock);
- return (0);
- }
- ipha = (ipha_t *)mp->b_rptr;
- }
-
- /*
- * if the length is larger then the max allowed IP packet,
- * then send an error and abort the processing.
- */
- pkt_len = ntohs(ipha->ipha_length)
- + ip_snd_opt_len;
- if (pkt_len > IP_MAXPACKET) {
- rw_exit(&icmp->icmp_rwlock);
- return (EMSGSIZE);
- }
- if (!(mp1 = allocb(ip_hdr_length + is->is_wroff_extra +
- tp_hdr_len, BPRI_LO))) {
- rw_exit(&icmp->icmp_rwlock);
- return (ENOMEM);
- }
- mp1->b_rptr += is->is_wroff_extra;
- mp1->b_wptr = mp1->b_rptr + ip_hdr_length;
+ if (ipha->ipha_fragment_offset_and_flags & htons(IPH_DF))
+ ixa->ixa_flags |= (IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF);
+ else
+ ixa->ixa_flags &= ~(IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF);
- ipha->ipha_length = htons((uint16_t)pkt_len);
- bcopy(ipha, mp1->b_rptr, IP_SIMPLE_HDR_LENGTH);
+ /* Even for multicast and broadcast we honor the apps ttl */
+ ixa->ixa_flags |= IXAF_NO_TTL_CHANGE;
- /* Copy transport header if any */
- bcopy(&ipha[1], mp1->b_wptr, tp_hdr_len);
- mp1->b_wptr += tp_hdr_len;
+ if (ipha->ipha_dst == INADDR_ANY)
+ ipha->ipha_dst = htonl(INADDR_LOOPBACK);
- /* Add options */
- ipha = (ipha_t *)mp1->b_rptr;
- bcopy(ip_snd_opt, &ipha[1], ip_snd_opt_len);
+ IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src);
+ IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst);
- /* Drop IP header and transport header from original */
- (void) adjmsg(mp, IP_SIMPLE_HDR_LENGTH + tp_hdr_len);
+ /* Defer IPsec if it might need to look at ICMP type/code */
+ do_ipsec = ipha->ipha_protocol != IPPROTO_ICMP;
+ ixa->ixa_flags |= IXAF_IS_IPV4;
- mp1->b_cont = mp;
- mp = mp1;
+ ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop);
+ error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop,
+ connp->conn_fport, &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST |
+ (do_ipsec ? IPDF_IPSEC : 0));
+ switch (error) {
+ case 0:
+ break;
+ case EADDRNOTAVAIL:
/*
- * Massage source route putting first source
- * route in ipha_dst.
+ * IXAF_VERIFY_SOURCE tells us to pick a better source.
+ * Don't have the application see that errno
*/
- (void) ip_massage_options(ipha, is->is_netstack);
- }
-
- if (pktinfop != NULL) {
+ error = ENETUNREACH;
+ goto failed;
+ case ENETDOWN:
/*
- * Over write the source address provided in the header
+ * Have !ipif_addr_ready address; drop packet silently
+ * until we can get applications to not send until we
+ * are ready.
*/
- if (pktinfop->ip4_addr != INADDR_ANY) {
- ipha->ipha_src = pktinfop->ip4_addr;
- optinfo.ip_opt_flags = IP_VERIFY_SRC;
- }
-
- if (pktinfop->ip4_ill_index != 0) {
- optinfo.ip_opt_ill_index = pktinfop->ip4_ill_index;
+ error = 0;
+ goto failed;
+ case EHOSTUNREACH:
+ case ENETUNREACH:
+ if (ixa->ixa_ire != NULL) {
+ /*
+ * Let conn_ip_output/ire_send_noroute return
+ * the error and send any local ICMP error.
+ */
+ error = 0;
+ break;
}
+ /* FALLTHRU */
+ default:
+ failed:
+ freemsg(mp);
+ BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
+ goto done;
}
-
- rw_exit(&icmp->icmp_rwlock);
-
- ip_output_options(connp, mp, q, IP_WPUT, &optinfo);
- return (0);
-}
-
-static int
-icmp_update_label(icmp_t *icmp, mblk_t *mp, ipaddr_t dst)
-{
- int err;
- uchar_t opt_storage[IP_MAX_OPT_LENGTH];
- icmp_stack_t *is = icmp->icmp_is;
- conn_t *connp = icmp->icmp_connp;
- cred_t *cred;
- cred_t *msg_cred;
- cred_t *effective_cred;
+ if (ipha->ipha_src == INADDR_ANY)
+ IN6_V4MAPPED_TO_IPADDR(&v6src, ipha->ipha_src);
/*
- * All Solaris components should pass a db_credp
- * for this message, hence we ASSERT.
- * On production kernels we return an error to be robust against
- * random streams modules sitting on top of us.
+ * We might be going to a different destination than last time,
+ * thus check that TX allows the communication and compute any
+ * needed label.
+ *
+ * TSOL Note: We have an exclusive ipp and ixa for this thread so we
+ * don't have to worry about concurrent threads.
*/
- cred = msg_cred = msg_getcred(mp, NULL);
- ASSERT(cred != NULL);
- if (cred == NULL)
- return (EINVAL);
+ if (is_system_labeled()) {
+ /*
+ * Check whether Trusted Solaris policy allows communication
+ * with this host, and pretend that the destination is
+ * unreachable if not.
+ * Compute any needed label and place it in ipp_label_v4/v6.
+ *
+ * Later conn_build_hdr_template/conn_prepend_hdr takes
+ * ipp_label_v4/v6 to form the packet.
+ *
+ * Tsol note: We have ipp structure local to this thread so
+ * no locking is needed.
+ */
+ error = conn_update_label(connp, ixa, &v6dst, ipp);
+ if (error != 0) {
+ freemsg(mp);
+ BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
+ goto done;
+ }
+ }
/*
- * Verify the destination is allowed to receive packets at
- * the security label of the message data. check_dest()
- * may create a new effective cred for this message
- * with a modified label or label flags.
+ * Save away a copy of the IPv4 header the application passed down
+ * and then prepend an IPv4 header complete with any IP options
+ * including label.
+ * We need a struct copy since icmp_prepend_hdr will reuse the available
+ * space in the mblk.
*/
- if ((err = tsol_check_dest(cred, &dst, IPV4_VERSION,
- connp->conn_mac_mode, &effective_cred)) != 0)
- goto done;
- if (effective_cred != NULL)
- cred = effective_cred;
+ iphas = *ipha;
+ mp->b_rptr += IP_SIMPLE_HDR_LENGTH;
- /*
- * Calculate the security label to be placed in the text
- * of the message (if any).
- */
- if ((err = tsol_compute_label(cred, dst, opt_storage,
- is->is_netstack->netstack_ip)) != 0)
+ mp = icmp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, 0, mp, &error);
+ if (mp == NULL) {
+ BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
+ ASSERT(error != 0);
goto done;
-
- /*
- * Insert the security label in the cached ip options,
- * removing any old label that may exist.
- */
- if ((err = tsol_update_options(&icmp->icmp_ip_snd_options,
- &icmp->icmp_ip_snd_options_len, &icmp->icmp_label_len,
- opt_storage)) != 0)
+ }
+ if (ixa->ixa_pktlen > IP_MAXPACKET) {
+ error = EMSGSIZE;
+ BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
+ freemsg(mp);
goto done;
+ }
+ /* Restore key parts of the header that the application passed down */
+ ipha = (ipha_t *)mp->b_rptr;
+ ipha->ipha_type_of_service = iphas.ipha_type_of_service;
+ ipha->ipha_ident = iphas.ipha_ident;
+ ipha->ipha_fragment_offset_and_flags =
+ iphas.ipha_fragment_offset_and_flags;
+ ipha->ipha_ttl = iphas.ipha_ttl;
+ ipha->ipha_protocol = iphas.ipha_protocol;
+ ipha->ipha_src = iphas.ipha_src;
+ ipha->ipha_dst = iphas.ipha_dst;
+
+ ixa->ixa_protocol = ipha->ipha_protocol;
/*
- * Save the destination address and cred we used to generate
- * the security label text.
+ * Make sure that the IP header plus any transport header that is
+ * checksumed by ip_output is in the first mblk. (ip_output assumes
+ * that at least the checksum field is in the first mblk.)
*/
- IN6_IPADDR_TO_V4MAPPED(dst, &icmp->icmp_v6lastdst);
- if (cred != icmp->icmp_effective_cred) {
- if (icmp->icmp_effective_cred != NULL)
- crfree(icmp->icmp_effective_cred);
- crhold(cred);
- icmp->icmp_effective_cred = cred;
+ switch (ipha->ipha_protocol) {
+ case IPPROTO_UDP:
+ tp_hdr_len = 8;
+ break;
+ case IPPROTO_TCP:
+ tp_hdr_len = 20;
+ break;
+ default:
+ tp_hdr_len = 0;
+ break;
+ }
+ ip_hdr_length = IPH_HDR_LENGTH(ipha);
+ if (mp->b_wptr - mp->b_rptr < ip_hdr_length + tp_hdr_len) {
+ if (!pullupmsg(mp, ip_hdr_length + tp_hdr_len)) {
+ BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
+ if (mp->b_cont == NULL)
+ error = EINVAL;
+ else
+ error = ENOMEM;
+ freemsg(mp);
+ goto done;
+ }
}
- if (msg_cred != icmp->icmp_last_cred) {
- if (icmp->icmp_last_cred != NULL)
- crfree(icmp->icmp_last_cred);
- crhold(msg_cred);
- icmp->icmp_last_cred = msg_cred;
+ if (!do_ipsec) {
+ /* Policy might differ for different ICMP type/code */
+ if (ixa->ixa_ipsec_policy != NULL) {
+ IPPOL_REFRELE(ixa->ixa_ipsec_policy);
+ ixa->ixa_ipsec_policy = NULL;
+ ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
+ }
+ mp = ip_output_attach_policy(mp, ipha, NULL, connp, ixa);
+ if (mp == NULL) {
+ BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
+ error = EHOSTUNREACH; /* IPsec policy failure */
+ goto done;
+ }
}
+ /* We're done. Pass the packet to ip. */
+ BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
+
+ error = conn_ip_output(mp, ixa);
+ /* No rawipOutErrors if an error since IP increases its error counter */
+ switch (error) {
+ case 0:
+ break;
+ case EWOULDBLOCK:
+ (void) ixa_check_drain_insert(connp, ixa);
+ error = 0;
+ break;
+ case EADDRNOTAVAIL:
+ /*
+ * IXAF_VERIFY_SOURCE tells us to pick a better source.
+ * Don't have the application see that errno
+ */
+ error = ENETUNREACH;
+ break;
+ }
done:
- if (effective_cred != NULL)
- crfree(effective_cred);
+ ixa_refrele(ixa);
+ ip_pkt_free(ipp);
+ kmem_free(ipp, sizeof (*ipp));
+ return (error);
+}
- if (err != 0) {
- BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
- DTRACE_PROBE4(
- tx__ip__log__drop__updatelabel__icmp,
- char *, "icmp(1) failed to update options(2) on mp(3)",
- icmp_t *, icmp, char *, opt_storage, mblk_t *, mp);
- return (err);
+static mblk_t *
+icmp_output_attach_policy(mblk_t *mp, conn_t *connp, ip_xmit_attr_t *ixa)
+{
+ ipha_t *ipha = NULL;
+ ip6_t *ip6h = NULL;
+
+ if (ixa->ixa_flags & IXAF_IS_IPV4)
+ ipha = (ipha_t *)mp->b_rptr;
+ else
+ ip6h = (ip6_t *)mp->b_rptr;
+
+ if (ixa->ixa_ipsec_policy != NULL) {
+ IPPOL_REFRELE(ixa->ixa_ipsec_policy);
+ ixa->ixa_ipsec_policy = NULL;
+ ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
}
- return (0);
+ return (ip_output_attach_policy(mp, ipha, ip6h, connp, ixa));
}
/*
- * This routine handles all messages passed downstream. It either
- * consumes the message or passes it downstream; it never queues a
- * a message.
+ * Handle T_UNITDATA_REQ with options. Both IPv4 and IPv6
+ * Either tudr_mp or msg is set. If tudr_mp we take ancillary data from
+ * the TPI options, otherwise we take them from msg_control.
+ * If both sin and sin6 is set it is a connected socket and we use conn_faddr.
+ * Always consumes mp; never consumes tudr_mp.
*/
-static void
-icmp_wput(queue_t *q, mblk_t *mp)
+static int
+icmp_output_ancillary(conn_t *connp, sin_t *sin, sin6_t *sin6, mblk_t *mp,
+ mblk_t *tudr_mp, struct nmsghdr *msg, cred_t *cr, pid_t pid)
{
- uchar_t *rptr = mp->b_rptr;
- mblk_t *mp1;
-#define tudr ((struct T_unitdata_req *)rptr)
- size_t ip_len;
- conn_t *connp = Q_TO_CONN(q);
- icmp_t *icmp = connp->conn_icmp;
- icmp_stack_t *is = icmp->icmp_is;
- sin6_t *sin6;
- sin_t *sin;
- ipaddr_t v4dst;
- ip4_pkt_t pktinfo;
- ip4_pkt_t *pktinfop = &pktinfo;
- ip6_pkt_t ipp_s; /* For ancillary data options */
- ip6_pkt_t *ipp = &ipp_s;
- int error;
+ icmp_t *icmp = connp->conn_icmp;
+ icmp_stack_t *is = icmp->icmp_is;
+ int error;
+ ip_xmit_attr_t *ixa;
+ ip_pkt_t *ipp;
+ in6_addr_t v6src;
+ in6_addr_t v6dst;
+ in6_addr_t v6nexthop;
+ in_port_t dstport;
+ uint32_t flowinfo;
+ uint_t srcid;
+ int is_absreq_failure = 0;
+ conn_opt_arg_t coas, *coa;
- ipp->ipp_fields = 0;
- ipp->ipp_sticky_ignored = 0;
+ ASSERT(tudr_mp != NULL || msg != NULL);
- switch (mp->b_datap->db_type) {
- case M_DATA:
- if (icmp->icmp_hdrincl) {
- ASSERT(icmp->icmp_ipversion == IPV4_VERSION);
- error = icmp_wput_hdrincl(q, connp, mp, icmp, NULL);
- if (error != 0)
- icmp_ud_err(q, mp, error);
- return;
- }
+ /*
+ * Get ixa before checking state to handle a disconnect race.
+ *
+ * We need an exclusive copy of conn_ixa since the ancillary data
+ * options might modify it. That copy has no pointers hence we
+ * need to set them up once we've parsed the ancillary data.
+ */
+ ixa = conn_get_ixa_exclusive(connp);
+ if (ixa == NULL) {
+ BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
freemsg(mp);
- return;
- case M_PROTO:
- case M_PCPROTO:
- ip_len = mp->b_wptr - rptr;
- if (ip_len >= sizeof (struct T_unitdata_req)) {
- /* Expedite valid T_UNITDATA_REQ to below the switch */
- if (((union T_primitives *)rptr)->type
- == T_UNITDATA_REQ)
- break;
- }
- /* FALLTHRU */
- default:
- icmp_wput_other(q, mp);
- return;
+ return (ENOMEM);
+ }
+ ASSERT(cr != NULL);
+ ixa->ixa_cred = cr;
+ ixa->ixa_cpid = pid;
+ if (is_system_labeled()) {
+ /* We need to restart with a label based on the cred */
+ ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
}
- /* Handle T_UNITDATA_REQ messages here. */
+ /* In case previous destination was multicast or multirt */
+ ip_attr_newdst(ixa);
- mp1 = mp->b_cont;
- if (mp1 == NULL) {
+ /* Get a copy of conn_xmit_ipp since the options might change it */
+ ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP);
+ if (ipp == NULL) {
+ ixa_refrele(ixa);
BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
- icmp_ud_err(q, mp, EPROTO);
- return;
+ freemsg(mp);
+ return (ENOMEM);
}
-
- if ((rptr + tudr->DEST_offset + tudr->DEST_length) > mp->b_wptr) {
+ mutex_enter(&connp->conn_lock);
+ error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP);
+ mutex_exit(&connp->conn_lock);
+ if (error != 0) {
BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
- icmp_ud_err(q, mp, EADDRNOTAVAIL);
- return;
+ freemsg(mp);
+ goto done;
}
- switch (icmp->icmp_family) {
- case AF_INET6:
- sin6 = (sin6_t *)&rptr[tudr->DEST_offset];
- if (!OK_32PTR((char *)sin6) ||
- tudr->DEST_length != sizeof (sin6_t) ||
- sin6->sin6_family != AF_INET6) {
- BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
- icmp_ud_err(q, mp, EADDRNOTAVAIL);
- return;
- }
+ /*
+ * Parse the options and update ixa and ipp as a result.
+ */
- /* No support for mapped addresses on raw sockets */
- if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
- BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
- icmp_ud_err(q, mp, EADDRNOTAVAIL);
- return;
- }
+ coa = &coas;
+ coa->coa_connp = connp;
+ coa->coa_ixa = ixa;
+ coa->coa_ipp = ipp;
+ coa->coa_ancillary = B_TRUE;
+ coa->coa_changed = 0;
+ if (msg != NULL) {
+ error = process_auxiliary_options(connp, msg->msg_control,
+ msg->msg_controllen, coa, &icmp_opt_obj, icmp_opt_set, cr);
+ } else {
+ struct T_unitdata_req *tudr;
+
+ tudr = (struct T_unitdata_req *)tudr_mp->b_rptr;
+ ASSERT(tudr->PRIM_type == T_UNITDATA_REQ);
+ error = tpi_optcom_buf(connp->conn_wq, tudr_mp,
+ &tudr->OPT_length, tudr->OPT_offset, cr, &icmp_opt_obj,
+ coa, &is_absreq_failure);
+ }
+ if (error != 0) {
/*
- * Destination is a native IPv6 address.
- * Send out an IPv6 format packet.
+ * Note: No special action needed in this
+ * module for "is_absreq_failure"
*/
- if (tudr->OPT_length != 0) {
- int error;
-
- error = 0;
- if (icmp_unitdata_opt_process(q, mp, &error,
- (void *)ipp) < 0) {
- /* failure */
- BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
- icmp_ud_err(q, mp, error);
- return;
- }
- ASSERT(error == 0);
- }
-
- error = raw_ip_send_data_v6(q, connp, mp1, sin6, ipp);
+ freemsg(mp);
+ BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
goto done;
-
- case AF_INET:
- sin = (sin_t *)&rptr[tudr->DEST_offset];
- if (!OK_32PTR((char *)sin) ||
- tudr->DEST_length != sizeof (sin_t) ||
- sin->sin_family != AF_INET) {
- BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
- icmp_ud_err(q, mp, EADDRNOTAVAIL);
- return;
- }
- /* Extract and ipaddr */
- v4dst = sin->sin_addr.s_addr;
- break;
-
- default:
- ASSERT(0);
}
+ ASSERT(is_absreq_failure == 0);
- pktinfop->ip4_ill_index = 0;
- pktinfop->ip4_addr = INADDR_ANY;
-
+ mutex_enter(&connp->conn_lock);
/*
- * If options passed in, feed it for verification and handling
+ * If laddr is unspecified then we look at sin6_src_id.
+ * We will give precedence to a source address set with IPV6_PKTINFO
+ * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't
+ * want ip_attr_connect to select a source (since it can fail) when
+ * IPV6_PKTINFO is specified.
+ * If this doesn't result in a source address then we get a source
+ * from ip_attr_connect() below.
*/
- if (tudr->OPT_length != 0) {
- int error;
-
- error = 0;
- if (icmp_unitdata_opt_process(q, mp, &error,
- (void *)pktinfop) < 0) {
- /* failure */
- BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
- icmp_ud_err(q, mp, error);
- return;
+ v6src = connp->conn_saddr_v6;
+ if (sin != NULL) {
+ IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst);
+ dstport = sin->sin_port;
+ flowinfo = 0;
+ ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
+ ixa->ixa_flags |= IXAF_IS_IPV4;
+ } else if (sin6 != NULL) {
+ v6dst = sin6->sin6_addr;
+ dstport = sin6->sin6_port;
+ flowinfo = sin6->sin6_flowinfo;
+ srcid = sin6->__sin6_src_id;
+ if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) {
+ ixa->ixa_scopeid = sin6->sin6_scope_id;
+ ixa->ixa_flags |= IXAF_SCOPEID_SET;
+ } else {
+ ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
}
- ASSERT(error == 0);
- /*
- * Note: Success in processing options.
- * mp option buffer represented by
- * OPT_length/offset now potentially modified
- * and contain option setting results
- */
- }
-
- error = raw_ip_send_data_v4(q, connp, mp1, v4dst, pktinfop);
-done:
- if (error != 0) {
- icmp_ud_err(q, mp, error);
- return;
+ if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
+ ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
+ connp->conn_netstack);
+ }
+ if (IN6_IS_ADDR_V4MAPPED(&v6dst))
+ ixa->ixa_flags |= IXAF_IS_IPV4;
+ else
+ ixa->ixa_flags &= ~IXAF_IS_IPV4;
} else {
- mp->b_cont = NULL;
- freeb(mp);
+ /* Connected case */
+ v6dst = connp->conn_faddr_v6;
+ flowinfo = connp->conn_flowinfo;
+ }
+ mutex_exit(&connp->conn_lock);
+ /* Handle IPV6_PKTINFO setting source address. */
+ if (IN6_IS_ADDR_UNSPECIFIED(&v6src) &&
+ (ipp->ipp_fields & IPPF_ADDR)) {
+ if (ixa->ixa_flags & IXAF_IS_IPV4) {
+ if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
+ v6src = ipp->ipp_addr;
+ } else {
+ if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
+ v6src = ipp->ipp_addr;
+ }
}
-}
-
-
-/* ARGSUSED */
-static void
-icmp_wput_fallback(queue_t *q, mblk_t *mp)
-{
-#ifdef DEBUG
- cmn_err(CE_CONT, "icmp_wput_fallback: Message during fallback \n");
-#endif
- freemsg(mp);
-}
-
-static int
-raw_ip_send_data_v4(queue_t *q, conn_t *connp, mblk_t *mp, ipaddr_t v4dst,
- ip4_pkt_t *pktinfop)
-{
- ipha_t *ipha;
- size_t ip_len;
- icmp_t *icmp = connp->conn_icmp;
- icmp_stack_t *is = icmp->icmp_is;
- int ip_hdr_length;
- ip_opt_info_t optinfo;
- uchar_t ip_snd_opt[IP_MAX_OPT_LENGTH];
- uint32_t ip_snd_opt_len = 0;
- pid_t cpid;
- cred_t *cr;
- optinfo.ip_opt_flags = 0;
- optinfo.ip_opt_ill_index = 0;
+ ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop);
+ error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport,
+ &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST);
- if (icmp->icmp_state == TS_UNBND) {
- /* If a port has not been bound to the stream, fail. */
+ switch (error) {
+ case 0:
+ break;
+ case EADDRNOTAVAIL:
+ /*
+ * IXAF_VERIFY_SOURCE tells us to pick a better source.
+ * Don't have the application see that errno
+ */
+ error = ENETUNREACH;
+ goto failed;
+ case ENETDOWN:
+ /*
+ * Have !ipif_addr_ready address; drop packet silently
+ * until we can get applications to not send until we
+ * are ready.
+ */
+ error = 0;
+ goto failed;
+ case EHOSTUNREACH:
+ case ENETUNREACH:
+ if (ixa->ixa_ire != NULL) {
+ /*
+ * Let conn_ip_output/ire_send_noroute return
+ * the error and send any local ICMP error.
+ */
+ error = 0;
+ break;
+ }
+ /* FALLTHRU */
+ default:
+ failed:
+ freemsg(mp);
BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
- return (EPROTO);
+ goto done;
}
- if (v4dst == INADDR_ANY)
- v4dst = htonl(INADDR_LOOPBACK);
-
- /* Protocol 255 contains full IP headers */
- if (icmp->icmp_hdrincl)
- return (icmp_wput_hdrincl(q, connp, mp, icmp, pktinfop));
-
- rw_enter(&icmp->icmp_rwlock, RW_READER);
-
/*
- * Check if our saved options are valid; update if not.
- * TSOL Note: Since we are not in WRITER mode, ICMP packets
- * to different destination may require different labels,
- * or worse, ICMP packets to same IP address may require
- * different labels due to use of shared all-zones address.
- * We use conn_lock to ensure that lastdst, ip_snd_options,
- * and ip_snd_options_len are consistent for the current
- * destination and are updated atomically.
+ * We might be going to a different destination than last time,
+ * thus check that TX allows the communication and compute any
+ * needed label.
+ *
+ * TSOL Note: We have an exclusive ipp and ixa for this thread so we
+ * don't have to worry about concurrent threads.
*/
- mutex_enter(&connp->conn_lock);
if (is_system_labeled()) {
-
- /*
- * Recompute the Trusted Extensions security label if we're not
- * going to the same destination as last time or the cred
- * attached to the received mblk changed.
- */
- cr = msg_getcred(mp, &cpid);
- if (!IN6_IS_ADDR_V4MAPPED(&icmp->icmp_v6lastdst) ||
- V4_PART_OF_V6(icmp->icmp_v6lastdst) != v4dst ||
- cr != icmp->icmp_last_cred) {
- int error = icmp_update_label(icmp, mp, v4dst);
- if (error != 0) {
- mutex_exit(&connp->conn_lock);
- rw_exit(&icmp->icmp_rwlock);
- return (error);
- }
- }
/*
- * Apply credentials with modified security label if they
- * exist. icmp_update_label() may have generated these
- * credentials for packets to unlabeled remote nodes.
+ * Check whether Trusted Solaris policy allows communication
+ * with this host, and pretend that the destination is
+ * unreachable if not.
+ * Compute any needed label and place it in ipp_label_v4/v6.
+ *
+ * Later conn_build_hdr_template/conn_prepend_hdr takes
+ * ipp_label_v4/v6 to form the packet.
+ *
+ * Tsol note: We have ipp structure local to this thread so
+ * no locking is needed.
*/
- if (icmp->icmp_effective_cred != NULL)
- mblk_setcred(mp, icmp->icmp_effective_cred, cpid);
- }
-
- if (icmp->icmp_ip_snd_options_len > 0) {
- ip_snd_opt_len = icmp->icmp_ip_snd_options_len;
- bcopy(icmp->icmp_ip_snd_options, ip_snd_opt, ip_snd_opt_len);
- }
- mutex_exit(&connp->conn_lock);
-
- /* Add an IP header */
- ip_hdr_length = IP_SIMPLE_HDR_LENGTH + ip_snd_opt_len;
- ipha = (ipha_t *)&mp->b_rptr[-ip_hdr_length];
- if ((uchar_t *)ipha < mp->b_datap->db_base ||
- mp->b_datap->db_ref != 1 ||
- !OK_32PTR(ipha)) {
- mblk_t *mp1;
- if (!(mp1 = allocb(ip_hdr_length + is->is_wroff_extra,
- BPRI_LO))) {
+ error = conn_update_label(connp, ixa, &v6dst, ipp);
+ if (error != 0) {
+ freemsg(mp);
BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
- rw_exit(&icmp->icmp_rwlock);
- return (ENOMEM);
+ goto done;
}
- mp1->b_cont = mp;
- ipha = (ipha_t *)mp1->b_datap->db_lim;
- mp1->b_wptr = (uchar_t *)ipha;
- ipha = (ipha_t *)((uchar_t *)ipha - ip_hdr_length);
- mp = mp1;
}
-#ifdef _BIG_ENDIAN
- /* Set version, header length, and tos */
- *(uint16_t *)&ipha->ipha_version_and_hdr_length =
- ((((IP_VERSION << 4) | (ip_hdr_length>>2)) << 8) |
- icmp->icmp_type_of_service);
- /* Set ttl and protocol */
- *(uint16_t *)&ipha->ipha_ttl = (icmp->icmp_ttl << 8) | icmp->icmp_proto;
-#else
- /* Set version, header length, and tos */
- *(uint16_t *)&ipha->ipha_version_and_hdr_length =
- ((icmp->icmp_type_of_service << 8) |
- ((IP_VERSION << 4) | (ip_hdr_length>>2)));
- /* Set ttl and protocol */
- *(uint16_t *)&ipha->ipha_ttl = (icmp->icmp_proto << 8) | icmp->icmp_ttl;
-#endif
- if (pktinfop->ip4_addr != INADDR_ANY) {
- ipha->ipha_src = pktinfop->ip4_addr;
- optinfo.ip_opt_flags = IP_VERIFY_SRC;
- } else {
-
- /*
- * Copy our address into the packet. If this is zero,
- * ip will fill in the real source address.
- */
- IN6_V4MAPPED_TO_IPADDR(&icmp->icmp_v6src, ipha->ipha_src);
+ mp = icmp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, flowinfo, mp,
+ &error);
+ if (mp == NULL) {
+ BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
+ ASSERT(error != 0);
+ goto done;
}
-
- ipha->ipha_fragment_offset_and_flags = 0;
-
- if (pktinfop->ip4_ill_index != 0) {
- optinfo.ip_opt_ill_index = pktinfop->ip4_ill_index;
+ if (ixa->ixa_pktlen > IP_MAXPACKET) {
+ error = EMSGSIZE;
+ BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
+ freemsg(mp);
+ goto done;
}
-
- /*
- * For the socket of SOCK_RAW type, the checksum is provided in the
- * pre-built packet. We set the ipha_ident field to IP_HDR_INCLUDED to
- * tell IP that the application has sent a complete IP header and not
- * to compute the transport checksum nor change the DF flag.
- */
- ipha->ipha_ident = IP_HDR_INCLUDED;
-
- /* Finish common formatting of the packet. */
- mp->b_rptr = (uchar_t *)ipha;
-
- ip_len = mp->b_wptr - (uchar_t *)ipha;
- if (mp->b_cont != NULL)
- ip_len += msgdsize(mp->b_cont);
-
- /*
- * Set the length into the IP header.
- * If the length is greater than the maximum allowed by IP,
- * then free the message and return. Do not try and send it
- * as this can cause problems in layers below.
- */
- if (ip_len > IP_MAXPACKET) {
+ /* Policy might differ for different ICMP type/code */
+ mp = icmp_output_attach_policy(mp, connp, ixa);
+ if (mp == NULL) {
BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
- rw_exit(&icmp->icmp_rwlock);
- return (EMSGSIZE);
+ error = EHOSTUNREACH; /* IPsec policy failure */
+ goto done;
}
- ipha->ipha_length = htons((uint16_t)ip_len);
- /*
- * Copy in the destination address request
- */
- ipha->ipha_dst = v4dst;
- /*
- * Set ttl based on IP_MULTICAST_TTL to match IPv6 logic.
- */
- if (CLASSD(v4dst))
- ipha->ipha_ttl = icmp->icmp_multicast_ttl;
+ /* We're done. Pass the packet to ip. */
+ BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
- /* Copy in options if any */
- if (ip_hdr_length > IP_SIMPLE_HDR_LENGTH) {
- bcopy(ip_snd_opt,
- &ipha[1], ip_snd_opt_len);
+ /* Allow source not assigned to the system? */
+ ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE;
+ error = conn_ip_output(mp, ixa);
+ if (!connp->conn_unspec_src)
+ ixa->ixa_flags |= IXAF_VERIFY_SOURCE;
+ /* No rawipOutErrors if an error since IP increases its error counter */
+ switch (error) {
+ case 0:
+ break;
+ case EWOULDBLOCK:
+ (void) ixa_check_drain_insert(connp, ixa);
+ error = 0;
+ break;
+ case EADDRNOTAVAIL:
/*
- * Massage source route putting first source route in ipha_dst.
- * Ignore the destination in the T_unitdata_req.
+ * IXAF_VERIFY_SOURCE tells us to pick a better source.
+ * Don't have the application see that errno
*/
- (void) ip_massage_options(ipha, is->is_netstack);
+ error = ENETUNREACH;
+ /* FALLTHRU */
+ default:
+ mutex_enter(&connp->conn_lock);
+ /*
+ * Clear the source and v6lastdst so we call ip_attr_connect
+ * for the next packet and try to pick a better source.
+ */
+ if (connp->conn_mcbc_bind)
+ connp->conn_saddr_v6 = ipv6_all_zeros;
+ else
+ connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
+ connp->conn_v6lastdst = ipv6_all_zeros;
+ mutex_exit(&connp->conn_lock);
+ break;
}
-
- rw_exit(&icmp->icmp_rwlock);
- BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
-
- ip_output_options(connp, mp, q, IP_WPUT, &optinfo);
- return (0);
+done:
+ ixa_refrele(ixa);
+ ip_pkt_free(ipp);
+ kmem_free(ipp, sizeof (*ipp));
+ return (error);
}
-static int
-icmp_update_label_v6(icmp_t *icmp, mblk_t *mp, in6_addr_t *dst)
+/*
+ * Handle sending an M_DATA for a connected socket.
+ * Handles both IPv4 and IPv6.
+ */
+int
+icmp_output_connected(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid)
{
- int err;
- uchar_t opt_storage[TSOL_MAX_IPV6_OPTION];
- icmp_stack_t *is = icmp->icmp_is;
- conn_t *connp = icmp->icmp_connp;
- cred_t *cred;
- cred_t *msg_cred;
- cred_t *effective_cred;
+ icmp_t *icmp = connp->conn_icmp;
+ icmp_stack_t *is = icmp->icmp_is;
+ int error;
+ ip_xmit_attr_t *ixa;
+ boolean_t do_ipsec;
/*
- * All Solaris components should pass a db_credp
- * for this message, hence we ASSERT.
- * On production kernels we return an error to be robust against
- * random streams modules sitting on top of us.
+ * If no other thread is using conn_ixa this just gets a reference to
+ * conn_ixa. Otherwise we get a safe copy of conn_ixa.
*/
- cred = msg_cred = msg_getcred(mp, NULL);
- ASSERT(cred != NULL);
- if (cred == NULL)
- return (EINVAL);
+ ixa = conn_get_ixa(connp, B_FALSE);
+ if (ixa == NULL) {
+ BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
+ freemsg(mp);
+ return (ENOMEM);
+ }
- /*
- * Verify the destination is allowed to receive packets at
- * the security label of the message data. check_dest()
- * may create a new effective cred for this message
- * with a modified label or label flags.
- */
- if ((err = tsol_check_dest(cred, dst, IPV6_VERSION,
- connp->conn_mac_mode, &effective_cred)) != 0)
- goto done;
- if (effective_cred != NULL)
- cred = effective_cred;
+ ASSERT(cr != NULL);
+ ixa->ixa_cred = cr;
+ ixa->ixa_cpid = pid;
- /*
- * Calculate the security label to be placed in the text
- * of the message (if any).
- */
- if ((err = tsol_compute_label_v6(cred, dst, opt_storage,
- is->is_netstack->netstack_ip)) != 0)
- goto done;
+ /* Defer IPsec if it might need to look at ICMP type/code */
+ switch (ixa->ixa_protocol) {
+ case IPPROTO_ICMP:
+ case IPPROTO_ICMPV6:
+ do_ipsec = B_FALSE;
+ break;
+ default:
+ do_ipsec = B_TRUE;
+ }
- /*
- * Insert the security label in the cached ip options,
- * removing any old label that may exist.
- */
- if ((err = tsol_update_sticky(&icmp->icmp_sticky_ipp,
- &icmp->icmp_label_len_v6, opt_storage)) != 0)
- goto done;
+ mutex_enter(&connp->conn_lock);
+ mp = icmp_prepend_header_template(connp, ixa, mp,
+ &connp->conn_saddr_v6, connp->conn_flowinfo, &error);
+
+ if (mp == NULL) {
+ ASSERT(error != 0);
+ mutex_exit(&connp->conn_lock);
+ ixa_refrele(ixa);
+ BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
+ freemsg(mp);
+ return (error);
+ }
+
+ if (!do_ipsec) {
+ /* Policy might differ for different ICMP type/code */
+ mp = icmp_output_attach_policy(mp, connp, ixa);
+ if (mp == NULL) {
+ mutex_exit(&connp->conn_lock);
+ BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
+ ixa_refrele(ixa);
+ return (EHOSTUNREACH); /* IPsec policy failure */
+ }
+ }
/*
- * Save the destination address and cred we used to generate
- * the security label text.
+ * In case we got a safe copy of conn_ixa, or if opt_set made us a new
+ * safe copy, then we need to fill in any pointers in it.
*/
- icmp->icmp_v6lastdst = *dst;
- if (cred != icmp->icmp_effective_cred) {
- if (icmp->icmp_effective_cred != NULL)
- crfree(icmp->icmp_effective_cred);
- crhold(cred);
- icmp->icmp_effective_cred = cred;
- }
+ if (ixa->ixa_ire == NULL) {
+ in6_addr_t faddr, saddr;
+ in6_addr_t nexthop;
+ in_port_t fport;
+
+ saddr = connp->conn_saddr_v6;
+ faddr = connp->conn_faddr_v6;
+ fport = connp->conn_fport;
+ ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &faddr, &nexthop);
+ mutex_exit(&connp->conn_lock);
- if (msg_cred != icmp->icmp_last_cred) {
- if (icmp->icmp_last_cred != NULL)
- crfree(icmp->icmp_last_cred);
- crhold(msg_cred);
- icmp->icmp_last_cred = msg_cred;
+ error = ip_attr_connect(connp, ixa, &saddr, &faddr, &nexthop,
+ fport, NULL, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST |
+ (do_ipsec ? IPDF_IPSEC : 0));
+ switch (error) {
+ case 0:
+ break;
+ case EADDRNOTAVAIL:
+ /*
+ * IXAF_VERIFY_SOURCE tells us to pick a better source.
+ * Don't have the application see that errno
+ */
+ error = ENETUNREACH;
+ goto failed;
+ case ENETDOWN:
+ /*
+ * Have !ipif_addr_ready address; drop packet silently
+ * until we can get applications to not send until we
+ * are ready.
+ */
+ error = 0;
+ goto failed;
+ case EHOSTUNREACH:
+ case ENETUNREACH:
+ if (ixa->ixa_ire != NULL) {
+ /*
+ * Let conn_ip_output/ire_send_noroute return
+ * the error and send any local ICMP error.
+ */
+ error = 0;
+ break;
+ }
+ /* FALLTHRU */
+ default:
+ failed:
+ ixa_refrele(ixa);
+ BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
+ freemsg(mp);
+ return (error);
+ }
+ } else {
+ /* Done with conn_t */
+ mutex_exit(&connp->conn_lock);
}
-done:
- if (effective_cred != NULL)
- crfree(effective_cred);
+ /* We're done. Pass the packet to ip. */
+ BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
- if (err != 0) {
- BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
- DTRACE_PROBE4(
- tx__ip__log__drop__updatelabel__icmp6,
- char *, "icmp(1) failed to update options(2) on mp(3)",
- icmp_t *, icmp, char *, opt_storage, mblk_t *, mp);
- return (err);
+ error = conn_ip_output(mp, ixa);
+ /* No rawipOutErrors if an error since IP increases its error counter */
+ switch (error) {
+ case 0:
+ break;
+ case EWOULDBLOCK:
+ (void) ixa_check_drain_insert(connp, ixa);
+ error = 0;
+ break;
+ case EADDRNOTAVAIL:
+ /*
+ * IXAF_VERIFY_SOURCE tells us to pick a better source.
+ * Don't have the application see that errno
+ */
+ error = ENETUNREACH;
+ break;
}
- return (0);
+ ixa_refrele(ixa);
+ return (error);
}
/*
- * raw_ip_send_data_v6():
- * Assumes that icmp_wput did some sanity checking on the destination
- * address, but that the label may not yet be correct.
+ * Handle sending an M_DATA to the last destination.
+ * Handles both IPv4 and IPv6.
+ *
+ * NOTE: The caller must hold conn_lock and we drop it here.
*/
-static int
-raw_ip_send_data_v6(queue_t *q, conn_t *connp, mblk_t *mp, sin6_t *sin6,
- ip6_pkt_t *ipp)
+int
+icmp_output_lastdst(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid,
+ ip_xmit_attr_t *ixa)
{
- ip6_t *ip6h;
- ip6i_t *ip6i; /* mp->b_rptr even if no ip6i_t */
- int ip_hdr_len = IPV6_HDR_LEN;
- size_t ip_len;
- icmp_t *icmp = connp->conn_icmp;
- icmp_stack_t *is = icmp->icmp_is;
- ip6_pkt_t *tipp;
- ip6_hbh_t *hopoptsptr = NULL;
- uint_t hopoptslen = 0;
- uint32_t csum = 0;
- uint_t ignore = 0;
- uint_t option_exists = 0, is_sticky = 0;
- uint8_t *cp;
- uint8_t *nxthdr_ptr;
- in6_addr_t ip6_dst;
- pid_t cpid;
- cred_t *cr;
-
- rw_enter(&icmp->icmp_rwlock, RW_READER);
+ icmp_t *icmp = connp->conn_icmp;
+ icmp_stack_t *is = icmp->icmp_is;
+ int error;
+ boolean_t do_ipsec;
- /*
- * If the local address is a mapped address return
- * an error.
- * It would be possible to send an IPv6 packet but the
- * response would never make it back to the application
- * since it is bound to a mapped address.
- */
- if (IN6_IS_ADDR_V4MAPPED(&icmp->icmp_v6src)) {
+ ASSERT(MUTEX_HELD(&connp->conn_lock));
+ ASSERT(ixa != NULL);
+
+ ASSERT(cr != NULL);
+ ixa->ixa_cred = cr;
+ ixa->ixa_cpid = pid;
+
+ /* Defer IPsec if it might need to look at ICMP type/code */
+ switch (ixa->ixa_protocol) {
+ case IPPROTO_ICMP:
+ case IPPROTO_ICMPV6:
+ do_ipsec = B_FALSE;
+ break;
+ default:
+ do_ipsec = B_TRUE;
+ }
+
+
+ mp = icmp_prepend_header_template(connp, ixa, mp,
+ &connp->conn_v6lastsrc, connp->conn_lastflowinfo, &error);
+
+ if (mp == NULL) {
+ ASSERT(error != 0);
+ mutex_exit(&connp->conn_lock);
+ ixa_refrele(ixa);
BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
- rw_exit(&icmp->icmp_rwlock);
- return (EADDRNOTAVAIL);
+ freemsg(mp);
+ return (error);
}
- ignore = ipp->ipp_sticky_ignored;
- if (sin6->sin6_scope_id != 0 &&
- IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
- /*
- * IPPF_SCOPE_ID is special. It's neither a sticky
- * option nor ancillary data. It needs to be
- * explicitly set in options_exists.
- */
- option_exists |= IPPF_SCOPE_ID;
+ if (!do_ipsec) {
+ /* Policy might differ for different ICMP type/code */
+ mp = icmp_output_attach_policy(mp, connp, ixa);
+ if (mp == NULL) {
+ mutex_exit(&connp->conn_lock);
+ BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
+ ixa_refrele(ixa);
+ return (EHOSTUNREACH); /* IPsec policy failure */
+ }
}
/*
- * Compute the destination address
+ * In case we got a safe copy of conn_ixa, or if opt_set made us a new
+ * safe copy, then we need to fill in any pointers in it.
*/
- ip6_dst = sin6->sin6_addr;
- if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
- ip6_dst = ipv6_loopback;
+ if (ixa->ixa_ire == NULL) {
+ in6_addr_t lastdst, lastsrc;
+ in6_addr_t nexthop;
+ in_port_t lastport;
+
+ lastsrc = connp->conn_v6lastsrc;
+ lastdst = connp->conn_v6lastdst;
+ lastport = connp->conn_lastdstport;
+ ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &lastdst, &nexthop);
+ mutex_exit(&connp->conn_lock);
- /*
- * Check if our saved options are valid; update if not.
- * TSOL Note: Since we are not in WRITER mode, ICMP packets
- * to different destination may require different labels,
- * or worse, ICMP packets to same IP address may require
- * different labels due to use of shared all-zones address.
- * We use conn_lock to ensure that lastdst, sticky ipp_hopopts,
- * and sticky ipp_hopoptslen are consistent for the current
- * destination and are updated atomically.
- */
- mutex_enter(&connp->conn_lock);
- if (is_system_labeled()) {
- /*
- * Recompute the Trusted Extensions security label if we're
- * not going to the same destination as last time or the cred
- * attached to the received mblk changed. This is done in a
- * separate routine to avoid blowing up our stack here.
- */
- cr = msg_getcred(mp, &cpid);
- if (!IN6_ARE_ADDR_EQUAL(&icmp->icmp_v6lastdst, &ip6_dst) ||
- cr != icmp->icmp_last_cred) {
- int error = 0;
- error = icmp_update_label_v6(icmp, mp, &ip6_dst);
- if (error != 0) {
- mutex_exit(&connp->conn_lock);
- rw_exit(&icmp->icmp_rwlock);
- return (error);
+ error = ip_attr_connect(connp, ixa, &lastsrc, &lastdst,
+ &nexthop, lastport, NULL, NULL, IPDF_ALLOW_MCBC |
+ IPDF_VERIFY_DST | (do_ipsec ? IPDF_IPSEC : 0));
+ switch (error) {
+ case 0:
+ break;
+ case EADDRNOTAVAIL:
+ /*
+ * IXAF_VERIFY_SOURCE tells us to pick a better source.
+ * Don't have the application see that errno
+ */
+ error = ENETUNREACH;
+ goto failed;
+ case ENETDOWN:
+ /*
+ * Have !ipif_addr_ready address; drop packet silently
+ * until we can get applications to not send until we
+ * are ready.
+ */
+ error = 0;
+ goto failed;
+ case EHOSTUNREACH:
+ case ENETUNREACH:
+ if (ixa->ixa_ire != NULL) {
+ /*
+ * Let conn_ip_output/ire_send_noroute return
+ * the error and send any local ICMP error.
+ */
+ error = 0;
+ break;
}
+ /* FALLTHRU */
+ default:
+ failed:
+ ixa_refrele(ixa);
+ BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
+ freemsg(mp);
+ return (error);
}
+ } else {
+ /* Done with conn_t */
+ mutex_exit(&connp->conn_lock);
+ }
+ /* We're done. Pass the packet to ip. */
+ BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
+ error = conn_ip_output(mp, ixa);
+ /* No rawipOutErrors if an error since IP increases its error counter */
+ switch (error) {
+ case 0:
+ break;
+ case EWOULDBLOCK:
+ (void) ixa_check_drain_insert(connp, ixa);
+ error = 0;
+ break;
+ case EADDRNOTAVAIL:
+ /*
+ * IXAF_VERIFY_SOURCE tells us to pick a better source.
+ * Don't have the application see that errno
+ */
+ error = ENETUNREACH;
+ /* FALLTHRU */
+ default:
+ mutex_enter(&connp->conn_lock);
/*
- * Apply credentials with modified security label if they exist.
- * icmp_update_label_v6() may have generated these credentials
- * for MAC-Exempt connections.
+ * Clear the source and v6lastdst so we call ip_attr_connect
+ * for the next packet and try to pick a better source.
*/
- if (icmp->icmp_effective_cred != NULL)
- mblk_setcred(mp, icmp->icmp_effective_cred, cpid);
+ if (connp->conn_mcbc_bind)
+ connp->conn_saddr_v6 = ipv6_all_zeros;
+ else
+ connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
+ connp->conn_v6lastdst = ipv6_all_zeros;
+ mutex_exit(&connp->conn_lock);
+ break;
}
+ ixa_refrele(ixa);
+ return (error);
+}
+
+
+/*
+ * Prepend the header template and then fill in the source and
+ * flowinfo. The caller needs to handle the destination address since
+ * it's setting is different if rthdr or source route.
+ *
+ * Returns NULL is allocation failed or if the packet would exceed IP_MAXPACKET.
+ * When it returns NULL it sets errorp.
+ */
+static mblk_t *
+icmp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp,
+ const in6_addr_t *v6src, uint32_t flowinfo, int *errorp)
+{
+ icmp_t *icmp = connp->conn_icmp;
+ icmp_stack_t *is = icmp->icmp_is;
+ uint_t pktlen;
+ uint_t copylen;
+ uint8_t *iph;
+ uint_t ip_hdr_length;
+ uint32_t cksum;
+ ip_pkt_t *ipp;
+
+ ASSERT(MUTEX_HELD(&connp->conn_lock));
/*
- * If there's a security label here, then we ignore any options the
- * user may try to set. We keep the peer's label as a hidden sticky
- * option.
+ * Copy the header template.
*/
- if (icmp->icmp_label_len_v6 > 0) {
- ignore &= ~IPPF_HOPOPTS;
- ipp->ipp_fields &= ~IPPF_HOPOPTS;
+ copylen = connp->conn_ht_iphc_len;
+ pktlen = copylen + msgdsize(mp);
+ if (pktlen > IP_MAXPACKET) {
+ freemsg(mp);
+ *errorp = EMSGSIZE;
+ return (NULL);
}
+ ixa->ixa_pktlen = pktlen;
- if ((icmp->icmp_sticky_ipp.ipp_fields == 0) &&
- (ipp->ipp_fields == 0)) {
- /* No sticky options nor ancillary data. */
- mutex_exit(&connp->conn_lock);
- goto no_options;
+ /* check/fix buffer config, setup pointers into it */
+ iph = mp->b_rptr - copylen;
+ if (DB_REF(mp) != 1 || iph < DB_BASE(mp) || !OK_32PTR(iph)) {
+ mblk_t *mp1;
+
+ mp1 = allocb(copylen + is->is_wroff_extra, BPRI_MED);
+ if (mp1 == NULL) {
+ freemsg(mp);
+ *errorp = ENOMEM;
+ return (NULL);
+ }
+ mp1->b_wptr = DB_LIM(mp1);
+ mp1->b_cont = mp;
+ mp = mp1;
+ iph = (mp->b_wptr - copylen);
}
+ mp->b_rptr = iph;
+ bcopy(connp->conn_ht_iphc, iph, copylen);
+ ip_hdr_length = (uint_t)(connp->conn_ht_ulp - connp->conn_ht_iphc);
+
+ ixa->ixa_ip_hdr_length = ip_hdr_length;
/*
- * Go through the options figuring out where each is going to
- * come from and build two masks. The first mask indicates if
- * the option exists at all. The second mask indicates if the
- * option is sticky or ancillary.
+ * Prepare for ICMPv6 checksum done in IP.
+ *
+ * icmp_build_hdr_template has already massaged any routing header
+ * and placed the result in conn_sum.
+ *
+ * We make it easy for IP to include our pseudo header
+ * by putting our length (and any routing header adjustment)
+ * in the ICMPv6 checksum field.
*/
- if (!(ignore & IPPF_HOPOPTS)) {
- if (ipp->ipp_fields & IPPF_HOPOPTS) {
- option_exists |= IPPF_HOPOPTS;
- ip_hdr_len += ipp->ipp_hopoptslen;
- } else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_HOPOPTS) {
- option_exists |= IPPF_HOPOPTS;
- is_sticky |= IPPF_HOPOPTS;
- ASSERT(icmp->icmp_sticky_ipp.ipp_hopoptslen != 0);
- hopoptsptr = kmem_alloc(
- icmp->icmp_sticky_ipp.ipp_hopoptslen, KM_NOSLEEP);
- if (hopoptsptr == NULL) {
- mutex_exit(&connp->conn_lock);
- rw_exit(&icmp->icmp_rwlock);
- return (ENOMEM);
- }
- hopoptslen = icmp->icmp_sticky_ipp.ipp_hopoptslen;
- bcopy(icmp->icmp_sticky_ipp.ipp_hopopts, hopoptsptr,
- hopoptslen);
- ip_hdr_len += hopoptslen;
- }
- }
- mutex_exit(&connp->conn_lock);
+ cksum = pktlen - ip_hdr_length;
- if (!(ignore & IPPF_RTHDR)) {
- if (ipp->ipp_fields & IPPF_RTHDR) {
- option_exists |= IPPF_RTHDR;
- ip_hdr_len += ipp->ipp_rthdrlen;
- } else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_RTHDR) {
- option_exists |= IPPF_RTHDR;
- is_sticky |= IPPF_RTHDR;
- ip_hdr_len += icmp->icmp_sticky_ipp.ipp_rthdrlen;
- }
- }
+ cksum += connp->conn_sum;
+ cksum = (cksum >> 16) + (cksum & 0xFFFF);
+ ASSERT(cksum < 0x10000);
- if (!(ignore & IPPF_RTDSTOPTS) && (option_exists & IPPF_RTHDR)) {
- /*
- * Need to have a router header to use these.
- */
- if (ipp->ipp_fields & IPPF_RTDSTOPTS) {
- option_exists |= IPPF_RTDSTOPTS;
- ip_hdr_len += ipp->ipp_rtdstoptslen;
- } else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_RTDSTOPTS) {
- option_exists |= IPPF_RTDSTOPTS;
- is_sticky |= IPPF_RTDSTOPTS;
- ip_hdr_len +=
- icmp->icmp_sticky_ipp.ipp_rtdstoptslen;
- }
- }
+ ipp = &connp->conn_xmit_ipp;
+ if (ixa->ixa_flags & IXAF_IS_IPV4) {
+ ipha_t *ipha = (ipha_t *)iph;
- if (!(ignore & IPPF_DSTOPTS)) {
- if (ipp->ipp_fields & IPPF_DSTOPTS) {
- option_exists |= IPPF_DSTOPTS;
- ip_hdr_len += ipp->ipp_dstoptslen;
- } else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_DSTOPTS) {
- option_exists |= IPPF_DSTOPTS;
- is_sticky |= IPPF_DSTOPTS;
- ip_hdr_len += icmp->icmp_sticky_ipp.ipp_dstoptslen;
- }
- }
+ ipha->ipha_length = htons((uint16_t)pktlen);
- if (!(ignore & IPPF_IFINDEX)) {
- if (ipp->ipp_fields & IPPF_IFINDEX) {
- option_exists |= IPPF_IFINDEX;
- } else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_IFINDEX) {
- option_exists |= IPPF_IFINDEX;
- is_sticky |= IPPF_IFINDEX;
+ /* if IP_PKTINFO specified an addres it wins over bind() */
+ if ((ipp->ipp_fields & IPPF_ADDR) &&
+ IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) {
+ ASSERT(ipp->ipp_addr_v4 != INADDR_ANY);
+ ipha->ipha_src = ipp->ipp_addr_v4;
+ } else {
+ IN6_V4MAPPED_TO_IPADDR(v6src, ipha->ipha_src);
}
- }
+ } else {
+ ip6_t *ip6h = (ip6_t *)iph;
+ uint_t cksum_offset = 0;
- if (!(ignore & IPPF_ADDR)) {
- if (ipp->ipp_fields & IPPF_ADDR) {
- option_exists |= IPPF_ADDR;
- } else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_ADDR) {
- option_exists |= IPPF_ADDR;
- is_sticky |= IPPF_ADDR;
- }
- }
+ ip6h->ip6_plen = htons((uint16_t)(pktlen - IPV6_HDR_LEN));
- if (!(ignore & IPPF_DONTFRAG)) {
- if (ipp->ipp_fields & IPPF_DONTFRAG) {
- option_exists |= IPPF_DONTFRAG;
- } else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_DONTFRAG) {
- option_exists |= IPPF_DONTFRAG;
- is_sticky |= IPPF_DONTFRAG;
+ /* if IP_PKTINFO specified an addres it wins over bind() */
+ if ((ipp->ipp_fields & IPPF_ADDR) &&
+ !IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) {
+ ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&ipp->ipp_addr));
+ ip6h->ip6_src = ipp->ipp_addr;
+ } else {
+ ip6h->ip6_src = *v6src;
}
- }
-
- if (!(ignore & IPPF_USE_MIN_MTU)) {
- if (ipp->ipp_fields & IPPF_USE_MIN_MTU) {
- option_exists |= IPPF_USE_MIN_MTU;
- } else if (icmp->icmp_sticky_ipp.ipp_fields &
- IPPF_USE_MIN_MTU) {
- option_exists |= IPPF_USE_MIN_MTU;
- is_sticky |= IPPF_USE_MIN_MTU;
+ ip6h->ip6_vcf =
+ (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) |
+ (flowinfo & ~IPV6_VERS_AND_FLOW_MASK);
+ if (ipp->ipp_fields & IPPF_TCLASS) {
+ /* Overrides the class part of flowinfo */
+ ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf,
+ ipp->ipp_tclass);
+ }
+
+ if (ixa->ixa_flags & IXAF_SET_ULP_CKSUM) {
+ if (connp->conn_proto == IPPROTO_ICMPV6) {
+ cksum_offset = ixa->ixa_ip_hdr_length +
+ offsetof(icmp6_t, icmp6_cksum);
+ } else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
+ cksum_offset = ixa->ixa_ip_hdr_length +
+ ixa->ixa_raw_cksum_offset;
+ }
}
- }
+ if (cksum_offset != 0) {
+ uint16_t *ptr;
+
+ /* Make sure the checksum fits in the first mblk */
+ if (cksum_offset + sizeof (short) > MBLKL(mp)) {
+ mblk_t *mp1;
- if (!(ignore & IPPF_NEXTHOP)) {
- if (ipp->ipp_fields & IPPF_NEXTHOP) {
- option_exists |= IPPF_NEXTHOP;
- } else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_NEXTHOP) {
- option_exists |= IPPF_NEXTHOP;
- is_sticky |= IPPF_NEXTHOP;
+ mp1 = msgpullup(mp,
+ cksum_offset + sizeof (short));
+ freemsg(mp);
+ if (mp1 == NULL) {
+ *errorp = ENOMEM;
+ return (NULL);
+ }
+ mp = mp1;
+ iph = mp->b_rptr;
+ ip6h = (ip6_t *)iph;
+ }
+ ptr = (uint16_t *)(mp->b_rptr + cksum_offset);
+ *ptr = htons(cksum);
}
}
- if (!(ignore & IPPF_HOPLIMIT) && (ipp->ipp_fields & IPPF_HOPLIMIT))
- option_exists |= IPPF_HOPLIMIT;
- /* IPV6_HOPLIMIT can never be sticky */
- ASSERT(!(icmp->icmp_sticky_ipp.ipp_fields & IPPF_HOPLIMIT));
+ return (mp);
+}
- if (!(ignore & IPPF_UNICAST_HOPS) &&
- (icmp->icmp_sticky_ipp.ipp_fields & IPPF_UNICAST_HOPS)) {
- option_exists |= IPPF_UNICAST_HOPS;
- is_sticky |= IPPF_UNICAST_HOPS;
- }
+/*
+ * This routine handles all messages passed downstream. It either
+ * consumes the message or passes it downstream; it never queues a
+ * a message.
+ */
+void
+icmp_wput(queue_t *q, mblk_t *mp)
+{
+ sin6_t *sin6;
+ sin_t *sin = NULL;
+ uint_t srcid;
+ conn_t *connp = Q_TO_CONN(q);
+ icmp_t *icmp = connp->conn_icmp;
+ int error = 0;
+ struct sockaddr *addr = NULL;
+ socklen_t addrlen;
+ icmp_stack_t *is = icmp->icmp_is;
+ struct T_unitdata_req *tudr;
+ mblk_t *data_mp;
+ cred_t *cr;
+ pid_t pid;
- if (!(ignore & IPPF_MULTICAST_HOPS) &&
- (icmp->icmp_sticky_ipp.ipp_fields & IPPF_MULTICAST_HOPS)) {
- option_exists |= IPPF_MULTICAST_HOPS;
- is_sticky |= IPPF_MULTICAST_HOPS;
- }
+ /*
+ * We directly handle several cases here: T_UNITDATA_REQ message
+ * coming down as M_PROTO/M_PCPROTO and M_DATA messages for connected
+ * socket.
+ */
+ switch (DB_TYPE(mp)) {
+ case M_DATA:
+ /* sockfs never sends down M_DATA */
+ BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
+ freemsg(mp);
+ return;
- if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_NO_CKSUM) {
- /* This is a sticky socket option only */
- option_exists |= IPPF_NO_CKSUM;
- is_sticky |= IPPF_NO_CKSUM;
- }
+ case M_PROTO:
+ case M_PCPROTO:
+ tudr = (struct T_unitdata_req *)mp->b_rptr;
+ if (MBLKL(mp) < sizeof (*tudr) ||
+ ((t_primp_t)mp->b_rptr)->type != T_UNITDATA_REQ) {
+ icmp_wput_other(q, mp);
+ return;
+ }
+ break;
- if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_RAW_CKSUM) {
- /* This is a sticky socket option only */
- option_exists |= IPPF_RAW_CKSUM;
- is_sticky |= IPPF_RAW_CKSUM;
+ default:
+ icmp_wput_other(q, mp);
+ return;
}
- if (!(ignore & IPPF_TCLASS)) {
- if (ipp->ipp_fields & IPPF_TCLASS) {
- option_exists |= IPPF_TCLASS;
- } else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_TCLASS) {
- option_exists |= IPPF_TCLASS;
- is_sticky |= IPPF_TCLASS;
- }
+ /* Handle valid T_UNITDATA_REQ here */
+ data_mp = mp->b_cont;
+ if (data_mp == NULL) {
+ error = EPROTO;
+ goto ud_error2;
}
+ mp->b_cont = NULL;
-no_options:
+ if (!MBLKIN(mp, 0, tudr->DEST_offset + tudr->DEST_length)) {
+ error = EADDRNOTAVAIL;
+ goto ud_error2;
+ }
/*
- * If any options carried in the ip6i_t were specified, we
- * need to account for the ip6i_t in the data we'll be sending
- * down.
+ * All Solaris components should pass a db_credp
+ * for this message, hence we ASSERT.
+ * On production kernels we return an error to be robust against
+ * random streams modules sitting on top of us.
*/
- if (option_exists & IPPF_HAS_IP6I)
- ip_hdr_len += sizeof (ip6i_t);
+ cr = msg_getcred(mp, &pid);
+ ASSERT(cr != NULL);
+ if (cr == NULL) {
+ error = EINVAL;
+ goto ud_error2;
+ }
- /* check/fix buffer config, setup pointers into it */
- ip6h = (ip6_t *)&mp->b_rptr[-ip_hdr_len];
- if ((mp->b_datap->db_ref != 1) ||
- ((unsigned char *)ip6h < mp->b_datap->db_base) ||
- !OK_32PTR(ip6h)) {
- mblk_t *mp1;
-
- /* Try to get everything in a single mblk next time */
- if (ip_hdr_len > icmp->icmp_max_hdr_len) {
- icmp->icmp_max_hdr_len = ip_hdr_len;
-
- (void) proto_set_tx_wroff(q == NULL ? NULL:RD(q), connp,
- icmp->icmp_max_hdr_len + is->is_wroff_extra);
- }
- mp1 = allocb(ip_hdr_len + is->is_wroff_extra, BPRI_LO);
- if (!mp1) {
- BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
- kmem_free(hopoptsptr, hopoptslen);
- rw_exit(&icmp->icmp_rwlock);
- return (ENOMEM);
- }
- mp1->b_cont = mp;
- mp1->b_wptr = mp1->b_datap->db_lim;
- ip6h = (ip6_t *)(mp1->b_wptr - ip_hdr_len);
- mp = mp1;
+ /*
+ * If a port has not been bound to the stream, fail.
+ * This is not a problem when sockfs is directly
+ * above us, because it will ensure that the socket
+ * is first bound before allowing data to be sent.
+ */
+ if (icmp->icmp_state == TS_UNBND) {
+ error = EPROTO;
+ goto ud_error2;
}
- mp->b_rptr = (unsigned char *)ip6h;
- ip6i = (ip6i_t *)ip6h;
-
-#define ANCIL_OR_STICKY_PTR(f) ((is_sticky & f) ? &icmp->icmp_sticky_ipp : ipp)
- if (option_exists & IPPF_HAS_IP6I) {
- ip6h = (ip6_t *)&ip6i[1];
- ip6i->ip6i_flags = 0;
- ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
-
- /* sin6_scope_id takes precendence over IPPF_IFINDEX */
- if (option_exists & IPPF_SCOPE_ID) {
- ip6i->ip6i_flags |= IP6I_IFINDEX;
- ip6i->ip6i_ifindex = sin6->sin6_scope_id;
- } else if (option_exists & IPPF_IFINDEX) {
- tipp = ANCIL_OR_STICKY_PTR(IPPF_IFINDEX);
- ASSERT(tipp->ipp_ifindex != 0);
- ip6i->ip6i_flags |= IP6I_IFINDEX;
- ip6i->ip6i_ifindex = tipp->ipp_ifindex;
+ addr = (struct sockaddr *)&mp->b_rptr[tudr->DEST_offset];
+ addrlen = tudr->DEST_length;
+
+ switch (connp->conn_family) {
+ case AF_INET6:
+ sin6 = (sin6_t *)addr;
+ if (!OK_32PTR((char *)sin6) || (addrlen != sizeof (sin6_t)) ||
+ (sin6->sin6_family != AF_INET6)) {
+ error = EADDRNOTAVAIL;
+ goto ud_error2;
}
- if (option_exists & IPPF_RAW_CKSUM) {
- ip6i->ip6i_flags |= IP6I_RAW_CHECKSUM;
- ip6i->ip6i_checksum_off = icmp->icmp_checksum_off;
+ /* No support for mapped addresses on raw sockets */
+ if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
+ error = EADDRNOTAVAIL;
+ goto ud_error2;
}
+ srcid = sin6->__sin6_src_id;
- if (option_exists & IPPF_NO_CKSUM) {
- ip6i->ip6i_flags |= IP6I_NO_ULP_CKSUM;
+ /*
+ * If the local address is a mapped address return
+ * an error.
+ * It would be possible to send an IPv6 packet but the
+ * response would never make it back to the application
+ * since it is bound to a mapped address.
+ */
+ if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) {
+ error = EADDRNOTAVAIL;
+ goto ud_error2;
}
- if (option_exists & IPPF_ADDR) {
+ if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
+ sin6->sin6_addr = ipv6_loopback;
+
+ if (tudr->OPT_length != 0) {
/*
- * Enable per-packet source address verification if
- * IPV6_PKTINFO specified the source address.
- * ip6_src is set in the transport's _wput function.
+ * If we are connected then the destination needs to be
+ * the same as the connected one.
*/
- ip6i->ip6i_flags |= IP6I_VERIFY_SRC;
- }
+ if (icmp->icmp_state == TS_DATA_XFER &&
+ !conn_same_as_last_v6(connp, sin6)) {
+ error = EISCONN;
+ goto ud_error2;
+ }
+ error = icmp_output_ancillary(connp, NULL, sin6,
+ data_mp, mp, NULL, cr, pid);
+ } else {
+ ip_xmit_attr_t *ixa;
- if (option_exists & IPPF_DONTFRAG) {
- ip6i->ip6i_flags |= IP6I_DONTFRAG;
+ /*
+ * We have to allocate an ip_xmit_attr_t before we grab
+ * conn_lock and we need to hold conn_lock once we've
+ * checked conn_same_as_last_v6 to handle concurrent
+ * send* calls on a socket.
+ */
+ ixa = conn_get_ixa(connp, B_FALSE);
+ if (ixa == NULL) {
+ error = ENOMEM;
+ goto ud_error2;
+ }
+ mutex_enter(&connp->conn_lock);
+
+ if (conn_same_as_last_v6(connp, sin6) &&
+ connp->conn_lastsrcid == srcid &&
+ ipsec_outbound_policy_current(ixa)) {
+ /* icmp_output_lastdst drops conn_lock */
+ error = icmp_output_lastdst(connp, data_mp, cr,
+ pid, ixa);
+ } else {
+ /* icmp_output_newdst drops conn_lock */
+ error = icmp_output_newdst(connp, data_mp, NULL,
+ sin6, cr, pid, ixa);
+ }
+ ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
}
+ if (error == 0) {
+ freeb(mp);
+ return;
+ }
+ break;
- if (option_exists & IPPF_USE_MIN_MTU) {
- ip6i->ip6i_flags = IP6I_API_USE_MIN_MTU(
- ip6i->ip6i_flags, ipp->ipp_use_min_mtu);
+ case AF_INET:
+ sin = (sin_t *)addr;
+ if ((!OK_32PTR((char *)sin) || addrlen != sizeof (sin_t)) ||
+ (sin->sin_family != AF_INET)) {
+ error = EADDRNOTAVAIL;
+ goto ud_error2;
}
+ if (sin->sin_addr.s_addr == INADDR_ANY)
+ sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
- if (option_exists & IPPF_NEXTHOP) {
- tipp = ANCIL_OR_STICKY_PTR(IPPF_NEXTHOP);
- ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&tipp->ipp_nexthop));
- ip6i->ip6i_flags |= IP6I_NEXTHOP;
- ip6i->ip6i_nexthop = tipp->ipp_nexthop;
+ /* Protocol 255 contains full IP headers */
+ /* Read without holding lock */
+ if (icmp->icmp_hdrincl) {
+ if (MBLKL(data_mp) < IP_SIMPLE_HDR_LENGTH) {
+ if (!pullupmsg(data_mp, IP_SIMPLE_HDR_LENGTH)) {
+ error = EINVAL;
+ goto ud_error2;
+ }
+ }
+ error = icmp_output_hdrincl(connp, data_mp, cr, pid);
+ if (error == 0) {
+ freeb(mp);
+ return;
+ }
+ /* data_mp consumed above */
+ data_mp = NULL;
+ goto ud_error2;
}
- /*
- * tell IP this is an ip6i_t private header
- */
- ip6i->ip6i_nxt = IPPROTO_RAW;
- }
-
- /* Initialize IPv6 header */
- ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
- bzero(&ip6h->ip6_src, sizeof (ip6h->ip6_src));
-
- /* Set the hoplimit of the outgoing packet. */
- if (option_exists & IPPF_HOPLIMIT) {
- /* IPV6_HOPLIMIT ancillary data overrides all other settings. */
- ip6h->ip6_hops = ipp->ipp_hoplimit;
- ip6i->ip6i_flags |= IP6I_HOPLIMIT;
- } else if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
- ip6h->ip6_hops = icmp->icmp_multicast_ttl;
- if (option_exists & IPPF_MULTICAST_HOPS)
- ip6i->ip6i_flags |= IP6I_HOPLIMIT;
- } else {
- ip6h->ip6_hops = icmp->icmp_ttl;
- if (option_exists & IPPF_UNICAST_HOPS)
- ip6i->ip6i_flags |= IP6I_HOPLIMIT;
- }
+ if (tudr->OPT_length != 0) {
+ /*
+ * If we are connected then the destination needs to be
+ * the same as the connected one.
+ */
+ if (icmp->icmp_state == TS_DATA_XFER &&
+ !conn_same_as_last_v4(connp, sin)) {
+ error = EISCONN;
+ goto ud_error2;
+ }
+ error = icmp_output_ancillary(connp, sin, NULL,
+ data_mp, mp, NULL, cr, pid);
+ } else {
+ ip_xmit_attr_t *ixa;
- if (option_exists & IPPF_ADDR) {
- tipp = ANCIL_OR_STICKY_PTR(IPPF_ADDR);
- ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&tipp->ipp_addr));
- ip6h->ip6_src = tipp->ipp_addr;
- } else {
- /*
- * The source address was not set using IPV6_PKTINFO.
- * First look at the bound source.
- * If unspecified fallback to __sin6_src_id.
- */
- ip6h->ip6_src = icmp->icmp_v6src;
- if (sin6->__sin6_src_id != 0 &&
- IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src)) {
- ip_srcid_find_id(sin6->__sin6_src_id,
- &ip6h->ip6_src, icmp->icmp_zoneid,
- is->is_netstack);
+ /*
+ * We have to allocate an ip_xmit_attr_t before we grab
+ * conn_lock and we need to hold conn_lock once we've
+ * checked conn_same_as_last_v4 to handle concurrent
+ * send* calls on a socket.
+ */
+ ixa = conn_get_ixa(connp, B_FALSE);
+ if (ixa == NULL) {
+ error = ENOMEM;
+ goto ud_error2;
+ }
+ mutex_enter(&connp->conn_lock);
+
+ if (conn_same_as_last_v4(connp, sin) &&
+ ipsec_outbound_policy_current(ixa)) {
+ /* icmp_output_lastdst drops conn_lock */
+ error = icmp_output_lastdst(connp, data_mp, cr,
+ pid, ixa);
+ } else {
+ /* icmp_output_newdst drops conn_lock */
+ error = icmp_output_newdst(connp, data_mp, sin,
+ NULL, cr, pid, ixa);
+ }
+ ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
+ }
+ if (error == 0) {
+ freeb(mp);
+ return;
}
+ break;
}
+ ASSERT(mp != NULL);
+ /* mp is freed by the following routine */
+ icmp_ud_err(q, mp, (t_scalar_t)error);
+ return;
- nxthdr_ptr = (uint8_t *)&ip6h->ip6_nxt;
- cp = (uint8_t *)&ip6h[1];
+ud_error2:
+ BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
+ freemsg(data_mp);
+ ASSERT(mp != NULL);
+ /* mp is freed by the following routine */
+ icmp_ud_err(q, mp, (t_scalar_t)error);
+}
+
+/*
+ * Handle the case of the IP address or flow label being different
+ * for both IPv4 and IPv6.
+ *
+ * NOTE: The caller must hold conn_lock and we drop it here.
+ */
+static int
+icmp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin, sin6_t *sin6,
+ cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa)
+{
+ icmp_t *icmp = connp->conn_icmp;
+ icmp_stack_t *is = icmp->icmp_is;
+ int error;
+ ip_xmit_attr_t *oldixa;
+ boolean_t do_ipsec;
+ uint_t srcid;
+ uint32_t flowinfo;
+ in6_addr_t v6src;
+ in6_addr_t v6dst;
+ in6_addr_t v6nexthop;
+ in_port_t dstport;
+
+ ASSERT(MUTEX_HELD(&connp->conn_lock));
+ ASSERT(ixa != NULL);
/*
- * Here's where we have to start stringing together
- * any extension headers in the right order:
- * Hop-by-hop, destination, routing, and final destination opts.
+ * We hold conn_lock across all the use and modifications of
+ * the conn_lastdst, conn_ixa, and conn_xmit_ipp to ensure that they
+ * stay consistent.
*/
- if (option_exists & IPPF_HOPOPTS) {
- /* Hop-by-hop options */
- ip6_hbh_t *hbh = (ip6_hbh_t *)cp;
-
- *nxthdr_ptr = IPPROTO_HOPOPTS;
- nxthdr_ptr = &hbh->ip6h_nxt;
- if (hopoptslen == 0) {
- tipp = ANCIL_OR_STICKY_PTR(IPPF_HOPOPTS);
- bcopy(tipp->ipp_hopopts, cp, tipp->ipp_hopoptslen);
- cp += tipp->ipp_hopoptslen;
- } else {
- bcopy(hopoptsptr, cp, hopoptslen);
- cp += hopoptslen;
- kmem_free(hopoptsptr, hopoptslen);
- }
+ ASSERT(cr != NULL);
+ ixa->ixa_cred = cr;
+ ixa->ixa_cpid = pid;
+ if (is_system_labeled()) {
+ /* We need to restart with a label based on the cred */
+ ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
}
/*
- * En-route destination options
- * Only do them if there's a routing header as well
+ * If we are connected then the destination needs to be the
+ * same as the connected one, which is not the case here since we
+ * checked for that above.
*/
- if (option_exists & IPPF_RTDSTOPTS) {
- ip6_dest_t *dst = (ip6_dest_t *)cp;
- tipp = ANCIL_OR_STICKY_PTR(IPPF_RTDSTOPTS);
+ if (icmp->icmp_state == TS_DATA_XFER) {
+ mutex_exit(&connp->conn_lock);
+ error = EISCONN;
+ goto ud_error;
+ }
- *nxthdr_ptr = IPPROTO_DSTOPTS;
- nxthdr_ptr = &dst->ip6d_nxt;
+ /* In case previous destination was multicast or multirt */
+ ip_attr_newdst(ixa);
- bcopy(tipp->ipp_rtdstopts, cp, tipp->ipp_rtdstoptslen);
- cp += tipp->ipp_rtdstoptslen;
- }
/*
- * Routing header next
+ * If laddr is unspecified then we look at sin6_src_id.
+ * We will give precedence to a source address set with IPV6_PKTINFO
+ * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't
+ * want ip_attr_connect to select a source (since it can fail) when
+ * IPV6_PKTINFO is specified.
+ * If this doesn't result in a source address then we get a source
+ * from ip_attr_connect() below.
*/
- if (option_exists & IPPF_RTHDR) {
- ip6_rthdr_t *rt = (ip6_rthdr_t *)cp;
- tipp = ANCIL_OR_STICKY_PTR(IPPF_RTHDR);
+ v6src = connp->conn_saddr_v6;
+ if (sin != NULL) {
+ IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst);
+ dstport = sin->sin_port;
+ flowinfo = 0;
+ srcid = 0;
+ ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
+ if (srcid != 0 && V4_PART_OF_V6(&v6src) == INADDR_ANY) {
+ ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
+ connp->conn_netstack);
+ }
+ ixa->ixa_flags |= IXAF_IS_IPV4;
+ } else {
+ v6dst = sin6->sin6_addr;
+ dstport = sin6->sin6_port;
+ flowinfo = sin6->sin6_flowinfo;
+ srcid = sin6->__sin6_src_id;
+ if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) {
+ ixa->ixa_scopeid = sin6->sin6_scope_id;
+ ixa->ixa_flags |= IXAF_SCOPEID_SET;
+ } else {
+ ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
+ }
+ if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
+ ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
+ connp->conn_netstack);
+ }
+ if (IN6_IS_ADDR_V4MAPPED(&v6dst))
+ ixa->ixa_flags |= IXAF_IS_IPV4;
+ else
+ ixa->ixa_flags &= ~IXAF_IS_IPV4;
+ }
+ /* Handle IPV6_PKTINFO setting source address. */
+ if (IN6_IS_ADDR_UNSPECIFIED(&v6src) &&
+ (connp->conn_xmit_ipp.ipp_fields & IPPF_ADDR)) {
+ ip_pkt_t *ipp = &connp->conn_xmit_ipp;
- *nxthdr_ptr = IPPROTO_ROUTING;
- nxthdr_ptr = &rt->ip6r_nxt;
+ if (ixa->ixa_flags & IXAF_IS_IPV4) {
+ if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
+ v6src = ipp->ipp_addr;
+ } else {
+ if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
+ v6src = ipp->ipp_addr;
+ }
+ }
- bcopy(tipp->ipp_rthdr, cp, tipp->ipp_rthdrlen);
- cp += tipp->ipp_rthdrlen;
+ /* Defer IPsec if it might need to look at ICMP type/code */
+ switch (ixa->ixa_protocol) {
+ case IPPROTO_ICMP:
+ case IPPROTO_ICMPV6:
+ do_ipsec = B_FALSE;
+ break;
+ default:
+ do_ipsec = B_TRUE;
}
- /*
- * Do ultimate destination options
- */
- if (option_exists & IPPF_DSTOPTS) {
- ip6_dest_t *dest = (ip6_dest_t *)cp;
- tipp = ANCIL_OR_STICKY_PTR(IPPF_DSTOPTS);
- *nxthdr_ptr = IPPROTO_DSTOPTS;
- nxthdr_ptr = &dest->ip6d_nxt;
+ ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &v6dst, &v6nexthop);
+ mutex_exit(&connp->conn_lock);
- bcopy(tipp->ipp_dstopts, cp, tipp->ipp_dstoptslen);
- cp += tipp->ipp_dstoptslen;
+ error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport,
+ &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST |
+ (do_ipsec ? IPDF_IPSEC : 0));
+ switch (error) {
+ case 0:
+ break;
+ case EADDRNOTAVAIL:
+ /*
+ * IXAF_VERIFY_SOURCE tells us to pick a better source.
+ * Don't have the application see that errno
+ */
+ error = ENETUNREACH;
+ goto failed;
+ case ENETDOWN:
+ /*
+ * Have !ipif_addr_ready address; drop packet silently
+ * until we can get applications to not send until we
+ * are ready.
+ */
+ error = 0;
+ goto failed;
+ case EHOSTUNREACH:
+ case ENETUNREACH:
+ if (ixa->ixa_ire != NULL) {
+ /*
+ * Let conn_ip_output/ire_send_noroute return
+ * the error and send any local ICMP error.
+ */
+ error = 0;
+ break;
+ }
+ /* FALLTHRU */
+ default:
+ failed:
+ goto ud_error;
}
+ mutex_enter(&connp->conn_lock);
/*
- * Now set the last header pointer to the proto passed in
+ * While we dropped the lock some other thread might have connected
+ * this socket. If so we bail out with EISCONN to ensure that the
+ * connecting thread is the one that updates conn_ixa, conn_ht_*
+ * and conn_*last*.
*/
- ASSERT((int)(cp - (uint8_t *)ip6i) == ip_hdr_len);
- *nxthdr_ptr = icmp->icmp_proto;
+ if (icmp->icmp_state == TS_DATA_XFER) {
+ mutex_exit(&connp->conn_lock);
+ error = EISCONN;
+ goto ud_error;
+ }
/*
- * Copy in the destination address
+ * We need to rebuild the headers if
+ * - we are labeling packets (could be different for different
+ * destinations)
+ * - we have a source route (or routing header) since we need to
+ * massage that to get the pseudo-header checksum
+ * - a socket option with COA_HEADER_CHANGED has been set which
+ * set conn_v6lastdst to zero.
+ *
+ * Otherwise the prepend function will just update the src, dst,
+ * and flow label.
*/
- ip6h->ip6_dst = ip6_dst;
-
- ip6h->ip6_vcf =
- (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) |
- (sin6->sin6_flowinfo & ~IPV6_VERS_AND_FLOW_MASK);
-
- if (option_exists & IPPF_TCLASS) {
- tipp = ANCIL_OR_STICKY_PTR(IPPF_TCLASS);
- ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf,
- tipp->ipp_tclass);
- }
- if (option_exists & IPPF_RTHDR) {
- ip6_rthdr_t *rth;
-
+ if (is_system_labeled()) {
+ /* TX MLP requires SCM_UCRED and don't have that here */
+ if (connp->conn_mlp_type != mlptSingle) {
+ mutex_exit(&connp->conn_lock);
+ error = ECONNREFUSED;
+ goto ud_error;
+ }
/*
- * Perform any processing needed for source routing.
- * We know that all extension headers will be in the same mblk
- * as the IPv6 header.
+ * Check whether Trusted Solaris policy allows communication
+ * with this host, and pretend that the destination is
+ * unreachable if not.
+ * Compute any needed label and place it in ipp_label_v4/v6.
+ *
+ * Later conn_build_hdr_template/conn_prepend_hdr takes
+ * ipp_label_v4/v6 to form the packet.
+ *
+ * Tsol note: Since we hold conn_lock we know no other
+ * thread manipulates conn_xmit_ipp.
*/
- rth = ip_find_rthdr_v6(ip6h, mp->b_wptr);
- if (rth != NULL && rth->ip6r_segleft != 0) {
- if (rth->ip6r_type != IPV6_RTHDR_TYPE_0) {
- /*
- * Drop packet - only support Type 0 routing.
- * Notify the application as well.
- */
- BUMP_MIB(&is->is_rawip_mib,
- rawipOutErrors);
- rw_exit(&icmp->icmp_rwlock);
- return (EPROTO);
- }
- /*
- * rth->ip6r_len is twice the number of
- * addresses in the header
- */
- if (rth->ip6r_len & 0x1) {
- BUMP_MIB(&is->is_rawip_mib,
- rawipOutErrors);
- rw_exit(&icmp->icmp_rwlock);
- return (EPROTO);
- }
- /*
- * Shuffle the routing header and ip6_dst
- * addresses, and get the checksum difference
- * between the first hop (in ip6_dst) and
- * the destination (in the last routing hdr entry).
- */
- csum = ip_massage_options_v6(ip6h, rth,
- is->is_netstack);
- /*
- * Verify that the first hop isn't a mapped address.
- * Routers along the path need to do this verification
- * for subsequent hops.
- */
- if (IN6_IS_ADDR_V4MAPPED(&ip6h->ip6_dst)) {
- BUMP_MIB(&is->is_rawip_mib,
- rawipOutErrors);
- rw_exit(&icmp->icmp_rwlock);
- return (EADDRNOTAVAIL);
+ error = conn_update_label(connp, ixa, &v6dst,
+ &connp->conn_xmit_ipp);
+ if (error != 0) {
+ mutex_exit(&connp->conn_lock);
+ goto ud_error;
+ }
+ /* Rebuild the header template */
+ error = icmp_build_hdr_template(connp, &v6src, &v6dst,
+ flowinfo);
+ if (error != 0) {
+ mutex_exit(&connp->conn_lock);
+ goto ud_error;
+ }
+ } else if (connp->conn_xmit_ipp.ipp_fields &
+ (IPPF_IPV4_OPTIONS|IPPF_RTHDR) ||
+ IN6_IS_ADDR_UNSPECIFIED(&connp->conn_v6lastdst)) {
+ /* Rebuild the header template */
+ error = icmp_build_hdr_template(connp, &v6src, &v6dst,
+ flowinfo);
+ if (error != 0) {
+ mutex_exit(&connp->conn_lock);
+ goto ud_error;
+ }
+ } else {
+ /* Simply update the destination address if no source route */
+ if (ixa->ixa_flags & IXAF_IS_IPV4) {
+ ipha_t *ipha = (ipha_t *)connp->conn_ht_iphc;
+
+ IN6_V4MAPPED_TO_IPADDR(&v6dst, ipha->ipha_dst);
+ if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) {
+ ipha->ipha_fragment_offset_and_flags |=
+ IPH_DF_HTONS;
+ } else {
+ ipha->ipha_fragment_offset_and_flags &=
+ ~IPH_DF_HTONS;
}
+ } else {
+ ip6_t *ip6h = (ip6_t *)connp->conn_ht_iphc;
+ ip6h->ip6_dst = v6dst;
}
}
- ip_len = mp->b_wptr - (uchar_t *)ip6h - IPV6_HDR_LEN;
- if (mp->b_cont != NULL)
- ip_len += msgdsize(mp->b_cont);
-
/*
- * Set the length into the IP header.
- * If the length is greater than the maximum allowed by IP,
- * then free the message and return. Do not try and send it
- * as this can cause problems in layers below.
+ * Remember the dst etc which corresponds to the built header
+ * template and conn_ixa.
*/
- if (ip_len > IP_MAXPACKET) {
- BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
- rw_exit(&icmp->icmp_rwlock);
- return (EMSGSIZE);
+ oldixa = conn_replace_ixa(connp, ixa);
+ connp->conn_v6lastdst = v6dst;
+ connp->conn_lastflowinfo = flowinfo;
+ connp->conn_lastscopeid = ixa->ixa_scopeid;
+ connp->conn_lastsrcid = srcid;
+ /* Also remember a source to use together with lastdst */
+ connp->conn_v6lastsrc = v6src;
+
+ data_mp = icmp_prepend_header_template(connp, ixa, data_mp, &v6src,
+ flowinfo, &error);
+
+ /* Done with conn_t */
+ mutex_exit(&connp->conn_lock);
+ ixa_refrele(oldixa);
+
+ if (data_mp == NULL) {
+ ASSERT(error != 0);
+ goto ud_error;
}
- if (icmp->icmp_proto == IPPROTO_ICMPV6 || icmp->icmp_raw_checksum) {
- uint_t cksum_off; /* From ip6i == mp->b_rptr */
- uint16_t *cksum_ptr;
- uint_t ext_hdrs_len;
- /* ICMPv6 must have an offset matching icmp6_cksum offset */
- ASSERT(icmp->icmp_proto != IPPROTO_ICMPV6 ||
- icmp->icmp_checksum_off == 2);
+ if (!do_ipsec) {
+ /* Policy might differ for different ICMP type/code */
+ data_mp = icmp_output_attach_policy(data_mp, connp, ixa);
+ if (data_mp == NULL) {
+ BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
+ error = EHOSTUNREACH; /* IPsec policy failure */
+ goto done;
+ }
+ }
+ /* We're done. Pass the packet to ip. */
+ BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
+
+ error = conn_ip_output(data_mp, ixa);
+ /* No rawipOutErrors if an error since IP increases its error counter */
+ switch (error) {
+ case 0:
+ break;
+ case EWOULDBLOCK:
+ (void) ixa_check_drain_insert(connp, ixa);
+ error = 0;
+ break;
+ case EADDRNOTAVAIL:
/*
- * We make it easy for IP to include our pseudo header
- * by putting our length in uh_checksum, modified (if
- * we have a routing header) by the checksum difference
- * between the ultimate destination and first hop addresses.
- * Note: ICMPv6 must always checksum the packet.
+ * IXAF_VERIFY_SOURCE tells us to pick a better source.
+ * Don't have the application see that errno
*/
- cksum_off = ip_hdr_len + icmp->icmp_checksum_off;
- if (cksum_off + sizeof (uint16_t) > mp->b_wptr - mp->b_rptr) {
- if (!pullupmsg(mp, cksum_off + sizeof (uint16_t))) {
- BUMP_MIB(&is->is_rawip_mib,
- rawipOutErrors);
- freemsg(mp);
- rw_exit(&icmp->icmp_rwlock);
- return (0);
- }
- ip6i = (ip6i_t *)mp->b_rptr;
- if (ip6i->ip6i_nxt == IPPROTO_RAW)
- ip6h = (ip6_t *)&ip6i[1];
- else
- ip6h = (ip6_t *)ip6i;
- }
- /* Add payload length to checksum */
- ext_hdrs_len = ip_hdr_len - IPV6_HDR_LEN -
- (int)((uchar_t *)ip6h - (uchar_t *)ip6i);
- csum += htons(ip_len - ext_hdrs_len);
-
- cksum_ptr = (uint16_t *)((uchar_t *)ip6i + cksum_off);
- csum = (csum & 0xFFFF) + (csum >> 16);
- *cksum_ptr = (uint16_t)csum;
+ error = ENETUNREACH;
+ /* FALLTHRU */
+ default:
+ mutex_enter(&connp->conn_lock);
+ /*
+ * Clear the source and v6lastdst so we call ip_attr_connect
+ * for the next packet and try to pick a better source.
+ */
+ if (connp->conn_mcbc_bind)
+ connp->conn_saddr_v6 = ipv6_all_zeros;
+ else
+ connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
+ connp->conn_v6lastdst = ipv6_all_zeros;
+ mutex_exit(&connp->conn_lock);
+ break;
}
+done:
+ ixa_refrele(ixa);
+ return (error);
-#ifdef _LITTLE_ENDIAN
- ip_len = htons(ip_len);
-#endif
- ip6h->ip6_plen = (uint16_t)ip_len;
+ud_error:
+ if (ixa != NULL)
+ ixa_refrele(ixa);
- /* We're done. Pass the packet to IP */
- rw_exit(&icmp->icmp_rwlock);
- BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
- ip_output_v6(icmp->icmp_connp, mp, q, IP_WPUT);
- return (0);
+ BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
+ freemsg(data_mp);
+ return (error);
+}
+
+/* ARGSUSED */
+static void
+icmp_wput_fallback(queue_t *q, mblk_t *mp)
+{
+#ifdef DEBUG
+ cmn_err(CE_CONT, "icmp_wput_fallback: Message during fallback \n");
+#endif
+ freemsg(mp);
}
static void
@@ -5559,7 +4622,6 @@ icmp_wput_other(queue_t *q, mblk_t *mp)
{
uchar_t *rptr = mp->b_rptr;
struct iocblk *iocp;
-#define tudr ((struct T_unitdata_req *)rptr)
conn_t *connp = Q_TO_CONN(q);
icmp_t *icmp = connp->conn_icmp;
icmp_stack_t *is = icmp->icmp_is;
@@ -5576,7 +4638,7 @@ icmp_wput_other(queue_t *q, mblk_t *mp)
freemsg(mp);
return;
}
- switch (((union T_primitives *)rptr)->type) {
+ switch (((t_primp_t)rptr)->type) {
case T_ADDR_REQ:
icmp_addr_req(q, mp);
return;
@@ -5596,15 +4658,14 @@ icmp_wput_other(queue_t *q, mblk_t *mp)
case T_UNITDATA_REQ:
/*
* If a T_UNITDATA_REQ gets here, the address must
- * be bad. Valid T_UNITDATA_REQs are found above
- * and break to below this switch.
+ * be bad. Valid T_UNITDATA_REQs are handled
+ * in icmp_wput.
*/
icmp_ud_err(q, mp, EADDRNOTAVAIL);
return;
case T_UNBIND_REQ:
icmp_tpi_unbind(q, mp);
return;
-
case T_SVR4_OPTMGMT_REQ:
/*
* All Solaris components should pass a db_credp
@@ -5622,9 +4683,7 @@ icmp_wput_other(queue_t *q, mblk_t *mp)
if (!snmpcom_req(q, mp, icmp_snmp_set, ip_snmp_get,
cr)) {
- /* Only IP can return anything meaningful */
- (void) svr4_optcom_req(q, mp, cr,
- &icmp_opt_obj, B_TRUE);
+ svr4_optcom_req(q, mp, cr, &icmp_opt_obj);
}
return;
@@ -5642,8 +4701,7 @@ icmp_wput_other(queue_t *q, mblk_t *mp)
icmp_err_ack(q, mp, TSYSERR, EINVAL);
return;
}
- /* Only IP can return anything meaningful */
- (void) tpi_optcom_req(q, mp, cr, &icmp_opt_obj, B_TRUE);
+ tpi_optcom_req(q, mp, cr, &icmp_opt_obj);
return;
case T_DISCON_REQ:
@@ -5660,13 +4718,16 @@ icmp_wput_other(queue_t *q, mblk_t *mp)
case T_DATA_REQ:
case T_EXDATA_REQ:
case T_ORDREL_REQ:
- freemsg(mp);
- (void) putctl1(RD(q), M_ERROR, EPROTO);
+ icmp_err_ack(q, mp, TNOTSUPPORT, 0);
return;
default:
break;
}
break;
+ case M_FLUSH:
+ if (*rptr & FLUSHW)
+ flushq(q, FLUSHDATA);
+ break;
case M_IOCTL:
iocp = (struct iocblk *)mp->b_rptr;
switch (iocp->ioc_cmd) {
@@ -5678,7 +4739,6 @@ icmp_wput_other(queue_t *q, mblk_t *mp)
* don't know the peer's name.
*/
iocp->ioc_error = ENOTCONN;
- err_ret:;
iocp->ioc_count = 0;
mp->b_datap->db_type = M_IOCACK;
qreply(q, mp);
@@ -5696,22 +4756,13 @@ icmp_wput_other(queue_t *q, mblk_t *mp)
SIZEOF_STRUCT(strbuf, iocp->ioc_flag));
return;
case ND_SET:
- /* nd_getset performs the necessary error checking */
+ /* nd_getset performs the necessary checking */
case ND_GET:
if (nd_getset(q, is->is_nd, mp)) {
qreply(q, mp);
return;
}
break;
- case _SIOCSOCKFALLBACK:
- /*
- * socket is falling back to be a
- * streams socket. Nothing to do
- */
- iocp->ioc_count = 0;
- iocp->ioc_rval = 0;
- qreply(q, mp);
- return;
default:
break;
}
@@ -5720,23 +4771,24 @@ icmp_wput_other(queue_t *q, mblk_t *mp)
icmp_wput_iocdata(q, mp);
return;
default:
+ /* Unrecognized messages are passed through without change. */
break;
}
- ip_wput(q, mp);
+ ip_wput_nondata(q, mp);
}
/*
- * icmp_wput_iocdata is called by icmp_wput_slow to handle all M_IOCDATA
+ * icmp_wput_iocdata is called by icmp_wput_other to handle all M_IOCDATA
* messages.
*/
static void
icmp_wput_iocdata(queue_t *q, mblk_t *mp)
{
- mblk_t *mp1;
+ mblk_t *mp1;
STRUCT_HANDLE(strbuf, sb);
- icmp_t *icmp;
- uint_t addrlen;
- uint_t error;
+ uint_t addrlen;
+ conn_t *connp = Q_TO_CONN(q);
+ icmp_t *icmp = connp->conn_icmp;
/* Make sure it is one of ours. */
switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
@@ -5744,10 +4796,10 @@ icmp_wput_iocdata(queue_t *q, mblk_t *mp)
case TI_GETPEERNAME:
break;
default:
- icmp = Q_TO_ICMP(q);
- ip_output(icmp->icmp_connp, mp, q, IP_WPUT);
+ ip_wput_nondata(q, mp);
return;
}
+
switch (mi_copy_state(q, mp, &mp1)) {
case -1:
return;
@@ -5776,6 +4828,7 @@ icmp_wput_iocdata(queue_t *q, mblk_t *mp)
mi_copy_done(q, mp, EPROTO);
return;
}
+
/*
* Now we have the strbuf structure for TI_GETMYNAME
* and TI_GETPEERNAME. Next we copyout the requested
@@ -5783,8 +4836,8 @@ icmp_wput_iocdata(queue_t *q, mblk_t *mp)
*/
STRUCT_SET_HANDLE(sb, ((struct iocblk *)mp->b_rptr)->ioc_flag,
(void *)mp1->b_rptr);
- icmp = Q_TO_ICMP(q);
- if (icmp->icmp_family == AF_INET)
+
+ if (connp->conn_family == AF_INET)
addrlen = sizeof (sin_t);
else
addrlen = sizeof (sin6_t);
@@ -5793,72 +4846,37 @@ icmp_wput_iocdata(queue_t *q, mblk_t *mp)
mi_copy_done(q, mp, EINVAL);
return;
}
-
+ switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
+ case TI_GETMYNAME:
+ break;
+ case TI_GETPEERNAME:
+ if (icmp->icmp_state != TS_DATA_XFER) {
+ mi_copy_done(q, mp, ENOTCONN);
+ return;
+ }
+ break;
+ default:
+ mi_copy_done(q, mp, EPROTO);
+ return;
+ }
mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE);
-
- if (mp1 == NULL)
+ if (!mp1)
return;
- rw_enter(&icmp->icmp_rwlock, RW_READER);
+ STRUCT_FSET(sb, len, addrlen);
switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
case TI_GETMYNAME:
- error = rawip_do_getsockname(icmp, (void *)mp1->b_rptr,
+ (void) conn_getsockname(connp, (struct sockaddr *)mp1->b_wptr,
&addrlen);
break;
case TI_GETPEERNAME:
- error = rawip_do_getpeername(icmp, (void *)mp1->b_rptr,
+ (void) conn_getpeername(connp, (struct sockaddr *)mp1->b_wptr,
&addrlen);
break;
}
- rw_exit(&icmp->icmp_rwlock);
-
- if (error != 0) {
- mi_copy_done(q, mp, error);
- } else {
- mp1->b_wptr += addrlen;
- STRUCT_FSET(sb, len, addrlen);
-
- /* Copy out the address */
- mi_copyout(q, mp);
- }
-}
-
-static int
-icmp_unitdata_opt_process(queue_t *q, mblk_t *mp, int *errorp,
- void *thisdg_attrs)
-{
- struct T_unitdata_req *udreqp;
- int is_absreq_failure;
- cred_t *cr;
-
- udreqp = (struct T_unitdata_req *)mp->b_rptr;
- *errorp = 0;
-
- /*
- * All Solaris components should pass a db_credp
- * for this TPI message, hence we ASSERT.
- * But in case there is some other M_PROTO that looks
- * like a TPI message sent by some other kernel
- * component, we check and return an error.
- */
- cr = msg_getcred(mp, NULL);
- ASSERT(cr != NULL);
- if (cr == NULL)
- return (-1);
-
- *errorp = tpi_optcom_buf(q, mp, &udreqp->OPT_length,
- udreqp->OPT_offset, cr, &icmp_opt_obj,
- thisdg_attrs, &is_absreq_failure);
-
- if (*errorp != 0) {
- /*
- * Note: No special action needed in this
- * module for "is_absreq_failure"
- */
- return (-1); /* failure */
- }
- ASSERT(is_absreq_failure == 0);
- return (0); /* success */
+ mp1->b_wptr += addrlen;
+ /* Copy out the address */
+ mi_copyout(q, mp);
}
void
@@ -6013,7 +5031,7 @@ rawip_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
socklen_t len, cred_t *cr)
{
conn_t *connp = (conn_t *)proto_handle;
- int error;
+ int error;
/* All Solaris components should pass a cred for this operation. */
ASSERT(cr != NULL);
@@ -6042,14 +5060,14 @@ rawip_implicit_bind(conn_t *connp)
socklen_t len;
int error;
- if (connp->conn_icmp->icmp_family == AF_INET) {
+ if (connp->conn_family == AF_INET) {
len = sizeof (struct sockaddr_in);
sin = (sin_t *)&sin6addr;
*sin = sin_null;
sin->sin_family = AF_INET;
sin->sin_addr.s_addr = INADDR_ANY;
} else {
- ASSERT(connp->conn_icmp->icmp_family == AF_INET6);
+ ASSERT(connp->conn_family == AF_INET6);
len = sizeof (sin6_t);
sin6 = (sin6_t *)&sin6addr;
*sin6 = sin6_null;
@@ -6081,7 +5099,6 @@ rawip_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr)
return (EOPNOTSUPP);
}
-/* ARGSUSED */
int
rawip_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
socklen_t len, sock_connid_t *id, cred_t *cr)
@@ -6090,6 +5107,7 @@ rawip_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
icmp_t *icmp = connp->conn_icmp;
int error;
boolean_t did_bind = B_FALSE;
+ pid_t pid = curproc->p_pid;
/* All Solaris components should pass a cred for this operation. */
ASSERT(cr != NULL);
@@ -6106,7 +5124,7 @@ rawip_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
return (error);
}
- error = proto_verify_ip_addr(icmp->icmp_family, sa, len);
+ error = proto_verify_ip_addr(connp->conn_family, sa, len);
if (error != 0)
return (error);
@@ -6126,10 +5144,9 @@ rawip_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
/*
* set SO_DGRAM_ERRIND
*/
- icmp->icmp_dgram_errind = B_TRUE;
-
- error = rawip_do_connect(connp, sa, len, cr);
+ connp->conn_dgram_errind = B_TRUE;
+ error = rawip_do_connect(connp, sa, len, cr, pid);
if (error != 0 && did_bind) {
int unbind_err;
@@ -6139,15 +5156,15 @@ rawip_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
if (error == 0) {
*id = 0;
- (*connp->conn_upcalls->su_connected)
- (connp->conn_upper_handle, 0, NULL, -1);
+ (*connp->conn_upcalls->su_connected)(connp->conn_upper_handle,
+ 0, NULL, -1);
} else if (error < 0) {
error = proto_tlitosyserr(-error);
}
return (error);
}
-/* ARGSUSED */
+/* ARGSUSED2 */
int
rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q,
boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb)
@@ -6184,9 +5201,8 @@ rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q,
stropt_mp->b_wptr += sizeof (*stropt);
stropt = (struct stroptions *)stropt_mp->b_rptr;
stropt->so_flags = SO_WROFF | SO_HIWAT;
- stropt->so_wroff =
- (ushort_t)(icmp->icmp_max_hdr_len + icmp->icmp_is->is_wroff_extra);
- stropt->so_hiwat = icmp->icmp_recv_hiwat;
+ stropt->so_wroff = connp->conn_wroff;
+ stropt->so_hiwat = connp->conn_rcvbuf;
putnext(RD(q), stropt_mp);
/*
@@ -6207,9 +5223,9 @@ rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q,
if (error != 0)
faddrlen = 0;
opts = 0;
- if (icmp->icmp_dgram_errind)
+ if (connp->conn_dgram_errind)
opts |= SO_DGRAM_ERRIND;
- if (icmp->icmp_dontroute)
+ if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE)
opts |= SO_DONTROUTE;
(*quiesced_cb)(connp->conn_upper_handle, q, &tca,
@@ -6218,7 +5234,7 @@ rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q,
/*
* Attempts to send data up during fallback will result in it being
- * queued in udp_t. Now we push up any queued packets.
+ * queued in icmp_t. Now we push up any queued packets.
*/
mutex_enter(&icmp->icmp_recv_lock);
while (icmp->icmp_fallback_queue_head != NULL) {
@@ -6236,9 +5252,9 @@ rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q,
/*
* No longer a streams less socket
*/
- rw_enter(&icmp->icmp_rwlock, RW_WRITER);
+ mutex_enter(&connp->conn_lock);
connp->conn_flags &= ~IPCL_NONSTR;
- rw_exit(&icmp->icmp_rwlock);
+ mutex_exit(&connp->conn_lock);
mutex_exit(&icmp->icmp_recv_lock);
@@ -6250,7 +5266,7 @@ rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q,
return (0);
}
-/* ARGSUSED */
+/* ARGSUSED2 */
sock_lower_handle_t
rawip_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
uint_t *smodep, int *errorp, int flags, cred_t *credp)
@@ -6262,35 +5278,10 @@ rawip_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
return (NULL);
}
- connp = icmp_open(family, credp, errorp, flags);
+ connp = rawip_do_open(family, credp, errorp, flags);
if (connp != NULL) {
- icmp_stack_t *is;
-
- is = connp->conn_icmp->icmp_is;
connp->conn_flags |= IPCL_NONSTR;
- if (connp->conn_icmp->icmp_family == AF_INET6) {
- /* Build initial header template for transmit */
- rw_enter(&connp->conn_icmp->icmp_rwlock, RW_WRITER);
- if ((*errorp =
- icmp_build_hdrs(connp->conn_icmp)) != 0) {
- rw_exit(&connp->conn_icmp->icmp_rwlock);
- ipcl_conn_destroy(connp);
- return (NULL);
- }
- rw_exit(&connp->conn_icmp->icmp_rwlock);
- }
-
- connp->conn_icmp->icmp_recv_hiwat = is->is_recv_hiwat;
- connp->conn_icmp->icmp_xmit_hiwat = is->is_xmit_hiwat;
-
- if ((*errorp = ip_create_helper_stream(connp,
- is->is_ldi_ident)) != 0) {
- cmn_err(CE_CONT, "create of IP helper stream failed\n");
- (void) rawip_do_close(connp);
- return (NULL);
- }
-
mutex_enter(&connp->conn_lock);
connp->conn_state_flags &= ~CONN_INCIPIENT;
mutex_exit(&connp->conn_lock);
@@ -6303,14 +5294,13 @@ rawip_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
return ((sock_lower_handle_t)connp);
}
-/* ARGSUSED */
+/* ARGSUSED3 */
void
rawip_activate(sock_lower_handle_t proto_handle,
sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls, int flags,
cred_t *cr)
{
conn_t *connp = (conn_t *)proto_handle;
- icmp_stack_t *is = connp->conn_icmp->icmp_is;
struct sock_proto_props sopp;
/* All Solaris components should pass a cred for this operation. */
@@ -6321,10 +5311,9 @@ rawip_activate(sock_lower_handle_t proto_handle,
sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT |
SOCKOPT_MAXBLK | SOCKOPT_MAXPSZ | SOCKOPT_MINPSZ;
- sopp.sopp_wroff = connp->conn_icmp->icmp_max_hdr_len +
- is->is_wroff_extra;
- sopp.sopp_rxhiwat = is->is_recv_hiwat;
- sopp.sopp_rxlowat = icmp_mod_info.mi_lowat;
+ sopp.sopp_wroff = connp->conn_wroff;
+ sopp.sopp_rxhiwat = connp->conn_rcvbuf;
+ sopp.sopp_rxlowat = connp->conn_rcvlowat;
sopp.sopp_maxblk = INFPSZ;
sopp.sopp_maxpsz = IP_MAXPACKET;
sopp.sopp_minpsz = (icmp_mod_info.mi_minpsz == 1) ? 0 :
@@ -6332,113 +5321,11 @@ rawip_activate(sock_lower_handle_t proto_handle,
(*connp->conn_upcalls->su_set_proto_props)
(connp->conn_upper_handle, &sopp);
-}
-
-static int
-rawip_do_getsockname(icmp_t *icmp, struct sockaddr *sa, uint_t *salenp)
-{
- sin_t *sin = (sin_t *)sa;
- sin6_t *sin6 = (sin6_t *)sa;
-
- ASSERT(icmp != NULL);
- ASSERT(RW_LOCK_HELD(&icmp->icmp_rwlock));
- switch (icmp->icmp_family) {
- case AF_INET:
- ASSERT(icmp->icmp_ipversion == IPV4_VERSION);
- if (*salenp < sizeof (sin_t))
- return (EINVAL);
-
- *salenp = sizeof (sin_t);
- *sin = sin_null;
- sin->sin_family = AF_INET;
- if (icmp->icmp_state == TS_UNBND) {
- break;
- }
-
- if (!IN6_IS_ADDR_V4MAPPED_ANY(&icmp->icmp_v6src) &&
- !IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6src)) {
- sin->sin_addr.s_addr = V4_PART_OF_V6(icmp->icmp_v6src);
- } else {
- /*
- * INADDR_ANY
- * icmp_v6src is not set, we might be bound to
- * broadcast/multicast. Use icmp_bound_v6src as
- * local address instead (that could
- * also still be INADDR_ANY)
- */
- sin->sin_addr.s_addr =
- V4_PART_OF_V6(icmp->icmp_bound_v6src);
- }
- break;
- case AF_INET6:
-
- if (*salenp < sizeof (sin6_t))
- return (EINVAL);
-
- *salenp = sizeof (sin6_t);
- *sin6 = sin6_null;
- sin6->sin6_family = AF_INET6;
- if (icmp->icmp_state == TS_UNBND) {
- break;
- }
- if (!IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6src)) {
- sin6->sin6_addr = icmp->icmp_v6src;
- } else {
- /*
- * UNSPECIFIED
- * icmp_v6src is not set, we might be bound to
- * broadcast/multicast. Use icmp_bound_v6src as
- * local address instead (that could
- * also still be UNSPECIFIED)
- */
-
- sin6->sin6_addr = icmp->icmp_bound_v6src;
- }
- break;
- }
- return (0);
-}
-
-static int
-rawip_do_getpeername(icmp_t *icmp, struct sockaddr *sa, uint_t *salenp)
-{
- sin_t *sin = (sin_t *)sa;
- sin6_t *sin6 = (sin6_t *)sa;
-
- ASSERT(icmp != NULL);
- ASSERT(RW_LOCK_HELD(&icmp->icmp_rwlock));
-
- if (icmp->icmp_state != TS_DATA_XFER)
- return (ENOTCONN);
-
- sa->sa_family = icmp->icmp_family;
- switch (icmp->icmp_family) {
- case AF_INET:
- ASSERT(icmp->icmp_ipversion == IPV4_VERSION);
-
- if (*salenp < sizeof (sin_t))
- return (EINVAL);
-
- *salenp = sizeof (sin_t);
- *sin = sin_null;
- sin->sin_family = AF_INET;
- sin->sin_addr.s_addr =
- V4_PART_OF_V6(icmp->icmp_v6dst.sin6_addr);
- break;
- case AF_INET6:
- if (*salenp < sizeof (sin6_t))
- return (EINVAL);
-
- *salenp = sizeof (sin6_t);
- *sin6 = sin6_null;
- *sin6 = icmp->icmp_v6dst;
- break;
- }
- return (0);
+ icmp_bind_proto(connp->conn_icmp);
}
-/* ARGSUSED */
+/* ARGSUSED3 */
int
rawip_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *sa,
socklen_t *salenp, cred_t *cr)
@@ -6450,36 +5337,29 @@ rawip_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *sa,
/* All Solaris components should pass a cred for this operation. */
ASSERT(cr != NULL);
- ASSERT(icmp != NULL);
-
- rw_enter(&icmp->icmp_rwlock, RW_READER);
-
- error = rawip_do_getpeername(icmp, sa, salenp);
-
- rw_exit(&icmp->icmp_rwlock);
-
+ mutex_enter(&connp->conn_lock);
+ if (icmp->icmp_state != TS_DATA_XFER)
+ error = ENOTCONN;
+ else
+ error = conn_getpeername(connp, sa, salenp);
+ mutex_exit(&connp->conn_lock);
return (error);
}
-/* ARGSUSED */
+/* ARGSUSED3 */
int
rawip_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *sa,
socklen_t *salenp, cred_t *cr)
{
conn_t *connp = (conn_t *)proto_handle;
- icmp_t *icmp = connp->conn_icmp;
int error;
/* All Solaris components should pass a cred for this operation. */
ASSERT(cr != NULL);
- ASSERT(icmp != NULL);
- rw_enter(&icmp->icmp_rwlock, RW_READER);
-
- error = rawip_do_getsockname(icmp, sa, salenp);
-
- rw_exit(&icmp->icmp_rwlock);
-
+ mutex_enter(&connp->conn_lock);
+ error = conn_getsockname(connp, sa, salenp);
+ mutex_exit(&connp->conn_lock);
return (error);
}
@@ -6488,7 +5368,6 @@ rawip_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
const void *optvalp, socklen_t optlen, cred_t *cr)
{
conn_t *connp = (conn_t *)proto_handle;
- icmp_t *icmp = connp->conn_icmp;
int error;
/* All Solaris components should pass a cred for this operation. */
@@ -6497,7 +5376,6 @@ rawip_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
error = proto_opt_check(level, option_name, optlen, NULL,
icmp_opt_obj.odb_opt_des_arr,
icmp_opt_obj.odb_opt_arr_cnt,
- icmp_opt_obj.odb_topmost_tpiprovider,
B_TRUE, B_FALSE, cr);
if (error != 0) {
@@ -6510,19 +5388,9 @@ rawip_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
return (error);
}
- rw_enter(&icmp->icmp_rwlock, RW_WRITER);
error = icmp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level,
option_name, optlen, (uchar_t *)optvalp, (uint_t *)&optlen,
(uchar_t *)optvalp, NULL, cr);
- rw_exit(&icmp->icmp_rwlock);
-
- if (error < 0) {
- /*
- * Pass on to ip
- */
- error = ip_set_options(connp, level, option_name, optvalp,
- optlen, cr);
- }
ASSERT(error >= 0);
@@ -6535,7 +5403,6 @@ rawip_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
{
int error;
conn_t *connp = (conn_t *)proto_handle;
- icmp_t *icmp = connp->conn_icmp;
t_uscalar_t max_optbuf_len;
void *optvalp_buf;
int len;
@@ -6546,7 +5413,6 @@ rawip_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len,
icmp_opt_obj.odb_opt_des_arr,
icmp_opt_obj.odb_opt_arr_cnt,
- icmp_opt_obj.odb_topmost_tpiprovider,
B_FALSE, B_TRUE, cr);
if (error != 0) {
@@ -6557,31 +5423,25 @@ rawip_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
}
optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP);
- rw_enter(&icmp->icmp_rwlock, RW_READER);
len = icmp_opt_get(connp, level, option_name, optvalp_buf);
- rw_exit(&icmp->icmp_rwlock);
-
- if (len < 0) {
- /*
- * Pass on to IP
- */
- kmem_free(optvalp_buf, max_optbuf_len);
- return (ip_get_options(connp, level, option_name, optvalp,
- optlen, cr));
- } else {
- /*
- * update optlen and copy option value
- */
- t_uscalar_t size = MIN(len, *optlen);
- bcopy(optvalp_buf, optvalp, size);
- bcopy(&size, optlen, sizeof (size));
-
+ if (len == -1) {
kmem_free(optvalp_buf, max_optbuf_len);
- return (0);
+ return (EINVAL);
}
+
+ /*
+ * update optlen and copy option value
+ */
+ t_uscalar_t size = MIN(len, *optlen);
+
+ bcopy(optvalp_buf, optvalp, size);
+ bcopy(&size, optlen, sizeof (size));
+
+ kmem_free(optvalp_buf, max_optbuf_len);
+ return (0);
}
-/* ARGSUSED */
+/* ARGSUSED1 */
int
rawip_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr)
{
@@ -6594,7 +5454,7 @@ rawip_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr)
return (0);
}
-/* ARGSUSED */
+/* ARGSUSED2 */
int
rawip_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr)
{
@@ -6635,6 +5495,27 @@ rawip_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
/* All Solaris components should pass a cred for this operation. */
ASSERT(cr != NULL);
+ /*
+ * If we don't have a helper stream then create one.
+ * ip_create_helper_stream takes care of locking the conn_t,
+ * so this check for NULL is just a performance optimization.
+ */
+ if (connp->conn_helper_info == NULL) {
+ icmp_stack_t *is = connp->conn_icmp->icmp_is;
+
+ ASSERT(is->is_ldi_ident != NULL);
+
+ /*
+ * Create a helper stream for non-STREAMS socket.
+ */
+ error = ip_create_helper_stream(connp, is->is_ldi_ident);
+ if (error != 0) {
+ ip0dbg(("rawip_ioctl: create of IP helper stream "
+ "failed %d\n", error));
+ return (error);
+ }
+ }
+
switch (cmd) {
case ND_SET:
case ND_GET:
@@ -6658,25 +5539,25 @@ rawip_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
return (error);
}
-/* ARGSUSED */
int
rawip_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
cred_t *cr)
{
- conn_t *connp = (conn_t *)proto_handle;
- icmp_t *icmp = connp->conn_icmp;
- icmp_stack_t *is = icmp->icmp_is;
- int error = 0;
- boolean_t bypass_dgram_errind = B_FALSE;
+ sin6_t *sin6;
+ sin_t *sin = NULL;
+ uint_t srcid;
+ conn_t *connp = (conn_t *)proto_handle;
+ icmp_t *icmp = connp->conn_icmp;
+ int error = 0;
+ icmp_stack_t *is = icmp->icmp_is;
+ pid_t pid = curproc->p_pid;
+ ip_xmit_attr_t *ixa;
ASSERT(DB_TYPE(mp) == M_DATA);
/* All Solaris components should pass a cred for this operation. */
ASSERT(cr != NULL);
- /* If labeled then sockfs should have already set db_credp */
- ASSERT(!is_system_labeled() || msg_getcred(mp, NULL) != NULL);
-
/* do an implicit bind if necessary */
if (icmp->icmp_state == TS_UNBND) {
error = rawip_implicit_bind(connp);
@@ -6691,170 +5572,191 @@ rawip_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
}
}
- rw_enter(&icmp->icmp_rwlock, RW_WRITER);
-
- if (msg->msg_name != NULL && icmp->icmp_state == TS_DATA_XFER) {
- error = EISCONN;
- goto done_lock;
- }
-
- switch (icmp->icmp_family) {
- case AF_INET6: {
- sin6_t *sin6;
- ip6_pkt_t ipp_s; /* For ancillary data options */
- ip6_pkt_t *ipp = &ipp_s;
-
- sin6 = (sin6_t *)msg->msg_name;
- if (sin6 != NULL) {
- error = proto_verify_ip_addr(icmp->icmp_family,
- (struct sockaddr *)msg->msg_name, msg->msg_namelen);
- if (error != 0) {
- bypass_dgram_errind = B_TRUE;
- goto done_lock;
+ /* Protocol 255 contains full IP headers */
+ /* Read without holding lock */
+ if (icmp->icmp_hdrincl) {
+ ASSERT(connp->conn_ipversion == IPV4_VERSION);
+ if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH) {
+ if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) {
+ BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
+ freemsg(mp);
+ return (EINVAL);
}
- if (icmp->icmp_delayed_error != 0) {
- sin6_t *sin1 = (sin6_t *)msg->msg_name;
- sin6_t *sin2 = (sin6_t *)
- &icmp->icmp_delayed_addr;
-
- error = icmp->icmp_delayed_error;
- icmp->icmp_delayed_error = 0;
-
- /* Compare IP address and port */
+ }
+ error = icmp_output_hdrincl(connp, mp, cr, pid);
+ if (is->is_sendto_ignerr)
+ return (0);
+ else
+ return (error);
+ }
- if (sin1->sin6_port == sin2->sin6_port &&
- IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr,
- &sin2->sin6_addr)) {
- goto done_lock;
- }
- }
+ /* Connected? */
+ if (msg->msg_name == NULL) {
+ if (icmp->icmp_state != TS_DATA_XFER) {
+ BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
+ return (EDESTADDRREQ);
+ }
+ if (msg->msg_controllen != 0) {
+ error = icmp_output_ancillary(connp, NULL, NULL, mp,
+ NULL, msg, cr, pid);
} else {
- /*
- * Use connected address
- */
- if (icmp->icmp_state != TS_DATA_XFER) {
- BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
- error = EDESTADDRREQ;
- bypass_dgram_errind = B_TRUE;
- goto done_lock;
- }
- sin6 = &icmp->icmp_v6dst;
+ error = icmp_output_connected(connp, mp, cr, pid);
}
+ if (is->is_sendto_ignerr)
+ return (0);
+ else
+ return (error);
+ }
+ if (icmp->icmp_state == TS_DATA_XFER) {
+ BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
+ return (EISCONN);
+ }
+ error = proto_verify_ip_addr(connp->conn_family,
+ (struct sockaddr *)msg->msg_name, msg->msg_namelen);
+ if (error != 0) {
+ BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
+ return (error);
+ }
+ switch (connp->conn_family) {
+ case AF_INET6:
+ sin6 = (sin6_t *)msg->msg_name;
/* No support for mapped addresses on raw sockets */
if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
- error = EADDRNOTAVAIL;
- goto done_lock;
+ return (EADDRNOTAVAIL);
}
-
- ipp->ipp_fields = 0;
- ipp->ipp_sticky_ignored = 0;
+ srcid = sin6->__sin6_src_id;
/*
- * If options passed in, feed it for verification and handling
+ * If the local address is a mapped address return
+ * an error.
+ * It would be possible to send an IPv6 packet but the
+ * response would never make it back to the application
+ * since it is bound to a mapped address.
*/
- if (msg->msg_controllen != 0) {
- error = process_auxiliary_options(connp,
- msg->msg_control, msg->msg_controllen,
- ipp, &icmp_opt_obj, icmp_opt_set, cr);
- if (error != 0) {
- goto done_lock;
- }
+ if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) {
+ BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
+ return (EADDRNOTAVAIL);
}
- rw_exit(&icmp->icmp_rwlock);
+ if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
+ sin6->sin6_addr = ipv6_loopback;
/*
- * Destination is a native IPv6 address.
- * Send out an IPv6 format packet.
+ * We have to allocate an ip_xmit_attr_t before we grab
+ * conn_lock and we need to hold conn_lock once we've check
+ * conn_same_as_last_v6 to handle concurrent send* calls on a
+ * socket.
*/
+ if (msg->msg_controllen == 0) {
+ ixa = conn_get_ixa(connp, B_FALSE);
+ if (ixa == NULL) {
+ BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
+ return (ENOMEM);
+ }
+ } else {
+ ixa = NULL;
+ }
+ mutex_enter(&connp->conn_lock);
+ if (icmp->icmp_delayed_error != 0) {
+ sin6_t *sin2 = (sin6_t *)&icmp->icmp_delayed_addr;
- error = raw_ip_send_data_v6(connp->conn_wq, connp, mp, sin6,
- ipp);
- }
- break;
- case AF_INET: {
- sin_t *sin;
- ip4_pkt_t pktinfo;
- ip4_pkt_t *pktinfop = &pktinfo;
- ipaddr_t v4dst;
+ error = icmp->icmp_delayed_error;
+ icmp->icmp_delayed_error = 0;
- sin = (sin_t *)msg->msg_name;
- if (sin != NULL) {
- error = proto_verify_ip_addr(icmp->icmp_family,
- (struct sockaddr *)msg->msg_name, msg->msg_namelen);
- if (error != 0) {
- bypass_dgram_errind = B_TRUE;
- goto done_lock;
- }
- v4dst = sin->sin_addr.s_addr;
- if (icmp->icmp_delayed_error != 0) {
- sin_t *sin1 = (sin_t *)msg->msg_name;
- sin_t *sin2 = (sin_t *)&icmp->icmp_delayed_addr;
-
- error = icmp->icmp_delayed_error;
- icmp->icmp_delayed_error = 0;
-
- /* Compare IP address and port */
- if (sin1->sin_port == sin2->sin_port &&
- sin1->sin_addr.s_addr ==
- sin2->sin_addr.s_addr) {
- goto done_lock;
- }
+ /* Compare IP address and family */
- }
- } else {
- /*
- * Use connected address
- */
- if (icmp->icmp_state != TS_DATA_XFER) {
+ if (IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr,
+ &sin2->sin6_addr) &&
+ sin6->sin6_family == sin2->sin6_family) {
+ mutex_exit(&connp->conn_lock);
BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
- error = EDESTADDRREQ;
- bypass_dgram_errind = B_TRUE;
- goto done_lock;
+ if (ixa != NULL)
+ ixa_refrele(ixa);
+ return (error);
}
- v4dst = V4_PART_OF_V6(icmp->icmp_v6dst.sin6_addr);
}
+ if (msg->msg_controllen != 0) {
+ mutex_exit(&connp->conn_lock);
+ ASSERT(ixa == NULL);
+ error = icmp_output_ancillary(connp, NULL, sin6, mp,
+ NULL, msg, cr, pid);
+ } else if (conn_same_as_last_v6(connp, sin6) &&
+ connp->conn_lastsrcid == srcid &&
+ ipsec_outbound_policy_current(ixa)) {
+ /* icmp_output_lastdst drops conn_lock */
+ error = icmp_output_lastdst(connp, mp, cr, pid, ixa);
+ } else {
+ /* icmp_output_newdst drops conn_lock */
+ error = icmp_output_newdst(connp, mp, NULL, sin6, cr,
+ pid, ixa);
+ }
+ ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
+ if (is->is_sendto_ignerr)
+ return (0);
+ else
+ return (error);
+ case AF_INET:
+ sin = (sin_t *)msg->msg_name;
-
- pktinfop->ip4_ill_index = 0;
- pktinfop->ip4_addr = INADDR_ANY;
+ if (sin->sin_addr.s_addr == INADDR_ANY)
+ sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
/*
- * If options passed in, feed it for verification and handling
+ * We have to allocate an ip_xmit_attr_t before we grab
+ * conn_lock and we need to hold conn_lock once we've check
+ * conn_same_as_last_v6 to handle concurrent send* on a socket.
*/
- if (msg->msg_controllen != 0) {
- error = process_auxiliary_options(connp,
- msg->msg_control, msg->msg_controllen,
- pktinfop, &icmp_opt_obj, icmp_opt_set, cr);
- if (error != 0) {
- goto done_lock;
+ if (msg->msg_controllen == 0) {
+ ixa = conn_get_ixa(connp, B_FALSE);
+ if (ixa == NULL) {
+ BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
+ return (ENOMEM);
}
+ } else {
+ ixa = NULL;
}
- rw_exit(&icmp->icmp_rwlock);
+ mutex_enter(&connp->conn_lock);
+ if (icmp->icmp_delayed_error != 0) {
+ sin_t *sin2 = (sin_t *)&icmp->icmp_delayed_addr;
- error = raw_ip_send_data_v4(connp->conn_wq, connp, mp,
- v4dst, pktinfop);
- break;
- }
+ error = icmp->icmp_delayed_error;
+ icmp->icmp_delayed_error = 0;
- default:
- ASSERT(0);
- }
+ /* Compare IP address */
- goto done;
+ if (sin->sin_addr.s_addr == sin2->sin_addr.s_addr) {
+ mutex_exit(&connp->conn_lock);
+ BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
+ if (ixa != NULL)
+ ixa_refrele(ixa);
+ return (error);
+ }
+ }
-done_lock:
- rw_exit(&icmp->icmp_rwlock);
- if (error != 0) {
- ASSERT(mp != NULL);
- freemsg(mp);
+ if (msg->msg_controllen != 0) {
+ mutex_exit(&connp->conn_lock);
+ ASSERT(ixa == NULL);
+ error = icmp_output_ancillary(connp, sin, NULL, mp,
+ NULL, msg, cr, pid);
+ } else if (conn_same_as_last_v4(connp, sin) &&
+ ipsec_outbound_policy_current(ixa)) {
+ /* icmp_output_lastdst drops conn_lock */
+ error = icmp_output_lastdst(connp, mp, cr, pid, ixa);
+ } else {
+ /* icmp_output_newdst drops conn_lock */
+ error = icmp_output_newdst(connp, mp, sin, NULL, cr,
+ pid, ixa);
+ }
+ ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
+ if (is->is_sendto_ignerr)
+ return (0);
+ else
+ return (error);
+ default:
+ return (EINVAL);
}
-done:
- if (bypass_dgram_errind)
- return (error);
- return (icmp->icmp_dgram_errind ? error : 0);
}
sock_downcalls_t sock_rawip_downcalls = {
diff --git a/usr/src/uts/common/inet/ip/icmp_opt_data.c b/usr/src/uts/common/inet/ip/icmp_opt_data.c
index 8bee9827db..ff0310de0c 100644
--- a/usr/src/uts/common/inet/ip/icmp_opt_data.c
+++ b/usr/src/uts/common/inet/ip/icmp_opt_data.c
@@ -36,23 +36,11 @@
#include <inet/common.h>
#include <netinet/ip6.h>
#include <inet/ip.h>
-/*
- * MK_XXX Following 2 includes temporary to import ip6_rthdr_t
- * definition. May not be needed if we fix ip6_dg_snd_attrs_t
- * to do all extension headers in identical manner.
- */
-#include <net/if.h>
-#include <inet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/ip_mroute.h>
#include <inet/optcom.h>
-
-
-extern int icmp_opt_default(queue_t *, int, int, uchar_t *);
-extern int icmp_tpi_opt_get(queue_t *, int, int, uchar_t *);
-extern int icmp_tpi_opt_set(queue_t *, uint_t, int, int, uint_t, uchar_t *,
- uint_t *, uchar_t *, void *, cred_t *, mblk_t *);
+#include <inet/rawip_impl.h>
/*
* Table of all known options handled on a ICMP protocol stack.
@@ -63,250 +51,252 @@ extern int icmp_tpi_opt_set(queue_t *, uint_t, int, int, uint_t, uchar_t *,
*/
opdes_t icmp_opt_arr[] = {
-{ SO_DEBUG, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
-{ SO_DONTROUTE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
-{ SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0
+{ SO_DEBUG, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SO_DONTROUTE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
},
-{ SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
-{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
+{ SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
#ifdef SO_PROTOTYPE
/*
* icmp will only allow IPPROTO_ICMP for non-privileged streams
* that check is made on an adhoc basis.
*/
-{ SO_PROTOTYPE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
+{ SO_PROTOTYPE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
#endif
-{ SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
-{ SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
-{ SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
-{ SO_SNDTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
+{ SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SO_SNDTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
sizeof (struct timeval), 0 },
-{ SO_RCVTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ SO_RCVTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
sizeof (struct timeval), 0 },
-{ SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int),
+{ SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
0 },
-{ SO_TIMESTAMP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0
+{ SO_TIMESTAMP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
},
-{ SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int),
+{ SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
0 },
-{ SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int),
+{ SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
0 },
-{ SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, OP_PASSNEXT, sizeof (int),
+{ SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int),
0 },
-{ SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
+{ SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
{ IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP,
- (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT),
+ (OP_VARLEN|OP_NODEFAULT),
IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
{ T_IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP,
- (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT),
+ (OP_VARLEN|OP_NODEFAULT),
IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
-{ IP_HDRINCL, IPPROTO_IP, OA_R, OA_RW, OP_RAW, OP_PASSNEXT,
+{ IP_HDRINCL, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0,
sizeof (int), 0 },
-{ IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
-{ T_IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
-{ IP_TTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
+{ IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ T_IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ IP_TTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
-{ IP_MULTICAST_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ IP_MULTICAST_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0,
sizeof (struct in_addr), 0 /* INADDR_ANY */ },
-{ IP_MULTICAST_LOOP, IPPROTO_IP, OA_RW, OA_RW, OP_NP, (OP_PASSNEXT|OP_DEF_FN),
+{ IP_MULTICAST_LOOP, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
sizeof (uchar_t), -1 /* not initialized */},
-{ IP_MULTICAST_TTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, (OP_PASSNEXT|OP_DEF_FN),
+{ IP_MULTICAST_TTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
sizeof (uchar_t), -1 /* not initialized */ },
-{ IP_ADD_MEMBERSHIP, IPPROTO_IP, OA_X, OA_X, OP_NP, (OP_PASSNEXT|OP_NODEFAULT),
+{ IP_ADD_MEMBERSHIP, IPPROTO_IP, OA_X, OA_X, OP_NP, OP_NODEFAULT,
sizeof (struct ip_mreq), -1 /* not initialized */ },
-{ IP_DROP_MEMBERSHIP, IPPROTO_IP, OA_X, OA_X, OP_NP, (OP_PASSNEXT|OP_NODEFAULT),
+{ IP_DROP_MEMBERSHIP, IPPROTO_IP, OA_X, OA_X, OP_NP, OP_NODEFAULT,
sizeof (struct ip_mreq), 0 },
-{ IP_BLOCK_SOURCE, IPPROTO_IP, OA_X, OA_X, OP_NP, (OP_PASSNEXT|OP_NODEFAULT),
+{ IP_BLOCK_SOURCE, IPPROTO_IP, OA_X, OA_X, OP_NP, OP_NODEFAULT,
sizeof (struct ip_mreq_source), -1 },
-{ IP_UNBLOCK_SOURCE, IPPROTO_IP, OA_X, OA_X, OP_NP, (OP_PASSNEXT|OP_NODEFAULT),
+{ IP_UNBLOCK_SOURCE, IPPROTO_IP, OA_X, OA_X, OP_NP, OP_NODEFAULT,
sizeof (struct ip_mreq_source), -1 },
{ IP_ADD_SOURCE_MEMBERSHIP, IPPROTO_IP, OA_X, OA_X, OP_NP,
- (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct ip_mreq_source), -1 },
+ OP_NODEFAULT, sizeof (struct ip_mreq_source), -1 },
{ IP_DROP_SOURCE_MEMBERSHIP, IPPROTO_IP, OA_X, OA_X, OP_NP,
- (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct ip_mreq_source), -1 },
+ OP_NODEFAULT, sizeof (struct ip_mreq_source), -1 },
-{ IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, (OP_PASSNEXT|OP_NODEFAULT),
+{ IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
sizeof (ipsec_req_t), -1 /* not initialized */ },
-{ IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0,
sizeof (int), 0 /* no ifindex */ },
-{ IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, OP_PASSNEXT,
+{ IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0,
sizeof (int), 0 },
{ IP_BROADCAST_TTL, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0, sizeof (uchar_t),
0 /* disabled */ },
-{ IP_RECVIF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
+{ IP_RECVIF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ IP_PKTINFO, IPPROTO_IP, OA_RW, OA_RW, OP_NP,
- (OP_PASSNEXT|OP_NODEFAULT|OP_VARLEN),
+ (OP_NODEFAULT|OP_VARLEN),
sizeof (struct in_pktinfo), -1 /* not initialized */ },
-{ IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, OP_PASSNEXT,
+{ IP_DONTFRAG, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+
+{ IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0,
sizeof (in_addr_t), -1 /* not initialized */ },
{ MRT_INIT, IPPROTO_IP, 0, OA_X, OP_CONFIG,
- (OP_PASSNEXT|OP_NODEFAULT), sizeof (int),
+ OP_NODEFAULT, sizeof (int),
-1 /* not initialized */ },
{ MRT_DONE, IPPROTO_IP, 0, OA_X, OP_CONFIG,
- (OP_PASSNEXT|OP_NODEFAULT), 0, -1 /* not initialized */ },
+ OP_NODEFAULT, 0, -1 /* not initialized */ },
-{ MRT_ADD_VIF, IPPROTO_IP, 0, OA_X, OP_CONFIG, (OP_PASSNEXT|OP_NODEFAULT),
+{ MRT_ADD_VIF, IPPROTO_IP, 0, OA_X, OP_CONFIG, OP_NODEFAULT,
sizeof (struct vifctl), -1 /* not initialized */ },
-{ MRT_DEL_VIF, IPPROTO_IP, 0, OA_X, OP_CONFIG, (OP_PASSNEXT|OP_NODEFAULT),
+{ MRT_DEL_VIF, IPPROTO_IP, 0, OA_X, OP_CONFIG, OP_NODEFAULT,
sizeof (vifi_t), -1 /* not initialized */ },
-{ MRT_ADD_MFC, IPPROTO_IP, 0, OA_X, OP_CONFIG, (OP_PASSNEXT|OP_NODEFAULT),
+{ MRT_ADD_MFC, IPPROTO_IP, 0, OA_X, OP_CONFIG, OP_NODEFAULT,
sizeof (struct mfcctl), -1 /* not initialized */ },
-{ MRT_DEL_MFC, IPPROTO_IP, 0, OA_X, OP_CONFIG, (OP_PASSNEXT|OP_NODEFAULT),
+{ MRT_DEL_MFC, IPPROTO_IP, 0, OA_X, OP_CONFIG, OP_NODEFAULT,
sizeof (struct mfcctl), -1 /* not initialized */ },
-{ MRT_VERSION, IPPROTO_IP, OA_R, OA_R, OP_NP, (OP_PASSNEXT|OP_NODEFAULT),
+{ MRT_VERSION, IPPROTO_IP, OA_R, OA_R, OP_NP, OP_NODEFAULT,
sizeof (int), -1 /* not initialized */ },
{ MRT_ASSERT, IPPROTO_IP, 0, OA_RW, OP_CONFIG,
- (OP_PASSNEXT|OP_NODEFAULT),
+ OP_NODEFAULT,
sizeof (int), -1 /* not initialized */ },
{ MCAST_JOIN_GROUP, IPPROTO_IP, OA_X, OA_X, OP_NP,
- (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_req),
+ OP_NODEFAULT, sizeof (struct group_req),
-1 /* not initialized */ },
{ MCAST_LEAVE_GROUP, IPPROTO_IP, OA_X, OA_X, OP_NP,
- (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_req),
+ OP_NODEFAULT, sizeof (struct group_req),
-1 /* not initialized */ },
{ MCAST_BLOCK_SOURCE, IPPROTO_IP, OA_X, OA_X, OP_NP,
- (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_source_req),
+ OP_NODEFAULT, sizeof (struct group_source_req),
-1 /* not initialized */ },
{ MCAST_UNBLOCK_SOURCE, IPPROTO_IP, OA_X, OA_X, OP_NP,
- (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_source_req),
+ OP_NODEFAULT, sizeof (struct group_source_req),
-1 /* not initialized */ },
{ MCAST_JOIN_SOURCE_GROUP, IPPROTO_IP, OA_X, OA_X, OP_NP,
- (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_source_req),
+ OP_NODEFAULT, sizeof (struct group_source_req),
-1 /* not initialized */ },
{ MCAST_LEAVE_SOURCE_GROUP, IPPROTO_IP, OA_X, OA_X, OP_NP,
- (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_source_req),
+ OP_NODEFAULT, sizeof (struct group_source_req),
-1 /* not initialized */ },
-{ IPV6_MULTICAST_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ IPV6_MULTICAST_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
sizeof (int), 0 },
{ IPV6_MULTICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
- (OP_PASSNEXT|OP_DEF_FN), sizeof (int), -1 /* not initialized */ },
+ OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
{ IPV6_MULTICAST_LOOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
- (OP_PASSNEXT|OP_DEF_FN), sizeof (int), -1 /* not initialized */},
+ OP_DEF_FN, sizeof (int), -1 /* not initialized */},
-{ IPV6_JOIN_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP, (OP_PASSNEXT|OP_NODEFAULT),
+{ IPV6_JOIN_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP, OP_NODEFAULT,
sizeof (struct ipv6_mreq), -1 /* not initialized */ },
-{ IPV6_LEAVE_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP, (OP_PASSNEXT|OP_NODEFAULT),
+{ IPV6_LEAVE_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP, OP_NODEFAULT,
sizeof (struct ipv6_mreq), -1 /* not initialized */ },
-{ IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, (OP_PASSNEXT|OP_DEF_FN),
+{ IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
sizeof (int), -1 /* not initialized */ },
-{ IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
sizeof (int), 0 /* no ifindex */ },
-{ IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, OP_PASSNEXT,
+{ IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0,
sizeof (int), 0 },
-{ IPV6_CHECKSUM, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int),
+{ IPV6_CHECKSUM, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
-1 },
{ ICMP6_FILTER, IPPROTO_ICMPV6, OA_RW, OA_RW, OP_NP, OP_DEF_FN|OP_VARLEN,
sizeof (icmp6_filter_t), 0 },
{ IPV6_PKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
- (OP_PASSNEXT|OP_NODEFAULT|OP_VARLEN),
+ (OP_NODEFAULT|OP_VARLEN),
sizeof (struct in6_pktinfo), -1 /* not initialized */ },
{ IPV6_HOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
- (OP_PASSNEXT|OP_NODEFAULT|OP_VARLEN),
+ (OP_NODEFAULT|OP_VARLEN),
sizeof (int), -1 /* not initialized */ },
{ IPV6_NEXTHOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
- (OP_PASSNEXT|OP_NODEFAULT|OP_VARLEN),
+ (OP_NODEFAULT|OP_VARLEN),
sizeof (sin6_t), -1 /* not initialized */ },
{ IPV6_HOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
- (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT),
+ (OP_VARLEN|OP_NODEFAULT),
MAX_EHDR_LEN, -1 /* not initialized */ },
{ IPV6_DSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
- (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT),
+ (OP_VARLEN|OP_NODEFAULT),
MAX_EHDR_LEN, -1 /* not initialized */ },
{ IPV6_RTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
- (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT),
+ (OP_VARLEN|OP_NODEFAULT),
MAX_EHDR_LEN, -1 /* not initialized */ },
{ IPV6_RTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
- (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT),
+ (OP_VARLEN|OP_NODEFAULT),
MAX_EHDR_LEN, -1 /* not initialized */ },
{ IPV6_TCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
- (OP_PASSNEXT|OP_NODEFAULT|OP_VARLEN),
+ (OP_NODEFAULT|OP_VARLEN),
sizeof (int), -1 /* not initialized */ },
-{ IPV6_PATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ IPV6_PATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
sizeof (struct ip6_mtuinfo), -1 },
-{ IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
sizeof (int), 0 },
-{ IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
sizeof (int), 0 },
-{ IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
sizeof (int), 0 },
-{ IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
sizeof (int), 0 },
-{ IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
sizeof (int), 0 },
-{ IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
sizeof (int), 0 },
-{ _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
sizeof (int), 0 },
-{ IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
sizeof (int), 0 },
-{ IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
sizeof (int), 0 },
-{ IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
sizeof (int), 0 },
-{ IPV6_RECVPATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ IPV6_RECVPATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
sizeof (int), 0 },
-{ IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
sizeof (int), 0 },
-{ IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, (OP_PASSNEXT|OP_NODEFAULT),
+{ IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
sizeof (ipsec_req_t), -1 /* not initialized */ },
-{ IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
sizeof (uint32_t), IPV6_PREFER_SRC_DEFAULT },
{ MCAST_JOIN_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP,
- (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_req),
+ OP_NODEFAULT, sizeof (struct group_req),
-1 /* not initialized */ },
{ MCAST_LEAVE_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP,
- (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_req),
+ OP_NODEFAULT, sizeof (struct group_req),
-1 /* not initialized */ },
{ MCAST_BLOCK_SOURCE, IPPROTO_IPV6, OA_X, OA_X, OP_NP,
- (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_source_req),
+ OP_NODEFAULT, sizeof (struct group_source_req),
-1 /* not initialized */ },
{ MCAST_UNBLOCK_SOURCE, IPPROTO_IPV6, OA_X, OA_X, OP_NP,
- (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_source_req),
+ OP_NODEFAULT, sizeof (struct group_source_req),
-1 /* not initialized */ },
{ MCAST_JOIN_SOURCE_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP,
- (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_source_req),
+ OP_NODEFAULT, sizeof (struct group_source_req),
-1 /* not initialized */ },
{ MCAST_LEAVE_SOURCE_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP,
- (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_source_req),
+ OP_NODEFAULT, sizeof (struct group_source_req),
-1 /* not initialized */ },
};
@@ -342,9 +332,8 @@ uint_t icmp_max_optsize; /* initialized when ICMP driver is loaded */
optdb_obj_t icmp_opt_obj = {
icmp_opt_default, /* ICMP default value function pointer */
- icmp_tpi_opt_get, /* ICMP get function pointer */
- icmp_tpi_opt_set, /* ICMP set function pointer */
- B_TRUE, /* ICMP is tpi provider */
+ icmp_tpi_opt_get, /* ICMP get function pointer */
+ icmp_tpi_opt_set, /* ICMP set function pointer */
ICMP_OPT_ARR_CNT, /* ICMP option database count of entries */
icmp_opt_arr, /* ICMP option database */
ICMP_VALID_LEVELS_CNT, /* ICMP valid level count of entries */
diff --git a/usr/src/uts/common/inet/ip/igmp.c b/usr/src/uts/common/inet/ip/igmp.c
index 5eff11af14..9e6b552a61 100644
--- a/usr/src/uts/common/inet/ip/igmp.c
+++ b/usr/src/uts/common/inet/ip/igmp.c
@@ -56,6 +56,7 @@
#include <netinet/igmp_var.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
+#include <inet/ipsec_impl.h>
#include <inet/common.h>
#include <inet/mi.h>
@@ -66,9 +67,8 @@
#include <inet/ip_listutils.h>
#include <netinet/igmp.h>
+#include <inet/ip_ndp.h>
#include <inet/ip_if.h>
-#include <net/pfkeyv2.h>
-#include <inet/ipsec_info.h>
static uint_t igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill);
static uint_t igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen);
@@ -76,14 +76,13 @@ static uint_t mld_query_in(mld_hdr_t *mldh, ill_t *ill);
static uint_t mldv2_query_in(mld2q_t *mld2q, ill_t *ill, int mldlen);
static void igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr);
static void mld_sendpkt(ilm_t *ilm, uchar_t type, const in6_addr_t *v6addr);
-static void igmpv3_sendrpt(ipif_t *ipif, mrec_t *reclist);
+static void igmpv3_sendrpt(ill_t *ill, mrec_t *reclist);
static void mldv2_sendrpt(ill_t *ill, mrec_t *reclist);
static mrec_t *mcast_bldmrec(mcast_record_t type, in6_addr_t *grp,
slist_t *srclist, mrec_t *next);
static void mcast_init_rtx(ill_t *ill, rtx_state_t *rtxp,
mcast_record_t rtype, slist_t *flist);
static mrec_t *mcast_merge_rtx(ilm_t *ilm, mrec_t *rp, slist_t *flist);
-static void mcast_signal_restart_thread(ip_stack_t *ipst);
/*
* Macros used to do timer len conversions. Timer values are always
@@ -122,11 +121,12 @@ static void mcast_signal_restart_thread(ip_stack_t *ipst);
* The first multicast join will trigger the igmp timers / mld timers
* The unit for next is milliseconds.
*/
-static void
+void
igmp_start_timers(unsigned next, ip_stack_t *ipst)
{
int time_left;
int ret;
+ timeout_id_t id;
ASSERT(next != 0 && next != INFINITY);
@@ -173,9 +173,10 @@ igmp_start_timers(unsigned next, ip_stack_t *ipst)
mutex_exit(&ipst->ips_igmp_timer_lock);
return;
}
+ id = ipst->ips_igmp_timeout_id;
mutex_exit(&ipst->ips_igmp_timer_lock);
- ret = untimeout(ipst->ips_igmp_timeout_id);
+ ret = untimeout(id);
mutex_enter(&ipst->ips_igmp_timer_lock);
/*
* The timeout was cancelled, or the timeout handler
@@ -207,11 +208,12 @@ igmp_start_timers(unsigned next, ip_stack_t *ipst)
* mld_start_timers:
* The unit for next is milliseconds.
*/
-static void
+void
mld_start_timers(unsigned next, ip_stack_t *ipst)
{
int time_left;
int ret;
+ timeout_id_t id;
ASSERT(next != 0 && next != INFINITY);
@@ -257,9 +259,10 @@ mld_start_timers(unsigned next, ip_stack_t *ipst)
mutex_exit(&ipst->ips_mld_timer_lock);
return;
}
+ id = ipst->ips_mld_timeout_id;
mutex_exit(&ipst->ips_mld_timer_lock);
- ret = untimeout(ipst->ips_mld_timeout_id);
+ ret = untimeout(id);
mutex_enter(&ipst->ips_mld_timer_lock);
/*
* The timeout was cancelled, or the timeout handler
@@ -294,9 +297,8 @@ mld_start_timers(unsigned next, ip_stack_t *ipst)
* Callers of igmp_input() may need to reinitialize variables that were copied
* from the mblk as this calls pullupmsg().
*/
-/* ARGSUSED */
mblk_t *
-igmp_input(queue_t *q, mblk_t *mp, ill_t *ill)
+igmp_input(mblk_t *mp, ip_recv_attr_t *ira)
{
igmpa_t *igmpa;
ipha_t *ipha = (ipha_t *)(mp->b_rptr);
@@ -304,22 +306,22 @@ igmp_input(queue_t *q, mblk_t *mp, ill_t *ill)
ilm_t *ilm;
uint32_t src, dst;
uint32_t group;
+ in6_addr_t v6group;
uint_t next;
ipif_t *ipif;
- ip_stack_t *ipst;
- ilm_walker_t ilw;
+ ill_t *ill = ira->ira_ill;
+ ip_stack_t *ipst = ill->ill_ipst;
- ASSERT(ill != NULL);
ASSERT(!ill->ill_isv6);
- ipst = ill->ill_ipst;
++ipst->ips_igmpstat.igps_rcv_total;
mblklen = MBLKL(mp);
- if (mblklen < 1 || mblklen < (iphlen = IPH_HDR_LENGTH(ipha))) {
+ iphlen = ira->ira_ip_hdr_length;
+ if (mblklen < 1 || mblklen < iphlen) {
++ipst->ips_igmpstat.igps_rcv_tooshort;
goto bad_pkt;
}
- igmplen = ntohs(ipha->ipha_length) - iphlen;
+ igmplen = ira->ira_pktlen - iphlen;
/*
* Since msg sizes are more variable with v3, just pullup the
* whole thing now.
@@ -342,13 +344,6 @@ igmp_input(queue_t *q, mblk_t *mp, ill_t *ill)
++ipst->ips_igmpstat.igps_rcv_tooshort;
goto bad_pkt;
}
- /*
- * Validate checksum
- */
- if (IP_CSUM(mp, iphlen, 0)) {
- ++ipst->ips_igmpstat.igps_rcv_badsum;
- goto bad_pkt;
- }
igmpa = (igmpa_t *)(&mp->b_rptr[iphlen]);
src = ipha->ipha_src;
@@ -400,9 +395,8 @@ igmp_input(queue_t *q, mblk_t *mp, ill_t *ill)
1,
SL_TRACE,
"igmp_input: we are only "
- "member src 0x%x ipif_local 0x%x",
- (int)ntohl(src),
- (int)ntohl(ipif->ipif_lcl_addr));
+ "member src 0x%x\n",
+ (int)ntohl(src));
}
mutex_exit(&ill->ill_lock);
return (mp);
@@ -445,15 +439,18 @@ igmp_input(queue_t *q, mblk_t *mp, ill_t *ill)
* terminology, stop our timer for that group and 'clear
* flag' i.e. mark as IGMP_OTHERMEMBER.
*/
- ilm = ilm_walker_start(&ilw, ill);
- for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) {
- if (ilm->ilm_addr == group) {
- ++ipst->ips_igmpstat.igps_rcv_ourreports;
- ilm->ilm_timer = INFINITY;
- ilm->ilm_state = IGMP_OTHERMEMBER;
- }
- }
- ilm_walker_finish(&ilw);
+ rw_enter(&ill->ill_mcast_lock, RW_WRITER);
+ IN6_IPADDR_TO_V4MAPPED(group, &v6group);
+ for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
+ if (!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, &v6group))
+ continue;
+
+ ++ipst->ips_igmpstat.igps_rcv_ourreports;
+ ilm->ilm_timer = INFINITY;
+ ilm->ilm_state = IGMP_OTHERMEMBER;
+ } /* for */
+ rw_exit(&ill->ill_mcast_lock);
+ ill_mcast_timer_start(ill->ill_ipst);
break;
case IGMP_V3_MEMBERSHIP_REPORT:
@@ -482,11 +479,11 @@ igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill)
int timer;
uint_t next, current;
ip_stack_t *ipst;
- ilm_walker_t ilw;
ipst = ill->ill_ipst;
++ipst->ips_igmpstat.igps_rcv_queries;
+ rw_enter(&ill->ill_mcast_lock, RW_WRITER);
/*
* In the IGMPv2 specification, there are 3 states and a flag.
*
@@ -506,9 +503,6 @@ igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill)
* Remember that the querier on this interface is old,
* and set the timer to the value in RFC 1112.
*/
-
-
- mutex_enter(&ill->ill_lock);
ill->ill_mcast_v1_time = 0;
ill->ill_mcast_v1_tset = 1;
if (ill->ill_mcast_type != IGMP_V1_ROUTER) {
@@ -517,13 +511,14 @@ igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill)
atomic_add_16(&ill->ill_ifptr->illif_mcast_v1, 1);
ill->ill_mcast_type = IGMP_V1_ROUTER;
}
- mutex_exit(&ill->ill_lock);
timer = SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY);
if (ipha->ipha_dst != htonl(INADDR_ALLHOSTS_GROUP) ||
igmpa->igmpa_group != 0) {
++ipst->ips_igmpstat.igps_rcv_badqueries;
+ rw_exit(&ill->ill_mcast_lock);
+ ill_mcast_timer_start(ill->ill_ipst);
return (0);
}
@@ -537,6 +532,8 @@ igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill)
group = igmpa->igmpa_group;
if (group != 0 && (!CLASSD(group))) {
++ipst->ips_igmpstat.igps_rcv_badqueries;
+ rw_exit(&ill->ill_mcast_lock);
+ ill_mcast_timer_start(ill->ill_ipst);
return (0);
}
@@ -545,7 +542,6 @@ igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill)
* ONLY IF current state is v3. Let things be if current
* state if v1 but do reset the v2-querier-present timer.
*/
- mutex_enter(&ill->ill_lock);
if (ill->ill_mcast_type == IGMP_V3_ROUTER) {
ip1dbg(("Received IGMPv2 Query on %s, switching mode "
"to IGMP_V2_ROUTER", ill->ill_name));
@@ -554,18 +550,15 @@ igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill)
}
ill->ill_mcast_v2_time = 0;
ill->ill_mcast_v2_tset = 1;
- mutex_exit(&ill->ill_lock);
timer = DSEC_TO_MSEC((int)igmpa->igmpa_code);
}
if (ip_debug > 1) {
- mutex_enter(&ill->ill_lock);
(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
"igmp_input: TIMER = igmp_code %d igmp_type 0x%x",
(int)ntohs(igmpa->igmpa_code),
(int)ntohs(igmpa->igmpa_type));
- mutex_exit(&ill->ill_lock);
}
/*
@@ -582,11 +575,9 @@ igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill)
*/
next = (unsigned)INFINITY;
- ilm = ilm_walker_start(&ilw, ill);
- mutex_enter(&ill->ill_lock);
current = CURRENT_MSTIME;
+ for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
- for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) {
/*
* A multicast router joins INADDR_ANY address
* to enable promiscuous reception of all
@@ -608,8 +599,12 @@ igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill)
}
}
}
- mutex_exit(&ill->ill_lock);
- ilm_walker_finish(&ilw);
+ rw_exit(&ill->ill_mcast_lock);
+ /*
+ * No packets have been sent above - no
+ * ill_mcast_send_queued is needed.
+ */
+ ill_mcast_timer_start(ill->ill_ipst);
return (next);
}
@@ -623,7 +618,6 @@ igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen)
ipaddr_t *src_array;
uint8_t qrv;
ip_stack_t *ipst;
- ilm_walker_t ilw;
ipst = ill->ill_ipst;
/* make sure numsrc matches packet size */
@@ -636,6 +630,8 @@ igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen)
++ipst->ips_igmpstat.igps_rcv_queries;
+ rw_enter(&ill->ill_mcast_lock, RW_WRITER);
+
if ((mrd = (uint_t)igmp3qa->igmp3qa_mxrc) >= IGMP_V3_MAXRT_FPMIN) {
uint_t hdrval, mant, exp;
hdrval = (uint_t)igmp3qa->igmp3qa_mxrc;
@@ -669,12 +665,11 @@ igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen)
* sooner than the delay we calculated for this response, then
* no action is required (RFC3376 section 5.2 rule 1)
*/
- mutex_enter(&ill->ill_lock);
if (ill->ill_global_timer < (current + delay)) {
- mutex_exit(&ill->ill_lock);
+ rw_exit(&ill->ill_mcast_lock);
+ ill_mcast_timer_start(ill->ill_ipst);
return (next);
}
- mutex_exit(&ill->ill_lock);
/*
* Now take action depending upon query type:
@@ -687,16 +682,11 @@ igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen)
* greater than our calculated delay, so reset it to
* our delay (random value in range [0, response time]).
*/
- mutex_enter(&ill->ill_lock);
ill->ill_global_timer = current + delay;
- mutex_exit(&ill->ill_lock);
next = delay;
-
} else {
/* group or group/source specific query */
- ilm = ilm_walker_start(&ilw, ill);
- mutex_enter(&ill->ill_lock);
- for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) {
+ for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
if (!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr) ||
(ilm->ilm_addr == htonl(INADDR_ANY)) ||
(ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP)) ||
@@ -750,13 +740,21 @@ group_query:
next = ilm->ilm_timer;
ilm->ilm_timer += current;
}
- mutex_exit(&ill->ill_lock);
- ilm_walker_finish(&ilw);
}
+ rw_exit(&ill->ill_mcast_lock);
+ /*
+ * No packets have been sent above - no
+ * ill_mcast_send_queued is needed.
+ */
+ ill_mcast_timer_start(ill->ill_ipst);
return (next);
}
+/*
+ * Caller holds ill_mcast_lock. We queue the packet using ill_mcast_queue
+ * and it gets sent after the lock is dropped.
+ */
void
igmp_joingroup(ilm_t *ilm)
{
@@ -764,27 +762,21 @@ igmp_joingroup(ilm_t *ilm)
ill_t *ill;
ip_stack_t *ipst = ilm->ilm_ipst;
- ill = ilm->ilm_ipif->ipif_ill;
+ ill = ilm->ilm_ill;
- ASSERT(IAM_WRITER_ILL(ill));
- ASSERT(ilm->ilm_ill == NULL && !ilm->ilm_ipif->ipif_isv6);
+ ASSERT(!ill->ill_isv6);
+ ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock));
- mutex_enter(&ill->ill_lock);
if (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP)) {
ilm->ilm_rtx.rtx_timer = INFINITY;
ilm->ilm_state = IGMP_OTHERMEMBER;
- mutex_exit(&ill->ill_lock);
} else {
ip1dbg(("Querier mode %d, sending report, group %x\n",
ill->ill_mcast_type, htonl(ilm->ilm_addr)));
if (ill->ill_mcast_type == IGMP_V1_ROUTER) {
- mutex_exit(&ill->ill_lock);
igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0);
- mutex_enter(&ill->ill_lock);
} else if (ill->ill_mcast_type == IGMP_V2_ROUTER) {
- mutex_exit(&ill->ill_lock);
igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0);
- mutex_enter(&ill->ill_lock);
} else if (ill->ill_mcast_type == IGMP_V3_ROUTER) {
mrec_t *rp;
mcast_record_t rtype;
@@ -802,9 +794,7 @@ igmp_joingroup(ilm_t *ilm)
ALLOW_NEW_SOURCES : CHANGE_TO_EXCLUDE;
rp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
ilm->ilm_filter, NULL);
- mutex_exit(&ill->ill_lock);
- igmpv3_sendrpt(ilm->ilm_ipif, rp);
- mutex_enter(&ill->ill_lock);
+ igmpv3_sendrpt(ill, rp);
/*
* Set up retransmission state. Timer is set below,
* for both v3 and older versions.
@@ -820,35 +810,33 @@ igmp_joingroup(ilm_t *ilm)
timer = ilm->ilm_rtx.rtx_timer;
ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME;
ilm->ilm_state = IGMP_IREPORTEDLAST;
- mutex_exit(&ill->ill_lock);
/*
- * We need to restart the IGMP timers, but we can't do it here
- * since we're inside the IPSQ and thus igmp_start_timers() ->
- * untimeout() (inside the IPSQ, waiting for a running timeout
- * to finish) could deadlock with igmp_timeout_handler() ->
- * ipsq_enter() (running the timeout, waiting to get inside
- * the IPSQ). We also can't just delay it until after we
- * ipsq_exit() since we could be inside more than one IPSQ and
- * thus still have the other IPSQs pinned after we exit -- and
- * igmp_start_timers() may be trying to enter one of those.
- * Instead, signal a dedicated thread that will do it for us.
+ * We are holding ill_mcast_lock here and the timeout
+ * handler (igmp_timeout_handler_per_ill) acquires that
+ * lock. Hence we can't call igmp_start_timers since it could
+ * deadlock in untimeout().
+ * Instead the thread which drops ill_mcast_lock will have
+ * to call ill_mcast_timer_start().
*/
mutex_enter(&ipst->ips_igmp_timer_lock);
ipst->ips_igmp_deferred_next = MIN(timer,
ipst->ips_igmp_deferred_next);
mutex_exit(&ipst->ips_igmp_timer_lock);
- mcast_signal_restart_thread(ipst);
}
if (ip_debug > 1) {
- (void) mi_strlog(ilm->ilm_ipif->ipif_ill->ill_rq, 1, SL_TRACE,
+ (void) mi_strlog(ilm->ilm_ill->ill_rq, 1, SL_TRACE,
"igmp_joingroup: multicast_type %d timer %d",
- (ilm->ilm_ipif->ipif_ill->ill_mcast_type),
+ (ilm->ilm_ill->ill_mcast_type),
(int)ntohl(timer));
}
}
+/*
+ * Caller holds ill_mcast_lock. We queue the packet using ill_mcast_queue
+ * and it gets sent after the lock is dropped.
+ */
void
mld_joingroup(ilm_t *ilm)
{
@@ -858,19 +846,16 @@ mld_joingroup(ilm_t *ilm)
ill = ilm->ilm_ill;
- ASSERT(IAM_WRITER_ILL(ill));
- ASSERT(ilm->ilm_ipif == NULL && ill->ill_isv6);
+ ASSERT(ill->ill_isv6);
+
+ ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock));
- mutex_enter(&ill->ill_lock);
if (IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr)) {
ilm->ilm_rtx.rtx_timer = INFINITY;
ilm->ilm_state = IGMP_OTHERMEMBER;
- mutex_exit(&ill->ill_lock);
} else {
if (ill->ill_mcast_type == MLD_V1_ROUTER) {
- mutex_exit(&ill->ill_lock);
mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
- mutex_enter(&ill->ill_lock);
} else {
mrec_t *rp;
mcast_record_t rtype;
@@ -888,9 +873,7 @@ mld_joingroup(ilm_t *ilm)
ALLOW_NEW_SOURCES : CHANGE_TO_EXCLUDE;
rp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
ilm->ilm_filter, NULL);
- mutex_exit(&ill->ill_lock);
mldv2_sendrpt(ill, rp);
- mutex_enter(&ill->ill_lock);
/*
* Set up retransmission state. Timer is set below,
* for both v2 and v1.
@@ -909,17 +892,19 @@ mld_joingroup(ilm_t *ilm)
timer = ilm->ilm_rtx.rtx_timer;
ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME;
ilm->ilm_state = IGMP_IREPORTEDLAST;
- mutex_exit(&ill->ill_lock);
/*
- * Signal another thread to restart the timers. See the
- * comment in igmp_joingroup() for details.
+ * We are holding ill_mcast_lock here and the timeout
+ * handler (mld_timeout_handler_per_ill) acquires that
+ * lock. Hence we can't call mld_start_timers since it could
+ * deadlock in untimeout().
+ * Instead the thread which drops ill_mcast_lock will have
+ * to call ill_mcast_timer_start().
*/
mutex_enter(&ipst->ips_mld_timer_lock);
ipst->ips_mld_deferred_next = MIN(timer,
ipst->ips_mld_deferred_next);
mutex_exit(&ipst->ips_mld_timer_lock);
- mcast_signal_restart_thread(ipst);
}
if (ip_debug > 1) {
@@ -930,23 +915,26 @@ mld_joingroup(ilm_t *ilm)
}
}
+/*
+ * Caller holds ill_mcast_lock. We queue the packet using ill_mcast_queue
+ * and it gets sent after the lock is dropped.
+ */
void
igmp_leavegroup(ilm_t *ilm)
{
- ill_t *ill = ilm->ilm_ipif->ipif_ill;
+ ill_t *ill = ilm->ilm_ill;
- ASSERT(ilm->ilm_ill == NULL);
ASSERT(!ill->ill_isv6);
- mutex_enter(&ill->ill_lock);
+ ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock));
if (ilm->ilm_state == IGMP_IREPORTEDLAST &&
ill->ill_mcast_type == IGMP_V2_ROUTER &&
(ilm->ilm_addr != htonl(INADDR_ALLHOSTS_GROUP))) {
- mutex_exit(&ill->ill_lock);
igmp_sendpkt(ilm, IGMP_V2_LEAVE_GROUP,
(htonl(INADDR_ALLRTRS_GROUP)));
return;
- } else if ((ill->ill_mcast_type == IGMP_V3_ROUTER) &&
+ }
+ if ((ill->ill_mcast_type == IGMP_V3_ROUTER) &&
(ilm->ilm_addr != htonl(INADDR_ALLHOSTS_GROUP))) {
mrec_t *rp;
/*
@@ -965,29 +953,30 @@ igmp_leavegroup(ilm_t *ilm)
rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr,
NULL, NULL);
}
- mutex_exit(&ill->ill_lock);
- igmpv3_sendrpt(ilm->ilm_ipif, rp);
+ igmpv3_sendrpt(ill, rp);
return;
}
- mutex_exit(&ill->ill_lock);
}
+/*
+ * Caller holds ill_mcast_lock. We queue the packet using ill_mcast_queue
+ * and it gets sent after the lock is dropped.
+ */
void
mld_leavegroup(ilm_t *ilm)
{
ill_t *ill = ilm->ilm_ill;
- ASSERT(ilm->ilm_ipif == NULL);
ASSERT(ill->ill_isv6);
- mutex_enter(&ill->ill_lock);
+ ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock));
if (ilm->ilm_state == IGMP_IREPORTEDLAST &&
ill->ill_mcast_type == MLD_V1_ROUTER &&
(!IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr))) {
- mutex_exit(&ill->ill_lock);
mld_sendpkt(ilm, MLD_LISTENER_REDUCTION, &ipv6_all_rtrs_mcast);
return;
- } else if ((ill->ill_mcast_type == MLD_V2_ROUTER) &&
+ }
+ if ((ill->ill_mcast_type == MLD_V2_ROUTER) &&
(!IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr))) {
mrec_t *rp;
/*
@@ -1006,13 +995,15 @@ mld_leavegroup(ilm_t *ilm)
rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr,
NULL, NULL);
}
- mutex_exit(&ill->ill_lock);
mldv2_sendrpt(ill, rp);
return;
}
- mutex_exit(&ill->ill_lock);
}
+/*
+ * Caller holds ill_mcast_lock. We queue the packet using ill_mcast_queue
+ * and it gets sent after the lock is dropped.
+ */
void
igmp_statechange(ilm_t *ilm, mcast_record_t fmode, slist_t *flist)
{
@@ -1023,17 +1014,11 @@ igmp_statechange(ilm_t *ilm, mcast_record_t fmode, slist_t *flist)
ASSERT(ilm != NULL);
/* state change reports should only be sent if the router is v3 */
- if (ilm->ilm_ipif->ipif_ill->ill_mcast_type != IGMP_V3_ROUTER)
+ if (ilm->ilm_ill->ill_mcast_type != IGMP_V3_ROUTER)
return;
- if (ilm->ilm_ill == NULL) {
- ASSERT(ilm->ilm_ipif != NULL);
- ill = ilm->ilm_ipif->ipif_ill;
- } else {
- ill = ilm->ilm_ill;
- }
-
- mutex_enter(&ill->ill_lock);
+ ill = ilm->ilm_ill;
+ ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock));
/*
* Compare existing(old) state with the new state and prepare
@@ -1089,8 +1074,7 @@ send_to_in:
/*
* Need to set up retransmission state; merge the new info with the
* current state (which may be null). If the timer is not currently
- * running, signal a thread to restart it -- see the comment in
- * igmp_joingroup() for details.
+ * running, the caller will start it when dropping ill_mcast_lock.
*/
rp = mcast_merge_rtx(ilm, rp, flist);
if (ilm->ilm_rtx.rtx_timer == INFINITY) {
@@ -1102,13 +1086,15 @@ send_to_in:
ilm->ilm_rtx.rtx_timer);
ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME;
mutex_exit(&ipst->ips_igmp_timer_lock);
- mcast_signal_restart_thread(ipst);
}
- mutex_exit(&ill->ill_lock);
- igmpv3_sendrpt(ilm->ilm_ipif, rp);
+ igmpv3_sendrpt(ill, rp);
}
+/*
+ * Caller holds ill_mcast_lock. We queue the packet using ill_mcast_queue
+ * and it gets sent after the lock is dropped.
+ */
void
mld_statechange(ilm_t *ilm, mcast_record_t fmode, slist_t *flist)
{
@@ -1119,11 +1105,10 @@ mld_statechange(ilm_t *ilm, mcast_record_t fmode, slist_t *flist)
ASSERT(ilm != NULL);
ill = ilm->ilm_ill;
+ ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock));
/* only need to send if we have an mldv2-capable router */
- mutex_enter(&ill->ill_lock);
if (ill->ill_mcast_type != MLD_V2_ROUTER) {
- mutex_exit(&ill->ill_lock);
return;
}
@@ -1179,8 +1164,7 @@ send_to_in:
/*
* Need to set up retransmission state; merge the new info with the
* current state (which may be null). If the timer is not currently
- * running, signal a thread to restart it -- see the comment in
- * igmp_joingroup() for details.
+ * running, the caller will start it when dropping ill_mcast_lock.
*/
rp = mcast_merge_rtx(ilm, rp, flist);
ASSERT(ilm->ilm_rtx.rtx_cnt > 0);
@@ -1193,10 +1177,8 @@ send_to_in:
MIN(ipst->ips_mld_deferred_next, ilm->ilm_rtx.rtx_timer);
ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME;
mutex_exit(&ipst->ips_mld_timer_lock);
- mcast_signal_restart_thread(ipst);
}
- mutex_exit(&ill->ill_lock);
mldv2_sendrpt(ill, rp);
}
@@ -1205,15 +1187,12 @@ igmp_timeout_handler_per_ill(ill_t *ill)
{
uint_t next = INFINITY, current;
ilm_t *ilm;
- ipif_t *ipif;
mrec_t *rp = NULL;
mrec_t *rtxrp = NULL;
rtx_state_t *rtxp;
mcast_record_t rtype;
- ASSERT(IAM_WRITER_ILL(ill));
-
- mutex_enter(&ill->ill_lock);
+ rw_enter(&ill->ill_mcast_lock, RW_WRITER);
current = CURRENT_MSTIME;
/* First check the global timer on this interface */
@@ -1230,10 +1209,8 @@ igmp_timeout_handler_per_ill(ill_t *ill)
for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
if (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP))
continue;
- ASSERT(ilm->ilm_ipif != NULL);
- ilm->ilm_ipif->ipif_igmp_rpt =
- mcast_bldmrec(ilm->ilm_fmode, &ilm->ilm_v6addr,
- ilm->ilm_filter, ilm->ilm_ipif->ipif_igmp_rpt);
+ rp = mcast_bldmrec(ilm->ilm_fmode, &ilm->ilm_v6addr,
+ ilm->ilm_filter, rp);
/*
* Since we're sending a report on this group, okay
* to delete pending group-specific timers. Note
@@ -1245,20 +1222,8 @@ igmp_timeout_handler_per_ill(ill_t *ill)
FREE_SLIST(ilm->ilm_pendsrcs);
ilm->ilm_pendsrcs = NULL;
}
- /*
- * We've built per-ipif mrec lists; walk the ill's ipif list
- * and send a report for each ipif that has an mrec list.
- */
- for (ipif = ill->ill_ipif; ipif != NULL;
- ipif = ipif->ipif_next) {
- if (ipif->ipif_igmp_rpt == NULL)
- continue;
- mutex_exit(&ill->ill_lock);
- igmpv3_sendrpt(ipif, ipif->ipif_igmp_rpt);
- mutex_enter(&ill->ill_lock);
- /* mrec list was freed by igmpv3_sendrpt() */
- ipif->ipif_igmp_rpt = NULL;
- }
+ igmpv3_sendrpt(ill, rp);
+ rp = NULL;
} else {
if ((ill->ill_global_timer - current) < next)
next = ill->ill_global_timer - current;
@@ -1288,13 +1253,9 @@ per_ilm_timer:
ilm->ilm_timer = INFINITY;
ilm->ilm_state = IGMP_IREPORTEDLAST;
if (ill->ill_mcast_type == IGMP_V1_ROUTER) {
- mutex_exit(&ill->ill_lock);
igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0);
- mutex_enter(&ill->ill_lock);
} else if (ill->ill_mcast_type == IGMP_V2_ROUTER) {
- mutex_exit(&ill->ill_lock);
igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0);
- mutex_enter(&ill->ill_lock);
} else {
slist_t *rsp;
if (!SLIST_IS_EMPTY(ilm->ilm_pendsrcs) &&
@@ -1325,9 +1286,7 @@ per_ilm_timer:
rp = mcast_bldmrec(ilm->ilm_fmode,
&ilm->ilm_v6addr, ilm->ilm_filter, rp);
}
- mutex_exit(&ill->ill_lock);
- igmpv3_sendrpt(ill->ill_ipif, rp);
- mutex_enter(&ill->ill_lock);
+ igmpv3_sendrpt(ill, rp);
rp = NULL;
}
@@ -1345,14 +1304,11 @@ per_ilm_rtxtimer:
rtxp->rtx_timer = INFINITY;
ilm->ilm_state = IGMP_IREPORTEDLAST;
if (ill->ill_mcast_type == IGMP_V1_ROUTER) {
- mutex_exit(&ill->ill_lock);
igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0);
- mutex_enter(&ill->ill_lock);
continue;
- } else if (ill->ill_mcast_type == IGMP_V2_ROUTER) {
- mutex_exit(&ill->ill_lock);
+ }
+ if (ill->ill_mcast_type == IGMP_V2_ROUTER) {
igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0);
- mutex_enter(&ill->ill_lock);
continue;
}
@@ -1393,13 +1349,14 @@ per_ilm_rtxtimer:
CLEAR_SLIST(rtxp->rtx_allow);
CLEAR_SLIST(rtxp->rtx_block);
}
- mutex_exit(&ill->ill_lock);
- igmpv3_sendrpt(ilm->ilm_ipif, rtxrp);
- mutex_enter(&ill->ill_lock);
+ igmpv3_sendrpt(ill, rtxrp);
rtxrp = NULL;
}
- mutex_exit(&ill->ill_lock);
+ rw_exit(&ill->ill_mcast_lock);
+ /* Send any deferred/queued IP packets */
+ ill_mcast_send_queued(ill);
+ /* Defer ill_mcast_timer_start() until the caller is done */
return (next);
}
@@ -1411,17 +1368,15 @@ per_ilm_rtxtimer:
*
* As part of multicast join and leave igmp we may need to send out an
* igmp request. The igmp related state variables in the ilm are protected
- * by ill_lock. A single global igmp timer is used to track igmp timeouts.
+ * by ill_mcast_lock. A single global igmp timer is used to track igmp timeouts.
* igmp_timer_lock protects the global igmp_timeout_id. igmp_start_timers
* starts the igmp timer if needed. It serializes multiple threads trying to
* simultaneously start the timer using the igmp_timer_setter_active flag.
*
* igmp_input() receives igmp queries and responds to the queries
* in a delayed fashion by posting a timer i.e. it calls igmp_start_timers().
- * Later the igmp_timer fires, the timeout handler igmp_timeout_handler()
- * performs the action exclusively after entering each ill's ipsq as writer.
- * (The need to enter the IPSQ is largely historical but there are still some
- * fields like ilm_filter that rely on it.)
+ * Later the igmp_timer fires, the timeout handler igmp_timeout_handler()
+ * performs the action exclusively after acquiring ill_mcast_lock.
*
* The igmp_slowtimeo() function is called thru another timer.
* igmp_slowtimeout_lock protects the igmp_slowtimeout_id
@@ -1433,12 +1388,12 @@ igmp_timeout_handler(void *arg)
uint_t global_next = INFINITY;
uint_t next;
ill_walk_context_t ctx;
- boolean_t success;
ip_stack_t *ipst = arg;
ASSERT(arg != NULL);
mutex_enter(&ipst->ips_igmp_timer_lock);
ASSERT(ipst->ips_igmp_timeout_id != 0);
+ ipst->ips_igmp_timeout_id = 0;
ipst->ips_igmp_timer_scheduled_last = 0;
ipst->ips_igmp_time_to_next = 0;
mutex_exit(&ipst->ips_igmp_timer_lock);
@@ -1447,31 +1402,17 @@ igmp_timeout_handler(void *arg)
ill = ILL_START_WALK_V4(&ctx, ipst);
for (; ill != NULL; ill = ill_next(&ctx, ill)) {
ASSERT(!ill->ill_isv6);
- /*
- * We may not be able to refhold the ill if the ill/ipif
- * is changing. But we need to make sure that the ill will
- * not vanish. So we just bump up the ill_waiter count.
- */
- if (!ill_waiter_inc(ill))
+ /* Make sure the ill isn't going away. */
+ if (!ill_check_and_refhold(ill))
continue;
rw_exit(&ipst->ips_ill_g_lock);
- success = ipsq_enter(ill, B_TRUE, NEW_OP);
- if (success) {
- next = igmp_timeout_handler_per_ill(ill);
- if (next < global_next)
- global_next = next;
- ipsq_exit(ill->ill_phyint->phyint_ipsq);
- }
+ next = igmp_timeout_handler_per_ill(ill);
+ if (next < global_next)
+ global_next = next;
+ ill_refrele(ill);
rw_enter(&ipst->ips_ill_g_lock, RW_READER);
- ill_waiter_dcr(ill);
}
rw_exit(&ipst->ips_ill_g_lock);
-
- mutex_enter(&ipst->ips_igmp_timer_lock);
- ASSERT(ipst->ips_igmp_timeout_id != 0);
- ipst->ips_igmp_timeout_id = 0;
- mutex_exit(&ipst->ips_igmp_timer_lock);
-
if (global_next != INFINITY)
igmp_start_timers(global_next, ipst);
}
@@ -1481,7 +1422,6 @@ igmp_timeout_handler(void *arg)
* Called when there are timeout events, every next (tick).
* Returns number of ticks to next event (or 0 if none).
*/
-/* ARGSUSED */
uint_t
mld_timeout_handler_per_ill(ill_t *ill)
{
@@ -1491,9 +1431,7 @@ mld_timeout_handler_per_ill(ill_t *ill)
rtx_state_t *rtxp;
mcast_record_t rtype;
- ASSERT(IAM_WRITER_ILL(ill));
-
- mutex_enter(&ill->ill_lock);
+ rw_enter(&ill->ill_mcast_lock, RW_WRITER);
current = CURRENT_MSTIME;
/*
@@ -1528,9 +1466,7 @@ mld_timeout_handler_per_ill(ill_t *ill)
FREE_SLIST(ilm->ilm_pendsrcs);
ilm->ilm_pendsrcs = NULL;
}
- mutex_exit(&ill->ill_lock);
mldv2_sendrpt(ill, rp);
- mutex_enter(&ill->ill_lock);
} else {
if ((ill->ill_global_timer - current) < next)
next = ill->ill_global_timer - current;
@@ -1561,9 +1497,7 @@ per_ilm_timer:
ilm->ilm_timer = INFINITY;
ilm->ilm_state = IGMP_IREPORTEDLAST;
if (ill->ill_mcast_type == MLD_V1_ROUTER) {
- mutex_exit(&ill->ill_lock);
mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
- mutex_enter(&ill->ill_lock);
} else {
slist_t *rsp;
if (!SLIST_IS_EMPTY(ilm->ilm_pendsrcs) &&
@@ -1605,9 +1539,7 @@ per_ilm_rtxtimer:
rtxp->rtx_timer = INFINITY;
ilm->ilm_state = IGMP_IREPORTEDLAST;
if (ill->ill_mcast_type == MLD_V1_ROUTER) {
- mutex_exit(&ill->ill_lock);
mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
- mutex_enter(&ill->ill_lock);
continue;
}
@@ -1651,13 +1583,13 @@ per_ilm_rtxtimer:
}
if (ill->ill_mcast_type == MLD_V2_ROUTER) {
- mutex_exit(&ill->ill_lock);
mldv2_sendrpt(ill, rp);
mldv2_sendrpt(ill, rtxrp);
- return (next);
}
-
- mutex_exit(&ill->ill_lock);
+ rw_exit(&ill->ill_mcast_lock);
+ /* Send any deferred/queued IP packets */
+ ill_mcast_send_queued(ill);
+ /* Defer ill_mcast_timer_start() until the caller is done */
return (next);
}
@@ -1675,12 +1607,12 @@ mld_timeout_handler(void *arg)
uint_t global_next = INFINITY;
uint_t next;
ill_walk_context_t ctx;
- boolean_t success;
ip_stack_t *ipst = arg;
ASSERT(arg != NULL);
mutex_enter(&ipst->ips_mld_timer_lock);
ASSERT(ipst->ips_mld_timeout_id != 0);
+ ipst->ips_mld_timeout_id = 0;
ipst->ips_mld_timer_scheduled_last = 0;
ipst->ips_mld_time_to_next = 0;
mutex_exit(&ipst->ips_mld_timer_lock);
@@ -1689,31 +1621,17 @@ mld_timeout_handler(void *arg)
ill = ILL_START_WALK_V6(&ctx, ipst);
for (; ill != NULL; ill = ill_next(&ctx, ill)) {
ASSERT(ill->ill_isv6);
- /*
- * We may not be able to refhold the ill if the ill/ipif
- * is changing. But we need to make sure that the ill will
- * not vanish. So we just bump up the ill_waiter count.
- */
- if (!ill_waiter_inc(ill))
+ /* Make sure the ill isn't going away. */
+ if (!ill_check_and_refhold(ill))
continue;
rw_exit(&ipst->ips_ill_g_lock);
- success = ipsq_enter(ill, B_TRUE, NEW_OP);
- if (success) {
- next = mld_timeout_handler_per_ill(ill);
- if (next < global_next)
- global_next = next;
- ipsq_exit(ill->ill_phyint->phyint_ipsq);
- }
+ next = mld_timeout_handler_per_ill(ill);
+ if (next < global_next)
+ global_next = next;
+ ill_refrele(ill);
rw_enter(&ipst->ips_ill_g_lock, RW_READER);
- ill_waiter_dcr(ill);
}
rw_exit(&ipst->ips_ill_g_lock);
-
- mutex_enter(&ipst->ips_mld_timer_lock);
- ASSERT(ipst->ips_mld_timeout_id != 0);
- ipst->ips_mld_timeout_id = 0;
- mutex_exit(&ipst->ips_mld_timer_lock);
-
if (global_next != INFINITY)
mld_start_timers(global_next, ipst);
}
@@ -1743,8 +1661,6 @@ igmp_slowtimo(void *arg)
ip_stack_t *ipst = (ip_stack_t *)arg;
ASSERT(arg != NULL);
- /* Hold the ill_g_lock so that we can safely walk the ill list */
- rw_enter(&ipst->ips_ill_g_lock, RW_READER);
/*
* The ill_if_t list is circular, hence the odd loop parameters.
@@ -1754,6 +1670,7 @@ igmp_slowtimo(void *arg)
* structure (allowing us to skip if none of the instances have timers
* running).
*/
+ rw_enter(&ipst->ips_ill_g_lock, RW_READER);
for (ifp = IP_V4_ILL_G_LIST(ipst);
ifp != (ill_if_t *)&IP_V4_ILL_G_LIST(ipst);
ifp = ifp->illif_next) {
@@ -1768,7 +1685,11 @@ igmp_slowtimo(void *arg)
avl_tree = &ifp->illif_avl_by_ppa;
for (ill = avl_first(avl_tree); ill != NULL;
ill = avl_walk(avl_tree, ill, AVL_AFTER)) {
- mutex_enter(&ill->ill_lock);
+ /* Make sure the ill isn't going away. */
+ if (!ill_check_and_refhold(ill))
+ continue;
+ rw_exit(&ipst->ips_ill_g_lock);
+ rw_enter(&ill->ill_mcast_lock, RW_WRITER);
if (ill->ill_mcast_v1_tset == 1)
ill->ill_mcast_v1_time++;
if (ill->ill_mcast_v2_tset == 1)
@@ -1808,10 +1729,13 @@ igmp_slowtimo(void *arg)
ill->ill_mcast_v2_tset = 0;
atomic_add_16(&ifp->illif_mcast_v2, -1);
}
- mutex_exit(&ill->ill_lock);
+ rw_exit(&ill->ill_mcast_lock);
+ ill_refrele(ill);
+ rw_enter(&ipst->ips_ill_g_lock, RW_READER);
}
}
rw_exit(&ipst->ips_ill_g_lock);
+ ill_mcast_timer_start(ipst);
mutex_enter(&ipst->ips_igmp_slowtimeout_lock);
ipst->ips_igmp_slowtimeout_id = timeout(igmp_slowtimo, (void *)ipst,
MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
@@ -1826,7 +1750,6 @@ igmp_slowtimo(void *arg)
* Check for ips_mld_max_version ensures that we don't revert to a higher
* IGMP version than configured.
*/
-/* ARGSUSED */
void
mld_slowtimo(void *arg)
{
@@ -1847,7 +1770,11 @@ mld_slowtimo(void *arg)
avl_tree = &ifp->illif_avl_by_ppa;
for (ill = avl_first(avl_tree); ill != NULL;
ill = avl_walk(avl_tree, ill, AVL_AFTER)) {
- mutex_enter(&ill->ill_lock);
+ /* Make sure the ill isn't going away. */
+ if (!ill_check_and_refhold(ill))
+ continue;
+ rw_exit(&ipst->ips_ill_g_lock);
+ rw_enter(&ill->ill_mcast_lock, RW_WRITER);
if (ill->ill_mcast_v1_tset == 1)
ill->ill_mcast_v1_time++;
if ((ill->ill_mcast_type == MLD_V1_ROUTER) &&
@@ -1861,10 +1788,13 @@ mld_slowtimo(void *arg)
ill->ill_mcast_v1_tset = 0;
atomic_add_16(&ifp->illif_mcast_v1, -1);
}
- mutex_exit(&ill->ill_lock);
+ rw_exit(&ill->ill_mcast_lock);
+ ill_refrele(ill);
+ rw_enter(&ipst->ips_ill_g_lock, RW_READER);
}
}
rw_exit(&ipst->ips_ill_g_lock);
+ ill_mcast_timer_start(ipst);
mutex_enter(&ipst->ips_mld_slowtimeout_lock);
ipst->ips_mld_slowtimeout_id = timeout(mld_slowtimo, (void *)ipst,
MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
@@ -1873,9 +1803,7 @@ mld_slowtimo(void *arg)
/*
* igmp_sendpkt:
- * This will send to ip_wput like icmp_inbound.
- * Note that the lower ill (on which the membership is kept) is used
- * as an upper ill to pass in the multicast parameters.
+ * This will send to ip_output_simple just like icmp_inbound.
*/
static void
igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr)
@@ -1886,51 +1814,16 @@ igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr)
ipha_t *ipha;
int hdrlen = sizeof (ipha_t) + RTRALERT_LEN;
size_t size = hdrlen + sizeof (igmpa_t);
- ipif_t *ipif = ilm->ilm_ipif;
- ill_t *ill = ipif->ipif_ill;
- mblk_t *first_mp;
- ipsec_out_t *io;
- zoneid_t zoneid;
+ ill_t *ill = ilm->ilm_ill;
ip_stack_t *ipst = ill->ill_ipst;
- /*
- * We need to make sure this packet goes out on an ipif. If
- * there is some global policy match in ip_wput_ire, we need
- * to get to the right interface after IPSEC processing.
- * To make sure this multicast packet goes out on the right
- * interface, we attach an ipsec_out and initialize ill_index
- * like we did in ip_wput. To make sure that this packet does
- * not get forwarded on other interfaces or looped back, we
- * set ipsec_out_dontroute to B_TRUE and ipsec_out_multicast_loop
- * to B_FALSE.
- */
- first_mp = allocb(sizeof (ipsec_info_t), BPRI_HI);
- if (first_mp == NULL)
- return;
-
- first_mp->b_datap->db_type = M_CTL;
- first_mp->b_wptr += sizeof (ipsec_info_t);
- bzero(first_mp->b_rptr, sizeof (ipsec_info_t));
- /* ipsec_out_secure is B_FALSE now */
- io = (ipsec_out_t *)first_mp->b_rptr;
- io->ipsec_out_type = IPSEC_OUT;
- io->ipsec_out_len = sizeof (ipsec_out_t);
- io->ipsec_out_use_global_policy = B_TRUE;
- io->ipsec_out_ill_index = ill->ill_phyint->phyint_ifindex;
- io->ipsec_out_multicast_loop = B_FALSE;
- io->ipsec_out_dontroute = B_TRUE;
- if ((zoneid = ilm->ilm_zoneid) == ALL_ZONES)
- zoneid = GLOBAL_ZONEID;
- io->ipsec_out_zoneid = zoneid;
- io->ipsec_out_ns = ipst->ips_netstack; /* No netstack_hold */
+ ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock));
mp = allocb(size, BPRI_HI);
if (mp == NULL) {
- freemsg(first_mp);
return;
}
mp->b_wptr = mp->b_rptr + size;
- first_mp->b_cont = mp;
ipha = (ipha_t *)mp->b_rptr;
rtralert = (uint8_t *)&(ipha[1]);
@@ -1956,53 +1849,38 @@ igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr)
ipha->ipha_protocol = IPPROTO_IGMP;
ipha->ipha_hdr_checksum = 0;
ipha->ipha_dst = addr ? addr : igmpa->igmpa_group;
- ipha->ipha_src = ipif->ipif_src_addr;
- /*
- * Request loopback of the report if we are acting as a multicast
- * router, so that the process-level routing demon can hear it.
- */
- /*
- * This will run multiple times for the same group if there are members
- * on the same group for multiple ipif's on the same ill. The
- * igmp_input code will suppress this due to the loopback thus we
- * always loopback membership report.
- */
- ASSERT(ill->ill_rq != NULL);
- ip_multicast_loopback(ill->ill_rq, ill, first_mp, 0, ilm->ilm_zoneid);
+ ipha->ipha_src = INADDR_ANY;
- ip_wput_multicast(ill->ill_wq, first_mp, ipif, zoneid);
+ ill_mcast_queue(ill, mp);
++ipst->ips_igmpstat.igps_snd_reports;
}
/*
- * Sends an IGMP_V3_MEMBERSHIP_REPORT message out the ill associated
- * with the passed-in ipif. The report will contain one group record
+ * Sends an IGMP_V3_MEMBERSHIP_REPORT message out the ill.
+ * The report will contain one group record
* for each element of reclist. If this causes packet length to
- * exceed ipif->ipif_ill->ill_max_frag, multiple reports are sent.
+ * exceed ill->ill_mtu, multiple reports are sent.
* reclist is assumed to be made up of buffers allocated by mcast_bldmrec(),
* and those buffers are freed here.
*/
static void
-igmpv3_sendrpt(ipif_t *ipif, mrec_t *reclist)
+igmpv3_sendrpt(ill_t *ill, mrec_t *reclist)
{
- ipsec_out_t *io;
igmp3ra_t *igmp3ra;
grphdra_t *grphdr;
- mblk_t *first_mp, *mp;
+ mblk_t *mp;
ipha_t *ipha;
uint8_t *rtralert;
ipaddr_t *src_array;
int i, j, numrec, more_src_cnt;
size_t hdrsize, size, rsize;
- ill_t *ill = ipif->ipif_ill;
mrec_t *rp, *cur_reclist;
mrec_t *next_reclist = reclist;
boolean_t morepkts;
- zoneid_t zoneid;
ip_stack_t *ipst = ill->ill_ipst;
- ASSERT(IAM_WRITER_IPIF(ipif));
+ ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock));
/* if there aren't any records, there's nothing to send */
if (reclist == NULL)
@@ -2018,7 +1896,7 @@ nextpkt:
for (rp = cur_reclist; rp != NULL; rp = rp->mrec_next) {
rsize = sizeof (grphdra_t) +
(rp->mrec_srcs.sl_numsrc * sizeof (ipaddr_t));
- if (size + rsize > ill->ill_max_frag) {
+ if (size + rsize > ill->ill_mtu) {
if (rp == cur_reclist) {
/*
* If the first mrec we looked at is too big
@@ -2029,7 +1907,7 @@ nextpkt:
* other types).
*/
int srcspace, srcsperpkt;
- srcspace = ill->ill_max_frag - (size +
+ srcspace = ill->ill_mtu - (size +
sizeof (grphdra_t));
/*
@@ -2082,37 +1960,12 @@ nextpkt:
numrec++;
}
- /*
- * See comments in igmp_sendpkt() about initializing for ipsec and
- * load balancing requirements.
- */
- first_mp = allocb(sizeof (ipsec_info_t), BPRI_HI);
- if (first_mp == NULL)
- goto free_reclist;
-
- first_mp->b_datap->db_type = M_CTL;
- first_mp->b_wptr += sizeof (ipsec_info_t);
- bzero(first_mp->b_rptr, sizeof (ipsec_info_t));
- /* ipsec_out_secure is B_FALSE now */
- io = (ipsec_out_t *)first_mp->b_rptr;
- io->ipsec_out_type = IPSEC_OUT;
- io->ipsec_out_len = sizeof (ipsec_out_t);
- io->ipsec_out_use_global_policy = B_TRUE;
- io->ipsec_out_ill_index = ill->ill_phyint->phyint_ifindex;
- io->ipsec_out_multicast_loop = B_FALSE;
- io->ipsec_out_dontroute = B_TRUE;
- if ((zoneid = ipif->ipif_zoneid) == ALL_ZONES)
- zoneid = GLOBAL_ZONEID;
- io->ipsec_out_zoneid = zoneid;
-
mp = allocb(size, BPRI_HI);
if (mp == NULL) {
- freemsg(first_mp);
goto free_reclist;
}
bzero((char *)mp->b_rptr, size);
mp->b_wptr = (uchar_t *)(mp->b_rptr + size);
- first_mp->b_cont = mp;
ipha = (ipha_t *)mp->b_rptr;
rtralert = (uint8_t *)&(ipha[1]);
@@ -2149,21 +2002,9 @@ nextpkt:
ipha->ipha_ttl = IGMP_TTL;
ipha->ipha_protocol = IPPROTO_IGMP;
ipha->ipha_dst = htonl(INADDR_ALLRPTS_GROUP);
- ipha->ipha_src = ipif->ipif_src_addr;
+ ipha->ipha_src = INADDR_ANY;
- /*
- * Request loopback of the report if we are acting as a multicast
- * router, so that the process-level routing daemon can hear it.
- *
- * This will run multiple times for the same group if there are
- * members on the same group for multiple ipifs on the same ill.
- * The igmp_input code will suppress this due to the loopback;
- * thus we always loopback membership report.
- */
- ASSERT(ill->ill_rq != NULL);
- ip_multicast_loopback(ill->ill_rq, ill, mp, 0, ipif->ipif_zoneid);
-
- ip_wput_multicast(ill->ill_wq, first_mp, ipif, zoneid);
+ ill_mcast_queue(ill, mp);
++ipst->ips_igmpstat.igps_snd_reports;
@@ -2190,21 +2031,24 @@ free_reclist:
/*
* mld_input:
+ * Return NULL for a bad packet that is discarded here.
+ * Return mp if the message is OK and should be handed to "raw" receivers.
+ * Callers of mld_input() may need to reinitialize variables that were copied
+ * from the mblk as this calls pullupmsg().
*/
-/* ARGSUSED */
-void
-mld_input(queue_t *q, mblk_t *mp, ill_t *ill)
+mblk_t *
+mld_input(mblk_t *mp, ip_recv_attr_t *ira)
{
ip6_t *ip6h = (ip6_t *)(mp->b_rptr);
mld_hdr_t *mldh;
ilm_t *ilm;
ipif_t *ipif;
uint16_t hdr_length, exthdr_length;
- in6_addr_t *v6group_ptr, *lcladdr_ptr;
+ in6_addr_t *v6group_ptr;
uint_t next;
int mldlen;
+ ill_t *ill = ira->ira_ill;
ip_stack_t *ipst = ill->ill_ipst;
- ilm_walker_t ilw;
BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembTotal);
@@ -2212,30 +2056,26 @@ mld_input(queue_t *q, mblk_t *mp, ill_t *ill)
if (!(IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src))) {
BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
freemsg(mp);
- return;
+ return (NULL);
}
if (ip6h->ip6_hlim != 1) {
BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpBadHoplimit);
freemsg(mp);
- return;
+ return (NULL);
}
/* Get to the icmp header part */
- if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
- hdr_length = ip_hdr_length_v6(mp, ip6h);
- exthdr_length = hdr_length - IPV6_HDR_LEN;
- } else {
- hdr_length = IPV6_HDR_LEN;
- exthdr_length = 0;
- }
+ hdr_length = ira->ira_ip_hdr_length;
+ exthdr_length = hdr_length - IPV6_HDR_LEN;
+
mldlen = ntohs(ip6h->ip6_plen) - exthdr_length;
/* An MLD packet must at least be 24 octets to be valid */
if (mldlen < MLD_MINLEN) {
BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
freemsg(mp);
- return;
+ return (NULL);
}
mldh = (mld_hdr_t *)(&mp->b_rptr[hdr_length]);
@@ -2254,50 +2094,41 @@ mld_input(queue_t *q, mblk_t *mp, ill_t *ill)
} else {
BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
freemsg(mp);
- return;
+ return (NULL);
}
if (next == 0) {
- freemsg(mp);
- return;
+ return (mp);
}
if (next != INFINITY)
mld_start_timers(next, ipst);
break;
- case MLD_LISTENER_REPORT: {
-
- ASSERT(ill->ill_ipif != NULL);
+ case MLD_LISTENER_REPORT:
/*
* For fast leave to work, we have to know that we are the
* last person to send a report for this group. Reports
* generated by us are looped back since we could potentially
* be a multicast router, so discard reports sourced by me.
*/
- lcladdr_ptr = &(ill->ill_ipif->ipif_v6subnet);
mutex_enter(&ill->ill_lock);
for (ipif = ill->ill_ipif; ipif != NULL;
ipif = ipif->ipif_next) {
if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
- lcladdr_ptr)) {
+ &ip6h->ip6_src)) {
if (ip_debug > 1) {
char buf1[INET6_ADDRSTRLEN];
- char buf2[INET6_ADDRSTRLEN];
(void) mi_strlog(ill->ill_rq,
1,
SL_TRACE,
"mld_input: we are only "
- "member src %s ipif_local %s",
- inet_ntop(AF_INET6, lcladdr_ptr,
- buf1, sizeof (buf1)),
- inet_ntop(AF_INET6,
- &ipif->ipif_v6lcl_addr,
- buf2, sizeof (buf2)));
+ "member src %s\n",
+ inet_ntop(AF_INET6, &ip6h->ip6_src,
+ buf1, sizeof (buf1)));
}
mutex_exit(&ill->ill_lock);
- freemsg(mp);
- return;
+ return (mp);
}
}
mutex_exit(&ill->ill_lock);
@@ -2308,9 +2139,10 @@ mld_input(queue_t *q, mblk_t *mp, ill_t *ill)
BUMP_MIB(ill->ill_icmp6_mib,
ipv6IfIcmpInGroupMembBadReports);
freemsg(mp);
- return;
+ return (NULL);
}
+
/*
* If we belong to the group being reported, and we are a
* 'Delaying member' per the RFC terminology, stop our timer
@@ -2319,8 +2151,8 @@ mld_input(queue_t *q, mblk_t *mp, ill_t *ill)
* membership entries for the same group address (one per zone)
* so we need to walk the ill_ilm list.
*/
- ilm = ilm_walker_start(&ilw, ill);
- for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) {
+ rw_enter(&ill->ill_mcast_lock, RW_WRITER);
+ for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
if (!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group_ptr))
continue;
BUMP_MIB(ill->ill_icmp6_mib,
@@ -2329,23 +2161,19 @@ mld_input(queue_t *q, mblk_t *mp, ill_t *ill)
ilm->ilm_timer = INFINITY;
ilm->ilm_state = IGMP_OTHERMEMBER;
}
- ilm_walker_finish(&ilw);
+ rw_exit(&ill->ill_mcast_lock);
+ /*
+ * No packets have been sent above - no
+ * ill_mcast_send_queued is needed.
+ */
+ ill_mcast_timer_start(ill->ill_ipst);
break;
- }
+
case MLD_LISTENER_REDUCTION:
BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembReductions);
break;
}
- /*
- * All MLD packets have already been passed up to any
- * process(es) listening on a ICMP6 raw socket. This
- * has been accomplished in ip_deliver_local_v6 prior to
- * this function call. It is assumed that the multicast daemon
- * will have a SOCK_RAW IPPROTO_ICMPV6 (and presumbly use the
- * ICMP6_FILTER socket option to only receive the MLD messages)
- * Thus we can free the MLD message block here
- */
- freemsg(mp);
+ return (mp);
}
/*
@@ -2359,7 +2187,6 @@ mld_query_in(mld_hdr_t *mldh, ill_t *ill)
int timer;
uint_t next, current;
in6_addr_t *v6group;
- ilm_walker_t ilw;
BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembQueries);
@@ -2383,7 +2210,7 @@ mld_query_in(mld_hdr_t *mldh, ill_t *ill)
}
/* Need to do compatibility mode checking */
- mutex_enter(&ill->ill_lock);
+ rw_enter(&ill->ill_mcast_lock, RW_WRITER);
ill->ill_mcast_v1_time = 0;
ill->ill_mcast_v1_tset = 1;
if (ill->ill_mcast_type == MLD_V2_ROUTER) {
@@ -2392,7 +2219,6 @@ mld_query_in(mld_hdr_t *mldh, ill_t *ill)
atomic_add_16(&ill->ill_ifptr->illif_mcast_v1, 1);
ill->ill_mcast_type = MLD_V1_ROUTER;
}
- mutex_exit(&ill->ill_lock);
timer = (int)ntohs(mldh->mld_maxdelay);
if (ip_debug > 1) {
@@ -2415,11 +2241,8 @@ mld_query_in(mld_hdr_t *mldh, ill_t *ill)
*/
next = INFINITY;
- ilm = ilm_walker_start(&ilw, ill);
- mutex_enter(&ill->ill_lock);
current = CURRENT_MSTIME;
-
- for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) {
+ for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
ASSERT(!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr));
if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr) ||
@@ -2434,9 +2257,7 @@ mld_query_in(mld_hdr_t *mldh, ill_t *ill)
/* Respond immediately */
ilm->ilm_timer = INFINITY;
ilm->ilm_state = IGMP_IREPORTEDLAST;
- mutex_exit(&ill->ill_lock);
mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
- mutex_enter(&ill->ill_lock);
break;
}
if (ilm->ilm_timer > timer) {
@@ -2448,8 +2269,10 @@ mld_query_in(mld_hdr_t *mldh, ill_t *ill)
break;
}
}
- mutex_exit(&ill->ill_lock);
- ilm_walker_finish(&ilw);
+ rw_exit(&ill->ill_mcast_lock);
+ /* Send any deferred/queued IP packets */
+ ill_mcast_send_queued(ill);
+ ill_mcast_timer_start(ill->ill_ipst);
return (next);
}
@@ -2466,7 +2289,6 @@ mldv2_query_in(mld2q_t *mld2q, ill_t *ill, int mldlen)
in6_addr_t *v6group, *src_array;
uint_t next, numsrc, i, mrd, delay, qqi, current;
uint8_t qrv;
- ilm_walker_t ilw;
v6group = &mld2q->mld2q_addr;
numsrc = ntohs(mld2q->mld2q_numsrc);
@@ -2514,12 +2336,11 @@ mldv2_query_in(mld2q_t *mld2q, ill_t *ill, int mldlen)
* sooner than the delay we calculated for this response, then
* no action is required (MLDv2 draft section 6.2 rule 1)
*/
- mutex_enter(&ill->ill_lock);
+ rw_enter(&ill->ill_mcast_lock, RW_WRITER);
if (ill->ill_global_timer < (current + delay)) {
- mutex_exit(&ill->ill_lock);
+ rw_exit(&ill->ill_mcast_lock);
return (next);
}
- mutex_exit(&ill->ill_lock);
/*
* Now take action depending on query type: general,
@@ -2532,16 +2353,11 @@ mldv2_query_in(mld2q_t *mld2q, ill_t *ill, int mldlen)
* greater than our calculated delay, so reset it to
* our delay (random value in range [0, response time])
*/
- mutex_enter(&ill->ill_lock);
ill->ill_global_timer = current + delay;
- mutex_exit(&ill->ill_lock);
next = delay;
-
} else {
/* group or group/source specific query */
- ilm = ilm_walker_start(&ilw, ill);
- mutex_enter(&ill->ill_lock);
- for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) {
+ for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr) ||
IN6_IS_ADDR_MC_NODELOCAL(&ilm->ilm_v6addr) ||
IN6_IS_ADDR_MC_RESERVED(&ilm->ilm_v6addr) ||
@@ -2595,9 +2411,13 @@ group_query:
ilm->ilm_timer += current;
break;
}
- mutex_exit(&ill->ill_lock);
- ilm_walker_finish(&ilw);
}
+ rw_exit(&ill->ill_mcast_lock);
+ /*
+ * No packets have been sent above - no
+ * ill_mcast_send_queued is needed.
+ */
+ ill_mcast_timer_start(ill->ill_ipst);
return (next);
}
@@ -2615,7 +2435,8 @@ mld_sendpkt(ilm_t *ilm, uchar_t type, const in6_addr_t *v6addr)
struct ip6_opt_router *ip6router;
size_t size = IPV6_HDR_LEN + sizeof (mld_hdr_t);
ill_t *ill = ilm->ilm_ill;
- ipif_t *ipif;
+
+ ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock));
/*
* We need to place a router alert option in this packet. The length
@@ -2663,35 +2484,20 @@ mld_sendpkt(ilm_t *ilm, uchar_t type, const in6_addr_t *v6addr)
else
ip6h->ip6_dst = *v6addr;
- /* ipif returned by ipif_lookup_zoneid is link-local (if present) */
- if (ipif_lookup_zoneid(ill, ilm->ilm_zoneid, IPIF_UP, &ipif)) {
- ip6h->ip6_src = ipif->ipif_v6src_addr;
- ipif_refrele(ipif);
- } else {
- /* Otherwise, use IPv6 default address selection. */
- ip6h->ip6_src = ipv6_all_zeros;
- }
-
+ ip6h->ip6_src = ipv6_all_zeros;
/*
* Prepare for checksum by putting icmp length in the icmp
- * checksum field. The checksum is calculated in ip_wput_v6.
+ * checksum field. The checksum is calculated in ip_output.
*/
mldh->mld_cksum = htons(sizeof (*mldh));
- /*
- * ip_wput will automatically loopback the multicast packet to
- * the conn if multicast loopback is enabled.
- * The MIB stats corresponding to this outgoing MLD packet
- * will be accounted for in ip_wput->ip_wput_v6->ip_wput_ire_v6
- * ->icmp_update_out_mib_v6 function call.
- */
- (void) ip_output_v6(NULL, mp, ill->ill_wq, IP_WPUT);
+ ill_mcast_queue(ill, mp);
}
/*
* Sends an MLD_V2_LISTENER_REPORT message out the passed-in ill. The
* report will contain one multicast address record for each element of
- * reclist. If this causes packet length to exceed ill->ill_max_frag,
+ * reclist. If this causes packet length to exceed ill->ill_mtu,
* multiple reports are sent. reclist is assumed to be made up of
* buffers allocated by mcast_bldmrec(), and those buffers are freed here.
*/
@@ -2706,19 +2512,17 @@ mldv2_sendrpt(ill_t *ill, mrec_t *reclist)
ip6_hbh_t *ip6hbh;
struct ip6_opt_router *ip6router;
size_t size, optlen, padlen, icmpsize, rsize;
- ipif_t *ipif;
int i, numrec, more_src_cnt;
mrec_t *rp, *cur_reclist;
mrec_t *next_reclist = reclist;
boolean_t morepkts;
- ASSERT(IAM_WRITER_ILL(ill));
-
/* If there aren't any records, there's nothing to send */
if (reclist == NULL)
return;
ASSERT(ill->ill_isv6);
+ ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock));
/*
* Total option length (optlen + padlen) must be a multiple of
@@ -2737,7 +2541,7 @@ nextpkt:
rp = rp->mrec_next, numrec++) {
rsize = sizeof (mld2mar_t) +
(rp->mrec_srcs.sl_numsrc * sizeof (in6_addr_t));
- if (size + rsize > ill->ill_max_frag) {
+ if (size + rsize > ill->ill_mtu) {
if (rp == cur_reclist) {
/*
* If the first mrec we looked at is too big
@@ -2748,7 +2552,7 @@ nextpkt:
* other types).
*/
int srcspace, srcsperpkt;
- srcspace = ill->ill_max_frag -
+ srcspace = ill->ill_mtu -
(size + sizeof (mld2mar_t));
/*
@@ -2819,14 +2623,7 @@ nextpkt:
ip6h->ip6_nxt = IPPROTO_HOPOPTS;
ip6h->ip6_hops = MLD_HOP_LIMIT;
ip6h->ip6_dst = ipv6_all_v2rtrs_mcast;
- /* ipif returned by ipif_lookup_zoneid is link-local (if present) */
- if (ipif_lookup_zoneid(ill, ALL_ZONES, IPIF_UP, &ipif)) {
- ip6h->ip6_src = ipif->ipif_v6src_addr;
- ipif_refrele(ipif);
- } else {
- /* otherwise, use IPv6 default address selection. */
- ip6h->ip6_src = ipv6_all_zeros;
- }
+ ip6h->ip6_src = ipv6_all_zeros;
ip6hbh->ip6h_nxt = IPPROTO_ICMPV6;
/*
@@ -2844,7 +2641,7 @@ nextpkt:
mld2r->mld2r_nummar = htons(numrec);
/*
* Prepare for the checksum by putting icmp length in the icmp
- * checksum field. The checksum is calculated in ip_wput_v6.
+ * checksum field. The checksum is calculated in ip_output_simple.
*/
mld2r->mld2r_cksum = htons(icmpsize);
@@ -2861,14 +2658,7 @@ nextpkt:
mld2mar = (mld2mar_t *)&(srcarray[i]);
}
- /*
- * ip_wput will automatically loopback the multicast packet to
- * the conn if multicast loopback is enabled.
- * The MIB stats corresponding to this outgoing MLD packet
- * will be accounted for in ip_wput->ip_wput_v6->ip_wput_ire_v6
- * ->icmp_update_out_mib_v6 function call.
- */
- (void) ip_output_v6(NULL, mp, ill->ill_wq, IP_WPUT);
+ ill_mcast_queue(ill, mp);
if (morepkts) {
if (more_src_cnt > 0) {
@@ -2997,7 +2787,7 @@ mcast_merge_rtx(ilm_t *ilm, mrec_t *mreclist, slist_t *flist)
mrec_t *rp, *rpnext, *rtnmrec;
boolean_t ovf;
- ill = (ilm->ilm_ill == NULL ? ilm->ilm_ipif->ipif_ill : ilm->ilm_ill);
+ ill = ilm->ilm_ill;
if (mreclist == NULL)
return (mreclist);
@@ -3100,64 +2890,3 @@ mcast_merge_rtx(ilm_t *ilm, mrec_t *mreclist, slist_t *flist)
return (rtnmrec);
}
-
-/*
- * Convenience routine to signal the restart-timer thread.
- */
-static void
-mcast_signal_restart_thread(ip_stack_t *ipst)
-{
- mutex_enter(&ipst->ips_mrt_lock);
- ipst->ips_mrt_flags |= IP_MRT_RUN;
- cv_signal(&ipst->ips_mrt_cv);
- mutex_exit(&ipst->ips_mrt_lock);
-}
-
-/*
- * Thread to restart IGMP/MLD timers. See the comment in igmp_joingroup() for
- * the story behind this unfortunate thread.
- */
-void
-mcast_restart_timers_thread(ip_stack_t *ipst)
-{
- int next;
- char name[64];
- callb_cpr_t cprinfo;
-
- (void) snprintf(name, sizeof (name), "mcast_restart_timers_thread_%d",
- ipst->ips_netstack->netstack_stackid);
- CALLB_CPR_INIT(&cprinfo, &ipst->ips_mrt_lock, callb_generic_cpr, name);
-
- for (;;) {
- mutex_enter(&ipst->ips_mrt_lock);
- while (!(ipst->ips_mrt_flags & (IP_MRT_STOP|IP_MRT_RUN))) {
- CALLB_CPR_SAFE_BEGIN(&cprinfo);
- cv_wait(&ipst->ips_mrt_cv, &ipst->ips_mrt_lock);
- CALLB_CPR_SAFE_END(&cprinfo, &ipst->ips_mrt_lock);
- }
- if (ipst->ips_mrt_flags & IP_MRT_STOP)
- break;
- ipst->ips_mrt_flags &= ~IP_MRT_RUN;
- mutex_exit(&ipst->ips_mrt_lock);
-
- mutex_enter(&ipst->ips_igmp_timer_lock);
- next = ipst->ips_igmp_deferred_next;
- ipst->ips_igmp_deferred_next = INFINITY;
- mutex_exit(&ipst->ips_igmp_timer_lock);
-
- if (next != INFINITY)
- igmp_start_timers(next, ipst);
-
- mutex_enter(&ipst->ips_mld_timer_lock);
- next = ipst->ips_mld_deferred_next;
- ipst->ips_mld_deferred_next = INFINITY;
- mutex_exit(&ipst->ips_mld_timer_lock);
- if (next != INFINITY)
- mld_start_timers(next, ipst);
- }
-
- ipst->ips_mrt_flags |= IP_MRT_DONE;
- cv_signal(&ipst->ips_mrt_done_cv);
- CALLB_CPR_EXIT(&cprinfo); /* drops ips_mrt_lock */
- thread_exit();
-}
diff --git a/usr/src/uts/common/inet/ip/ip.c b/usr/src/uts/common/inet/ip/ip.c
index ebb89e3172..b59087e9b1 100644
--- a/usr/src/uts/common/inet/ip/ip.c
+++ b/usr/src/uts/common/inet/ip/ip.c
@@ -38,6 +38,7 @@
#include <sys/tihdr.h>
#include <sys/xti_inet.h>
#include <sys/ddi.h>
+#include <sys/suntpi.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/kobj.h>
@@ -94,10 +95,8 @@
#include <inet/ipp_common.h>
#include <net/pfkeyv2.h>
-#include <inet/ipsec_info.h>
#include <inet/sadb.h>
#include <inet/ipsec_impl.h>
-#include <sys/iphada.h>
#include <inet/iptun/iptun_impl.h>
#include <inet/ipdrop.h>
#include <inet/ip_netinfo.h>
@@ -111,9 +110,7 @@
#include <ipp/ipp_impl.h>
#include <ipp/ipgpc/ipgpc.h>
-#include <sys/multidata.h>
#include <sys/pattr.h>
-
#include <inet/ipclassifier.h>
#include <inet/sctp_ip.h>
#include <inet/sctp/sctp_impl.h>
@@ -126,6 +123,7 @@
#include <rpc/pmap_prot.h>
#include <sys/squeue_impl.h>
+#include <inet/ip_arp.h>
/*
* Values for squeue switch:
@@ -133,10 +131,9 @@
* IP_SQUEUE_ENTER: SQ_PROCESS
* IP_SQUEUE_FILL: SQ_FILL
*/
-int ip_squeue_enter = 2; /* Setable in /etc/system */
+int ip_squeue_enter = IP_SQUEUE_ENTER; /* Setable in /etc/system */
int ip_squeue_flag;
-#define SET_BPREV_FLAG(x) ((mblk_t *)(uintptr_t)(x))
/*
* Setable in /etc/system
@@ -177,7 +174,8 @@ typedef struct iproutedata_s {
listptr_t ird_attrs; /* ipRouteAttributeTable */
} iproutedata_t;
-#define IRD_REPORT_TESTHIDDEN 0x01 /* include IRE_MARK_TESTHIDDEN routes */
+/* Include ire_testhidden and IRE_IF_CLONE routes */
+#define IRD_REPORT_ALL 0x01
/*
* Cluster specific hooks. These should be NULL when booted as a non-cluster
@@ -233,29 +231,26 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t,
* MT level protection given by STREAMS. IP uses a combination of its own
* internal serialization mechanism and standard Solaris locking techniques.
* The internal serialization is per phyint. This is used to serialize
- * plumbing operations, certain multicast operations, most set ioctls,
- * igmp/mld timers etc.
+ * plumbing operations, IPMP operations, most set ioctls, etc.
*
* Plumbing is a long sequence of operations involving message
* exchanges between IP, ARP and device drivers. Many set ioctls are typically
* involved in plumbing operations. A natural model is to serialize these
* ioctls one per ill. For example plumbing of hme0 and qfe0 can go on in
* parallel without any interference. But various set ioctls on hme0 are best
- * serialized, along with multicast join/leave operations, igmp/mld timer
- * operations, and processing of DLPI control messages received from drivers
- * on a per phyint basis. This serialization is provided by the ipsq_t and
- * primitives operating on this. Details can be found in ip_if.c above the
- * core primitives operating on ipsq_t.
+ * serialized, along with IPMP operations and processing of DLPI control
+ * messages received from drivers on a per phyint basis. This serialization is
+ * provided by the ipsq_t and primitives operating on this. Details can
+ * be found in ip_if.c above the core primitives operating on ipsq_t.
*
* Lookups of an ipif or ill by a thread return a refheld ipif / ill.
* Simiarly lookup of an ire by a thread also returns a refheld ire.
* In addition ipif's and ill's referenced by the ire are also indirectly
- * refheld. Thus no ipif or ill can vanish nor can critical parameters like
- * the ipif's address or netmask change as long as an ipif is refheld
+ * refheld. Thus no ipif or ill can vanish as long as an ipif is refheld
* directly or indirectly. For example an SIOCSLIFADDR ioctl that changes the
* address of an ipif has to go through the ipsq_t. This ensures that only
- * 1 such exclusive operation proceeds at any time on the ipif. It then
- * deletes all ires associated with this ipif, and waits for all refcnts
+ * one such exclusive operation proceeds at any time on the ipif. It then
+ * waits for all refcnts
* associated with this ipif to come down to zero. The address is changed
* only after the ipif has been quiesced. Then the ipif is brought up again.
* More details are described above the comment in ip_sioctl_flags.
@@ -274,7 +269,7 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t,
* - ire_lock to protect some of the fields of the ire, IRE tables
* (one lock per hash bucket). Refer to ip_ire.c for details.
*
- * - ndp_g_lock and nce_lock for protecting NCEs.
+ * - ndp_g_lock and ncec_lock for protecting NCEs.
*
* - ill_lock protects fields of the ill and ipif. Details in ip.h
*
@@ -312,12 +307,6 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t,
* This lock is held in ipif_up_done and the ipif is marked IPIF_UP and the
* uniqueness check also done atomically.
*
- * - ipsec_capab_ills_lock: This readers/writer lock protects the global
- * lists of IPsec capable ills (ipsec_capab_ills_{ah,esp}). It is taken
- * as a writer when adding or deleting elements from these lists, and
- * as a reader when walking these lists to send a SADB update to the
- * IPsec capable ills.
- *
* - ill_g_usesrc_lock: This readers/writer lock protects the usesrc
* group list linked by ill_usesrc_grp_next. It also protects the
* ill_usesrc_ifindex field. It is taken as a writer when a member of the
@@ -357,20 +346,30 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t,
*
* ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock -> ipx_lock
* ill_g_lock -> ill_lock(s) -> phyint_lock
- * ill_g_lock -> ndp_g_lock -> ill_lock -> nce_lock
+ * ill_g_lock -> ndp_g_lock -> ill_lock -> ncec_lock
* ill_g_lock -> ip_addr_avail_lock
* conn_lock -> irb_lock -> ill_lock -> ire_lock
* ill_g_lock -> ip_g_nd_lock
+ * ill_g_lock -> ips_ipmp_lock -> ill_lock -> nce_lock
+ * ill_g_lock -> ndp_g_lock -> ill_lock -> ncec_lock -> nce_lock
+ * arl_lock -> ill_lock
+ * ips_ire_dep_lock -> irb_lock
*
* When more than 1 ill lock is needed to be held, all ill lock addresses
* are sorted on address and locked starting from highest addressed lock
* downward.
*
+ * Multicast scenarios
+ * ips_ill_g_lock -> ill_mcast_lock
+ * conn_ilg_lock -> ips_ill_g_lock -> ill_lock
+ * ill_mcast_serializer -> ill_mcast_lock -> ips_ipmp_lock -> ill_lock
+ * ill_mcast_serializer -> ill_mcast_lock -> connf_lock -> conn_lock
+ * ill_mcast_serializer -> ill_mcast_lock -> conn_ilg_lock
+ * ill_mcast_serializer -> ill_mcast_lock -> ips_igmp_timer_lock
+ *
* IPsec scenarios
*
* ipsa_lock -> ill_g_lock -> ill_lock
- * ipsec_capab_ills_lock -> ill_g_lock -> ill_lock
- * ipsec_capab_ills_lock -> ipsa_lock
* ill_g_usesrc_lock -> ill_g_lock -> ill_lock
*
* Trusted Solaris scenarios
@@ -414,31 +413,30 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t,
* Walker - Increment irb_refcnt before calling the walker callback. Hold the
* global tree lock (read mode) for traversal.
*
+ * IRE dependencies - In some cases we hold ips_ire_dep_lock across ire_refrele
+ * hence we will acquire irb_lock while holding ips_ire_dep_lock.
+ *
* IPsec notes :
*
- * IP interacts with the IPsec code (AH/ESP) by tagging a M_CTL message
- * in front of the actual packet. For outbound datagrams, the M_CTL
- * contains a ipsec_out_t (defined in ipsec_info.h), which has the
+ * IP interacts with the IPsec code (AH/ESP) by storing IPsec attributes
+ * in the ip_xmit_attr_t and ip_recv_attr_t. For outbound datagrams, the
+ * ip_xmit_attr_t has the
* information used by the IPsec code for applying the right level of
- * protection. The information initialized by IP in the ipsec_out_t
+ * protection. The information initialized by IP in the ip_xmit_attr_t
* is determined by the per-socket policy or global policy in the system.
- * For inbound datagrams, the M_CTL contains a ipsec_in_t (defined in
- * ipsec_info.h) which starts out with nothing in it. It gets filled
+ * For inbound datagrams, the ip_recv_attr_t
+ * starts out with nothing in it. It gets filled
* with the right information if it goes through the AH/ESP code, which
* happens if the incoming packet is secure. The information initialized
- * by AH/ESP, is later used by IP(during fanouts to ULP) to see whether
+ * by AH/ESP, is later used by IP (during fanouts to ULP) to see whether
* the policy requirements needed by per-socket policy or global policy
* is met or not.
*
- * If there is both per-socket policy (set using setsockopt) and there
- * is also global policy match for the 5 tuples of the socket,
- * ipsec_override_policy() makes the decision of which one to use.
- *
* For fully connected sockets i.e dst, src [addr, port] is known,
* conn_policy_cached is set indicating that policy has been cached.
* conn_in_enforce_policy may or may not be set depending on whether
* there is a global policy match or per-socket policy match.
- * Policy inheriting happpens in ip_bind during the ipa_conn_t bind.
+ * Policy inheriting happens in ip_policy_set once the destination is known.
* Once the right policy is set on the conn_t, policy cannot change for
* this socket. This makes life simpler for TCP (UDP ?) where
* re-transmissions go out with the same policy. For symmetry, policy
@@ -513,7 +511,8 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t,
* idl_tx_list in ips_idl_tx_list[] array. Then conn_drain_insert() is
* called passing idl_tx_list. The connp gets inserted in a drain list
* pointed to by idl_tx_list. conn_drain_list() asserts flow control for
- * the sockets (non stream based) and sets QFULL condition for conn_wq.
+ * the sockets (non stream based) and sets QFULL condition on the conn_wq
+ * of streams sockets, or the su_txqfull for non-streams sockets.
* connp->conn_direct_blocked will be set to indicate the blocked
* condition.
*
@@ -521,46 +520,37 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t,
* A cookie is passed in the call to ill_flow_enable() that identifies the
* blocked Tx ring. This cookie is used to get to the idl_tx_list that
* contains the blocked connp's. conn_walk_drain() uses the idl_tx_list_t
- * and goes through each of the drain list (q)enabling the conn_wq of the
- * first conn in each of the drain list. This causes ip_wsrv to run for the
+ * and goes through each conn in the drain list and calls conn_idl_remove
+ * for the conn to clear the qfull condition for the conn, as well as to
+ * remove the conn from the idl list. In addition, streams based sockets
+ * will have the conn_wq enabled, causing ip_wsrv to run for the
* conn. ip_wsrv drains the queued messages, and removes the conn from the
- * drain list, if all messages were drained. It also qenables the next conn
- * in the drain list to continue the drain process.
+ * drain list, if all messages were drained. It also notifies the
+ * conn_upcalls for the conn to signal that flow-control has opened up.
*
* In reality the drain list is not a single list, but a configurable number
- * of lists. conn_drain_walk() in the IP module, qenables the first conn in
- * each list. If the ip_wsrv of the next qenabled conn does not run, because
- * the stream closes, ip_close takes responsibility to qenable the next conn
- * in the drain list. conn_drain_insert and conn_drain_tail are the only
+ * of lists. conn_walk_drain() in the IP module, notifies the conn_upcalls for
+ * each conn in the list. conn_drain_insert and conn_drain_tail are the only
* functions that manipulate this drain list. conn_drain_insert is called in
- * ip_wput context itself (as opposed to from ip_wsrv context for STREAMS
+ * the protocol layer when conn_ip_output returns EWOULDBLOCK.
+ * (as opposed to from ip_wsrv context for STREAMS
* case -- see below). The synchronization between drain insertion and flow
* control wakeup is handled by using idl_txl->txl_lock.
*
* Flow control using STREAMS:
* When ILL_DIRECT_CAPABLE() is not TRUE, STREAMS flow control mechanism
* is used. On the send side, if the packet cannot be sent down to the
- * driver by IP, because of a canput failure, IP does a putq on the conn_wq.
- * This will cause ip_wsrv to run on the conn_wq. ip_wsrv in turn, inserts
- * the conn in a list of conn's that need to be drained when the flow
- * control condition subsides. The blocked connps are put in first member
- * of ips_idl_tx_list[] array. Ultimately STREAMS backenables the ip_wsrv
- * on the IP module. It calls conn_walk_drain() passing ips_idl_tx_list[0].
- * ips_idl_tx_list[0] contains the drain lists of blocked conns. The
- * conn_wq of the first conn in the drain lists is (q)enabled to run.
- * ip_wsrv on this conn drains the queued messages, and removes the conn
- * from the drain list, if all messages were drained. It also qenables the
- * next conn in the drain list to continue the drain process.
- *
- * If the ip_wsrv of the next qenabled conn does not run, because the
- * stream closes, ip_close takes responsibility to qenable the next conn in
- * the drain list. The directly called ip_wput path always does a putq, if
- * it cannot putnext. Thus synchronization problems are handled between
- * ip_wsrv and ip_close. conn_drain_insert and conn_drain_tail are the only
- * functions that manipulate this drain list. Furthermore conn_drain_insert
- * is called only from ip_wsrv for the STREAMS case, and there can be only 1
- * instance of ip_wsrv running on a queue at any time. conn_drain_tail can
- * be simultaneously called from both ip_wsrv and ip_close.
+ * driver by IP, because of a canput failure, ip_xmit drops the packet
+ * and returns EWOULDBLOCK to the caller, who may then invoke
+ * ixa_check_drain_insert to insert the conn on the 0'th drain list.
+ * When ip_wsrv runs on the ill_wq because flow control has been relieved, the
+ * blocked conns in the * 0'th drain list is drained as with the
+ * non-STREAMS case.
+ *
+ * In both the STREAMS and non-STREAMS case, the sockfs upcall to set
+ * qfull is done when the conn is inserted into the drain list
+ * (conn_drain_insert()) and cleared when the conn is removed from the drain
+ * list (conn_idl_remove()).
*
* IPQOS notes:
*
@@ -579,14 +569,13 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t,
* By default all the callout positions are enabled.
*
* Outbound (local_out)
- * Hooks are placed in ip_wput_ire and ipsec_out_process.
+ * Hooks are placed in ire_send_wire_v4 and ire_send_wire_v6.
*
* Inbound (local_in)
- * Hooks are placed in ip_proto_input, icmp_inbound, ip_fanout_proto and
- * TCP and UDP fanout routines.
+ * Hooks are placed in ip_fanout_v4 and ip_fanout_v6.
*
* Forwarding (in and out)
- * Hooks are placed in ip_rput_forward.
+ * Hooks are placed in ire_recv_forward_v4/v6.
*
* IP Policy Framework processing (IPPF processing)
* Policy processing for a packet is initiated by ip_process, which ascertains
@@ -596,16 +585,6 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t,
* filters configured in ipgpc and resumes normal IP processing thereafter.
* An action instance can drop a packet in course of its processing.
*
- * A boolean variable, ip_policy, is used in all the fanout routines that can
- * invoke ip_process for a packet. This variable indicates if the packet should
- * to be sent for policy processing. The variable is set to B_TRUE by default,
- * i.e. when the routines are invoked in the normal ip procesing path for a
- * packet. The two exceptions being ip_wput_local and icmp_inbound_error_fanout;
- * ip_policy is set to B_FALSE for all the routines called in these two
- * functions because, in the former case, we don't process loopback traffic
- * currently while in the latter, the packets have already been processed in
- * icmp_inbound.
- *
* Zones notes:
*
* The partitioning rules for networking are as follows:
@@ -638,24 +617,18 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t,
* IRE_LOCAL Exclusive (x)
* IRE_LOOPBACK Exclusive
* IRE_PREFIX (net routes) Shared (*)
- * IRE_CACHE Exclusive
* IRE_IF_NORESOLVER (interface routes) Exclusive
* IRE_IF_RESOLVER (interface routes) Exclusive
+ * IRE_IF_CLONE (interface routes) Exclusive
* IRE_HOST (host routes) Shared (*)
*
* (*) A zone can only use a default or off-subnet route if the gateway is
* directly reachable from the zone, that is, if the gateway's address matches
* one of the zone's logical interfaces.
*
- * (x) IRE_LOCAL are handled a bit differently, since for all other entries
- * in ire_ctable and IRE_INTERFACE, ire_src_addr is what can be used as source
- * when sending packets using the IRE. For IRE_LOCAL ire_src_addr is the IP
- * address of the zone itself (the destination). Since IRE_LOCAL is used
- * for communication between zones, ip_wput_ire has special logic to set
- * the right source address when sending using an IRE_LOCAL.
- *
- * Furthermore, when ip_restrict_interzone_loopback is set (the default),
- * ire_cache_lookup restricts loopback using an IRE_LOCAL
+ * (x) IRE_LOCAL are handled a bit differently.
+ * When ip_restrict_interzone_loopback is set (the default),
+ * ire_route_recursive restricts loopback using an IRE_LOCAL
* between zone to the case when L2 would have conceptually looped the packet
* back, i.e. the loopback which is required since neither Ethernet drivers
* nor Ethernet hardware loops them back. This is the case when the normal
@@ -669,17 +642,11 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t,
* since some zones may not be on the 10.16.72/24 network. To handle this, each
* zone has its own set of IRE_BROADCAST entries; then, broadcast packets are
* sent to every zone that has an IRE_BROADCAST entry for the destination
- * address on the input ill, see conn_wantpacket().
+ * address on the input ill, see ip_input_broadcast().
*
* Applications in different zones can join the same multicast group address.
- * For IPv4, group memberships are per-logical interface, so they're already
- * inherently part of a zone. For IPv6, group memberships are per-physical
- * interface, so we distinguish IPv6 group memberships based on group address,
- * interface and zoneid. In both cases, received multicast packets are sent to
- * every zone for which a group membership entry exists. On IPv6 we need to
- * check that the target zone still has an address on the receiving physical
- * interface; it could have been removed since the application issued the
- * IPV6_JOIN_GROUP.
+ * The same logic applies for multicast as for broadcast. ip_input_multicast
+ * dispatches packets to all zones that have members on the physical interface.
*/
/*
@@ -694,62 +661,37 @@ boolean_t ip_squeue_fanout = 0;
*/
uint_t ip_max_frag_dups = 10;
-#define IS_SIMPLE_IPH(ipha) \
- ((ipha)->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION)
-
/* RFC 1122 Conformance */
#define IP_FORWARD_DEFAULT IP_FORWARD_NEVER
#define ILL_MAX_NAMELEN LIFNAMSIZ
-static int conn_set_held_ipif(conn_t *, ipif_t **, ipif_t *);
-
static int ip_open(queue_t *q, dev_t *devp, int flag, int sflag,
cred_t *credp, boolean_t isv6);
-static mblk_t *ip_wput_attach_llhdr(mblk_t *, ire_t *, ip_proc_t, uint32_t,
- ipha_t **);
+static mblk_t *ip_xmit_attach_llhdr(mblk_t *, nce_t *);
-static void icmp_frag_needed(queue_t *, mblk_t *, int, zoneid_t,
- ip_stack_t *);
-static void icmp_inbound(queue_t *, mblk_t *, boolean_t, ill_t *, int,
- uint32_t, boolean_t, boolean_t, ill_t *, zoneid_t);
-static ipaddr_t icmp_get_nexthop_addr(ipha_t *, ill_t *, zoneid_t, mblk_t *mp);
-static boolean_t icmp_inbound_too_big(icmph_t *, ipha_t *, ill_t *, zoneid_t,
- mblk_t *, int, ip_stack_t *);
-static void icmp_inbound_error_fanout(queue_t *, ill_t *, mblk_t *,
- icmph_t *, ipha_t *, int, int, boolean_t, boolean_t,
- ill_t *, zoneid_t);
+static boolean_t icmp_inbound_verify_v4(mblk_t *, icmph_t *, ip_recv_attr_t *);
+static void icmp_inbound_too_big_v4(icmph_t *, ip_recv_attr_t *);
+static void icmp_inbound_error_fanout_v4(mblk_t *, icmph_t *,
+ ip_recv_attr_t *);
static void icmp_options_update(ipha_t *);
-static void icmp_param_problem(queue_t *, mblk_t *, uint8_t, zoneid_t,
- ip_stack_t *);
-static void icmp_pkt(queue_t *, mblk_t *, void *, size_t, boolean_t,
- zoneid_t zoneid, ip_stack_t *);
-static mblk_t *icmp_pkt_err_ok(mblk_t *, ip_stack_t *);
-static void icmp_redirect(ill_t *, mblk_t *);
-static void icmp_send_redirect(queue_t *, mblk_t *, ipaddr_t,
- ip_stack_t *);
+static void icmp_param_problem(mblk_t *, uint8_t, ip_recv_attr_t *);
+static void icmp_pkt(mblk_t *, void *, size_t, ip_recv_attr_t *);
+static mblk_t *icmp_pkt_err_ok(mblk_t *, ip_recv_attr_t *);
+static void icmp_redirect_v4(mblk_t *mp, ipha_t *, icmph_t *,
+ ip_recv_attr_t *);
+static void icmp_send_redirect(mblk_t *, ipaddr_t, ip_recv_attr_t *);
+static void icmp_send_reply_v4(mblk_t *, ipha_t *, icmph_t *,
+ ip_recv_attr_t *);
-static void ip_arp_news(queue_t *, mblk_t *);
-static boolean_t ip_bind_get_ire_v4(mblk_t **, ire_t *, iulp_t *, ip_stack_t *);
mblk_t *ip_dlpi_alloc(size_t, t_uscalar_t);
char *ip_dot_addr(ipaddr_t, char *);
mblk_t *ip_carve_mp(mblk_t **, ssize_t);
int ip_close(queue_t *, int);
static char *ip_dot_saddr(uchar_t *, char *);
-static void ip_fanout_proto(queue_t *, mblk_t *, ill_t *, ipha_t *, uint_t,
- boolean_t, boolean_t, ill_t *, zoneid_t);
-static void ip_fanout_tcp(queue_t *, mblk_t *, ill_t *, ipha_t *, uint_t,
- boolean_t, boolean_t, zoneid_t);
-static void ip_fanout_udp(queue_t *, mblk_t *, ill_t *, ipha_t *, uint32_t,
- boolean_t, uint_t, boolean_t, boolean_t, ill_t *, zoneid_t);
static void ip_lrput(queue_t *, mblk_t *);
ipaddr_t ip_net_mask(ipaddr_t);
-void ip_newroute(queue_t *, mblk_t *, ipaddr_t, conn_t *, zoneid_t,
- ip_stack_t *);
-static void ip_newroute_ipif(queue_t *, mblk_t *, ipif_t *, ipaddr_t,
- conn_t *, uint32_t, zoneid_t, ip_opt_info_t *);
char *ip_nv_lookup(nv_t *, int);
-static boolean_t ip_check_for_ipsec_opt(queue_t *, mblk_t *);
static int ip_param_get(queue_t *, mblk_t *, caddr_t, cred_t *);
static int ip_param_generic_get(queue_t *, mblk_t *, caddr_t, cred_t *);
static boolean_t ip_param_register(IDP *ndp, ipparam_t *, size_t,
@@ -758,17 +700,6 @@ static int ip_param_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *);
void ip_rput(queue_t *, mblk_t *);
static void ip_rput_dlpi_writer(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp,
void *dummy_arg);
-void ip_rput_forward(ire_t *, ipha_t *, mblk_t *, ill_t *);
-static int ip_rput_forward_options(mblk_t *, ipha_t *, ire_t *,
- ip_stack_t *);
-static boolean_t ip_rput_local_options(queue_t *, mblk_t *, ipha_t *,
- ire_t *, ip_stack_t *);
-static boolean_t ip_rput_multimblk_ipoptions(queue_t *, ill_t *,
- mblk_t *, ipha_t **, ipaddr_t *, ip_stack_t *);
-static int ip_rput_options(queue_t *, mblk_t *, ipha_t *, ipaddr_t *,
- ip_stack_t *);
-static boolean_t ip_rput_fragment(ill_t *, ill_t *, mblk_t **, ipha_t *,
- uint32_t *, uint16_t *);
int ip_snmp_get(queue_t *, mblk_t *, int);
static mblk_t *ip_snmp_get_mib2_ip(queue_t *, mblk_t *,
mib2_ipIfStatsEntry_t *, ip_stack_t *);
@@ -801,49 +732,34 @@ static mblk_t *ip_snmp_get_mib2_ip6_route_media(queue_t *, mblk_t *, int,
ip_stack_t *ipst);
static void ip_snmp_get2_v4(ire_t *, iproutedata_t *);
static void ip_snmp_get2_v6_route(ire_t *, iproutedata_t *);
-static int ip_snmp_get2_v6_media(nce_t *, iproutedata_t *);
+static int ip_snmp_get2_v4_media(ncec_t *, iproutedata_t *);
+static int ip_snmp_get2_v6_media(ncec_t *, iproutedata_t *);
int ip_snmp_set(queue_t *, int, int, uchar_t *, int);
-static boolean_t ip_source_routed(ipha_t *, ip_stack_t *);
-static boolean_t ip_source_route_included(ipha_t *);
-static void ip_trash_ire_reclaim_stack(ip_stack_t *);
-static void ip_wput_frag(ire_t *, mblk_t *, ip_pkt_t, uint32_t, uint32_t,
- zoneid_t, ip_stack_t *, conn_t *);
-static mblk_t *ip_wput_frag_copyhdr(uchar_t *, int, int, ip_stack_t *,
+static mblk_t *ip_fragment_copyhdr(uchar_t *, int, int, ip_stack_t *,
mblk_t *);
-static void ip_wput_local_options(ipha_t *, ip_stack_t *);
-static int ip_wput_options(queue_t *, mblk_t *, ipha_t *, boolean_t,
- zoneid_t, ip_stack_t *);
static void conn_drain_init(ip_stack_t *);
static void conn_drain_fini(ip_stack_t *);
static void conn_drain_tail(conn_t *connp, boolean_t closing);
static void conn_walk_drain(ip_stack_t *, idl_tx_list_t *);
-static void conn_setqfull(conn_t *);
-static void conn_clrqfull(conn_t *);
+static void conn_walk_sctp(pfv_t, void *, zoneid_t, netstack_t *);
static void *ip_stack_init(netstackid_t stackid, netstack_t *ns);
static void ip_stack_shutdown(netstackid_t stackid, void *arg);
static void ip_stack_fini(netstackid_t stackid, void *arg);
-static boolean_t conn_wantpacket(conn_t *, ill_t *, ipha_t *, int,
- zoneid_t);
-static void ip_arp_done(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp,
- void *dummy_arg);
-
static int ip_forward_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *);
static int ip_multirt_apply_membership(int (*fn)(conn_t *, boolean_t,
- ipaddr_t, ipaddr_t, uint_t *, mcast_record_t, ipaddr_t, mblk_t *), ire_t *,
- conn_t *, boolean_t, ipaddr_t, mcast_record_t, ipaddr_t, mblk_t *);
-static void ip_multirt_bad_mtu(ire_t *, uint32_t);
+ const in6_addr_t *, ipaddr_t, uint_t, mcast_record_t, const in6_addr_t *),
+ ire_t *, conn_t *, boolean_t, const in6_addr_t *, mcast_record_t,
+ const in6_addr_t *);
static int ip_cgtp_filter_get(queue_t *, mblk_t *, caddr_t, cred_t *);
static int ip_cgtp_filter_set(queue_t *, mblk_t *, char *,
caddr_t, cred_t *);
-extern int ip_helper_stream_setup(queue_t *, dev_t *, int, int,
- cred_t *, boolean_t);
static int ip_input_proc_set(queue_t *q, mblk_t *mp, char *value,
caddr_t cp, cred_t *cr);
static int ip_int_set(queue_t *, mblk_t *, char *, caddr_t,
@@ -859,30 +775,15 @@ static int icmp_kstat_update(kstat_t *kp, int rw);
static void *ip_kstat2_init(netstackid_t, ip_stat_t *);
static void ip_kstat2_fini(netstackid_t, kstat_t *);
-static mblk_t *ip_tcp_input(mblk_t *, ipha_t *, ill_t *, boolean_t,
- ire_t *, mblk_t *, uint_t, queue_t *, ill_rx_ring_t *);
+static void ipobs_init(ip_stack_t *);
+static void ipobs_fini(ip_stack_t *);
-static void ip_rput_process_forward(queue_t *, mblk_t *, ire_t *,
- ipha_t *, ill_t *, boolean_t, boolean_t);
-
-static void ipobs_init(ip_stack_t *);
-static void ipobs_fini(ip_stack_t *);
ipaddr_t ip_g_all_ones = IP_HOST_MASK;
/* How long, in seconds, we allow frags to hang around. */
#define IP_FRAG_TIMEOUT 15
#define IPV6_FRAG_TIMEOUT 60
-/*
- * Threshold which determines whether MDT should be used when
- * generating IP fragments; payload size must be greater than
- * this threshold for MDT to take place.
- */
-#define IP_WPUT_FRAG_MDT_MIN 32768
-
-/* Setable in /etc/system only */
-int ip_wput_frag_mdt_min = IP_WPUT_FRAG_MDT_MIN;
-
static long ip_rput_pullups;
int dohwcksum = 1; /* use h/w cksum if supported by the hardware */
@@ -891,24 +792,12 @@ vmem_t *ip_minor_arena_la; /* for minor nos. from 2^^18 thru 2^^32-1 */
int ip_debug;
-#ifdef DEBUG
-uint32_t ipsechw_debug = 0;
-#endif
-
/*
* Multirouting/CGTP stuff
*/
int ip_cgtp_filter_rev = CGTP_FILTER_REV; /* CGTP hooks version */
/*
- * XXX following really should only be in a header. Would need more
- * header and .c clean up first.
- */
-extern optdb_obj_t ip_opt_obj;
-
-ulong_t ip_squeue_enter_unbound = 0;
-
-/*
* Named Dispatch Parameter Table.
* All of these are alterable, within the min/max values given, at run time.
*/
@@ -922,18 +811,18 @@ static ipparam_t lcl_param_arr[] = {
{ 0, 1, 1, "ip_send_redirects"},
{ 0, 1, 0, "ip_forward_directed_broadcasts"},
{ 0, 10, 0, "ip_mrtdebug"},
- { 5000, 999999999, 60000, "ip_ire_timer_interval" },
- { 60000, 999999999, 1200000, "ip_ire_arp_interval" },
- { 60000, 999999999, 60000, "ip_ire_redirect_interval" },
+ { 1, 8, 3, "ip_ire_reclaim_fraction" },
+ { 1, 8, 3, "ip_nce_reclaim_fraction" },
+ { 1, 8, 3, "ip_dce_reclaim_fraction" },
{ 1, 255, 255, "ip_def_ttl" },
{ 0, 1, 0, "ip_forward_src_routed"},
{ 0, 256, 32, "ip_wroff_extra" },
- { 5000, 999999999, 600000, "ip_ire_pathmtu_interval" },
+ { 2, 999999999, 60*20, "ip_pathmtu_interval" }, /* In seconds */
{ 8, 65536, 64, "ip_icmp_return_data_bytes" },
{ 0, 1, 1, "ip_path_mtu_discovery" },
- { 0, 240, 30, "ip_ignore_delete_time" },
+ { 68, 65535, 576, "ip_pmtu_min" },
{ 0, 1, 0, "ip_ignore_redirect" },
- { 0, 1, 1, "ip_output_queue" },
+ { 0, 1, 0, "ip_arp_icmp_error" },
{ 1, 254, 1, "ip_broadcast_ttl" },
{ 0, 99999, 100, "ip_icmp_err_interval" },
{ 1, 99999, 10, "ip_icmp_err_burst" },
@@ -955,7 +844,7 @@ static ipparam_t lcl_param_arr[] = {
{ 0, 1, 0, "ip6_ignore_redirect" },
{ 0, 1, 0, "ip6_strict_dst_multihoming" },
- { 1, 8, 3, "ip_ire_reclaim_fraction" },
+ { 0, 2, 2, "ip_src_check" },
{ 0, 999999, 1000, "ipsec_policy_log_interval" },
@@ -964,12 +853,16 @@ static ipparam_t lcl_param_arr[] = {
{ 1, 20, 3, "ip_ndp_unsolicit_count" },
{ 0, 1, 1, "ip6_ignore_home_address_opt" },
{ 0, 15, 0, "ip_policy_mask" },
- { 1000, 60000, 1000, "ip_multirt_resolution_interval" },
+ { 0, 2, 2, "ip_ecmp_behavior" },
{ 0, 255, 1, "ip_multirt_ttl" },
- { 0, 1, 1, "ip_multidata_outbound" },
- { 0, 3600000, 300000, "ip_ndp_defense_interval" },
+ { 0, 3600, 60, "ip_ire_badcnt_lifetime" }, /* In seconds */
{ 0, 999999, 60*60*24, "ip_max_temp_idle" },
{ 0, 1000, 1, "ip_max_temp_defend" },
+ /*
+ * when a conflict of an active address is detected,
+ * defend up to ip_max_defend times, within any
+ * ip_defend_interval span.
+ */
{ 0, 1000, 3, "ip_max_defend" },
{ 0, 999999, 30, "ip_defend_interval" },
{ 0, 3600000, 300000, "ip_dup_recovery" },
@@ -977,12 +870,45 @@ static ipparam_t lcl_param_arr[] = {
{ 0, 1, 1, "ip_lso_outbound" },
{ IGMP_V1_ROUTER, IGMP_V3_ROUTER, IGMP_V3_ROUTER, "igmp_max_version" },
{ MLD_V1_ROUTER, MLD_V2_ROUTER, MLD_V2_ROUTER, "mld_max_version" },
- { 68, 65535, 576, "ip_pmtu_min" },
#ifdef DEBUG
{ 0, 1, 0, "ip6_drop_inbound_icmpv6" },
#else
{ 0, 0, 0, "" },
#endif
+ /* delay before sending first probe: */
+ { 0, 20000, 1000, "arp_probe_delay" },
+ { 0, 20000, 100, "arp_fastprobe_delay" },
+ /* interval at which DAD probes are sent: */
+ { 10, 20000, 1500, "arp_probe_interval" },
+ { 10, 20000, 150, "arp_fastprobe_interval" },
+ /* setting probe count to 0 will disable ARP probing for DAD. */
+ { 0, 20, 3, "arp_probe_count" },
+ { 0, 20, 3, "arp_fastprobe_count" },
+
+ { 0, 3600000, 15000, "ipv4_dad_announce_interval"},
+ { 0, 3600000, 15000, "ipv6_dad_announce_interval"},
+ /*
+ * Rate limiting parameters for DAD defense used in
+ * ill_defend_rate_limit():
+ * defend_rate : pkts/hour permitted
+ * defend_interval : time that can elapse before we send out a
+ * DAD defense.
+ * defend_period: denominator for defend_rate (in seconds).
+ */
+ { 0, 3600000, 300000, "arp_defend_interval"},
+ { 0, 20000, 100, "arp_defend_rate"},
+ { 0, 3600000, 300000, "ndp_defend_interval"},
+ { 0, 20000, 100, "ndp_defend_rate"},
+ { 5, 86400, 3600, "arp_defend_period"},
+ { 5, 86400, 3600, "ndp_defend_period"},
+ { 0, 1, 1, "ipv4_icmp_return_pmtu" },
+ { 0, 1, 1, "ipv6_icmp_return_pmtu" },
+ /*
+ * publish count/interval values used to announce local addresses
+ * for IPv4, IPv6.
+ */
+ { 1, 20, 5, "ip_arp_publish_count" },
+ { 1000, 20000, 2000, "ip_arp_publish_interval" },
};
/*
@@ -1336,11 +1262,11 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = {
ip_sioctl_get_lifsrcof, NULL },
/* 178 */ { SIOCGMSFILTER, sizeof (struct group_filter), IPI_GET_CMD,
MSFILT_CMD, ip_sioctl_msfilter, NULL },
- /* 179 */ { SIOCSMSFILTER, sizeof (struct group_filter), IPI_WR,
+ /* 179 */ { SIOCSMSFILTER, sizeof (struct group_filter), 0,
MSFILT_CMD, ip_sioctl_msfilter, NULL },
/* 180 */ { SIOCGIPMSFILTER, sizeof (struct ip_msfilter), IPI_GET_CMD,
MSFILT_CMD, ip_sioctl_msfilter, NULL },
- /* 181 */ { SIOCSIPMSFILTER, sizeof (struct ip_msfilter), IPI_WR,
+ /* 181 */ { SIOCSIPMSFILTER, sizeof (struct ip_msfilter), 0,
MSFILT_CMD, ip_sioctl_msfilter, NULL },
/* 182 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
/* SIOCSENABLESDP is handled by SDP */
@@ -1355,12 +1281,12 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = {
int ip_ndx_ioctl_count = sizeof (ip_ndx_ioctl_table) / sizeof (ip_ioctl_cmd_t);
ip_ioctl_cmd_t ip_misc_ioctl_table[] = {
- { I_LINK, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL },
- { I_UNLINK, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL },
- { I_PLINK, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL },
- { I_PUNLINK, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL },
- { ND_GET, 0, IPI_PASS_DOWN, 0, NULL, NULL },
- { ND_SET, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL },
+ { I_LINK, 0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
+ { I_UNLINK, 0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
+ { I_PLINK, 0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
+ { I_PUNLINK, 0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
+ { ND_GET, 0, 0, 0, NULL, NULL },
+ { ND_SET, 0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
{ IP_IOCTL, 0, 0, 0, NULL, NULL },
{ SIOCGETVIFCNT, sizeof (struct sioc_vif_req), IPI_GET_CMD,
MISC_CMD, mrt_ioctl},
@@ -1384,12 +1310,14 @@ static nv_t ire_nv_arr[] = {
{ IRE_BROADCAST, "BROADCAST" },
{ IRE_LOCAL, "LOCAL" },
{ IRE_LOOPBACK, "LOOPBACK" },
- { IRE_CACHE, "CACHE" },
{ IRE_DEFAULT, "DEFAULT" },
{ IRE_PREFIX, "PREFIX" },
{ IRE_IF_NORESOLVER, "IF_NORESOL" },
{ IRE_IF_RESOLVER, "IF_RESOLV" },
+ { IRE_IF_CLONE, "IF_CLONE" },
{ IRE_HOST, "HOST" },
+ { IRE_MULTICAST, "MULTICAST" },
+ { IRE_NOROUTE, "NOROUTE" },
{ 0 }
};
@@ -1412,7 +1340,6 @@ struct module_info ip_mod_info = {
/*
* Entry points for IP as a device and as a module.
- * FIXME: down the road we might want a separate module and driver qinit.
* We have separate open functions for the /dev/ip and /dev/ip6 devices.
*/
static struct qinit iprinitv4 = {
@@ -1425,13 +1352,8 @@ struct qinit iprinitv6 = {
&ip_mod_info
};
-static struct qinit ipwinitv4 = {
- (pfi_t)ip_wput, (pfi_t)ip_wsrv, NULL, NULL, NULL,
- &ip_mod_info
-};
-
-struct qinit ipwinitv6 = {
- (pfi_t)ip_wput_v6, (pfi_t)ip_wsrv, NULL, NULL, NULL,
+static struct qinit ipwinit = {
+ (pfi_t)ip_wput_nondata, (pfi_t)ip_wsrv, NULL, NULL, NULL,
&ip_mod_info
};
@@ -1447,98 +1369,32 @@ static struct qinit iplwinit = {
/* For AF_INET aka /dev/ip */
struct streamtab ipinfov4 = {
- &iprinitv4, &ipwinitv4, &iplrinit, &iplwinit
+ &iprinitv4, &ipwinit, &iplrinit, &iplwinit
};
/* For AF_INET6 aka /dev/ip6 */
struct streamtab ipinfov6 = {
- &iprinitv6, &ipwinitv6, &iplrinit, &iplwinit
+ &iprinitv6, &ipwinit, &iplrinit, &iplwinit
};
#ifdef DEBUG
-static boolean_t skip_sctp_cksum = B_FALSE;
+boolean_t skip_sctp_cksum = B_FALSE;
#endif
/*
- * Prepend the zoneid using an ipsec_out_t for later use by functions like
- * ip_rput_v6(), ip_output(), etc. If the message
- * block already has a M_CTL at the front of it, then simply set the zoneid
- * appropriately.
- */
-mblk_t *
-ip_prepend_zoneid(mblk_t *mp, zoneid_t zoneid, ip_stack_t *ipst)
-{
- mblk_t *first_mp;
- ipsec_out_t *io;
-
- ASSERT(zoneid != ALL_ZONES);
- if (mp->b_datap->db_type == M_CTL) {
- io = (ipsec_out_t *)mp->b_rptr;
- ASSERT(io->ipsec_out_type == IPSEC_OUT);
- io->ipsec_out_zoneid = zoneid;
- return (mp);
- }
-
- first_mp = ipsec_alloc_ipsec_out(ipst->ips_netstack);
- if (first_mp == NULL)
- return (NULL);
- io = (ipsec_out_t *)first_mp->b_rptr;
- /* This is not a secure packet */
- io->ipsec_out_secure = B_FALSE;
- io->ipsec_out_zoneid = zoneid;
- first_mp->b_cont = mp;
- return (first_mp);
-}
-
-/*
- * Copy an M_CTL-tagged message, preserving reference counts appropriately.
+ * Generate an ICMP fragmentation needed message.
+ * When called from ip_output side a minimal ip_recv_attr_t needs to be
+ * constructed by the caller.
*/
-mblk_t *
-ip_copymsg(mblk_t *mp)
-{
- mblk_t *nmp;
- ipsec_info_t *in;
-
- if (mp->b_datap->db_type != M_CTL)
- return (copymsg(mp));
-
- in = (ipsec_info_t *)mp->b_rptr;
-
- /*
- * Note that M_CTL is also used for delivering ICMP error messages
- * upstream to transport layers.
- */
- if (in->ipsec_info_type != IPSEC_OUT &&
- in->ipsec_info_type != IPSEC_IN)
- return (copymsg(mp));
-
- nmp = copymsg(mp->b_cont);
-
- if (in->ipsec_info_type == IPSEC_OUT) {
- return (ipsec_out_tag(mp, nmp,
- ((ipsec_out_t *)in)->ipsec_out_ns));
- } else {
- return (ipsec_in_tag(mp, nmp,
- ((ipsec_in_t *)in)->ipsec_in_ns));
- }
-}
-
-/* Generate an ICMP fragmentation needed message. */
-static void
-icmp_frag_needed(queue_t *q, mblk_t *mp, int mtu, zoneid_t zoneid,
- ip_stack_t *ipst)
+void
+icmp_frag_needed(mblk_t *mp, int mtu, ip_recv_attr_t *ira)
{
icmph_t icmph;
- mblk_t *first_mp;
- boolean_t mctl_present;
+ ip_stack_t *ipst = ira->ira_ill->ill_ipst;
- EXTRACT_PKT_MP(mp, first_mp, mctl_present);
-
- if (!(mp = icmp_pkt_err_ok(mp, ipst))) {
- if (mctl_present)
- freeb(first_mp);
+ mp = icmp_pkt_err_ok(mp, ira);
+ if (mp == NULL)
return;
- }
bzero(&icmph, sizeof (icmph_t));
icmph.icmph_type = ICMP_DEST_UNREACHABLE;
@@ -1546,29 +1402,29 @@ icmp_frag_needed(queue_t *q, mblk_t *mp, int mtu, zoneid_t zoneid,
icmph.icmph_du_mtu = htons((uint16_t)mtu);
BUMP_MIB(&ipst->ips_icmp_mib, icmpOutFragNeeded);
BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDestUnreachs);
- icmp_pkt(q, first_mp, &icmph, sizeof (icmph_t), mctl_present, zoneid,
- ipst);
+
+ icmp_pkt(mp, &icmph, sizeof (icmph_t), ira);
}
/*
- * icmp_inbound deals with ICMP messages in the following ways.
+ * icmp_inbound_v4 deals with ICMP messages that are handled by IP.
+ * If the ICMP message is consumed by IP, i.e., it should not be delivered
+ * to any IPPROTO_ICMP raw sockets, then it returns NULL.
+ * Likewise, if the ICMP error is misformed (too short, etc), then it
+ * returns NULL. The caller uses this to determine whether or not to send
+ * to raw sockets.
*
+ * All error messages are passed to the matching transport stream.
+ *
+ * The following cases are handled by icmp_inbound:
* 1) It needs to send a reply back and possibly delivering it
* to the "interested" upper clients.
- * 2) It needs to send it to the upper clients only.
+ * 2) Return the mblk so that the caller can pass it to the RAW socket clients.
* 3) It needs to change some values in IP only.
- * 4) It needs to change some values in IP and upper layers e.g TCP.
- *
- * We need to accomodate icmp messages coming in clear until we get
- * everything secure from the wire. If icmp_accept_clear_messages
- * is zero we check with the global policy and act accordingly. If
- * it is non-zero, we accept the message without any checks. But
- * *this does not mean* that this will be delivered to the upper
- * clients. By accepting we might send replies back, change our MTU
- * value etc. but delivery to the ULP/clients depends on their policy
- * dispositions.
+ * 4) It needs to change some values in IP and upper layers e.g TCP
+ * by delivering an error to the upper layers.
*
- * We handle the above 4 cases in the context of IPsec in the
+ * We handle the above three cases in the context of IPsec in the
* following way :
*
* 1) Send the reply back in the same way as the request came in.
@@ -1610,13 +1466,13 @@ icmp_frag_needed(queue_t *q, mblk_t *mp, int mtu, zoneid_t zoneid,
* come to a stop. This is solved by making similar decisions
* at both levels. Currently, when we are unable to deliver
* to the Upper Layer (due to policy failures) while IP has
- * adjusted ire_max_frag, the next outbound datagram would
+ * adjusted dce_pmtu, the next outbound datagram would
* generate a local ICMP_FRAGMENTATION_NEEDED message - which
* will be with the right level of protection. Thus the right
* value will be communicated even if we are not able to
* communicate when we get from the wire initially. But this
* assumes there would be at least one outbound datagram after
- * IP has adjusted its ire_max_frag value. To make things
+ * IP has adjusted its dce_pmtu value. To make things
* simpler, we accept in clear after the validation of
* AH/ESP headers.
*
@@ -1627,105 +1483,54 @@ icmp_frag_needed(queue_t *q, mblk_t *mp, int mtu, zoneid_t zoneid,
* should be accepted in clear when the Upper layer expects secure.
* Thus the communication may get aborted by some bad ICMP
* packets.
- *
- * IPQoS Notes:
- * The only instance when a packet is sent for processing is when there
- * isn't an ICMP client and if we are interested in it.
- * If there is a client, IPPF processing will take place in the
- * ip_fanout_proto routine.
- *
- * Zones notes:
- * The packet is only processed in the context of the specified zone: typically
- * only this zone will reply to an echo request, and only interested clients in
- * this zone will receive a copy of the packet. This means that the caller must
- * call icmp_inbound() for each relevant zone.
*/
-static void
-icmp_inbound(queue_t *q, mblk_t *mp, boolean_t broadcast, ill_t *ill,
- int sum_valid, uint32_t sum, boolean_t mctl_present, boolean_t ip_policy,
- ill_t *recv_ill, zoneid_t zoneid)
+mblk_t *
+icmp_inbound_v4(mblk_t *mp, ip_recv_attr_t *ira)
{
- icmph_t *icmph;
- ipha_t *ipha;
- int iph_hdr_length;
- int hdr_length;
+ icmph_t *icmph;
+ ipha_t *ipha; /* Outer header */
+ int ip_hdr_length; /* Outer header length */
boolean_t interested;
+ ipif_t *ipif;
uint32_t ts;
- uchar_t *wptr;
- ipif_t *ipif;
- mblk_t *first_mp;
- ipsec_in_t *ii;
- timestruc_t now;
- uint32_t ill_index;
- ip_stack_t *ipst;
-
- ASSERT(ill != NULL);
- ipst = ill->ill_ipst;
-
- first_mp = mp;
- if (mctl_present) {
- mp = first_mp->b_cont;
- ASSERT(mp != NULL);
- }
+ uint32_t *tsp;
+ timestruc_t now;
+ ill_t *ill = ira->ira_ill;
+ ip_stack_t *ipst = ill->ill_ipst;
+ zoneid_t zoneid = ira->ira_zoneid;
+ int len_needed;
+ mblk_t *mp_ret = NULL;
ipha = (ipha_t *)mp->b_rptr;
- if (ipst->ips_icmp_accept_clear_messages == 0) {
- first_mp = ipsec_check_global_policy(first_mp, NULL,
- ipha, NULL, mctl_present, ipst->ips_netstack);
- if (first_mp == NULL)
- return;
- }
-
- /*
- * On a labeled system, we have to check whether the zone itself is
- * permitted to receive raw traffic.
- */
- if (is_system_labeled()) {
- if (zoneid == ALL_ZONES)
- zoneid = tsol_packet_to_zoneid(mp);
- if (!tsol_can_accept_raw(mp, B_FALSE)) {
- ip1dbg(("icmp_inbound: zone %d can't receive raw",
- zoneid));
- BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors);
- freemsg(first_mp);
- return;
- }
- }
-
- /*
- * We have accepted the ICMP message. It means that we will
- * respond to the packet if needed. It may not be delivered
- * to the upper client depending on the policy constraints
- * and the disposition in ipsec_inbound_accept_clear.
- */
-
- ASSERT(ill != NULL);
BUMP_MIB(&ipst->ips_icmp_mib, icmpInMsgs);
- iph_hdr_length = IPH_HDR_LENGTH(ipha);
- if ((mp->b_wptr - mp->b_rptr) < (iph_hdr_length + ICMPH_SIZE)) {
+
+ ip_hdr_length = ira->ira_ip_hdr_length;
+ if ((mp->b_wptr - mp->b_rptr) < (ip_hdr_length + ICMPH_SIZE)) {
+ if (ira->ira_pktlen < (ip_hdr_length + ICMPH_SIZE)) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
+ ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
+ freemsg(mp);
+ return (NULL);
+ }
/* Last chance to get real. */
- if (!pullupmsg(mp, iph_hdr_length + ICMPH_SIZE)) {
+ ipha = ip_pullup(mp, ip_hdr_length + ICMPH_SIZE, ira);
+ if (ipha == NULL) {
BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors);
- freemsg(first_mp);
- return;
+ freemsg(mp);
+ return (NULL);
}
- /* Refresh iph following the pullup. */
- ipha = (ipha_t *)mp->b_rptr;
- }
- /* ICMP header checksum, including checksum field, should be zero. */
- if (sum_valid ? (sum != 0 && sum != 0xFFFF) :
- IP_CSUM(mp, iph_hdr_length, 0)) {
- BUMP_MIB(&ipst->ips_icmp_mib, icmpInCksumErrs);
- freemsg(first_mp);
- return;
}
+
/* The IP header will always be a multiple of four bytes */
- icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
- ip2dbg(("icmp_inbound: type %d code %d\n", icmph->icmph_type,
+ icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
+ ip2dbg(("icmp_inbound_v4: type %d code %d\n", icmph->icmph_type,
icmph->icmph_code));
- wptr = (uchar_t *)icmph + ICMPH_SIZE;
- /* We will set "interested" to "true" if we want a copy */
+
+ /*
+ * We will set "interested" to "true" if we should pass a copy to
+ * the transport or if we handle the packet locally.
+ */
interested = B_FALSE;
switch (icmph->icmph_type) {
case ICMP_ECHO_REPLY:
@@ -1753,18 +1558,42 @@ icmp_inbound(queue_t *q, mblk_t *mp, boolean_t broadcast, ill_t *ill,
* (what isn't?). We aim to please, you pick it.
* Default is do it.
*/
- if (!broadcast && !CLASSD(ipha->ipha_dst)) {
- /* unicast: always respond */
- interested = B_TRUE;
- } else if (CLASSD(ipha->ipha_dst)) {
+ if (ira->ira_flags & IRAF_MULTICAST) {
/* multicast: respond based on tunable */
interested = ipst->ips_ip_g_resp_to_echo_mcast;
- } else if (broadcast) {
+ } else if (ira->ira_flags & IRAF_BROADCAST) {
/* broadcast: respond based on tunable */
interested = ipst->ips_ip_g_resp_to_echo_bcast;
+ } else {
+ /* unicast: always respond */
+ interested = B_TRUE;
}
BUMP_MIB(&ipst->ips_icmp_mib, icmpInEchos);
- break;
+ if (!interested) {
+ /* We never pass these to RAW sockets */
+ freemsg(mp);
+ return (NULL);
+ }
+
+ /* Check db_ref to make sure we can modify the packet. */
+ if (mp->b_datap->db_ref > 1) {
+ mblk_t *mp1;
+
+ mp1 = copymsg(mp);
+ freemsg(mp);
+ if (!mp1) {
+ BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
+ return (NULL);
+ }
+ mp = mp1;
+ ipha = (ipha_t *)mp->b_rptr;
+ icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
+ }
+ icmph->icmph_type = ICMP_ECHO_REPLY;
+ BUMP_MIB(&ipst->ips_icmp_mib, icmpOutEchoReps);
+ icmp_send_reply_v4(mp, ipha, icmph, ira);
+ return (NULL);
+
case ICMP_ROUTER_ADVERTISEMENT:
case ICMP_ROUTER_SOLICITATION:
break;
@@ -1778,28 +1607,63 @@ icmp_inbound(queue_t *q, mblk_t *mp, boolean_t broadcast, ill_t *ill,
break;
case ICMP_TIME_STAMP_REQUEST:
/* Response to Time Stamp Requests is local policy. */
- if (ipst->ips_ip_g_resp_to_timestamp &&
- /* So is whether to respond if it was an IP broadcast. */
- (!broadcast || ipst->ips_ip_g_resp_to_timestamp_bcast)) {
- int tstamp_len = 3 * sizeof (uint32_t);
-
- if (wptr + tstamp_len > mp->b_wptr) {
- if (!pullupmsg(mp, wptr + tstamp_len -
- mp->b_rptr)) {
- BUMP_MIB(ill->ill_ip_mib,
- ipIfStatsInDiscards);
- freemsg(first_mp);
- return;
- }
- /* Refresh ipha following the pullup. */
- ipha = (ipha_t *)mp->b_rptr;
- icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
- wptr = (uchar_t *)icmph + ICMPH_SIZE;
+ if (ipst->ips_ip_g_resp_to_timestamp) {
+ if (ira->ira_flags & IRAF_MULTIBROADCAST)
+ interested =
+ ipst->ips_ip_g_resp_to_timestamp_bcast;
+ else
+ interested = B_TRUE;
+ }
+ if (!interested) {
+ /* We never pass these to RAW sockets */
+ freemsg(mp);
+ return (NULL);
+ }
+
+ /* Make sure we have enough of the packet */
+ len_needed = ip_hdr_length + ICMPH_SIZE +
+ 3 * sizeof (uint32_t);
+
+ if (mp->b_wptr - mp->b_rptr < len_needed) {
+ ipha = ip_pullup(mp, len_needed, ira);
+ if (ipha == NULL) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards - ip_pullup",
+ mp, ill);
+ freemsg(mp);
+ return (NULL);
}
- interested = B_TRUE;
+ /* Refresh following the pullup. */
+ icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
}
BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimestamps);
- break;
+ /* Check db_ref to make sure we can modify the packet. */
+ if (mp->b_datap->db_ref > 1) {
+ mblk_t *mp1;
+
+ mp1 = copymsg(mp);
+ freemsg(mp);
+ if (!mp1) {
+ BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
+ return (NULL);
+ }
+ mp = mp1;
+ ipha = (ipha_t *)mp->b_rptr;
+ icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
+ }
+ icmph->icmph_type = ICMP_TIME_STAMP_REPLY;
+ tsp = (uint32_t *)&icmph[1];
+ tsp++; /* Skip past 'originate time' */
+ /* Compute # of milliseconds since midnight */
+ gethrestime(&now);
+ ts = (now.tv_sec % (24 * 60 * 60)) * 1000 +
+ now.tv_nsec / (NANOSEC / MILLISEC);
+ *tsp++ = htonl(ts); /* Lay in 'receive time' */
+ *tsp++ = htonl(ts); /* Lay in 'send time' */
+ BUMP_MIB(&ipst->ips_icmp_mib, icmpOutTimestampReps);
+ icmp_send_reply_v4(mp, ipha, icmph, ira);
+ return (NULL);
+
case ICMP_TIME_STAMP_REPLY:
BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimestampReps);
break;
@@ -1808,14 +1672,68 @@ icmp_inbound(queue_t *q, mblk_t *mp, boolean_t broadcast, ill_t *ill,
case ICMP_INFO_REPLY:
break;
case ICMP_ADDRESS_MASK_REQUEST:
- if ((ipst->ips_ip_respond_to_address_mask_broadcast ||
- !broadcast) &&
- /* TODO m_pullup of complete header? */
- (mp->b_datap->db_lim - wptr) >= IP_ADDR_LEN) {
+ if (ira->ira_flags & IRAF_MULTIBROADCAST) {
+ interested =
+ ipst->ips_ip_respond_to_address_mask_broadcast;
+ } else {
interested = B_TRUE;
}
+ if (!interested) {
+ /* We never pass these to RAW sockets */
+ freemsg(mp);
+ return (NULL);
+ }
+ len_needed = ip_hdr_length + ICMPH_SIZE + IP_ADDR_LEN;
+ if (mp->b_wptr - mp->b_rptr < len_needed) {
+ ipha = ip_pullup(mp, len_needed, ira);
+ if (ipha == NULL) {
+ BUMP_MIB(ill->ill_ip_mib,
+ ipIfStatsInTruncatedPkts);
+ ip_drop_input("ipIfStatsInTruncatedPkts", mp,
+ ill);
+ freemsg(mp);
+ return (NULL);
+ }
+ /* Refresh following the pullup. */
+ icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
+ }
BUMP_MIB(&ipst->ips_icmp_mib, icmpInAddrMasks);
- break;
+ /* Check db_ref to make sure we can modify the packet. */
+ if (mp->b_datap->db_ref > 1) {
+ mblk_t *mp1;
+
+ mp1 = copymsg(mp);
+ freemsg(mp);
+ if (!mp1) {
+ BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
+ return (NULL);
+ }
+ mp = mp1;
+ ipha = (ipha_t *)mp->b_rptr;
+ icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
+ }
+ /*
+ * Need the ipif with the mask be the same as the source
+ * address of the mask reply. For unicast we have a specific
+ * ipif. For multicast/broadcast we only handle onlink
+ * senders, and use the source address to pick an ipif.
+ */
+ ipif = ipif_lookup_addr(ipha->ipha_dst, ill, zoneid, ipst);
+ if (ipif == NULL) {
+ /* Broadcast or multicast */
+ ipif = ipif_lookup_remote(ill, ipha->ipha_src, zoneid);
+ if (ipif == NULL) {
+ freemsg(mp);
+ return (NULL);
+ }
+ }
+ icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY;
+ bcopy(&ipif->ipif_net_mask, &icmph[1], IP_ADDR_LEN);
+ ipif_refrele(ipif);
+ BUMP_MIB(&ipst->ips_icmp_mib, icmpOutAddrMaskReps);
+ icmp_send_reply_v4(mp, ipha, icmph, ira);
+ return (NULL);
+
case ICMP_ADDRESS_MASK_REPLY:
BUMP_MIB(&ipst->ips_icmp_mib, icmpInAddrMaskReps);
break;
@@ -1824,206 +1742,103 @@ icmp_inbound(queue_t *q, mblk_t *mp, boolean_t broadcast, ill_t *ill,
BUMP_MIB(&ipst->ips_icmp_mib, icmpInUnknowns);
break;
}
- /* See if there is an ICMP client. */
- if (ipst->ips_ipcl_proto_fanout[IPPROTO_ICMP].connf_head != NULL) {
+ /*
+ * See if there is an ICMP client to avoid an extra copymsg/freemsg
+ * if there isn't one.
+ */
+ if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_ICMP].connf_head != NULL) {
/* If there is an ICMP client and we want one too, copy it. */
- mblk_t *first_mp1;
if (!interested) {
- ip_fanout_proto(q, first_mp, ill, ipha, 0, mctl_present,
- ip_policy, recv_ill, zoneid);
- return;
+ /* Caller will deliver to RAW sockets */
+ return (mp);
}
- first_mp1 = ip_copymsg(first_mp);
- if (first_mp1 != NULL) {
- ip_fanout_proto(q, first_mp1, ill, ipha,
- 0, mctl_present, ip_policy, recv_ill, zoneid);
+ mp_ret = copymsg(mp);
+ if (mp_ret == NULL) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards - copymsg", mp, ill);
}
} else if (!interested) {
- freemsg(first_mp);
- return;
- } else {
- /*
- * Initiate policy processing for this packet if ip_policy
- * is true.
- */
- if (IPP_ENABLED(IPP_LOCAL_IN, ipst) && ip_policy) {
- ill_index = ill->ill_phyint->phyint_ifindex;
- ip_process(IPP_LOCAL_IN, &mp, ill_index);
- if (mp == NULL) {
- if (mctl_present) {
- freeb(first_mp);
- }
- BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors);
- return;
- }
+ /* Neither we nor raw sockets are interested. Drop packet now */
+ freemsg(mp);
+ return (NULL);
+ }
+
+ /*
+ * ICMP error or redirect packet. Make sure we have enough of
+ * the header and that db_ref == 1 since we might end up modifying
+ * the packet.
+ */
+ if (mp->b_cont != NULL) {
+ if (ip_pullup(mp, -1, ira) == NULL) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards - ip_pullup",
+ mp, ill);
+ freemsg(mp);
+ return (mp_ret);
}
}
- /* We want to do something with it. */
- /* Check db_ref to make sure we can modify the packet. */
+
if (mp->b_datap->db_ref > 1) {
- mblk_t *first_mp1;
+ mblk_t *mp1;
- first_mp1 = ip_copymsg(first_mp);
- freemsg(first_mp);
- if (!first_mp1) {
- BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
- return;
- }
- first_mp = first_mp1;
- if (mctl_present) {
- mp = first_mp->b_cont;
- ASSERT(mp != NULL);
- } else {
- mp = first_mp;
+ mp1 = copymsg(mp);
+ if (mp1 == NULL) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards - copymsg", mp, ill);
+ freemsg(mp);
+ return (mp_ret);
}
- ipha = (ipha_t *)mp->b_rptr;
- icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
- wptr = (uchar_t *)icmph + ICMPH_SIZE;
+ freemsg(mp);
+ mp = mp1;
}
- switch (icmph->icmph_type) {
- case ICMP_ADDRESS_MASK_REQUEST:
- ipif = ipif_lookup_remote(ill, ipha->ipha_src, zoneid);
- if (ipif == NULL) {
- freemsg(first_mp);
- return;
- }
- /*
- * outging interface must be IPv4
- */
- ASSERT(ipif != NULL && !ipif->ipif_isv6);
- icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY;
- bcopy(&ipif->ipif_net_mask, wptr, IP_ADDR_LEN);
- ipif_refrele(ipif);
- BUMP_MIB(&ipst->ips_icmp_mib, icmpOutAddrMaskReps);
- break;
- case ICMP_ECHO_REQUEST:
- icmph->icmph_type = ICMP_ECHO_REPLY;
- BUMP_MIB(&ipst->ips_icmp_mib, icmpOutEchoReps);
- break;
- case ICMP_TIME_STAMP_REQUEST: {
- uint32_t *tsp;
- icmph->icmph_type = ICMP_TIME_STAMP_REPLY;
- tsp = (uint32_t *)wptr;
- tsp++; /* Skip past 'originate time' */
- /* Compute # of milliseconds since midnight */
- gethrestime(&now);
- ts = (now.tv_sec % (24 * 60 * 60)) * 1000 +
- now.tv_nsec / (NANOSEC / MILLISEC);
- *tsp++ = htonl(ts); /* Lay in 'receive time' */
- *tsp++ = htonl(ts); /* Lay in 'send time' */
- BUMP_MIB(&ipst->ips_icmp_mib, icmpOutTimestampReps);
- break;
+ /*
+ * In case mp has changed, verify the message before any further
+ * processes.
+ */
+ ipha = (ipha_t *)mp->b_rptr;
+ icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
+ if (!icmp_inbound_verify_v4(mp, icmph, ira)) {
+ freemsg(mp);
+ return (mp_ret);
}
- default:
- ipha = (ipha_t *)&icmph[1];
- if ((uchar_t *)&ipha[1] > mp->b_wptr) {
- if (!pullupmsg(mp, (uchar_t *)&ipha[1] - mp->b_rptr)) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- freemsg(first_mp);
- return;
- }
- icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
- ipha = (ipha_t *)&icmph[1];
- }
- if ((IPH_HDR_VERSION(ipha) != IPV4_VERSION)) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- freemsg(first_mp);
- return;
- }
- hdr_length = IPH_HDR_LENGTH(ipha);
- if (hdr_length < sizeof (ipha_t)) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- freemsg(first_mp);
- return;
- }
- if ((uchar_t *)ipha + hdr_length > mp->b_wptr) {
- if (!pullupmsg(mp,
- (uchar_t *)ipha + hdr_length - mp->b_rptr)) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- freemsg(first_mp);
- return;
- }
- icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
- ipha = (ipha_t *)&icmph[1];
- }
- switch (icmph->icmph_type) {
- case ICMP_REDIRECT:
- /*
- * As there is no upper client to deliver, we don't
- * need the first_mp any more.
- */
- if (mctl_present) {
- freeb(first_mp);
- }
- icmp_redirect(ill, mp);
- return;
- case ICMP_DEST_UNREACHABLE:
- if (icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED) {
- if (!icmp_inbound_too_big(icmph, ipha, ill,
- zoneid, mp, iph_hdr_length, ipst)) {
- freemsg(first_mp);
- return;
- }
- /*
- * icmp_inbound_too_big() may alter mp.
- * Resynch ipha and icmph accordingly.
- */
- icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
- ipha = (ipha_t *)&icmph[1];
- }
- /* FALLTHRU */
- default :
- /*
- * IPQoS notes: Since we have already done IPQoS
- * processing we don't want to do it again in
- * the fanout routines called by
- * icmp_inbound_error_fanout, hence the last
- * argument, ip_policy, is B_FALSE.
- */
- icmp_inbound_error_fanout(q, ill, first_mp, icmph,
- ipha, iph_hdr_length, hdr_length, mctl_present,
- B_FALSE, recv_ill, zoneid);
+
+ switch (icmph->icmph_type) {
+ case ICMP_REDIRECT:
+ icmp_redirect_v4(mp, ipha, icmph, ira);
+ break;
+ case ICMP_DEST_UNREACHABLE:
+ if (icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED) {
+ /* Update DCE and adjust MTU is icmp header if needed */
+ icmp_inbound_too_big_v4(icmph, ira);
}
- return;
+ /* FALLTHRU */
+ default:
+ icmp_inbound_error_fanout_v4(mp, icmph, ira);
+ break;
}
+ return (mp_ret);
+}
+
+/*
+ * Send an ICMP echo, timestamp or address mask reply.
+ * The caller has already updated the payload part of the packet.
+ * We handle the ICMP checksum, IP source address selection and feed
+ * the packet into ip_output_simple.
+ */
+static void
+icmp_send_reply_v4(mblk_t *mp, ipha_t *ipha, icmph_t *icmph,
+ ip_recv_attr_t *ira)
+{
+ uint_t ip_hdr_length = ira->ira_ip_hdr_length;
+ ill_t *ill = ira->ira_ill;
+ ip_stack_t *ipst = ill->ill_ipst;
+ ip_xmit_attr_t ixas;
+
/* Send out an ICMP packet */
icmph->icmph_checksum = 0;
- icmph->icmph_checksum = IP_CSUM(mp, iph_hdr_length, 0);
- if (broadcast || CLASSD(ipha->ipha_dst)) {
- ipif_t *ipif_chosen;
- /*
- * Make it look like it was directed to us, so we don't look
- * like a fool with a broadcast or multicast source address.
- */
- ipif = ipif_lookup_remote(ill, ipha->ipha_src, zoneid);
- /*
- * Make sure that we haven't grabbed an interface that's DOWN.
- */
- if (ipif != NULL) {
- ipif_chosen = ipif_select_source(ipif->ipif_ill,
- ipha->ipha_src, zoneid);
- if (ipif_chosen != NULL) {
- ipif_refrele(ipif);
- ipif = ipif_chosen;
- }
- }
- if (ipif == NULL) {
- ip0dbg(("icmp_inbound: "
- "No source for broadcast/multicast:\n"
- "\tsrc 0x%x dst 0x%x ill %p "
- "ipif_lcl_addr 0x%x\n",
- ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
- (void *)ill,
- ill->ill_ipif->ipif_lcl_addr));
- freemsg(first_mp);
- return;
- }
- ASSERT(ipif != NULL && !ipif->ipif_isv6);
- ipha->ipha_dst = ipif->ipif_src_addr;
- ipif_refrele(ipif);
- }
+ icmph->icmph_checksum = IP_CSUM(mp, ip_hdr_length, 0);
/* Reset time to live. */
ipha->ipha_ttl = ipst->ips_ip_def_ttl;
{
@@ -2038,138 +1853,159 @@ icmp_inbound(queue_t *q, mblk_t *mp, boolean_t broadcast, ill_t *ill,
if (!IS_SIMPLE_IPH(ipha))
icmp_options_update(ipha);
- if (!mctl_present) {
+ bzero(&ixas, sizeof (ixas));
+ ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
+ ixas.ixa_zoneid = ira->ira_zoneid;
+ ixas.ixa_cred = kcred;
+ ixas.ixa_cpid = NOPID;
+ ixas.ixa_tsl = ira->ira_tsl; /* Behave as a multi-level responder */
+ ixas.ixa_ifindex = 0;
+ ixas.ixa_ipst = ipst;
+ ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
+
+ if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) {
/*
* This packet should go out the same way as it
- * came in i.e in clear. To make sure that global
- * policy will not be applied to this in ip_wput_ire,
- * we attach a IPSEC_IN mp and clear ipsec_in_secure.
+ * came in i.e in clear, independent of the IPsec policy
+ * for transmitting packets.
*/
- ASSERT(first_mp == mp);
- first_mp = ipsec_in_alloc(B_TRUE, ipst->ips_netstack);
- if (first_mp == NULL) {
+ ixas.ixa_flags |= IXAF_NO_IPSEC;
+ } else {
+ if (!ipsec_in_to_out(ira, &ixas, mp, ipha, NULL)) {
BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- freemsg(mp);
+ /* Note: mp already consumed and ip_drop_packet done */
return;
}
- ii = (ipsec_in_t *)first_mp->b_rptr;
-
- /* This is not a secure packet */
- ii->ipsec_in_secure = B_FALSE;
- first_mp->b_cont = mp;
- } else {
- ii = (ipsec_in_t *)first_mp->b_rptr;
- ii->ipsec_in_ns = ipst->ips_netstack; /* No netstack_hold */
}
- if (!ipsec_in_to_out(first_mp, ipha, NULL, zoneid)) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- return;
+ if (ira->ira_flags & IRAF_MULTIBROADCAST) {
+ /*
+ * Not one or our addresses (IRE_LOCALs), thus we let
+ * ip_output_simple pick the source.
+ */
+ ipha->ipha_src = INADDR_ANY;
+ ixas.ixa_flags |= IXAF_SET_SOURCE;
+ }
+ /* Should we send with DF and use dce_pmtu? */
+ if (ipst->ips_ipv4_icmp_return_pmtu) {
+ ixas.ixa_flags |= IXAF_PMTU_DISCOVERY;
+ ipha->ipha_fragment_offset_and_flags |= IPH_DF_HTONS;
}
+
BUMP_MIB(&ipst->ips_icmp_mib, icmpOutMsgs);
- put(WR(q), first_mp);
+
+ (void) ip_output_simple(mp, &ixas);
+ ixa_cleanup(&ixas);
}
-static ipaddr_t
-icmp_get_nexthop_addr(ipha_t *ipha, ill_t *ill, zoneid_t zoneid, mblk_t *mp)
+/*
+ * Verify the ICMP messages for either for ICMP error or redirect packet.
+ * The caller should have fully pulled up the message. If it's a redirect
+ * packet, only basic checks on IP header will be done; otherwise, verify
+ * the packet by looking at the included ULP header.
+ *
+ * Called before icmp_inbound_error_fanout_v4 is called.
+ */
+static boolean_t
+icmp_inbound_verify_v4(mblk_t *mp, icmph_t *icmph, ip_recv_attr_t *ira)
{
- conn_t *connp;
- connf_t *connfp;
- ipaddr_t nexthop_addr = INADDR_ANY;
- int hdr_length = IPH_HDR_LENGTH(ipha);
- uint16_t *up;
- uint32_t ports;
- ip_stack_t *ipst = ill->ill_ipst;
+ ill_t *ill = ira->ira_ill;
+ int hdr_length;
+ ip_stack_t *ipst = ira->ira_ill->ill_ipst;
+ conn_t *connp;
+ ipha_t *ipha; /* Inner IP header */
- up = (uint16_t *)((uchar_t *)ipha + hdr_length);
- switch (ipha->ipha_protocol) {
- case IPPROTO_TCP:
- {
- tcph_t *tcph;
-
- /* do a reverse lookup */
- tcph = (tcph_t *)((uchar_t *)ipha + hdr_length);
- connp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcph,
- TCPS_LISTEN, ipst);
- break;
- }
- case IPPROTO_UDP:
- {
- uint32_t dstport, srcport;
+ ipha = (ipha_t *)&icmph[1];
+ if ((uchar_t *)ipha + IP_SIMPLE_HDR_LENGTH > mp->b_wptr)
+ goto truncated;
- ((uint16_t *)&ports)[0] = up[1];
- ((uint16_t *)&ports)[1] = up[0];
+ hdr_length = IPH_HDR_LENGTH(ipha);
- /* Extract ports in net byte order */
- dstport = htons(ntohl(ports) & 0xFFFF);
- srcport = htons(ntohl(ports) >> 16);
+ if ((IPH_HDR_VERSION(ipha) != IPV4_VERSION))
+ goto discard_pkt;
- connfp = &ipst->ips_ipcl_udp_fanout[
- IPCL_UDP_HASH(dstport, ipst)];
- mutex_enter(&connfp->connf_lock);
- connp = connfp->connf_head;
+ if (hdr_length < sizeof (ipha_t))
+ goto truncated;
- /* do a reverse lookup */
- while ((connp != NULL) &&
- (!IPCL_UDP_MATCH(connp, dstport,
- ipha->ipha_src, srcport, ipha->ipha_dst) ||
- !IPCL_ZONE_MATCH(connp, zoneid))) {
- connp = connp->conn_next;
- }
- if (connp != NULL)
- CONN_INC_REF(connp);
- mutex_exit(&connfp->connf_lock);
- break;
- }
- case IPPROTO_SCTP:
- {
- in6_addr_t map_src, map_dst;
+ if ((uchar_t *)ipha + hdr_length > mp->b_wptr)
+ goto truncated;
- IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &map_src);
- IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &map_dst);
- ((uint16_t *)&ports)[0] = up[1];
- ((uint16_t *)&ports)[1] = up[0];
+ /*
+ * Stop here for ICMP_REDIRECT.
+ */
+ if (icmph->icmph_type == ICMP_REDIRECT)
+ return (B_TRUE);
- connp = sctp_find_conn(&map_src, &map_dst, ports,
- zoneid, ipst->ips_netstack->netstack_sctp);
- if (connp == NULL) {
- connp = ipcl_classify_raw(mp, IPPROTO_SCTP,
- zoneid, ports, ipha, ipst);
- } else {
- CONN_INC_REF(connp);
- SCTP_REFRELE(CONN2SCTP(connp));
- }
- break;
- }
- default:
- {
- ipha_t ripha;
+ /*
+ * ICMP errors only.
+ */
+ switch (ipha->ipha_protocol) {
+ case IPPROTO_UDP:
+ /*
+ * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
+ * transport header.
+ */
+ if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
+ mp->b_wptr)
+ goto truncated;
+ break;
+ case IPPROTO_TCP: {
+ tcpha_t *tcpha;
+
+ /*
+ * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
+ * transport header.
+ */
+ if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
+ mp->b_wptr)
+ goto truncated;
- ripha.ipha_src = ipha->ipha_dst;
- ripha.ipha_dst = ipha->ipha_src;
- ripha.ipha_protocol = ipha->ipha_protocol;
+ tcpha = (tcpha_t *)((uchar_t *)ipha + hdr_length);
+ connp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcpha, TCPS_LISTEN,
+ ipst);
+ if (connp == NULL)
+ goto discard_pkt;
- connfp = &ipst->ips_ipcl_proto_fanout[
- ipha->ipha_protocol];
- mutex_enter(&connfp->connf_lock);
- connp = connfp->connf_head;
- for (connp = connfp->connf_head; connp != NULL;
- connp = connp->conn_next) {
- if (IPCL_PROTO_MATCH(connp,
- ipha->ipha_protocol, &ripha, ill,
- 0, zoneid)) {
- CONN_INC_REF(connp);
- break;
- }
- }
- mutex_exit(&connfp->connf_lock);
+ if ((connp->conn_verifyicmp != NULL) &&
+ !connp->conn_verifyicmp(connp, tcpha, icmph, NULL, ira)) {
+ CONN_DEC_REF(connp);
+ goto discard_pkt;
}
- }
- if (connp != NULL) {
- if (connp->conn_nexthop_set)
- nexthop_addr = connp->conn_nexthop_v4;
CONN_DEC_REF(connp);
+ break;
+ }
+ case IPPROTO_SCTP:
+ /*
+ * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
+ * transport header.
+ */
+ if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
+ mp->b_wptr)
+ goto truncated;
+ break;
+ case IPPROTO_ESP:
+ case IPPROTO_AH:
+ break;
+ case IPPROTO_ENCAP:
+ if ((uchar_t *)ipha + hdr_length + sizeof (ipha_t) >
+ mp->b_wptr)
+ goto truncated;
+ break;
+ default:
+ break;
}
- return (nexthop_addr);
+
+ return (B_TRUE);
+
+discard_pkt:
+ /* Bogus ICMP error. */
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ return (B_FALSE);
+
+truncated:
+ /* We pulled up everthing already. Must be truncated */
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
+ ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
+ return (B_FALSE);
}
/* Table from RFC 1191 */
@@ -2178,64 +2014,52 @@ static int icmp_frag_size_table[] =
/*
* Process received ICMP Packet too big.
- * After updating any IRE it does the fanout to any matching transport streams.
- * Assumes the message has been pulled up till the IP header that caused
- * the error.
+ * Just handles the DCE create/update, including using the above table of
+ * PMTU guesses. The caller is responsible for validating the packet before
+ * passing it in and also to fanout the ICMP error to any matching transport
+ * conns. Assumes the message has been fully pulled up and verified.
+ *
+ * Before getting here, the caller has called icmp_inbound_verify_v4()
+ * that should have verified with ULP to prevent undoing the changes we're
+ * going to make to DCE. For example, TCP might have verified that the packet
+ * which generated error is in the send window.
*
- * Returns B_FALSE on failure and B_TRUE on success.
+ * In some cases modified this MTU in the ICMP header packet; the caller
+ * should pass to the matching ULP after this returns.
*/
-static boolean_t
-icmp_inbound_too_big(icmph_t *icmph, ipha_t *ipha, ill_t *ill,
- zoneid_t zoneid, mblk_t *mp, int iph_hdr_length,
- ip_stack_t *ipst)
+static void
+icmp_inbound_too_big_v4(icmph_t *icmph, ip_recv_attr_t *ira)
{
- ire_t *ire, *first_ire;
- int mtu, orig_mtu;
- int hdr_length;
- ipaddr_t nexthop_addr;
- boolean_t disable_pmtud;
+ dce_t *dce;
+ int old_mtu;
+ int mtu, orig_mtu;
+ ipaddr_t dst;
+ boolean_t disable_pmtud;
+ ill_t *ill = ira->ira_ill;
+ ip_stack_t *ipst = ill->ill_ipst;
+ uint_t hdr_length;
+ ipha_t *ipha;
+ /* Caller already pulled up everything. */
+ ipha = (ipha_t *)&icmph[1];
ASSERT(icmph->icmph_type == ICMP_DEST_UNREACHABLE &&
icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED);
ASSERT(ill != NULL);
hdr_length = IPH_HDR_LENGTH(ipha);
- /* Drop if the original packet contained a source route */
- if (ip_source_route_included(ipha)) {
- return (B_FALSE);
- }
/*
- * Verify we have at least ICMP_MIN_TP_HDR_LENGTH bytes of transport
- * header.
+ * We handle path MTU for source routed packets since the DCE
+ * is looked up using the final destination.
*/
- if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
- mp->b_wptr) {
- if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length +
- ICMP_MIN_TP_HDR_LEN - mp->b_rptr)) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- ip1dbg(("icmp_inbound_too_big: insufficient hdr\n"));
- return (B_FALSE);
- }
- icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
- ipha = (ipha_t *)&icmph[1];
- }
- nexthop_addr = icmp_get_nexthop_addr(ipha, ill, zoneid, mp);
- if (nexthop_addr != INADDR_ANY) {
- /* nexthop set */
- first_ire = ire_ctable_lookup(ipha->ipha_dst,
- nexthop_addr, 0, NULL, ALL_ZONES, msg_getlabel(mp),
- MATCH_IRE_MARK_PRIVATE_ADDR | MATCH_IRE_GW, ipst);
- } else {
- /* nexthop not set */
- first_ire = ire_ctable_lookup(ipha->ipha_dst, 0, IRE_CACHE,
- NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
- }
+ dst = ip_get_dst(ipha);
- if (!first_ire) {
- ip1dbg(("icmp_inbound_too_big: no route for 0x%x\n",
- ntohl(ipha->ipha_dst)));
- return (B_FALSE);
+ dce = dce_lookup_and_add_v4(dst, ipst);
+ if (dce == NULL) {
+ /* Couldn't add a unique one - ENOMEM */
+ ip1dbg(("icmp_inbound_too_big_v4: no dce for 0x%x\n",
+ ntohl(dst)));
+ return;
}
/* Check for MTU discovery advice as described in RFC 1191 */
@@ -2243,149 +2067,112 @@ icmp_inbound_too_big(icmph_t *icmph, ipha_t *ipha, ill_t *ill,
orig_mtu = mtu;
disable_pmtud = B_FALSE;
- rw_enter(&first_ire->ire_bucket->irb_lock, RW_READER);
- for (ire = first_ire; ire != NULL && ire->ire_addr == ipha->ipha_dst;
- ire = ire->ire_next) {
- /*
- * Look for the connection to which this ICMP message is
- * directed. If it has the IP_NEXTHOP option set, then the
- * search is limited to IREs with the MATCH_IRE_PRIVATE
- * option. Else the search is limited to regular IREs.
- */
- if (((ire->ire_marks & IRE_MARK_PRIVATE_ADDR) &&
- (nexthop_addr != ire->ire_gateway_addr)) ||
- (!(ire->ire_marks & IRE_MARK_PRIVATE_ADDR) &&
- (nexthop_addr != INADDR_ANY)))
- continue;
+ mutex_enter(&dce->dce_lock);
+ if (dce->dce_flags & DCEF_PMTU)
+ old_mtu = dce->dce_pmtu;
+ else
+ old_mtu = ill->ill_mtu;
- mutex_enter(&ire->ire_lock);
- if (icmph->icmph_du_zero != 0 || mtu < ipst->ips_ip_pmtu_min) {
- uint32_t length;
- int i;
+ if (icmph->icmph_du_zero != 0 || mtu < ipst->ips_ip_pmtu_min) {
+ uint32_t length;
+ int i;
+ /*
+ * Use the table from RFC 1191 to figure out
+ * the next "plateau" based on the length in
+ * the original IP packet.
+ */
+ length = ntohs(ipha->ipha_length);
+ DTRACE_PROBE2(ip4__pmtu__guess, dce_t *, dce,
+ uint32_t, length);
+ if (old_mtu <= length &&
+ old_mtu >= length - hdr_length) {
/*
- * Use the table from RFC 1191 to figure out
- * the next "plateau" based on the length in
- * the original IP packet.
+ * Handle broken BSD 4.2 systems that
+ * return the wrong ipha_length in ICMP
+ * errors.
*/
- length = ntohs(ipha->ipha_length);
- DTRACE_PROBE2(ip4__pmtu__guess, ire_t *, ire,
- uint32_t, length);
- if (ire->ire_max_frag <= length &&
- ire->ire_max_frag >= length - hdr_length) {
- /*
- * Handle broken BSD 4.2 systems that
- * return the wrong iph_length in ICMP
- * errors.
- */
- length -= hdr_length;
- }
- for (i = 0; i < A_CNT(icmp_frag_size_table); i++) {
- if (length > icmp_frag_size_table[i])
- break;
- }
- if (i == A_CNT(icmp_frag_size_table)) {
- /* Smaller than 68! */
- disable_pmtud = B_TRUE;
+ ip1dbg(("Wrong mtu: sent %d, dce %d\n",
+ length, old_mtu));
+ length -= hdr_length;
+ }
+ for (i = 0; i < A_CNT(icmp_frag_size_table); i++) {
+ if (length > icmp_frag_size_table[i])
+ break;
+ }
+ if (i == A_CNT(icmp_frag_size_table)) {
+ /* Smaller than IP_MIN_MTU! */
+ ip1dbg(("Too big for packet size %d\n",
+ length));
+ disable_pmtud = B_TRUE;
+ mtu = ipst->ips_ip_pmtu_min;
+ } else {
+ mtu = icmp_frag_size_table[i];
+ ip1dbg(("Calculated mtu %d, packet size %d, "
+ "before %d\n", mtu, length, old_mtu));
+ if (mtu < ipst->ips_ip_pmtu_min) {
mtu = ipst->ips_ip_pmtu_min;
- } else {
- mtu = icmp_frag_size_table[i];
- if (mtu < ipst->ips_ip_pmtu_min) {
- mtu = ipst->ips_ip_pmtu_min;
- disable_pmtud = B_TRUE;
- }
+ disable_pmtud = B_TRUE;
}
- /* Fool the ULP into believing our guessed PMTU. */
- icmph->icmph_du_zero = 0;
- icmph->icmph_du_mtu = htons(mtu);
- }
- if (disable_pmtud)
- ire->ire_frag_flag = 0;
- /* Reduce the IRE max frag value as advised. */
- ire->ire_max_frag = MIN(ire->ire_max_frag, mtu);
- if (ire->ire_max_frag == mtu) {
- /* Decreased it */
- ire->ire_marks |= IRE_MARK_PMTU;
}
- mutex_exit(&ire->ire_lock);
- DTRACE_PROBE4(ip4__pmtu__change, icmph_t *, icmph, ire_t *,
- ire, int, orig_mtu, int, mtu);
}
- rw_exit(&first_ire->ire_bucket->irb_lock);
- ire_refrele(first_ire);
- return (B_TRUE);
+ if (disable_pmtud)
+ dce->dce_flags |= DCEF_TOO_SMALL_PMTU;
+ else
+ dce->dce_flags &= ~DCEF_TOO_SMALL_PMTU;
+
+ dce->dce_pmtu = MIN(old_mtu, mtu);
+ /* Prepare to send the new max frag size for the ULP. */
+ icmph->icmph_du_zero = 0;
+ icmph->icmph_du_mtu = htons((uint16_t)dce->dce_pmtu);
+ DTRACE_PROBE4(ip4__pmtu__change, icmph_t *, icmph, dce_t *,
+ dce, int, orig_mtu, int, mtu);
+
+ /* We now have a PMTU for sure */
+ dce->dce_flags |= DCEF_PMTU;
+ dce->dce_last_change_time = TICK_TO_SEC(lbolt64);
+ mutex_exit(&dce->dce_lock);
+ /*
+ * After dropping the lock the new value is visible to everyone.
+ * Then we bump the generation number so any cached values reinspect
+ * the dce_t.
+ */
+ dce_increment_generation(dce);
+ dce_refrele(dce);
}
/*
- * If the packet in error is Self-Encapsulated, icmp_inbound_error_fanout
+ * If the packet in error is Self-Encapsulated, icmp_inbound_error_fanout_v4
* calls this function.
*/
static mblk_t *
-icmp_inbound_self_encap_error(mblk_t *mp, int iph_hdr_length, int hdr_length)
+icmp_inbound_self_encap_error_v4(mblk_t *mp, ipha_t *ipha, ipha_t *in_ipha)
{
- ipha_t *ipha;
- icmph_t *icmph;
- ipha_t *in_ipha;
int length;
ASSERT(mp->b_datap->db_type == M_DATA);
- /*
- * For Self-encapsulated packets, we added an extra IP header
- * without the options. Inner IP header is the one from which
- * the outer IP header was formed. Thus, we need to remove the
- * outer IP header. To do this, we pullup the whole message
- * and overlay whatever follows the outer IP header over the
- * outer IP header.
- */
-
- if (!pullupmsg(mp, -1))
- return (NULL);
-
- icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
- ipha = (ipha_t *)&icmph[1];
- in_ipha = (ipha_t *)((uchar_t *)ipha + hdr_length);
+ /* icmp_inbound_v4 has already pulled up the whole error packet */
+ ASSERT(mp->b_cont == NULL);
/*
- * The length that we want to overlay is following the inner
- * IP header. Subtracting the IP header + icmp header + outer
- * IP header's length should give us the length that we want to
- * overlay.
+ * The length that we want to overlay is the inner header
+ * and what follows it.
*/
- length = msgdsize(mp) - iph_hdr_length - sizeof (icmph_t) -
- hdr_length;
+ length = msgdsize(mp) - ((uchar_t *)in_ipha - mp->b_rptr);
+
/*
- * Overlay whatever follows the inner header over the
+ * Overlay the inner header and whatever follows it over the
* outer header.
*/
bcopy((uchar_t *)in_ipha, (uchar_t *)ipha, length);
- /* Set the wptr to account for the outer header */
- mp->b_wptr -= hdr_length;
+ /* Adjust for what we removed */
+ mp->b_wptr -= (uchar_t *)in_ipha - (uchar_t *)ipha;
return (mp);
}
/*
- * Fanout for ICMP errors containing IP-in-IPv4 packets. Returns B_TRUE if a
- * tunnel consumed the message, and B_FALSE otherwise.
- */
-static boolean_t
-icmp_inbound_iptun_fanout(mblk_t *first_mp, ipha_t *ripha, ill_t *ill,
- ip_stack_t *ipst)
-{
- conn_t *connp;
-
- if ((connp = ipcl_iptun_classify_v4(&ripha->ipha_src, &ripha->ipha_dst,
- ipst)) == NULL)
- return (B_FALSE);
-
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
- connp->conn_recv(connp, first_mp, NULL);
- CONN_DEC_REF(connp);
- return (B_TRUE);
-}
-
-/*
* Try to pass the ICMP message upstream in case the ULP cares.
*
* If the packet that caused the ICMP error is secure, we send
@@ -2400,25 +2187,22 @@ icmp_inbound_iptun_fanout(mblk_t *first_mp, ipha_t *ripha, ill_t *ill,
*
* IFN could have been generated locally or by some router.
*
- * LOCAL : *ip_wput_ire -> icmp_frag_needed could have generated this.
+ * LOCAL : ire_send_wire (before calling ipsec_out_process) can call
+ * icmp_frag_needed/icmp_pkt2big_v6 to generated a local IFN.
* This happens because IP adjusted its value of MTU on an
* earlier IFN message and could not tell the upper layer,
* the new adjusted value of MTU e.g. Packet was encrypted
* or there was not enough information to fanout to upper
- * layers. Thus on the next outbound datagram, ip_wput_ire
+ * layers. Thus on the next outbound datagram, ire_send_wire
* generates the IFN, where IPsec processing has *not* been
* done.
*
- * *ip_wput_ire_fragmentit -> ip_wput_frag -> icmp_frag_needed
- * could have generated this. This happens because ire_max_frag
- * value in IP was set to a new value, while the IPsec processing
- * was being done and after we made the fragmentation check in
- * ip_wput_ire. Thus on return from IPsec processing,
- * ip_wput_ipsec_out finds that the new length is > ire_max_frag
- * and generates the IFN. As IPsec processing is over, we fanout
- * to AH/ESP to remove the header.
+ * Note that we retain ixa_fragsize across IPsec thus once
+ * we have picking ixa_fragsize and entered ipsec_out_process we do
+ * no change the fragsize even if the path MTU changes before
+ * we reach ip_output_post_ipsec.
*
- * In both these cases, ipsec_in_loopback will be set indicating
+ * In the local case, IRAF_LOOPBACK will be set indicating
* that IFN was generated locally.
*
* ROUTER : IFN could be secure or non-secure.
@@ -2432,45 +2216,38 @@ icmp_inbound_iptun_fanout(mblk_t *first_mp, ipha_t *ripha, ill_t *ill,
* If the packet in error does not have AH/ESP, we handle it
* like any other case.
*
- * * NON_SECURE : If the packet in error has AH/ESP headers,
- * we attach a dummy ipsec_in and send it up to AH/ESP
- * for validation. AH/ESP will verify whether there is a
+ * * NON_SECURE : If the packet in error has AH/ESP headers, we send it
+ * up to AH/ESP for validation. AH/ESP will verify whether there is a
* valid SA or not and send it back. We will fanout again if
* we have more data in the packet.
*
* If the packet in error does not have AH/ESP, we handle it
* like any other case.
+ *
+ * The caller must have called icmp_inbound_verify_v4.
*/
static void
-icmp_inbound_error_fanout(queue_t *q, ill_t *ill, mblk_t *mp,
- icmph_t *icmph, ipha_t *ipha, int iph_hdr_length, int hdr_length,
- boolean_t mctl_present, boolean_t ip_policy, ill_t *recv_ill,
- zoneid_t zoneid)
-{
- uint16_t *up; /* Pointer to ports in ULP header */
- uint32_t ports; /* reversed ports for fanout */
- ipha_t ripha; /* With reversed addresses */
- mblk_t *first_mp;
- ipsec_in_t *ii;
- tcph_t *tcph;
- conn_t *connp;
- ip_stack_t *ipst;
-
- ASSERT(ill != NULL);
-
- ASSERT(recv_ill != NULL);
- ipst = recv_ill->ill_ipst;
+icmp_inbound_error_fanout_v4(mblk_t *mp, icmph_t *icmph, ip_recv_attr_t *ira)
+{
+ uint16_t *up; /* Pointer to ports in ULP header */
+ uint32_t ports; /* reversed ports for fanout */
+ ipha_t ripha; /* With reversed addresses */
+ ipha_t *ipha; /* Inner IP header */
+ uint_t hdr_length; /* Inner IP header length */
+ tcpha_t *tcpha;
+ conn_t *connp;
+ ill_t *ill = ira->ira_ill;
+ ip_stack_t *ipst = ill->ill_ipst;
+ ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec;
+ ill_t *rill = ira->ira_rill;
- first_mp = mp;
- if (mctl_present) {
- mp = first_mp->b_cont;
- ASSERT(mp != NULL);
+ /* Caller already pulled up everything. */
+ ipha = (ipha_t *)&icmph[1];
+ ASSERT((uchar_t *)&ipha[1] <= mp->b_wptr);
+ ASSERT(mp->b_cont == NULL);
- ii = (ipsec_in_t *)first_mp->b_rptr;
- ASSERT(ii->ipsec_in_type == IPSEC_IN);
- } else {
- ii = NULL;
- }
+ hdr_length = IPH_HDR_LENGTH(ipha);
+ ira->ira_protocol = ipha->ipha_protocol;
/*
* We need a separate IP header with the source and destination
@@ -2482,249 +2259,223 @@ icmp_inbound_error_fanout(queue_t *q, ill_t *ill, mblk_t *mp,
ripha.ipha_protocol = ipha->ipha_protocol;
ripha.ipha_version_and_hdr_length = ipha->ipha_version_and_hdr_length;
- ip2dbg(("icmp_inbound_error: proto %d %x to %x: %d/%d\n",
+ ip2dbg(("icmp_inbound_error_v4: proto %d %x to %x: %d/%d\n",
ripha.ipha_protocol, ntohl(ipha->ipha_src),
ntohl(ipha->ipha_dst),
icmph->icmph_type, icmph->icmph_code));
switch (ipha->ipha_protocol) {
case IPPROTO_UDP:
- /*
- * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
- * transport header.
- */
- if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
- mp->b_wptr) {
- if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length +
- ICMP_MIN_TP_HDR_LEN - mp->b_rptr)) {
- goto discard_pkt;
- }
- icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
- ipha = (ipha_t *)&icmph[1];
- }
up = (uint16_t *)((uchar_t *)ipha + hdr_length);
/* Attempt to find a client stream based on port. */
- ((uint16_t *)&ports)[0] = up[1];
- ((uint16_t *)&ports)[1] = up[0];
- ip2dbg(("icmp_inbound_error: UDP ports %d to %d\n",
+ ip2dbg(("icmp_inbound_error_v4: UDP ports %d to %d\n",
ntohs(up[0]), ntohs(up[1])));
- /* Have to change db_type after any pullupmsg */
- DB_TYPE(mp) = M_CTL;
-
- ip_fanout_udp(q, first_mp, ill, &ripha, ports, B_FALSE, 0,
- mctl_present, ip_policy, recv_ill, zoneid);
+ /* Note that we send error to all matches. */
+ ira->ira_flags |= IRAF_ICMP_ERROR;
+ ip_fanout_udp_multi_v4(mp, &ripha, up[0], up[1], ira);
+ ira->ira_flags &= ~IRAF_ICMP_ERROR;
return;
case IPPROTO_TCP:
/*
- * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
- * transport header.
- */
- if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
- mp->b_wptr) {
- if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length +
- ICMP_MIN_TP_HDR_LEN - mp->b_rptr)) {
- goto discard_pkt;
- }
- icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
- ipha = (ipha_t *)&icmph[1];
- }
- /*
* Find a TCP client stream for this packet.
* Note that we do a reverse lookup since the header is
* in the form we sent it out.
*/
- tcph = (tcph_t *)((uchar_t *)ipha + hdr_length);
- connp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcph, TCPS_LISTEN,
+ tcpha = (tcpha_t *)((uchar_t *)ipha + hdr_length);
+ connp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcpha, TCPS_LISTEN,
ipst);
if (connp == NULL)
goto discard_pkt;
- /* Have to change db_type after any pullupmsg */
- DB_TYPE(mp) = M_CTL;
- SQUEUE_ENTER_ONE(connp->conn_sqp, first_mp, tcp_input, connp,
- SQ_FILL, SQTAG_TCP_INPUT_ICMP_ERR);
+ if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) ||
+ (ira->ira_flags & IRAF_IPSEC_SECURE)) {
+ mp = ipsec_check_inbound_policy(mp, connp,
+ ipha, NULL, ira);
+ if (mp == NULL) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ /* Note that mp is NULL */
+ ip_drop_input("ipIfStatsInDiscards", mp, ill);
+ CONN_DEC_REF(connp);
+ return;
+ }
+ }
+
+ ira->ira_flags |= IRAF_ICMP_ERROR;
+ ira->ira_ill = ira->ira_rill = NULL;
+ if (IPCL_IS_TCP(connp)) {
+ SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
+ connp->conn_recvicmp, connp, ira, SQ_FILL,
+ SQTAG_TCP_INPUT_ICMP_ERR);
+ } else {
+ /* Not TCP; must be SOCK_RAW, IPPROTO_TCP */
+ (connp->conn_recv)(connp, mp, NULL, ira);
+ CONN_DEC_REF(connp);
+ }
+ ira->ira_ill = ill;
+ ira->ira_rill = rill;
+ ira->ira_flags &= ~IRAF_ICMP_ERROR;
return;
case IPPROTO_SCTP:
- /*
- * Verify we have at least ICMP_MIN_SCTP_HDR_LEN bytes of
- * transport header, in the first mp.
- */
- if ((uchar_t *)ipha + hdr_length + ICMP_MIN_SCTP_HDR_LEN >
- mp->b_wptr) {
- if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length +
- ICMP_MIN_SCTP_HDR_LEN - mp->b_rptr)) {
- goto discard_pkt;
- }
- icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
- ipha = (ipha_t *)&icmph[1];
- }
up = (uint16_t *)((uchar_t *)ipha + hdr_length);
/* Find a SCTP client stream for this packet. */
((uint16_t *)&ports)[0] = up[1];
((uint16_t *)&ports)[1] = up[0];
- /* Have to change db_type after any pullupmsg */
- DB_TYPE(mp) = M_CTL;
- ip_fanout_sctp(first_mp, recv_ill, &ripha, ports, 0,
- mctl_present, ip_policy, zoneid);
+ ira->ira_flags |= IRAF_ICMP_ERROR;
+ ip_fanout_sctp(mp, &ripha, NULL, ports, ira);
+ ira->ira_flags &= ~IRAF_ICMP_ERROR;
return;
case IPPROTO_ESP:
- case IPPROTO_AH: {
- int ipsec_rc;
- ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec;
-
- /*
- * We need a IPSEC_IN in the front to fanout to AH/ESP.
- * We will re-use the IPSEC_IN if it is already present as
- * AH/ESP will not affect any fields in the IPSEC_IN for
- * ICMP errors. If there is no IPSEC_IN, allocate a new
- * one and attach it in the front.
- */
- if (ii != NULL) {
- /*
- * ip_fanout_proto_again converts the ICMP errors
- * that come back from AH/ESP to M_DATA so that
- * if it is non-AH/ESP and we do a pullupmsg in
- * this function, it would work. Convert it back
- * to M_CTL before we send up as this is a ICMP
- * error. This could have been generated locally or
- * by some router. Validate the inner IPsec
- * headers.
- *
- * NOTE : ill_index is used by ip_fanout_proto_again
- * to locate the ill.
- */
- ASSERT(ill != NULL);
- ii->ipsec_in_ill_index =
- ill->ill_phyint->phyint_ifindex;
- ii->ipsec_in_rill_index =
- recv_ill->ill_phyint->phyint_ifindex;
- DB_TYPE(first_mp->b_cont) = M_CTL;
- } else {
- /*
- * IPSEC_IN is not present. We attach a ipsec_in
- * message and send up to IPsec for validating
- * and removing the IPsec headers. Clear
- * ipsec_in_secure so that when we return
- * from IPsec, we don't mistakenly think that this
- * is a secure packet came from the network.
- *
- * NOTE : ill_index is used by ip_fanout_proto_again
- * to locate the ill.
- */
- ASSERT(first_mp == mp);
- first_mp = ipsec_in_alloc(B_TRUE, ipst->ips_netstack);
- if (first_mp == NULL) {
- freemsg(mp);
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- return;
- }
- ii = (ipsec_in_t *)first_mp->b_rptr;
-
- /* This is not a secure packet */
- ii->ipsec_in_secure = B_FALSE;
- first_mp->b_cont = mp;
- DB_TYPE(mp) = M_CTL;
- ASSERT(ill != NULL);
- ii->ipsec_in_ill_index =
- ill->ill_phyint->phyint_ifindex;
- ii->ipsec_in_rill_index =
- recv_ill->ill_phyint->phyint_ifindex;
- }
-
+ case IPPROTO_AH:
if (!ipsec_loaded(ipss)) {
- ip_proto_not_sup(q, first_mp, 0, zoneid, ipst);
+ ip_proto_not_sup(mp, ira);
return;
}
if (ipha->ipha_protocol == IPPROTO_ESP)
- ipsec_rc = ipsecesp_icmp_error(first_mp);
+ mp = ipsecesp_icmp_error(mp, ira);
else
- ipsec_rc = ipsecah_icmp_error(first_mp);
- if (ipsec_rc == IPSEC_STATUS_FAILED)
+ mp = ipsecah_icmp_error(mp, ira);
+ if (mp == NULL)
+ return;
+
+ /* Just in case ipsec didn't preserve the NULL b_cont */
+ if (mp->b_cont != NULL) {
+ if (!pullupmsg(mp, -1))
+ goto discard_pkt;
+ }
+
+ /*
+ * Note that ira_pktlen and ira_ip_hdr_length are no longer
+ * correct, but we don't use them any more here.
+ *
+ * If successful, the mp has been modified to not include
+ * the ESP/AH header so we can fanout to the ULP's icmp
+ * error handler.
+ */
+ if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH)
+ goto truncated;
+
+ /* Verify the modified message before any further processing. */
+ ipha = (ipha_t *)mp->b_rptr;
+ hdr_length = IPH_HDR_LENGTH(ipha);
+ icmph = (icmph_t *)&mp->b_rptr[hdr_length];
+ if (!icmp_inbound_verify_v4(mp, icmph, ira)) {
+ freemsg(mp);
return;
+ }
- ip_fanout_proto_again(first_mp, ill, recv_ill, NULL);
+ icmp_inbound_error_fanout_v4(mp, icmph, ira);
return;
- }
- case IPPROTO_ENCAP:
- case IPPROTO_IPV6:
- if (ipha->ipha_protocol == IPPROTO_ENCAP) {
- ipha_t *in_ipha;
- if ((uchar_t *)ipha + hdr_length + sizeof (ipha_t) >
- mp->b_wptr) {
- if (!pullupmsg(mp, (uchar_t *)ipha +
- hdr_length + sizeof (ipha_t) -
- mp->b_rptr)) {
+ case IPPROTO_ENCAP: {
+ /* Look for self-encapsulated packets that caused an error */
+ ipha_t *in_ipha;
+
+ /*
+ * Caller has verified that length has to be
+ * at least the size of IP header.
+ */
+ ASSERT(hdr_length >= sizeof (ipha_t));
+ /*
+ * Check the sanity of the inner IP header like
+ * we did for the outer header.
+ */
+ in_ipha = (ipha_t *)((uchar_t *)ipha + hdr_length);
+ if ((IPH_HDR_VERSION(in_ipha) != IPV4_VERSION)) {
+ goto discard_pkt;
+ }
+ if (IPH_HDR_LENGTH(in_ipha) < sizeof (ipha_t)) {
+ goto discard_pkt;
+ }
+ /* Check for Self-encapsulated tunnels */
+ if (in_ipha->ipha_src == ipha->ipha_src &&
+ in_ipha->ipha_dst == ipha->ipha_dst) {
+
+ mp = icmp_inbound_self_encap_error_v4(mp, ipha,
+ in_ipha);
+ if (mp == NULL)
+ goto discard_pkt;
+
+ /*
+ * Just in case self_encap didn't preserve the NULL
+ * b_cont
+ */
+ if (mp->b_cont != NULL) {
+ if (!pullupmsg(mp, -1))
goto discard_pkt;
- }
- icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
- ipha = (ipha_t *)&icmph[1];
}
/*
- * Caller has verified that length has to be
- * at least the size of IP header.
+ * Note that ira_pktlen and ira_ip_hdr_length are no
+ * longer correct, but we don't use them any more here.
*/
- ASSERT(hdr_length >= sizeof (ipha_t));
+ if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH)
+ goto truncated;
+
/*
- * Check the sanity of the inner IP header like
- * we did for the outer header.
+ * Verify the modified message before any further
+ * processes.
*/
- in_ipha = (ipha_t *)((uchar_t *)ipha + hdr_length);
- if ((IPH_HDR_VERSION(in_ipha) != IPV4_VERSION) ||
- IPH_HDR_LENGTH(in_ipha) < sizeof (ipha_t))
- goto discard_pkt;
- /* Check for Self-encapsulated tunnels */
- if (in_ipha->ipha_src == ipha->ipha_src &&
- in_ipha->ipha_dst == ipha->ipha_dst) {
-
- mp = icmp_inbound_self_encap_error(mp,
- iph_hdr_length, hdr_length);
- if (mp == NULL)
- goto discard_pkt;
- icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
- ipha = (ipha_t *)&icmph[1];
- hdr_length = IPH_HDR_LENGTH(ipha);
- /*
- * The packet in error is self-encapsualted.
- * And we are finding it further encapsulated
- * which we could not have possibly generated.
- */
- if (ipha->ipha_protocol == IPPROTO_ENCAP) {
- goto discard_pkt;
- }
- icmp_inbound_error_fanout(q, ill, first_mp,
- icmph, ipha, iph_hdr_length, hdr_length,
- mctl_present, ip_policy, recv_ill, zoneid);
+ ipha = (ipha_t *)mp->b_rptr;
+ hdr_length = IPH_HDR_LENGTH(ipha);
+ icmph = (icmph_t *)&mp->b_rptr[hdr_length];
+ if (!icmp_inbound_verify_v4(mp, icmph, ira)) {
+ freemsg(mp);
return;
}
- }
- DB_TYPE(mp) = M_CTL;
- if (icmp_inbound_iptun_fanout(first_mp, &ripha, ill, ipst))
+ /*
+ * The packet in error is self-encapsulated.
+ * And we are finding it further encapsulated
+ * which we could not have possibly generated.
+ */
+ if (ipha->ipha_protocol == IPPROTO_ENCAP) {
+ goto discard_pkt;
+ }
+ icmp_inbound_error_fanout_v4(mp, icmph, ira);
return;
+ }
+ /* No self-encapsulated */
+ /* FALLTHRU */
+ }
+ case IPPROTO_IPV6:
+ if ((connp = ipcl_iptun_classify_v4(&ripha.ipha_src,
+ &ripha.ipha_dst, ipst)) != NULL) {
+ ira->ira_flags |= IRAF_ICMP_ERROR;
+ connp->conn_recvicmp(connp, mp, NULL, ira);
+ CONN_DEC_REF(connp);
+ ira->ira_flags &= ~IRAF_ICMP_ERROR;
+ return;
+ }
/*
* No IP tunnel is interested, fallthrough and see
* if a raw socket will want it.
*/
/* FALLTHRU */
default:
- ip_fanout_proto(q, first_mp, ill, &ripha, 0, mctl_present,
- ip_policy, recv_ill, zoneid);
+ ira->ira_flags |= IRAF_ICMP_ERROR;
+ ip_fanout_proto_v4(mp, &ripha, ira);
+ ira->ira_flags &= ~IRAF_ICMP_ERROR;
return;
}
/* NOTREACHED */
discard_pkt:
BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
-drop_pkt:;
- ip1dbg(("icmp_inbound_error_fanout: drop pkt\n"));
- freemsg(first_mp);
+ ip1dbg(("icmp_inbound_error_fanout_v4: drop pkt\n"));
+ ip_drop_input("ipIfStatsInDiscards", mp, ill);
+ freemsg(mp);
+ return;
+
+truncated:
+ /* We pulled up everything already. Must be truncated */
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
+ ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
+ freemsg(mp);
}
/*
@@ -2747,6 +2498,16 @@ ipoptp_first(ipoptp_t *optp, ipha_t *ipha)
return (ipoptp_next(optp));
}
+/* Like above but without an ipha_t */
+uint8_t
+ipoptp_first2(ipoptp_t *optp, uint32_t totallen, uint8_t *opt)
+{
+ optp->ipoptp_next = opt;
+ optp->ipoptp_end = optp->ipoptp_next + totallen;
+ optp->ipoptp_flags = 0;
+ return (ipoptp_next(optp));
+}
+
/*
* Common IP options parser: extract next option.
*/
@@ -2858,38 +2619,55 @@ ipoptp_next(ipoptp_t *optp)
/*
* Use the outgoing IP header to create an IP_OPTIONS option the way
* it was passed down from the application.
+ *
+ * This is compatible with BSD in that it returns
+ * the reverse source route with the final destination
+ * as the last entry. The first 4 bytes of the option
+ * will contain the final destination.
*/
int
-ip_opt_get_user(const ipha_t *ipha, uchar_t *buf)
+ip_opt_get_user(conn_t *connp, uchar_t *buf)
{
ipoptp_t opts;
- const uchar_t *opt;
+ uchar_t *opt;
uint8_t optval;
uint8_t optlen;
uint32_t len = 0;
- uchar_t *buf1 = buf;
+ uchar_t *buf1 = buf;
+ uint32_t totallen;
+ ipaddr_t dst;
+ ip_pkt_t *ipp = &connp->conn_xmit_ipp;
+
+ if (!(ipp->ipp_fields & IPPF_IPV4_OPTIONS))
+ return (0);
+
+ totallen = ipp->ipp_ipv4_options_len;
+ if (totallen & 0x3)
+ return (0);
buf += IP_ADDR_LEN; /* Leave room for final destination */
len += IP_ADDR_LEN;
bzero(buf1, IP_ADDR_LEN);
- /*
- * OK to cast away const here, as we don't store through the returned
- * opts.ipoptp_cur pointer.
- */
- for (optval = ipoptp_first(&opts, (ipha_t *)ipha);
+ dst = connp->conn_faddr_v4;
+
+ for (optval = ipoptp_first2(&opts, totallen, ipp->ipp_ipv4_options);
optval != IPOPT_EOL;
optval = ipoptp_next(&opts)) {
int off;
opt = opts.ipoptp_cur;
+ if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
+ break;
+ }
optlen = opts.ipoptp_len;
+
switch (optval) {
case IPOPT_SSRR:
case IPOPT_LSRR:
/*
- * Insert ipha_dst as the first entry in the source
+ * Insert destination as the first entry in the source
* route and move down the entries on step.
* The last entry gets placed at buf1.
*/
@@ -2902,8 +2680,9 @@ ip_opt_get_user(const ipha_t *ipha, uchar_t *buf)
/* No entries in source route */
break;
}
- /* Last entry in source route */
- bcopy(opt + off, buf1, IP_ADDR_LEN);
+ /* Last entry in source route if not already set */
+ if (dst == INADDR_ANY)
+ bcopy(opt + off, buf1, IP_ADDR_LEN);
off -= IP_ADDR_LEN;
while (off > 0) {
@@ -2913,19 +2692,12 @@ ip_opt_get_user(const ipha_t *ipha, uchar_t *buf)
off -= IP_ADDR_LEN;
}
/* ipha_dst into first slot */
- bcopy(&ipha->ipha_dst,
- buf + off + IP_ADDR_LEN,
+ bcopy(&dst, buf + off + IP_ADDR_LEN,
IP_ADDR_LEN);
buf += optlen;
len += optlen;
break;
- case IPOPT_COMSEC:
- case IPOPT_SECURITY:
- /* if passing up a label is not ok, then remove */
- if (is_system_labeled())
- break;
- /* FALLTHROUGH */
default:
bcopy(opt, buf, optlen);
buf += optlen;
@@ -3007,57 +2779,46 @@ icmp_options_update(ipha_t *ipha)
/*
* Process received ICMP Redirect messages.
+ * Assumes the caller has verified that the headers are in the pulled up mblk.
+ * Consumes mp.
*/
static void
-icmp_redirect(ill_t *ill, mblk_t *mp)
+icmp_redirect_v4(mblk_t *mp, ipha_t *ipha, icmph_t *icmph, ip_recv_attr_t *ira)
{
- ipha_t *ipha;
- int iph_hdr_length;
- icmph_t *icmph;
- ipha_t *ipha_err;
- ire_t *ire;
- ire_t *prev_ire;
- ire_t *save_ire;
- ipaddr_t src, dst, gateway;
- iulp_t ulp_info = { 0 };
- int error;
- ip_stack_t *ipst;
+ ire_t *ire, *nire;
+ ire_t *prev_ire;
+ ipaddr_t src, dst, gateway;
+ ip_stack_t *ipst = ira->ira_ill->ill_ipst;
+ ipha_t *inner_ipha; /* Inner IP header */
- ASSERT(ill != NULL);
- ipst = ill->ill_ipst;
-
- ipha = (ipha_t *)mp->b_rptr;
- iph_hdr_length = IPH_HDR_LENGTH(ipha);
- if (((mp->b_wptr - mp->b_rptr) - iph_hdr_length) <
- sizeof (icmph_t) + IP_SIMPLE_HDR_LENGTH) {
- BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors);
- freemsg(mp);
- return;
- }
- icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
- ipha_err = (ipha_t *)&icmph[1];
+ /* Caller already pulled up everything. */
+ inner_ipha = (ipha_t *)&icmph[1];
src = ipha->ipha_src;
- dst = ipha_err->ipha_dst;
+ dst = inner_ipha->ipha_dst;
gateway = icmph->icmph_rd_gateway;
/* Make sure the new gateway is reachable somehow. */
- ire = ire_route_lookup(gateway, 0, 0, IRE_INTERFACE, NULL, NULL,
- ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
+ ire = ire_ftable_lookup_v4(gateway, 0, 0, IRE_ONLINK, NULL,
+ ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, ipst, NULL);
/*
* Make sure we had a route for the dest in question and that
* that route was pointing to the old gateway (the source of the
* redirect packet.)
+ * Note: this merely says that there is some IRE which matches that
+ * gateway; not that the longest match matches that gateway.
*/
- prev_ire = ire_route_lookup(dst, 0, src, 0, NULL, NULL, ALL_ZONES,
- NULL, MATCH_IRE_GW, ipst);
+ prev_ire = ire_ftable_lookup_v4(dst, 0, src, 0, NULL, ALL_ZONES,
+ NULL, MATCH_IRE_GW, 0, ipst, NULL);
/*
* Check that
* the redirect was not from ourselves
* the new gateway and the old gateway are directly reachable
*/
- if (!prev_ire ||
- !ire ||
- ire->ire_type == IRE_LOCAL) {
+ if (prev_ire == NULL || ire == NULL ||
+ (prev_ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK)) ||
+ (prev_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
+ !(ire->ire_type & IRE_IF_ALL)) {
BUMP_MIB(&ipst->ips_icmp_mib, icmpInBadRedirects);
+ ip_drop_input("icmpInBadRedirects - ire", mp, ira->ira_ill);
freemsg(mp);
if (ire != NULL)
ire_refrele(ire);
@@ -3066,49 +2827,9 @@ icmp_redirect(ill_t *ill, mblk_t *mp)
return;
}
- /*
- * Should we use the old ULP info to create the new gateway? From
- * a user's perspective, we should inherit the info so that it
- * is a "smooth" transition. If we do not do that, then new
- * connections going thru the new gateway will have no route metrics,
- * which is counter-intuitive to user. From a network point of
- * view, this may or may not make sense even though the new gateway
- * is still directly connected to us so the route metrics should not
- * change much.
- *
- * But if the old ire_uinfo is not initialized, we do another
- * recursive lookup on the dest using the new gateway. There may
- * be a route to that. If so, use it to initialize the redirect
- * route.
- */
- if (prev_ire->ire_uinfo.iulp_set) {
- bcopy(&prev_ire->ire_uinfo, &ulp_info, sizeof (iulp_t));
- } else {
- ire_t *tmp_ire;
- ire_t *sire;
-
- tmp_ire = ire_ftable_lookup(dst, 0, gateway, 0, NULL, &sire,
- ALL_ZONES, 0, NULL,
- (MATCH_IRE_RECURSIVE | MATCH_IRE_GW | MATCH_IRE_DEFAULT),
- ipst);
- if (sire != NULL) {
- bcopy(&sire->ire_uinfo, &ulp_info, sizeof (iulp_t));
- /*
- * If sire != NULL, ire_ftable_lookup() should not
- * return a NULL value.
- */
- ASSERT(tmp_ire != NULL);
- ire_refrele(tmp_ire);
- ire_refrele(sire);
- } else if (tmp_ire != NULL) {
- bcopy(&tmp_ire->ire_uinfo, &ulp_info,
- sizeof (iulp_t));
- ire_refrele(tmp_ire);
- }
- }
- if (prev_ire->ire_type == IRE_CACHE)
- ire_delete(prev_ire);
ire_refrele(prev_ire);
+ ire_refrele(ire);
+
/*
* TODO: more precise handling for cases 0, 2, 3, the latter two
* require TOS routing
@@ -3121,47 +2842,42 @@ icmp_redirect(ill_t *ill, mblk_t *mp)
case 3:
break;
default:
- freemsg(mp);
BUMP_MIB(&ipst->ips_icmp_mib, icmpInBadRedirects);
- ire_refrele(ire);
+ ip_drop_input("icmpInBadRedirects - code", mp, ira->ira_ill);
+ freemsg(mp);
return;
}
/*
* Create a Route Association. This will allow us to remember that
* someone we believe told us to use the particular gateway.
*/
- save_ire = ire;
ire = ire_create(
(uchar_t *)&dst, /* dest addr */
(uchar_t *)&ip_g_all_ones, /* mask */
- (uchar_t *)&save_ire->ire_src_addr, /* source addr */
(uchar_t *)&gateway, /* gateway addr */
- &save_ire->ire_max_frag, /* max frag */
- NULL, /* no src nce */
- NULL, /* no rfq */
- NULL, /* no stq */
IRE_HOST,
- NULL, /* ipif */
- 0, /* cmask */
- 0, /* phandle */
- 0, /* ihandle */
+ NULL, /* ill */
+ ALL_ZONES,
(RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST),
- &ulp_info,
NULL, /* tsol_gc_t */
- NULL, /* gcgrp */
ipst);
if (ire == NULL) {
freemsg(mp);
- ire_refrele(save_ire);
return;
}
- error = ire_add(&ire, NULL, NULL, NULL, B_FALSE);
- ire_refrele(save_ire);
- atomic_inc_32(&ipst->ips_ip_redirect_cnt);
+ nire = ire_add(ire);
+ /* Check if it was a duplicate entry */
+ if (nire != NULL && nire != ire) {
+ ASSERT(nire->ire_identical_ref > 1);
+ ire_delete(nire);
+ ire_refrele(nire);
+ nire = NULL;
+ }
+ ire = nire;
+ if (ire != NULL) {
+ ire_refrele(ire); /* Held in ire_add */
- if (error == 0) {
- ire_refrele(ire); /* Held in ire_add_v4 */
/* tell routing sockets that we received a redirect */
ip_rts_change(RTM_REDIRECT, dst, gateway, IP_HOST_MASK, 0, src,
(RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST), 0,
@@ -3173,8 +2889,8 @@ icmp_redirect(ill_t *ill, mblk_t *mp)
* This together with the added IRE has the effect of
* modifying an existing redirect.
*/
- prev_ire = ire_ftable_lookup(dst, 0, src, IRE_HOST, NULL, NULL,
- ALL_ZONES, 0, NULL, (MATCH_IRE_GW | MATCH_IRE_TYPE), ipst);
+ prev_ire = ire_ftable_lookup_v4(dst, 0, src, IRE_HOST, NULL,
+ ALL_ZONES, NULL, (MATCH_IRE_GW | MATCH_IRE_TYPE), 0, ipst, NULL);
if (prev_ire != NULL) {
if (prev_ire ->ire_flags & RTF_DYNAMIC)
ire_delete(prev_ire);
@@ -3186,29 +2902,24 @@ icmp_redirect(ill_t *ill, mblk_t *mp)
/*
* Generate an ICMP parameter problem message.
+ * When called from ip_output side a minimal ip_recv_attr_t needs to be
+ * constructed by the caller.
*/
static void
-icmp_param_problem(queue_t *q, mblk_t *mp, uint8_t ptr, zoneid_t zoneid,
- ip_stack_t *ipst)
+icmp_param_problem(mblk_t *mp, uint8_t ptr, ip_recv_attr_t *ira)
{
icmph_t icmph;
- boolean_t mctl_present;
- mblk_t *first_mp;
+ ip_stack_t *ipst = ira->ira_ill->ill_ipst;
- EXTRACT_PKT_MP(mp, first_mp, mctl_present);
-
- if (!(mp = icmp_pkt_err_ok(mp, ipst))) {
- if (mctl_present)
- freeb(first_mp);
+ mp = icmp_pkt_err_ok(mp, ira);
+ if (mp == NULL)
return;
- }
bzero(&icmph, sizeof (icmph_t));
icmph.icmph_type = ICMP_PARAM_PROBLEM;
icmph.icmph_pp_ptr = ptr;
BUMP_MIB(&ipst->ips_icmp_mib, icmpOutParmProbs);
- icmp_pkt(q, first_mp, &icmph, sizeof (icmph_t), mctl_present, zoneid,
- ipst);
+ icmp_pkt(mp, &icmph, sizeof (icmph_t), ira);
}
/*
@@ -3217,15 +2928,11 @@ icmp_param_problem(queue_t *q, mblk_t *mp, uint8_t ptr, zoneid_t zoneid,
* Note: assumes that icmp_pkt_err_ok has been called to verify that
* an icmp error packet can be sent.
* Assigns an appropriate source address to the packet. If ipha_dst is
- * one of our addresses use it for source. Otherwise pick a source based
- * on a route lookup back to ipha_src.
- * Note that ipha_src must be set here since the
- * packet is likely to arrive on an ill queue in ip_wput() which will
- * not set a source address.
+ * one of our addresses use it for source. Otherwise let ip_output_simple
+ * pick the source address.
*/
static void
-icmp_pkt(queue_t *q, mblk_t *mp, void *stuff, size_t len,
- boolean_t mctl_present, zoneid_t zoneid, ip_stack_t *ipst)
+icmp_pkt(mblk_t *mp, void *stuff, size_t len, ip_recv_attr_t *ira)
{
ipaddr_t dst;
icmph_t *icmph;
@@ -3235,115 +2942,62 @@ icmp_pkt(queue_t *q, mblk_t *mp, void *stuff, size_t len,
mblk_t *mp1;
ipaddr_t src;
ire_t *ire;
- mblk_t *ipsec_mp;
- ipsec_out_t *io = NULL;
-
- if (mctl_present) {
- /*
- * If it is :
- *
- * 1) a IPSEC_OUT, then this is caused by outbound
- * datagram originating on this host. IPsec processing
- * may or may not have been done. Refer to comments above
- * icmp_inbound_error_fanout for details.
- *
- * 2) a IPSEC_IN if we are generating a icmp_message
- * for an incoming datagram destined for us i.e called
- * from ip_fanout_send_icmp.
- */
- ipsec_info_t *in;
- ipsec_mp = mp;
- mp = ipsec_mp->b_cont;
+ ip_xmit_attr_t ixas;
+ ip_stack_t *ipst = ira->ira_ill->ill_ipst;
- in = (ipsec_info_t *)ipsec_mp->b_rptr;
- ipha = (ipha_t *)mp->b_rptr;
+ ipha = (ipha_t *)mp->b_rptr;
- ASSERT(in->ipsec_info_type == IPSEC_OUT ||
- in->ipsec_info_type == IPSEC_IN);
+ bzero(&ixas, sizeof (ixas));
+ ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
+ ixas.ixa_zoneid = ira->ira_zoneid;
+ ixas.ixa_ifindex = 0;
+ ixas.ixa_ipst = ipst;
+ ixas.ixa_cred = kcred;
+ ixas.ixa_cpid = NOPID;
+ ixas.ixa_tsl = ira->ira_tsl; /* Behave as a multi-level responder */
+ ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
- if (in->ipsec_info_type == IPSEC_IN) {
- /*
- * Convert the IPSEC_IN to IPSEC_OUT.
- */
- if (!ipsec_in_to_out(ipsec_mp, ipha, NULL, zoneid)) {
- BUMP_MIB(&ipst->ips_ip_mib,
- ipIfStatsOutDiscards);
- return;
- }
- io = (ipsec_out_t *)ipsec_mp->b_rptr;
- } else {
- ASSERT(in->ipsec_info_type == IPSEC_OUT);
- io = (ipsec_out_t *)in;
- /*
- * Clear out ipsec_out_proc_begin, so we do a fresh
- * ire lookup.
- */
- io->ipsec_out_proc_begin = B_FALSE;
- }
- ASSERT(zoneid != ALL_ZONES);
- /*
- * The IPSEC_IN (now an IPSEC_OUT) didn't have its zoneid
- * initialized. We need to do that now.
- */
- io->ipsec_out_zoneid = zoneid;
- } else {
+ if (ira->ira_flags & IRAF_IPSEC_SECURE) {
/*
- * This is in clear. The icmp message we are building
- * here should go out in clear.
+ * Apply IPsec based on how IPsec was applied to
+ * the packet that had the error.
*
- * Pardon the convolution of it all, but it's easier to
- * allocate a "use cleartext" IPSEC_IN message and convert
- * it than it is to allocate a new one.
+ * If it was an outbound packet that caused the ICMP
+ * error, then the caller will have setup the IRA
+ * appropriately.
*/
- ipsec_in_t *ii;
- ASSERT(DB_TYPE(mp) == M_DATA);
- ipsec_mp = ipsec_in_alloc(B_TRUE, ipst->ips_netstack);
- if (ipsec_mp == NULL) {
- freemsg(mp);
+ if (!ipsec_in_to_out(ira, &ixas, mp, ipha, NULL)) {
BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
+ /* Note: mp already consumed and ip_drop_packet done */
return;
}
- ii = (ipsec_in_t *)ipsec_mp->b_rptr;
-
- /* This is not a secure packet */
- ii->ipsec_in_secure = B_FALSE;
- ipsec_mp->b_cont = mp;
- ipha = (ipha_t *)mp->b_rptr;
+ } else {
/*
- * Convert the IPSEC_IN to IPSEC_OUT.
+ * This is in clear. The icmp message we are building
+ * here should go out in clear, independent of our policy.
*/
- if (!ipsec_in_to_out(ipsec_mp, ipha, NULL, zoneid)) {
- BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
- return;
- }
- io = (ipsec_out_t *)ipsec_mp->b_rptr;
+ ixas.ixa_flags |= IXAF_NO_IPSEC;
}
/* Remember our eventual destination */
dst = ipha->ipha_src;
- ire = ire_route_lookup(ipha->ipha_dst, 0, 0, (IRE_LOCAL|IRE_LOOPBACK),
- NULL, NULL, zoneid, NULL, MATCH_IRE_TYPE, ipst);
- if (ire != NULL &&
- (ire->ire_zoneid == zoneid || ire->ire_zoneid == ALL_ZONES)) {
+ /*
+ * If the packet was for one of our unicast addresses, make
+ * sure we respond with that as the source. Otherwise
+ * have ip_output_simple pick the source address.
+ */
+ ire = ire_ftable_lookup_v4(ipha->ipha_dst, 0, 0,
+ (IRE_LOCAL|IRE_LOOPBACK), NULL, ira->ira_zoneid, NULL,
+ MATCH_IRE_TYPE|MATCH_IRE_ZONEONLY, 0, ipst, NULL);
+ if (ire != NULL) {
+ ire_refrele(ire);
src = ipha->ipha_dst;
} else {
- if (ire != NULL)
- ire_refrele(ire);
- ire = ire_route_lookup(dst, 0, 0, 0, NULL, NULL, zoneid, NULL,
- (MATCH_IRE_DEFAULT|MATCH_IRE_RECURSIVE|MATCH_IRE_ZONEONLY),
- ipst);
- if (ire == NULL) {
- BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes);
- freemsg(ipsec_mp);
- return;
- }
- src = ire->ire_src_addr;
+ src = INADDR_ANY;
+ ixas.ixa_flags |= IXAF_SET_SOURCE;
}
- if (ire != NULL)
- ire_refrele(ire);
-
/*
* Check if we can send back more then 8 bytes in addition to
* the IP header. We try to send 64 bytes of data and the internal
@@ -3352,10 +3006,10 @@ icmp_pkt(queue_t *q, mblk_t *mp, void *stuff, size_t len,
len_needed = IPH_HDR_LENGTH(ipha);
if (ipha->ipha_protocol == IPPROTO_ENCAP ||
ipha->ipha_protocol == IPPROTO_IPV6) {
-
if (!pullupmsg(mp, -1)) {
BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
- freemsg(ipsec_mp);
+ ip_drop_output("ipIfStatsOutDiscards", mp, NULL);
+ freemsg(mp);
return;
}
ipha = (ipha_t *)mp->b_rptr;
@@ -3376,28 +3030,23 @@ icmp_pkt(queue_t *q, mblk_t *mp, void *stuff, size_t len,
(void) adjmsg(mp, len_needed - msg_len);
msg_len = len_needed;
}
- /* Make sure we propagate the cred/label for TX */
- mp1 = allocb_tmpl(sizeof (icmp_ipha) + len, mp);
+ mp1 = allocb(sizeof (icmp_ipha) + len, BPRI_MED);
if (mp1 == NULL) {
BUMP_MIB(&ipst->ips_icmp_mib, icmpOutErrors);
- freemsg(ipsec_mp);
+ freemsg(mp);
return;
}
mp1->b_cont = mp;
mp = mp1;
- ASSERT(ipsec_mp->b_datap->db_type == M_CTL &&
- ipsec_mp->b_rptr == (uint8_t *)io &&
- io->ipsec_out_type == IPSEC_OUT);
- ipsec_mp->b_cont = mp;
/*
- * Set ipsec_out_icmp_loopback so we can let the ICMP messages this
+ * Set IXAF_TRUSTED_ICMP so we can let the ICMP messages this
* node generates be accepted in peace by all on-host destinations.
* If we do NOT assume that all on-host destinations trust
* self-generated ICMP messages, then rework here, ip6.c, and spd.c.
- * (Look for ipsec_out_icmp_loopback).
+ * (Look for IXAF_TRUSTED_ICMP).
*/
- io->ipsec_out_icmp_loopback = B_TRUE;
+ ixas.ixa_flags |= IXAF_TRUSTED_ICMP;
ipha = (ipha_t *)mp->b_rptr;
mp1->b_wptr = (uchar_t *)ipha + (sizeof (icmp_ipha) + len);
@@ -3416,7 +3065,9 @@ icmp_pkt(queue_t *q, mblk_t *mp, void *stuff, size_t len,
icmph->icmph_checksum = 0;
icmph->icmph_checksum = IP_CSUM(mp, (int32_t)sizeof (ipha_t), 0);
BUMP_MIB(&ipst->ips_icmp_mib, icmpOutMsgs);
- put(q, ipsec_mp);
+
+ (void) ip_output_simple(mp, &ixas);
+ ixa_cleanup(&ixas);
}
/*
@@ -3480,37 +3131,30 @@ icmp_err_rate_limit(ip_stack_t *ipst)
* ICMP error packet should be sent.
*/
static mblk_t *
-icmp_pkt_err_ok(mblk_t *mp, ip_stack_t *ipst)
+icmp_pkt_err_ok(mblk_t *mp, ip_recv_attr_t *ira)
{
+ ip_stack_t *ipst = ira->ira_ill->ill_ipst;
icmph_t *icmph;
ipha_t *ipha;
uint_t len_needed;
- ire_t *src_ire;
- ire_t *dst_ire;
if (!mp)
return (NULL);
ipha = (ipha_t *)mp->b_rptr;
if (ip_csum_hdr(ipha)) {
BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInCksumErrs);
+ ip_drop_input("ipIfStatsInCksumErrs", mp, NULL);
freemsg(mp);
return (NULL);
}
- src_ire = ire_ctable_lookup(ipha->ipha_dst, 0, IRE_BROADCAST,
- NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
- dst_ire = ire_ctable_lookup(ipha->ipha_src, 0, IRE_BROADCAST,
- NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
- if (src_ire != NULL || dst_ire != NULL ||
+ if (ip_type_v4(ipha->ipha_dst, ipst) == IRE_BROADCAST ||
+ ip_type_v4(ipha->ipha_src, ipst) == IRE_BROADCAST ||
CLASSD(ipha->ipha_dst) ||
CLASSD(ipha->ipha_src) ||
(ntohs(ipha->ipha_fragment_offset_and_flags) & IPH_OFFSET)) {
/* Note: only errors to the fragment with offset 0 */
BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
freemsg(mp);
- if (src_ire != NULL)
- ire_refrele(src_ire);
- if (dst_ire != NULL)
- ire_refrele(dst_ire);
return (NULL);
}
if (ipha->ipha_protocol == IPPROTO_ICMP) {
@@ -3546,7 +3190,7 @@ icmp_pkt_err_ok(mblk_t *mp, ip_stack_t *ipst)
* If this is a labeled system, then check to see if we're allowed to
* send a response to this particular sender. If not, then just drop.
*/
- if (is_system_labeled() && !tsol_can_reply_error(mp)) {
+ if (is_system_labeled() && !tsol_can_reply_error(mp, ira)) {
ip2dbg(("icmp_pkt_err_ok: can't respond to packet\n"));
BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
freemsg(mp);
@@ -3565,956 +3209,178 @@ icmp_pkt_err_ok(mblk_t *mp, ip_stack_t *ipst)
}
/*
- * Generate an ICMP redirect message.
+ * Called when a packet was sent out the same link that it arrived on.
+ * Check if it is ok to send a redirect and then send it.
*/
-static void
-icmp_send_redirect(queue_t *q, mblk_t *mp, ipaddr_t gateway, ip_stack_t *ipst)
+void
+ip_send_potential_redirect_v4(mblk_t *mp, ipha_t *ipha, ire_t *ire,
+ ip_recv_attr_t *ira)
{
- icmph_t icmph;
+ ip_stack_t *ipst = ira->ira_ill->ill_ipst;
+ ipaddr_t src, nhop;
+ mblk_t *mp1;
+ ire_t *nhop_ire;
/*
- * We are called from ip_rput where we could
- * not have attached an IPSEC_IN.
- */
- ASSERT(mp->b_datap->db_type == M_DATA);
-
- if (!(mp = icmp_pkt_err_ok(mp, ipst))) {
+ * Check the source address to see if it originated
+ * on the same logical subnet it is going back out on.
+ * If so, we should be able to send it a redirect.
+ * Avoid sending a redirect if the destination
+ * is directly connected (i.e., we matched an IRE_ONLINK),
+ * or if the packet was source routed out this interface.
+ *
+ * We avoid sending a redirect if the
+ * destination is directly connected
+ * because it is possible that multiple
+ * IP subnets may have been configured on
+ * the link, and the source may not
+ * be on the same subnet as ip destination,
+ * even though they are on the same
+ * physical link.
+ */
+ if ((ire->ire_type & IRE_ONLINK) ||
+ ip_source_routed(ipha, ipst))
return;
- }
-
- bzero(&icmph, sizeof (icmph_t));
- icmph.icmph_type = ICMP_REDIRECT;
- icmph.icmph_code = 1;
- icmph.icmph_rd_gateway = gateway;
- BUMP_MIB(&ipst->ips_icmp_mib, icmpOutRedirects);
- /* Redirects sent by router, and router is global zone */
- icmp_pkt(q, mp, &icmph, sizeof (icmph_t), B_FALSE, GLOBAL_ZONEID, ipst);
-}
-/*
- * Generate an ICMP time exceeded message.
- */
-void
-icmp_time_exceeded(queue_t *q, mblk_t *mp, uint8_t code, zoneid_t zoneid,
- ip_stack_t *ipst)
-{
- icmph_t icmph;
- boolean_t mctl_present;
- mblk_t *first_mp;
-
- EXTRACT_PKT_MP(mp, first_mp, mctl_present);
-
- if (!(mp = icmp_pkt_err_ok(mp, ipst))) {
- if (mctl_present)
- freeb(first_mp);
+ nhop_ire = ire_nexthop(ire);
+ if (nhop_ire == NULL)
return;
- }
-
- bzero(&icmph, sizeof (icmph_t));
- icmph.icmph_type = ICMP_TIME_EXCEEDED;
- icmph.icmph_code = code;
- BUMP_MIB(&ipst->ips_icmp_mib, icmpOutTimeExcds);
- icmp_pkt(q, first_mp, &icmph, sizeof (icmph_t), mctl_present, zoneid,
- ipst);
-}
-/*
- * Generate an ICMP unreachable message.
- */
-void
-icmp_unreachable(queue_t *q, mblk_t *mp, uint8_t code, zoneid_t zoneid,
- ip_stack_t *ipst)
-{
- icmph_t icmph;
- mblk_t *first_mp;
- boolean_t mctl_present;
+ nhop = nhop_ire->ire_addr;
- EXTRACT_PKT_MP(mp, first_mp, mctl_present);
+ if (nhop_ire->ire_type & IRE_IF_CLONE) {
+ ire_t *ire2;
- if (!(mp = icmp_pkt_err_ok(mp, ipst))) {
- if (mctl_present)
- freeb(first_mp);
- return;
+ /* Follow ire_dep_parent to find non-clone IRE_INTERFACE */
+ mutex_enter(&nhop_ire->ire_lock);
+ ire2 = nhop_ire->ire_dep_parent;
+ if (ire2 != NULL)
+ ire_refhold(ire2);
+ mutex_exit(&nhop_ire->ire_lock);
+ ire_refrele(nhop_ire);
+ nhop_ire = ire2;
}
-
- bzero(&icmph, sizeof (icmph_t));
- icmph.icmph_type = ICMP_DEST_UNREACHABLE;
- icmph.icmph_code = code;
- BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDestUnreachs);
- ip2dbg(("send icmp destination unreachable code %d\n", code));
- icmp_pkt(q, first_mp, (char *)&icmph, sizeof (icmph_t), mctl_present,
- zoneid, ipst);
-}
-
-/*
- * Attempt to start recovery of an IPv4 interface that's been shut down as a
- * duplicate. As long as someone else holds the address, the interface will
- * stay down. When that conflict goes away, the interface is brought back up.
- * This is done so that accidental shutdowns of addresses aren't made
- * permanent. Your server will recover from a failure.
- *
- * For DHCP, recovery is not done in the kernel. Instead, it's handled by a
- * user space process (dhcpagent).
- *
- * Recovery completes if ARP reports that the address is now ours (via
- * AR_CN_READY). In that case, we go to ip_arp_excl to finish the operation.
- *
- * This function is entered on a timer expiry; the ID is in ipif_recovery_id.
- */
-static void
-ipif_dup_recovery(void *arg)
-{
- ipif_t *ipif = arg;
- ill_t *ill = ipif->ipif_ill;
- mblk_t *arp_add_mp;
- mblk_t *arp_del_mp;
- ip_stack_t *ipst = ill->ill_ipst;
-
- ipif->ipif_recovery_id = 0;
-
- /*
- * No lock needed for moving or condemned check, as this is just an
- * optimization.
- */
- if (ill->ill_arp_closing || !(ipif->ipif_flags & IPIF_DUPLICATE) ||
- (ipif->ipif_flags & IPIF_POINTOPOINT) ||
- (ipif->ipif_state_flags & (IPIF_CONDEMNED))) {
- /* No reason to try to bring this address back. */
+ if (nhop_ire == NULL)
return;
- }
- /* ACE_F_UNVERIFIED restarts DAD */
- if ((arp_add_mp = ipif_area_alloc(ipif, ACE_F_UNVERIFIED)) == NULL)
- goto alloc_fail;
-
- if (ipif->ipif_arp_del_mp == NULL) {
- if ((arp_del_mp = ipif_ared_alloc(ipif)) == NULL)
- goto alloc_fail;
- ipif->ipif_arp_del_mp = arp_del_mp;
- }
+ ASSERT(!(nhop_ire->ire_type & IRE_IF_CLONE));
- putnext(ill->ill_rq, arp_add_mp);
- return;
+ src = ipha->ipha_src;
-alloc_fail:
/*
- * On allocation failure, just restart the timer. Note that the ipif
- * is down here, so no other thread could be trying to start a recovery
- * timer. The ill_lock protects the condemned flag and the recovery
- * timer ID.
+ * We look at the interface ire for the nexthop,
+ * to see if ipha_src is in the same subnet
+ * as the nexthop.
*/
- freemsg(arp_add_mp);
- mutex_enter(&ill->ill_lock);
- if (ipst->ips_ip_dup_recovery > 0 && ipif->ipif_recovery_id == 0 &&
- !(ipif->ipif_state_flags & IPIF_CONDEMNED)) {
- ipif->ipif_recovery_id = timeout(ipif_dup_recovery, ipif,
- MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
- }
- mutex_exit(&ill->ill_lock);
-}
-
-/*
- * This is for exclusive changes due to ARP. Either tear down an interface due
- * to AR_CN_FAILED and AR_CN_BOGON, or bring one up for successful recovery.
- */
-/* ARGSUSED */
-static void
-ip_arp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
-{
- ill_t *ill = rq->q_ptr;
- arh_t *arh;
- ipaddr_t src;
- ipif_t *ipif;
- char ibuf[LIFNAMSIZ + 10]; /* 10 digits for logical i/f number */
- char hbuf[MAC_STR_LEN];
- char sbuf[INET_ADDRSTRLEN];
- const char *failtype;
- boolean_t bring_up;
- ip_stack_t *ipst = ill->ill_ipst;
-
- switch (((arcn_t *)mp->b_rptr)->arcn_code) {
- case AR_CN_READY:
- failtype = NULL;
- bring_up = B_TRUE;
- break;
- case AR_CN_FAILED:
- failtype = "in use";
- bring_up = B_FALSE;
- break;
- default:
- failtype = "claimed";
- bring_up = B_FALSE;
- break;
- }
-
- arh = (arh_t *)mp->b_cont->b_rptr;
- bcopy((char *)&arh[1] + arh->arh_hlen, &src, IP_ADDR_LEN);
-
- (void) mac_colon_addr((uint8_t *)(arh + 1), arh->arh_hlen, hbuf,
- sizeof (hbuf));
- (void) ip_dot_addr(src, sbuf);
- for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
-
- if ((ipif->ipif_flags & IPIF_POINTOPOINT) ||
- ipif->ipif_lcl_addr != src) {
- continue;
- }
-
- /*
- * If we failed on a recovery probe, then restart the timer to
- * try again later.
- */
- if (!bring_up && (ipif->ipif_flags & IPIF_DUPLICATE) &&
- !(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
- ill->ill_net_type == IRE_IF_RESOLVER &&
- !(ipif->ipif_state_flags & IPIF_CONDEMNED) &&
- ipst->ips_ip_dup_recovery > 0 &&
- ipif->ipif_recovery_id == 0) {
- ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
- ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
- continue;
- }
-
- /*
- * If what we're trying to do has already been done, then do
- * nothing.
- */
- if (bring_up == ((ipif->ipif_flags & IPIF_UP) != 0))
- continue;
-
- ipif_get_name(ipif, ibuf, sizeof (ibuf));
-
- if (failtype == NULL) {
- cmn_err(CE_NOTE, "recovered address %s on %s", sbuf,
- ibuf);
- } else {
- cmn_err(CE_WARN, "%s has duplicate address %s (%s "
- "by %s); disabled", ibuf, sbuf, failtype, hbuf);
- }
-
- if (bring_up) {
- ASSERT(ill->ill_dl_up);
- /*
- * Free up the ARP delete message so we can allocate
- * a fresh one through the normal path.
- */
- freemsg(ipif->ipif_arp_del_mp);
- ipif->ipif_arp_del_mp = NULL;
- if (ipif_resolver_up(ipif, Res_act_initial) !=
- EINPROGRESS) {
- ipif->ipif_addr_ready = 1;
- (void) ipif_up_done(ipif);
- ASSERT(ill->ill_move_ipif == NULL);
- }
- continue;
- }
-
- mutex_enter(&ill->ill_lock);
- ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
- ipif->ipif_flags |= IPIF_DUPLICATE;
- ill->ill_ipif_dup_count++;
- mutex_exit(&ill->ill_lock);
+ if ((src & nhop_ire->ire_mask) == (nhop & nhop_ire->ire_mask)) {
/*
- * Already exclusive on the ill; no need to handle deferred
- * processing here.
+ * The source is directly connected.
*/
- (void) ipif_down(ipif, NULL, NULL);
- ipif_down_tail(ipif);
- mutex_enter(&ill->ill_lock);
- if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
- ill->ill_net_type == IRE_IF_RESOLVER &&
- !(ipif->ipif_state_flags & IPIF_CONDEMNED) &&
- ipst->ips_ip_dup_recovery > 0) {
- ASSERT(ipif->ipif_recovery_id == 0);
- ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
- ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
+ mp1 = copymsg(mp);
+ if (mp1 != NULL) {
+ icmp_send_redirect(mp1, nhop, ira);
}
- mutex_exit(&ill->ill_lock);
}
- freemsg(mp);
-}
-
-/* ARGSUSED */
-static void
-ip_arp_defend(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
-{
- ill_t *ill = rq->q_ptr;
- arh_t *arh;
- ipaddr_t src;
- ipif_t *ipif;
-
- arh = (arh_t *)mp->b_cont->b_rptr;
- bcopy((char *)&arh[1] + arh->arh_hlen, &src, IP_ADDR_LEN);
- for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
- if ((ipif->ipif_flags & IPIF_UP) && ipif->ipif_lcl_addr == src)
- (void) ipif_resolver_up(ipif, Res_act_defend);
- }
- freemsg(mp);
+ ire_refrele(nhop_ire);
}
/*
- * News from ARP. ARP sends notification of interesting events down
- * to its clients using M_CTL messages with the interesting ARP packet
- * attached via b_cont.
- * The interesting event from a device comes up the corresponding ARP-IP-DEV
- * queue as opposed to ARP sending the message to all the clients, i.e. all
- * its ARP-IP-DEV instances. Thus, for AR_CN_ANNOUNCE, we must walk the cache
- * table if a cache IRE is found to delete all the entries for the address in
- * the packet.
+ * Generate an ICMP redirect message.
*/
static void
-ip_arp_news(queue_t *q, mblk_t *mp)
+icmp_send_redirect(mblk_t *mp, ipaddr_t gateway, ip_recv_attr_t *ira)
{
- arcn_t *arcn;
- arh_t *arh;
- ire_t *ire = NULL;
- char hbuf[MAC_STR_LEN];
- char sbuf[INET_ADDRSTRLEN];
- ipaddr_t src;
- in6_addr_t v6src;
- boolean_t isv6 = B_FALSE;
- ipif_t *ipif;
- ill_t *ill;
- ip_stack_t *ipst;
-
- if (CONN_Q(q)) {
- conn_t *connp = Q_TO_CONN(q);
-
- ipst = connp->conn_netstack->netstack_ip;
- } else {
- ill_t *ill = (ill_t *)q->q_ptr;
-
- ipst = ill->ill_ipst;
- }
+ icmph_t icmph;
+ ip_stack_t *ipst = ira->ira_ill->ill_ipst;
- if ((mp->b_wptr - mp->b_rptr) < sizeof (arcn_t) || !mp->b_cont) {
- if (q->q_next) {
- putnext(q, mp);
- } else
- freemsg(mp);
- return;
- }
- arh = (arh_t *)mp->b_cont->b_rptr;
- /* Is it one we are interested in? */
- if (BE16_TO_U16(arh->arh_proto) == ETHERTYPE_IPV6) {
- isv6 = B_TRUE;
- bcopy((char *)&arh[1] + (arh->arh_hlen & 0xFF), &v6src,
- IPV6_ADDR_LEN);
- } else if (BE16_TO_U16(arh->arh_proto) == IP_ARP_PROTO_TYPE) {
- bcopy((char *)&arh[1] + (arh->arh_hlen & 0xFF), &src,
- IP_ADDR_LEN);
- } else {
- freemsg(mp);
+ mp = icmp_pkt_err_ok(mp, ira);
+ if (mp == NULL)
return;
- }
-
- ill = q->q_ptr;
- arcn = (arcn_t *)mp->b_rptr;
- switch (arcn->arcn_code) {
- case AR_CN_BOGON:
- /*
- * Someone is sending ARP packets with a source protocol
- * address that we have published and for which we believe our
- * entry is authoritative and (when ill_arp_extend is set)
- * verified to be unique on the network.
- *
- * The ARP module internally handles the cases where the sender
- * is just probing (for DAD) and where the hardware address of
- * a non-authoritative entry has changed. Thus, these are the
- * real conflicts, and we have to do resolution.
- *
- * We back away quickly from the address if it's from DHCP or
- * otherwise temporary and hasn't been used recently (or at
- * all). We'd like to include "deprecated" addresses here as
- * well (as there's no real reason to defend something we're
- * discarding), but IPMP "reuses" this flag to mean something
- * other than the standard meaning.
- *
- * If the ARP module above is not extended (meaning that it
- * doesn't know how to defend the address), then we just log
- * the problem as we always did and continue on. It's not
- * right, but there's little else we can do, and those old ATM
- * users are going away anyway.
- */
- (void) mac_colon_addr((uint8_t *)(arh + 1), arh->arh_hlen,
- hbuf, sizeof (hbuf));
- (void) ip_dot_addr(src, sbuf);
- if (isv6) {
- ire = ire_cache_lookup_v6(&v6src, ALL_ZONES, NULL,
- ipst);
- } else {
- ire = ire_cache_lookup(src, ALL_ZONES, NULL, ipst);
- }
- if (ire != NULL && IRE_IS_LOCAL(ire)) {
- uint32_t now;
- uint32_t maxage;
- clock_t lused;
- uint_t maxdefense;
- uint_t defs;
-
- /*
- * First, figure out if this address hasn't been used
- * in a while. If it hasn't, then it's a better
- * candidate for abandoning.
- */
- ipif = ire->ire_ipif;
- ASSERT(ipif != NULL);
- now = gethrestime_sec();
- maxage = now - ire->ire_create_time;
- if (maxage > ipst->ips_ip_max_temp_idle)
- maxage = ipst->ips_ip_max_temp_idle;
- lused = drv_hztousec(ddi_get_lbolt() -
- ire->ire_last_used_time) / MICROSEC + 1;
- if (lused >= maxage && (ipif->ipif_flags &
- (IPIF_DHCPRUNNING | IPIF_TEMPORARY)))
- maxdefense = ipst->ips_ip_max_temp_defend;
- else
- maxdefense = ipst->ips_ip_max_defend;
-
- /*
- * Now figure out how many times we've defended
- * ourselves. Ignore defenses that happened long in
- * the past.
- */
- mutex_enter(&ire->ire_lock);
- if ((defs = ire->ire_defense_count) > 0 &&
- now - ire->ire_defense_time >
- ipst->ips_ip_defend_interval) {
- ire->ire_defense_count = defs = 0;
- }
- ire->ire_defense_count++;
- ire->ire_defense_time = now;
- mutex_exit(&ire->ire_lock);
- ill_refhold(ill);
- ire_refrele(ire);
-
- /*
- * If we've defended ourselves too many times already,
- * then give up and tear down the interface(s) using
- * this address. Otherwise, defend by sending out a
- * gratuitous ARP.
- */
- if (defs >= maxdefense && ill->ill_arp_extend) {
- qwriter_ip(ill, q, mp, ip_arp_excl, NEW_OP,
- B_FALSE);
- } else {
- cmn_err(CE_WARN,
- "node %s is using our IP address %s on %s",
- hbuf, sbuf, ill->ill_name);
- /*
- * If this is an old (ATM) ARP module, then
- * don't try to defend the address. Remain
- * compatible with the old behavior. Defend
- * only with new ARP.
- */
- if (ill->ill_arp_extend) {
- qwriter_ip(ill, q, mp, ip_arp_defend,
- NEW_OP, B_FALSE);
- } else {
- ill_refrele(ill);
- }
- }
- return;
- }
- cmn_err(CE_WARN,
- "proxy ARP problem? Node '%s' is using %s on %s",
- hbuf, sbuf, ill->ill_name);
- if (ire != NULL)
- ire_refrele(ire);
- break;
- case AR_CN_ANNOUNCE:
- if (isv6) {
- /*
- * For XRESOLV interfaces.
- * Delete the IRE cache entry and NCE for this
- * v6 address
- */
- ip_ire_clookup_and_delete_v6(&v6src, ipst);
- /*
- * If v6src is a non-zero, it's a router address
- * as below. Do the same sort of thing to clean
- * out off-net IRE_CACHE entries that go through
- * the router.
- */
- if (!IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
- ire_walk_v6(ire_delete_cache_gw_v6,
- (char *)&v6src, ALL_ZONES, ipst);
- }
- } else {
- nce_hw_map_t hwm;
-
- /*
- * ARP gives us a copy of any packet where it thinks
- * the address has changed, so that we can update our
- * caches. We're responsible for caching known answers
- * in the current design. We check whether the
- * hardware address really has changed in all of our
- * entries that have cached this mapping, and if so, we
- * blow them away. This way we will immediately pick
- * up the rare case of a host changing hardware
- * address.
- */
- if (src == 0)
- break;
- hwm.hwm_addr = src;
- hwm.hwm_hwlen = arh->arh_hlen;
- hwm.hwm_hwaddr = (uchar_t *)(arh + 1);
- NDP_HW_CHANGE_INCR(ipst->ips_ndp4);
- ndp_walk_common(ipst->ips_ndp4, NULL,
- (pfi_t)nce_delete_hw_changed, &hwm, ALL_ZONES);
- NDP_HW_CHANGE_DECR(ipst->ips_ndp4);
- }
- break;
- case AR_CN_READY:
- /* No external v6 resolver has a contract to use this */
- if (isv6)
- break;
- /* If the link is down, we'll retry this later */
- if (!(ill->ill_phyint->phyint_flags & PHYI_RUNNING))
- break;
- ipif = ipif_lookup_addr(src, ill, ALL_ZONES, NULL, NULL,
- NULL, NULL, ipst);
- if (ipif != NULL) {
- /*
- * If this is a duplicate recovery, then we now need to
- * go exclusive to bring this thing back up.
- */
- if ((ipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE)) ==
- IPIF_DUPLICATE) {
- ipif_refrele(ipif);
- ill_refhold(ill);
- qwriter_ip(ill, q, mp, ip_arp_excl, NEW_OP,
- B_FALSE);
- return;
- }
- /*
- * If this is the first notice that this address is
- * ready, then let the user know now.
- */
- if ((ipif->ipif_flags & IPIF_UP) &&
- !ipif->ipif_addr_ready) {
- ipif_mask_reply(ipif);
- ipif_up_notify(ipif);
- }
- ipif->ipif_addr_ready = 1;
- ipif_refrele(ipif);
- }
- ire = ire_cache_lookup(src, ALL_ZONES, msg_getlabel(mp), ipst);
- if (ire != NULL) {
- ire->ire_defense_count = 0;
- ire_refrele(ire);
- }
- break;
- case AR_CN_FAILED:
- /* No external v6 resolver has a contract to use this */
- if (isv6)
- break;
- if (!ill->ill_arp_extend) {
- (void) mac_colon_addr((uint8_t *)(arh + 1),
- arh->arh_hlen, hbuf, sizeof (hbuf));
- (void) ip_dot_addr(src, sbuf);
-
- cmn_err(CE_WARN,
- "node %s is using our IP address %s on %s",
- hbuf, sbuf, ill->ill_name);
- break;
- }
- ill_refhold(ill);
- qwriter_ip(ill, q, mp, ip_arp_excl, NEW_OP, B_FALSE);
- return;
- }
- freemsg(mp);
+ bzero(&icmph, sizeof (icmph_t));
+ icmph.icmph_type = ICMP_REDIRECT;
+ icmph.icmph_code = 1;
+ icmph.icmph_rd_gateway = gateway;
+ BUMP_MIB(&ipst->ips_icmp_mib, icmpOutRedirects);
+ icmp_pkt(mp, &icmph, sizeof (icmph_t), ira);
}
/*
- * Create a mblk suitable for carrying the interface index and/or source link
- * address. This mblk is tagged as an M_CTL and is sent to ULP. This is used
- * when the IP_RECVIF and/or IP_RECVSLLA socket option is set by the user
- * application.
+ * Generate an ICMP time exceeded message.
*/
-mblk_t *
-ip_add_info(mblk_t *data_mp, ill_t *ill, uint_t flags, zoneid_t zoneid,
- ip_stack_t *ipst)
+void
+icmp_time_exceeded(mblk_t *mp, uint8_t code, ip_recv_attr_t *ira)
{
- mblk_t *mp;
- ip_pktinfo_t *pinfo;
- ipha_t *ipha;
- struct ether_header *pether;
- boolean_t ipmp_ill_held = B_FALSE;
-
- mp = allocb(sizeof (ip_pktinfo_t), BPRI_MED);
- if (mp == NULL) {
- ip1dbg(("ip_add_info: allocation failure.\n"));
- return (data_mp);
- }
-
- ipha = (ipha_t *)data_mp->b_rptr;
- pinfo = (ip_pktinfo_t *)mp->b_rptr;
- bzero(pinfo, sizeof (ip_pktinfo_t));
- pinfo->ip_pkt_flags = (uchar_t)flags;
- pinfo->ip_pkt_ulp_type = IN_PKTINFO; /* Tell ULP what type of info */
-
- pether = (struct ether_header *)((char *)ipha
- - sizeof (struct ether_header));
-
- /*
- * Make sure the interface is an ethernet type, since this option
- * is currently supported only on this type of interface. Also make
- * sure we are pointing correctly above db_base.
- */
- if ((flags & IPF_RECVSLLA) &&
- ((uchar_t *)pether >= data_mp->b_datap->db_base) &&
- (ill->ill_type == IFT_ETHER) &&
- (ill->ill_net_type == IRE_IF_RESOLVER)) {
- pinfo->ip_pkt_slla.sdl_type = IFT_ETHER;
- bcopy(pether->ether_shost.ether_addr_octet,
- pinfo->ip_pkt_slla.sdl_data, ETHERADDRL);
- } else {
- /*
- * Clear the bit. Indicate to upper layer that IP is not
- * sending this ancillary info.
- */
- pinfo->ip_pkt_flags = pinfo->ip_pkt_flags & ~IPF_RECVSLLA;
- }
-
- /*
- * If `ill' is in an IPMP group, use the IPMP ill to determine
- * IPF_RECVIF and IPF_RECVADDR. (This currently assumes that
- * IPF_RECVADDR support on test addresses is not needed.)
- *
- * Note that `ill' may already be an IPMP ill if e.g. we're
- * processing a packet looped back to an IPMP data address
- * (since those IRE_LOCALs are tied to IPMP ills).
- */
- if (IS_UNDER_IPMP(ill)) {
- if ((ill = ipmp_ill_hold_ipmp_ill(ill)) == NULL) {
- ip1dbg(("ip_add_info: cannot hold IPMP ill.\n"));
- freemsg(mp);
- return (data_mp);
- }
- ipmp_ill_held = B_TRUE;
- }
-
- if (flags & (IPF_RECVIF | IPF_RECVADDR))
- pinfo->ip_pkt_ifindex = ill->ill_phyint->phyint_ifindex;
- if (flags & IPF_RECVADDR) {
- ipif_t *ipif;
- ire_t *ire;
-
- /*
- * Only valid for V4
- */
- ASSERT((ipha->ipha_version_and_hdr_length & 0xf0) ==
- (IPV4_VERSION << 4));
-
- ipif = ipif_get_next_ipif(NULL, ill);
- if (ipif != NULL) {
- /*
- * Since a decision has already been made to deliver the
- * packet, there is no need to test for SECATTR and
- * ZONEONLY.
- * When a multicast packet is transmitted
- * a cache entry is created for the multicast address.
- * When delivering a copy of the packet or when new
- * packets are received we do not want to match on the
- * cached entry so explicitly match on
- * IRE_LOCAL and IRE_LOOPBACK
- */
- ire = ire_ctable_lookup(ipha->ipha_dst, 0,
- IRE_LOCAL | IRE_LOOPBACK,
- ipif, zoneid, NULL,
- MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst);
- if (ire == NULL) {
- /*
- * packet must have come on a different
- * interface.
- * Since a decision has already been made to
- * deliver the packet, there is no need to test
- * for SECATTR and ZONEONLY.
- * Only match on local and broadcast ire's.
- * See detailed comment above.
- */
- ire = ire_ctable_lookup(ipha->ipha_dst, 0,
- IRE_LOCAL | IRE_LOOPBACK, ipif, zoneid,
- NULL, MATCH_IRE_TYPE, ipst);
- }
-
- if (ire == NULL) {
- /*
- * This is either a multicast packet or
- * the address has been removed since
- * the packet was received.
- * Return INADDR_ANY so that normal source
- * selection occurs for the response.
- */
-
- pinfo->ip_pkt_match_addr.s_addr = INADDR_ANY;
- } else {
- pinfo->ip_pkt_match_addr.s_addr =
- ire->ire_src_addr;
- ire_refrele(ire);
- }
- ipif_refrele(ipif);
- } else {
- pinfo->ip_pkt_match_addr.s_addr = INADDR_ANY;
- }
- }
-
- if (ipmp_ill_held)
- ill_refrele(ill);
+ icmph_t icmph;
+ ip_stack_t *ipst = ira->ira_ill->ill_ipst;
- mp->b_datap->db_type = M_CTL;
- mp->b_wptr += sizeof (ip_pktinfo_t);
- mp->b_cont = data_mp;
+ mp = icmp_pkt_err_ok(mp, ira);
+ if (mp == NULL)
+ return;
- return (mp);
+ bzero(&icmph, sizeof (icmph_t));
+ icmph.icmph_type = ICMP_TIME_EXCEEDED;
+ icmph.icmph_code = code;
+ BUMP_MIB(&ipst->ips_icmp_mib, icmpOutTimeExcds);
+ icmp_pkt(mp, &icmph, sizeof (icmph_t), ira);
}
/*
- * Used to determine the most accurate cred_t to use for TX.
- * First priority is SCM_UCRED having set the label in the message,
- * which is used for MLP on UDP. Second priority is the open credentials
- * with the peer's label (aka conn_effective_cred), which is needed for
- * MLP on TCP/SCTP and for MAC-Exempt. Last priority is the open credentials.
+ * Generate an ICMP unreachable message.
+ * When called from ip_output side a minimal ip_recv_attr_t needs to be
+ * constructed by the caller.
*/
-cred_t *
-ip_best_cred(mblk_t *mp, conn_t *connp, pid_t *pidp)
+void
+icmp_unreachable(mblk_t *mp, uint8_t code, ip_recv_attr_t *ira)
{
- cred_t *cr;
+ icmph_t icmph;
+ ip_stack_t *ipst = ira->ira_ill->ill_ipst;
- cr = msg_getcred(mp, pidp);
- if (cr != NULL && crgetlabel(cr) != NULL)
- return (cr);
- *pidp = NOPID;
- return (CONN_CRED(connp));
+ mp = icmp_pkt_err_ok(mp, ira);
+ if (mp == NULL)
+ return;
+
+ bzero(&icmph, sizeof (icmph_t));
+ icmph.icmph_type = ICMP_DEST_UNREACHABLE;
+ icmph.icmph_code = code;
+ BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDestUnreachs);
+ icmp_pkt(mp, &icmph, sizeof (icmph_t), ira);
}
/*
- * Latch in the IPsec state for a stream based on the ipsec_in_t passed in as
- * part of the bind request.
+ * Latch in the IPsec state for a stream based the policy in the listener
+ * and the actions in the ip_recv_attr_t.
+ * Called directly from TCP and SCTP.
*/
-
boolean_t
-ip_bind_ipsec_policy_set(conn_t *connp, mblk_t *policy_mp)
+ip_ipsec_policy_inherit(conn_t *connp, conn_t *lconnp, ip_recv_attr_t *ira)
{
- ipsec_in_t *ii;
-
- ASSERT(policy_mp != NULL);
- ASSERT(policy_mp->b_datap->db_type == IPSEC_POLICY_SET);
+ ASSERT(lconnp->conn_policy != NULL);
+ ASSERT(connp->conn_policy == NULL);
- ii = (ipsec_in_t *)policy_mp->b_rptr;
- ASSERT(ii->ipsec_in_type == IPSEC_IN);
+ IPPH_REFHOLD(lconnp->conn_policy);
+ connp->conn_policy = lconnp->conn_policy;
- connp->conn_policy = ii->ipsec_in_policy;
- ii->ipsec_in_policy = NULL;
-
- if (ii->ipsec_in_action != NULL) {
+ if (ira->ira_ipsec_action != NULL) {
if (connp->conn_latch == NULL) {
connp->conn_latch = iplatch_create();
if (connp->conn_latch == NULL)
return (B_FALSE);
}
- ipsec_latch_inbound(connp->conn_latch, ii);
+ ipsec_latch_inbound(connp, ira);
}
return (B_TRUE);
}
/*
- * Upper level protocols (ULP) pass through bind requests to IP for inspection
- * and to arrange for power-fanout assist. The ULP is identified by
- * adding a single byte at the end of the original bind message.
- * A ULP other than UDP or TCP that wishes to be recognized passes
- * down a bind with a zero length address.
- *
- * The binding works as follows:
- * - A zero byte address means just bind to the protocol.
- * - A four byte address is treated as a request to validate
- * that the address is a valid local address, appropriate for
- * an application to bind to. This does not affect any fanout
- * information in IP.
- * - A sizeof sin_t byte address is used to bind to only the local address
- * and port.
- * - A sizeof ipa_conn_t byte address contains complete fanout information
- * consisting of local and remote addresses and ports. In
- * this case, the addresses are both validated as appropriate
- * for this operation, and, if so, the information is retained
- * for use in the inbound fanout.
+ * Verify whether or not the IP address is a valid local address.
+ * Could be a unicast, including one for a down interface.
+ * If allow_mcbc then a multicast or broadcast address is also
+ * acceptable.
*
- * The ULP (except in the zero-length bind) can append an
- * additional mblk of db_type IRE_DB_REQ_TYPE or IPSEC_POLICY_SET to the
- * T_BIND_REQ/O_T_BIND_REQ. IRE_DB_REQ_TYPE indicates that the ULP wants
- * a copy of the source or destination IRE (source for local bind;
- * destination for complete bind). IPSEC_POLICY_SET indicates that the
- * policy information contained should be copied on to the conn.
- *
- * NOTE : Only one of IRE_DB_REQ_TYPE or IPSEC_POLICY_SET can be present.
- */
-mblk_t *
-ip_bind_v4(queue_t *q, mblk_t *mp, conn_t *connp)
-{
- ssize_t len;
- struct T_bind_req *tbr;
- sin_t *sin;
- ipa_conn_t *ac;
- uchar_t *ucp;
- int error = 0;
- int protocol;
- ipa_conn_x_t *acx;
- cred_t *cr;
-
- /*
- * All Solaris components should pass a db_credp
- * for this TPI message, hence we ASSERT.
- * But in case there is some other M_PROTO that looks
- * like a TPI message sent by some other kernel
- * component, we check and return an error.
- */
- cr = msg_getcred(mp, NULL);
- ASSERT(cr != NULL);
- if (cr == NULL) {
- error = EINVAL;
- goto bad_addr;
- }
-
- ASSERT(!connp->conn_af_isv6);
- connp->conn_pkt_isv6 = B_FALSE;
-
- len = MBLKL(mp);
- if (len < (sizeof (*tbr) + 1)) {
- (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
- "ip_bind: bogus msg, len %ld", len);
- /* XXX: Need to return something better */
- goto bad_addr;
- }
- /* Back up and extract the protocol identifier. */
- mp->b_wptr--;
- protocol = *mp->b_wptr & 0xFF;
- tbr = (struct T_bind_req *)mp->b_rptr;
- /* Reset the message type in preparation for shipping it back. */
- DB_TYPE(mp) = M_PCPROTO;
-
- connp->conn_ulp = (uint8_t)protocol;
-
- /*
- * Check for a zero length address. This is from a protocol that
- * wants to register to receive all packets of its type.
- */
- if (tbr->ADDR_length == 0) {
- /*
- * These protocols are now intercepted in ip_bind_v6().
- * Reject protocol-level binds here for now.
- *
- * For SCTP raw socket, ICMP sends down a bind with sin_t
- * so that the protocol type cannot be SCTP.
- */
- if (protocol == IPPROTO_TCP || protocol == IPPROTO_AH ||
- protocol == IPPROTO_ESP || protocol == IPPROTO_SCTP) {
- goto bad_addr;
- }
-
- /*
- *
- * The udp module never sends down a zero-length address,
- * and allowing this on a labeled system will break MLP
- * functionality.
- */
- if (is_system_labeled() && protocol == IPPROTO_UDP)
- goto bad_addr;
-
- if (connp->conn_mac_mode != CONN_MAC_DEFAULT)
- goto bad_addr;
-
- /* No hash here really. The table is big enough. */
- connp->conn_srcv6 = ipv6_all_zeros;
-
- ipcl_proto_insert(connp, protocol);
-
- tbr->PRIM_type = T_BIND_ACK;
- return (mp);
- }
-
- /* Extract the address pointer from the message. */
- ucp = (uchar_t *)mi_offset_param(mp, tbr->ADDR_offset,
- tbr->ADDR_length);
- if (ucp == NULL) {
- ip1dbg(("ip_bind: no address\n"));
- goto bad_addr;
- }
- if (!OK_32PTR(ucp)) {
- ip1dbg(("ip_bind: unaligned address\n"));
- goto bad_addr;
- }
-
- switch (tbr->ADDR_length) {
- default:
- ip1dbg(("ip_bind: bad address length %d\n",
- (int)tbr->ADDR_length));
- goto bad_addr;
-
- case IP_ADDR_LEN:
- /* Verification of local address only */
- error = ip_bind_laddr_v4(connp, &mp->b_cont, protocol,
- *(ipaddr_t *)ucp, 0, B_FALSE);
- break;
-
- case sizeof (sin_t):
- sin = (sin_t *)ucp;
- error = ip_bind_laddr_v4(connp, &mp->b_cont, protocol,
- sin->sin_addr.s_addr, sin->sin_port, B_TRUE);
- break;
-
- case sizeof (ipa_conn_t):
- ac = (ipa_conn_t *)ucp;
- /* For raw socket, the local port is not set. */
- if (ac->ac_lport == 0)
- ac->ac_lport = connp->conn_lport;
- /* Always verify destination reachability. */
- error = ip_bind_connected_v4(connp, &mp->b_cont, protocol,
- &ac->ac_laddr, ac->ac_lport, ac->ac_faddr, ac->ac_fport,
- B_TRUE, B_TRUE, cr);
- break;
-
- case sizeof (ipa_conn_x_t):
- acx = (ipa_conn_x_t *)ucp;
- /*
- * Whether or not to verify destination reachability depends
- * on the setting of the ACX_VERIFY_DST flag in acx->acx_flags.
- */
- error = ip_bind_connected_v4(connp, &mp->b_cont, protocol,
- &acx->acx_conn.ac_laddr, acx->acx_conn.ac_lport,
- acx->acx_conn.ac_faddr, acx->acx_conn.ac_fport,
- B_TRUE, (acx->acx_flags & ACX_VERIFY_DST) != 0, cr);
- break;
- }
- ASSERT(error != EINPROGRESS);
- if (error != 0)
- goto bad_addr;
-
- /* Send it home. */
- mp->b_datap->db_type = M_PCPROTO;
- tbr->PRIM_type = T_BIND_ACK;
- return (mp);
-
-bad_addr:
- /*
- * If error = -1 then we generate a TBADADDR - otherwise error is
- * a unix errno.
- */
- if (error > 0)
- mp = mi_tpi_err_ack_alloc(mp, TSYSERR, error);
- else
- mp = mi_tpi_err_ack_alloc(mp, TBADADDR, 0);
- return (mp);
-}
-
-/*
- * Here address is verified to be a valid local address.
- * If the IRE_DB_REQ_TYPE mp is present, a broadcast/multicast
- * address is also considered a valid local address.
* In the case of a broadcast/multicast address, however, the
* upper protocol is expected to reset the src address
- * to 0 if it sees a IRE_BROADCAST type returned so that
+ * to zero when we return IPVL_MCAST/IPVL_BCAST so that
* no packets are emitted with broadcast/multicast address as
* source address (that violates hosts requirements RFC 1122)
* The addresses valid for bind are:
@@ -4530,323 +3396,189 @@ bad_addr:
* application still has to issue an
* IP_ADD_MEMBERSHIP socket option.
*
- * On error, return -1 for TBADADDR otherwise pass the
- * errno with TSYSERR reply.
- *
* In all the above cases, the bound address must be valid in the current zone.
* When the address is loopback, multicast or broadcast, there might be many
* matching IREs so bind has to look up based on the zone.
- *
- * Note: lport is in network byte order.
- *
*/
-int
-ip_bind_laddr_v4(conn_t *connp, mblk_t **mpp, uint8_t protocol,
- ipaddr_t src_addr, uint16_t lport, boolean_t fanout_insert)
+ip_laddr_t
+ip_laddr_verify_v4(ipaddr_t src_addr, zoneid_t zoneid,
+ ip_stack_t *ipst, boolean_t allow_mcbc)
{
- int error = 0;
- ire_t *src_ire;
- zoneid_t zoneid;
- ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
- mblk_t *mp = NULL;
- boolean_t ire_requested = B_FALSE;
- boolean_t ipsec_policy_set = B_FALSE;
+ ire_t *src_ire;
- if (mpp)
- mp = *mpp;
+ ASSERT(src_addr != INADDR_ANY);
- if (mp != NULL) {
- ire_requested = (DB_TYPE(mp) == IRE_DB_REQ_TYPE);
- ipsec_policy_set = (DB_TYPE(mp) == IPSEC_POLICY_SET);
- }
+ src_ire = ire_ftable_lookup_v4(src_addr, 0, 0, 0,
+ NULL, zoneid, NULL, MATCH_IRE_ZONEONLY, 0, ipst, NULL);
/*
- * If it was previously connected, conn_fully_bound would have
- * been set.
+ * If an address other than in6addr_any is requested,
+ * we verify that it is a valid address for bind
+ * Note: Following code is in if-else-if form for
+ * readability compared to a condition check.
*/
- connp->conn_fully_bound = B_FALSE;
-
- src_ire = NULL;
+ if (src_ire != NULL && (src_ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK))) {
+ /*
+ * (2) Bind to address of local UP interface
+ */
+ ire_refrele(src_ire);
+ return (IPVL_UNICAST_UP);
+ } else if (src_ire != NULL && src_ire->ire_type & IRE_BROADCAST) {
+ /*
+ * (4) Bind to broadcast address
+ */
+ ire_refrele(src_ire);
+ if (allow_mcbc)
+ return (IPVL_BCAST);
+ else
+ return (IPVL_BAD);
+ } else if (CLASSD(src_addr)) {
+ /* (5) bind to multicast address. */
+ if (src_ire != NULL)
+ ire_refrele(src_ire);
- zoneid = IPCL_ZONEID(connp);
+ if (allow_mcbc)
+ return (IPVL_MCAST);
+ else
+ return (IPVL_BAD);
+ } else {
+ ipif_t *ipif;
- if (src_addr) {
- src_ire = ire_route_lookup(src_addr, 0, 0, 0,
- NULL, NULL, zoneid, NULL, MATCH_IRE_ZONEONLY, ipst);
/*
- * If an address other than 0.0.0.0 is requested,
- * we verify that it is a valid address for bind
- * Note: Following code is in if-else-if form for
- * readability compared to a condition check.
+ * (3) Bind to address of local DOWN interface?
+ * (ipif_lookup_addr() looks up all interfaces
+ * but we do not get here for UP interfaces
+ * - case (2) above)
*/
- /* LINTED - statement has no consequence */
- if (IRE_IS_LOCAL(src_ire)) {
- /*
- * (2) Bind to address of local UP interface
- */
- } else if (src_ire && src_ire->ire_type == IRE_BROADCAST) {
- /*
- * (4) Bind to broadcast address
- * Note: permitted only from transports that
- * request IRE
- */
- if (!ire_requested)
- error = EADDRNOTAVAIL;
- } else {
- /*
- * (3) Bind to address of local DOWN interface
- * (ipif_lookup_addr() looks up all interfaces
- * but we do not get here for UP interfaces
- * - case (2) above)
- */
- /* LINTED - statement has no consequent */
- if (ip_addr_exists(src_addr, zoneid, ipst)) {
- /* The address exists */
- } else if (CLASSD(src_addr)) {
- error = 0;
- if (src_ire != NULL)
- ire_refrele(src_ire);
- /*
- * (5) bind to multicast address.
- * Fake out the IRE returned to upper
- * layer to be a broadcast IRE.
- */
- src_ire = ire_ctable_lookup(
- INADDR_BROADCAST, INADDR_ANY,
- IRE_BROADCAST, NULL, zoneid, NULL,
- (MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY),
- ipst);
- if (src_ire == NULL || !ire_requested)
- error = EADDRNOTAVAIL;
- } else {
- /*
- * Not a valid address for bind
- */
- error = EADDRNOTAVAIL;
- }
- }
- if (error) {
- /* Red Alert! Attempting to be a bogon! */
- ip1dbg(("ip_bind_laddr_v4: bad src address 0x%x\n",
- ntohl(src_addr)));
- goto bad_addr;
+ if (src_ire != NULL)
+ ire_refrele(src_ire);
+
+ ipif = ipif_lookup_addr(src_addr, NULL, zoneid, ipst);
+ if (ipif == NULL)
+ return (IPVL_BAD);
+
+ /* Not a useful source? */
+ if (ipif->ipif_flags & (IPIF_NOLOCAL | IPIF_ANYCAST)) {
+ ipif_refrele(ipif);
+ return (IPVL_BAD);
}
+ ipif_refrele(ipif);
+ return (IPVL_UNICAST_DOWN);
}
+}
+
+/*
+ * Insert in the bind fanout for IPv4 and IPv6.
+ * The caller should already have used ip_laddr_verify_v*() before calling
+ * this.
+ */
+int
+ip_laddr_fanout_insert(conn_t *connp)
+{
+ int error;
/*
- * Allow setting new policies. For example, disconnects come
- * down as ipa_t bind. As we would have set conn_policy_cached
+ * Allow setting new policies. For example, disconnects result
+ * in us being called. As we would have set conn_policy_cached
* to B_TRUE before, we should set it to B_FALSE, so that policy
* can change after the disconnect.
*/
connp->conn_policy_cached = B_FALSE;
- /*
- * If not fanout_insert this was just an address verification
- */
- if (fanout_insert) {
- /*
- * The addresses have been verified. Time to insert in
- * the correct fanout list.
- */
- IN6_IPADDR_TO_V4MAPPED(src_addr, &connp->conn_srcv6);
- IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &connp->conn_remv6);
- connp->conn_lport = lport;
- connp->conn_fport = 0;
- /*
- * Do we need to add a check to reject Multicast packets
- */
- error = ipcl_bind_insert(connp, protocol, src_addr, lport);
- }
-
- if (error == 0) {
- if (ire_requested) {
- if (!ip_bind_get_ire_v4(mpp, src_ire, NULL, ipst)) {
- error = -1;
- /* Falls through to bad_addr */
- }
- } else if (ipsec_policy_set) {
- if (!ip_bind_ipsec_policy_set(connp, mp)) {
- error = -1;
- /* Falls through to bad_addr */
- }
- }
- }
-bad_addr:
+ error = ipcl_bind_insert(connp);
if (error != 0) {
if (connp->conn_anon_port) {
(void) tsol_mlp_anon(crgetzone(connp->conn_cred),
- connp->conn_mlp_type, connp->conn_ulp, ntohs(lport),
- B_FALSE);
+ connp->conn_mlp_type, connp->conn_proto,
+ ntohs(connp->conn_lport), B_FALSE);
}
connp->conn_mlp_type = mlptSingle;
}
- if (src_ire != NULL)
- IRE_REFRELE(src_ire);
- return (error);
-}
-
-int
-ip_proto_bind_laddr_v4(conn_t *connp, mblk_t **ire_mpp, uint8_t protocol,
- ipaddr_t src_addr, uint16_t lport, boolean_t fanout_insert)
-{
- int error;
-
- ASSERT(!connp->conn_af_isv6);
- connp->conn_pkt_isv6 = B_FALSE;
- connp->conn_ulp = protocol;
-
- error = ip_bind_laddr_v4(connp, ire_mpp, protocol, src_addr, lport,
- fanout_insert);
- if (error < 0)
- error = -TBADADDR;
return (error);
}
/*
- * Verify that both the source and destination addresses
- * are valid. If verify_dst is false, then the destination address may be
- * unreachable, i.e. have no route to it. Protocols like TCP want to verify
- * destination reachability, while tunnels do not.
- * Note that we allow connect to broadcast and multicast
- * addresses when ire_requested is set. Thus the ULP
- * has to check for IRE_BROADCAST and multicast.
+ * Verify that both the source and destination addresses are valid. If
+ * IPDF_VERIFY_DST is not set, then the destination address may be unreachable,
+ * i.e. have no route to it. Protocols like TCP want to verify destination
+ * reachability, while tunnels do not.
*
- * Returns zero if ok.
- * On error: returns -1 to mean TBADADDR otherwise returns an errno
- * (for use with TSYSERR reply).
+ * Determine the route, the interface, and (optionally) the source address
+ * to use to reach a given destination.
+ * Note that we allow connect to broadcast and multicast addresses when
+ * IPDF_ALLOW_MCBC is set.
+ * first_hop and dst_addr are normally the same, but if source routing
+ * they will differ; in that case the first_hop is what we'll use for the
+ * routing lookup but the dce and label checks will be done on dst_addr,
*
- * Note: lport and fport are in network byte order.
+ * If uinfo is set, then we fill in the best available information
+ * we have for the destination. This is based on (in priority order) any
+ * metrics and path MTU stored in a dce_t, route metrics, and finally the
+ * ill_mtu.
+ *
+ * Tsol note: If we have a source route then dst_addr != firsthop. But we
+ * always do the label check on dst_addr.
*/
int
-ip_bind_connected_v4(conn_t *connp, mblk_t **mpp, uint8_t protocol,
- ipaddr_t *src_addrp, uint16_t lport, ipaddr_t dst_addr, uint16_t fport,
- boolean_t fanout_insert, boolean_t verify_dst, cred_t *cr)
+ip_set_destination_v4(ipaddr_t *src_addrp, ipaddr_t dst_addr, ipaddr_t firsthop,
+ ip_xmit_attr_t *ixa, iulp_t *uinfo, uint32_t flags, uint_t mac_mode)
{
-
- ire_t *src_ire;
- ire_t *dst_ire;
+ ire_t *ire = NULL;
int error = 0;
- ire_t *sire = NULL;
- ire_t *md_dst_ire = NULL;
- ire_t *lso_dst_ire = NULL;
+ ipaddr_t setsrc; /* RTF_SETSRC */
+ zoneid_t zoneid = ixa->ixa_zoneid; /* Honors SO_ALLZONES */
+ ip_stack_t *ipst = ixa->ixa_ipst;
+ dce_t *dce;
+ uint_t pmtu;
+ uint_t generation;
+ nce_t *nce;
ill_t *ill = NULL;
- zoneid_t zoneid;
- ipaddr_t src_addr = *src_addrp;
- ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
- mblk_t *mp = NULL;
- boolean_t ire_requested = B_FALSE;
- boolean_t ipsec_policy_set = B_FALSE;
- ts_label_t *tsl = NULL;
- cred_t *effective_cred = NULL;
-
- if (mpp)
- mp = *mpp;
-
- if (mp != NULL) {
- ire_requested = (DB_TYPE(mp) == IRE_DB_REQ_TYPE);
- ipsec_policy_set = (DB_TYPE(mp) == IPSEC_POLICY_SET);
- }
+ boolean_t multirt = B_FALSE;
- src_ire = dst_ire = NULL;
+ ASSERT(ixa->ixa_flags & IXAF_IS_IPV4);
/*
- * If we never got a disconnect before, clear it now.
+ * We never send to zero; the ULPs map it to the loopback address.
+ * We can't allow it since we use zero to mean uninitialized in some
+ * places.
*/
- connp->conn_fully_bound = B_FALSE;
+ ASSERT(dst_addr != INADDR_ANY);
- zoneid = IPCL_ZONEID(connp);
-
- /*
- * Check whether Trusted Solaris policy allows communication with this
- * host, and pretend that the destination is unreachable if not.
- *
- * This is never a problem for TCP, since that transport is known to
- * compute the label properly as part of the tcp_rput_other T_BIND_ACK
- * handling. If the remote is unreachable, it will be detected at that
- * point, so there's no reason to check it here.
- *
- * Note that for sendto (and other datagram-oriented friends), this
- * check is done as part of the data path label computation instead.
- * The check here is just to make non-TCP connect() report the right
- * error.
- */
- if (is_system_labeled() && !IPCL_IS_TCP(connp)) {
- if ((error = tsol_check_dest(cr, &dst_addr, IPV4_VERSION,
- connp->conn_mac_mode, &effective_cred)) != 0) {
- if (ip_debug > 2) {
- pr_addr_dbg(
- "ip_bind_connected_v4:"
- " no label for dst %s\n",
- AF_INET, &dst_addr);
- }
- goto bad_addr;
- }
+ if (is_system_labeled()) {
+ ts_label_t *tsl = NULL;
- /*
- * tsol_check_dest() may have created a new cred with
- * a modified security label. Use that cred if it exists
- * for ire lookups.
- */
- if (effective_cred == NULL) {
- tsl = crgetlabel(cr);
- } else {
- tsl = crgetlabel(effective_cred);
+ error = tsol_check_dest(ixa->ixa_tsl, &dst_addr, IPV4_VERSION,
+ mac_mode, (flags & IPDF_ZONE_IS_GLOBAL) != 0, &tsl);
+ if (error != 0)
+ return (error);
+ if (tsl != NULL) {
+ /* Update the label */
+ ip_xmit_attr_replace_tsl(ixa, tsl);
}
}
- if (CLASSD(dst_addr)) {
- /* Pick up an IRE_BROADCAST */
- dst_ire = ire_route_lookup(ip_g_all_ones, 0, 0, 0, NULL,
- NULL, zoneid, tsl,
- (MATCH_IRE_RECURSIVE |
- MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE |
- MATCH_IRE_SECATTR), ipst);
- } else {
- /*
- * If conn_dontroute is set or if conn_nexthop_set is set,
- * and onlink ipif is not found set ENETUNREACH error.
- */
- if (connp->conn_dontroute || connp->conn_nexthop_set) {
- ipif_t *ipif;
-
- ipif = ipif_lookup_onlink_addr(connp->conn_dontroute ?
- dst_addr : connp->conn_nexthop_v4, zoneid, ipst);
- if (ipif == NULL) {
- error = ENETUNREACH;
- goto bad_addr;
- }
- ipif_refrele(ipif);
- }
+ setsrc = INADDR_ANY;
+ /*
+ * Select a route; For IPMP interfaces, we would only select
+ * a "hidden" route (i.e., going through a specific under_ill)
+ * if ixa_ifindex has been specified.
+ */
+ ire = ip_select_route_v4(firsthop, ixa, &generation, &setsrc, &error,
+ &multirt);
+ ASSERT(ire != NULL); /* IRE_NOROUTE if none found */
+ if (error != 0)
+ goto bad_addr;
- if (connp->conn_nexthop_set) {
- dst_ire = ire_route_lookup(connp->conn_nexthop_v4, 0,
- 0, 0, NULL, NULL, zoneid, tsl,
- MATCH_IRE_SECATTR, ipst);
- } else {
- dst_ire = ire_route_lookup(dst_addr, 0, 0, 0, NULL,
- &sire, zoneid, tsl,
- (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT |
- MATCH_IRE_PARENT | MATCH_IRE_RJ_BHOLE |
- MATCH_IRE_SECATTR), ipst);
- }
- }
/*
- * dst_ire can't be a broadcast when not ire_requested.
- * We also prevent ire's with src address INADDR_ANY to
- * be used, which are created temporarily for
- * sending out packets from endpoints that have
- * conn_unspec_src set. If verify_dst is true, the destination must be
- * reachable. If verify_dst is false, the destination needn't be
- * reachable.
+ * ire can't be a broadcast or multicast unless IPDF_ALLOW_MCBC is set.
+ * If IPDF_VERIFY_DST is set, the destination must be reachable;
+ * Otherwise the destination needn't be reachable.
*
* If we match on a reject or black hole, then we've got a
* local failure. May as well fail out the connect() attempt,
* since it's never going to succeed.
*/
- if (dst_ire == NULL || dst_ire->ire_src_addr == INADDR_ANY ||
- (dst_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
- ((dst_ire->ire_type & IRE_BROADCAST) && !ire_requested)) {
+ if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
/*
* If we're verifying destination reachability, we always want
* to complain here.
@@ -4854,425 +3586,435 @@ ip_bind_connected_v4(conn_t *connp, mblk_t **mpp, uint8_t protocol,
* If we're not verifying destination reachability but the
* destination has a route, we still want to fail on the
* temporary address and broadcast address tests.
+ *
+ * In both cases we let the code continue so some reasonable
+ * information is returned to the caller. That enables the
+ * caller to use (and even cache) the IRE. conn_ip_output will
+ * use the generation mismatch path to check for the unreachable
+ * case thereby avoiding any specific check in the main path.
*/
- if (verify_dst || (dst_ire != NULL)) {
- if (ip_debug > 2) {
- pr_addr_dbg("ip_bind_connected_v4:"
- "bad connected dst %s\n",
- AF_INET, &dst_addr);
- }
- if (dst_ire == NULL || !(dst_ire->ire_type & IRE_HOST))
+ ASSERT(generation == IRE_GENERATION_VERIFY);
+ if (flags & IPDF_VERIFY_DST) {
+ /*
+ * Set errno but continue to set up ixa_ire to be
+ * the RTF_REJECT|RTF_BLACKHOLE IRE.
+ * That allows callers to use ip_output to get an
+ * ICMP error back.
+ */
+ if (!(ire->ire_type & IRE_HOST))
error = ENETUNREACH;
else
error = EHOSTUNREACH;
- goto bad_addr;
+ }
+ }
+
+ if ((ire->ire_type & (IRE_BROADCAST|IRE_MULTICAST)) &&
+ !(flags & IPDF_ALLOW_MCBC)) {
+ ire_refrele(ire);
+ ire = ire_reject(ipst, B_FALSE);
+ generation = IRE_GENERATION_VERIFY;
+ error = ENETUNREACH;
+ }
+
+ /* Cache things */
+ if (ixa->ixa_ire != NULL)
+ ire_refrele_notr(ixa->ixa_ire);
+#ifdef DEBUG
+ ire_refhold_notr(ire);
+ ire_refrele(ire);
+#endif
+ ixa->ixa_ire = ire;
+ ixa->ixa_ire_generation = generation;
+
+ /*
+ * For multicast with multirt we have a flag passed back from
+ * ire_lookup_multi_ill_v4 since we don't have an IRE for each
+ * possible multicast address.
+ * We also need a flag for multicast since we can't check
+ * whether RTF_MULTIRT is set in ixa_ire for multicast.
+ */
+ if (multirt) {
+ ixa->ixa_postfragfn = ip_postfrag_multirt_v4;
+ ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST;
+ } else {
+ ixa->ixa_postfragfn = ire->ire_postfragfn;
+ ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST;
+ }
+ if (!(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
+ /* Get an nce to cache. */
+ nce = ire_to_nce(ire, firsthop, NULL);
+ if (nce == NULL) {
+ /* Allocation failure? */
+ ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
+ } else {
+ if (ixa->ixa_nce != NULL)
+ nce_refrele(ixa->ixa_nce);
+ ixa->ixa_nce = nce;
}
}
/*
- * If the app does a connect(), it means that it will most likely
- * send more than 1 packet to the destination. It makes sense
- * to clear the temporary flag.
+ * We use ire_nexthop_ill to avoid the under ipmp
+ * interface for source address selection. Note that for ipmp
+ * probe packets, ixa_ifindex would have been specified, and
+ * the ip_select_route() invocation would have picked an ire
+ * with ire_ill pointing at an under interface.
*/
- if (dst_ire != NULL && dst_ire->ire_type == IRE_CACHE &&
- (dst_ire->ire_marks & IRE_MARK_TEMPORARY)) {
- irb_t *irb = dst_ire->ire_bucket;
+ ill = ire_nexthop_ill(ire);
- rw_enter(&irb->irb_lock, RW_WRITER);
+ /*
+ * If the source address is a loopback address, the
+ * destination had best be local or multicast.
+ * If we are sending to an IRE_LOCAL using a loopback source then
+ * it had better be the same zoneid.
+ */
+ if (*src_addrp == htonl(INADDR_LOOPBACK)) {
+ if ((ire->ire_type & IRE_LOCAL) && ire->ire_zoneid != zoneid) {
+ ire = NULL; /* Stored in ixa_ire */
+ error = EADDRNOTAVAIL;
+ goto bad_addr;
+ }
+ if (!(ire->ire_type & (IRE_LOOPBACK|IRE_LOCAL|IRE_MULTICAST))) {
+ ire = NULL; /* Stored in ixa_ire */
+ error = EADDRNOTAVAIL;
+ goto bad_addr;
+ }
+ }
+ if (ire->ire_type & IRE_BROADCAST) {
/*
- * We need to recheck for IRE_MARK_TEMPORARY after acquiring
- * the lock to guarantee irb_tmp_ire_cnt.
+ * If the ULP didn't have a specified source, then we
+ * make sure we reselect the source when sending
+ * broadcasts out different interfaces.
*/
- if (dst_ire->ire_marks & IRE_MARK_TEMPORARY) {
- dst_ire->ire_marks &= ~IRE_MARK_TEMPORARY;
- irb->irb_tmp_ire_cnt--;
- }
- rw_exit(&irb->irb_lock);
+ if (flags & IPDF_SELECT_SRC)
+ ixa->ixa_flags |= IXAF_SET_SOURCE;
+ else
+ ixa->ixa_flags &= ~IXAF_SET_SOURCE;
}
/*
- * See if we should notify ULP about LSO/MDT; we do this whether or not
- * ire_requested is TRUE, in order to handle active connects; LSO/MDT
- * eligibility tests for passive connects are handled separately
- * through tcp_adapt_ire(). We do this before the source address
- * selection, because dst_ire may change after a call to
- * ipif_select_source(). This is a best-effort check, as the
- * packet for this connection may not actually go through
- * dst_ire->ire_stq, and the exact IRE can only be known after
- * calling ip_newroute(). This is why we further check on the
- * IRE during LSO/Multidata packet transmission in
- * tcp_lsosend()/tcp_multisend().
+ * Does the caller want us to pick a source address?
*/
- if (!ipsec_policy_set && dst_ire != NULL &&
- !(dst_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK | IRE_BROADCAST)) &&
- (ill = ire_to_ill(dst_ire), ill != NULL)) {
- if (ipst->ips_ip_lso_outbound && ILL_LSO_CAPABLE(ill)) {
- lso_dst_ire = dst_ire;
- IRE_REFHOLD(lso_dst_ire);
- } else if (ipst->ips_ip_multidata_outbound &&
- ILL_MDT_CAPABLE(ill)) {
- md_dst_ire = dst_ire;
- IRE_REFHOLD(md_dst_ire);
+ if (flags & IPDF_SELECT_SRC) {
+ ipaddr_t src_addr;
+
+ /* If unreachable we have no ill but need some source */
+ if (ill == NULL) {
+ src_addr = htonl(INADDR_LOOPBACK);
+ /* Make sure we look for a better source address */
+ generation = SRC_GENERATION_VERIFY;
+ } else {
+ error = ip_select_source_v4(ill, setsrc, dst_addr,
+ ixa->ixa_multicast_ifaddr, zoneid,
+ ipst, &src_addr, &generation, NULL);
+ if (error != 0) {
+ ire = NULL; /* Stored in ixa_ire */
+ goto bad_addr;
+ }
}
- }
- if (dst_ire != NULL && dst_ire->ire_type == IRE_LOCAL &&
- dst_ire->ire_zoneid != zoneid && dst_ire->ire_zoneid != ALL_ZONES) {
/*
- * If the IRE belongs to a different zone, look for a matching
- * route in the forwarding table and use the source address from
- * that route.
+ * We allow the source address to be down.
+ * However, we check that we don't use the loopback address
+ * as a source when sending out on the wire.
*/
- src_ire = ire_ftable_lookup(dst_addr, 0, 0, 0, NULL, NULL,
- zoneid, 0, NULL,
- MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT |
- MATCH_IRE_RJ_BHOLE, ipst);
- if (src_ire == NULL) {
- error = EHOSTUNREACH;
- goto bad_addr;
- } else if (src_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
- if (!(src_ire->ire_type & IRE_HOST))
- error = ENETUNREACH;
- else
- error = EHOSTUNREACH;
+ if ((src_addr == htonl(INADDR_LOOPBACK)) &&
+ !(ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK|IRE_MULTICAST)) &&
+ !(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
+ ire = NULL; /* Stored in ixa_ire */
+ error = EADDRNOTAVAIL;
goto bad_addr;
}
- if (src_addr == INADDR_ANY)
- src_addr = src_ire->ire_src_addr;
- ire_refrele(src_ire);
- src_ire = NULL;
- } else if ((src_addr == INADDR_ANY) && (dst_ire != NULL)) {
- if ((sire != NULL) && (sire->ire_flags & RTF_SETSRC)) {
- src_addr = sire->ire_src_addr;
- ire_refrele(dst_ire);
- dst_ire = sire;
- sire = NULL;
- } else {
- /*
- * Pick a source address so that a proper inbound
- * load spreading would happen.
- */
- ill_t *ire_ill = dst_ire->ire_ipif->ipif_ill;
- ipif_t *src_ipif = NULL;
- ire_t *ipif_ire;
- /*
- * Supply a local source address such that inbound
- * load spreading happens.
- *
- * Determine the best source address on this ill for
- * the destination.
- *
- * 1) For broadcast, we should return a broadcast ire
- * found above so that upper layers know that the
- * destination address is a broadcast address.
- *
- * 2) If the ipif is DEPRECATED, select a better
- * source address. Similarly, if the ipif is on
- * the IPMP meta-interface, pick a source address
- * at random to improve inbound load spreading.
- *
- * 3) If the outgoing interface is part of a usesrc
- * group, then try selecting a source address from
- * the usesrc ILL.
- */
- if ((dst_ire->ire_zoneid != zoneid &&
- dst_ire->ire_zoneid != ALL_ZONES) ||
- (!(dst_ire->ire_flags & RTF_SETSRC)) &&
- (!(dst_ire->ire_type & IRE_BROADCAST) &&
- (IS_IPMP(ire_ill) ||
- (dst_ire->ire_ipif->ipif_flags & IPIF_DEPRECATED) ||
- (ire_ill->ill_usesrc_ifindex != 0)))) {
- /*
- * If the destination is reachable via a
- * given gateway, the selected source address
- * should be in the same subnet as the gateway.
- * Otherwise, the destination is not reachable.
- *
- * If there are no interfaces on the same subnet
- * as the destination, ipif_select_source gives
- * first non-deprecated interface which might be
- * on a different subnet than the gateway.
- * This is not desirable. Hence pass the dst_ire
- * source address to ipif_select_source.
- * It is sure that the destination is reachable
- * with the dst_ire source address subnet.
- * So passing dst_ire source address to
- * ipif_select_source will make sure that the
- * selected source will be on the same subnet
- * as dst_ire source address.
- */
- ipaddr_t saddr =
- dst_ire->ire_ipif->ipif_src_addr;
- src_ipif = ipif_select_source(ire_ill,
- saddr, zoneid);
- if (src_ipif != NULL) {
- if (IS_VNI(src_ipif->ipif_ill)) {
- /*
- * For VNI there is no
- * interface route
- */
- src_addr =
- src_ipif->ipif_src_addr;
- } else {
- ipif_ire =
- ipif_to_ire(src_ipif);
- if (ipif_ire != NULL) {
- IRE_REFRELE(dst_ire);
- dst_ire = ipif_ire;
- }
- src_addr =
- dst_ire->ire_src_addr;
- }
- ipif_refrele(src_ipif);
- } else {
- src_addr = dst_ire->ire_src_addr;
- }
- } else {
- src_addr = dst_ire->ire_src_addr;
- }
- }
+ *src_addrp = src_addr;
+ ixa->ixa_src_generation = generation;
}
+ if (flags & IPDF_UNIQUE_DCE) {
+ /* Fallback to the default dce if allocation fails */
+ dce = dce_lookup_and_add_v4(dst_addr, ipst);
+ if (dce != NULL)
+ generation = dce->dce_generation;
+ else
+ dce = dce_lookup_v4(dst_addr, ipst, &generation);
+ } else {
+ dce = dce_lookup_v4(dst_addr, ipst, &generation);
+ }
+ ASSERT(dce != NULL);
+ if (ixa->ixa_dce != NULL)
+ dce_refrele_notr(ixa->ixa_dce);
+#ifdef DEBUG
+ dce_refhold_notr(dce);
+ dce_refrele(dce);
+#endif
+ ixa->ixa_dce = dce;
+ ixa->ixa_dce_generation = generation;
+
/*
- * We do ire_route_lookup() here (and not
- * interface lookup as we assert that
- * src_addr should only come from an
- * UP interface for hard binding.
+ * Make sure we don't leave an unreachable ixa_nce in place
+ * since ip_select_route is used when we unplumb i.e., remove
+ * references on ixa_ire, ixa_nce, and ixa_dce.
*/
- ASSERT(src_ire == NULL);
- src_ire = ire_route_lookup(src_addr, 0, 0, 0, NULL,
- NULL, zoneid, NULL, MATCH_IRE_ZONEONLY, ipst);
- /* src_ire must be a local|loopback */
- if (!IRE_IS_LOCAL(src_ire)) {
- if (ip_debug > 2) {
- pr_addr_dbg("ip_bind_connected_v4: bad connected "
- "src %s\n", AF_INET, &src_addr);
- }
- error = EADDRNOTAVAIL;
- goto bad_addr;
+ nce = ixa->ixa_nce;
+ if (nce != NULL && nce->nce_is_condemned) {
+ nce_refrele(nce);
+ ixa->ixa_nce = NULL;
+ ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
}
/*
- * If the source address is a loopback address, the
- * destination had best be local or multicast.
- * The transports that can't handle multicast will reject
- * those addresses.
+ * The caller has set IXAF_PMTU_DISCOVERY if path MTU discovery is desired.
+ * However, we can't do it for IPv4 multicast or broadcast.
*/
- if (src_ire->ire_type == IRE_LOOPBACK &&
- !(IRE_IS_LOCAL(dst_ire) || CLASSD(dst_addr))) {
- ip1dbg(("ip_bind_connected_v4: bad connected loopback\n"));
- error = -1;
- goto bad_addr;
- }
+ if (ire->ire_type & (IRE_BROADCAST|IRE_MULTICAST))
+ ixa->ixa_flags &= ~IXAF_PMTU_DISCOVERY;
/*
- * Allow setting new policies. For example, disconnects come
- * down as ipa_t bind. As we would have set conn_policy_cached
- * to B_TRUE before, we should set it to B_FALSE, so that policy
- * can change after the disconnect.
+ * Set initial value for fragmentation limit. Either conn_ip_output
+ * or ULP might update it when there are routing changes.
+ * Handles a NULL ixa_ire->ire_ill or a NULL ixa_nce for RTF_REJECT.
*/
- connp->conn_policy_cached = B_FALSE;
+ pmtu = ip_get_pmtu(ixa);
+ ixa->ixa_fragsize = pmtu;
+ /* Make sure ixa_fragsize and ixa_pmtu remain identical */
+ if (ixa->ixa_flags & IXAF_VERIFY_PMTU)
+ ixa->ixa_pmtu = pmtu;
/*
- * Set the conn addresses/ports immediately, so the IPsec policy calls
- * can handle their passed-in conn's.
+ * Extract information useful for some transports.
+ * First we look for DCE metrics. Then we take what we have in
+ * the metrics in the route, where the offlink is used if we have
+ * one.
*/
+ if (uinfo != NULL) {
+ bzero(uinfo, sizeof (*uinfo));
- IN6_IPADDR_TO_V4MAPPED(src_addr, &connp->conn_srcv6);
- IN6_IPADDR_TO_V4MAPPED(dst_addr, &connp->conn_remv6);
- connp->conn_lport = lport;
- connp->conn_fport = fport;
- *src_addrp = src_addr;
+ if (dce->dce_flags & DCEF_UINFO)
+ *uinfo = dce->dce_uinfo;
- ASSERT(!(ipsec_policy_set && ire_requested));
- if (ire_requested) {
- iulp_t *ulp_info = NULL;
+ rts_merge_metrics(uinfo, &ire->ire_metrics);
- /*
- * Note that sire will not be NULL if this is an off-link
- * connection and there is not cache for that dest yet.
- *
- * XXX Because of an existing bug, if there are multiple
- * default routes, the IRE returned now may not be the actual
- * default route used (default routes are chosen in a
- * round robin fashion). So if the metrics for different
- * default routes are different, we may return the wrong
- * metrics. This will not be a problem if the existing
- * bug is fixed.
- */
- if (sire != NULL) {
- ulp_info = &(sire->ire_uinfo);
- }
- if (!ip_bind_get_ire_v4(mpp, dst_ire, ulp_info, ipst)) {
- error = -1;
- goto bad_addr;
- }
- mp = *mpp;
- } else if (ipsec_policy_set) {
- if (!ip_bind_ipsec_policy_set(connp, mp)) {
- error = -1;
- goto bad_addr;
- }
+ /* Allow ire_metrics to decrease the path MTU from above */
+ if (uinfo->iulp_mtu == 0 || uinfo->iulp_mtu > pmtu)
+ uinfo->iulp_mtu = pmtu;
+
+ uinfo->iulp_localnet = (ire->ire_type & IRE_ONLINK) != 0;
+ uinfo->iulp_loopback = (ire->ire_type & IRE_LOOPBACK) != 0;
+ uinfo->iulp_local = (ire->ire_type & IRE_LOCAL) != 0;
}
- /*
- * Cache IPsec policy in this conn. If we have per-socket policy,
- * we'll cache that. If we don't, we'll inherit global policy.
- *
- * We can't insert until the conn reflects the policy. Note that
- * conn_policy_cached is set by ipsec_conn_cache_policy() even for
- * connections where we don't have a policy. This is to prevent
- * global policy lookups in the inbound path.
- *
- * If we insert before we set conn_policy_cached,
- * CONN_INBOUND_POLICY_PRESENT() check can still evaluate true
- * because global policy cound be non-empty. We normally call
- * ipsec_check_policy() for conn_policy_cached connections only if
- * ipc_in_enforce_policy is set. But in this case,
- * conn_policy_cached can get set anytime since we made the
- * CONN_INBOUND_POLICY_PRESENT() check and ipsec_check_policy() is
- * called, which will make the above assumption false. Thus, we
- * need to insert after we set conn_policy_cached.
- */
- if ((error = ipsec_conn_cache_policy(connp, B_TRUE)) != 0)
- goto bad_addr;
+ if (ill != NULL)
+ ill_refrele(ill);
- if (fanout_insert) {
- /*
- * The addresses have been verified. Time to insert in
- * the correct fanout list.
- */
- error = ipcl_conn_insert(connp, protocol, src_addr,
- dst_addr, connp->conn_ports);
- }
+ return (error);
- if (error == 0) {
- connp->conn_fully_bound = B_TRUE;
- /*
- * Our initial checks for LSO/MDT have passed; the IRE is not
- * LOCAL/LOOPBACK/BROADCAST, and the link layer seems to
- * be supporting LSO/MDT. Pass the IRE, IPC and ILL into
- * ip_xxinfo_return(), which performs further checks
- * against them and upon success, returns the LSO/MDT info
- * mblk which we will attach to the bind acknowledgment.
- */
- if (lso_dst_ire != NULL) {
- mblk_t *lsoinfo_mp;
-
- ASSERT(ill->ill_lso_capab != NULL);
- if ((lsoinfo_mp = ip_lsoinfo_return(lso_dst_ire, connp,
- ill->ill_name, ill->ill_lso_capab)) != NULL) {
- if (mp == NULL) {
- *mpp = lsoinfo_mp;
- } else {
- linkb(mp, lsoinfo_mp);
- }
- }
- } else if (md_dst_ire != NULL) {
- mblk_t *mdinfo_mp;
-
- ASSERT(ill->ill_mdt_capab != NULL);
- if ((mdinfo_mp = ip_mdinfo_return(md_dst_ire, connp,
- ill->ill_name, ill->ill_mdt_capab)) != NULL) {
- if (mp == NULL) {
- *mpp = mdinfo_mp;
- } else {
- linkb(mp, mdinfo_mp);
- }
- }
- }
- }
bad_addr:
- if (ipsec_policy_set) {
- ASSERT(mp != NULL);
- freeb(mp);
- /*
- * As of now assume that nothing else accompanies
- * IPSEC_POLICY_SET.
- */
- *mpp = NULL;
+ if (ire != NULL)
+ ire_refrele(ire);
+
+ if (ill != NULL)
+ ill_refrele(ill);
+
+ /*
+ * Make sure we don't leave an unreachable ixa_nce in place
+ * since ip_select_route is used when we unplumb i.e., remove
+ * references on ixa_ire, ixa_nce, and ixa_dce.
+ */
+ nce = ixa->ixa_nce;
+ if (nce != NULL && nce->nce_is_condemned) {
+ nce_refrele(nce);
+ ixa->ixa_nce = NULL;
+ ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
}
- if (src_ire != NULL)
- IRE_REFRELE(src_ire);
- if (dst_ire != NULL)
- IRE_REFRELE(dst_ire);
- if (sire != NULL)
- IRE_REFRELE(sire);
- if (md_dst_ire != NULL)
- IRE_REFRELE(md_dst_ire);
- if (lso_dst_ire != NULL)
- IRE_REFRELE(lso_dst_ire);
- if (effective_cred != NULL)
- crfree(effective_cred);
+
return (error);
}
-int
-ip_proto_bind_connected_v4(conn_t *connp, mblk_t **ire_mpp, uint8_t protocol,
- ipaddr_t *src_addrp, uint16_t lport, ipaddr_t dst_addr, uint16_t fport,
- boolean_t fanout_insert, boolean_t verify_dst, cred_t *cr)
+
+/*
+ * Get the base MTU for the case when path MTU discovery is not used.
+ * Takes the MTU of the IRE into account.
+ */
+uint_t
+ip_get_base_mtu(ill_t *ill, ire_t *ire)
{
- int error;
-
- ASSERT(!connp->conn_af_isv6);
- connp->conn_pkt_isv6 = B_FALSE;
- connp->conn_ulp = protocol;
-
- /* For raw socket, the local port is not set. */
- if (lport == 0)
- lport = connp->conn_lport;
- error = ip_bind_connected_v4(connp, ire_mpp, protocol,
- src_addrp, lport, dst_addr, fport, fanout_insert, verify_dst, cr);
- if (error < 0)
- error = -TBADADDR;
- return (error);
+ uint_t mtu = ill->ill_mtu;
+ uint_t iremtu = ire->ire_metrics.iulp_mtu;
+
+ if (iremtu != 0 && iremtu < mtu)
+ mtu = iremtu;
+
+ return (mtu);
}
/*
- * Get the ire in *mpp. Returns false if it fails (due to lack of space).
- * Prefers dst_ire over src_ire.
+ * Get the PMTU for the attributes. Handles both IPv4 and IPv6.
+ * Assumes that ixa_ire, dce, and nce have already been set up.
+ *
+ * The caller has set IXAF_PMTU_DISCOVERY if path MTU discovery is desired.
+ * We avoid path MTU discovery if it is disabled with ndd.
+ * Furtermore, if the path MTU is too small, then we don't set DF for IPv4.
+ *
+ * NOTE: We also used to turn it off for source routed packets. That
+ * is no longer required since the dce is per final destination.
*/
-static boolean_t
-ip_bind_get_ire_v4(mblk_t **mpp, ire_t *ire, iulp_t *ulp_info, ip_stack_t *ipst)
+uint_t
+ip_get_pmtu(ip_xmit_attr_t *ixa)
{
- mblk_t *mp = *mpp;
- ire_t *ret_ire;
+ ip_stack_t *ipst = ixa->ixa_ipst;
+ dce_t *dce;
+ nce_t *nce;
+ ire_t *ire;
+ uint_t pmtu;
- ASSERT(mp != NULL);
+ ire = ixa->ixa_ire;
+ dce = ixa->ixa_dce;
+ nce = ixa->ixa_nce;
- if (ire != NULL) {
- /*
- * mp initialized above to IRE_DB_REQ_TYPE
- * appended mblk. Its <upper protocol>'s
- * job to make sure there is room.
- */
- if ((mp->b_datap->db_lim - mp->b_rptr) < sizeof (ire_t))
- return (B_FALSE);
+ /*
+ * If path MTU discovery has been turned off by ndd, then we ignore
+ * any dce_pmtu and for IPv4 we will not set DF.
+ */
+ if (!ipst->ips_ip_path_mtu_discovery)
+ ixa->ixa_flags &= ~IXAF_PMTU_DISCOVERY;
- mp->b_datap->db_type = IRE_DB_TYPE;
- mp->b_wptr = mp->b_rptr + sizeof (ire_t);
- bcopy(ire, mp->b_rptr, sizeof (ire_t));
- ret_ire = (ire_t *)mp->b_rptr;
+ pmtu = IP_MAXPACKET;
+ /*
+ * Decide whether whether IPv4 sets DF
+ * For IPv6 "no DF" means to use the 1280 mtu
+ */
+ if (ixa->ixa_flags & IXAF_PMTU_DISCOVERY) {
+ ixa->ixa_flags |= IXAF_PMTU_IPV4_DF;
+ } else {
+ ixa->ixa_flags &= ~IXAF_PMTU_IPV4_DF;
+ if (!(ixa->ixa_flags & IXAF_IS_IPV4))
+ pmtu = IPV6_MIN_MTU;
+ }
+
+ /* Check if the PMTU is to old before we use it */
+ if ((dce->dce_flags & DCEF_PMTU) &&
+ TICK_TO_SEC(lbolt64) - dce->dce_last_change_time >
+ ipst->ips_ip_pathmtu_interval) {
/*
- * Pass the latest setting of the ip_path_mtu_discovery and
- * copy the ulp info if any.
+ * Older than 20 minutes. Drop the path MTU information.
*/
- ret_ire->ire_frag_flag |= (ipst->ips_ip_path_mtu_discovery) ?
- IPH_DF : 0;
- if (ulp_info != NULL) {
- bcopy(ulp_info, &(ret_ire->ire_uinfo),
- sizeof (iulp_t));
+ mutex_enter(&dce->dce_lock);
+ dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU);
+ dce->dce_last_change_time = TICK_TO_SEC(lbolt64);
+ mutex_exit(&dce->dce_lock);
+ dce_increment_generation(dce);
+ }
+
+ /* The metrics on the route can lower the path MTU */
+ if (ire->ire_metrics.iulp_mtu != 0 &&
+ ire->ire_metrics.iulp_mtu < pmtu)
+ pmtu = ire->ire_metrics.iulp_mtu;
+
+ /*
+ * If the path MTU is smaller than some minimum, we still use dce_pmtu
+ * above (would be 576 for IPv4 and 1280 for IPv6), but we clear
+ * IXAF_PMTU_IPV4_DF so that we avoid setting DF for IPv4.
+ */
+ if (ixa->ixa_flags & IXAF_PMTU_DISCOVERY) {
+ if (dce->dce_flags & DCEF_PMTU) {
+ if (dce->dce_pmtu < pmtu)
+ pmtu = dce->dce_pmtu;
+
+ if (dce->dce_flags & DCEF_TOO_SMALL_PMTU) {
+ ixa->ixa_flags |= IXAF_PMTU_TOO_SMALL;
+ ixa->ixa_flags &= ~IXAF_PMTU_IPV4_DF;
+ } else {
+ ixa->ixa_flags &= ~IXAF_PMTU_TOO_SMALL;
+ ixa->ixa_flags |= IXAF_PMTU_IPV4_DF;
+ }
+ } else {
+ ixa->ixa_flags &= ~IXAF_PMTU_TOO_SMALL;
+ ixa->ixa_flags |= IXAF_PMTU_IPV4_DF;
}
- ret_ire->ire_mp = mp;
- } else {
+ }
+
+ /*
+ * If we have an IRE_LOCAL we use the loopback mtu instead of
+ * the ill for going out the wire i.e., IRE_LOCAL gets the same
+ * mtu as IRE_LOOPBACK.
+ */
+ if (ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK)) {
+ uint_t loopback_mtu;
+
+ loopback_mtu = (ire->ire_ipversion == IPV6_VERSION) ?
+ ip_loopback_mtu_v6plus : ip_loopback_mtuplus;
+
+ if (loopback_mtu < pmtu)
+ pmtu = loopback_mtu;
+ } else if (nce != NULL) {
/*
- * No IRE was found. Remove IRE mblk.
+ * Make sure we don't exceed the interface MTU.
+ * In the case of RTF_REJECT or RTF_BLACKHOLE we might not have
+ * an ill. We'd use the above IP_MAXPACKET in that case just
+ * to tell the transport something larger than zero.
*/
- *mpp = mp->b_cont;
- freeb(mp);
+ if (nce->nce_common->ncec_ill->ill_mtu < pmtu)
+ pmtu = nce->nce_common->ncec_ill->ill_mtu;
+ if (nce->nce_common->ncec_ill != nce->nce_ill &&
+ nce->nce_ill->ill_mtu < pmtu) {
+ /*
+ * for interfaces in an IPMP group, the mtu of
+ * the nce_ill (under_ill) could be different
+ * from the mtu of the ncec_ill, so we take the
+ * min of the two.
+ */
+ pmtu = nce->nce_ill->ill_mtu;
+ }
}
- return (B_TRUE);
+
+ /*
+ * Handle the IPV6_USE_MIN_MTU socket option or ancillary data.
+ * Only applies to IPv6.
+ */
+ if (!(ixa->ixa_flags & IXAF_IS_IPV4)) {
+ if (ixa->ixa_flags & IXAF_USE_MIN_MTU) {
+ switch (ixa->ixa_use_min_mtu) {
+ case IPV6_USE_MIN_MTU_MULTICAST:
+ if (ire->ire_type & IRE_MULTICAST)
+ pmtu = IPV6_MIN_MTU;
+ break;
+ case IPV6_USE_MIN_MTU_ALWAYS:
+ pmtu = IPV6_MIN_MTU;
+ break;
+ case IPV6_USE_MIN_MTU_NEVER:
+ break;
+ }
+ } else {
+ /* Default is IPV6_USE_MIN_MTU_MULTICAST */
+ if (ire->ire_type & IRE_MULTICAST)
+ pmtu = IPV6_MIN_MTU;
+ }
+ }
+
+ /*
+ * After receiving an ICMPv6 "packet too big" message with a
+ * MTU < 1280, and for multirouted IPv6 packets, the IP layer
+ * will insert a 8-byte fragment header in every packet. We compensate
+ * for those cases by returning a smaller path MTU to the ULP.
+ *
+ * In the case of CGTP then ip_output will add a fragment header.
+ * Make sure there is room for it by telling a smaller number
+ * to the transport.
+ *
+ * When IXAF_IPV6_ADDR_FRAGHDR we subtract the frag hdr here
+ * so the ULPs consistently see a iulp_pmtu and ip_get_pmtu()
+ * which is the size of the packets it can send.
+ */
+ if (!(ixa->ixa_flags & IXAF_IS_IPV4)) {
+ if ((dce->dce_flags & DCEF_TOO_SMALL_PMTU) ||
+ (ire->ire_flags & RTF_MULTIRT) ||
+ (ixa->ixa_flags & IXAF_MULTIRT_MULTICAST)) {
+ pmtu -= sizeof (ip6_frag_t);
+ ixa->ixa_flags |= IXAF_IPV6_ADD_FRAGHDR;
+ }
+ }
+
+ return (pmtu);
}
/*
@@ -5386,6 +4128,7 @@ ip_modclose(ill_t *ill)
queue_t *q = ill->ill_rq;
ip_stack_t *ipst = ill->ill_ipst;
int i;
+ arl_ill_common_t *ai = ill->ill_common;
/*
* The punlink prior to this may have initiated a capability
@@ -5452,6 +4195,7 @@ ip_modclose(ill_t *ill)
mutex_enter(&ill->ill_lock);
while (!ill_is_freeable(ill))
cv_wait(&ill->ill_cv, &ill->ill_lock);
+
while (ill->ill_waiters)
cv_wait(&ill->ill_cv, &ill->ill_lock);
@@ -5466,12 +4210,16 @@ ip_modclose(ill_t *ill)
/* qprocsoff is done via ill_delete_tail */
ill_delete_tail(ill);
+ /*
+ * synchronously wait for arp stream to unbind. After this, we
+ * cannot get any data packets up from the driver.
+ */
+ arp_unbind_complete(ill);
ASSERT(ill->ill_ipst == NULL);
/*
- * Walk through all upper (conn) streams and qenable
- * those that have queued data.
- * close synchronization needs this to
+ * Walk through all conns and qenable those that have queued data.
+ * Close synchronization needs this to
* be done to ensure that all upper layers blocked
* due to flow control to the closing device
* get unblocked.
@@ -5481,6 +4229,25 @@ ip_modclose(ill_t *ill)
conn_walk_drain(ipst, &ipst->ips_idl_tx_list[i]);
}
+ /*
+ * ai can be null if this is an IPv6 ill, or if the IPv4
+ * stream is being torn down before ARP was plumbed (e.g.,
+ * /sbin/ifconfig plumbing a stream twice, and encountering
+ * an error
+ */
+ if (ai != NULL) {
+ ASSERT(!ill->ill_isv6);
+ mutex_enter(&ai->ai_lock);
+ ai->ai_ill = NULL;
+ if (ai->ai_arl == NULL) {
+ mutex_destroy(&ai->ai_lock);
+ kmem_free(ai, sizeof (*ai));
+ } else {
+ cv_signal(&ai->ai_ill_unplumb_done);
+ mutex_exit(&ai->ai_lock);
+ }
+ }
+
mutex_enter(&ipst->ips_ip_mi_lock);
mi_close_unlink(&ipst->ips_ip_g_head, (IDP)ill);
mutex_exit(&ipst->ips_ip_mi_lock);
@@ -5492,6 +4259,12 @@ ip_modclose(ill_t *ill)
if (ill->ill_credp != NULL)
crfree(ill->ill_credp);
+ mutex_destroy(&ill->ill_saved_ire_lock);
+ mutex_destroy(&ill->ill_lock);
+ rw_destroy(&ill->ill_mcast_lock);
+ mutex_destroy(&ill->ill_mcast_serializer);
+ list_destroy(&ill->ill_nce);
+
/*
* Now we are done with the module close pieces that
* need the netstack_t.
@@ -5525,11 +4298,8 @@ ip_quiesce_conn(conn_t *connp)
* Mark the conn as closing, and this conn must not be
* inserted in future into any list. Eg. conn_drain_insert(),
* won't insert this conn into the conn_drain_list.
- * Similarly ill_pending_mp_add() will not add any mp to
- * the pending mp list, after this conn has started closing.
*
- * conn_idl, conn_pending_ill, conn_down_pending_ill, conn_ilg
- * cannot get set henceforth.
+ * conn_idl, and conn_ilg cannot get set henceforth.
*/
mutex_enter(&connp->conn_lock);
ASSERT(!(connp->conn_state_flags & CONN_QUIESCED));
@@ -5541,9 +4311,10 @@ ip_quiesce_conn(conn_t *connp)
if (connp->conn_dhcpinit_ill != NULL) {
ASSERT(connp->conn_dhcpinit_ill->ill_dhcpinit != 0);
atomic_dec_32(&connp->conn_dhcpinit_ill->ill_dhcpinit);
+ ill_set_inputfn(connp->conn_dhcpinit_ill);
connp->conn_dhcpinit_ill = NULL;
}
- if (connp->conn_ilg_inuse != 0)
+ if (connp->conn_ilg != NULL)
ilg_cleanup_reqd = B_TRUE;
mutex_exit(&connp->conn_lock);
@@ -5552,7 +4323,7 @@ ip_quiesce_conn(conn_t *connp)
if (is_system_labeled() && connp->conn_anon_port) {
(void) tsol_mlp_anon(crgetzone(connp->conn_cred),
- connp->conn_mlp_type, connp->conn_ulp,
+ connp->conn_mlp_type, connp->conn_proto,
ntohs(connp->conn_lport), B_FALSE);
connp->conn_anon_port = 0;
}
@@ -5568,21 +4339,22 @@ ip_quiesce_conn(conn_t *connp)
/*
* Remove this conn from the drain list, and do
* any other cleanup that may be required.
- * (Only non-tcp streams may have a non-null conn_idl.
- * TCP streams are never flow controlled, and
+ * (Only non-tcp conns may have a non-null conn_idl.
+ * TCP conns are never flow controlled, and
* conn_idl will be null)
*/
- if (drain_cleanup_reqd)
+ if (drain_cleanup_reqd && connp->conn_idl != NULL) {
+ mutex_enter(&connp->conn_idl->idl_lock);
conn_drain_tail(connp, B_TRUE);
+ mutex_exit(&connp->conn_idl->idl_lock);
+ }
if (connp == ipst->ips_ip_g_mrouter)
- (void) ip_mrouter_done(NULL, ipst);
+ (void) ip_mrouter_done(ipst);
if (ilg_cleanup_reqd)
ilg_delete_all(connp);
- conn_delete_ire(connp, NULL);
-
/*
* Now conn refcnt can increase only thru CONN_INC_REF_LOCKED.
* callers from write side can't be there now because close
@@ -5603,8 +4375,6 @@ ip_close(queue_t *q, int flags)
{
conn_t *connp;
- TRACE_1(TR_FAC_IP, TR_IP_CLOSE, "ip_close: q %p", q);
-
/*
* Call the appropriate delete routine depending on whether this is
* a module or device.
@@ -5646,13 +4416,21 @@ ip_close(queue_t *q, int flags)
*/
/*ARGSUSED2*/
static void
-ip_conn_input(void *arg1, mblk_t *mp, void *arg2)
+ip_conn_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
{
conn_t *connp = (conn_t *)arg1;
putnext(connp->conn_rq, mp);
}
+/* Dummy in case ICMP error delivery is attempted to a /dev/ip instance */
+/* ARGSUSED */
+static void
+ip_conn_input_icmp(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
+{
+ freemsg(mp);
+}
+
/*
* Called when the module is about to be unloaded
*/
@@ -5667,6 +4445,7 @@ ip_ddi_destroy(void)
sctp_ddi_g_destroy();
tcp_ddi_g_destroy();
ilb_ddi_g_destroy();
+ dce_g_destroy();
ipsec_policy_g_destroy();
ipcl_g_destroy();
ip_net_g_destroy();
@@ -5709,16 +4488,12 @@ ip_stack_shutdown(netstackid_t stackid, void *arg)
*/
ipv4_hook_shutdown(ipst);
ipv6_hook_shutdown(ipst);
+ arp_hook_shutdown(ipst);
mutex_enter(&ipst->ips_capab_taskq_lock);
ipst->ips_capab_taskq_quit = B_TRUE;
cv_signal(&ipst->ips_capab_taskq_cv);
mutex_exit(&ipst->ips_capab_taskq_lock);
-
- mutex_enter(&ipst->ips_mrt_lock);
- ipst->ips_mrt_flags |= IP_MRT_STOP;
- cv_signal(&ipst->ips_mrt_cv);
- mutex_exit(&ipst->ips_mrt_lock);
}
/*
@@ -5741,18 +4516,12 @@ ip_stack_fini(netstackid_t stackid, void *arg)
ipobs_fini(ipst);
ipv4_hook_destroy(ipst);
ipv6_hook_destroy(ipst);
+ arp_hook_destroy(ipst);
ip_net_destroy(ipst);
mutex_destroy(&ipst->ips_capab_taskq_lock);
cv_destroy(&ipst->ips_capab_taskq_cv);
- mutex_enter(&ipst->ips_mrt_lock);
- while (!(ipst->ips_mrt_flags & IP_MRT_DONE))
- cv_wait(&ipst->ips_mrt_done_cv, &ipst->ips_mrt_lock);
- mutex_destroy(&ipst->ips_mrt_lock);
- cv_destroy(&ipst->ips_mrt_cv);
- cv_destroy(&ipst->ips_mrt_done_cv);
-
ipmp_destroy(ipst);
rw_destroy(&ipst->ips_srcid_lock);
@@ -5773,10 +4542,10 @@ ip_stack_fini(netstackid_t stackid, void *arg)
kmem_free(ipst->ips_ndp_arr, sizeof (lcl_ndp_arr));
ipst->ips_ndp_arr = NULL;
+ dce_stack_destroy(ipst);
ip_mrouter_stack_destroy(ipst);
mutex_destroy(&ipst->ips_ip_mi_lock);
- rw_destroy(&ipst->ips_ipsec_capab_ills_lock);
rw_destroy(&ipst->ips_ill_g_usesrc_lock);
rw_destroy(&ipst->ips_ip_g_nd_lock);
@@ -5808,13 +4577,6 @@ ip_stack_fini(netstackid_t stackid, void *arg)
ASSERT(ipst->ips_mld_slowtimeout_id != 0);
ipst->ips_mld_slowtimeout_id = 0;
}
- ret = untimeout(ipst->ips_ip_ire_expire_id);
- if (ret == -1) {
- ASSERT(ipst->ips_ip_ire_expire_id == 0);
- } else {
- ASSERT(ipst->ips_ip_ire_expire_id != 0);
- ipst->ips_ip_ire_expire_id = 0;
- }
mutex_destroy(&ipst->ips_igmp_timer_lock);
mutex_destroy(&ipst->ips_mld_timer_lock);
@@ -5915,6 +4677,10 @@ ip_ddi_init(void)
list_create(&ip_thread_list, sizeof (th_hash_t),
offsetof(th_hash_t, thh_link));
#endif
+ ipsec_policy_g_init();
+ tcp_ddi_g_init();
+ sctp_ddi_g_init();
+ dce_g_init();
/*
* We want to be informed each time a stack is created or
@@ -5924,10 +4690,6 @@ ip_ddi_init(void)
netstack_register(NS_IP, ip_stack_init, ip_stack_shutdown,
ip_stack_fini);
- ipsec_policy_g_init();
- tcp_ddi_g_init();
- sctp_ddi_g_init();
-
tnet_init();
udp_ddi_g_init();
@@ -5973,7 +4735,6 @@ ip_stack_init(netstackid_t stackid, netstack_t *ns)
mutex_init(&ipst->ips_ip_mi_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&ipst->ips_ip_addr_avail_lock, NULL, MUTEX_DEFAULT, NULL);
rw_init(&ipst->ips_ill_g_lock, NULL, RW_DEFAULT, NULL);
- rw_init(&ipst->ips_ipsec_capab_ills_lock, NULL, RW_DEFAULT, NULL);
rw_init(&ipst->ips_ill_g_usesrc_lock, NULL, RW_DEFAULT, NULL);
ipcl_init(ipst);
@@ -5982,6 +4743,7 @@ ip_stack_init(netstackid_t stackid, netstack_t *ns)
ipif_init(ipst);
conn_drain_init(ipst);
ip_mrouter_stack_init(ipst);
+ dce_stack_init(ipst);
ipst->ips_ip_g_frag_timeout = IP_FRAG_TIMEOUT;
ipst->ips_ip_g_frag_timo_ms = IP_FRAG_TIMEOUT * 1000;
@@ -6026,9 +4788,12 @@ ip_stack_init(netstackid_t stackid, netstack_t *ns)
ipst->ips_ip_src_id = 1;
rw_init(&ipst->ips_srcid_lock, NULL, RW_DEFAULT, NULL);
+ ipst->ips_src_generation = SRC_GENERATION_INITIAL;
+
ip_net_init(ipst, ns);
ipv4_hook_init(ipst);
ipv6_hook_init(ipst);
+ arp_hook_init(ipst);
ipmp_init(ipst);
ipobs_init(ipst);
@@ -6040,15 +4805,6 @@ ip_stack_init(netstackid_t stackid, netstack_t *ns)
mutex_init(&ipst->ips_capab_taskq_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&ipst->ips_capab_taskq_cv, NULL, CV_DEFAULT, NULL);
- /*
- * Create the mcast_restart_timers_thread() worker thread.
- */
- mutex_init(&ipst->ips_mrt_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&ipst->ips_mrt_cv, NULL, CV_DEFAULT, NULL);
- cv_init(&ipst->ips_mrt_done_cv, NULL, CV_DEFAULT, NULL);
- ipst->ips_mrt_thread = thread_create(NULL, 0,
- mcast_restart_timers_thread, ipst, 0, &p0, TS_RUN, minclsyspri);
-
major = mod_name_to_major(INET_NAME);
(void) ldi_ident_from_major(major, &ipst->ips_ldi_ident);
return (ipst);
@@ -6161,37 +4917,26 @@ mac_colon_addr(const uint8_t *addr, size_t alen, char *buf, size_t buflen)
}
/*
- * Send an ICMP error after patching up the packet appropriately. Returns
- * non-zero if the appropriate MIB should be bumped; zero otherwise.
+ * Called when it is conceptually a ULP that would sent the packet
+ * e.g., port unreachable and protocol unreachable. Check that the packet
+ * would have passed the IPsec global policy before sending the error.
+ *
+ * Send an ICMP error after patching up the packet appropriately.
+ * Uses ip_drop_input and bumps the appropriate MIB.
*/
-static boolean_t
-ip_fanout_send_icmp(queue_t *q, mblk_t *mp, uint_t flags,
- uint_t icmp_type, uint_t icmp_code, boolean_t mctl_present,
- zoneid_t zoneid, ip_stack_t *ipst)
+void
+ip_fanout_send_icmp_v4(mblk_t *mp, uint_t icmp_type, uint_t icmp_code,
+ ip_recv_attr_t *ira)
{
- ipha_t *ipha;
- mblk_t *first_mp;
- boolean_t secure;
- unsigned char db_type;
- ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec;
+ ipha_t *ipha;
+ boolean_t secure;
+ ill_t *ill = ira->ira_ill;
+ ip_stack_t *ipst = ill->ill_ipst;
+ netstack_t *ns = ipst->ips_netstack;
+ ipsec_stack_t *ipss = ns->netstack_ipsec;
+
+ secure = ira->ira_flags & IRAF_IPSEC_SECURE;
- first_mp = mp;
- if (mctl_present) {
- mp = mp->b_cont;
- secure = ipsec_in_is_secure(first_mp);
- ASSERT(mp != NULL);
- } else {
- /*
- * If this is an ICMP error being reported - which goes
- * up as M_CTLs, we need to convert them to M_DATA till
- * we finish checking with global policy because
- * ipsec_check_global_policy() assumes M_DATA as clear
- * and M_CTL as secure.
- */
- db_type = DB_TYPE(mp);
- DB_TYPE(mp) = M_DATA;
- secure = B_FALSE;
- }
/*
* We are generating an icmp error for some inbound packet.
* Called from all ip_fanout_(udp, tcp, proto) functions.
@@ -6201,47 +4946,52 @@ ip_fanout_send_icmp(queue_t *q, mblk_t *mp, uint_t flags,
*/
ipha = (ipha_t *)mp->b_rptr;
if (secure || ipss->ipsec_inbound_v4_policy_present) {
- first_mp = ipsec_check_global_policy(first_mp, NULL,
- ipha, NULL, mctl_present, ipst->ips_netstack);
- if (first_mp == NULL)
- return (B_FALSE);
+ mp = ipsec_check_global_policy(mp, NULL, ipha, NULL, ira, ns);
+ if (mp == NULL)
+ return;
}
- if (!mctl_present)
- DB_TYPE(mp) = db_type;
+ /* We never send errors for protocols that we do implement */
+ if (ira->ira_protocol == IPPROTO_ICMP ||
+ ira->ira_protocol == IPPROTO_IGMP) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ip_fanout_send_icmp_v4", mp, ill);
+ freemsg(mp);
+ return;
+ }
+ /*
+ * Have to correct checksum since
+ * the packet might have been
+ * fragmented and the reassembly code in ip_rput
+ * does not restore the IP checksum.
+ */
+ ipha->ipha_hdr_checksum = 0;
+ ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
- if (flags & IP_FF_SEND_ICMP) {
- if (flags & IP_FF_HDR_COMPLETE) {
- if (ip_hdr_complete(ipha, zoneid, ipst)) {
- freemsg(first_mp);
- return (B_TRUE);
- }
- }
- if (flags & IP_FF_CKSUM) {
- /*
- * Have to correct checksum since
- * the packet might have been
- * fragmented and the reassembly code in ip_rput
- * does not restore the IP checksum.
- */
- ipha->ipha_hdr_checksum = 0;
- ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
- }
- switch (icmp_type) {
- case ICMP_DEST_UNREACHABLE:
- icmp_unreachable(WR(q), first_mp, icmp_code, zoneid,
- ipst);
+ switch (icmp_type) {
+ case ICMP_DEST_UNREACHABLE:
+ switch (icmp_code) {
+ case ICMP_PROTOCOL_UNREACHABLE:
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInUnknownProtos);
+ ip_drop_input("ipIfStatsInUnknownProtos", mp, ill);
break;
- default:
- freemsg(first_mp);
+ case ICMP_PORT_UNREACHABLE:
+ BUMP_MIB(ill->ill_ip_mib, udpIfStatsNoPorts);
+ ip_drop_input("ipIfStatsNoPorts", mp, ill);
break;
}
- } else {
- freemsg(first_mp);
- return (B_FALSE);
- }
- return (B_TRUE);
+ icmp_unreachable(mp, icmp_code, ira);
+ break;
+ default:
+#ifdef DEBUG
+ panic("ip_fanout_send_icmp_v4: wrong type");
+ /*NOTREACHED*/
+#else
+ freemsg(mp);
+ break;
+#endif
+ }
}
/*
@@ -6250,66 +5000,86 @@ ip_fanout_send_icmp(queue_t *q, mblk_t *mp, uint_t flags,
* is consumed by this function.
*/
void
-ip_proto_not_sup(queue_t *q, mblk_t *ipsec_mp, uint_t flags, zoneid_t zoneid,
- ip_stack_t *ipst)
+ip_proto_not_sup(mblk_t *mp, ip_recv_attr_t *ira)
{
- mblk_t *mp;
- ipha_t *ipha;
- ill_t *ill;
- ipsec_in_t *ii;
-
- ii = (ipsec_in_t *)ipsec_mp->b_rptr;
- ASSERT(ii->ipsec_in_type == IPSEC_IN);
+ ipha_t *ipha;
- mp = ipsec_mp->b_cont;
- ipsec_mp->b_cont = NULL;
ipha = (ipha_t *)mp->b_rptr;
- /* Get ill from index in ipsec_in_t. */
- ill = ill_lookup_on_ifindex(ii->ipsec_in_ill_index,
- (IPH_HDR_VERSION(ipha) == IPV6_VERSION), NULL, NULL, NULL, NULL,
- ipst);
- if (ill != NULL) {
- if (IPH_HDR_VERSION(ipha) == IP_VERSION) {
- if (ip_fanout_send_icmp(q, mp, flags,
- ICMP_DEST_UNREACHABLE,
- ICMP_PROTOCOL_UNREACHABLE, B_FALSE, zoneid, ipst)) {
- BUMP_MIB(ill->ill_ip_mib,
- ipIfStatsInUnknownProtos);
- }
- } else {
- if (ip_fanout_send_icmp_v6(q, mp, flags,
- ICMP6_PARAM_PROB, ICMP6_PARAMPROB_NEXTHEADER,
- 0, B_FALSE, zoneid, ipst)) {
- BUMP_MIB(ill->ill_ip_mib,
- ipIfStatsInUnknownProtos);
- }
- }
- ill_refrele(ill);
- } else { /* re-link for the freemsg() below. */
- ipsec_mp->b_cont = mp;
+ if (ira->ira_flags & IRAF_IS_IPV4) {
+ ASSERT(IPH_HDR_VERSION(ipha) == IP_VERSION);
+ ip_fanout_send_icmp_v4(mp, ICMP_DEST_UNREACHABLE,
+ ICMP_PROTOCOL_UNREACHABLE, ira);
+ } else {
+ ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION);
+ ip_fanout_send_icmp_v6(mp, ICMP6_PARAM_PROB,
+ ICMP6_PARAMPROB_NEXTHEADER, ira);
}
-
- /* If ICMP delivered, ipsec_mp will be a singleton (b_cont == NULL). */
- freemsg(ipsec_mp);
}
/*
- * See if the inbound datagram has had IPsec processing applied to it.
+ * Deliver a rawip packet to the given conn, possibly applying ipsec policy.
+ * Handles IPv4 and IPv6.
+ * We are responsible for disposing of mp, such as by freemsg() or putnext()
+ * Caller is responsible for dropping references to the conn.
*/
-boolean_t
-ipsec_in_is_secure(mblk_t *ipsec_mp)
+void
+ip_fanout_proto_conn(conn_t *connp, mblk_t *mp, ipha_t *ipha, ip6_t *ip6h,
+ ip_recv_attr_t *ira)
{
- ipsec_in_t *ii;
+ ill_t *ill = ira->ira_ill;
+ ip_stack_t *ipst = ill->ill_ipst;
+ ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec;
+ boolean_t secure;
+ uint_t protocol = ira->ira_protocol;
+ iaflags_t iraflags = ira->ira_flags;
+ queue_t *rq;
+
+ secure = iraflags & IRAF_IPSEC_SECURE;
+
+ rq = connp->conn_rq;
+ if (IPCL_IS_NONSTR(connp) ? connp->conn_flow_cntrld : !canputnext(rq)) {
+ switch (protocol) {
+ case IPPROTO_ICMPV6:
+ BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInOverflows);
+ break;
+ case IPPROTO_ICMP:
+ BUMP_MIB(&ipst->ips_icmp_mib, icmpInOverflows);
+ break;
+ default:
+ BUMP_MIB(ill->ill_ip_mib, rawipIfStatsInOverflows);
+ break;
+ }
+ freemsg(mp);
+ return;
+ }
- ii = (ipsec_in_t *)ipsec_mp->b_rptr;
- ASSERT(ii->ipsec_in_type == IPSEC_IN);
+ ASSERT(!(IPCL_IS_IPTUN(connp)));
- if (ii->ipsec_in_loopback) {
- return (ii->ipsec_in_secure);
+ if (((iraflags & IRAF_IS_IPV4) ?
+ CONN_INBOUND_POLICY_PRESENT(connp, ipss) :
+ CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss)) ||
+ secure) {
+ mp = ipsec_check_inbound_policy(mp, connp, ipha,
+ ip6h, ira);
+ if (mp == NULL) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ /* Note that mp is NULL */
+ ip_drop_input("ipIfStatsInDiscards", mp, ill);
+ return;
+ }
+ }
+
+ if (iraflags & IRAF_ICMP_ERROR) {
+ (connp->conn_recvicmp)(connp, mp, NULL, ira);
} else {
- return (ii->ipsec_in_ah_sa != NULL ||
- ii->ipsec_in_esp_sa != NULL ||
- ii->ipsec_in_decaps);
+ ill_t *rill = ira->ira_rill;
+
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
+ ira->ira_ill = ira->ira_rill = NULL;
+ /* Send it upstream */
+ (connp->conn_recv)(connp, mp, NULL, ira);
+ ira->ira_ill = ill;
+ ira->ira_rill = rill;
}
}
@@ -6336,65 +5106,33 @@ ipsec_in_is_secure(mblk_t *ipsec_mp)
* is used to negotiate SAs as SAs will be added only after
* verifying the policy.
*
- * IPQoS Notes:
- * Once we have determined the client, invoke IPPF processing.
- * Policy processing takes place only if the callout_position, IPP_LOCAL_IN,
- * is enabled. If we get here from icmp_inbound_error_fanout or ip_wput_local
- * ip_policy will be false.
- *
* Zones notes:
- * Currently only applications in the global zone can create raw sockets for
- * protocols other than ICMP. So unlike the broadcast / multicast case of
- * ip_fanout_udp(), we only send a copy of the packet to streams in the
- * specified zone. For ICMP, this is handled by the callers of icmp_inbound().
+ * Earlier in ip_input on a system with multiple shared-IP zones we
+ * duplicate the multicast and broadcast packets and send them up
+ * with each explicit zoneid that exists on that ill.
+ * This means that here we can match the zoneid with SO_ALLZONES being special.
*/
-static void
-ip_fanout_proto(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha, uint_t flags,
- boolean_t mctl_present, boolean_t ip_policy, ill_t *recv_ill,
- zoneid_t zoneid)
+void
+ip_fanout_proto_v4(mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
{
- queue_t *rq;
- mblk_t *mp1, *first_mp1;
- uint_t protocol = ipha->ipha_protocol;
- ipaddr_t dst;
- mblk_t *first_mp = mp;
- boolean_t secure;
- uint32_t ill_index;
- conn_t *connp, *first_connp, *next_connp;
- connf_t *connfp;
- boolean_t shared_addr;
- mib2_ipIfStatsEntry_t *mibptr;
- ip_stack_t *ipst = recv_ill->ill_ipst;
- ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec;
+ mblk_t *mp1;
+ ipaddr_t laddr;
+ conn_t *connp, *first_connp, *next_connp;
+ connf_t *connfp;
+ ill_t *ill = ira->ira_ill;
+ ip_stack_t *ipst = ill->ill_ipst;
- mibptr = (ill != NULL) ? ill->ill_ip_mib : &ipst->ips_ip_mib;
- if (mctl_present) {
- mp = first_mp->b_cont;
- secure = ipsec_in_is_secure(first_mp);
- ASSERT(mp != NULL);
- } else {
- secure = B_FALSE;
- }
- dst = ipha->ipha_dst;
- shared_addr = (zoneid == ALL_ZONES);
- if (shared_addr) {
- /*
- * We don't allow multilevel ports for raw IP, so no need to
- * check for that here.
- */
- zoneid = tsol_packet_to_zoneid(mp);
- }
+ laddr = ipha->ipha_dst;
- connfp = &ipst->ips_ipcl_proto_fanout[protocol];
+ connfp = &ipst->ips_ipcl_proto_fanout_v4[ira->ira_protocol];
mutex_enter(&connfp->connf_lock);
connp = connfp->connf_head;
for (connp = connfp->connf_head; connp != NULL;
connp = connp->conn_next) {
- if (IPCL_PROTO_MATCH(connp, protocol, ipha, ill, flags,
- zoneid) &&
- (!is_system_labeled() ||
- tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr,
- connp))) {
+ /* Note: IPCL_PROTO_MATCH includes conn_wantpacket */
+ if (IPCL_PROTO_MATCH(connp, ira, ipha) &&
+ (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
+ tsol_receive_local(mp, &laddr, IPV4_VERSION, ira, connp))) {
break;
}
}
@@ -6406,40 +5144,12 @@ ip_fanout_proto(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha, uint_t flags,
* unclaimed datagrams?
*/
mutex_exit(&connfp->connf_lock);
- /*
- * Check for IPPROTO_ENCAP...
- */
- if (protocol == IPPROTO_ENCAP && ipst->ips_ip_g_mrouter) {
- /*
- * If an IPsec mblk is here on a multicast
- * tunnel (using ip_mroute stuff), check policy here,
- * THEN ship off to ip_mroute_decap().
- *
- * BTW, If I match a configured IP-in-IP
- * tunnel, this path will not be reached, and
- * ip_mroute_decap will never be called.
- */
- first_mp = ipsec_check_global_policy(first_mp, connp,
- ipha, NULL, mctl_present, ipst->ips_netstack);
- if (first_mp != NULL) {
- if (mctl_present)
- freeb(first_mp);
- ip_mroute_decap(q, mp, ill);
- } /* Else we already freed everything! */
- } else {
- /*
- * Otherwise send an ICMP protocol unreachable.
- */
- if (ip_fanout_send_icmp(q, first_mp, flags,
- ICMP_DEST_UNREACHABLE, ICMP_PROTOCOL_UNREACHABLE,
- mctl_present, zoneid, ipst)) {
- BUMP_MIB(mibptr, ipIfStatsInUnknownProtos);
- }
- }
+ ip_fanout_send_icmp_v4(mp, ICMP_DEST_UNREACHABLE,
+ ICMP_PROTOCOL_UNREACHABLE, ira);
return;
}
- ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_upq != NULL);
+ ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_rq != NULL);
CONN_INC_REF(connp);
first_connp = connp;
@@ -6447,111 +5157,35 @@ ip_fanout_proto(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha, uint_t flags,
for (;;) {
while (connp != NULL) {
- if (IPCL_PROTO_MATCH(connp, protocol, ipha, ill,
- flags, zoneid) &&
- (!is_system_labeled() ||
- tsol_receive_local(mp, &dst, IPV4_VERSION,
- shared_addr, connp)))
+ /* Note: IPCL_PROTO_MATCH includes conn_wantpacket */
+ if (IPCL_PROTO_MATCH(connp, ira, ipha) &&
+ (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
+ tsol_receive_local(mp, &laddr, IPV4_VERSION,
+ ira, connp)))
break;
connp = connp->conn_next;
}
- /*
- * Copy the packet.
- */
- if (connp == NULL ||
- (((first_mp1 = dupmsg(first_mp)) == NULL) &&
- ((first_mp1 = ip_copymsg(first_mp)) == NULL))) {
- /*
- * No more interested clients or memory
- * allocation failed
- */
+ if (connp == NULL) {
+ /* No more interested clients */
connp = first_connp;
break;
}
- ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_rq != NULL);
- mp1 = mctl_present ? first_mp1->b_cont : first_mp1;
+ if (((mp1 = dupmsg(mp)) == NULL) &&
+ ((mp1 = copymsg(mp)) == NULL)) {
+ /* Memory allocation failed */
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards", mp, ill);
+ connp = first_connp;
+ break;
+ }
+
CONN_INC_REF(connp);
mutex_exit(&connfp->connf_lock);
- rq = connp->conn_rq;
- /*
- * Check flow control
- */
- if ((IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) ||
- (!IPCL_IS_NONSTR(connp) && !canputnext(rq))) {
- if (flags & IP_FF_RAWIP) {
- BUMP_MIB(mibptr, rawipIfStatsInOverflows);
- } else {
- BUMP_MIB(&ipst->ips_icmp_mib, icmpInOverflows);
- }
+ ip_fanout_proto_conn(connp, mp1, (ipha_t *)mp1->b_rptr, NULL,
+ ira);
- freemsg(first_mp1);
- } else {
- /*
- * Enforce policy like any other conn_t. Note that
- * IP-in-IP packets don't come through here, but
- * through ip_iptun_input() or
- * icmp_inbound_iptun_fanout(). IPsec policy for such
- * packets is enforced in the iptun module.
- */
- if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) ||
- secure) {
- first_mp1 = ipsec_check_inbound_policy
- (first_mp1, connp, ipha, NULL,
- mctl_present);
- }
- if (first_mp1 != NULL) {
- int in_flags = 0;
- /*
- * ip_fanout_proto also gets called from
- * icmp_inbound_error_fanout, in which case
- * the msg type is M_CTL. Don't add info
- * in this case for the time being. In future
- * when there is a need for knowing the
- * inbound iface index for ICMP error msgs,
- * then this can be changed.
- */
- if (connp->conn_recvif)
- in_flags = IPF_RECVIF;
- /*
- * The ULP may support IP_RECVPKTINFO for both
- * IP v4 and v6 so pass the appropriate argument
- * based on conn IP version.
- */
- if (connp->conn_ip_recvpktinfo) {
- if (connp->conn_af_isv6) {
- /*
- * V6 only needs index
- */
- in_flags |= IPF_RECVIF;
- } else {
- /*
- * V4 needs index +
- * matching address.
- */
- in_flags |= IPF_RECVADDR;
- }
- }
- if ((in_flags != 0) &&
- (mp->b_datap->db_type != M_CTL)) {
- /*
- * the actual data will be
- * contained in b_cont upon
- * successful return of the
- * following call else
- * original mblk is returned
- */
- ASSERT(recv_ill != NULL);
- mp1 = ip_add_info(mp1, recv_ill,
- in_flags, IPCL_ZONEID(connp), ipst);
- }
- BUMP_MIB(mibptr, ipIfStatsHCInDelivers);
- if (mctl_present)
- freeb(first_mp1);
- (connp->conn_recv)(connp, mp1, NULL);
- }
- }
mutex_enter(&connfp->connf_lock);
/* Follow the next pointer before releasing the conn. */
next_connp = connp->conn_next;
@@ -6562,363 +5196,27 @@ ip_fanout_proto(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha, uint_t flags,
/* Last one. Send it upstream. */
mutex_exit(&connfp->connf_lock);
- /*
- * If this packet is coming from icmp_inbound_error_fanout ip_policy
- * will be set to false.
- */
- if (IPP_ENABLED(IPP_LOCAL_IN, ipst) && ip_policy) {
- ill_index = ill->ill_phyint->phyint_ifindex;
- ip_process(IPP_LOCAL_IN, &mp, ill_index);
- if (mp == NULL) {
- CONN_DEC_REF(connp);
- if (mctl_present) {
- freeb(first_mp);
- }
- return;
- }
- }
-
- rq = connp->conn_rq;
- /*
- * Check flow control
- */
- if ((IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) ||
- (!IPCL_IS_NONSTR(connp) && !canputnext(rq))) {
- if (flags & IP_FF_RAWIP) {
- BUMP_MIB(mibptr, rawipIfStatsInOverflows);
- } else {
- BUMP_MIB(&ipst->ips_icmp_mib, icmpInOverflows);
- }
-
- freemsg(first_mp);
- } else {
- ASSERT(!IPCL_IS_IPTUN(connp));
-
- if ((CONN_INBOUND_POLICY_PRESENT(connp, ipss) || secure)) {
- first_mp = ipsec_check_inbound_policy(first_mp, connp,
- ipha, NULL, mctl_present);
- }
-
- if (first_mp != NULL) {
- int in_flags = 0;
-
- /*
- * ip_fanout_proto also gets called
- * from icmp_inbound_error_fanout, in
- * which case the msg type is M_CTL.
- * Don't add info in this case for time
- * being. In future when there is a
- * need for knowing the inbound iface
- * index for ICMP error msgs, then this
- * can be changed
- */
- if (connp->conn_recvif)
- in_flags = IPF_RECVIF;
- if (connp->conn_ip_recvpktinfo) {
- if (connp->conn_af_isv6) {
- /*
- * V6 only needs index
- */
- in_flags |= IPF_RECVIF;
- } else {
- /*
- * V4 needs index +
- * matching address.
- */
- in_flags |= IPF_RECVADDR;
- }
- }
- if ((in_flags != 0) &&
- (mp->b_datap->db_type != M_CTL)) {
+ ip_fanout_proto_conn(connp, mp, ipha, NULL, ira);
- /*
- * the actual data will be contained in
- * b_cont upon successful return
- * of the following call else original
- * mblk is returned
- */
- ASSERT(recv_ill != NULL);
- mp = ip_add_info(mp, recv_ill,
- in_flags, IPCL_ZONEID(connp), ipst);
- }
- BUMP_MIB(mibptr, ipIfStatsHCInDelivers);
- (connp->conn_recv)(connp, mp, NULL);
- if (mctl_present)
- freeb(first_mp);
- }
- }
CONN_DEC_REF(connp);
}
/*
- * Serialize tcp resets by calling tcp_xmit_reset_serialize through
- * SQUEUE_ENTER_ONE(SQ_FILL). We do this to ensure the reset is handled on
- * the correct squeue, in this case the same squeue as a valid listener with
- * no current connection state for the packet we are processing. The function
- * is called for synchronizing both IPv4 and IPv6.
- */
-void
-ip_xmit_reset_serialize(mblk_t *mp, int hdrlen, zoneid_t zoneid,
- tcp_stack_t *tcps, conn_t *connp)
-{
- mblk_t *rst_mp;
- tcp_xmit_reset_event_t *eventp;
-
- rst_mp = allocb(sizeof (tcp_xmit_reset_event_t), BPRI_HI);
-
- if (rst_mp == NULL) {
- freemsg(mp);
- return;
- }
-
- rst_mp->b_datap->db_type = M_PROTO;
- rst_mp->b_wptr += sizeof (tcp_xmit_reset_event_t);
-
- eventp = (tcp_xmit_reset_event_t *)rst_mp->b_rptr;
- eventp->tcp_xre_event = TCP_XRE_EVENT_IP_FANOUT_TCP;
- eventp->tcp_xre_iphdrlen = hdrlen;
- eventp->tcp_xre_zoneid = zoneid;
- eventp->tcp_xre_tcps = tcps;
-
- rst_mp->b_cont = mp;
- mp = rst_mp;
-
- /*
- * Increment the connref, this ref will be released by the squeue
- * framework.
- */
- CONN_INC_REF(connp);
- SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_xmit_reset, connp,
- SQ_FILL, SQTAG_XMIT_EARLY_RESET);
-}
-
-/*
- * Fanout for TCP packets
- * The caller puts <fport, lport> in the ports parameter.
- *
- * IPQoS Notes
- * Before sending it to the client, invoke IPPF processing.
- * Policy processing takes place only if the callout_position, IPP_LOCAL_IN,
- * is enabled. If we get here from icmp_inbound_error_fanout or ip_wput_local
- * ip_policy is false.
- */
-static void
-ip_fanout_tcp(queue_t *q, mblk_t *mp, ill_t *recv_ill, ipha_t *ipha,
- uint_t flags, boolean_t mctl_present, boolean_t ip_policy, zoneid_t zoneid)
-{
- mblk_t *first_mp;
- boolean_t secure;
- uint32_t ill_index;
- int ip_hdr_len;
- tcph_t *tcph;
- boolean_t syn_present = B_FALSE;
- conn_t *connp;
- ip_stack_t *ipst = recv_ill->ill_ipst;
- ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec;
-
- ASSERT(recv_ill != NULL);
-
- first_mp = mp;
- if (mctl_present) {
- ASSERT(first_mp->b_datap->db_type == M_CTL);
- mp = first_mp->b_cont;
- secure = ipsec_in_is_secure(first_mp);
- ASSERT(mp != NULL);
- } else {
- secure = B_FALSE;
- }
-
- ip_hdr_len = IPH_HDR_LENGTH(mp->b_rptr);
-
- if ((connp = ipcl_classify_v4(mp, IPPROTO_TCP, ip_hdr_len,
- zoneid, ipst)) == NULL) {
- /*
- * No connected connection or listener. Send a
- * TH_RST via tcp_xmit_listeners_reset.
- */
-
- /* Initiate IPPf processing, if needed. */
- if (IPP_ENABLED(IPP_LOCAL_IN, ipst)) {
- uint32_t ill_index;
- ill_index = recv_ill->ill_phyint->phyint_ifindex;
- ip_process(IPP_LOCAL_IN, &first_mp, ill_index);
- if (first_mp == NULL)
- return;
- }
- BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsHCInDelivers);
- ip2dbg(("ip_fanout_tcp: no listener; send reset to zone %d\n",
- zoneid));
- tcp_xmit_listeners_reset(first_mp, ip_hdr_len, zoneid,
- ipst->ips_netstack->netstack_tcp, NULL);
- return;
- }
-
- /*
- * Allocate the SYN for the TCP connection here itself
- */
- tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len];
- if ((tcph->th_flags[0] & (TH_SYN|TH_ACK|TH_RST|TH_URG)) == TH_SYN) {
- if (IPCL_IS_TCP(connp)) {
- squeue_t *sqp;
-
- /*
- * If the queue belongs to a conn, and fused tcp
- * loopback is enabled, assign the eager's squeue
- * to be that of the active connect's. Note that
- * we don't check for IP_FF_LOOPBACK here since this
- * routine gets called only for loopback (unlike the
- * IPv6 counterpart).
- */
- if (do_tcp_fusion &&
- CONN_Q(q) && IPCL_IS_TCP(Q_TO_CONN(q)) &&
- !CONN_INBOUND_POLICY_PRESENT(connp, ipss) &&
- !secure &&
- !IPP_ENABLED(IPP_LOCAL_IN, ipst) && !ip_policy) {
- ASSERT(Q_TO_CONN(q)->conn_sqp != NULL);
- sqp = Q_TO_CONN(q)->conn_sqp;
- } else {
- sqp = IP_SQUEUE_GET(lbolt);
- }
-
- mp->b_datap->db_struioflag |= STRUIO_EAGER;
- DB_CKSUMSTART(mp) = (intptr_t)sqp;
- syn_present = B_TRUE;
- }
- }
-
- if (IPCL_IS_TCP(connp) && IPCL_IS_BOUND(connp) && !syn_present) {
- uint_t flags = (unsigned int)tcph->th_flags[0] & 0xFF;
- BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsHCInDelivers);
- if ((flags & TH_RST) || (flags & TH_URG)) {
- CONN_DEC_REF(connp);
- freemsg(first_mp);
- return;
- }
- if (flags & TH_ACK) {
- ip_xmit_reset_serialize(first_mp, ip_hdr_len, zoneid,
- ipst->ips_netstack->netstack_tcp, connp);
- CONN_DEC_REF(connp);
- return;
- }
-
- CONN_DEC_REF(connp);
- freemsg(first_mp);
- return;
- }
-
- if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || secure) {
- first_mp = ipsec_check_inbound_policy(first_mp, connp, ipha,
- NULL, mctl_present);
- if (first_mp == NULL) {
- BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsInDiscards);
- CONN_DEC_REF(connp);
- return;
- }
- if (IPCL_IS_TCP(connp) && IPCL_IS_BOUND(connp)) {
- ASSERT(syn_present);
- if (mctl_present) {
- ASSERT(first_mp != mp);
- first_mp->b_datap->db_struioflag |=
- STRUIO_POLICY;
- } else {
- ASSERT(first_mp == mp);
- mp->b_datap->db_struioflag &=
- ~STRUIO_EAGER;
- mp->b_datap->db_struioflag |=
- STRUIO_POLICY;
- }
- } else {
- /*
- * Discard first_mp early since we're dealing with a
- * fully-connected conn_t and tcp doesn't do policy in
- * this case.
- */
- if (mctl_present) {
- freeb(first_mp);
- mctl_present = B_FALSE;
- }
- first_mp = mp;
- }
- }
-
- /*
- * Initiate policy processing here if needed. If we get here from
- * icmp_inbound_error_fanout, ip_policy is false.
- */
- if (IPP_ENABLED(IPP_LOCAL_IN, ipst) && ip_policy) {
- ill_index = recv_ill->ill_phyint->phyint_ifindex;
- ip_process(IPP_LOCAL_IN, &mp, ill_index);
- if (mp == NULL) {
- CONN_DEC_REF(connp);
- if (mctl_present)
- freeb(first_mp);
- return;
- } else if (mctl_present) {
- ASSERT(first_mp != mp);
- first_mp->b_cont = mp;
- } else {
- first_mp = mp;
- }
- }
-
- /* Handle socket options. */
- if (!syn_present &&
- connp->conn_ip_recvpktinfo && (flags & IP_FF_IPINFO)) {
- /* Add header */
- ASSERT(recv_ill != NULL);
- /*
- * Since tcp does not support IP_RECVPKTINFO for V4, only pass
- * IPF_RECVIF.
- */
- mp = ip_add_info(mp, recv_ill, IPF_RECVIF, IPCL_ZONEID(connp),
- ipst);
- if (mp == NULL) {
- BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsInDiscards);
- CONN_DEC_REF(connp);
- if (mctl_present)
- freeb(first_mp);
- return;
- } else if (mctl_present) {
- /*
- * ip_add_info might return a new mp.
- */
- ASSERT(first_mp != mp);
- first_mp->b_cont = mp;
- } else {
- first_mp = mp;
- }
- }
- BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsHCInDelivers);
- if (IPCL_IS_TCP(connp)) {
- /* do not drain, certain use cases can blow the stack */
- SQUEUE_ENTER_ONE(connp->conn_sqp, first_mp, connp->conn_recv,
- connp, SQ_NODRAIN, SQTAG_IP_FANOUT_TCP);
- } else {
- /* Not TCP; must be SOCK_RAW, IPPROTO_TCP */
- (connp->conn_recv)(connp, first_mp, NULL);
- CONN_DEC_REF(connp);
- }
-}
-
-/*
* If we have a IPsec NAT-Traversal packet, strip the zero-SPI or
- * pass it along to ESP if the SPI is non-zero. Returns TRUE if the mblk
+ * pass it along to ESP if the SPI is non-zero. Returns the mblk if the mblk
* is not consumed.
*
- * One of four things can happen, all of which affect the passed-in mblk:
- *
- * 1.) ICMP messages that go through here just get returned TRUE.
+ * One of three things can happen, all of which affect the passed-in mblk:
*
- * 2.) The packet is stock UDP and gets its zero-SPI stripped. Return TRUE.
+ * 1.) The packet is stock UDP and gets its zero-SPI stripped. Return mblk..
*
- * 3.) The packet is ESP-in-UDP, gets transformed into an equivalent
- * ESP packet, and is passed along to ESP for consumption. Return FALSE.
+ * 2.) The packet is ESP-in-UDP, gets transformed into an equivalent
+ * ESP packet, and is passed along to ESP for consumption. Return NULL.
*
- * 4.) The packet is an ESP-in-UDP Keepalive. Drop it and return FALSE.
+ * 3.) The packet is an ESP-in-UDP Keepalive. Drop it and return NULL.
*/
-static boolean_t
-zero_spi_check(queue_t *q, mblk_t *mp, ire_t *ire, ill_t *recv_ill,
- ipsec_stack_t *ipss)
+mblk_t *
+zero_spi_check(mblk_t *mp, ip_recv_attr_t *ira)
{
int shift, plen, iph_len;
ipha_t *ipha;
@@ -6926,28 +5224,12 @@ zero_spi_check(queue_t *q, mblk_t *mp, ire_t *ire, ill_t *recv_ill,
uint32_t *spi;
uint32_t esp_ports;
uint8_t *orptr;
- boolean_t free_ire;
-
- if (DB_TYPE(mp) == M_CTL) {
- /*
- * ICMP message with UDP inside. Don't bother stripping, just
- * send it up.
- *
- * NOTE: Any app with UDP_NAT_T_ENDPOINT set is probably going
- * to ignore errors set by ICMP anyway ('cause they might be
- * forged), but that's the app's decision, not ours.
- */
-
- /* Bunch of reality checks for DEBUG kernels... */
- ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION);
- ASSERT(((ipha_t *)mp->b_rptr)->ipha_protocol == IPPROTO_ICMP);
-
- return (B_TRUE);
- }
+ ip_stack_t *ipst = ira->ira_ill->ill_ipst;
+ ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec;
ipha = (ipha_t *)mp->b_rptr;
- iph_len = IPH_HDR_LENGTH(ipha);
- plen = ntohs(ipha->ipha_length);
+ iph_len = ira->ira_ip_hdr_length;
+ plen = ira->ira_pktlen;
if (plen - iph_len - sizeof (udpha_t) < sizeof (uint32_t)) {
/*
@@ -6958,18 +5240,18 @@ zero_spi_check(queue_t *q, mblk_t *mp, ire_t *ire, ill_t *recv_ill,
* byte packets (keepalives are 1-byte), but we'll drop them
* also.
*/
- ip_drop_packet(mp, B_TRUE, recv_ill, NULL,
+ ip_drop_packet(mp, B_TRUE, ira->ira_ill,
DROPPER(ipss, ipds_esp_nat_t_ka), &ipss->ipsec_dropper);
- return (B_FALSE);
+ return (NULL);
}
if (MBLKL(mp) < iph_len + sizeof (udpha_t) + sizeof (*spi)) {
/* might as well pull it all up - it might be ESP. */
if (!pullupmsg(mp, -1)) {
- ip_drop_packet(mp, B_TRUE, recv_ill, NULL,
+ ip_drop_packet(mp, B_TRUE, ira->ira_ill,
DROPPER(ipss, ipds_esp_nomem),
&ipss->ipsec_dropper);
- return (B_FALSE);
+ return (NULL);
}
ipha = (ipha_t *)mp->b_rptr;
@@ -6985,7 +5267,8 @@ zero_spi_check(queue_t *q, mblk_t *mp, ire_t *ire, ill_t *recv_ill,
}
/* Fix IP header */
- ipha->ipha_length = htons(plen - shift);
+ ira->ira_pktlen = (plen - shift);
+ ipha->ipha_length = htons(ira->ira_pktlen);
ipha->ipha_hdr_checksum = 0;
orptr = mp->b_rptr;
@@ -7005,388 +5288,185 @@ zero_spi_check(queue_t *q, mblk_t *mp, ire_t *ire, ill_t *recv_ill,
if (esp_ports != 0) /* Punt up for ESP processing. */ {
ipha = (ipha_t *)(orptr + shift);
- free_ire = (ire == NULL);
- if (free_ire) {
- /* Re-acquire ire. */
- ire = ire_cache_lookup(ipha->ipha_dst, ALL_ZONES, NULL,
- ipss->ipsec_netstack->netstack_ip);
- if (ire == NULL || !(ire->ire_type & IRE_LOCAL)) {
- if (ire != NULL)
- ire_refrele(ire);
- /*
- * Do a regular freemsg(), as this is an IP
- * error (no local route) not an IPsec one.
- */
- freemsg(mp);
- }
- }
-
- ip_proto_input(q, mp, ipha, ire, recv_ill, esp_ports);
- if (free_ire)
- ire_refrele(ire);
+ ira->ira_flags |= IRAF_ESP_UDP_PORTS;
+ ira->ira_esp_udp_ports = esp_ports;
+ ip_fanout_v4(mp, ipha, ira);
+ return (NULL);
}
-
- return (esp_ports == 0);
+ return (mp);
}
/*
* Deliver a udp packet to the given conn, possibly applying ipsec policy.
+ * Handles IPv4 and IPv6.
* We are responsible for disposing of mp, such as by freemsg() or putnext()
- * Caller is responsible for dropping references to the conn, and freeing
- * first_mp.
- *
- * IPQoS Notes
- * Before sending it to the client, invoke IPPF processing. Policy processing
- * takes place only if the callout_position, IPP_LOCAL_IN, is enabled and
- * ip_policy is true. If we get here from icmp_inbound_error_fanout or
- * ip_wput_local, ip_policy is false.
+ * Caller is responsible for dropping references to the conn.
*/
-static void
-ip_fanout_udp_conn(conn_t *connp, mblk_t *first_mp, mblk_t *mp,
- boolean_t secure, ill_t *ill, ipha_t *ipha, uint_t flags, ill_t *recv_ill,
- boolean_t ip_policy)
+void
+ip_fanout_udp_conn(conn_t *connp, mblk_t *mp, ipha_t *ipha, ip6_t *ip6h,
+ ip_recv_attr_t *ira)
{
- boolean_t mctl_present = (first_mp != NULL);
- uint32_t in_flags = 0; /* set to IP_RECVSLLA and/or IP_RECVIF */
- uint32_t ill_index;
- ip_stack_t *ipst = recv_ill->ill_ipst;
+ ill_t *ill = ira->ira_ill;
+ ip_stack_t *ipst = ill->ill_ipst;
ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec;
+ boolean_t secure;
+ iaflags_t iraflags = ira->ira_flags;
- ASSERT(ill != NULL);
+ secure = iraflags & IRAF_IPSEC_SECURE;
- if (mctl_present)
- first_mp->b_cont = mp;
- else
- first_mp = mp;
-
- if ((IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) ||
- (!IPCL_IS_NONSTR(connp) && CONN_UDP_FLOWCTLD(connp))) {
+ if (IPCL_IS_NONSTR(connp) ? connp->conn_flow_cntrld :
+ !canputnext(connp->conn_rq)) {
BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows);
- freemsg(first_mp);
+ freemsg(mp);
return;
}
- if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || secure) {
- first_mp = ipsec_check_inbound_policy(first_mp, connp, ipha,
- NULL, mctl_present);
- /* Freed by ipsec_check_inbound_policy(). */
- if (first_mp == NULL) {
+ if (((iraflags & IRAF_IS_IPV4) ?
+ CONN_INBOUND_POLICY_PRESENT(connp, ipss) :
+ CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss)) ||
+ secure) {
+ mp = ipsec_check_inbound_policy(mp, connp, ipha,
+ ip6h, ira);
+ if (mp == NULL) {
BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ /* Note that mp is NULL */
+ ip_drop_input("ipIfStatsInDiscards", mp, ill);
return;
}
}
- if (mctl_present)
- freeb(first_mp);
-
- /* Let's hope the compilers utter "branch, predict-not-taken..." ;) */
- if (connp->conn_udp->udp_nat_t_endpoint) {
- if (mctl_present) {
- /* mctl_present *shouldn't* happen. */
- ip_drop_packet(mp, B_TRUE, NULL, NULL,
- DROPPER(ipss, ipds_esp_nat_t_ipsec),
- &ipss->ipsec_dropper);
- return;
- }
-
- if (!zero_spi_check(ill->ill_rq, mp, NULL, recv_ill, ipss))
- return;
- }
- /* Handle options. */
- if (connp->conn_recvif)
- in_flags = IPF_RECVIF;
/*
- * UDP supports IP_RECVPKTINFO option for both v4 and v6 so the flag
- * passed to ip_add_info is based on IP version of connp.
+ * Since this code is not used for UDP unicast we don't need a NAT_T
+ * check. Only ip_fanout_v4 has that check.
*/
- if (connp->conn_ip_recvpktinfo && (flags & IP_FF_IPINFO)) {
- if (connp->conn_af_isv6) {
- /*
- * V6 only needs index
- */
- in_flags |= IPF_RECVIF;
- } else {
- /*
- * V4 needs index + matching address.
- */
- in_flags |= IPF_RECVADDR;
- }
- }
-
- if (connp->conn_recvslla && !(flags & IP_FF_SEND_SLLA))
- in_flags |= IPF_RECVSLLA;
+ if (ira->ira_flags & IRAF_ICMP_ERROR) {
+ (connp->conn_recvicmp)(connp, mp, NULL, ira);
+ } else {
+ ill_t *rill = ira->ira_rill;
- /*
- * Initiate IPPF processing here, if needed. Note first_mp won't be
- * freed if the packet is dropped. The caller will do so.
- */
- if (IPP_ENABLED(IPP_LOCAL_IN, ipst) && ip_policy) {
- ill_index = recv_ill->ill_phyint->phyint_ifindex;
- ip_process(IPP_LOCAL_IN, &mp, ill_index);
- if (mp == NULL) {
- return;
- }
- }
- if ((in_flags != 0) &&
- (mp->b_datap->db_type != M_CTL)) {
- /*
- * The actual data will be contained in b_cont
- * upon successful return of the following call
- * else original mblk is returned
- */
- ASSERT(recv_ill != NULL);
- mp = ip_add_info(mp, recv_ill, in_flags, IPCL_ZONEID(connp),
- ipst);
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
+ ira->ira_ill = ira->ira_rill = NULL;
+ /* Send it upstream */
+ (connp->conn_recv)(connp, mp, NULL, ira);
+ ira->ira_ill = ill;
+ ira->ira_rill = rill;
}
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
- /* Send it upstream */
- (connp->conn_recv)(connp, mp, NULL);
}
/*
- * Fanout for UDP packets.
- * The caller puts <fport, lport> in the ports parameter.
+ * Fanout for UDP packets that are multicast or broadcast, and ICMP errors.
+ * (Unicast fanout is handled in ip_input_v4.)
*
* If SO_REUSEADDR is set all multicast and broadcast packets
- * will be delivered to all streams bound to the same port.
+ * will be delivered to all conns bound to the same port.
*
- * Zones notes:
- * Multicast and broadcast packets will be distributed to streams in all zones.
+ * If there is at least one matching AF_INET receiver, then we will
+ * ignore any AF_INET6 receivers.
* In the special case where an AF_INET socket binds to 0.0.0.0/<port> and an
* AF_INET6 socket binds to ::/<port>, only the AF_INET socket receives the IPv4
- * packets. To maintain this behavior with multiple zones, the conns are grouped
- * by zone and the SO_REUSEADDR flag is checked for the first matching conn in
- * each zone. If unset, all the following conns in the same zone are skipped.
+ * packets.
+ *
+ * Zones notes:
+ * Earlier in ip_input on a system with multiple shared-IP zones we
+ * duplicate the multicast and broadcast packets and send them up
+ * with each explicit zoneid that exists on that ill.
+ * This means that here we can match the zoneid with SO_ALLZONES being special.
*/
-static void
-ip_fanout_udp(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha,
- uint32_t ports, boolean_t broadcast, uint_t flags, boolean_t mctl_present,
- boolean_t ip_policy, ill_t *recv_ill, zoneid_t zoneid)
+void
+ip_fanout_udp_multi_v4(mblk_t *mp, ipha_t *ipha, uint16_t lport, uint16_t fport,
+ ip_recv_attr_t *ira)
{
- uint32_t dstport, srcport;
- ipaddr_t dst;
- mblk_t *first_mp;
- boolean_t secure;
- in6_addr_t v6src;
+ ipaddr_t laddr;
+ in6_addr_t v6faddr;
conn_t *connp;
connf_t *connfp;
- conn_t *first_connp;
- conn_t *next_connp;
- mblk_t *mp1, *first_mp1;
- ipaddr_t src;
- zoneid_t last_zoneid;
- boolean_t reuseaddr;
- boolean_t shared_addr;
- boolean_t unlabeled;
- ip_stack_t *ipst;
-
- ASSERT(recv_ill != NULL);
- ipst = recv_ill->ill_ipst;
-
- first_mp = mp;
- if (mctl_present) {
- mp = first_mp->b_cont;
- first_mp->b_cont = NULL;
- secure = ipsec_in_is_secure(first_mp);
- ASSERT(mp != NULL);
- } else {
- first_mp = NULL;
- secure = B_FALSE;
- }
+ ipaddr_t faddr;
+ ill_t *ill = ira->ira_ill;
+ ip_stack_t *ipst = ill->ill_ipst;
- /* Extract ports in net byte order */
- dstport = htons(ntohl(ports) & 0xFFFF);
- srcport = htons(ntohl(ports) >> 16);
- dst = ipha->ipha_dst;
- src = ipha->ipha_src;
+ ASSERT(ira->ira_flags & (IRAF_MULTIBROADCAST|IRAF_ICMP_ERROR));
- unlabeled = B_FALSE;
- if (is_system_labeled())
- /* Cred cannot be null on IPv4 */
- unlabeled = (msg_getlabel(mp)->tsl_flags &
- TSLF_UNLABELED) != 0;
- shared_addr = (zoneid == ALL_ZONES);
- if (shared_addr) {
- /*
- * No need to handle exclusive-stack zones since ALL_ZONES
- * only applies to the shared stack.
- */
- zoneid = tsol_mlp_findzone(IPPROTO_UDP, dstport);
- /*
- * If no shared MLP is found, tsol_mlp_findzone returns
- * ALL_ZONES. In that case, we assume it's SLP, and
- * search for the zone based on the packet label.
- *
- * If there is such a zone, we prefer to find a
- * connection in it. Otherwise, we look for a
- * MAC-exempt connection in any zone whose label
- * dominates the default label on the packet.
- */
- if (zoneid == ALL_ZONES)
- zoneid = tsol_packet_to_zoneid(mp);
- else
- unlabeled = B_FALSE;
- }
+ laddr = ipha->ipha_dst;
+ faddr = ipha->ipha_src;
- connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(dstport, ipst)];
+ connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
mutex_enter(&connfp->connf_lock);
connp = connfp->connf_head;
- if (!broadcast && !CLASSD(dst)) {
- /*
- * Not broadcast or multicast. Send to the one (first)
- * client we find. No need to check conn_wantpacket()
- * since IP_BOUND_IF/conn_incoming_ill does not apply to
- * IPv4 unicast packets.
- */
- while ((connp != NULL) &&
- (!IPCL_UDP_MATCH(connp, dstport, dst, srcport, src) ||
- (!IPCL_ZONE_MATCH(connp, zoneid) &&
- !(unlabeled && (connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
- shared_addr)))) {
- /*
- * We keep searching since the conn did not match,
- * or its zone did not match and it is not either
- * an allzones conn or a mac exempt conn (if the
- * sender is unlabeled.)
- */
- connp = connp->conn_next;
- }
-
- if (connp == NULL ||
- !IPCL_IS_NONSTR(connp) && connp->conn_upq == NULL)
- goto notfound;
-
- ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_upq != NULL);
-
- if (is_system_labeled() &&
- !tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr,
- connp))
- goto notfound;
-
- CONN_INC_REF(connp);
- mutex_exit(&connfp->connf_lock);
- ip_fanout_udp_conn(connp, first_mp, mp, secure, ill, ipha,
- flags, recv_ill, ip_policy);
- IP_STAT(ipst, ip_udp_fannorm);
- CONN_DEC_REF(connp);
- return;
- }
/*
- * Broadcast and multicast case
- *
- * Need to check conn_wantpacket().
* If SO_REUSEADDR has been set on the first we send the
* packet to all clients that have joined the group and
* match the port.
*/
-
while (connp != NULL) {
- if ((IPCL_UDP_MATCH(connp, dstport, dst, srcport, src)) &&
- conn_wantpacket(connp, ill, ipha, flags, zoneid) &&
- (!is_system_labeled() ||
- tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr,
- connp)))
+ if ((IPCL_UDP_MATCH(connp, lport, laddr, fport, faddr)) &&
+ conn_wantpacket(connp, ira, ipha) &&
+ (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
+ tsol_receive_local(mp, &laddr, IPV4_VERSION, ira, connp)))
break;
connp = connp->conn_next;
}
- if (connp == NULL ||
- !IPCL_IS_NONSTR(connp) && connp->conn_upq == NULL)
+ if (connp == NULL)
goto notfound;
- ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_upq != NULL);
+ CONN_INC_REF(connp);
- first_connp = connp;
- /*
- * When SO_REUSEADDR is not set, send the packet only to the first
- * matching connection in its zone by keeping track of the zoneid.
- */
- reuseaddr = first_connp->conn_reuseaddr;
- last_zoneid = first_connp->conn_zoneid;
+ if (connp->conn_reuseaddr) {
+ conn_t *first_connp = connp;
+ conn_t *next_connp;
+ mblk_t *mp1;
- CONN_INC_REF(connp);
- connp = connp->conn_next;
- for (;;) {
- while (connp != NULL) {
- if (IPCL_UDP_MATCH(connp, dstport, dst, srcport, src) &&
- (reuseaddr || connp->conn_zoneid != last_zoneid) &&
- conn_wantpacket(connp, ill, ipha, flags, zoneid) &&
- (!is_system_labeled() ||
- tsol_receive_local(mp, &dst, IPV4_VERSION,
- shared_addr, connp)))
+ connp = connp->conn_next;
+ for (;;) {
+ while (connp != NULL) {
+ if (IPCL_UDP_MATCH(connp, lport, laddr,
+ fport, faddr) &&
+ conn_wantpacket(connp, ira, ipha) &&
+ (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
+ tsol_receive_local(mp, &laddr, IPV4_VERSION,
+ ira, connp)))
+ break;
+ connp = connp->conn_next;
+ }
+ if (connp == NULL) {
+ /* No more interested clients */
+ connp = first_connp;
break;
- connp = connp->conn_next;
- }
- /*
- * Just copy the data part alone. The mctl part is
- * needed just for verifying policy and it is never
- * sent up.
- */
- if (connp == NULL || (((mp1 = dupmsg(mp)) == NULL) &&
- ((mp1 = copymsg(mp)) == NULL))) {
- /*
- * No more interested clients or memory
- * allocation failed
- */
- connp = first_connp;
- break;
- }
- if (connp->conn_zoneid != last_zoneid) {
- /*
- * Update the zoneid so that the packet isn't sent to
- * any more conns in the same zone unless SO_REUSEADDR
- * is set.
- */
- reuseaddr = connp->conn_reuseaddr;
- last_zoneid = connp->conn_zoneid;
- }
- if (first_mp != NULL) {
- ASSERT(((ipsec_info_t *)first_mp->b_rptr)->
- ipsec_info_type == IPSEC_IN);
- first_mp1 = ipsec_in_tag(first_mp, NULL,
- ipst->ips_netstack);
- if (first_mp1 == NULL) {
- freemsg(mp1);
+ }
+ if (((mp1 = dupmsg(mp)) == NULL) &&
+ ((mp1 = copymsg(mp)) == NULL)) {
+ /* Memory allocation failed */
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards", mp, ill);
connp = first_connp;
break;
}
- } else {
- first_mp1 = NULL;
+ CONN_INC_REF(connp);
+ mutex_exit(&connfp->connf_lock);
+
+ IP_STAT(ipst, ip_udp_fanmb);
+ ip_fanout_udp_conn(connp, mp1, (ipha_t *)mp1->b_rptr,
+ NULL, ira);
+ mutex_enter(&connfp->connf_lock);
+ /* Follow the next pointer before releasing the conn */
+ next_connp = connp->conn_next;
+ CONN_DEC_REF(connp);
+ connp = next_connp;
}
- CONN_INC_REF(connp);
- mutex_exit(&connfp->connf_lock);
- /*
- * IPQoS notes: We don't send the packet for policy
- * processing here, will do it for the last one (below).
- * i.e. we do it per-packet now, but if we do policy
- * processing per-conn, then we would need to do it
- * here too.
- */
- ip_fanout_udp_conn(connp, first_mp1, mp1, secure, ill,
- ipha, flags, recv_ill, B_FALSE);
- mutex_enter(&connfp->connf_lock);
- /* Follow the next pointer before releasing the conn. */
- next_connp = connp->conn_next;
- IP_STAT(ipst, ip_udp_fanmb);
- CONN_DEC_REF(connp);
- connp = next_connp;
}
/* Last one. Send it upstream. */
mutex_exit(&connfp->connf_lock);
- ip_fanout_udp_conn(connp, first_mp, mp, secure, ill, ipha, flags,
- recv_ill, ip_policy);
IP_STAT(ipst, ip_udp_fanmb);
+ ip_fanout_udp_conn(connp, mp, ipha, NULL, ira);
CONN_DEC_REF(connp);
return;
notfound:
-
mutex_exit(&connfp->connf_lock);
- IP_STAT(ipst, ip_udp_fanothers);
/*
- * IPv6 endpoints bound to unicast or multicast IPv4-mapped addresses
+ * IPv6 endpoints bound to multicast IPv4-mapped addresses
* have already been matched above, since they live in the IPv4
* fanout tables. This implies we only need to
* check for IPv6 in6addr_any endpoints here.
@@ -7394,85 +5474,28 @@ notfound:
* address, except for the multicast group membership lookup which
* uses the IPv4 destination.
*/
- IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src);
- connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(dstport, ipst)];
+ IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6faddr);
+ connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
mutex_enter(&connfp->connf_lock);
connp = connfp->connf_head;
- if (!broadcast && !CLASSD(dst)) {
- while (connp != NULL) {
- if (IPCL_UDP_MATCH_V6(connp, dstport, ipv6_all_zeros,
- srcport, v6src) && IPCL_ZONE_MATCH(connp, zoneid) &&
- conn_wantpacket(connp, ill, ipha, flags, zoneid) &&
- !connp->conn_ipv6_v6only)
- break;
- connp = connp->conn_next;
- }
-
- if (connp != NULL && is_system_labeled() &&
- !tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr,
- connp))
- connp = NULL;
-
- if (connp == NULL ||
- !IPCL_IS_NONSTR(connp) && connp->conn_upq == NULL) {
- /*
- * No one bound to this port. Is
- * there a client that wants all
- * unclaimed datagrams?
- */
- mutex_exit(&connfp->connf_lock);
-
- if (mctl_present)
- first_mp->b_cont = mp;
- else
- first_mp = mp;
- if (ipst->ips_ipcl_proto_fanout[IPPROTO_UDP].
- connf_head != NULL) {
- ip_fanout_proto(q, first_mp, ill, ipha,
- flags | IP_FF_RAWIP, mctl_present,
- ip_policy, recv_ill, zoneid);
- } else {
- if (ip_fanout_send_icmp(q, first_mp, flags,
- ICMP_DEST_UNREACHABLE,
- ICMP_PORT_UNREACHABLE,
- mctl_present, zoneid, ipst)) {
- BUMP_MIB(ill->ill_ip_mib,
- udpIfStatsNoPorts);
- }
- }
- return;
- }
- ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_upq != NULL);
-
- CONN_INC_REF(connp);
- mutex_exit(&connfp->connf_lock);
- ip_fanout_udp_conn(connp, first_mp, mp, secure, ill, ipha,
- flags, recv_ill, ip_policy);
- CONN_DEC_REF(connp);
- return;
- }
/*
* IPv4 multicast packet being delivered to an AF_INET6
* in6addr_any endpoint.
* Need to check conn_wantpacket(). Note that we use conn_wantpacket()
* and not conn_wantpacket_v6() since any multicast membership is
* for an IPv4-mapped multicast address.
- * The packet is sent to all clients in all zones that have joined the
- * group and match the port.
*/
while (connp != NULL) {
- if (IPCL_UDP_MATCH_V6(connp, dstport, ipv6_all_zeros,
- srcport, v6src) &&
- conn_wantpacket(connp, ill, ipha, flags, zoneid) &&
- (!is_system_labeled() ||
- tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr,
- connp)))
+ if (IPCL_UDP_MATCH_V6(connp, lport, ipv6_all_zeros,
+ fport, v6faddr) &&
+ conn_wantpacket(connp, ira, ipha) &&
+ (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
+ tsol_receive_local(mp, &laddr, IPV4_VERSION, ira, connp)))
break;
connp = connp->conn_next;
}
- if (connp == NULL ||
- !IPCL_IS_NONSTR(connp) && connp->conn_upq == NULL) {
+ if (connp == NULL) {
/*
* No one bound to this port. Is
* there a client that wants all
@@ -7480,15 +5503,10 @@ notfound:
*/
mutex_exit(&connfp->connf_lock);
- if (mctl_present)
- first_mp->b_cont = mp;
- else
- first_mp = mp;
- if (ipst->ips_ipcl_proto_fanout[IPPROTO_UDP].connf_head !=
+ if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_UDP].connf_head !=
NULL) {
- ip_fanout_proto(q, first_mp, ill, ipha,
- flags | IP_FF_RAWIP, mctl_present, ip_policy,
- recv_ill, zoneid);
+ ASSERT(ira->ira_protocol == IPPROTO_UDP);
+ ip_fanout_proto_v4(mp, ipha, ira);
} else {
/*
* We used to attempt to send an icmp error here, but
@@ -7497,102 +5515,263 @@ notfound:
* multicast, just drop the packet and give up sooner.
*/
BUMP_MIB(ill->ill_ip_mib, udpIfStatsNoPorts);
- freemsg(first_mp);
+ freemsg(mp);
}
return;
}
- ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_upq != NULL);
+ ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_rq != NULL);
- first_connp = connp;
+ /*
+	 * If SO_REUSEADDR has been set on the first matching conn, we
+	 * send the packet to all clients that have joined the group and
+	 * match the port.
+ */
+ if (connp->conn_reuseaddr) {
+ conn_t *first_connp = connp;
+ conn_t *next_connp;
+ mblk_t *mp1;
- CONN_INC_REF(connp);
- connp = connp->conn_next;
- for (;;) {
- while (connp != NULL) {
- if (IPCL_UDP_MATCH_V6(connp, dstport,
- ipv6_all_zeros, srcport, v6src) &&
- conn_wantpacket(connp, ill, ipha, flags, zoneid) &&
- (!is_system_labeled() ||
- tsol_receive_local(mp, &dst, IPV4_VERSION,
- shared_addr, connp)))
+ CONN_INC_REF(connp);
+ connp = connp->conn_next;
+ for (;;) {
+ while (connp != NULL) {
+ if (IPCL_UDP_MATCH_V6(connp, lport,
+ ipv6_all_zeros, fport, v6faddr) &&
+ conn_wantpacket(connp, ira, ipha) &&
+ (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
+ tsol_receive_local(mp, &laddr, IPV4_VERSION,
+ ira, connp)))
+ break;
+ connp = connp->conn_next;
+ }
+ if (connp == NULL) {
+ /* No more interested clients */
+ connp = first_connp;
break;
- connp = connp->conn_next;
- }
- /*
- * Just copy the data part alone. The mctl part is
- * needed just for verifying policy and it is never
- * sent up.
- */
- if (connp == NULL || (((mp1 = dupmsg(mp)) == NULL) &&
- ((mp1 = copymsg(mp)) == NULL))) {
- /*
- * No more intested clients or memory
- * allocation failed
- */
- connp = first_connp;
- break;
- }
- if (first_mp != NULL) {
- ASSERT(((ipsec_info_t *)first_mp->b_rptr)->
- ipsec_info_type == IPSEC_IN);
- first_mp1 = ipsec_in_tag(first_mp, NULL,
- ipst->ips_netstack);
- if (first_mp1 == NULL) {
- freemsg(mp1);
+ }
+ if (((mp1 = dupmsg(mp)) == NULL) &&
+ ((mp1 = copymsg(mp)) == NULL)) {
+ /* Memory allocation failed */
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards", mp, ill);
connp = first_connp;
break;
}
- } else {
- first_mp1 = NULL;
+ CONN_INC_REF(connp);
+ mutex_exit(&connfp->connf_lock);
+
+ IP_STAT(ipst, ip_udp_fanmb);
+ ip_fanout_udp_conn(connp, mp1, (ipha_t *)mp1->b_rptr,
+ NULL, ira);
+ mutex_enter(&connfp->connf_lock);
+ /* Follow the next pointer before releasing the conn */
+ next_connp = connp->conn_next;
+ CONN_DEC_REF(connp);
+ connp = next_connp;
}
- CONN_INC_REF(connp);
- mutex_exit(&connfp->connf_lock);
- /*
- * IPQoS notes: We don't send the packet for policy
- * processing here, will do it for the last one (below).
- * i.e. we do it per-packet now, but if we do policy
- * processing per-conn, then we would need to do it
- * here too.
- */
- ip_fanout_udp_conn(connp, first_mp1, mp1, secure, ill,
- ipha, flags, recv_ill, B_FALSE);
- mutex_enter(&connfp->connf_lock);
- /* Follow the next pointer before releasing the conn. */
- next_connp = connp->conn_next;
- CONN_DEC_REF(connp);
- connp = next_connp;
}
/* Last one. Send it upstream. */
mutex_exit(&connfp->connf_lock);
- ip_fanout_udp_conn(connp, first_mp, mp, secure, ill, ipha, flags,
- recv_ill, ip_policy);
+ IP_STAT(ipst, ip_udp_fanmb);
+ ip_fanout_udp_conn(connp, mp, ipha, NULL, ira);
CONN_DEC_REF(connp);
}
/*
- * Complete the ip_wput header so that it
- * is possible to generate ICMP
- * errors.
+ * Split an incoming packet's IPv4 options into the label and the other options.
+ * If 'allocate' is set it does memory allocation for the ip_pkt_t, including
+ * clearing out any leftover label or options.
+ * Otherwise it just makes ipp point into the packet.
+ *
+ * Returns zero if ok; ENOMEM if the buffer couldn't be allocated.
*/
int
-ip_hdr_complete(ipha_t *ipha, zoneid_t zoneid, ip_stack_t *ipst)
+ip_find_hdr_v4(ipha_t *ipha, ip_pkt_t *ipp, boolean_t allocate)
{
- ire_t *ire;
+ uchar_t *opt;
+ uint32_t totallen;
+ uint32_t optval;
+ uint32_t optlen;
- if (ipha->ipha_src == INADDR_ANY) {
- ire = ire_lookup_local(zoneid, ipst);
- if (ire == NULL) {
- ip1dbg(("ip_hdr_complete: no source IRE\n"));
- return (1);
+ ipp->ipp_fields |= IPPF_HOPLIMIT | IPPF_TCLASS | IPPF_ADDR;
+ ipp->ipp_hoplimit = ipha->ipha_ttl;
+ ipp->ipp_type_of_service = ipha->ipha_type_of_service;
+ IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &ipp->ipp_addr);
+
+ /*
+	 * Get length (in 4-byte words) of IP header options.
+ */
+ totallen = ipha->ipha_version_and_hdr_length -
+ (uint8_t)((IP_VERSION << 4) + IP_SIMPLE_HDR_LENGTH_IN_WORDS);
+
+ if (totallen == 0) {
+ if (!allocate)
+ return (0);
+
+ /* Clear out anything from a previous packet */
+ if (ipp->ipp_fields & IPPF_IPV4_OPTIONS) {
+ kmem_free(ipp->ipp_ipv4_options,
+ ipp->ipp_ipv4_options_len);
+ ipp->ipp_ipv4_options = NULL;
+ ipp->ipp_ipv4_options_len = 0;
+ ipp->ipp_fields &= ~IPPF_IPV4_OPTIONS;
}
- ipha->ipha_src = ire->ire_addr;
- ire_refrele(ire);
+ if (ipp->ipp_fields & IPPF_LABEL_V4) {
+ kmem_free(ipp->ipp_label_v4, ipp->ipp_label_len_v4);
+ ipp->ipp_label_v4 = NULL;
+ ipp->ipp_label_len_v4 = 0;
+ ipp->ipp_fields &= ~IPPF_LABEL_V4;
+ }
+ return (0);
}
- ipha->ipha_ttl = ipst->ips_ip_def_ttl;
- ipha->ipha_hdr_checksum = 0;
- ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
- return (0);
+
+ totallen <<= 2;
+ opt = (uchar_t *)&ipha[1];
+ if (!is_system_labeled()) {
+
+ copyall:
+ if (!allocate) {
+ if (totallen != 0) {
+ ipp->ipp_ipv4_options = opt;
+ ipp->ipp_ipv4_options_len = totallen;
+ ipp->ipp_fields |= IPPF_IPV4_OPTIONS;
+ }
+ return (0);
+ }
+ /* Just copy all of options */
+ if (ipp->ipp_fields & IPPF_IPV4_OPTIONS) {
+ if (totallen == ipp->ipp_ipv4_options_len) {
+ bcopy(opt, ipp->ipp_ipv4_options, totallen);
+ return (0);
+ }
+ kmem_free(ipp->ipp_ipv4_options,
+ ipp->ipp_ipv4_options_len);
+ ipp->ipp_ipv4_options = NULL;
+ ipp->ipp_ipv4_options_len = 0;
+ ipp->ipp_fields &= ~IPPF_IPV4_OPTIONS;
+ }
+ if (totallen == 0)
+ return (0);
+
+ ipp->ipp_ipv4_options = kmem_alloc(totallen, KM_NOSLEEP);
+ if (ipp->ipp_ipv4_options == NULL)
+ return (ENOMEM);
+ ipp->ipp_ipv4_options_len = totallen;
+ ipp->ipp_fields |= IPPF_IPV4_OPTIONS;
+ bcopy(opt, ipp->ipp_ipv4_options, totallen);
+ return (0);
+ }
+
+ if (allocate && (ipp->ipp_fields & IPPF_LABEL_V4)) {
+ kmem_free(ipp->ipp_label_v4, ipp->ipp_label_len_v4);
+ ipp->ipp_label_v4 = NULL;
+ ipp->ipp_label_len_v4 = 0;
+ ipp->ipp_fields &= ~IPPF_LABEL_V4;
+ }
+
+ /*
+ * Search for CIPSO option.
+ * We assume CIPSO is first in options if it is present.
+	 * If it isn't, then ipp_ipv4_options will not include the options
+ * prior to the CIPSO option.
+ */
+ while (totallen != 0) {
+ switch (optval = opt[IPOPT_OPTVAL]) {
+ case IPOPT_EOL:
+ return (0);
+ case IPOPT_NOP:
+ optlen = 1;
+ break;
+ default:
+ if (totallen <= IPOPT_OLEN)
+ return (EINVAL);
+ optlen = opt[IPOPT_OLEN];
+ if (optlen < 2)
+ return (EINVAL);
+ }
+ if (optlen > totallen)
+ return (EINVAL);
+
+ switch (optval) {
+ case IPOPT_COMSEC:
+ if (!allocate) {
+ ipp->ipp_label_v4 = opt;
+ ipp->ipp_label_len_v4 = optlen;
+ ipp->ipp_fields |= IPPF_LABEL_V4;
+ } else {
+ ipp->ipp_label_v4 = kmem_alloc(optlen,
+ KM_NOSLEEP);
+ if (ipp->ipp_label_v4 == NULL)
+ return (ENOMEM);
+ ipp->ipp_label_len_v4 = optlen;
+ ipp->ipp_fields |= IPPF_LABEL_V4;
+ bcopy(opt, ipp->ipp_label_v4, optlen);
+ }
+ totallen -= optlen;
+ opt += optlen;
+
+ /* Skip padding bytes until we get to a multiple of 4 */
+ while ((totallen & 3) != 0 && opt[0] == IPOPT_NOP) {
+ totallen--;
+ opt++;
+ }
+ /* Remaining as ipp_ipv4_options */
+ goto copyall;
+ }
+ totallen -= optlen;
+ opt += optlen;
+ }
+ /* No CIPSO found; return everything as ipp_ipv4_options */
+ totallen = ipha->ipha_version_and_hdr_length -
+ (uint8_t)((IP_VERSION << 4) + IP_SIMPLE_HDR_LENGTH_IN_WORDS);
+ totallen <<= 2;
+ opt = (uchar_t *)&ipha[1];
+ goto copyall;
+}
+
+/*
+ * Efficient versions of lookup for an IRE when we only
+ * match the address.
+ * For RTF_REJECT or BLACKHOLE we return IRE_NOROUTE.
+ * Does not handle multicast addresses.
+ */
+uint_t
+ip_type_v4(ipaddr_t addr, ip_stack_t *ipst)
+{
+ ire_t *ire;
+ uint_t result;
+
+ ire = ire_ftable_lookup_simple_v4(addr, 0, ipst, NULL);
+ ASSERT(ire != NULL);
+ if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
+ result = IRE_NOROUTE;
+ else
+ result = ire->ire_type;
+ ire_refrele(ire);
+ return (result);
+}
+
+/*
+ * Efficient versions of lookup for an IRE when we only
+ * match the address.
+ * For RTF_REJECT or BLACKHOLE we return IRE_NOROUTE.
+ * Does not handle multicast addresses.
+ */
+uint_t
+ip_type_v6(const in6_addr_t *addr, ip_stack_t *ipst)
+{
+ ire_t *ire;
+ uint_t result;
+
+ ire = ire_ftable_lookup_simple_v6(addr, 0, ipst, NULL);
+ ASSERT(ire != NULL);
+ if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
+ result = IRE_NOROUTE;
+ else
+ result = ire->ire_type;
+ ire_refrele(ire);
+ return (result);
}
/*
@@ -7602,8 +5781,6 @@ ip_hdr_complete(ipha_t *ipha, zoneid_t zoneid, ip_stack_t *ipst)
static void
ip_lrput(queue_t *q, mblk_t *mp)
{
- mblk_t *mp1;
-
switch (mp->b_datap->db_type) {
case M_FLUSH:
/* Turn around */
@@ -7614,9 +5791,6 @@ ip_lrput(queue_t *q, mblk_t *mp)
}
break;
}
- /* Could receive messages that passed through ar_rput */
- for (mp1 = mp; mp1; mp1 = mp1->b_cont)
- mp1->b_prev = mp1->b_next = NULL;
freemsg(mp);
}
@@ -7631,7 +5805,7 @@ ip_lwput(queue_t *q, mblk_t *mp)
/*
* Move the first hop in any source route to ipha_dst and remove that part of
* the source route. Called by other protocols. Errors in option formatting
- * are ignored - will be handled by ip_wput_options Return the final
+ * are ignored - will be handled by ip_output_options. Return the final
* destination (either ipha_dst or the last entry in a source route.)
*/
ipaddr_t
@@ -7643,7 +5817,6 @@ ip_massage_options(ipha_t *ipha, netstack_t *ns)
uint8_t optlen;
ipaddr_t dst;
int i;
- ire_t *ire;
ip_stack_t *ipst = ns->netstack_ip;
ip2dbg(("ip_massage_options\n"));
@@ -7679,10 +5852,7 @@ ip_massage_options(ipha_t *ipha, netstack_t *ns)
* XXX verify per-interface ip_forwarding
* for source route?
*/
- ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, NULL,
- ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
- if (ire != NULL) {
- ire_refrele(ire);
+ if (ip_type_v4(dst, ipst) == IRE_LOCAL) {
off += IP_ADDR_LEN;
goto redo_srr;
}
@@ -7760,1843 +5930,41 @@ ip_net_mask(ipaddr_t addr)
return ((ipaddr_t)0);
}
-/*
- * Helper ill lookup function used by IPsec.
- */
-ill_t *
-ip_grab_ill(mblk_t *first_mp, int ifindex, boolean_t isv6, ip_stack_t *ipst)
+/* Name/Value Table Lookup Routine */
+char *
+ip_nv_lookup(nv_t *nv, int value)
{
- ill_t *ret_ill;
-
- ASSERT(ifindex != 0);
-
- ret_ill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL, NULL,
- ipst);
- if (ret_ill == NULL) {
- if (isv6) {
- BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
- ip1dbg(("ip_grab_ill (IPv6): bad ifindex %d.\n",
- ifindex));
- } else {
- BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
- ip1dbg(("ip_grab_ill (IPv4): bad ifindex %d.\n",
- ifindex));
- }
- freemsg(first_mp);
+ if (!nv)
return (NULL);
+ for (; nv->nv_name; nv++) {
+ if (nv->nv_value == value)
+ return (nv->nv_name);
}
- return (ret_ill);
-}
-
-/*
- * IPv4 -
- * ip_newroute is called by ip_rput or ip_wput whenever we need to send
- * out a packet to a destination address for which we do not have specific
- * (or sufficient) routing information.
- *
- * NOTE : These are the scopes of some of the variables that point at IRE,
- * which needs to be followed while making any future modifications
- * to avoid memory leaks.
- *
- * - ire and sire are the entries looked up initially by
- * ire_ftable_lookup.
- * - ipif_ire is used to hold the interface ire associated with
- * the new cache ire. But it's scope is limited, so we always REFRELE
- * it before branching out to error paths.
- * - save_ire is initialized before ire_create, so that ire returned
- * by ire_create will not over-write the ire. We REFRELE save_ire
- * before breaking out of the switch.
- *
- * Thus on failures, we have to REFRELE only ire and sire, if they
- * are not NULL.
- */
-void
-ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp,
- zoneid_t zoneid, ip_stack_t *ipst)
-{
- areq_t *areq;
- ipaddr_t gw = 0;
- ire_t *ire = NULL;
- mblk_t *res_mp;
- ipaddr_t *addrp;
- ipaddr_t nexthop_addr;
- ipif_t *src_ipif = NULL;
- ill_t *dst_ill = NULL;
- ipha_t *ipha;
- ire_t *sire = NULL;
- mblk_t *first_mp;
- ire_t *save_ire;
- ushort_t ire_marks = 0;
- boolean_t mctl_present;
- ipsec_out_t *io;
- mblk_t *saved_mp;
- mblk_t *copy_mp = NULL;
- mblk_t *xmit_mp = NULL;
- ipaddr_t save_dst;
- uint32_t multirt_flags =
- MULTIRT_CACHEGW | MULTIRT_USESTAMP | MULTIRT_SETSTAMP;
- boolean_t multirt_is_resolvable;
- boolean_t multirt_resolve_next;
- boolean_t unspec_src;
- boolean_t ip_nexthop = B_FALSE;
- tsol_ire_gw_secattr_t *attrp = NULL;
- tsol_gcgrp_t *gcgrp = NULL;
- tsol_gcgrp_addr_t ga;
- int multirt_res_failures = 0;
- int multirt_res_attempts = 0;
- int multirt_already_resolved = 0;
- boolean_t multirt_no_icmp_error = B_FALSE;
-
- if (ip_debug > 2) {
- /* ip1dbg */
- pr_addr_dbg("ip_newroute: dst %s\n", AF_INET, &dst);
- }
-
- EXTRACT_PKT_MP(mp, first_mp, mctl_present);
- if (mctl_present) {
- io = (ipsec_out_t *)first_mp->b_rptr;
- ASSERT(io->ipsec_out_type == IPSEC_OUT);
- ASSERT(zoneid == io->ipsec_out_zoneid);
- ASSERT(zoneid != ALL_ZONES);
- }
-
- ipha = (ipha_t *)mp->b_rptr;
-
- /* All multicast lookups come through ip_newroute_ipif() */
- if (CLASSD(dst)) {
- ip0dbg(("ip_newroute: CLASSD 0x%x (b_prev %p, b_next %p)\n",
- ntohl(dst), (void *)mp->b_prev, (void *)mp->b_next));
- freemsg(first_mp);
- return;
- }
-
- if (mctl_present && io->ipsec_out_ip_nexthop) {
- ip_nexthop = B_TRUE;
- nexthop_addr = io->ipsec_out_nexthop_addr;
- }
- /*
- * If this IRE is created for forwarding or it is not for
- * traffic for congestion controlled protocols, mark it as temporary.
- */
- if (mp->b_prev != NULL || !IP_FLOW_CONTROLLED_ULP(ipha->ipha_protocol))
- ire_marks |= IRE_MARK_TEMPORARY;
-
- /*
- * Get what we can from ire_ftable_lookup which will follow an IRE
- * chain until it gets the most specific information available.
- * For example, we know that there is no IRE_CACHE for this dest,
- * but there may be an IRE_OFFSUBNET which specifies a gateway.
- * ire_ftable_lookup will look up the gateway, etc.
- * Otherwise, given ire_ftable_lookup algorithm, only one among routes
- * to the destination, of equal netmask length in the forward table,
- * will be recursively explored. If no information is available
- * for the final gateway of that route, we force the returned ire
- * to be equal to sire using MATCH_IRE_PARENT.
- * At least, in this case we have a starting point (in the buckets)
- * to look for other routes to the destination in the forward table.
- * This is actually used only for multirouting, where a list
- * of routes has to be processed in sequence.
- *
- * In the process of coming up with the most specific information,
- * ire_ftable_lookup may end up with an incomplete IRE_CACHE entry
- * for the gateway (i.e., one for which the ire_nce->nce_state is
- * not yet ND_REACHABLE, and is in the middle of arp resolution).
- * Two caveats when handling incomplete ire's in ip_newroute:
- * - we should be careful when accessing its ire_nce (specifically
- * the nce_res_mp) ast it might change underneath our feet, and,
- * - not all legacy code path callers are prepared to handle
- * incomplete ire's, so we should not create/add incomplete
- * ire_cache entries here. (See discussion about temporary solution
- * further below).
- *
- * In order to minimize packet dropping, and to preserve existing
- * behavior, we treat this case as if there were no IRE_CACHE for the
- * gateway, and instead use the IF_RESOLVER ire to send out
- * another request to ARP (this is achieved by passing the
- * MATCH_IRE_COMPLETE flag to ire_ftable_lookup). When the
- * arp response comes back in ip_wput_nondata, we will create
- * a per-dst ire_cache that has an ND_COMPLETE ire.
- *
- * Note that this is a temporary solution; the correct solution is
- * to create an incomplete per-dst ire_cache entry, and send the
- * packet out when the gw's nce is resolved. In order to achieve this,
- * all packet processing must have been completed prior to calling
- * ire_add_then_send. Some legacy code paths (e.g. cgtp) would need
- * to be modified to accomodate this solution.
- */
- if (ip_nexthop) {
- /*
- * The first time we come here, we look for an IRE_INTERFACE
- * entry for the specified nexthop, set the dst to be the
- * nexthop address and create an IRE_CACHE entry for the
- * nexthop. The next time around, we are able to find an
- * IRE_CACHE entry for the nexthop, set the gateway to be the
- * nexthop address and create an IRE_CACHE entry for the
- * destination address via the specified nexthop.
- */
- ire = ire_cache_lookup(nexthop_addr, zoneid,
- msg_getlabel(mp), ipst);
- if (ire != NULL) {
- gw = nexthop_addr;
- ire_marks |= IRE_MARK_PRIVATE_ADDR;
- } else {
- ire = ire_ftable_lookup(nexthop_addr, 0, 0,
- IRE_INTERFACE, NULL, NULL, zoneid, 0,
- msg_getlabel(mp),
- MATCH_IRE_TYPE | MATCH_IRE_SECATTR,
- ipst);
- if (ire != NULL) {
- dst = nexthop_addr;
- }
- }
- } else {
- ire = ire_ftable_lookup(dst, 0, 0, 0,
- NULL, &sire, zoneid, 0, msg_getlabel(mp),
- MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT |
- MATCH_IRE_RJ_BHOLE | MATCH_IRE_PARENT |
- MATCH_IRE_SECATTR | MATCH_IRE_COMPLETE,
- ipst);
- }
-
- ip3dbg(("ip_newroute: ire_ftable_lookup() "
- "returned ire %p, sire %p\n", (void *)ire, (void *)sire));
-
- /*
- * This loop is run only once in most cases.
- * We loop to resolve further routes only when the destination
- * can be reached through multiple RTF_MULTIRT-flagged ires.
- */
- do {
- /* Clear the previous iteration's values */
- if (src_ipif != NULL) {
- ipif_refrele(src_ipif);
- src_ipif = NULL;
- }
- if (dst_ill != NULL) {
- ill_refrele(dst_ill);
- dst_ill = NULL;
- }
-
- multirt_resolve_next = B_FALSE;
- /*
- * We check if packets have to be multirouted.
- * In this case, given the current <ire, sire> couple,
- * we look for the next suitable <ire, sire>.
- * This check is done in ire_multirt_lookup(),
- * which applies various criteria to find the next route
- * to resolve. ire_multirt_lookup() leaves <ire, sire>
- * unchanged if it detects it has not been tried yet.
- */
- if ((sire != NULL) && (sire->ire_flags & RTF_MULTIRT)) {
- ip3dbg(("ip_newroute: starting next_resolution "
- "with first_mp %p, tag %d\n",
- (void *)first_mp,
- MULTIRT_DEBUG_TAGGED(first_mp)));
-
- ASSERT(sire != NULL);
- multirt_is_resolvable =
- ire_multirt_lookup(&ire, &sire, multirt_flags,
- &multirt_already_resolved, msg_getlabel(mp), ipst);
-
- ip3dbg(("ip_newroute: multirt_is_resolvable %d, "
- "multirt_already_resolved %d, "
- "multirt_res_attempts %d, multirt_res_failures %d, "
- "ire %p, sire %p\n", multirt_is_resolvable,
- multirt_already_resolved, multirt_res_attempts,
- multirt_res_failures, (void *)ire, (void *)sire));
-
- if (!multirt_is_resolvable) {
- /*
- * No more multirt route to resolve; give up
- * (all routes resolved or no more
- * resolvable routes).
- */
- if (ire != NULL) {
- ire_refrele(ire);
- ire = NULL;
- }
- /*
- * Generate ICMP error only if all attempts to
- * resolve multirt route failed and there is no
- * already resolved one. Don't generate ICMP
- * error when:
- *
- * 1) there was no attempt to resolve
- * 2) at least one attempt passed
- * 3) a multirt route is already resolved
- *
- * Case 1) may occur due to multiple
- * resolution attempts during single
- * ip_multirt_resolution_interval.
- *
- * Case 2-3) means that CGTP destination is
- * reachable via one link so we don't want to
- * generate ICMP host unreachable error.
- */
- if (multirt_res_attempts == 0 ||
- multirt_res_failures <
- multirt_res_attempts ||
- multirt_already_resolved > 0)
- multirt_no_icmp_error = B_TRUE;
- } else {
- ASSERT(sire != NULL);
- ASSERT(ire != NULL);
-
- multirt_res_attempts++;
- }
- }
-
- if (ire == NULL) {
- if (ip_debug > 3) {
- /* ip2dbg */
- pr_addr_dbg("ip_newroute: "
- "can't resolve %s\n", AF_INET, &dst);
- }
- ip3dbg(("ip_newroute: "
- "ire %p, sire %p, multirt_no_icmp_error %d\n",
- (void *)ire, (void *)sire,
- (int)multirt_no_icmp_error));
-
- if (sire != NULL) {
- ire_refrele(sire);
- sire = NULL;
- }
-
- if (multirt_no_icmp_error) {
- /* There is no need to report an ICMP error. */
- MULTIRT_DEBUG_UNTAG(first_mp);
- freemsg(first_mp);
- return;
- }
- ip_rts_change(RTM_MISS, dst, 0, 0, 0, 0, 0, 0,
- RTA_DST, ipst);
- goto icmp_err_ret;
- }
-
- /*
- * Verify that the returned IRE does not have either
- * the RTF_REJECT or RTF_BLACKHOLE flags set and that the IRE is
- * either an IRE_CACHE, IRE_IF_NORESOLVER or IRE_IF_RESOLVER.
- */
- if ((ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) ||
- (ire->ire_type & (IRE_CACHE | IRE_INTERFACE)) == 0) {
- goto icmp_err_ret;
- }
- /*
- * Increment the ire_ob_pkt_count field for ire if it is an
- * INTERFACE (IF_RESOLVER or IF_NORESOLVER) IRE type, and
- * increment the same for the parent IRE, sire, if it is some
- * sort of prefix IRE (which includes DEFAULT, PREFIX, and HOST)
- */
- if ((ire->ire_type & IRE_INTERFACE) != 0) {
- UPDATE_OB_PKT_COUNT(ire);
- ire->ire_last_used_time = lbolt;
- }
-
- if (sire != NULL) {
- gw = sire->ire_gateway_addr;
- ASSERT((sire->ire_type & (IRE_CACHETABLE |
- IRE_INTERFACE)) == 0);
- UPDATE_OB_PKT_COUNT(sire);
- sire->ire_last_used_time = lbolt;
- }
- /*
- * We have a route to reach the destination. Find the
- * appropriate ill, then get a source address using
- * ipif_select_source().
- *
- * If we are here trying to create an IRE_CACHE for an offlink
- * destination and have an IRE_CACHE entry for VNI, then use
- * ire_stq instead since VNI's queue is a black hole.
- */
- if ((ire->ire_type == IRE_CACHE) &&
- IS_VNI(ire->ire_ipif->ipif_ill)) {
- dst_ill = ire->ire_stq->q_ptr;
- ill_refhold(dst_ill);
- } else {
- ill_t *ill = ire->ire_ipif->ipif_ill;
-
- if (IS_IPMP(ill)) {
- dst_ill =
- ipmp_illgrp_hold_next_ill(ill->ill_grp);
- } else {
- dst_ill = ill;
- ill_refhold(dst_ill);
- }
- }
-
- if (dst_ill == NULL) {
- if (ip_debug > 2) {
- pr_addr_dbg("ip_newroute: no dst "
- "ill for dst %s\n", AF_INET, &dst);
- }
- goto icmp_err_ret;
- }
- ip2dbg(("ip_newroute: dst_ill %s\n", dst_ill->ill_name));
-
- /*
- * Pick the best source address from dst_ill.
- *
- * 1) Try to pick the source address from the destination
- * route. Clustering assumes that when we have multiple
- * prefixes hosted on an interface, the prefix of the
- * source address matches the prefix of the destination
- * route. We do this only if the address is not
- * DEPRECATED.
- *
- * 2) If the conn is in a different zone than the ire, we
- * need to pick a source address from the right zone.
- */
- ASSERT(src_ipif == NULL);
- if ((sire != NULL) && (sire->ire_flags & RTF_SETSRC)) {
- /*
- * The RTF_SETSRC flag is set in the parent ire (sire).
- * Check that the ipif matching the requested source
- * address still exists.
- */
- src_ipif = ipif_lookup_addr(sire->ire_src_addr, NULL,
- zoneid, NULL, NULL, NULL, NULL, ipst);
- }
-
- unspec_src = (connp != NULL && connp->conn_unspec_src);
-
- if (src_ipif == NULL &&
- (!unspec_src || ipha->ipha_src != INADDR_ANY)) {
- ire_marks |= IRE_MARK_USESRC_CHECK;
- if (!IS_UNDER_IPMP(ire->ire_ipif->ipif_ill) &&
- IS_IPMP(ire->ire_ipif->ipif_ill) ||
- (ire->ire_ipif->ipif_flags & IPIF_DEPRECATED) ||
- (connp != NULL && ire->ire_zoneid != zoneid &&
- ire->ire_zoneid != ALL_ZONES) ||
- (dst_ill->ill_usesrc_ifindex != 0)) {
- /*
- * If the destination is reachable via a
- * given gateway, the selected source address
- * should be in the same subnet as the gateway.
- * Otherwise, the destination is not reachable.
- *
- * If there are no interfaces on the same subnet
- * as the destination, ipif_select_source gives
- * first non-deprecated interface which might be
- * on a different subnet than the gateway.
- * This is not desirable. Hence pass the dst_ire
- * source address to ipif_select_source.
- * It is sure that the destination is reachable
- * with the dst_ire source address subnet.
- * So passing dst_ire source address to
- * ipif_select_source will make sure that the
- * selected source will be on the same subnet
- * as dst_ire source address.
- */
- ipaddr_t saddr = ire->ire_ipif->ipif_src_addr;
-
- src_ipif = ipif_select_source(dst_ill, saddr,
- zoneid);
- if (src_ipif == NULL) {
- /*
- * In the case of multirouting, it may
- * happen that ipif_select_source fails
- * as DAD may disallow use of the
- * particular source interface. Anyway,
- * we need to continue and attempt to
- * resolve other multirt routes.
- */
- if ((sire != NULL) &&
- (sire->ire_flags & RTF_MULTIRT)) {
- ire_refrele(ire);
- ire = NULL;
- multirt_resolve_next = B_TRUE;
- multirt_res_failures++;
- continue;
- }
-
- if (ip_debug > 2) {
- pr_addr_dbg("ip_newroute: "
- "no src for dst %s ",
- AF_INET, &dst);
- printf("on interface %s\n",
- dst_ill->ill_name);
- }
- goto icmp_err_ret;
- }
- } else {
- src_ipif = ire->ire_ipif;
- ASSERT(src_ipif != NULL);
- /* hold src_ipif for uniformity */
- ipif_refhold(src_ipif);
- }
- }
-
- /*
- * Assign a source address while we have the conn.
- * We can't have ip_wput_ire pick a source address when the
- * packet returns from arp since we need to look at
- * conn_unspec_src and conn_zoneid, and we lose the conn when
- * going through arp.
- *
- * NOTE : ip_newroute_v6 does not have this piece of code as
- * it uses ip6i to store this information.
- */
- if (ipha->ipha_src == INADDR_ANY && !unspec_src)
- ipha->ipha_src = src_ipif->ipif_src_addr;
-
- if (ip_debug > 3) {
- /* ip2dbg */
- pr_addr_dbg("ip_newroute: first hop %s\n",
- AF_INET, &gw);
- }
- ip2dbg(("\tire type %s (%d)\n",
- ip_nv_lookup(ire_nv_tbl, ire->ire_type), ire->ire_type));
-
- /*
- * The TTL of multirouted packets is bounded by the
- * ip_multirt_ttl ndd variable.
- */
- if ((sire != NULL) && (sire->ire_flags & RTF_MULTIRT)) {
- /* Force TTL of multirouted packets */
- if ((ipst->ips_ip_multirt_ttl > 0) &&
- (ipha->ipha_ttl > ipst->ips_ip_multirt_ttl)) {
- ip2dbg(("ip_newroute: forcing multirt TTL "
- "to %d (was %d), dst 0x%08x\n",
- ipst->ips_ip_multirt_ttl, ipha->ipha_ttl,
- ntohl(sire->ire_addr)));
- ipha->ipha_ttl = ipst->ips_ip_multirt_ttl;
- }
- }
- /*
- * At this point in ip_newroute(), ire is either the
- * IRE_CACHE of the next-hop gateway for an off-subnet
- * destination or an IRE_INTERFACE type that should be used
- * to resolve an on-subnet destination or an on-subnet
- * next-hop gateway.
- *
- * In the IRE_CACHE case, we have the following :
- *
- * 1) src_ipif - used for getting a source address.
- *
- * 2) dst_ill - from which we derive ire_stq/ire_rfq. This
- * means packets using this IRE_CACHE will go out on
- * dst_ill.
- *
- * 3) The IRE sire will point to the prefix that is the
- * longest matching route for the destination. These
- * prefix types include IRE_DEFAULT, IRE_PREFIX, IRE_HOST.
- *
- * The newly created IRE_CACHE entry for the off-subnet
- * destination is tied to both the prefix route and the
- * interface route used to resolve the next-hop gateway
- * via the ire_phandle and ire_ihandle fields,
- * respectively.
- *
- * In the IRE_INTERFACE case, we have the following :
- *
- * 1) src_ipif - used for getting a source address.
- *
- * 2) dst_ill - from which we derive ire_stq/ire_rfq. This
- * means packets using the IRE_CACHE that we will build
- * here will go out on dst_ill.
- *
- * 3) sire may or may not be NULL. But, the IRE_CACHE that is
- * to be created will only be tied to the IRE_INTERFACE
- * that was derived from the ire_ihandle field.
- *
- * If sire is non-NULL, it means the destination is
- * off-link and we will first create the IRE_CACHE for the
- * gateway. Next time through ip_newroute, we will create
- * the IRE_CACHE for the final destination as described
- * above.
- *
- * In both cases, after the current resolution has been
- * completed (or possibly initialised, in the IRE_INTERFACE
- * case), the loop may be re-entered to attempt the resolution
- * of another RTF_MULTIRT route.
- *
- * When an IRE_CACHE entry for the off-subnet destination is
- * created, RTF_SETSRC and RTF_MULTIRT are inherited from sire,
- * for further processing in emission loops.
- */
- save_ire = ire;
- switch (ire->ire_type) {
- case IRE_CACHE: {
- ire_t *ipif_ire;
-
- ASSERT(save_ire->ire_nce->nce_state == ND_REACHABLE);
- if (gw == 0)
- gw = ire->ire_gateway_addr;
- /*
- * We need 3 ire's to create a new cache ire for an
- * off-link destination from the cache ire of the
- * gateway.
- *
- * 1. The prefix ire 'sire' (Note that this does
- * not apply to the conn_nexthop_set case)
- * 2. The cache ire of the gateway 'ire'
- * 3. The interface ire 'ipif_ire'
- *
- * We have (1) and (2). We lookup (3) below.
- *
- * If there is no interface route to the gateway,
- * it is a race condition, where we found the cache
- * but the interface route has been deleted.
- */
- if (ip_nexthop) {
- ipif_ire = ire_ihandle_lookup_onlink(ire);
- } else {
- ipif_ire =
- ire_ihandle_lookup_offlink(ire, sire);
- }
- if (ipif_ire == NULL) {
- ip1dbg(("ip_newroute: "
- "ire_ihandle_lookup_offlink failed\n"));
- goto icmp_err_ret;
- }
-
- /*
- * Check cached gateway IRE for any security
- * attributes; if found, associate the gateway
- * credentials group to the destination IRE.
- */
- if ((attrp = save_ire->ire_gw_secattr) != NULL) {
- mutex_enter(&attrp->igsa_lock);
- if ((gcgrp = attrp->igsa_gcgrp) != NULL)
- GCGRP_REFHOLD(gcgrp);
- mutex_exit(&attrp->igsa_lock);
- }
-
- /*
- * XXX For the source of the resolver mp,
- * we are using the same DL_UNITDATA_REQ
- * (from save_ire->ire_nce->nce_res_mp)
- * though the save_ire is not pointing at the same ill.
- * This is incorrect. We need to send it up to the
- * resolver to get the right res_mp. For ethernets
- * this may be okay (ill_type == DL_ETHER).
- */
-
- ire = ire_create(
- (uchar_t *)&dst, /* dest address */
- (uchar_t *)&ip_g_all_ones, /* mask */
- (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */
- (uchar_t *)&gw, /* gateway address */
- &save_ire->ire_max_frag,
- save_ire->ire_nce, /* src nce */
- dst_ill->ill_rq, /* recv-from queue */
- dst_ill->ill_wq, /* send-to queue */
- IRE_CACHE, /* IRE type */
- src_ipif,
- (sire != NULL) ?
- sire->ire_mask : 0, /* Parent mask */
- (sire != NULL) ?
- sire->ire_phandle : 0, /* Parent handle */
- ipif_ire->ire_ihandle, /* Interface handle */
- (sire != NULL) ? (sire->ire_flags &
- (RTF_SETSRC | RTF_MULTIRT)) : 0, /* flags */
- (sire != NULL) ?
- &(sire->ire_uinfo) : &(save_ire->ire_uinfo),
- NULL,
- gcgrp,
- ipst);
-
- if (ire == NULL) {
- if (gcgrp != NULL) {
- GCGRP_REFRELE(gcgrp);
- gcgrp = NULL;
- }
- ire_refrele(ipif_ire);
- ire_refrele(save_ire);
- break;
- }
-
- /* reference now held by IRE */
- gcgrp = NULL;
-
- ire->ire_marks |= ire_marks;
-
- /*
- * Prevent sire and ipif_ire from getting deleted.
- * The newly created ire is tied to both of them via
- * the phandle and ihandle respectively.
- */
- if (sire != NULL) {
- IRB_REFHOLD(sire->ire_bucket);
- /* Has it been removed already ? */
- if (sire->ire_marks & IRE_MARK_CONDEMNED) {
- IRB_REFRELE(sire->ire_bucket);
- ire_refrele(ipif_ire);
- ire_refrele(save_ire);
- break;
- }
- }
-
- IRB_REFHOLD(ipif_ire->ire_bucket);
- /* Has it been removed already ? */
- if (ipif_ire->ire_marks & IRE_MARK_CONDEMNED) {
- IRB_REFRELE(ipif_ire->ire_bucket);
- if (sire != NULL)
- IRB_REFRELE(sire->ire_bucket);
- ire_refrele(ipif_ire);
- ire_refrele(save_ire);
- break;
- }
-
- xmit_mp = first_mp;
- /*
- * In the case of multirouting, a copy
- * of the packet is done before its sending.
- * The copy is used to attempt another
- * route resolution, in a next loop.
- */
- if (ire->ire_flags & RTF_MULTIRT) {
- copy_mp = copymsg(first_mp);
- if (copy_mp != NULL) {
- xmit_mp = copy_mp;
- MULTIRT_DEBUG_TAG(first_mp);
- }
- }
-
- ire_add_then_send(q, ire, xmit_mp);
- ire_refrele(save_ire);
-
- /* Assert that sire is not deleted yet. */
- if (sire != NULL) {
- ASSERT(sire->ire_ptpn != NULL);
- IRB_REFRELE(sire->ire_bucket);
- }
-
- /* Assert that ipif_ire is not deleted yet. */
- ASSERT(ipif_ire->ire_ptpn != NULL);
- IRB_REFRELE(ipif_ire->ire_bucket);
- ire_refrele(ipif_ire);
-
- /*
- * If copy_mp is not NULL, multirouting was
- * requested. We loop to initiate a next
- * route resolution attempt, starting from sire.
- */
- if (copy_mp != NULL) {
- /*
- * Search for the next unresolved
- * multirt route.
- */
- copy_mp = NULL;
- ipif_ire = NULL;
- ire = NULL;
- multirt_resolve_next = B_TRUE;
- continue;
- }
- if (sire != NULL)
- ire_refrele(sire);
- ipif_refrele(src_ipif);
- ill_refrele(dst_ill);
- return;
- }
- case IRE_IF_NORESOLVER: {
- if (dst_ill->ill_resolver_mp == NULL) {
- ip1dbg(("ip_newroute: dst_ill %p "
- "for IRE_IF_NORESOLVER ire %p has "
- "no ill_resolver_mp\n",
- (void *)dst_ill, (void *)ire));
- break;
- }
-
- /*
- * TSol note: We are creating the ire cache for the
- * destination 'dst'. If 'dst' is offlink, going
- * through the first hop 'gw', the security attributes
- * of 'dst' must be set to point to the gateway
- * credentials of gateway 'gw'. If 'dst' is onlink, it
- * is possible that 'dst' is a potential gateway that is
- * referenced by some route that has some security
- * attributes. Thus in the former case, we need to do a
- * gcgrp_lookup of 'gw' while in the latter case we
- * need to do gcgrp_lookup of 'dst' itself.
- */
- ga.ga_af = AF_INET;
- IN6_IPADDR_TO_V4MAPPED(gw != INADDR_ANY ? gw : dst,
- &ga.ga_addr);
- gcgrp = gcgrp_lookup(&ga, B_FALSE);
-
- ire = ire_create(
- (uchar_t *)&dst, /* dest address */
- (uchar_t *)&ip_g_all_ones, /* mask */
- (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */
- (uchar_t *)&gw, /* gateway address */
- &save_ire->ire_max_frag,
- NULL, /* no src nce */
- dst_ill->ill_rq, /* recv-from queue */
- dst_ill->ill_wq, /* send-to queue */
- IRE_CACHE,
- src_ipif,
- save_ire->ire_mask, /* Parent mask */
- (sire != NULL) ? /* Parent handle */
- sire->ire_phandle : 0,
- save_ire->ire_ihandle, /* Interface handle */
- (sire != NULL) ? sire->ire_flags &
- (RTF_SETSRC | RTF_MULTIRT) : 0, /* flags */
- &(save_ire->ire_uinfo),
- NULL,
- gcgrp,
- ipst);
-
- if (ire == NULL) {
- if (gcgrp != NULL) {
- GCGRP_REFRELE(gcgrp);
- gcgrp = NULL;
- }
- ire_refrele(save_ire);
- break;
- }
-
- /* reference now held by IRE */
- gcgrp = NULL;
-
- ire->ire_marks |= ire_marks;
-
- /* Prevent save_ire from getting deleted */
- IRB_REFHOLD(save_ire->ire_bucket);
- /* Has it been removed already ? */
- if (save_ire->ire_marks & IRE_MARK_CONDEMNED) {
- IRB_REFRELE(save_ire->ire_bucket);
- ire_refrele(save_ire);
- break;
- }
-
- /*
- * In the case of multirouting, a copy
- * of the packet is made before it is sent.
- * The copy is used in the next
- * loop to attempt another resolution.
- */
- xmit_mp = first_mp;
- if ((sire != NULL) &&
- (sire->ire_flags & RTF_MULTIRT)) {
- copy_mp = copymsg(first_mp);
- if (copy_mp != NULL) {
- xmit_mp = copy_mp;
- MULTIRT_DEBUG_TAG(first_mp);
- }
- }
- ire_add_then_send(q, ire, xmit_mp);
-
- /* Assert that it is not deleted yet. */
- ASSERT(save_ire->ire_ptpn != NULL);
- IRB_REFRELE(save_ire->ire_bucket);
- ire_refrele(save_ire);
-
- if (copy_mp != NULL) {
- /*
- * If we found a (no)resolver, we ignore any
- * trailing top priority IRE_CACHE in further
- * loops. This ensures that we do not omit any
- * (no)resolver.
- * This IRE_CACHE, if any, will be processed
- * by another thread entering ip_newroute().
- * IRE_CACHE entries, if any, will be processed
- * by another thread entering ip_newroute(),
- * (upon resolver response, for instance).
- * This aims to force parallel multirt
- * resolutions as soon as a packet must be sent.
- * In the best case, after the tx of only one
- * packet, all reachable routes are resolved.
- * Otherwise, the resolution of all RTF_MULTIRT
- * routes would require several emissions.
- */
- multirt_flags &= ~MULTIRT_CACHEGW;
-
- /*
- * Search for the next unresolved multirt
- * route.
- */
- copy_mp = NULL;
- save_ire = NULL;
- ire = NULL;
- multirt_resolve_next = B_TRUE;
- continue;
- }
-
- /*
- * Don't need sire anymore
- */
- if (sire != NULL)
- ire_refrele(sire);
-
- ipif_refrele(src_ipif);
- ill_refrele(dst_ill);
- return;
- }
- case IRE_IF_RESOLVER:
- /*
- * We can't build an IRE_CACHE yet, but at least we
- * found a resolver that can help.
- */
- res_mp = dst_ill->ill_resolver_mp;
- if (!OK_RESOLVER_MP(res_mp))
- break;
-
- /*
- * To be at this point in the code with a non-zero gw
- * means that dst is reachable through a gateway that
- * we have never resolved. By changing dst to the gw
- * addr we resolve the gateway first.
- * When ire_add_then_send() tries to put the IP dg
- * to dst, it will reenter ip_newroute() at which
- * time we will find the IRE_CACHE for the gw and
- * create another IRE_CACHE in case IRE_CACHE above.
- */
- if (gw != INADDR_ANY) {
- /*
- * The source ipif that was determined above was
- * relative to the destination address, not the
- * gateway's. If src_ipif was not taken out of
- * the IRE_IF_RESOLVER entry, we'll need to call
- * ipif_select_source() again.
- */
- if (src_ipif != ire->ire_ipif) {
- ipif_refrele(src_ipif);
- src_ipif = ipif_select_source(dst_ill,
- gw, zoneid);
- /*
- * In the case of multirouting, it may
- * happen that ipif_select_source fails
- * as DAD may disallow use of the
- * particular source interface. Anyway,
- * we need to continue and attempt to
- * resolve other multirt routes.
- */
- if (src_ipif == NULL) {
- if (sire != NULL &&
- (sire->ire_flags &
- RTF_MULTIRT)) {
- ire_refrele(ire);
- ire = NULL;
- multirt_resolve_next =
- B_TRUE;
- multirt_res_failures++;
- continue;
- }
- if (ip_debug > 2) {
- pr_addr_dbg(
- "ip_newroute: no "
- "src for gw %s ",
- AF_INET, &gw);
- printf("on "
- "interface %s\n",
- dst_ill->ill_name);
- }
- goto icmp_err_ret;
- }
- }
- save_dst = dst;
- dst = gw;
- gw = INADDR_ANY;
- }
-
- /*
- * We obtain a partial IRE_CACHE which we will pass
- * along with the resolver query. When the response
- * comes back it will be there ready for us to add.
- * The ire_max_frag is atomically set under the
- * irebucket lock in ire_add_v[46].
- */
-
- ire = ire_create_mp(
- (uchar_t *)&dst, /* dest address */
- (uchar_t *)&ip_g_all_ones, /* mask */
- (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */
- (uchar_t *)&gw, /* gateway address */
- NULL, /* ire_max_frag */
- NULL, /* no src nce */
- dst_ill->ill_rq, /* recv-from queue */
- dst_ill->ill_wq, /* send-to queue */
- IRE_CACHE,
- src_ipif, /* Interface ipif */
- save_ire->ire_mask, /* Parent mask */
- 0,
- save_ire->ire_ihandle, /* Interface handle */
- 0, /* flags if any */
- &(save_ire->ire_uinfo),
- NULL,
- NULL,
- ipst);
-
- if (ire == NULL) {
- ire_refrele(save_ire);
- break;
- }
-
- if ((sire != NULL) &&
- (sire->ire_flags & RTF_MULTIRT)) {
- copy_mp = copymsg(first_mp);
- if (copy_mp != NULL)
- MULTIRT_DEBUG_TAG(copy_mp);
- }
-
- ire->ire_marks |= ire_marks;
-
- /*
- * Construct message chain for the resolver
- * of the form:
- * ARP_REQ_MBLK-->IRE_MBLK-->Packet
- * Packet could contain a IPSEC_OUT mp.
- *
- * NOTE : ire will be added later when the response
- * comes back from ARP. If the response does not
- * come back, ARP frees the packet. For this reason,
- * we can't REFHOLD the bucket of save_ire to prevent
- * deletions. We may not be able to REFRELE the bucket
- * if the response never comes back. Thus, before
- * adding the ire, ire_add_v4 will make sure that the
- * interface route does not get deleted. This is the
- * only case unlike ip_newroute_v6, ip_newroute_ipif_v6
- * where we can always prevent deletions because of
- * the synchronous nature of adding IRES i.e
- * ire_add_then_send is called after creating the IRE.
- */
- ASSERT(ire->ire_mp != NULL);
- ire->ire_mp->b_cont = first_mp;
- /* Have saved_mp handy, for cleanup if canput fails */
- saved_mp = mp;
- mp = copyb(res_mp);
- if (mp == NULL) {
- /* Prepare for cleanup */
- mp = saved_mp; /* pkt */
- ire_delete(ire); /* ire_mp */
- ire = NULL;
- ire_refrele(save_ire);
- if (copy_mp != NULL) {
- MULTIRT_DEBUG_UNTAG(copy_mp);
- freemsg(copy_mp);
- copy_mp = NULL;
- }
- break;
- }
- linkb(mp, ire->ire_mp);
-
- /*
- * Fill in the source and dest addrs for the resolver.
- * NOTE: this depends on memory layouts imposed by
- * ill_init().
- */
- areq = (areq_t *)mp->b_rptr;
- addrp = (ipaddr_t *)((char *)areq +
- areq->areq_sender_addr_offset);
- *addrp = save_ire->ire_src_addr;
-
- ire_refrele(save_ire);
- addrp = (ipaddr_t *)((char *)areq +
- areq->areq_target_addr_offset);
- *addrp = dst;
- /* Up to the resolver. */
- if (canputnext(dst_ill->ill_rq) &&
- !(dst_ill->ill_arp_closing)) {
- putnext(dst_ill->ill_rq, mp);
- ire = NULL;
- if (copy_mp != NULL) {
- /*
- * If we found a resolver, we ignore
- * any trailing top priority IRE_CACHE
- * in the further loops. This ensures
- * that we do not omit any resolver.
- * IRE_CACHE entries, if any, will be
- * processed next time we enter
- * ip_newroute().
- */
- multirt_flags &= ~MULTIRT_CACHEGW;
- /*
- * Search for the next unresolved
- * multirt route.
- */
- first_mp = copy_mp;
- copy_mp = NULL;
- /* Prepare the next resolution loop. */
- mp = first_mp;
- EXTRACT_PKT_MP(mp, first_mp,
- mctl_present);
- if (mctl_present)
- io = (ipsec_out_t *)
- first_mp->b_rptr;
- ipha = (ipha_t *)mp->b_rptr;
-
- ASSERT(sire != NULL);
-
- dst = save_dst;
- multirt_resolve_next = B_TRUE;
- continue;
- }
-
- if (sire != NULL)
- ire_refrele(sire);
-
- /*
- * The response will come back in ip_wput
- * with db_type IRE_DB_TYPE.
- */
- ipif_refrele(src_ipif);
- ill_refrele(dst_ill);
- return;
- } else {
- /* Prepare for cleanup */
- DTRACE_PROBE1(ip__newroute__drop, mblk_t *,
- mp);
- mp->b_cont = NULL;
- freeb(mp); /* areq */
- /*
- * this is an ire that is not added to the
- * cache. ire_freemblk will handle the release
- * of any resources associated with the ire.
- */
- ire_delete(ire); /* ire_mp */
- mp = saved_mp; /* pkt */
- ire = NULL;
- if (copy_mp != NULL) {
- MULTIRT_DEBUG_UNTAG(copy_mp);
- freemsg(copy_mp);
- copy_mp = NULL;
- }
- break;
- }
- default:
- break;
- }
- } while (multirt_resolve_next);
-
- ip1dbg(("ip_newroute: dropped\n"));
- /* Did this packet originate externally? */
- if (mp->b_prev) {
- mp->b_next = NULL;
- mp->b_prev = NULL;
- BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards);
- } else {
- if (dst_ill != NULL) {
- BUMP_MIB(dst_ill->ill_ip_mib, ipIfStatsOutDiscards);
- } else {
- BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
- }
- }
- ASSERT(copy_mp == NULL);
- MULTIRT_DEBUG_UNTAG(first_mp);
- freemsg(first_mp);
- if (ire != NULL)
- ire_refrele(ire);
- if (sire != NULL)
- ire_refrele(sire);
- if (src_ipif != NULL)
- ipif_refrele(src_ipif);
- if (dst_ill != NULL)
- ill_refrele(dst_ill);
- return;
-
-icmp_err_ret:
- ip1dbg(("ip_newroute: no route\n"));
- if (src_ipif != NULL)
- ipif_refrele(src_ipif);
- if (dst_ill != NULL)
- ill_refrele(dst_ill);
- if (sire != NULL)
- ire_refrele(sire);
- /* Did this packet originate externally? */
- if (mp->b_prev) {
- mp->b_next = NULL;
- mp->b_prev = NULL;
- BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInNoRoutes);
- q = WR(q);
- } else {
- /*
- * There is no outgoing ill, so just increment the
- * system MIB.
- */
- BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes);
- /*
- * Since ip_wput() isn't close to finished, we fill
- * in enough of the header for credible error reporting.
- */
- if (ip_hdr_complete(ipha, zoneid, ipst)) {
- /* Failed */
- MULTIRT_DEBUG_UNTAG(first_mp);
- freemsg(first_mp);
- if (ire != NULL)
- ire_refrele(ire);
- return;
- }
- }
-
- /*
- * At this point we will have ire only if RTF_BLACKHOLE
- * or RTF_REJECT flags are set on the IRE. It will not
- * generate ICMP_HOST_UNREACHABLE if RTF_BLACKHOLE is set.
- */
- if (ire != NULL) {
- if (ire->ire_flags & RTF_BLACKHOLE) {
- ire_refrele(ire);
- MULTIRT_DEBUG_UNTAG(first_mp);
- freemsg(first_mp);
- return;
- }
- ire_refrele(ire);
- }
- if (ip_source_routed(ipha, ipst)) {
- icmp_unreachable(q, first_mp, ICMP_SOURCE_ROUTE_FAILED,
- zoneid, ipst);
- return;
- }
- icmp_unreachable(q, first_mp, ICMP_HOST_UNREACHABLE, zoneid, ipst);
+ return ("unknown");
}
-ip_opt_info_t zero_info;
-
-/*
- * IPv4 -
- * ip_newroute_ipif is called by ip_wput_multicast and
- * ip_rput_forward_multicast whenever we need to send
- * out a packet to a destination address for which we do not have specific
- * routing information. It is used when the packet will be sent out
- * on a specific interface. It is also called by ip_wput() when IP_BOUND_IF
- * socket option is set or icmp error message wants to go out on a particular
- * interface for a unicast packet.
- *
- * In most cases, the destination address is resolved thanks to the ipif
- * intrinsic resolver. However, there are some cases where the call to
- * ip_newroute_ipif must take into account the potential presence of
- * RTF_SETSRC and/or RTF_MULITRT flags in an IRE_OFFSUBNET ire
- * that uses the interface. This is specified through flags,
- * which can be a combination of:
- * - RTF_SETSRC: if an IRE_OFFSUBNET ire exists that has the RTF_SETSRC
- * flag, the resulting ire will inherit the IRE_OFFSUBNET source address
- * and flags. Additionally, the packet source address has to be set to
- * the specified address. The caller is thus expected to set this flag
- * if the packet has no specific source address yet.
- * - RTF_MULTIRT: if an IRE_OFFSUBNET ire exists that has the RTF_MULTIRT
- * flag, the resulting ire will inherit the flag. All unresolved routes
- * to the destination must be explored in the same call to
- * ip_newroute_ipif().
- */
-static void
-ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst,
- conn_t *connp, uint32_t flags, zoneid_t zoneid, ip_opt_info_t *infop)
+static int
+ip_wait_for_info_ack(ill_t *ill)
{
- areq_t *areq;
- ire_t *ire = NULL;
- mblk_t *res_mp;
- ipaddr_t *addrp;
- mblk_t *first_mp;
- ire_t *save_ire = NULL;
- ipif_t *src_ipif = NULL;
- ushort_t ire_marks = 0;
- ill_t *dst_ill = NULL;
- ipha_t *ipha;
- mblk_t *saved_mp;
- ire_t *fire = NULL;
- mblk_t *copy_mp = NULL;
- boolean_t multirt_resolve_next;
- boolean_t unspec_src;
- ipaddr_t ipha_dst;
- ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
-
- /*
- * CGTP goes in a loop which looks up a new ipif, do an ipif_refhold
- * here for uniformity
- */
- ipif_refhold(ipif);
-
- /*
- * This loop is run only once in most cases.
- * We loop to resolve further routes only when the destination
- * can be reached through multiple RTF_MULTIRT-flagged ires.
- */
- do {
- if (dst_ill != NULL) {
- ill_refrele(dst_ill);
- dst_ill = NULL;
- }
- if (src_ipif != NULL) {
- ipif_refrele(src_ipif);
- src_ipif = NULL;
- }
- multirt_resolve_next = B_FALSE;
-
- ip1dbg(("ip_newroute_ipif: dst 0x%x, if %s\n", ntohl(dst),
- ipif->ipif_ill->ill_name));
-
- first_mp = mp;
- if (DB_TYPE(mp) == M_CTL)
- mp = mp->b_cont;
- ipha = (ipha_t *)mp->b_rptr;
-
- /*
- * Save the packet destination address, we may need it after
- * the packet has been consumed.
- */
- ipha_dst = ipha->ipha_dst;
-
- /*
- * If the interface is a pt-pt interface we look for an
- * IRE_IF_RESOLVER or IRE_IF_NORESOLVER that matches both the
- * local_address and the pt-pt destination address. Otherwise
- * we just match the local address.
- * NOTE: dst could be different than ipha->ipha_dst in case
- * of sending igmp multicast packets over a point-to-point
- * connection.
- * Thus we must be careful enough to check ipha_dst to be a
- * multicast address, otherwise it will take xmit_if path for
- * multicast packets resulting into kernel stack overflow by
- * repeated calls to ip_newroute_ipif from ire_send().
- */
- if (CLASSD(ipha_dst) &&
- !(ipif->ipif_ill->ill_flags & ILLF_MULTICAST)) {
- goto err_ret;
- }
-
- /*
- * We check if an IRE_OFFSUBNET for the addr that goes through
- * ipif exists. We need it to determine if the RTF_SETSRC and/or
- * RTF_MULTIRT flags must be honored. This IRE_OFFSUBNET ire may
- * propagate its flags to the new ire.
- */
- if (CLASSD(ipha_dst) && (flags & (RTF_MULTIRT | RTF_SETSRC))) {
- fire = ipif_lookup_multi_ire(ipif, ipha_dst);
- ip2dbg(("ip_newroute_ipif: "
- "ipif_lookup_multi_ire("
- "ipif %p, dst %08x) = fire %p\n",
- (void *)ipif, ntohl(dst), (void *)fire));
- }
-
- /*
- * Note: While we pick a dst_ill we are really only
- * interested in the ill for load spreading. The source
- * ipif is determined by source address selection below.
- */
- if (IS_IPMP(ipif->ipif_ill)) {
- ipmp_illgrp_t *illg = ipif->ipif_ill->ill_grp;
-
- if (CLASSD(ipha_dst))
- dst_ill = ipmp_illgrp_hold_cast_ill(illg);
- else
- dst_ill = ipmp_illgrp_hold_next_ill(illg);
- } else {
- dst_ill = ipif->ipif_ill;
- ill_refhold(dst_ill);
- }
-
- if (dst_ill == NULL) {
- if (ip_debug > 2) {
- pr_addr_dbg("ip_newroute_ipif: no dst ill "
- "for dst %s\n", AF_INET, &dst);
- }
- goto err_ret;
- }
-
- /*
- * Pick a source address preferring non-deprecated ones.
- * Unlike ip_newroute, we don't do any source address
- * selection here since for multicast it really does not help
- * in inbound load spreading as in the unicast case.
- */
- if ((flags & RTF_SETSRC) && (fire != NULL) &&
- (fire->ire_flags & RTF_SETSRC)) {
- /*
- * As requested by flags, an IRE_OFFSUBNET was looked up
- * on that interface. This ire has RTF_SETSRC flag, so
- * the source address of the packet must be changed.
- * Check that the ipif matching the requested source
- * address still exists.
- */
- src_ipif = ipif_lookup_addr(fire->ire_src_addr, NULL,
- zoneid, NULL, NULL, NULL, NULL, ipst);
- }
-
- unspec_src = (connp != NULL && connp->conn_unspec_src);
-
- if (!IS_UNDER_IPMP(ipif->ipif_ill) &&
- (IS_IPMP(ipif->ipif_ill) ||
- (!ipif->ipif_isv6 && ipif->ipif_lcl_addr == INADDR_ANY) ||
- (ipif->ipif_flags & (IPIF_DEPRECATED|IPIF_UP)) != IPIF_UP ||
- (connp != NULL && ipif->ipif_zoneid != zoneid &&
- ipif->ipif_zoneid != ALL_ZONES)) &&
- (src_ipif == NULL) &&
- (!unspec_src || ipha->ipha_src != INADDR_ANY)) {
- src_ipif = ipif_select_source(dst_ill, dst, zoneid);
- if (src_ipif == NULL) {
- if (ip_debug > 2) {
- /* ip1dbg */
- pr_addr_dbg("ip_newroute_ipif: "
- "no src for dst %s",
- AF_INET, &dst);
- }
- ip1dbg((" on interface %s\n",
- dst_ill->ill_name));
- goto err_ret;
- }
- ipif_refrele(ipif);
- ipif = src_ipif;
- ipif_refhold(ipif);
- }
- if (src_ipif == NULL) {
- src_ipif = ipif;
- ipif_refhold(src_ipif);
- }
-
- /*
- * Assign a source address while we have the conn.
- * We can't have ip_wput_ire pick a source address when the
- * packet returns from arp since conn_unspec_src might be set
- * and we lose the conn when going through arp.
- */
- if (ipha->ipha_src == INADDR_ANY && !unspec_src)
- ipha->ipha_src = src_ipif->ipif_src_addr;
-
- /*
- * In the case of IP_BOUND_IF and IP_PKTINFO, it is possible
- * that the outgoing interface does not have an interface ire.
- */
- if (CLASSD(ipha_dst) && (connp == NULL ||
- connp->conn_outgoing_ill == NULL) &&
- infop->ip_opt_ill_index == 0) {
- /* ipif_to_ire returns an held ire */
- ire = ipif_to_ire(ipif);
- if (ire == NULL)
- goto err_ret;
- if (ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE))
- goto err_ret;
- save_ire = ire;
-
- ip2dbg(("ip_newroute_ipif: ire %p, ipif %p, "
- "flags %04x\n",
- (void *)ire, (void *)ipif, flags));
- if ((flags & RTF_MULTIRT) && (fire != NULL) &&
- (fire->ire_flags & RTF_MULTIRT)) {
- /*
- * As requested by flags, an IRE_OFFSUBNET was
- * looked up on that interface. This ire has
- * RTF_MULTIRT flag, so the resolution loop will
- * be re-entered to resolve additional routes on
- * other interfaces. For that purpose, a copy of
- * the packet is performed at this point.
- */
- fire->ire_last_used_time = lbolt;
- copy_mp = copymsg(first_mp);
- if (copy_mp) {
- MULTIRT_DEBUG_TAG(copy_mp);
- }
- }
- if ((flags & RTF_SETSRC) && (fire != NULL) &&
- (fire->ire_flags & RTF_SETSRC)) {
- /*
- * As requested by flags, an IRE_OFFSUBET was
- * looked up on that interface. This ire has
- * RTF_SETSRC flag, so the source address of the
- * packet must be changed.
- */
- ipha->ipha_src = fire->ire_src_addr;
- }
- } else {
- /*
- * The only ways we can come here are:
- * 1) IP_BOUND_IF socket option is set
- * 2) SO_DONTROUTE socket option is set
- * 3) IP_PKTINFO option is passed in as ancillary data.
- * In all cases, the new ire will not be added
- * into cache table.
- */
- ASSERT(connp == NULL || connp->conn_dontroute ||
- connp->conn_outgoing_ill != NULL ||
- infop->ip_opt_ill_index != 0);
- ire_marks |= IRE_MARK_NOADD;
- }
-
- switch (ipif->ipif_net_type) {
- case IRE_IF_NORESOLVER: {
- /* We have what we need to build an IRE_CACHE. */
-
- if (dst_ill->ill_resolver_mp == NULL) {
- ip1dbg(("ip_newroute_ipif: dst_ill %p "
- "for IRE_IF_NORESOLVER ire %p has "
- "no ill_resolver_mp\n",
- (void *)dst_ill, (void *)ire));
- break;
- }
-
- /*
- * The new ire inherits the IRE_OFFSUBNET flags
- * and source address, if this was requested.
- */
- ire = ire_create(
- (uchar_t *)&dst, /* dest address */
- (uchar_t *)&ip_g_all_ones, /* mask */
- (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */
- NULL, /* gateway address */
- &ipif->ipif_mtu,
- NULL, /* no src nce */
- dst_ill->ill_rq, /* recv-from queue */
- dst_ill->ill_wq, /* send-to queue */
- IRE_CACHE,
- src_ipif,
- (save_ire != NULL ? save_ire->ire_mask : 0),
- (fire != NULL) ? /* Parent handle */
- fire->ire_phandle : 0,
- (save_ire != NULL) ? /* Interface handle */
- save_ire->ire_ihandle : 0,
- (fire != NULL) ?
- (fire->ire_flags &
- (RTF_SETSRC | RTF_MULTIRT)) : 0,
- (save_ire == NULL ? &ire_uinfo_null :
- &save_ire->ire_uinfo),
- NULL,
- NULL,
- ipst);
-
- if (ire == NULL) {
- if (save_ire != NULL)
- ire_refrele(save_ire);
- break;
- }
-
- ire->ire_marks |= ire_marks;
-
- /*
- * If IRE_MARK_NOADD is set then we need to convert
- * the max_fragp to a useable value now. This is
- * normally done in ire_add_v[46]. We also need to
- * associate the ire with an nce (normally would be
- * done in ip_wput_nondata()).
- *
- * Note that IRE_MARK_NOADD packets created here
- * do not have a non-null ire_mp pointer. The null
- * value of ire_bucket indicates that they were
- * never added.
- */
- if (ire->ire_marks & IRE_MARK_NOADD) {
- uint_t max_frag;
-
- max_frag = *ire->ire_max_fragp;
- ire->ire_max_fragp = NULL;
- ire->ire_max_frag = max_frag;
-
- if ((ire->ire_nce = ndp_lookup_v4(
- ire_to_ill(ire),
- (ire->ire_gateway_addr != INADDR_ANY ?
- &ire->ire_gateway_addr : &ire->ire_addr),
- B_FALSE)) == NULL) {
- if (save_ire != NULL)
- ire_refrele(save_ire);
- break;
- }
- ASSERT(ire->ire_nce->nce_state ==
- ND_REACHABLE);
- NCE_REFHOLD_TO_REFHOLD_NOTR(ire->ire_nce);
- }
-
- /* Prevent save_ire from getting deleted */
- if (save_ire != NULL) {
- IRB_REFHOLD(save_ire->ire_bucket);
- /* Has it been removed already ? */
- if (save_ire->ire_marks & IRE_MARK_CONDEMNED) {
- IRB_REFRELE(save_ire->ire_bucket);
- ire_refrele(save_ire);
- break;
- }
- }
-
- ire_add_then_send(q, ire, first_mp);
-
- /* Assert that save_ire is not deleted yet. */
- if (save_ire != NULL) {
- ASSERT(save_ire->ire_ptpn != NULL);
- IRB_REFRELE(save_ire->ire_bucket);
- ire_refrele(save_ire);
- save_ire = NULL;
- }
- if (fire != NULL) {
- ire_refrele(fire);
- fire = NULL;
- }
-
- /*
- * the resolution loop is re-entered if this
- * was requested through flags and if we
- * actually are in a multirouting case.
- */
- if ((flags & RTF_MULTIRT) && (copy_mp != NULL)) {
- boolean_t need_resolve =
- ire_multirt_need_resolve(ipha_dst,
- msg_getlabel(copy_mp), ipst);
- if (!need_resolve) {
- MULTIRT_DEBUG_UNTAG(copy_mp);
- freemsg(copy_mp);
- copy_mp = NULL;
- } else {
- /*
- * ipif_lookup_group() calls
- * ire_lookup_multi() that uses
- * ire_ftable_lookup() to find
- * an IRE_INTERFACE for the group.
- * In the multirt case,
- * ire_lookup_multi() then invokes
- * ire_multirt_lookup() to find
- * the next resolvable ire.
- * As a result, we obtain an new
- * interface, derived from the
- * next ire.
- */
- ipif_refrele(ipif);
- ipif = ipif_lookup_group(ipha_dst,
- zoneid, ipst);
- ip2dbg(("ip_newroute_ipif: "
- "multirt dst %08x, ipif %p\n",
- htonl(dst), (void *)ipif));
- if (ipif != NULL) {
- mp = copy_mp;
- copy_mp = NULL;
- multirt_resolve_next = B_TRUE;
- continue;
- } else {
- freemsg(copy_mp);
- }
- }
- }
- if (ipif != NULL)
- ipif_refrele(ipif);
- ill_refrele(dst_ill);
- ipif_refrele(src_ipif);
- return;
- }
- case IRE_IF_RESOLVER:
- /*
- * We can't build an IRE_CACHE yet, but at least
- * we found a resolver that can help.
- */
- res_mp = dst_ill->ill_resolver_mp;
- if (!OK_RESOLVER_MP(res_mp))
- break;
-
- /*
- * We obtain a partial IRE_CACHE which we will pass
- * along with the resolver query. When the response
- * comes back it will be there ready for us to add.
- * The new ire inherits the IRE_OFFSUBNET flags
- * and source address, if this was requested.
- * The ire_max_frag is atomically set under the
- * irebucket lock in ire_add_v[46]. Only in the
- * case of IRE_MARK_NOADD, we set it here itself.
- */
- ire = ire_create_mp(
- (uchar_t *)&dst, /* dest address */
- (uchar_t *)&ip_g_all_ones, /* mask */
- (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */
- NULL, /* gateway address */
- (ire_marks & IRE_MARK_NOADD) ?
- ipif->ipif_mtu : 0, /* max_frag */
- NULL, /* no src nce */
- dst_ill->ill_rq, /* recv-from queue */
- dst_ill->ill_wq, /* send-to queue */
- IRE_CACHE,
- src_ipif,
- (save_ire != NULL ? save_ire->ire_mask : 0),
- (fire != NULL) ? /* Parent handle */
- fire->ire_phandle : 0,
- (save_ire != NULL) ? /* Interface handle */
- save_ire->ire_ihandle : 0,
- (fire != NULL) ? /* flags if any */
- (fire->ire_flags &
- (RTF_SETSRC | RTF_MULTIRT)) : 0,
- (save_ire == NULL ? &ire_uinfo_null :
- &save_ire->ire_uinfo),
- NULL,
- NULL,
- ipst);
-
- if (save_ire != NULL) {
- ire_refrele(save_ire);
- save_ire = NULL;
- }
- if (ire == NULL)
- break;
-
- ire->ire_marks |= ire_marks;
- /*
- * Construct message chain for the resolver of the
- * form:
- * ARP_REQ_MBLK-->IRE_MBLK-->Packet
- *
- * NOTE : ire will be added later when the response
- * comes back from ARP. If the response does not
- * come back, ARP frees the packet. For this reason,
- * we can't REFHOLD the bucket of save_ire to prevent
- * deletions. We may not be able to REFRELE the
- * bucket if the response never comes back.
- * Thus, before adding the ire, ire_add_v4 will make
- * sure that the interface route does not get deleted.
- * This is the only case unlike ip_newroute_v6,
- * ip_newroute_ipif_v6 where we can always prevent
- * deletions because ire_add_then_send is called after
- * creating the IRE.
- * If IRE_MARK_NOADD is set, then ire_add_then_send
- * does not add this IRE into the IRE CACHE.
- */
- ASSERT(ire->ire_mp != NULL);
- ire->ire_mp->b_cont = first_mp;
- /* Have saved_mp handy, for cleanup if canput fails */
- saved_mp = mp;
- mp = copyb(res_mp);
- if (mp == NULL) {
- /* Prepare for cleanup */
- mp = saved_mp; /* pkt */
- ire_delete(ire); /* ire_mp */
- ire = NULL;
- if (copy_mp != NULL) {
- MULTIRT_DEBUG_UNTAG(copy_mp);
- freemsg(copy_mp);
- copy_mp = NULL;
- }
- break;
- }
- linkb(mp, ire->ire_mp);
-
- /*
- * Fill in the source and dest addrs for the resolver.
- * NOTE: this depends on memory layouts imposed by
- * ill_init(). There are corner cases above where we
- * might've created the IRE with an INADDR_ANY source
- * address (e.g., if the zeroth ipif on an underlying
- * ill in an IPMP group is 0.0.0.0, but another ipif
- * on the ill has a usable test address). If so, tell
- * ARP to use ipha_src as its sender address.
- */
- areq = (areq_t *)mp->b_rptr;
- addrp = (ipaddr_t *)((char *)areq +
- areq->areq_sender_addr_offset);
- if (ire->ire_src_addr != INADDR_ANY)
- *addrp = ire->ire_src_addr;
- else
- *addrp = ipha->ipha_src;
- addrp = (ipaddr_t *)((char *)areq +
- areq->areq_target_addr_offset);
- *addrp = dst;
- /* Up to the resolver. */
- if (canputnext(dst_ill->ill_rq) &&
- !(dst_ill->ill_arp_closing)) {
- putnext(dst_ill->ill_rq, mp);
- /*
- * The response will come back in ip_wput
- * with db_type IRE_DB_TYPE.
- */
- } else {
- mp->b_cont = NULL;
- freeb(mp); /* areq */
- ire_delete(ire); /* ire_mp */
- saved_mp->b_next = NULL;
- saved_mp->b_prev = NULL;
- freemsg(first_mp); /* pkt */
- ip2dbg(("ip_newroute_ipif: dropped\n"));
- }
-
- if (fire != NULL) {
- ire_refrele(fire);
- fire = NULL;
- }
+ int err;
- /*
- * The resolution loop is re-entered if this was
- * requested through flags and we actually are
- * in a multirouting case.
- */
- if ((flags & RTF_MULTIRT) && (copy_mp != NULL)) {
- boolean_t need_resolve =
- ire_multirt_need_resolve(ipha_dst,
- msg_getlabel(copy_mp), ipst);
- if (!need_resolve) {
- MULTIRT_DEBUG_UNTAG(copy_mp);
- freemsg(copy_mp);
- copy_mp = NULL;
- } else {
- /*
- * ipif_lookup_group() calls
- * ire_lookup_multi() that uses
- * ire_ftable_lookup() to find
- * an IRE_INTERFACE for the group.
- * In the multirt case,
- * ire_lookup_multi() then invokes
- * ire_multirt_lookup() to find
- * the next resolvable ire.
- * As a result, we obtain an new
- * interface, derived from the
- * next ire.
- */
- ipif_refrele(ipif);
- ipif = ipif_lookup_group(ipha_dst,
- zoneid, ipst);
- if (ipif != NULL) {
- mp = copy_mp;
- copy_mp = NULL;
- multirt_resolve_next = B_TRUE;
- continue;
- } else {
- freemsg(copy_mp);
- }
- }
- }
- if (ipif != NULL)
- ipif_refrele(ipif);
- ill_refrele(dst_ill);
- ipif_refrele(src_ipif);
- return;
- default:
- break;
- }
- } while (multirt_resolve_next);
-
-err_ret:
- ip2dbg(("ip_newroute_ipif: dropped\n"));
- if (fire != NULL)
- ire_refrele(fire);
- ipif_refrele(ipif);
- /* Did this packet originate externally? */
- if (dst_ill != NULL)
- ill_refrele(dst_ill);
- if (src_ipif != NULL)
- ipif_refrele(src_ipif);
- if (mp->b_prev || mp->b_next) {
- mp->b_next = NULL;
- mp->b_prev = NULL;
- } else {
+ mutex_enter(&ill->ill_lock);
+ while (ill->ill_state_flags & ILL_LL_SUBNET_PENDING) {
/*
- * Since ip_wput() isn't close to finished, we fill
- * in enough of the header for credible error reporting.
+ * Return value of 0 indicates a pending signal.
*/
- if (ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid, ipst)) {
- /* Failed */
- freemsg(first_mp);
- if (ire != NULL)
- ire_refrele(ire);
- return;
+ err = cv_wait_sig(&ill->ill_cv, &ill->ill_lock);
+ if (err == 0) {
+ mutex_exit(&ill->ill_lock);
+ return (EINTR);
}
}
+ mutex_exit(&ill->ill_lock);
/*
- * At this point we will have ire only if RTF_BLACKHOLE
- * or RTF_REJECT flags are set on the IRE. It will not
- * generate ICMP_HOST_UNREACHABLE if RTF_BLACKHOLE is set.
+ * ip_rput_other could have set an error in ill_error on
+ * receipt of M_ERROR.
*/
- if (ire != NULL) {
- if (ire->ire_flags & RTF_BLACKHOLE) {
- ire_refrele(ire);
- freemsg(first_mp);
- return;
- }
- ire_refrele(ire);
- }
- icmp_unreachable(q, first_mp, ICMP_HOST_UNREACHABLE, zoneid, ipst);
-}
-
-/* Name/Value Table Lookup Routine */
-char *
-ip_nv_lookup(nv_t *nv, int value)
-{
- if (!nv)
- return (NULL);
- for (; nv->nv_name; nv++) {
- if (nv->nv_value == value)
- return (nv->nv_name);
- }
- return ("unknown");
+ return (ill->ill_error);
}
/*
@@ -9604,7 +5972,7 @@ ip_nv_lookup(nv_t *nv, int value)
* to a DLPI device. We allocate an ill_t as the instance data in
* this case.
*/
-int
+static int
ip_modopen(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
{
ill_t *ill;
@@ -9644,6 +6012,7 @@ ip_modopen(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
* down a DL_INFO_REQ after calling qprocson.
*/
err = ill_init(q, ill);
+
if (err != 0) {
mi_free(ill);
netstack_rele(ipst->ips_netstack);
@@ -9652,41 +6021,26 @@ ip_modopen(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
return (err);
}
- /* ill_init initializes the ipsq marking this thread as writer */
- ipsq_exit(ill->ill_phyint->phyint_ipsq);
- /* Wait for the DL_INFO_ACK */
- mutex_enter(&ill->ill_lock);
- while (ill->ill_state_flags & ILL_LL_SUBNET_PENDING) {
- /*
- * Return value of 0 indicates a pending signal.
- */
- err = cv_wait_sig(&ill->ill_cv, &ill->ill_lock);
- if (err == 0) {
- mutex_exit(&ill->ill_lock);
- (void) ip_close(q, 0);
- return (EINTR);
- }
- }
- mutex_exit(&ill->ill_lock);
-
/*
- * ip_rput_other could have set an error in ill_error on
- * receipt of M_ERROR.
+ * Wait for the DL_INFO_ACK if a DL_INFO_REQ was sent.
+ *
+ * ill_init initializes the ipsq marking this thread as
+ * writer
*/
+ ipsq_exit(ill->ill_phyint->phyint_ipsq);
+ err = ip_wait_for_info_ack(ill);
+ if (err == 0)
+ ill->ill_credp = credp;
+ else
+ goto fail;
- err = ill->ill_error;
- if (err != 0) {
- (void) ip_close(q, 0);
- return (err);
- }
-
- ill->ill_credp = credp;
crhold(credp);
mutex_enter(&ipst->ips_ip_mi_lock);
- err = mi_open_link(&ipst->ips_ip_g_head, (IDP)ill, devp, flag, sflag,
- credp);
+ err = mi_open_link(&ipst->ips_ip_g_head, (IDP)q->q_ptr, devp, flag,
+ sflag, credp);
mutex_exit(&ipst->ips_ip_mi_lock);
+fail:
if (err) {
(void) ip_close(q, 0);
return (err);
@@ -9719,8 +6073,6 @@ ip_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
netstack_t *ns;
ip_stack_t *ipst;
- TRACE_1(TR_FAC_IP, TR_IP_OPEN, "ip_open: q %p", q);
-
/* Allow reopen. */
if (q->q_ptr != NULL)
return (0);
@@ -9765,25 +6117,24 @@ ip_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
*/
netstack_rele(ipst->ips_netstack);
+ connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_ULP_CKSUM;
+ /* conn_allzones can not be set this early, hence no IPCL_ZONEID */
+ connp->conn_ixa->ixa_zoneid = zoneid;
connp->conn_zoneid = zoneid;
- connp->conn_sqp = NULL;
- connp->conn_initial_sqp = NULL;
- connp->conn_final_sqp = NULL;
- connp->conn_upq = q;
+ connp->conn_rq = q;
q->q_ptr = WR(q)->q_ptr = connp;
- if (flag & SO_SOCKSTR)
- connp->conn_flags |= IPCL_SOCKET;
-
/* Minor tells us which /dev entry was opened */
if (isv6) {
- connp->conn_af_isv6 = B_TRUE;
- ip_setpktversion(connp, isv6, B_FALSE, ipst);
- connp->conn_src_preferences = IPV6_PREFER_SRC_DEFAULT;
+ connp->conn_family = AF_INET6;
+ connp->conn_ipversion = IPV6_VERSION;
+ connp->conn_ixa->ixa_flags &= ~IXAF_IS_IPV4;
+ connp->conn_ixa->ixa_src_preferences = IPV6_PREFER_SRC_DEFAULT;
} else {
- connp->conn_af_isv6 = B_FALSE;
- connp->conn_pkt_isv6 = B_FALSE;
+ connp->conn_family = AF_INET;
+ connp->conn_ipversion = IPV4_VERSION;
+ connp->conn_ixa->ixa_flags |= IXAF_IS_IPV4;
}
if ((ip_minor_arena_la != NULL) && (flag & SO_SOCKSTR) &&
@@ -9812,11 +6163,17 @@ ip_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
* connp->conn_cred is crfree()ed in ipcl_conn_destroy()
*/
connp->conn_cred = credp;
+ /* Cache things in ixa without an extra refhold */
+ connp->conn_ixa->ixa_cred = connp->conn_cred;
+ connp->conn_ixa->ixa_cpid = connp->conn_cpid;
+ if (is_system_labeled())
+ connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred);
/*
- * Handle IP_RTS_REQUEST and other ioctls which use conn_recv
+ * Handle IP_IOC_RTS_REQUEST and other ioctls which use conn_recv
*/
connp->conn_recv = ip_conn_input;
+ connp->conn_recvicmp = ip_conn_input_icmp;
crhold(connp->conn_cred);
@@ -9827,11 +6184,13 @@ ip_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
if (getpflags(NET_MAC_AWARE, credp) != 0)
connp->conn_mac_mode = CONN_MAC_AWARE;
+ connp->conn_zone_is_global = (crgetzoneid(credp) == GLOBAL_ZONEID);
+
connp->conn_rq = q;
connp->conn_wq = WR(q);
/* Non-zero default values */
- connp->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
+ connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP;
/*
* Make the conn globally visible to walkers
@@ -9847,210 +6206,6 @@ ip_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
}
/*
- * Change the output format (IPv4 vs. IPv6) for a conn_t.
- * Note that there is no race since either ip_output function works - it
- * is just an optimization to enter the best ip_output routine directly.
- */
-void
-ip_setpktversion(conn_t *connp, boolean_t isv6, boolean_t bump_mib,
- ip_stack_t *ipst)
-{
- if (isv6) {
- if (bump_mib) {
- BUMP_MIB(&ipst->ips_ip6_mib,
- ipIfStatsOutSwitchIPVersion);
- }
- connp->conn_send = ip_output_v6;
- connp->conn_pkt_isv6 = B_TRUE;
- } else {
- if (bump_mib) {
- BUMP_MIB(&ipst->ips_ip_mib,
- ipIfStatsOutSwitchIPVersion);
- }
- connp->conn_send = ip_output;
- connp->conn_pkt_isv6 = B_FALSE;
- }
-
-}
-
-/*
- * See if IPsec needs loading because of the options in mp.
- */
-static boolean_t
-ipsec_opt_present(mblk_t *mp)
-{
- uint8_t *optcp, *next_optcp, *opt_endcp;
- struct opthdr *opt;
- struct T_opthdr *topt;
- int opthdr_len;
- t_uscalar_t optname, optlevel;
- struct T_optmgmt_req *tor = (struct T_optmgmt_req *)mp->b_rptr;
- ipsec_req_t *ipsr;
-
- /*
- * Walk through the mess, and find IP_SEC_OPT. If it's there,
- * return TRUE.
- */
-
- optcp = mi_offset_param(mp, tor->OPT_offset, tor->OPT_length);
- opt_endcp = optcp + tor->OPT_length;
- if (tor->PRIM_type == T_OPTMGMT_REQ) {
- opthdr_len = sizeof (struct T_opthdr);
- } else { /* O_OPTMGMT_REQ */
- ASSERT(tor->PRIM_type == T_SVR4_OPTMGMT_REQ);
- opthdr_len = sizeof (struct opthdr);
- }
- for (; optcp < opt_endcp; optcp = next_optcp) {
- if (optcp + opthdr_len > opt_endcp)
- return (B_FALSE); /* Not enough option header. */
- if (tor->PRIM_type == T_OPTMGMT_REQ) {
- topt = (struct T_opthdr *)optcp;
- optlevel = topt->level;
- optname = topt->name;
- next_optcp = optcp + _TPI_ALIGN_TOPT(topt->len);
- } else {
- opt = (struct opthdr *)optcp;
- optlevel = opt->level;
- optname = opt->name;
- next_optcp = optcp + opthdr_len +
- _TPI_ALIGN_OPT(opt->len);
- }
- if ((next_optcp < optcp) || /* wraparound pointer space */
- ((next_optcp >= opt_endcp) && /* last option bad len */
- ((next_optcp - opt_endcp) >= __TPI_ALIGN_SIZE)))
- return (B_FALSE); /* bad option buffer */
- if ((optlevel == IPPROTO_IP && optname == IP_SEC_OPT) ||
- (optlevel == IPPROTO_IPV6 && optname == IPV6_SEC_OPT)) {
- /*
- * Check to see if it's an all-bypass or all-zeroes
- * IPsec request. Don't bother loading IPsec if
- * the socket doesn't want to use it. (A good example
- * is a bypass request.)
- *
- * Basically, if any of the non-NEVER bits are set,
- * load IPsec.
- */
- ipsr = (ipsec_req_t *)(optcp + opthdr_len);
- if ((ipsr->ipsr_ah_req & ~IPSEC_PREF_NEVER) != 0 ||
- (ipsr->ipsr_esp_req & ~IPSEC_PREF_NEVER) != 0 ||
- (ipsr->ipsr_self_encap_req & ~IPSEC_PREF_NEVER)
- != 0)
- return (B_TRUE);
- }
- }
- return (B_FALSE);
-}
-
-/*
- * If conn is is waiting for ipsec to finish loading, kick it.
- */
-/* ARGSUSED */
-static void
-conn_restart_ipsec_waiter(conn_t *connp, void *arg)
-{
- t_scalar_t optreq_prim;
- mblk_t *mp;
- cred_t *cr;
- int err = 0;
-
- /*
- * This function is called, after ipsec loading is complete.
- * Since IP checks exclusively and atomically (i.e it prevents
- * ipsec load from completing until ip_optcom_req completes)
- * whether ipsec load is complete, there cannot be a race with IP
- * trying to set the CONN_IPSEC_LOAD_WAIT flag on any conn now.
- */
- mutex_enter(&connp->conn_lock);
- if (connp->conn_state_flags & CONN_IPSEC_LOAD_WAIT) {
- ASSERT(connp->conn_ipsec_opt_mp != NULL);
- mp = connp->conn_ipsec_opt_mp;
- connp->conn_ipsec_opt_mp = NULL;
- connp->conn_state_flags &= ~CONN_IPSEC_LOAD_WAIT;
- mutex_exit(&connp->conn_lock);
-
- /*
- * All Solaris components should pass a db_credp
- * for this TPI message, hence we ASSERT.
- * But in case there is some other M_PROTO that looks
- * like a TPI message sent by some other kernel
- * component, we check and return an error.
- */
- cr = msg_getcred(mp, NULL);
- ASSERT(cr != NULL);
- if (cr == NULL) {
- mp = mi_tpi_err_ack_alloc(mp, TSYSERR, EINVAL);
- if (mp != NULL)
- qreply(connp->conn_wq, mp);
- return;
- }
-
- ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
-
- optreq_prim = ((union T_primitives *)mp->b_rptr)->type;
- if (optreq_prim == T_OPTMGMT_REQ) {
- err = tpi_optcom_req(CONNP_TO_WQ(connp), mp, cr,
- &ip_opt_obj, B_FALSE);
- } else {
- ASSERT(optreq_prim == T_SVR4_OPTMGMT_REQ);
- err = svr4_optcom_req(CONNP_TO_WQ(connp), mp, cr,
- &ip_opt_obj, B_FALSE);
- }
- if (err != EINPROGRESS)
- CONN_OPER_PENDING_DONE(connp);
- return;
- }
- mutex_exit(&connp->conn_lock);
-}
-
-/*
- * Called from the ipsec_loader thread, outside any perimeter, to tell
- * ip qenable any of the queues waiting for the ipsec loader to
- * complete.
- */
-void
-ip_ipsec_load_complete(ipsec_stack_t *ipss)
-{
- netstack_t *ns = ipss->ipsec_netstack;
-
- ipcl_walk(conn_restart_ipsec_waiter, NULL, ns->netstack_ip);
-}
-
-/*
- * Can't be used. Need to call svr4* -> optset directly. the leaf routine
- * determines the grp on which it has to become exclusive, queues the mp
- * and IPSQ draining restarts the optmgmt
- */
-static boolean_t
-ip_check_for_ipsec_opt(queue_t *q, mblk_t *mp)
-{
- conn_t *connp = Q_TO_CONN(q);
- ipsec_stack_t *ipss = connp->conn_netstack->netstack_ipsec;
-
- /*
- * Take IPsec requests and treat them special.
- */
- if (ipsec_opt_present(mp)) {
- /* First check if IPsec is loaded. */
- mutex_enter(&ipss->ipsec_loader_lock);
- if (ipss->ipsec_loader_state != IPSEC_LOADER_WAIT) {
- mutex_exit(&ipss->ipsec_loader_lock);
- return (B_FALSE);
- }
- mutex_enter(&connp->conn_lock);
- connp->conn_state_flags |= CONN_IPSEC_LOAD_WAIT;
-
- ASSERT(connp->conn_ipsec_opt_mp == NULL);
- connp->conn_ipsec_opt_mp = mp;
- mutex_exit(&connp->conn_lock);
- mutex_exit(&ipss->ipsec_loader_lock);
-
- ipsec_loader_loadnow(ipss);
- return (B_TRUE);
- }
- return (B_FALSE);
-}
-
-/*
* Set IPsec policy from an ipsec_req_t. If the req is not "zero" and valid,
* all of them are copied to the conn_t. If the req is "zero", the policy is
* zeroed out. A "zero" policy has zero ipsr_{ah,req,self_encap}_req
@@ -10149,15 +6304,14 @@ ipsec_set_req(cred_t *cr, conn_t *connp, ipsec_req_t *req)
}
}
- mutex_enter(&connp->conn_lock);
+ ASSERT(MUTEX_HELD(&connp->conn_lock));
/*
- * If we have already cached policies in ip_bind_connected*(), don't
+ * If we have already cached policies in conn_connect(), don't
* let them change now. We cache policies for connections
* whose src,dst [addr, port] is known.
*/
if (connp->conn_policy_cached) {
- mutex_exit(&connp->conn_lock);
return (EINVAL);
}
@@ -10171,10 +6325,8 @@ ipsec_set_req(cred_t *cr, conn_t *connp, ipsec_req_t *req)
IPPH_REFRELE(connp->conn_policy, ipst->ips_netstack);
connp->conn_policy = NULL;
}
- connp->conn_flags &= ~IPCL_CHECK_POLICY;
connp->conn_in_enforce_policy = B_FALSE;
connp->conn_out_enforce_policy = B_FALSE;
- mutex_exit(&connp->conn_lock);
return (0);
}
@@ -10203,7 +6355,7 @@ ipsec_set_req(cred_t *cr, conn_t *connp, ipsec_req_t *req)
* We're looking at a v6 socket, also insert the v6-specific
* entries.
*/
- if (connp->conn_af_isv6) {
+ if (connp->conn_family == AF_INET6) {
if (!ipsec_polhead_insert(ph, actp, nact, IPSEC_AF_V6,
IPSEC_TYPE_INBOUND, ns))
goto enomem;
@@ -10217,10 +6369,10 @@ ipsec_set_req(cred_t *cr, conn_t *connp, ipsec_req_t *req)
/*
* If the requests need security, set enforce_policy.
* If the requests are IPSEC_PREF_NEVER, one should
- * still set conn_out_enforce_policy so that an ipsec_out
- * gets attached in ip_wput. This is needed so that
- * for connections that we don't cache policy in ip_bind,
- * if global policy matches in ip_wput_attach_policy, we
+ * still set conn_out_enforce_policy so that ip_set_destination
+ * marks the ip_xmit_attr_t appropriatly. This is needed so that
+ * for connections that we don't cache policy in at connect time,
+ * if global policy matches in ip_output_attach_policy, we
* don't wrongly inherit global policy. Similarly, we need
* to set conn_in_enforce_policy also so that we don't verify
* policy wrongly.
@@ -10230,10 +6382,8 @@ ipsec_set_req(cred_t *cr, conn_t *connp, ipsec_req_t *req)
(se_req & REQ_MASK) != 0) {
connp->conn_in_enforce_policy = B_TRUE;
connp->conn_out_enforce_policy = B_TRUE;
- connp->conn_flags |= IPCL_CHECK_POLICY;
}
- mutex_exit(&connp->conn_lock);
return (error);
#undef REQ_MASK
@@ -10241,7 +6391,6 @@ ipsec_set_req(cred_t *cr, conn_t *connp, ipsec_req_t *req)
* Common memory-allocation-failure exit path.
*/
enomem:
- mutex_exit(&connp->conn_lock);
if (actp != NULL)
ipsec_actvec_free(actp, nact);
if (is_pol_inserted)
@@ -10250,1250 +6399,283 @@ enomem:
}
/*
- * Only for options that pass in an IP addr. Currently only V4 options
- * pass in an ipif. V6 options always pass an ifindex specifying the ill.
- * So this function assumes level is IPPROTO_IP
+ * Set socket options for joining and leaving multicast groups.
+ * Common to IPv4 and IPv6; inet6 indicates the type of socket.
+ * The caller has already check that the option name is consistent with
+ * the address family of the socket.
*/
int
-ip_opt_set_ipif(conn_t *connp, ipaddr_t addr, boolean_t checkonly, int option,
- mblk_t *first_mp)
+ip_opt_set_multicast_group(conn_t *connp, t_scalar_t name,
+ uchar_t *invalp, boolean_t inet6, boolean_t checkonly)
{
- ipif_t *ipif = NULL;
- int error;
- ill_t *ill;
- int zoneid;
- ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
-
- ip2dbg(("ip_opt_set_ipif: ipaddr %X\n", addr));
-
- if (addr != INADDR_ANY || checkonly) {
- ASSERT(connp != NULL);
- zoneid = IPCL_ZONEID(connp);
- if (option == IP_NEXTHOP) {
- ipif = ipif_lookup_onlink_addr(addr,
- connp->conn_zoneid, ipst);
- } else {
- ipif = ipif_lookup_addr(addr, NULL, zoneid,
- CONNP_TO_WQ(connp), first_mp, ip_restart_optmgmt,
- &error, ipst);
- }
- if (ipif == NULL) {
- if (error == EINPROGRESS)
- return (error);
- if ((option == IP_MULTICAST_IF) ||
- (option == IP_NEXTHOP))
- return (EHOSTUNREACH);
- else
- return (EINVAL);
- } else if (checkonly) {
- if (option == IP_MULTICAST_IF) {
- ill = ipif->ipif_ill;
- /* not supported by the virtual network iface */
- if (IS_VNI(ill)) {
- ipif_refrele(ipif);
- return (EINVAL);
- }
- }
- ipif_refrele(ipif);
- return (0);
- }
- ill = ipif->ipif_ill;
- mutex_enter(&connp->conn_lock);
- mutex_enter(&ill->ill_lock);
- if ((ill->ill_state_flags & ILL_CONDEMNED) ||
- (ipif->ipif_state_flags & IPIF_CONDEMNED)) {
- mutex_exit(&ill->ill_lock);
- mutex_exit(&connp->conn_lock);
- ipif_refrele(ipif);
- return (option == IP_MULTICAST_IF ?
- EHOSTUNREACH : EINVAL);
- }
- } else {
- mutex_enter(&connp->conn_lock);
- }
-
- /* None of the options below are supported on the VNI */
- if (ipif != NULL && IS_VNI(ipif->ipif_ill)) {
- mutex_exit(&ill->ill_lock);
- mutex_exit(&connp->conn_lock);
- ipif_refrele(ipif);
- return (EINVAL);
- }
-
- switch (option) {
- case IP_MULTICAST_IF:
- connp->conn_multicast_ipif = ipif;
+ int *i1 = (int *)invalp;
+ int error = 0;
+ ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
+ struct ip_mreq *v4_mreqp;
+ struct ipv6_mreq *v6_mreqp;
+ struct group_req *greqp;
+ ire_t *ire;
+ boolean_t done = B_FALSE;
+ ipaddr_t ifaddr;
+ in6_addr_t v6group;
+ uint_t ifindex;
+ boolean_t mcast_opt = B_TRUE;
+ mcast_record_t fmode;
+ int (*optfn)(conn_t *, boolean_t, const in6_addr_t *,
+ ipaddr_t, uint_t, mcast_record_t, const in6_addr_t *);
+
+ switch (name) {
+ case IP_ADD_MEMBERSHIP:
+ case IPV6_JOIN_GROUP:
+ mcast_opt = B_FALSE;
+ /* FALLTHRU */
+ case MCAST_JOIN_GROUP:
+ fmode = MODE_IS_EXCLUDE;
+ optfn = ip_opt_add_group;
break;
- case IP_NEXTHOP:
- connp->conn_nexthop_v4 = addr;
- connp->conn_nexthop_set = B_TRUE;
+
+ case IP_DROP_MEMBERSHIP:
+ case IPV6_LEAVE_GROUP:
+ mcast_opt = B_FALSE;
+ /* FALLTHRU */
+ case MCAST_LEAVE_GROUP:
+ fmode = MODE_IS_INCLUDE;
+ optfn = ip_opt_delete_group;
break;
+ default:
+ ASSERT(0);
}
- if (ipif != NULL) {
- mutex_exit(&ill->ill_lock);
- mutex_exit(&connp->conn_lock);
- ipif_refrele(ipif);
- return (0);
- }
- mutex_exit(&connp->conn_lock);
- /* We succeded in cleared the option */
- return (0);
-}
+ if (mcast_opt) {
+ struct sockaddr_in *sin;
+ struct sockaddr_in6 *sin6;
-/*
- * For options that pass in an ifindex specifying the ill. V6 options always
- * pass in an ill. Some v4 options also pass in ifindex specifying the ill.
- */
-int
-ip_opt_set_ill(conn_t *connp, int ifindex, boolean_t isv6, boolean_t checkonly,
- int level, int option, mblk_t *first_mp)
-{
- ill_t *ill = NULL;
- int error = 0;
- ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
-
- ip2dbg(("ip_opt_set_ill: ifindex %d\n", ifindex));
- if (ifindex != 0) {
- ASSERT(connp != NULL);
- ill = ill_lookup_on_ifindex(ifindex, isv6, CONNP_TO_WQ(connp),
- first_mp, ip_restart_optmgmt, &error, ipst);
- if (ill != NULL) {
- if (checkonly) {
- /* not supported by the virtual network iface */
- if (IS_VNI(ill)) {
- ill_refrele(ill);
- return (EINVAL);
- }
- ill_refrele(ill);
- return (0);
- }
- if (!ipif_lookup_zoneid(ill, connp->conn_zoneid,
- 0, NULL)) {
- ill_refrele(ill);
- ill = NULL;
- mutex_enter(&connp->conn_lock);
- goto setit;
- }
- mutex_enter(&connp->conn_lock);
- mutex_enter(&ill->ill_lock);
- if (ill->ill_state_flags & ILL_CONDEMNED) {
- mutex_exit(&ill->ill_lock);
- mutex_exit(&connp->conn_lock);
- ill_refrele(ill);
- ill = NULL;
- mutex_enter(&connp->conn_lock);
- }
- goto setit;
- } else if (error == EINPROGRESS) {
- return (error);
+ greqp = (struct group_req *)i1;
+ if (greqp->gr_group.ss_family == AF_INET) {
+ sin = (struct sockaddr_in *)&(greqp->gr_group);
+ IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &v6group);
} else {
- error = 0;
- }
+ if (!inet6)
+ return (EINVAL); /* Not on INET socket */
+
+ sin6 = (struct sockaddr_in6 *)&(greqp->gr_group);
+ v6group = sin6->sin6_addr;
+ }
+ ifaddr = INADDR_ANY;
+ ifindex = greqp->gr_interface;
+ } else if (inet6) {
+ v6_mreqp = (struct ipv6_mreq *)i1;
+ v6group = v6_mreqp->ipv6mr_multiaddr;
+ ifaddr = INADDR_ANY;
+ ifindex = v6_mreqp->ipv6mr_interface;
+ } else {
+ v4_mreqp = (struct ip_mreq *)i1;
+ IN6_INADDR_TO_V4MAPPED(&v4_mreqp->imr_multiaddr, &v6group);
+ ifaddr = (ipaddr_t)v4_mreqp->imr_interface.s_addr;
+ ifindex = 0;
}
- mutex_enter(&connp->conn_lock);
-setit:
- ASSERT((level == IPPROTO_IP || level == IPPROTO_IPV6));
/*
- * The options below assume that the ILL (if any) transmits and/or
- * receives traffic. Neither of which is true for the virtual network
- * interface, so fail setting these on a VNI.
+ * In the multirouting case, we need to replicate
+ * the request on all interfaces that will take part
+ * in replication. We do so because multirouting is
+ * reflective, thus we will probably receive multi-
+ * casts on those interfaces.
+ * The ip_multirt_apply_membership() succeeds if
+ * the operation succeeds on at least one interface.
*/
- if (IS_VNI(ill)) {
- ASSERT(ill != NULL);
- mutex_exit(&ill->ill_lock);
- mutex_exit(&connp->conn_lock);
- ill_refrele(ill);
- return (EINVAL);
- }
-
- if (level == IPPROTO_IP) {
- switch (option) {
- case IP_BOUND_IF:
- connp->conn_incoming_ill = ill;
- connp->conn_outgoing_ill = ill;
- break;
-
- case IP_MULTICAST_IF:
- /*
- * This option is an internal special. The socket
- * level IP_MULTICAST_IF specifies an 'ipaddr' and
- * is handled in ip_opt_set_ipif. IPV6_MULTICAST_IF
- * specifies an ifindex and we try first on V6 ill's.
- * If we don't find one, we they try using on v4 ill's
- * intenally and we come here.
- */
- if (!checkonly && ill != NULL) {
- ipif_t *ipif;
- ipif = ill->ill_ipif;
-
- if (ipif->ipif_state_flags & IPIF_CONDEMNED) {
- mutex_exit(&ill->ill_lock);
- mutex_exit(&connp->conn_lock);
- ill_refrele(ill);
- ill = NULL;
- mutex_enter(&connp->conn_lock);
- } else {
- connp->conn_multicast_ipif = ipif;
- }
- }
- break;
+ if (IN6_IS_ADDR_V4MAPPED(&v6group)) {
+ ipaddr_t group;
- case IP_DHCPINIT_IF:
- if (connp->conn_dhcpinit_ill != NULL) {
- /*
- * We've locked the conn so conn_cleanup_ill()
- * cannot clear conn_dhcpinit_ill -- so it's
- * safe to access the ill.
- */
- ill_t *oill = connp->conn_dhcpinit_ill;
+ IN6_V4MAPPED_TO_IPADDR(&v6group, group);
- ASSERT(oill->ill_dhcpinit != 0);
- atomic_dec_32(&oill->ill_dhcpinit);
- connp->conn_dhcpinit_ill = NULL;
- }
-
- if (ill != NULL) {
- connp->conn_dhcpinit_ill = ill;
- atomic_inc_32(&ill->ill_dhcpinit);
- }
- break;
- }
+ ire = ire_ftable_lookup_v4(group, IP_HOST_MASK, 0,
+ IRE_HOST | IRE_INTERFACE, NULL, ALL_ZONES, NULL,
+ MATCH_IRE_MASK | MATCH_IRE_TYPE, 0, ipst, NULL);
} else {
- switch (option) {
- case IPV6_BOUND_IF:
- connp->conn_incoming_ill = ill;
- connp->conn_outgoing_ill = ill;
- break;
-
- case IPV6_MULTICAST_IF:
- /*
- * Set conn_multicast_ill to be the IPv6 ill.
- * Set conn_multicast_ipif to be an IPv4 ipif
- * for ifindex to make IPv4 mapped addresses
- * on PF_INET6 sockets honor IPV6_MULTICAST_IF.
- * Even if no IPv6 ill exists for the ifindex
- * we need to check for an IPv4 ifindex in order
- * for this to work with mapped addresses. In that
- * case only set conn_multicast_ipif.
- */
- if (!checkonly) {
- if (ifindex == 0) {
- connp->conn_multicast_ill = NULL;
- connp->conn_multicast_ipif = NULL;
- } else if (ill != NULL) {
- connp->conn_multicast_ill = ill;
- }
- }
- break;
+ ire = ire_ftable_lookup_v6(&v6group, &ipv6_all_ones, 0,
+ IRE_HOST | IRE_INTERFACE, NULL, ALL_ZONES, NULL,
+ MATCH_IRE_MASK | MATCH_IRE_TYPE, 0, ipst, NULL);
+ }
+ if (ire != NULL) {
+ if (ire->ire_flags & RTF_MULTIRT) {
+ error = ip_multirt_apply_membership(optfn, ire, connp,
+ checkonly, &v6group, fmode, &ipv6_all_zeros);
+ done = B_TRUE;
}
+ ire_refrele(ire);
}
- if (ill != NULL) {
- mutex_exit(&ill->ill_lock);
- mutex_exit(&connp->conn_lock);
- ill_refrele(ill);
- return (0);
+ if (!done) {
+ error = optfn(connp, checkonly, &v6group, ifaddr, ifindex,
+ fmode, &ipv6_all_zeros);
}
- mutex_exit(&connp->conn_lock);
- /*
- * We succeeded in clearing the option (ifindex == 0) or failed to
- * locate the ill and could not set the option (ifindex != 0)
- */
- return (ifindex == 0 ? 0 : EINVAL);
+ return (error);
}
-/* This routine sets socket options. */
-/* ARGSUSED */
+/*
+ * Set socket options for joining and leaving multicast groups
+ * for specific sources.
+ * Common to IPv4 and IPv6; inet6 indicates the type of socket.
+ * The caller has already check that the option name is consistent with
+ * the address family of the socket.
+ */
int
-ip_opt_set(queue_t *q, uint_t optset_context, int level, int name,
- uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
- void *dummy, cred_t *cr, mblk_t *first_mp)
+ip_opt_set_multicast_sources(conn_t *connp, t_scalar_t name,
+ uchar_t *invalp, boolean_t inet6, boolean_t checkonly)
{
int *i1 = (int *)invalp;
- conn_t *connp = Q_TO_CONN(q);
int error = 0;
- boolean_t checkonly;
- ire_t *ire;
- boolean_t found;
ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
+ struct ip_mreq_source *imreqp;
+ struct group_source_req *gsreqp;
+ in6_addr_t v6group, v6src;
+ uint32_t ifindex;
+ ipaddr_t ifaddr;
+ boolean_t mcast_opt = B_TRUE;
+ mcast_record_t fmode;
+ ire_t *ire;
+ boolean_t done = B_FALSE;
+ int (*optfn)(conn_t *, boolean_t, const in6_addr_t *,
+ ipaddr_t, uint_t, mcast_record_t, const in6_addr_t *);
- switch (optset_context) {
-
- case SETFN_OPTCOM_CHECKONLY:
- checkonly = B_TRUE;
- /*
- * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
- * inlen != 0 implies value supplied and
- * we have to "pretend" to set it.
- * inlen == 0 implies that there is no
- * value part in T_CHECK request and just validation
- * done elsewhere should be enough, we just return here.
- */
- if (inlen == 0) {
- *outlenp = 0;
- return (0);
- }
- break;
- case SETFN_OPTCOM_NEGOTIATE:
- case SETFN_UD_NEGOTIATE:
- case SETFN_CONN_NEGOTIATE:
- checkonly = B_FALSE;
+ switch (name) {
+ case IP_BLOCK_SOURCE:
+ mcast_opt = B_FALSE;
+ /* FALLTHRU */
+ case MCAST_BLOCK_SOURCE:
+ fmode = MODE_IS_EXCLUDE;
+ optfn = ip_opt_add_group;
break;
- default:
- /*
- * We should never get here
- */
- *outlenp = 0;
- return (EINVAL);
- }
-
- ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
- (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
- /*
- * For fixed length options, no sanity check
- * of passed in length is done. It is assumed *_optcom_req()
- * routines do the right thing.
- */
-
- switch (level) {
- case SOL_SOCKET:
- /*
- * conn_lock protects the bitfields, and is used to
- * set the fields atomically.
- */
- switch (name) {
- case SO_BROADCAST:
- if (!checkonly) {
- /* TODO: use value someplace? */
- mutex_enter(&connp->conn_lock);
- connp->conn_broadcast = *i1 ? 1 : 0;
- mutex_exit(&connp->conn_lock);
- }
- break; /* goto sizeof (int) option return */
- case SO_USELOOPBACK:
- if (!checkonly) {
- /* TODO: use value someplace? */
- mutex_enter(&connp->conn_lock);
- connp->conn_loopback = *i1 ? 1 : 0;
- mutex_exit(&connp->conn_lock);
- }
- break; /* goto sizeof (int) option return */
- case SO_DONTROUTE:
- if (!checkonly) {
- mutex_enter(&connp->conn_lock);
- connp->conn_dontroute = *i1 ? 1 : 0;
- mutex_exit(&connp->conn_lock);
- }
- break; /* goto sizeof (int) option return */
- case SO_REUSEADDR:
- if (!checkonly) {
- mutex_enter(&connp->conn_lock);
- connp->conn_reuseaddr = *i1 ? 1 : 0;
- mutex_exit(&connp->conn_lock);
- }
- break; /* goto sizeof (int) option return */
- case SO_PROTOTYPE:
- if (!checkonly) {
- mutex_enter(&connp->conn_lock);
- connp->conn_proto = *i1;
- mutex_exit(&connp->conn_lock);
- }
- break; /* goto sizeof (int) option return */
- case SO_ALLZONES:
- if (!checkonly) {
- mutex_enter(&connp->conn_lock);
- if (IPCL_IS_BOUND(connp)) {
- mutex_exit(&connp->conn_lock);
- return (EINVAL);
- }
- connp->conn_allzones = *i1 != 0 ? 1 : 0;
- mutex_exit(&connp->conn_lock);
- }
- break; /* goto sizeof (int) option return */
- case SO_ANON_MLP:
- if (!checkonly) {
- mutex_enter(&connp->conn_lock);
- connp->conn_anon_mlp = *i1 != 0 ? 1 : 0;
- mutex_exit(&connp->conn_lock);
- }
- break; /* goto sizeof (int) option return */
- case SO_MAC_EXEMPT:
- if (secpolicy_net_mac_aware(cr) != 0 ||
- IPCL_IS_BOUND(connp))
- return (EACCES);
- if (!checkonly) {
- mutex_enter(&connp->conn_lock);
- connp->conn_mac_mode = *i1 != 0 ?
- CONN_MAC_AWARE : CONN_MAC_DEFAULT;
- mutex_exit(&connp->conn_lock);
- }
- break; /* goto sizeof (int) option return */
- case SO_MAC_IMPLICIT:
- if (secpolicy_net_mac_implicit(cr) != 0)
- return (EACCES);
- if (!checkonly) {
- mutex_enter(&connp->conn_lock);
- connp->conn_mac_mode = *i1 != 0 ?
- CONN_MAC_IMPLICIT : CONN_MAC_DEFAULT;
- mutex_exit(&connp->conn_lock);
- }
- break; /* goto sizeof (int) option return */
- default:
- /*
- * "soft" error (negative)
- * option not handled at this level
- * Note: Do not modify *outlenp
- */
- return (-EINVAL);
- }
+ case IP_UNBLOCK_SOURCE:
+ mcast_opt = B_FALSE;
+ /* FALLTHRU */
+ case MCAST_UNBLOCK_SOURCE:
+ fmode = MODE_IS_EXCLUDE;
+ optfn = ip_opt_delete_group;
break;
- case IPPROTO_IP:
- switch (name) {
- case IP_NEXTHOP:
- if (secpolicy_ip_config(cr, B_FALSE) != 0)
- return (EPERM);
- /* FALLTHRU */
- case IP_MULTICAST_IF: {
- ipaddr_t addr = *i1;
-
- error = ip_opt_set_ipif(connp, addr, checkonly, name,
- first_mp);
- if (error != 0)
- return (error);
- break; /* goto sizeof (int) option return */
- }
-
- case IP_MULTICAST_TTL:
- /* Recorded in transport above IP */
- *outvalp = *invalp;
- *outlenp = sizeof (uchar_t);
- return (0);
- case IP_MULTICAST_LOOP:
- if (!checkonly) {
- mutex_enter(&connp->conn_lock);
- connp->conn_multicast_loop = *invalp ? 1 : 0;
- mutex_exit(&connp->conn_lock);
- }
- *outvalp = *invalp;
- *outlenp = sizeof (uchar_t);
- return (0);
- case IP_ADD_MEMBERSHIP:
- case MCAST_JOIN_GROUP:
- case IP_DROP_MEMBERSHIP:
- case MCAST_LEAVE_GROUP: {
- struct ip_mreq *mreqp;
- struct group_req *greqp;
- ire_t *ire;
- boolean_t done = B_FALSE;
- ipaddr_t group, ifaddr;
- struct sockaddr_in *sin;
- uint32_t *ifindexp;
- boolean_t mcast_opt = B_TRUE;
- mcast_record_t fmode;
- int (*optfn)(conn_t *, boolean_t, ipaddr_t, ipaddr_t,
- uint_t *, mcast_record_t, ipaddr_t, mblk_t *);
-
- switch (name) {
- case IP_ADD_MEMBERSHIP:
- mcast_opt = B_FALSE;
- /* FALLTHRU */
- case MCAST_JOIN_GROUP:
- fmode = MODE_IS_EXCLUDE;
- optfn = ip_opt_add_group;
- break;
-
- case IP_DROP_MEMBERSHIP:
- mcast_opt = B_FALSE;
- /* FALLTHRU */
- case MCAST_LEAVE_GROUP:
- fmode = MODE_IS_INCLUDE;
- optfn = ip_opt_delete_group;
- break;
- }
-
- if (mcast_opt) {
- greqp = (struct group_req *)i1;
- sin = (struct sockaddr_in *)&greqp->gr_group;
- if (sin->sin_family != AF_INET) {
- *outlenp = 0;
- return (ENOPROTOOPT);
- }
- group = (ipaddr_t)sin->sin_addr.s_addr;
- ifaddr = INADDR_ANY;
- ifindexp = &greqp->gr_interface;
- } else {
- mreqp = (struct ip_mreq *)i1;
- group = (ipaddr_t)mreqp->imr_multiaddr.s_addr;
- ifaddr = (ipaddr_t)mreqp->imr_interface.s_addr;
- ifindexp = NULL;
- }
-
- /*
- * In the multirouting case, we need to replicate
- * the request on all interfaces that will take part
- * in replication. We do so because multirouting is
- * reflective, thus we will probably receive multi-
- * casts on those interfaces.
- * The ip_multirt_apply_membership() succeeds if the
- * operation succeeds on at least one interface.
- */
- ire = ire_ftable_lookup(group, IP_HOST_MASK, 0,
- IRE_HOST, NULL, NULL, ALL_ZONES, 0, NULL,
- MATCH_IRE_MASK | MATCH_IRE_TYPE, ipst);
- if (ire != NULL) {
- if (ire->ire_flags & RTF_MULTIRT) {
- error = ip_multirt_apply_membership(
- optfn, ire, connp, checkonly, group,
- fmode, INADDR_ANY, first_mp);
- done = B_TRUE;
- }
- ire_refrele(ire);
- }
- if (!done) {
- error = optfn(connp, checkonly, group, ifaddr,
- ifindexp, fmode, INADDR_ANY, first_mp);
- }
- if (error) {
- /*
- * EINPROGRESS is a soft error, needs retry
- * so don't make *outlenp zero.
- */
- if (error != EINPROGRESS)
- *outlenp = 0;
- return (error);
- }
- /* OK return - copy input buffer into output buffer */
- if (invalp != outvalp) {
- /* don't trust bcopy for identical src/dst */
- bcopy(invalp, outvalp, inlen);
- }
- *outlenp = inlen;
- return (0);
- }
- case IP_BLOCK_SOURCE:
- case IP_UNBLOCK_SOURCE:
- case IP_ADD_SOURCE_MEMBERSHIP:
- case IP_DROP_SOURCE_MEMBERSHIP:
- case MCAST_BLOCK_SOURCE:
- case MCAST_UNBLOCK_SOURCE:
- case MCAST_JOIN_SOURCE_GROUP:
- case MCAST_LEAVE_SOURCE_GROUP: {
- struct ip_mreq_source *imreqp;
- struct group_source_req *gsreqp;
- in_addr_t grp, src, ifaddr = INADDR_ANY;
- uint32_t ifindex = 0;
- mcast_record_t fmode;
- struct sockaddr_in *sin;
- ire_t *ire;
- boolean_t mcast_opt = B_TRUE, done = B_FALSE;
- int (*optfn)(conn_t *, boolean_t, ipaddr_t, ipaddr_t,
- uint_t *, mcast_record_t, ipaddr_t, mblk_t *);
-
- switch (name) {
- case IP_BLOCK_SOURCE:
- mcast_opt = B_FALSE;
- /* FALLTHRU */
- case MCAST_BLOCK_SOURCE:
- fmode = MODE_IS_EXCLUDE;
- optfn = ip_opt_add_group;
- break;
-
- case IP_UNBLOCK_SOURCE:
- mcast_opt = B_FALSE;
- /* FALLTHRU */
- case MCAST_UNBLOCK_SOURCE:
- fmode = MODE_IS_EXCLUDE;
- optfn = ip_opt_delete_group;
- break;
-
- case IP_ADD_SOURCE_MEMBERSHIP:
- mcast_opt = B_FALSE;
- /* FALLTHRU */
- case MCAST_JOIN_SOURCE_GROUP:
- fmode = MODE_IS_INCLUDE;
- optfn = ip_opt_add_group;
- break;
-
- case IP_DROP_SOURCE_MEMBERSHIP:
- mcast_opt = B_FALSE;
- /* FALLTHRU */
- case MCAST_LEAVE_SOURCE_GROUP:
- fmode = MODE_IS_INCLUDE;
- optfn = ip_opt_delete_group;
- break;
- }
-
- if (mcast_opt) {
- gsreqp = (struct group_source_req *)i1;
- if (gsreqp->gsr_group.ss_family != AF_INET) {
- *outlenp = 0;
- return (ENOPROTOOPT);
- }
- sin = (struct sockaddr_in *)&gsreqp->gsr_group;
- grp = (ipaddr_t)sin->sin_addr.s_addr;
- sin = (struct sockaddr_in *)&gsreqp->gsr_source;
- src = (ipaddr_t)sin->sin_addr.s_addr;
- ifindex = gsreqp->gsr_interface;
- } else {
- imreqp = (struct ip_mreq_source *)i1;
- grp = (ipaddr_t)imreqp->imr_multiaddr.s_addr;
- src = (ipaddr_t)imreqp->imr_sourceaddr.s_addr;
- ifaddr = (ipaddr_t)imreqp->imr_interface.s_addr;
- }
- /*
- * In the multirouting case, we need to replicate
- * the request as noted in the mcast cases above.
- */
- ire = ire_ftable_lookup(grp, IP_HOST_MASK, 0,
- IRE_HOST, NULL, NULL, ALL_ZONES, 0, NULL,
- MATCH_IRE_MASK | MATCH_IRE_TYPE, ipst);
- if (ire != NULL) {
- if (ire->ire_flags & RTF_MULTIRT) {
- error = ip_multirt_apply_membership(
- optfn, ire, connp, checkonly, grp,
- fmode, src, first_mp);
- done = B_TRUE;
- }
- ire_refrele(ire);
- }
- if (!done) {
- error = optfn(connp, checkonly, grp, ifaddr,
- &ifindex, fmode, src, first_mp);
- }
- if (error != 0) {
- /*
- * EINPROGRESS is a soft error, needs retry
- * so don't make *outlenp zero.
- */
- if (error != EINPROGRESS)
- *outlenp = 0;
- return (error);
- }
- /* OK return - copy input buffer into output buffer */
- if (invalp != outvalp) {
- bcopy(invalp, outvalp, inlen);
- }
- *outlenp = inlen;
- return (0);
- }
- case IP_SEC_OPT:
- error = ipsec_set_req(cr, connp, (ipsec_req_t *)invalp);
- if (error != 0) {
- *outlenp = 0;
- return (error);
- }
- break;
- case IP_HDRINCL:
- case IP_OPTIONS:
- case T_IP_OPTIONS:
- case IP_TOS:
- case T_IP_TOS:
- case IP_TTL:
- case IP_RECVDSTADDR:
- case IP_RECVOPTS:
- /* OK return - copy input buffer into output buffer */
- if (invalp != outvalp) {
- /* don't trust bcopy for identical src/dst */
- bcopy(invalp, outvalp, inlen);
- }
- *outlenp = inlen;
- return (0);
- case IP_RECVIF:
- /* Retrieve the inbound interface index */
- if (!checkonly) {
- mutex_enter(&connp->conn_lock);
- connp->conn_recvif = *i1 ? 1 : 0;
- mutex_exit(&connp->conn_lock);
- }
- break; /* goto sizeof (int) option return */
- case IP_RECVPKTINFO:
- if (!checkonly) {
- mutex_enter(&connp->conn_lock);
- connp->conn_ip_recvpktinfo = *i1 ? 1 : 0;
- mutex_exit(&connp->conn_lock);
- }
- break; /* goto sizeof (int) option return */
- case IP_RECVSLLA:
- /* Retrieve the source link layer address */
- if (!checkonly) {
- mutex_enter(&connp->conn_lock);
- connp->conn_recvslla = *i1 ? 1 : 0;
- mutex_exit(&connp->conn_lock);
- }
- break; /* goto sizeof (int) option return */
- case MRT_INIT:
- case MRT_DONE:
- case MRT_ADD_VIF:
- case MRT_DEL_VIF:
- case MRT_ADD_MFC:
- case MRT_DEL_MFC:
- case MRT_ASSERT:
- if ((error = secpolicy_ip_config(cr, B_FALSE)) != 0) {
- *outlenp = 0;
- return (error);
- }
- error = ip_mrouter_set((int)name, q, checkonly,
- (uchar_t *)invalp, inlen, first_mp);
- if (error) {
- *outlenp = 0;
- return (error);
- }
- /* OK return - copy input buffer into output buffer */
- if (invalp != outvalp) {
- /* don't trust bcopy for identical src/dst */
- bcopy(invalp, outvalp, inlen);
- }
- *outlenp = inlen;
- return (0);
- case IP_BOUND_IF:
- case IP_DHCPINIT_IF:
- error = ip_opt_set_ill(connp, *i1, B_FALSE, checkonly,
- level, name, first_mp);
- if (error != 0)
- return (error);
- break; /* goto sizeof (int) option return */
-
- case IP_UNSPEC_SRC:
- /* Allow sending with a zero source address */
- if (!checkonly) {
- mutex_enter(&connp->conn_lock);
- connp->conn_unspec_src = *i1 ? 1 : 0;
- mutex_exit(&connp->conn_lock);
- }
- break; /* goto sizeof (int) option return */
- default:
- /*
- * "soft" error (negative)
- * option not handled at this level
- * Note: Do not modify *outlenp
- */
- return (-EINVAL);
- }
+ case IP_ADD_SOURCE_MEMBERSHIP:
+ mcast_opt = B_FALSE;
+ /* FALLTHRU */
+ case MCAST_JOIN_SOURCE_GROUP:
+ fmode = MODE_IS_INCLUDE;
+ optfn = ip_opt_add_group;
break;
- case IPPROTO_IPV6:
- switch (name) {
- case IPV6_BOUND_IF:
- error = ip_opt_set_ill(connp, *i1, B_TRUE, checkonly,
- level, name, first_mp);
- if (error != 0)
- return (error);
- break; /* goto sizeof (int) option return */
- case IPV6_MULTICAST_IF:
- /*
- * The only possible errors are EINPROGRESS and
- * EINVAL. EINPROGRESS will be restarted and is not
- * a hard error. We call this option on both V4 and V6
- * If both return EINVAL, then this call returns
- * EINVAL. If at least one of them succeeds we
- * return success.
- */
- found = B_FALSE;
- error = ip_opt_set_ill(connp, *i1, B_TRUE, checkonly,
- level, name, first_mp);
- if (error == EINPROGRESS)
- return (error);
- if (error == 0)
- found = B_TRUE;
- error = ip_opt_set_ill(connp, *i1, B_FALSE, checkonly,
- IPPROTO_IP, IP_MULTICAST_IF, first_mp);
- if (error == 0)
- found = B_TRUE;
- if (!found)
- return (error);
- break; /* goto sizeof (int) option return */
-
- case IPV6_MULTICAST_HOPS:
- /* Recorded in transport above IP */
- break; /* goto sizeof (int) option return */
- case IPV6_MULTICAST_LOOP:
- if (!checkonly) {
- mutex_enter(&connp->conn_lock);
- connp->conn_multicast_loop = *i1;
- mutex_exit(&connp->conn_lock);
- }
- break; /* goto sizeof (int) option return */
- case IPV6_JOIN_GROUP:
- case MCAST_JOIN_GROUP:
- case IPV6_LEAVE_GROUP:
- case MCAST_LEAVE_GROUP: {
- struct ipv6_mreq *ip_mreqp;
- struct group_req *greqp;
- ire_t *ire;
- boolean_t done = B_FALSE;
- in6_addr_t groupv6;
- uint32_t ifindex;
- boolean_t mcast_opt = B_TRUE;
- mcast_record_t fmode;
- int (*optfn)(conn_t *, boolean_t, const in6_addr_t *,
- int, mcast_record_t, const in6_addr_t *, mblk_t *);
-
- switch (name) {
- case IPV6_JOIN_GROUP:
- mcast_opt = B_FALSE;
- /* FALLTHRU */
- case MCAST_JOIN_GROUP:
- fmode = MODE_IS_EXCLUDE;
- optfn = ip_opt_add_group_v6;
- break;
-
- case IPV6_LEAVE_GROUP:
- mcast_opt = B_FALSE;
- /* FALLTHRU */
- case MCAST_LEAVE_GROUP:
- fmode = MODE_IS_INCLUDE;
- optfn = ip_opt_delete_group_v6;
- break;
- }
+ case IP_DROP_SOURCE_MEMBERSHIP:
+ mcast_opt = B_FALSE;
+ /* FALLTHRU */
+ case MCAST_LEAVE_SOURCE_GROUP:
+ fmode = MODE_IS_INCLUDE;
+ optfn = ip_opt_delete_group;
+ break;
+ default:
+ ASSERT(0);
+ }
- if (mcast_opt) {
- struct sockaddr_in *sin;
- struct sockaddr_in6 *sin6;
- greqp = (struct group_req *)i1;
- if (greqp->gr_group.ss_family == AF_INET) {
- sin = (struct sockaddr_in *)
- &(greqp->gr_group);
- IN6_INADDR_TO_V4MAPPED(&sin->sin_addr,
- &groupv6);
- } else {
- sin6 = (struct sockaddr_in6 *)
- &(greqp->gr_group);
- groupv6 = sin6->sin6_addr;
- }
- ifindex = greqp->gr_interface;
- } else {
- ip_mreqp = (struct ipv6_mreq *)i1;
- groupv6 = ip_mreqp->ipv6mr_multiaddr;
- ifindex = ip_mreqp->ipv6mr_interface;
- }
- /*
- * In the multirouting case, we need to replicate
- * the request on all interfaces that will take part
- * in replication. We do so because multirouting is
- * reflective, thus we will probably receive multi-
- * casts on those interfaces.
- * The ip_multirt_apply_membership_v6() succeeds if
- * the operation succeeds on at least one interface.
- */
- ire = ire_ftable_lookup_v6(&groupv6, &ipv6_all_ones, 0,
- IRE_HOST, NULL, NULL, ALL_ZONES, 0, NULL,
- MATCH_IRE_MASK | MATCH_IRE_TYPE, ipst);
- if (ire != NULL) {
- if (ire->ire_flags & RTF_MULTIRT) {
- error = ip_multirt_apply_membership_v6(
- optfn, ire, connp, checkonly,
- &groupv6, fmode, &ipv6_all_zeros,
- first_mp);
- done = B_TRUE;
- }
- ire_refrele(ire);
- }
- if (!done) {
- error = optfn(connp, checkonly, &groupv6,
- ifindex, fmode, &ipv6_all_zeros, first_mp);
- }
- if (error) {
- /*
- * EINPROGRESS is a soft error, needs retry
- * so don't make *outlenp zero.
- */
- if (error != EINPROGRESS)
- *outlenp = 0;
- return (error);
- }
- /* OK return - copy input buffer into output buffer */
- if (invalp != outvalp) {
- /* don't trust bcopy for identical src/dst */
- bcopy(invalp, outvalp, inlen);
- }
- *outlenp = inlen;
- return (0);
- }
- case MCAST_BLOCK_SOURCE:
- case MCAST_UNBLOCK_SOURCE:
- case MCAST_JOIN_SOURCE_GROUP:
- case MCAST_LEAVE_SOURCE_GROUP: {
- struct group_source_req *gsreqp;
- in6_addr_t v6grp, v6src;
- uint32_t ifindex;
- mcast_record_t fmode;
- ire_t *ire;
- boolean_t done = B_FALSE;
- int (*optfn)(conn_t *, boolean_t, const in6_addr_t *,
- int, mcast_record_t, const in6_addr_t *, mblk_t *);
-
- switch (name) {
- case MCAST_BLOCK_SOURCE:
- fmode = MODE_IS_EXCLUDE;
- optfn = ip_opt_add_group_v6;
- break;
- case MCAST_UNBLOCK_SOURCE:
- fmode = MODE_IS_EXCLUDE;
- optfn = ip_opt_delete_group_v6;
- break;
- case MCAST_JOIN_SOURCE_GROUP:
- fmode = MODE_IS_INCLUDE;
- optfn = ip_opt_add_group_v6;
- break;
- case MCAST_LEAVE_SOURCE_GROUP:
- fmode = MODE_IS_INCLUDE;
- optfn = ip_opt_delete_group_v6;
- break;
- }
+ if (mcast_opt) {
+ gsreqp = (struct group_source_req *)i1;
+ ifindex = gsreqp->gsr_interface;
+ if (gsreqp->gsr_group.ss_family == AF_INET) {
+ struct sockaddr_in *s;
+ s = (struct sockaddr_in *)&gsreqp->gsr_group;
+ IN6_INADDR_TO_V4MAPPED(&s->sin_addr, &v6group);
+ s = (struct sockaddr_in *)&gsreqp->gsr_source;
+ IN6_INADDR_TO_V4MAPPED(&s->sin_addr, &v6src);
+ } else {
+ struct sockaddr_in6 *s6;
- gsreqp = (struct group_source_req *)i1;
- ifindex = gsreqp->gsr_interface;
- if (gsreqp->gsr_group.ss_family == AF_INET) {
- struct sockaddr_in *s;
- s = (struct sockaddr_in *)&gsreqp->gsr_group;
- IN6_INADDR_TO_V4MAPPED(&s->sin_addr, &v6grp);
- s = (struct sockaddr_in *)&gsreqp->gsr_source;
- IN6_INADDR_TO_V4MAPPED(&s->sin_addr, &v6src);
- } else {
- struct sockaddr_in6 *s6;
- s6 = (struct sockaddr_in6 *)&gsreqp->gsr_group;
- v6grp = s6->sin6_addr;
- s6 = (struct sockaddr_in6 *)&gsreqp->gsr_source;
- v6src = s6->sin6_addr;
- }
+ if (!inet6)
+ return (EINVAL); /* Not on INET socket */
- /*
- * In the multirouting case, we need to replicate
- * the request as noted in the mcast cases above.
- */
- ire = ire_ftable_lookup_v6(&v6grp, &ipv6_all_ones, 0,
- IRE_HOST, NULL, NULL, ALL_ZONES, 0, NULL,
- MATCH_IRE_MASK | MATCH_IRE_TYPE, ipst);
- if (ire != NULL) {
- if (ire->ire_flags & RTF_MULTIRT) {
- error = ip_multirt_apply_membership_v6(
- optfn, ire, connp, checkonly,
- &v6grp, fmode, &v6src, first_mp);
- done = B_TRUE;
- }
- ire_refrele(ire);
- }
- if (!done) {
- error = optfn(connp, checkonly, &v6grp,
- ifindex, fmode, &v6src, first_mp);
- }
- if (error != 0) {
- /*
- * EINPROGRESS is a soft error, needs retry
- * so don't make *outlenp zero.
- */
- if (error != EINPROGRESS)
- *outlenp = 0;
- return (error);
- }
- /* OK return - copy input buffer into output buffer */
- if (invalp != outvalp) {
- bcopy(invalp, outvalp, inlen);
- }
- *outlenp = inlen;
- return (0);
+ s6 = (struct sockaddr_in6 *)&gsreqp->gsr_group;
+ v6group = s6->sin6_addr;
+ s6 = (struct sockaddr_in6 *)&gsreqp->gsr_source;
+ v6src = s6->sin6_addr;
}
- case IPV6_UNICAST_HOPS:
- /* Recorded in transport above IP */
- break; /* goto sizeof (int) option return */
- case IPV6_UNSPEC_SRC:
- /* Allow sending with a zero source address */
- if (!checkonly) {
- mutex_enter(&connp->conn_lock);
- connp->conn_unspec_src = *i1 ? 1 : 0;
- mutex_exit(&connp->conn_lock);
- }
- break; /* goto sizeof (int) option return */
- case IPV6_RECVPKTINFO:
- if (!checkonly) {
- mutex_enter(&connp->conn_lock);
- connp->conn_ip_recvpktinfo = *i1 ? 1 : 0;
- mutex_exit(&connp->conn_lock);
- }
- break; /* goto sizeof (int) option return */
- case IPV6_RECVTCLASS:
- if (!checkonly) {
- if (*i1 < 0 || *i1 > 1) {
- return (EINVAL);
- }
- mutex_enter(&connp->conn_lock);
- connp->conn_ipv6_recvtclass = *i1;
- mutex_exit(&connp->conn_lock);
- }
- break;
- case IPV6_RECVPATHMTU:
- if (!checkonly) {
- if (*i1 < 0 || *i1 > 1) {
- return (EINVAL);
- }
- mutex_enter(&connp->conn_lock);
- connp->conn_ipv6_recvpathmtu = *i1;
- mutex_exit(&connp->conn_lock);
- }
- break;
- case IPV6_RECVHOPLIMIT:
- if (!checkonly) {
- mutex_enter(&connp->conn_lock);
- connp->conn_ipv6_recvhoplimit = *i1 ? 1 : 0;
- mutex_exit(&connp->conn_lock);
- }
- break; /* goto sizeof (int) option return */
- case IPV6_RECVHOPOPTS:
- if (!checkonly) {
- mutex_enter(&connp->conn_lock);
- connp->conn_ipv6_recvhopopts = *i1 ? 1 : 0;
- mutex_exit(&connp->conn_lock);
- }
- break; /* goto sizeof (int) option return */
- case IPV6_RECVDSTOPTS:
- if (!checkonly) {
- mutex_enter(&connp->conn_lock);
- connp->conn_ipv6_recvdstopts = *i1 ? 1 : 0;
- mutex_exit(&connp->conn_lock);
- }
- break; /* goto sizeof (int) option return */
- case IPV6_RECVRTHDR:
- if (!checkonly) {
- mutex_enter(&connp->conn_lock);
- connp->conn_ipv6_recvrthdr = *i1 ? 1 : 0;
- mutex_exit(&connp->conn_lock);
- }
- break; /* goto sizeof (int) option return */
- case IPV6_RECVRTHDRDSTOPTS:
- if (!checkonly) {
- mutex_enter(&connp->conn_lock);
- connp->conn_ipv6_recvrtdstopts = *i1 ? 1 : 0;
- mutex_exit(&connp->conn_lock);
- }
- break; /* goto sizeof (int) option return */
- case IPV6_PKTINFO:
- if (inlen == 0)
- return (-EINVAL); /* clearing option */
- error = ip6_set_pktinfo(cr, connp,
- (struct in6_pktinfo *)invalp);
- if (error != 0)
- *outlenp = 0;
- else
- *outlenp = inlen;
- return (error);
- case IPV6_NEXTHOP: {
- struct sockaddr_in6 *sin6;
-
- /* Verify that the nexthop is reachable */
- if (inlen == 0)
- return (-EINVAL); /* clearing option */
+ ifaddr = INADDR_ANY;
+ } else {
+ imreqp = (struct ip_mreq_source *)i1;
+ IN6_INADDR_TO_V4MAPPED(&imreqp->imr_multiaddr, &v6group);
+ IN6_INADDR_TO_V4MAPPED(&imreqp->imr_sourceaddr, &v6src);
+ ifaddr = (ipaddr_t)imreqp->imr_interface.s_addr;
+ ifindex = 0;
+ }
- sin6 = (struct sockaddr_in6 *)invalp;
- ire = ire_route_lookup_v6(&sin6->sin6_addr,
- 0, 0, 0, NULL, NULL, connp->conn_zoneid,
- NULL, MATCH_IRE_DEFAULT, ipst);
+ /*
+ * Handle src being mapped INADDR_ANY by changing it to unspecified.
+ */
+ if (IN6_IS_ADDR_V4MAPPED_ANY(&v6src))
+ v6src = ipv6_all_zeros;
- if (ire == NULL) {
- *outlenp = 0;
- return (EHOSTUNREACH);
- }
- ire_refrele(ire);
- return (-EINVAL);
- }
- case IPV6_SEC_OPT:
- error = ipsec_set_req(cr, connp, (ipsec_req_t *)invalp);
- if (error != 0) {
- *outlenp = 0;
- return (error);
- }
- break;
- case IPV6_SRC_PREFERENCES: {
- /*
- * This is implemented strictly in the ip module
- * (here and in tcp_opt_*() to accomodate tcp
- * sockets). Modules above ip pass this option
- * down here since ip is the only one that needs to
- * be aware of source address preferences.
- *
- * This socket option only affects connected
- * sockets that haven't already bound to a specific
- * IPv6 address. In other words, sockets that
- * don't call bind() with an address other than the
- * unspecified address and that call connect().
- * ip_bind_connected_v6() passes these preferences
- * to the ipif_select_source_v6() function.
- */
- if (inlen != sizeof (uint32_t))
- return (EINVAL);
- error = ip6_set_src_preferences(connp,
- *(uint32_t *)invalp);
- if (error != 0) {
- *outlenp = 0;
- return (error);
- } else {
- *outlenp = sizeof (uint32_t);
- }
- break;
- }
- case IPV6_V6ONLY:
- if (*i1 < 0 || *i1 > 1) {
- return (EINVAL);
- }
- mutex_enter(&connp->conn_lock);
- connp->conn_ipv6_v6only = *i1;
- mutex_exit(&connp->conn_lock);
- break;
- default:
- return (-EINVAL);
- }
- break;
- default:
- /*
- * "soft" error (negative)
- * option not handled at this level
- * Note: Do not modify *outlenp
- */
- return (-EINVAL);
- }
/*
- * Common case of return from an option that is sizeof (int)
+ * In the multirouting case, we need to replicate
+ * the request as noted in the mcast cases above.
*/
- *(int *)outvalp = *i1;
- *outlenp = sizeof (int);
- return (0);
-}
+ if (IN6_IS_ADDR_V4MAPPED(&v6group)) {
+ ipaddr_t group;
-/*
- * This routine gets default values of certain options whose default
- * values are maintained by protocol specific code
- */
-/* ARGSUSED */
-int
-ip_opt_default(queue_t *q, int level, int name, uchar_t *ptr)
-{
- int *i1 = (int *)ptr;
- ip_stack_t *ipst = CONNQ_TO_IPST(q);
+ IN6_V4MAPPED_TO_IPADDR(&v6group, group);
- switch (level) {
- case IPPROTO_IP:
- switch (name) {
- case IP_MULTICAST_TTL:
- *ptr = (uchar_t)IP_DEFAULT_MULTICAST_TTL;
- return (sizeof (uchar_t));
- case IP_MULTICAST_LOOP:
- *ptr = (uchar_t)IP_DEFAULT_MULTICAST_LOOP;
- return (sizeof (uchar_t));
- default:
- return (-1);
- }
- case IPPROTO_IPV6:
- switch (name) {
- case IPV6_UNICAST_HOPS:
- *i1 = ipst->ips_ipv6_def_hops;
- return (sizeof (int));
- case IPV6_MULTICAST_HOPS:
- *i1 = IP_DEFAULT_MULTICAST_TTL;
- return (sizeof (int));
- case IPV6_MULTICAST_LOOP:
- *i1 = IP_DEFAULT_MULTICAST_LOOP;
- return (sizeof (int));
- case IPV6_V6ONLY:
- *i1 = 1;
- return (sizeof (int));
- default:
- return (-1);
+ ire = ire_ftable_lookup_v4(group, IP_HOST_MASK, 0,
+ IRE_HOST | IRE_INTERFACE, NULL, ALL_ZONES, NULL,
+ MATCH_IRE_MASK | MATCH_IRE_TYPE, 0, ipst, NULL);
+ } else {
+ ire = ire_ftable_lookup_v6(&v6group, &ipv6_all_ones, 0,
+ IRE_HOST | IRE_INTERFACE, NULL, ALL_ZONES, NULL,
+ MATCH_IRE_MASK | MATCH_IRE_TYPE, 0, ipst, NULL);
+ }
+ if (ire != NULL) {
+ if (ire->ire_flags & RTF_MULTIRT) {
+ error = ip_multirt_apply_membership(optfn, ire, connp,
+ checkonly, &v6group, fmode, &v6src);
+ done = B_TRUE;
}
- default:
- return (-1);
+ ire_refrele(ire);
}
- /* NOTREACHED */
+ if (!done) {
+ error = optfn(connp, checkonly, &v6group, ifaddr, ifindex,
+ fmode, &v6src);
+ }
+ return (error);
}
/*
* Given a destination address and a pointer to where to put the information
* this routine fills in the mtuinfo.
+ * The socket must be connected.
+ * For sctp conn_faddr is the primary address.
*/
int
-ip_fill_mtuinfo(struct in6_addr *in6, in_port_t port,
- struct ip6_mtuinfo *mtuinfo, netstack_t *ns)
+ip_fill_mtuinfo(conn_t *connp, ip_xmit_attr_t *ixa, struct ip6_mtuinfo *mtuinfo)
{
- ire_t *ire;
- ip_stack_t *ipst = ns->netstack_ip;
+ uint32_t pmtu = IP_MAXPACKET;
+ uint_t scopeid;
- if (IN6_IS_ADDR_UNSPECIFIED(in6))
+ if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6))
return (-1);
+ /* In case we never sent or called ip_set_destination_v4/v6 */
+ if (ixa->ixa_ire != NULL)
+ pmtu = ip_get_pmtu(ixa);
+
+ if (ixa->ixa_flags & IXAF_SCOPEID_SET)
+ scopeid = ixa->ixa_scopeid;
+ else
+ scopeid = 0;
+
bzero(mtuinfo, sizeof (*mtuinfo));
mtuinfo->ip6m_addr.sin6_family = AF_INET6;
- mtuinfo->ip6m_addr.sin6_port = port;
- mtuinfo->ip6m_addr.sin6_addr = *in6;
+ mtuinfo->ip6m_addr.sin6_port = connp->conn_fport;
+ mtuinfo->ip6m_addr.sin6_addr = connp->conn_faddr_v6;
+ mtuinfo->ip6m_addr.sin6_scope_id = scopeid;
+ mtuinfo->ip6m_mtu = pmtu;
- ire = ire_cache_lookup_v6(in6, ALL_ZONES, NULL, ipst);
- if (ire != NULL) {
- mtuinfo->ip6m_mtu = ire->ire_max_frag;
- ire_refrele(ire);
- } else {
- mtuinfo->ip6m_mtu = IPV6_MIN_MTU;
- }
return (sizeof (struct ip6_mtuinfo));
}
-/*
- * This routine gets socket options. For MRT_VERSION and MRT_ASSERT, error
- * checking of cred and that ip_g_mrouter is set should be done and
- * isn't. This doesn't matter as the error checking is done properly for the
- * other MRT options coming in through ip_opt_set.
- */
-int
-ip_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
-{
- conn_t *connp = Q_TO_CONN(q);
- ipsec_req_t *req = (ipsec_req_t *)ptr;
-
- switch (level) {
- case IPPROTO_IP:
- switch (name) {
- case MRT_VERSION:
- case MRT_ASSERT:
- (void) ip_mrouter_get(name, q, ptr);
- return (sizeof (int));
- case IP_SEC_OPT:
- return (ipsec_req_from_conn(connp, req, IPSEC_AF_V4));
- case IP_NEXTHOP:
- if (connp->conn_nexthop_set) {
- *(ipaddr_t *)ptr = connp->conn_nexthop_v4;
- return (sizeof (ipaddr_t));
- } else
- return (0);
- case IP_RECVPKTINFO:
- *(int *)ptr = connp->conn_ip_recvpktinfo ? 1: 0;
- return (sizeof (int));
- default:
- break;
- }
- break;
- case IPPROTO_IPV6:
- switch (name) {
- case IPV6_SEC_OPT:
- return (ipsec_req_from_conn(connp, req, IPSEC_AF_V6));
- case IPV6_SRC_PREFERENCES: {
- return (ip6_get_src_preferences(connp,
- (uint32_t *)ptr));
- }
- case IPV6_V6ONLY:
- *(int *)ptr = connp->conn_ipv6_v6only ? 1 : 0;
- return (sizeof (int));
- case IPV6_PATHMTU:
- return (ip_fill_mtuinfo(&connp->conn_remv6, 0,
- (struct ip6_mtuinfo *)ptr, connp->conn_netstack));
- default:
- break;
- }
- break;
- default:
- break;
- }
- return (-1);
-}
/* Named Dispatch routine to get a current value out of our parameter table. */
/* ARGSUSED */
static int
@@ -11955,130 +7137,18 @@ ip_reassemble(mblk_t *mp, ipf_t *ipf, uint_t start, boolean_t more, ill_t *ill,
}
/*
- * ipsec processing for the fast path, used for input UDP Packets
- * Returns true if ready for passup to UDP.
- * Return false if packet is not passable to UDP (e.g. it failed IPsec policy,
- * was an ESP-in-UDP packet, etc.).
- */
-static boolean_t
-ip_udp_check(queue_t *q, conn_t *connp, ill_t *ill, ipha_t *ipha,
- mblk_t **mpp, mblk_t **first_mpp, boolean_t mctl_present, ire_t *ire)
-{
- uint32_t ill_index;
- uint_t in_flags; /* IPF_RECVSLLA and/or IPF_RECVIF */
- ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
- ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec;
- udp_t *udp = connp->conn_udp;
-
- ASSERT(ipha->ipha_protocol == IPPROTO_UDP);
- /* The ill_index of the incoming ILL */
- ill_index = ((ill_t *)q->q_ptr)->ill_phyint->phyint_ifindex;
-
- /* pass packet up to the transport */
- if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || mctl_present) {
- *first_mpp = ipsec_check_inbound_policy(*first_mpp, connp, ipha,
- NULL, mctl_present);
- if (*first_mpp == NULL) {
- return (B_FALSE);
- }
- }
-
- /* Initiate IPPF processing for fastpath UDP */
- if (IPP_ENABLED(IPP_LOCAL_IN, ipst)) {
- ip_process(IPP_LOCAL_IN, mpp, ill_index);
- if (*mpp == NULL) {
- ip2dbg(("ip_input_ipsec_process: UDP pkt "
- "deferred/dropped during IPPF processing\n"));
- return (B_FALSE);
- }
- }
- /*
- * Remove 0-spi if it's 0, or move everything behind
- * the UDP header over it and forward to ESP via
- * ip_proto_input().
- */
- if (udp->udp_nat_t_endpoint) {
- if (mctl_present) {
- /* mctl_present *shouldn't* happen. */
- ip_drop_packet(*first_mpp, B_TRUE, NULL,
- NULL, DROPPER(ipss, ipds_esp_nat_t_ipsec),
- &ipss->ipsec_dropper);
- *first_mpp = NULL;
- return (B_FALSE);
- }
-
- /* "ill" is "recv_ill" in actuality. */
- if (!zero_spi_check(q, *mpp, ire, ill, ipss))
- return (B_FALSE);
-
- /* Else continue like a normal UDP packet. */
- }
-
- /*
- * We make the checks as below since we are in the fast path
- * and want to minimize the number of checks if the IP_RECVIF and/or
- * IP_RECVSLLA and/or IPV6_RECVPKTINFO options are not set
- */
- if (connp->conn_recvif || connp->conn_recvslla ||
- connp->conn_ip_recvpktinfo) {
- if (connp->conn_recvif) {
- in_flags = IPF_RECVIF;
- }
- /*
- * UDP supports IP_RECVPKTINFO option for both v4 and v6
- * so the flag passed to ip_add_info is based on IP version
- * of connp.
- */
- if (connp->conn_ip_recvpktinfo) {
- if (connp->conn_af_isv6) {
- /*
- * V6 only needs index
- */
- in_flags |= IPF_RECVIF;
- } else {
- /*
- * V4 needs index + matching address.
- */
- in_flags |= IPF_RECVADDR;
- }
- }
- if (connp->conn_recvslla) {
- in_flags |= IPF_RECVSLLA;
- }
- /*
- * since in_flags are being set ill will be
- * referenced in ip_add_info, so it better not
- * be NULL.
- */
- /*
- * the actual data will be contained in b_cont
- * upon successful return of the following call.
- * If the call fails then the original mblk is
- * returned.
- */
- *mpp = ip_add_info(*mpp, ill, in_flags, IPCL_ZONEID(connp),
- ipst);
- }
-
- return (B_TRUE);
-}
-
-/*
* Fragmentation reassembly. Each ILL has a hash table for
* queuing packets undergoing reassembly for all IPIFs
* associated with the ILL. The hash is based on the packet
* IP ident field. The ILL frag hash table was allocated
* as a timer block at the time the ILL was created. Whenever
* there is anything on the reassembly queue, the timer will
- * be running. Returns B_TRUE if successful else B_FALSE;
- * frees mp on failure.
+ * be running. Returns the reassembled packet if reassembly completes.
*/
-static boolean_t
-ip_rput_fragment(ill_t *ill, ill_t *recv_ill, mblk_t **mpp, ipha_t *ipha,
- uint32_t *cksum_val, uint16_t *cksum_flags)
+mblk_t *
+ip_input_fragment(mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
{
uint32_t frag_offset_flags;
- mblk_t *mp = *mpp;
mblk_t *t_mp;
ipaddr_t dst;
uint8_t proto = ipha->ipha_protocol;
@@ -12099,12 +7169,8 @@ ip_rput_fragment(ill_t *ill, ill_t *recv_ill, mblk_t **mpp, ipha_t *ipha,
uint8_t ecn_info = 0;
uint32_t packet_size;
boolean_t pruned = B_FALSE;
- ip_stack_t *ipst = ill->ill_ipst;
-
- if (cksum_val != NULL)
- *cksum_val = 0;
- if (cksum_flags != NULL)
- *cksum_flags = 0;
+ ill_t *ill = ira->ira_ill;
+ ip_stack_t *ipst = ill->ill_ipst;
/*
* Drop the fragmented as early as possible, if
@@ -12112,13 +7178,13 @@ ip_rput_fragment(ill_t *ill, ill_t *recv_ill, mblk_t **mpp, ipha_t *ipha,
*/
if (ipst->ips_ip_reass_queue_bytes == 0) {
freemsg(mp);
- return (B_FALSE);
+ return (NULL);
}
/* Check for fragmentation offset; return if there's none */
if ((frag_offset_flags = ntohs(ipha->ipha_fragment_offset_and_flags) &
(IPH_MF | IPH_OFFSET)) == 0)
- return (B_TRUE);
+ return (mp);
/*
* We utilize hardware computed checksum info only for UDP since
@@ -12126,8 +7192,9 @@ ip_rput_fragment(ill_t *ill, ill_t *recv_ill, mblk_t **mpp, ipha_t *ipha,
* addition, checksum offload support for IP fragments carrying
* UDP payload is commonly implemented across network adapters.
*/
- ASSERT(recv_ill != NULL);
- if (proto == IPPROTO_UDP && dohwcksum && ILL_HCKSUM_CAPABLE(recv_ill) &&
+ ASSERT(ira->ira_rill != NULL);
+ if (proto == IPPROTO_UDP && dohwcksum &&
+ ILL_HCKSUM_CAPABLE(ira->ira_rill) &&
(DB_CKSUMFLAGS(mp) & (HCK_FULLCKSUM | HCK_PARTIALCKSUM))) {
mblk_t *mp1 = mp->b_cont;
int32_t len;
@@ -12178,7 +7245,7 @@ ip_rput_fragment(ill_t *ill, ill_t *recv_ill, mblk_t **mpp, ipha_t *ipha,
/* If end == 0 then we have a packet with no data, so just free it */
if (end == 0) {
freemsg(mp);
- return (B_FALSE);
+ return (NULL);
}
/* Record the ECN field info. */
@@ -12192,16 +7259,25 @@ ip_rput_fragment(ill_t *ill, ill_t *recv_ill, mblk_t **mpp, ipha_t *ipha,
end += offset;
}
- msg_len = MBLKSIZE(mp);
+ /* Handle vnic loopback of fragments */
+ if (mp->b_datap->db_ref > 2)
+ msg_len = 0;
+ else
+ msg_len = MBLKSIZE(mp);
+
tail_mp = mp;
while (tail_mp->b_cont != NULL) {
tail_mp = tail_mp->b_cont;
- msg_len += MBLKSIZE(tail_mp);
+ if (tail_mp->b_datap->db_ref <= 2)
+ msg_len += MBLKSIZE(tail_mp);
}
/* If the reassembly list for this ILL will get too big, prune it */
if ((msg_len + sizeof (*ipf) + ill->ill_frag_count) >=
ipst->ips_ip_reass_queue_bytes) {
+ DTRACE_PROBE3(ip_reass_queue_bytes, uint_t, msg_len,
+ uint_t, ill->ill_frag_count,
+ uint_t, ipst->ips_ip_reass_queue_bytes);
ill_frag_prune(ill,
(ipst->ips_ip_reass_queue_bytes < msg_len) ? 0 :
(ipst->ips_ip_reass_queue_bytes - msg_len));
@@ -12232,7 +7308,7 @@ ip_rput_fragment(ill_t *ill, ill_t *recv_ill, mblk_t **mpp, ipha_t *ipha,
ill_frag_free_pkts(ill, ipfb, ipf, 1);
freemsg(mp);
mutex_exit(&ipfb->ipfb_lock);
- return (B_FALSE);
+ return (NULL);
}
/* Found it. */
break;
@@ -12254,7 +7330,7 @@ ip_rput_fragment(ill_t *ill, ill_t *recv_ill, mblk_t **mpp, ipha_t *ipha,
if (pruned && offset != 0) {
mutex_exit(&ipfb->ipfb_lock);
freemsg(mp);
- return (B_FALSE);
+ return (NULL);
}
if (ipfb->ipfb_frag_pkts >= MAX_FRAG_PKTS(ipst)) {
@@ -12269,10 +7345,11 @@ ip_rput_fragment(ill_t *ill, ill_t *recv_ill, mblk_t **mpp, ipha_t *ipha,
mp1 = allocb(sizeof (*ipf), BPRI_MED);
if (mp1 == NULL) {
BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards", mp, ill);
freemsg(mp);
reass_done:
mutex_exit(&ipfb->ipfb_lock);
- return (B_FALSE);
+ return (NULL);
}
BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmReqds);
@@ -12478,19 +7555,22 @@ reass_done:
/* Restore original IP length in header. */
packet_size = (uint32_t)msgdsize(mp);
if (packet_size > IP_MAXPACKET) {
- freemsg(mp);
BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
- return (B_FALSE);
+ ip_drop_input("Reassembled packet too large", mp, ill);
+ freemsg(mp);
+ return (NULL);
}
if (DB_REF(mp) > 1) {
mblk_t *mp2 = copymsg(mp);
- freemsg(mp);
if (mp2 == NULL) {
BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- return (B_FALSE);
+ ip_drop_input("ipIfStatsInDiscards", mp, ill);
+ freemsg(mp);
+ return (NULL);
}
+ freemsg(mp);
mp = mp2;
}
ipha = (ipha_t *)mp->b_rptr;
@@ -12501,1187 +7581,239 @@ reass_done:
/* Record the ECN info. */
ipha->ipha_type_of_service &= 0xFC;
ipha->ipha_type_of_service |= ecn_info;
- *mpp = mp;
- /* Reassembly is successful; return checksum information if needed */
- if (cksum_val != NULL)
- *cksum_val = sum_val;
- if (cksum_flags != NULL)
- *cksum_flags = sum_flags;
+ /* Update the receive attributes */
+ ira->ira_pktlen = packet_size;
+ ira->ira_ip_hdr_length = IPH_HDR_LENGTH(ipha);
- return (B_TRUE);
+ /* Reassembly is successful; set checksum information in packet */
+ DB_CKSUM16(mp) = (uint16_t)sum_val;
+ DB_CKSUMFLAGS(mp) = sum_flags;
+ DB_CKSUMSTART(mp) = ira->ira_ip_hdr_length;
+
+ return (mp);
}
/*
- * Perform ip header check sum update local options.
- * return B_TRUE if all is well, else return B_FALSE and release
- * the mp. caller is responsible for decrementing ire ref cnt.
+ * Pullup function that should be used for IP input in order to
+ * ensure we do not loose the L2 source address; we need the l2 source
+ * address for IP_RECVSLLA and for ndp_input.
+ *
+ * We return either NULL or b_rptr.
*/
-static boolean_t
-ip_options_cksum(queue_t *q, ill_t *ill, mblk_t *mp, ipha_t *ipha, ire_t *ire,
- ip_stack_t *ipst)
+void *
+ip_pullup(mblk_t *mp, ssize_t len, ip_recv_attr_t *ira)
{
- mblk_t *first_mp;
- boolean_t mctl_present;
- uint16_t sum;
+ ill_t *ill = ira->ira_ill;
- EXTRACT_PKT_MP(mp, first_mp, mctl_present);
- /*
- * Don't do the checksum if it has gone through AH/ESP
- * processing.
- */
- if (!mctl_present) {
- sum = ip_csum_hdr(ipha);
- if (sum != 0) {
- if (ill != NULL) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs);
- } else {
- BUMP_MIB(&ipst->ips_ip_mib,
- ipIfStatsInCksumErrs);
- }
- freemsg(first_mp);
- return (B_FALSE);
- }
+ if (ip_rput_pullups++ == 0) {
+ (void) mi_strlog(ill->ill_rq, 1, SL_ERROR|SL_TRACE,
+ "ip_pullup: %s forced us to "
+ " pullup pkt, hdr len %ld, hdr addr %p",
+ ill->ill_name, len, (void *)mp->b_rptr);
}
-
- if (!ip_rput_local_options(q, mp, ipha, ire, ipst)) {
- if (mctl_present)
- freeb(first_mp);
- return (B_FALSE);
- }
-
- return (B_TRUE);
+ if (!(ira->ira_flags & IRAF_L2SRC_SET))
+ ip_setl2src(mp, ira, ira->ira_rill);
+ ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
+ if (!pullupmsg(mp, len))
+ return (NULL);
+ else
+ return (mp->b_rptr);
}
/*
- * All udp packet are delivered to the local host via this routine.
+ * Make sure ira_l2src has an address. If we don't have one fill with zeros.
+ * When called from the ULP ira_rill will be NULL hence the caller has to
+ * pass in the ill.
*/
+/* ARGSUSED */
void
-ip_udp_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire,
- ill_t *recv_ill)
+ip_setl2src(mblk_t *mp, ip_recv_attr_t *ira, ill_t *ill)
{
- uint32_t sum;
- uint32_t u1;
- boolean_t mctl_present;
- conn_t *connp;
- mblk_t *first_mp;
- uint16_t *up;
- ill_t *ill = (ill_t *)q->q_ptr;
- uint16_t reass_hck_flags = 0;
- ip_stack_t *ipst;
-
- ASSERT(recv_ill != NULL);
- ipst = recv_ill->ill_ipst;
+ const uchar_t *addr;
+ int alen;
-#define rptr ((uchar_t *)ipha)
+ if (ira->ira_flags & IRAF_L2SRC_SET)
+ return;
- EXTRACT_PKT_MP(mp, first_mp, mctl_present);
- ASSERT(!mctl_present || ipsec_in_is_secure(first_mp));
- ASSERT(ipha->ipha_protocol == IPPROTO_UDP);
ASSERT(ill != NULL);
-
- /*
- * FAST PATH for udp packets
- */
-
- /* u1 is # words of IP options */
- u1 = ipha->ipha_version_and_hdr_length - (uchar_t)((IP_VERSION << 4) +
- IP_SIMPLE_HDR_LENGTH_IN_WORDS);
-
- /* IP options present */
- if (u1 != 0)
- goto ipoptions;
-
- /* Check the IP header checksum. */
- if (IS_IP_HDR_HWCKSUM(mctl_present, mp, recv_ill)) {
- /* Clear the IP header h/w cksum flag */
- DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM;
- } else if (!mctl_present) {
- /*
- * Don't verify header checksum if this packet is coming
- * back from AH/ESP as we already did it.
- */
-#define uph ((uint16_t *)ipha)
- sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] + uph[5] +
- uph[6] + uph[7] + uph[8] + uph[9];
-#undef uph
- /* finish doing IP checksum */
- sum = (sum & 0xFFFF) + (sum >> 16);
- sum = ~(sum + (sum >> 16)) & 0xFFFF;
- if (sum != 0 && sum != 0xFFFF) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs);
- freemsg(first_mp);
- return;
- }
- }
-
- /*
- * Count for SNMP of inbound packets for ire.
- * if mctl is present this might be a secure packet and
- * has already been counted for in ip_proto_input().
- */
- if (!mctl_present) {
- UPDATE_IB_PKT_COUNT(ire);
- ire->ire_last_used_time = lbolt;
+ alen = ill->ill_phys_addr_length;
+ ASSERT(alen <= sizeof (ira->ira_l2src));
+ if (ira->ira_mhip != NULL &&
+ (addr = ira->ira_mhip->mhi_saddr) != NULL) {
+ bcopy(addr, ira->ira_l2src, alen);
+ } else if ((ira->ira_flags & IRAF_L2SRC_LOOPBACK) &&
+ (addr = ill->ill_phys_addr) != NULL) {
+ bcopy(addr, ira->ira_l2src, alen);
+ } else {
+ bzero(ira->ira_l2src, alen);
}
+ ira->ira_flags |= IRAF_L2SRC_SET;
+}
- /* packet part of fragmented IP packet? */
- u1 = ntohs(ipha->ipha_fragment_offset_and_flags);
- if (u1 & (IPH_MF | IPH_OFFSET)) {
- goto fragmented;
- }
+/*
+ * check ip header length and align it.
+ */
+mblk_t *
+ip_check_and_align_header(mblk_t *mp, uint_t min_size, ip_recv_attr_t *ira)
+{
+ ill_t *ill = ira->ira_ill;
+ ssize_t len;
- /* u1 = IP header length (20 bytes) */
- u1 = IP_SIMPLE_HDR_LENGTH;
+ len = MBLKL(mp);
- /* packet does not contain complete IP & UDP headers */
- if ((mp->b_wptr - rptr) < (IP_SIMPLE_HDR_LENGTH + UDPH_SIZE))
- goto udppullup;
+ if (!OK_32PTR(mp->b_rptr))
+ IP_STAT(ill->ill_ipst, ip_notaligned);
+ else
+ IP_STAT(ill->ill_ipst, ip_recv_pullup);
- /* up points to UDP header */
- up = (uint16_t *)((uchar_t *)ipha + IP_SIMPLE_HDR_LENGTH);
-#define iphs ((uint16_t *)ipha)
+ /* Guard against bogus device drivers */
+ if (len < 0) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
+ ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
+ freemsg(mp);
+ return (NULL);
+ }
- /* if udp hdr cksum != 0, then need to checksum udp packet */
- if (up[3] != 0) {
+ if (len == 0) {
+ /* GLD sometimes sends up mblk with b_rptr == b_wptr! */
mblk_t *mp1 = mp->b_cont;
- boolean_t cksum_err;
- uint16_t hck_flags = 0;
- /* Pseudo-header checksum */
- u1 = IP_UDP_CSUM_COMP + iphs[6] + iphs[7] + iphs[8] +
- iphs[9] + up[2];
+ if (!(ira->ira_flags & IRAF_L2SRC_SET))
+ ip_setl2src(mp, ira, ira->ira_rill);
+ ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
- /*
- * Revert to software checksum calculation if the interface
- * isn't capable of checksum offload or if IPsec is present.
- */
- if (ILL_HCKSUM_CAPABLE(recv_ill) && !mctl_present && dohwcksum)
- hck_flags = DB_CKSUMFLAGS(mp);
-
- if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0)
- IP_STAT(ipst, ip_in_sw_cksum);
-
- IP_CKSUM_RECV(hck_flags, u1,
- (uchar_t *)(rptr + DB_CKSUMSTART(mp)),
- (int32_t)((uchar_t *)up - rptr),
- mp, mp1, cksum_err);
-
- if (cksum_err) {
- BUMP_MIB(ill->ill_ip_mib, udpIfStatsInCksumErrs);
- if (hck_flags & HCK_FULLCKSUM)
- IP_STAT(ipst, ip_udp_in_full_hw_cksum_err);
- else if (hck_flags & HCK_PARTIALCKSUM)
- IP_STAT(ipst, ip_udp_in_part_hw_cksum_err);
- else
- IP_STAT(ipst, ip_udp_in_sw_cksum_err);
+ freeb(mp);
+ mp = mp1;
+ if (mp == NULL)
+ return (NULL);
- freemsg(first_mp);
- return;
- }
+ if (OK_32PTR(mp->b_rptr) && MBLKL(mp) >= min_size)
+ return (mp);
}
-
- /* Non-fragmented broadcast or multicast packet? */
- if (ire->ire_type == IRE_BROADCAST)
- goto udpslowpath;
-
- if ((connp = ipcl_classify_v4(mp, IPPROTO_UDP, IP_SIMPLE_HDR_LENGTH,
- ire->ire_zoneid, ipst)) != NULL) {
- ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_upq != NULL);
- IP_STAT(ipst, ip_udp_fast_path);
-
- if ((IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) ||
- (!IPCL_IS_NONSTR(connp) && CONN_UDP_FLOWCTLD(connp))) {
- freemsg(mp);
- BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows);
+ if (ip_pullup(mp, min_size, ira) == NULL) {
+ if (msgdsize(mp) < min_size) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
+ ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
} else {
- if (!mctl_present) {
- BUMP_MIB(ill->ill_ip_mib,
- ipIfStatsHCInDelivers);
- }
- /*
- * mp and first_mp can change.
- */
- if (ip_udp_check(q, connp, recv_ill,
- ipha, &mp, &first_mp, mctl_present, ire)) {
- /* Send it upstream */
- (connp->conn_recv)(connp, mp, NULL);
- }
- }
- /*
- * freeb() cannot deal with null mblk being passed
- * in and first_mp can be set to null in the call
- * ipsec_input_fast_proc()->ipsec_check_inbound_policy.
- */
- if (mctl_present && first_mp != NULL) {
- freeb(first_mp);
- }
- CONN_DEC_REF(connp);
- return;
- }
-
- /*
- * if we got here we know the packet is not fragmented and
- * has no options. The classifier could not find a conn_t and
- * most likely its an icmp packet so send it through slow path.
- */
-
- goto udpslowpath;
-
-ipoptions:
- if (!ip_options_cksum(q, ill, mp, ipha, ire, ipst)) {
- goto slow_done;
- }
-
- UPDATE_IB_PKT_COUNT(ire);
- ire->ire_last_used_time = lbolt;
- u1 = ntohs(ipha->ipha_fragment_offset_and_flags);
- if (u1 & (IPH_MF | IPH_OFFSET)) {
-fragmented:
- /*
- * "sum" and "reass_hck_flags" are non-zero if the
- * reassembled packet has a valid hardware computed
- * checksum information associated with it.
- */
- if (!ip_rput_fragment(ill, recv_ill, &mp, ipha, &sum,
- &reass_hck_flags)) {
- goto slow_done;
- }
-
- /*
- * Make sure that first_mp points back to mp as
- * the mp we came in with could have changed in
- * ip_rput_fragment().
- */
- ASSERT(!mctl_present);
- ipha = (ipha_t *)mp->b_rptr;
- first_mp = mp;
- }
-
- /* Now we have a complete datagram, destined for this machine. */
- u1 = IPH_HDR_LENGTH(ipha);
- /* Pull up the UDP header, if necessary. */
- if ((MBLKL(mp)) < (u1 + UDPH_SIZE)) {
-udppullup:
- if (!pullupmsg(mp, u1 + UDPH_SIZE)) {
BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- freemsg(first_mp);
- goto slow_done;
- }
- ipha = (ipha_t *)mp->b_rptr;
- }
-
- /*
- * Validate the checksum for the reassembled packet; for the
- * pullup case we calculate the payload checksum in software.
- */
- up = (uint16_t *)((uchar_t *)ipha + u1 + UDP_PORTS_OFFSET);
- if (up[3] != 0) {
- boolean_t cksum_err;
-
- if ((reass_hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0)
- IP_STAT(ipst, ip_in_sw_cksum);
-
- IP_CKSUM_RECV_REASS(reass_hck_flags,
- (int32_t)((uchar_t *)up - (uchar_t *)ipha),
- IP_UDP_CSUM_COMP + iphs[6] + iphs[7] + iphs[8] +
- iphs[9] + up[2], sum, cksum_err);
-
- if (cksum_err) {
- BUMP_MIB(ill->ill_ip_mib, udpIfStatsInCksumErrs);
-
- if (reass_hck_flags & HCK_FULLCKSUM)
- IP_STAT(ipst, ip_udp_in_full_hw_cksum_err);
- else if (reass_hck_flags & HCK_PARTIALCKSUM)
- IP_STAT(ipst, ip_udp_in_part_hw_cksum_err);
- else
- IP_STAT(ipst, ip_udp_in_sw_cksum_err);
-
- freemsg(first_mp);
- goto slow_done;
+ ip_drop_input("ipIfStatsInDiscards", mp, ill);
}
+ freemsg(mp);
+ return (NULL);
}
-udpslowpath:
-
- /* Clear hardware checksum flag to be safe */
- DB_CKSUMFLAGS(mp) = 0;
-
- ip_fanout_udp(q, first_mp, ill, ipha, *(uint32_t *)up,
- (ire->ire_type == IRE_BROADCAST),
- IP_FF_SEND_ICMP | IP_FF_CKSUM | IP_FF_IPINFO,
- mctl_present, B_TRUE, recv_ill, ire->ire_zoneid);
-
-slow_done:
- IP_STAT(ipst, ip_udp_slow_path);
- return;
-
-#undef iphs
-#undef rptr
-}
-
-static boolean_t
-ip_iptun_input(mblk_t *ipsec_mp, mblk_t *data_mp, ipha_t *ipha, ill_t *ill,
- ire_t *ire, ip_stack_t *ipst)
-{
- conn_t *connp;
-
- ASSERT(ipsec_mp == NULL || ipsec_mp->b_cont == data_mp);
-
- if ((connp = ipcl_classify_v4(data_mp, ipha->ipha_protocol,
- IP_SIMPLE_HDR_LENGTH, ire->ire_zoneid, ipst)) != NULL) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
- connp->conn_recv(connp, ipsec_mp != NULL ? ipsec_mp : data_mp,
- NULL);
- CONN_DEC_REF(connp);
- return (B_TRUE);
- }
- return (B_FALSE);
+ return (mp);
}
-/* ARGSUSED */
-static mblk_t *
-ip_tcp_input(mblk_t *mp, ipha_t *ipha, ill_t *recv_ill, boolean_t mctl_present,
- ire_t *ire, mblk_t *first_mp, uint_t flags, queue_t *q,
- ill_rx_ring_t *ill_ring)
+/*
+ * Common code for IPv4 and IPv6 to check and pullup multi-mblks
+ */
+mblk_t *
+ip_check_length(mblk_t *mp, uchar_t *rptr, ssize_t len, uint_t pkt_len,
+ uint_t min_size, ip_recv_attr_t *ira)
{
- conn_t *connp;
- uint32_t sum;
- uint32_t u1;
- uint16_t *up;
- int offset;
- ssize_t len;
- mblk_t *mp1;
- boolean_t syn_present = B_FALSE;
- tcph_t *tcph;
- uint_t tcph_flags;
- uint_t ip_hdr_len;
- ill_t *ill = (ill_t *)q->q_ptr;
- zoneid_t zoneid = ire->ire_zoneid;
- boolean_t cksum_err;
- uint16_t hck_flags = 0;
- ip_stack_t *ipst = recv_ill->ill_ipst;
- ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec;
-
-#define rptr ((uchar_t *)ipha)
-
- ASSERT(ipha->ipha_protocol == IPPROTO_TCP);
- ASSERT(ill != NULL);
-
- /*
- * FAST PATH for tcp packets
- */
-
- /* u1 is # words of IP options */
- u1 = ipha->ipha_version_and_hdr_length - (uchar_t)((IP_VERSION << 4)
- + IP_SIMPLE_HDR_LENGTH_IN_WORDS);
-
- /* IP options present */
- if (u1) {
- goto ipoptions;
- } else if (!mctl_present) {
- /* Check the IP header checksum. */
- if (IS_IP_HDR_HWCKSUM(mctl_present, mp, recv_ill)) {
- /* Clear the IP header h/w cksum flag */
- DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM;
- } else if (!mctl_present) {
- /*
- * Don't verify header checksum if this packet
- * is coming back from AH/ESP as we already did it.
- */
-#define uph ((uint16_t *)ipha)
- sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] +
- uph[5] + uph[6] + uph[7] + uph[8] + uph[9];
-#undef uph
- /* finish doing IP checksum */
- sum = (sum & 0xFFFF) + (sum >> 16);
- sum = ~(sum + (sum >> 16)) & 0xFFFF;
- if (sum != 0 && sum != 0xFFFF) {
- BUMP_MIB(ill->ill_ip_mib,
- ipIfStatsInCksumErrs);
- goto error;
- }
- }
- }
-
- if (!mctl_present) {
- UPDATE_IB_PKT_COUNT(ire);
- ire->ire_last_used_time = lbolt;
- }
-
- /* packet part of fragmented IP packet? */
- u1 = ntohs(ipha->ipha_fragment_offset_and_flags);
- if (u1 & (IPH_MF | IPH_OFFSET)) {
- goto fragmented;
- }
-
- /* u1 = IP header length (20 bytes) */
- u1 = ip_hdr_len = IP_SIMPLE_HDR_LENGTH;
-
- /* does packet contain IP+TCP headers? */
- len = mp->b_wptr - rptr;
- if (len < (IP_SIMPLE_HDR_LENGTH + TCP_MIN_HEADER_LENGTH)) {
- IP_STAT(ipst, ip_tcppullup);
- goto tcppullup;
- }
-
- /* TCP options present? */
- offset = ((uchar_t *)ipha)[IP_SIMPLE_HDR_LENGTH + 12] >> 4;
-
- /*
- * If options need to be pulled up, then goto tcpoptions.
- * otherwise we are still in the fast path
- */
- if (len < (offset << 2) + IP_SIMPLE_HDR_LENGTH) {
- IP_STAT(ipst, ip_tcpoptions);
- goto tcpoptions;
- }
-
- /* multiple mblks of tcp data? */
- if ((mp1 = mp->b_cont) != NULL) {
- IP_STAT(ipst, ip_multipkttcp);
- len += msgdsize(mp1);
- }
-
- up = (uint16_t *)(rptr + IP_SIMPLE_HDR_LENGTH + TCP_PORTS_OFFSET);
-
- /* part of pseudo checksum */
-
- /* TCP datagram length */
- u1 = len - IP_SIMPLE_HDR_LENGTH;
-
-#define iphs ((uint16_t *)ipha)
-
-#ifdef _BIG_ENDIAN
- u1 += IPPROTO_TCP;
-#else
- u1 = ((u1 >> 8) & 0xFF) + (((u1 & 0xFF) + IPPROTO_TCP) << 8);
-#endif
- u1 += iphs[6] + iphs[7] + iphs[8] + iphs[9];
-
- /*
- * Revert to software checksum calculation if the interface
- * isn't capable of checksum offload or if IPsec is present.
- */
- if (ILL_HCKSUM_CAPABLE(recv_ill) && !mctl_present && dohwcksum)
- hck_flags = DB_CKSUMFLAGS(mp);
-
- if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0)
- IP_STAT(ipst, ip_in_sw_cksum);
-
- IP_CKSUM_RECV(hck_flags, u1,
- (uchar_t *)(rptr + DB_CKSUMSTART(mp)),
- (int32_t)((uchar_t *)up - rptr),
- mp, mp1, cksum_err);
-
- if (cksum_err) {
- BUMP_MIB(ill->ill_ip_mib, tcpIfStatsInErrs);
-
- if (hck_flags & HCK_FULLCKSUM)
- IP_STAT(ipst, ip_tcp_in_full_hw_cksum_err);
- else if (hck_flags & HCK_PARTIALCKSUM)
- IP_STAT(ipst, ip_tcp_in_part_hw_cksum_err);
- else
- IP_STAT(ipst, ip_tcp_in_sw_cksum_err);
-
- goto error;
- }
-
-try_again:
-
- if ((connp = ipcl_classify_v4(mp, IPPROTO_TCP, ip_hdr_len,
- zoneid, ipst)) == NULL) {
- /* Send the TH_RST */
- goto no_conn;
- }
-
- tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len];
- tcph_flags = tcph->th_flags[0] & (TH_SYN|TH_ACK|TH_RST|TH_URG);
+ ill_t *ill = ira->ira_ill;
/*
- * TCP FAST PATH for AF_INET socket.
- *
- * TCP fast path to avoid extra work. An AF_INET socket type
- * does not have facility to receive extra information via
- * ip_process or ip_add_info. Also, when the connection was
- * established, we made a check if this connection is impacted
- * by any global IPsec policy or per connection policy (a
- * policy that comes in effect later will not apply to this
- * connection). Since all this can be determined at the
- * connection establishment time, a quick check of flags
- * can avoid extra work.
+ * Make sure we have data length consistent
+ * with the IP header.
*/
- if (IPCL_IS_TCP4_CONNECTED_NO_POLICY(connp) && !mctl_present &&
- !IPP_ENABLED(IPP_LOCAL_IN, ipst)) {
- ASSERT(first_mp == mp);
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
- if (tcph_flags != (TH_SYN | TH_ACK)) {
- SET_SQUEUE(mp, tcp_rput_data, connp);
- return (mp);
- }
- mp->b_datap->db_struioflag |= STRUIO_CONNECT;
- DB_CKSUMSTART(mp) = (intptr_t)ip_squeue_get(ill_ring);
- SET_SQUEUE(mp, tcp_input, connp);
- return (mp);
- }
-
- if (tcph_flags == TH_SYN) {
- if (IPCL_IS_TCP(connp)) {
- mp->b_datap->db_struioflag |= STRUIO_EAGER;
- DB_CKSUMSTART(mp) =
- (intptr_t)ip_squeue_get(ill_ring);
- if (IPCL_IS_FULLY_BOUND(connp) && !mctl_present &&
- !CONN_INBOUND_POLICY_PRESENT(connp, ipss)) {
- BUMP_MIB(ill->ill_ip_mib,
- ipIfStatsHCInDelivers);
- SET_SQUEUE(mp, connp->conn_recv, connp);
- return (mp);
- } else if (IPCL_IS_BOUND(connp) && !mctl_present &&
- !CONN_INBOUND_POLICY_PRESENT(connp, ipss)) {
- BUMP_MIB(ill->ill_ip_mib,
- ipIfStatsHCInDelivers);
- ip_squeue_enter_unbound++;
- SET_SQUEUE(mp, tcp_conn_request_unbound,
- connp);
- return (mp);
- }
- syn_present = B_TRUE;
- }
- }
-
- if (IPCL_IS_TCP(connp) && IPCL_IS_BOUND(connp) && !syn_present) {
- uint_t flags = (unsigned int)tcph->th_flags[0] & 0xFF;
-
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
- /* No need to send this packet to TCP */
- if ((flags & TH_RST) || (flags & TH_URG)) {
- CONN_DEC_REF(connp);
- freemsg(first_mp);
- return (NULL);
- }
- if (flags & TH_ACK) {
- ip_xmit_reset_serialize(first_mp, ip_hdr_len, zoneid,
- ipst->ips_netstack->netstack_tcp, connp);
- CONN_DEC_REF(connp);
- return (NULL);
- }
-
- CONN_DEC_REF(connp);
- freemsg(first_mp);
- return (NULL);
- }
-
- if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || mctl_present) {
- first_mp = ipsec_check_inbound_policy(first_mp, connp,
- ipha, NULL, mctl_present);
- if (first_mp == NULL) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- CONN_DEC_REF(connp);
+ if (mp->b_cont == NULL) {
+ /* pkt_len is based on ipha_len, not the mblk length */
+ if (pkt_len < min_size) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
+ ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
+ freemsg(mp);
return (NULL);
}
- if (IPCL_IS_TCP(connp) && IPCL_IS_BOUND(connp)) {
- ASSERT(syn_present);
- if (mctl_present) {
- ASSERT(first_mp != mp);
- first_mp->b_datap->db_struioflag |=
- STRUIO_POLICY;
- } else {
- ASSERT(first_mp == mp);
- mp->b_datap->db_struioflag &= ~STRUIO_EAGER;
- mp->b_datap->db_struioflag |= STRUIO_POLICY;
- }
- } else {
- /*
- * Discard first_mp early since we're dealing with a
- * fully-connected conn_t and tcp doesn't do policy in
- * this case.
- */
- if (mctl_present) {
- freeb(first_mp);
- mctl_present = B_FALSE;
- }
- first_mp = mp;
- }
- }
-
- /* Initiate IPPF processing for fastpath */
- if (IPP_ENABLED(IPP_LOCAL_IN, ipst)) {
- uint32_t ill_index;
-
- ill_index = recv_ill->ill_phyint->phyint_ifindex;
- ip_process(IPP_LOCAL_IN, &mp, ill_index);
- if (mp == NULL) {
- ip2dbg(("ip_input_ipsec_process: TCP pkt "
- "deferred/dropped during IPPF processing\n"));
- CONN_DEC_REF(connp);
- if (mctl_present)
- freeb(first_mp);
+ if (len < 0) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
+ ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
+ freemsg(mp);
return (NULL);
- } else if (mctl_present) {
- /*
- * ip_process might return a new mp.
- */
- ASSERT(first_mp != mp);
- first_mp->b_cont = mp;
- } else {
- first_mp = mp;
}
-
- }
-
- if (!syn_present && connp->conn_ip_recvpktinfo) {
- /*
- * TCP does not support IP_RECVPKTINFO for v4 so lets
- * make sure IPF_RECVIF is passed to ip_add_info.
- */
- mp = ip_add_info(mp, recv_ill, flags|IPF_RECVIF,
- IPCL_ZONEID(connp), ipst);
- if (mp == NULL) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- CONN_DEC_REF(connp);
- if (mctl_present)
- freeb(first_mp);
+ /* Drop any pad */
+ mp->b_wptr = rptr + pkt_len;
+ } else if ((len += msgdsize(mp->b_cont)) != 0) {
+ ASSERT(pkt_len >= min_size);
+ if (pkt_len < min_size) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
+ ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
+ freemsg(mp);
return (NULL);
- } else if (mctl_present) {
- /*
- * ip_add_info might return a new mp.
- */
- ASSERT(first_mp != mp);
- first_mp->b_cont = mp;
- } else {
- first_mp = mp;
}
- }
-
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
- if (IPCL_IS_TCP(connp)) {
- SET_SQUEUE(first_mp, connp->conn_recv, connp);
- return (first_mp);
- } else {
- /* SOCK_RAW, IPPROTO_TCP case */
- (connp->conn_recv)(connp, first_mp, NULL);
- CONN_DEC_REF(connp);
- return (NULL);
- }
-
-no_conn:
- /* Initiate IPPf processing, if needed. */
- if (IPP_ENABLED(IPP_LOCAL_IN, ipst)) {
- uint32_t ill_index;
- ill_index = recv_ill->ill_phyint->phyint_ifindex;
- ip_process(IPP_LOCAL_IN, &first_mp, ill_index);
- if (first_mp == NULL) {
+ if (len < 0) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
+ ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
+ freemsg(mp);
return (NULL);
}
- }
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
-
- tcp_xmit_listeners_reset(first_mp, IPH_HDR_LENGTH(mp->b_rptr), zoneid,
- ipst->ips_netstack->netstack_tcp, NULL);
- return (NULL);
-ipoptions:
- if (!ip_options_cksum(q, ill, first_mp, ipha, ire, ipst)) {
- goto slow_done;
- }
-
- UPDATE_IB_PKT_COUNT(ire);
- ire->ire_last_used_time = lbolt;
-
- u1 = ntohs(ipha->ipha_fragment_offset_and_flags);
- if (u1 & (IPH_MF | IPH_OFFSET)) {
-fragmented:
- if (!ip_rput_fragment(ill, recv_ill, &mp, ipha, NULL, NULL)) {
- if (mctl_present)
- freeb(first_mp);
- goto slow_done;
- }
- /*
- * Make sure that first_mp points back to mp as
- * the mp we came in with could have changed in
- * ip_rput_fragment().
- */
- ASSERT(!mctl_present);
- ipha = (ipha_t *)mp->b_rptr;
- first_mp = mp;
- }
-
- /* Now we have a complete datagram, destined for this machine. */
- u1 = ip_hdr_len = IPH_HDR_LENGTH(ipha);
-
- len = mp->b_wptr - mp->b_rptr;
- /* Pull up a minimal TCP header, if necessary. */
- if (len < (u1 + 20)) {
-tcppullup:
- if (!pullupmsg(mp, u1 + 20)) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- goto error;
- }
- ipha = (ipha_t *)mp->b_rptr;
- len = mp->b_wptr - mp->b_rptr;
- }
-
- /*
- * Extract the offset field from the TCP header. As usual, we
- * try to help the compiler more than the reader.
- */
- offset = ((uchar_t *)ipha)[u1 + 12] >> 4;
- if (offset != 5) {
-tcpoptions:
- if (offset < 5) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- goto error;
- }
- /*
- * There must be TCP options.
- * Make sure we can grab them.
- */
- offset <<= 2;
- offset += u1;
- if (len < offset) {
- if (!pullupmsg(mp, offset)) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- goto error;
- }
- ipha = (ipha_t *)mp->b_rptr;
- len = mp->b_wptr - rptr;
- }
- }
-
- /* Get the total packet length in len, including headers. */
- if (mp->b_cont)
- len = msgdsize(mp);
-
- /*
- * Check the TCP checksum by pulling together the pseudo-
- * header checksum, and passing it to ip_csum to be added in
- * with the TCP datagram.
- *
- * Since we are not using the hwcksum if available we must
- * clear the flag. We may come here via tcppullup or tcpoptions.
- * If either of these fails along the way the mblk is freed.
- * If this logic ever changes and mblk is reused to say send
- * ICMP's back, then this flag may need to be cleared in
- * other places as well.
- */
- DB_CKSUMFLAGS(mp) = 0;
-
- up = (uint16_t *)(rptr + u1 + TCP_PORTS_OFFSET);
-
- u1 = (uint32_t)(len - u1); /* TCP datagram length. */
-#ifdef _BIG_ENDIAN
- u1 += IPPROTO_TCP;
-#else
- u1 = ((u1 >> 8) & 0xFF) + (((u1 & 0xFF) + IPPROTO_TCP) << 8);
-#endif
- u1 += iphs[6] + iphs[7] + iphs[8] + iphs[9];
- /*
- * Not M_DATA mblk or its a dup, so do the checksum now.
- */
- IP_STAT(ipst, ip_in_sw_cksum);
- if (IP_CSUM(mp, (int32_t)((uchar_t *)up - rptr), u1) != 0) {
- BUMP_MIB(ill->ill_ip_mib, tcpIfStatsInErrs);
- goto error;
- }
-
- IP_STAT(ipst, ip_tcp_slow_path);
- goto try_again;
-#undef iphs
-#undef rptr
-
-error:
- freemsg(first_mp);
-slow_done:
- return (NULL);
-}
-
-/* ARGSUSED */
-static void
-ip_sctp_input(mblk_t *mp, ipha_t *ipha, ill_t *recv_ill, boolean_t mctl_present,
- ire_t *ire, mblk_t *first_mp, uint_t flags, queue_t *q, ipaddr_t dst)
-{
- conn_t *connp;
- uint32_t sum;
- uint32_t u1;
- ssize_t len;
- sctp_hdr_t *sctph;
- zoneid_t zoneid = ire->ire_zoneid;
- uint32_t pktsum;
- uint32_t calcsum;
- uint32_t ports;
- in6_addr_t map_src, map_dst;
- ill_t *ill = (ill_t *)q->q_ptr;
- ip_stack_t *ipst;
- sctp_stack_t *sctps;
- boolean_t sctp_csum_err = B_FALSE;
-
- ASSERT(recv_ill != NULL);
- ipst = recv_ill->ill_ipst;
- sctps = ipst->ips_netstack->netstack_sctp;
-
-#define rptr ((uchar_t *)ipha)
-
- ASSERT(ipha->ipha_protocol == IPPROTO_SCTP);
- ASSERT(ill != NULL);
-
- /* u1 is # words of IP options */
- u1 = ipha->ipha_version_and_hdr_length - (uchar_t)((IP_VERSION << 4)
- + IP_SIMPLE_HDR_LENGTH_IN_WORDS);
-
- /* IP options present */
- if (u1 > 0) {
- goto ipoptions;
- } else {
- /* Check the IP header checksum. */
- if (!IS_IP_HDR_HWCKSUM(mctl_present, mp, recv_ill) &&
- !mctl_present) {
-#define uph ((uint16_t *)ipha)
- sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] +
- uph[5] + uph[6] + uph[7] + uph[8] + uph[9];
-#undef uph
- /* finish doing IP checksum */
- sum = (sum & 0xFFFF) + (sum >> 16);
- sum = ~(sum + (sum >> 16)) & 0xFFFF;
- /*
- * Don't verify header checksum if this packet
- * is coming back from AH/ESP as we already did it.
- */
- if (sum != 0 && sum != 0xFFFF) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs);
- goto error;
- }
- }
+ /* Drop any pad */
+ (void) adjmsg(mp, -len);
/*
- * Since there is no SCTP h/w cksum support yet, just
- * clear the flag.
+ * adjmsg may have freed an mblk from the chain, hence
+ * invalidate any hw checksum here. This will force IP to
+ * calculate the checksum in sw, but only for this packet.
*/
DB_CKSUMFLAGS(mp) = 0;
+ IP_STAT(ill->ill_ipst, ip_multimblk);
}
-
- /*
- * Don't verify header checksum if this packet is coming
- * back from AH/ESP as we already did it.
- */
- if (!mctl_present) {
- UPDATE_IB_PKT_COUNT(ire);
- ire->ire_last_used_time = lbolt;
- }
-
- /* packet part of fragmented IP packet? */
- u1 = ntohs(ipha->ipha_fragment_offset_and_flags);
- if (u1 & (IPH_MF | IPH_OFFSET))
- goto fragmented;
-
- /* u1 = IP header length (20 bytes) */
- u1 = IP_SIMPLE_HDR_LENGTH;
-
-find_sctp_client:
- /* Pullup if we don't have the sctp common header. */
- len = MBLKL(mp);
- if (len < (u1 + SCTP_COMMON_HDR_LENGTH)) {
- if (mp->b_cont == NULL ||
- !pullupmsg(mp, u1 + SCTP_COMMON_HDR_LENGTH)) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- goto error;
- }
- ipha = (ipha_t *)mp->b_rptr;
- len = MBLKL(mp);
- }
-
- sctph = (sctp_hdr_t *)(rptr + u1);
-#ifdef DEBUG
- if (!skip_sctp_cksum) {
-#endif
- pktsum = sctph->sh_chksum;
- sctph->sh_chksum = 0;
- calcsum = sctp_cksum(mp, u1);
- sctph->sh_chksum = pktsum;
- if (calcsum != pktsum)
- sctp_csum_err = B_TRUE;
-#ifdef DEBUG /* skip_sctp_cksum */
- }
-#endif
- /* get the ports */
- ports = *(uint32_t *)&sctph->sh_sport;
-
- IRE_REFRELE(ire);
- IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &map_dst);
- IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &map_src);
- if (sctp_csum_err) {
- /*
- * No potential sctp checksum errors go to the Sun
- * sctp stack however they might be Adler-32 summed
- * packets a userland stack bound to a raw IP socket
- * could reasonably use. Note though that Adler-32 is
- * a long deprecated algorithm and customer sctp
- * networks should eventually migrate to CRC-32 at
- * which time this facility should be removed.
- */
- flags |= IP_FF_SCTP_CSUM_ERR;
- goto no_conn;
- }
- if ((connp = sctp_fanout(&map_src, &map_dst, ports, zoneid, mp,
- sctps)) == NULL) {
- /* Check for raw socket or OOTB handling */
- goto no_conn;
- }
-
- /* Found a client; up it goes */
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
- sctp_input(connp, ipha, mp, first_mp, recv_ill, B_TRUE, mctl_present);
- return;
-
-no_conn:
- ip_fanout_sctp_raw(first_mp, recv_ill, ipha, B_TRUE,
- ports, mctl_present, flags, B_TRUE, zoneid);
- return;
-
-ipoptions:
- DB_CKSUMFLAGS(mp) = 0;
- if (!ip_options_cksum(q, ill, first_mp, ipha, ire, ipst))
- goto slow_done;
-
- UPDATE_IB_PKT_COUNT(ire);
- ire->ire_last_used_time = lbolt;
-
- u1 = ntohs(ipha->ipha_fragment_offset_and_flags);
- if (u1 & (IPH_MF | IPH_OFFSET)) {
-fragmented:
- if (!ip_rput_fragment(ill, recv_ill, &mp, ipha, NULL, NULL))
- goto slow_done;
- /*
- * Make sure that first_mp points back to mp as
- * the mp we came in with could have changed in
- * ip_rput_fragment().
- */
- ASSERT(!mctl_present);
- ipha = (ipha_t *)mp->b_rptr;
- first_mp = mp;
- }
-
- /* Now we have a complete datagram, destined for this machine. */
- u1 = IPH_HDR_LENGTH(ipha);
- goto find_sctp_client;
-#undef iphs
-#undef rptr
-
-error:
- freemsg(first_mp);
-slow_done:
- IRE_REFRELE(ire);
+ return (mp);
}
-#define VER_BITS 0xF0
-#define VERSION_6 0x60
-
-static boolean_t
-ip_rput_multimblk_ipoptions(queue_t *q, ill_t *ill, mblk_t *mp, ipha_t **iphapp,
- ipaddr_t *dstp, ip_stack_t *ipst)
+/*
+ * Check that the IPv4 opt_len is consistent with the packet and pullup
+ * the options.
+ */
+mblk_t *
+ip_check_optlen(mblk_t *mp, ipha_t *ipha, uint_t opt_len, uint_t pkt_len,
+ ip_recv_attr_t *ira)
{
- uint_t opt_len;
- ipha_t *ipha;
+ ill_t *ill = ira->ira_ill;
ssize_t len;
- uint_t pkt_len;
- ASSERT(ill != NULL);
- IP_STAT(ipst, ip_ipoptions);
- ipha = *iphapp;
-
-#define rptr ((uchar_t *)ipha)
/* Assume no IPv6 packets arrive over the IPv4 queue */
- if (IPH_HDR_VERSION(ipha) == IPV6_VERSION) {
+ if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
BUMP_MIB(ill->ill_ip_mib, ipIfStatsInWrongIPVersion);
- freemsg(mp);
- return (B_FALSE);
- }
-
- /* multiple mblk or too short */
- pkt_len = ntohs(ipha->ipha_length);
-
- /* Get the number of words of IP options in the IP header. */
- opt_len = ipha->ipha_version_and_hdr_length - IP_SIMPLE_HDR_VERSION;
- if (opt_len) {
- /* IP Options present! Validate and process. */
- if (opt_len > (15 - IP_SIMPLE_HDR_LENGTH_IN_WORDS)) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
- goto done;
- }
- /*
- * Recompute complete header length and make sure we
- * have access to all of it.
- */
- len = ((size_t)opt_len + IP_SIMPLE_HDR_LENGTH_IN_WORDS) << 2;
- if (len > (mp->b_wptr - rptr)) {
- if (len > pkt_len) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
- goto done;
- }
- if (!pullupmsg(mp, len)) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- goto done;
- }
- ipha = (ipha_t *)mp->b_rptr;
- }
- /*
- * Go off to ip_rput_options which returns the next hop
- * destination address, which may have been affected
- * by source routing.
- */
- IP_STAT(ipst, ip_opt);
- if (ip_rput_options(q, mp, ipha, dstp, ipst) == -1) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- return (B_FALSE);
- }
- }
- *iphapp = ipha;
- return (B_TRUE);
-done:
- /* clear b_prev - used by ip_mroute_decap */
- mp->b_prev = NULL;
- freemsg(mp);
- return (B_FALSE);
-#undef rptr
-}
-
-/*
- * Deal with the fact that there is no ire for the destination.
- */
-static ire_t *
-ip_rput_noire(queue_t *q, mblk_t *mp, int ll_multicast, ipaddr_t dst)
-{
- ipha_t *ipha;
- ill_t *ill;
- ire_t *ire;
- ip_stack_t *ipst;
- enum ire_forward_action ret_action;
-
- ipha = (ipha_t *)mp->b_rptr;
- ill = (ill_t *)q->q_ptr;
-
- ASSERT(ill != NULL);
- ipst = ill->ill_ipst;
-
- /*
- * No IRE for this destination, so it can't be for us.
- * Unless we are forwarding, drop the packet.
- * We have to let source routed packets through
- * since we don't yet know if they are 'ping -l'
- * packets i.e. if they will go out over the
- * same interface as they came in on.
- */
- if (ll_multicast) {
+ ip_drop_input("IPvN packet on IPv4 ill", mp, ill);
freemsg(mp);
return (NULL);
}
- if (!(ill->ill_flags & ILLF_ROUTER) && !ip_source_routed(ipha, ipst)) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
+
+ if (opt_len > (15 - IP_SIMPLE_HDR_LENGTH_IN_WORDS)) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
+ ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
freemsg(mp);
return (NULL);
}
-
/*
- * Mark this packet as having originated externally.
- *
- * For non-forwarding code path, ire_send later double
- * checks this interface to see if it is still exists
- * post-ARP resolution.
- *
- * Also, IPQOS uses this to differentiate between
- * IPP_FWD_OUT and IPP_LOCAL_OUT for post-ARP
- * QOS packet processing in ip_wput_attach_llhdr().
- * The QoS module can mark the b_band for a fastpath message
- * or the dl_priority field in a unitdata_req header for
- * CoS marking. This info can only be found in
- * ip_wput_attach_llhdr().
+ * Recompute complete header length and make sure we
+ * have access to all of it.
*/
- mp->b_prev = (mblk_t *)(uintptr_t)ill->ill_phyint->phyint_ifindex;
- /*
- * Clear the indication that this may have a hardware checksum
- * as we are not using it
- */
- DB_CKSUMFLAGS(mp) = 0;
-
- ire = ire_forward(dst, &ret_action, NULL, NULL,
- msg_getlabel(mp), ipst);
-
- if (ire == NULL && ret_action == Forward_check_multirt) {
- /* Let ip_newroute handle CGTP */
- ip_newroute(q, mp, dst, NULL, GLOBAL_ZONEID, ipst);
- return (NULL);
- }
-
- if (ire != NULL)
- return (ire);
-
- mp->b_prev = mp->b_next = 0;
-
- if (ret_action == Forward_blackhole) {
- freemsg(mp);
- return (NULL);
- }
- /* send icmp unreachable */
- q = WR(q);
- /* Sent by forwarding path, and router is global zone */
- if (ip_source_routed(ipha, ipst)) {
- icmp_unreachable(q, mp, ICMP_SOURCE_ROUTE_FAILED,
- GLOBAL_ZONEID, ipst);
- } else {
- icmp_unreachable(q, mp, ICMP_HOST_UNREACHABLE, GLOBAL_ZONEID,
- ipst);
- }
-
- return (NULL);
-
-}
-
-/*
- * check ip header length and align it.
- */
-static boolean_t
-ip_check_and_align_header(queue_t *q, mblk_t *mp, ip_stack_t *ipst)
-{
- ssize_t len;
- ill_t *ill;
- ipha_t *ipha;
-
- len = MBLKL(mp);
-
- if (!OK_32PTR(mp->b_rptr) || len < IP_SIMPLE_HDR_LENGTH) {
- ill = (ill_t *)q->q_ptr;
-
- if (!OK_32PTR(mp->b_rptr))
- IP_STAT(ipst, ip_notaligned1);
- else
- IP_STAT(ipst, ip_notaligned2);
- /* Guard against bogus device drivers */
- if (len < 0) {
- /* clear b_prev - used by ip_mroute_decap */
- mp->b_prev = NULL;
+ len = ((size_t)opt_len + IP_SIMPLE_HDR_LENGTH_IN_WORDS) << 2;
+ if (len > (mp->b_wptr - mp->b_rptr)) {
+ if (len > pkt_len) {
BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
+ ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
freemsg(mp);
- return (B_FALSE);
- }
-
- if (ip_rput_pullups++ == 0) {
- ipha = (ipha_t *)mp->b_rptr;
- (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
- "ip_check_and_align_header: %s forced us to "
- " pullup pkt, hdr len %ld, hdr addr %p",
- ill->ill_name, len, (void *)ipha);
+ return (NULL);
}
- if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) {
- /* clear b_prev - used by ip_mroute_decap */
- mp->b_prev = NULL;
+ if (ip_pullup(mp, len, ira) == NULL) {
BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards", mp, ill);
freemsg(mp);
- return (B_FALSE);
+ return (NULL);
}
}
- return (B_TRUE);
+ return (mp);
}
/*
- * Handle the situation where a packet came in on `ill' but matched an IRE
- * whose ire_rfq doesn't match `ill'. We return the IRE that should be used
- * for interface statistics.
+ * Returns a new ire, or the same ire, or NULL.
+ * If a different IRE is returned, then it is held; the caller
+ * needs to release it.
+ * In no case is there any hold/release on the ire argument.
*/
ire_t *
ip_check_multihome(void *addr, ire_t *ire, ill_t *ill)
@@ -13697,10 +7829,9 @@ ip_check_multihome(void *addr, ire_t *ire, ill_t *ill)
* issue (e.g. packet received on an underlying interface matched an
* IRE_LOCAL on its associated group interface).
*/
- if (ire->ire_rfq != NULL &&
- IS_IN_SAME_ILLGRP(ill, (ill_t *)ire->ire_rfq->q_ptr)) {
+ ASSERT(ire->ire_ill != NULL);
+ if (IS_IN_SAME_ILLGRP(ill, ire->ire_ill))
return (ire);
- }
/*
* Do another ire lookup here, using the ingress ill, to see if the
@@ -13711,25 +7842,24 @@ ip_check_multihome(void *addr, ire_t *ire, ill_t *ill)
* ip*_strict_dst_multihoming switch is on.
* We also need to check for IPIF_UNNUMBERED point2point interfaces
* where the local address may not be unique. In this case we were
- * at the mercy of the initial ire cache lookup and the IRE_LOCAL it
+ * at the mercy of the initial ire lookup and the IRE_LOCAL it
* actually returned. The new lookup, which is more specific, should
* only find the IRE_LOCAL associated with the ingress ill if one
* exists.
*/
-
if (ire->ire_ipversion == IPV4_VERSION) {
if (ipst->ips_ip_strict_dst_multihoming)
strict_check = B_TRUE;
- new_ire = ire_ctable_lookup(*((ipaddr_t *)addr), 0, IRE_LOCAL,
- ill->ill_ipif, ALL_ZONES, NULL,
- (MATCH_IRE_TYPE|MATCH_IRE_ILL), ipst);
+ new_ire = ire_ftable_lookup_v4(*((ipaddr_t *)addr), 0, 0,
+ IRE_LOCAL, ill, ALL_ZONES, NULL,
+ (MATCH_IRE_TYPE|MATCH_IRE_ILL), 0, ipst, NULL);
} else {
ASSERT(!IN6_IS_ADDR_MULTICAST((in6_addr_t *)addr));
if (ipst->ips_ipv6_strict_dst_multihoming)
strict_check = B_TRUE;
- new_ire = ire_ctable_lookup_v6((in6_addr_t *)addr, NULL,
- IRE_LOCAL, ill->ill_ipif, ALL_ZONES, NULL,
- (MATCH_IRE_TYPE|MATCH_IRE_ILL), ipst);
+ new_ire = ire_ftable_lookup_v6((in6_addr_t *)addr, NULL, NULL,
+ IRE_LOCAL, ill, ALL_ZONES, NULL,
+ (MATCH_IRE_TYPE|MATCH_IRE_ILL), 0, ipst, NULL);
}
/*
* If the same ire that was returned in ip_input() is found then this
@@ -13741,38 +7871,27 @@ ip_check_multihome(void *addr, ire_t *ire, ill_t *ill)
* order to have accurate interface statistics.
*/
if (new_ire != NULL) {
- if ((new_ire != ire) && (new_ire->ire_rfq != NULL)) {
- ire_refrele(ire);
- ire = new_ire;
- } else {
- ire_refrele(new_ire);
- }
- return (ire);
- } else if ((ire->ire_rfq == NULL) &&
- (ire->ire_ipversion == IPV4_VERSION)) {
- /*
- * The best match could have been the original ire which
- * was created against an IRE_LOCAL on lo0. In the IPv4 case
- * the strict multihoming checks are irrelevant as we consider
- * local addresses hosted on lo0 to be interface agnostic. We
- * only expect a null ire_rfq on IREs which are associated with
- * lo0 hence we can return now.
- */
+		/* new_ire is held by the lookup; the caller must release it */
+ if (new_ire != ire)
+ return (new_ire);
+ /* Unchanged */
+ ire_refrele(new_ire);
return (ire);
}
/*
* Chase pointers once and store locally.
*/
- ire_ill = (ire->ire_rfq == NULL) ? NULL :
- (ill_t *)(ire->ire_rfq->q_ptr);
+ ASSERT(ire->ire_ill != NULL);
+ ire_ill = ire->ire_ill;
ifindex = ill->ill_usesrc_ifindex;
/*
* Check if it's a legal address on the 'usesrc' interface.
+ * For IPMP data addresses the IRE_LOCAL is the upper, hence we
+ * can just check phyint_ifindex.
*/
- if ((ifindex != 0) && (ire_ill != NULL) &&
- (ifindex == ire_ill->ill_phyint->phyint_ifindex)) {
+ if (ifindex != 0 && ifindex == ire_ill->ill_phyint->phyint_ifindex) {
return (ire);
}
@@ -13783,905 +7902,234 @@ ip_check_multihome(void *addr, ire_t *ire, ill_t *ill)
if (!(strict_check))
return (ire);
- if ((ill->ill_flags & ire->ire_ipif->ipif_ill->ill_flags &
- ILLF_ROUTER) != 0) {
+ if ((ill->ill_flags & ire->ire_ill->ill_flags & ILLF_ROUTER) != 0) {
return (ire);
}
-
- ire_refrele(ire);
return (NULL);
}
/*
+ * This function is used to construct a mac_header_info_s from a
+ * DL_UNITDATA_IND message.
+ * The address fields in the mhi structure points into the message,
+ * thus the caller can't use those fields after freeing the message.
*
- * This is the fast forward path. If we are here, we dont need to
- * worry about RSVP, CGTP, or TSol. Furthermore the ftable lookup
- * needed to find the nexthop in this case is much simpler
+ * We determine whether the packet received is a non-unicast packet
+ * and in doing so, determine whether or not it is broadcast vs multicast.
+ * For it to be a broadcast packet, we must have the appropriate mblk_t
+ * hanging off the ill_t. If this is either not present or doesn't match
+ * the destination mac address in the DL_UNITDATA_IND, the packet is deemed
+ * to be multicast. Thus NICs that have no broadcast address (or no
+ * capability for one, such as point to point links) cannot return as
+ * the packet being broadcast.
*/
-ire_t *
-ip_fast_forward(ire_t *ire, ipaddr_t dst, ill_t *ill, mblk_t *mp)
+void
+ip_dlur_to_mhi(ill_t *ill, mblk_t *mb, struct mac_header_info_s *mhip)
{
- ipha_t *ipha;
- ire_t *src_ire;
- ill_t *stq_ill;
- uint_t hlen;
- uint_t pkt_len;
- uint32_t sum;
- queue_t *dev_q;
- ip_stack_t *ipst = ill->ill_ipst;
- mblk_t *fpmp;
- enum ire_forward_action ret_action;
-
- ipha = (ipha_t *)mp->b_rptr;
-
- if (ire != NULL &&
- ire->ire_zoneid != GLOBAL_ZONEID &&
- ire->ire_zoneid != ALL_ZONES) {
- /*
- * Should only use IREs that are visible to the global
- * zone for forwarding.
- */
- ire_refrele(ire);
- ire = ire_cache_lookup(dst, GLOBAL_ZONEID, NULL, ipst);
- /*
- * ire_cache_lookup() can return ire of IRE_LOCAL in
- * transient cases. In such case, just drop the packet
- */
- if (ire != NULL && ire->ire_type != IRE_CACHE)
- goto indiscard;
- }
-
- /*
- * Martian Address Filtering [RFC 1812, Section 5.3.7]
- * The loopback address check for both src and dst has already
- * been checked in ip_input
- */
-
- if (dst == INADDR_ANY || CLASSD(ipha->ipha_src)) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
- goto drop;
- }
- src_ire = ire_ctable_lookup(ipha->ipha_src, 0, IRE_BROADCAST, NULL,
- ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
-
- if (src_ire != NULL) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
- ire_refrele(src_ire);
- goto drop;
- }
-
- /* No ire cache of nexthop. So first create one */
- if (ire == NULL) {
-
- ire = ire_forward_simple(dst, &ret_action, ipst);
+ dl_unitdata_ind_t *ind = (dl_unitdata_ind_t *)mb->b_rptr;
+ mblk_t *bmp;
+ uint_t extra_offset;
- /*
- * We only come to ip_fast_forward if ip_cgtp_filter
- * is not set. So ire_forward() should not return with
- * Forward_check_multirt as the next action.
- */
- ASSERT(ret_action != Forward_check_multirt);
- if (ire == NULL) {
- /* An attempt was made to forward the packet */
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams);
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- mp->b_prev = mp->b_next = 0;
- /* send icmp unreachable */
- /* Sent by forwarding path, and router is global zone */
- if (ret_action == Forward_ret_icmp_err) {
- if (ip_source_routed(ipha, ipst)) {
- icmp_unreachable(ill->ill_wq, mp,
- ICMP_SOURCE_ROUTE_FAILED,
- GLOBAL_ZONEID, ipst);
- } else {
- icmp_unreachable(ill->ill_wq, mp,
- ICMP_HOST_UNREACHABLE,
- GLOBAL_ZONEID, ipst);
- }
- } else {
- freemsg(mp);
- }
- return (NULL);
- }
- }
+ bzero(mhip, sizeof (struct mac_header_info_s));
- /*
- * Forwarding fastpath exception case:
- * If any of the following are true, we take the slowpath:
- * o forwarding is not enabled
- * o incoming and outgoing interface are the same, or in the same
- * IPMP group.
- * o corresponding ire is in incomplete state
- * o packet needs fragmentation
- * o ARP cache is not resolved
- *
- * The codeflow from here on is thus:
- * ip_rput_process_forward->ip_rput_forward->ip_xmit_v4
- */
- pkt_len = ntohs(ipha->ipha_length);
- stq_ill = (ill_t *)ire->ire_stq->q_ptr;
- if (!(stq_ill->ill_flags & ILLF_ROUTER) ||
- (ill == stq_ill) || IS_IN_SAME_ILLGRP(ill, stq_ill) ||
- (ire->ire_nce == NULL) ||
- (pkt_len > ire->ire_max_frag) ||
- ((fpmp = ire->ire_nce->nce_fp_mp) == NULL) ||
- ((hlen = MBLKL(fpmp)) > MBLKHEAD(mp)) ||
- ipha->ipha_ttl <= 1) {
- ip_rput_process_forward(ill->ill_rq, mp, ire,
- ipha, ill, B_FALSE, B_TRUE);
- return (ire);
- }
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams);
+ mhip->mhi_dsttype = MAC_ADDRTYPE_UNICAST;
- DTRACE_PROBE4(ip4__forwarding__start,
- ill_t *, ill, ill_t *, stq_ill, ipha_t *, ipha, mblk_t *, mp);
+ if (ill->ill_sap_length < 0)
+ extra_offset = 0;
+ else
+ extra_offset = ill->ill_sap_length;
- FW_HOOKS(ipst->ips_ip4_forwarding_event,
- ipst->ips_ipv4firewall_forwarding,
- ill, stq_ill, ipha, mp, mp, 0, ipst);
+ mhip->mhi_daddr = (uchar_t *)ind + ind->dl_dest_addr_offset +
+ extra_offset;
+ mhip->mhi_saddr = (uchar_t *)ind + ind->dl_src_addr_offset +
+ extra_offset;
- DTRACE_PROBE1(ip4__forwarding__end, mblk_t *, mp);
+ if (!ind->dl_group_address)
+ return;
- if (mp == NULL)
- goto drop;
+ /* Multicast or broadcast */
+ mhip->mhi_dsttype = MAC_ADDRTYPE_MULTICAST;
- mp->b_datap->db_struioun.cksum.flags = 0;
- /* Adjust the checksum to reflect the ttl decrement. */
- sum = (int)ipha->ipha_hdr_checksum + IP_HDR_CSUM_TTL_ADJUST;
- ipha->ipha_hdr_checksum = (uint16_t)(sum + (sum >> 16));
- ipha->ipha_ttl--;
+ if (ind->dl_dest_addr_offset > sizeof (*ind) &&
+ ind->dl_dest_addr_offset + ind->dl_dest_addr_length < MBLKL(mb) &&
+ (bmp = ill->ill_bcast_mp) != NULL) {
+ dl_unitdata_req_t *dlur;
+ uint8_t *bphys_addr;
- /*
- * Write the link layer header. We can do this safely here,
- * because we have already tested to make sure that the IP
- * policy is not set, and that we have a fast path destination
- * header.
- */
- mp->b_rptr -= hlen;
- bcopy(fpmp->b_rptr, mp->b_rptr, hlen);
-
- UPDATE_IB_PKT_COUNT(ire);
- ire->ire_last_used_time = lbolt;
- BUMP_MIB(stq_ill->ill_ip_mib, ipIfStatsHCOutForwDatagrams);
- BUMP_MIB(stq_ill->ill_ip_mib, ipIfStatsHCOutTransmits);
- UPDATE_MIB(stq_ill->ill_ip_mib, ipIfStatsHCOutOctets, pkt_len);
-
- if (!ILL_DIRECT_CAPABLE(stq_ill) || DB_TYPE(mp) != M_DATA) {
- dev_q = ire->ire_stq->q_next;
- if (DEV_Q_FLOW_BLOCKED(dev_q))
- goto indiscard;
- }
-
- DTRACE_PROBE4(ip4__physical__out__start,
- ill_t *, NULL, ill_t *, stq_ill, ipha_t *, ipha, mblk_t *, mp);
- FW_HOOKS(ipst->ips_ip4_physical_out_event,
- ipst->ips_ipv4firewall_physical_out,
- NULL, stq_ill, ipha, mp, mp, 0, ipst);
- DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp);
- DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
- ipha, __dtrace_ipsr_ill_t *, stq_ill, ipha_t *, ipha,
- ip6_t *, NULL, int, 0);
-
- if (mp != NULL) {
- if (ipst->ips_ip4_observe.he_interested) {
- zoneid_t szone;
+ dlur = (dl_unitdata_req_t *)bmp->b_rptr;
+ bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset +
+ extra_offset;
- /*
- * Both of these functions expect b_rptr to be
- * where the IP header starts, so advance past the
- * link layer header if present.
- */
- mp->b_rptr += hlen;
- szone = ip_get_zoneid_v4(ipha->ipha_src, mp,
- ipst, ALL_ZONES);
- ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone,
- ALL_ZONES, ill, ipst);
- mp->b_rptr -= hlen;
- }
- ILL_SEND_TX(stq_ill, ire, dst, mp, IP_DROP_ON_NO_DESC, NULL);
+ if (bcmp(mhip->mhi_daddr, bphys_addr,
+ ind->dl_dest_addr_length) == 0)
+ mhip->mhi_dsttype = MAC_ADDRTYPE_BROADCAST;
}
- return (ire);
-
-indiscard:
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
-drop:
- if (mp != NULL)
- freemsg(mp);
- return (ire);
-
}
/*
- * This function is called in the forwarding slowpath, when
- * either the ire lacks the link-layer address, or the packet needs
- * further processing(eg. fragmentation), before transmission.
+ * This function is used to construct a mac_header_info_s from a
+ * M_DATA fastpath message from a DLPI driver.
+ * The address fields in the mhi structure point into the message,
+ * thus the caller can't use those fields after freeing the message.
+ *
+ * We determine whether the packet received is a non-unicast packet
+ * and in doing so, determine whether or not it is broadcast vs multicast.
+ * For it to be a broadcast packet, we must have the appropriate mblk_t
+ * hanging off the ill_t. If this is either not present or doesn't match
+ * the destination mac address in the Ethernet header, the packet is deemed
+ * to be multicast. Thus NICs that have no broadcast address (or no
+ * capability for one, such as point to point links) cannot return as
+ * the packet being broadcast.
*/
-
-static void
-ip_rput_process_forward(queue_t *q, mblk_t *mp, ire_t *ire, ipha_t *ipha,
- ill_t *ill, boolean_t ll_multicast, boolean_t from_ip_fast_forward)
+void
+ip_mdata_to_mhi(ill_t *ill, mblk_t *mp, struct mac_header_info_s *mhip)
{
- queue_t *dev_q;
- ire_t *src_ire;
- ip_stack_t *ipst = ill->ill_ipst;
- boolean_t same_illgrp = B_FALSE;
-
- ASSERT(ire->ire_stq != NULL);
-
- mp->b_prev = NULL; /* ip_rput_noire sets incoming interface here */
- mp->b_next = NULL; /* ip_rput_noire sets dst here */
+ mblk_t *bmp;
+ struct ether_header *pether;
- /*
- * If the caller of this function is ip_fast_forward() skip the
- * next three checks as it does not apply.
- */
- if (from_ip_fast_forward)
- goto skip;
+ bzero(mhip, sizeof (struct mac_header_info_s));
- if (ll_multicast != 0) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- goto drop_pkt;
- }
+ mhip->mhi_dsttype = MAC_ADDRTYPE_UNICAST;
- /*
- * check if ipha_src is a broadcast address. Note that this
- * check is redundant when we get here from ip_fast_forward()
- * which has already done this check. However, since we can
- * also get here from ip_rput_process_broadcast() or, for
- * for the slow path through ip_fast_forward(), we perform
- * the check again for code-reusability
- */
- src_ire = ire_ctable_lookup(ipha->ipha_src, 0, IRE_BROADCAST, NULL,
- ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
- if (src_ire != NULL || ipha->ipha_dst == INADDR_ANY) {
- if (src_ire != NULL)
- ire_refrele(src_ire);
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
- ip2dbg(("ip_rput_process_forward: Received packet with"
- " bad src/dst address on %s\n", ill->ill_name));
- goto drop_pkt;
- }
+ pether = (struct ether_header *)((char *)mp->b_rptr
+ - sizeof (struct ether_header));
/*
- * Check if we want to forward this one at this time.
- * We allow source routed packets on a host provided that
- * they go out the same ill or illgrp as they came in on.
- *
- * XXX To be quicker, we may wish to not chase pointers to
- * get the ILLF_ROUTER flag and instead store the
- * forwarding policy in the ire. An unfortunate
- * side-effect of that would be requiring an ire flush
- * whenever the ILLF_ROUTER flag changes.
+ * Make sure the interface is an ethernet type, since we don't
+ * know the header format for anything but Ethernet. Also make
+ * sure we are pointing correctly above db_base.
*/
-skip:
- same_illgrp = IS_IN_SAME_ILLGRP(ill, (ill_t *)ire->ire_rfq->q_ptr);
-
- if (((ill->ill_flags &
- ((ill_t *)ire->ire_stq->q_ptr)->ill_flags & ILLF_ROUTER) == 0) &&
- !(ip_source_routed(ipha, ipst) &&
- (ire->ire_rfq == q || same_illgrp))) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
- if (ip_source_routed(ipha, ipst)) {
- q = WR(q);
- /*
- * Clear the indication that this may have
- * hardware checksum as we are not using it.
- */
- DB_CKSUMFLAGS(mp) = 0;
- /* Sent by forwarding path, and router is global zone */
- icmp_unreachable(q, mp,
- ICMP_SOURCE_ROUTE_FAILED, GLOBAL_ZONEID, ipst);
- return;
- }
- goto drop_pkt;
- }
-
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams);
-
- /* Packet is being forwarded. Turning off hwcksum flag. */
- DB_CKSUMFLAGS(mp) = 0;
- if (ipst->ips_ip_g_send_redirects) {
- /*
- * Check whether the incoming interface and outgoing
- * interface is part of the same group. If so,
- * send redirects.
- *
- * Check the source address to see if it originated
- * on the same logical subnet it is going back out on.
- * If so, we should be able to send it a redirect.
- * Avoid sending a redirect if the destination
- * is directly connected (i.e., ipha_dst is the same
- * as ire_gateway_addr or the ire_addr of the
- * nexthop IRE_CACHE ), or if the packet was source
- * routed out this interface.
- */
- ipaddr_t src, nhop;
- mblk_t *mp1;
- ire_t *nhop_ire = NULL;
-
- /*
- * Check whether ire_rfq and q are from the same ill or illgrp.
- * If so, send redirects.
- */
- if ((ire->ire_rfq == q || same_illgrp) &&
- !ip_source_routed(ipha, ipst)) {
-
- nhop = (ire->ire_gateway_addr != 0 ?
- ire->ire_gateway_addr : ire->ire_addr);
-
- if (ipha->ipha_dst == nhop) {
- /*
- * We avoid sending a redirect if the
- * destination is directly connected
- * because it is possible that multiple
- * IP subnets may have been configured on
- * the link, and the source may not
- * be on the same subnet as ip destination,
- * even though they are on the same
- * physical link.
- */
- goto sendit;
- }
-
- src = ipha->ipha_src;
-
- /*
- * We look up the interface ire for the nexthop,
- * to see if ipha_src is in the same subnet
- * as the nexthop.
- *
- * Note that, if, in the future, IRE_CACHE entries
- * are obsoleted, this lookup will not be needed,
- * as the ire passed to this function will be the
- * same as the nhop_ire computed below.
- */
- nhop_ire = ire_ftable_lookup(nhop, 0, 0,
- IRE_INTERFACE, NULL, NULL, ALL_ZONES,
- 0, NULL, MATCH_IRE_TYPE, ipst);
-
- if (nhop_ire != NULL) {
- if ((src & nhop_ire->ire_mask) ==
- (nhop & nhop_ire->ire_mask)) {
- /*
- * The source is directly connected.
- * Just copy the ip header (which is
- * in the first mblk)
- */
- mp1 = copyb(mp);
- if (mp1 != NULL) {
- icmp_send_redirect(WR(q), mp1,
- nhop, ipst);
- }
- }
- ire_refrele(nhop_ire);
- }
- }
- }
-sendit:
- dev_q = ire->ire_stq->q_next;
- if (DEV_Q_FLOW_BLOCKED(dev_q)) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- freemsg(mp);
+ if (ill->ill_type != IFT_ETHER)
return;
- }
-
- ip_rput_forward(ire, ipha, mp, ill);
- return;
-
-drop_pkt:
- ip2dbg(("ip_rput_process_forward: drop pkt\n"));
- freemsg(mp);
-}
-
-ire_t *
-ip_rput_process_broadcast(queue_t **qp, mblk_t *mp, ire_t *ire, ipha_t *ipha,
- ill_t *ill, ipaddr_t dst, int cgtp_flt_pkt, int ll_multicast)
-{
- queue_t *q;
- uint16_t hcksumflags;
- ip_stack_t *ipst = ill->ill_ipst;
-
- q = *qp;
-
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInBcastPkts);
-
- /*
- * Clear the indication that this may have hardware
- * checksum as we are not using it for forwarding.
- */
- hcksumflags = DB_CKSUMFLAGS(mp);
- DB_CKSUMFLAGS(mp) = 0;
-
- /*
- * Directed broadcast forwarding: if the packet came in over a
- * different interface then it is routed out over we can forward it.
- */
- if (ipha->ipha_protocol == IPPROTO_TCP) {
- ire_refrele(ire);
- freemsg(mp);
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- return (NULL);
- }
- /*
- * For multicast we have set dst to be INADDR_BROADCAST
- * for delivering to all STREAMS.
- */
- if (!CLASSD(ipha->ipha_dst)) {
- ire_t *new_ire;
- ipif_t *ipif;
-
- ipif = ipif_get_next_ipif(NULL, ill);
- if (ipif == NULL) {
-discard: ire_refrele(ire);
- freemsg(mp);
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- return (NULL);
- }
- new_ire = ire_ctable_lookup(dst, 0, 0,
- ipif, ALL_ZONES, NULL, MATCH_IRE_ILL, ipst);
- ipif_refrele(ipif);
- if (new_ire != NULL) {
- /*
- * If the matching IRE_BROADCAST is part of an IPMP
- * group, then drop the packet unless our ill has been
- * nominated to receive for the group.
- */
- if (IS_IPMP(new_ire->ire_ipif->ipif_ill) &&
- new_ire->ire_rfq != q) {
- ire_refrele(new_ire);
- goto discard;
- }
-
- /*
- * In the special case of multirouted broadcast
- * packets, we unconditionally need to "gateway"
- * them to the appropriate interface here.
- * In the normal case, this cannot happen, because
- * there is no broadcast IRE tagged with the
- * RTF_MULTIRT flag.
- */
- if (new_ire->ire_flags & RTF_MULTIRT) {
- ire_refrele(new_ire);
- if (ire->ire_rfq != NULL) {
- q = ire->ire_rfq;
- *qp = q;
- }
- } else {
- ire_refrele(ire);
- ire = new_ire;
- }
- } else if (cgtp_flt_pkt == CGTP_IP_PKT_NOT_CGTP) {
- if (!ipst->ips_ip_g_forward_directed_bcast) {
- /*
- * Free the message if
- * ip_g_forward_directed_bcast is turned
- * off for non-local broadcast.
- */
- ire_refrele(ire);
- freemsg(mp);
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- return (NULL);
- }
- } else {
- /*
- * This CGTP packet successfully passed the
- * CGTP filter, but the related CGTP
- * broadcast IRE has not been found,
- * meaning that the redundant ipif is
- * probably down. However, if we discarded
- * this packet, its duplicate would be
- * filtered out by the CGTP filter so none
- * of them would get through. So we keep
- * going with this one.
- */
- ASSERT(cgtp_flt_pkt == CGTP_IP_PKT_PREMIUM);
- if (ire->ire_rfq != NULL) {
- q = ire->ire_rfq;
- *qp = q;
- }
- }
- }
- if (ipst->ips_ip_g_forward_directed_bcast && ll_multicast == 0) {
- /*
- * Verify that there are not more then one
- * IRE_BROADCAST with this broadcast address which
- * has ire_stq set.
- * TODO: simplify, loop over all IRE's
- */
- ire_t *ire1;
- int num_stq = 0;
- mblk_t *mp1;
-
- /* Find the first one with ire_stq set */
- rw_enter(&ire->ire_bucket->irb_lock, RW_READER);
- for (ire1 = ire; ire1 &&
- !ire1->ire_stq && ire1->ire_addr == ire->ire_addr;
- ire1 = ire1->ire_next)
- ;
- if (ire1) {
- ire_refrele(ire);
- ire = ire1;
- IRE_REFHOLD(ire);
- }
+retry:
+ if ((uchar_t *)pether < mp->b_datap->db_base)
+ return;
- /* Check if there are additional ones with stq set */
- for (ire1 = ire; ire1; ire1 = ire1->ire_next) {
- if (ire->ire_addr != ire1->ire_addr)
- break;
- if (ire1->ire_stq) {
- num_stq++;
- break;
- }
+ /* Is there a VLAN tag? */
+ if (ill->ill_isv6) {
+ if (pether->ether_type != htons(ETHERTYPE_IPV6)) {
+ pether = (struct ether_header *)((char *)pether - 4);
+ goto retry;
}
- rw_exit(&ire->ire_bucket->irb_lock);
- if (num_stq == 1 && ire->ire_stq != NULL) {
- ip1dbg(("ip_rput_process_broadcast: directed "
- "broadcast to 0x%x\n",
- ntohl(ire->ire_addr)));
- mp1 = copymsg(mp);
- if (mp1) {
- switch (ipha->ipha_protocol) {
- case IPPROTO_UDP:
- ip_udp_input(q, mp1, ipha, ire, ill);
- break;
- default:
- ip_proto_input(q, mp1, ipha, ire, ill,
- 0);
- break;
- }
- }
- /*
- * Adjust ttl to 2 (1+1 - the forward engine
- * will decrement it by one.
- */
- if (ip_csum_hdr(ipha)) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs);
- ip2dbg(("ip_rput_broadcast:drop pkt\n"));
- freemsg(mp);
- ire_refrele(ire);
- return (NULL);
- }
- ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl + 1;
- ipha->ipha_hdr_checksum = 0;
- ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
- ip_rput_process_forward(q, mp, ire, ipha,
- ill, ll_multicast, B_FALSE);
- ire_refrele(ire);
- return (NULL);
+ } else {
+ if (pether->ether_type != htons(ETHERTYPE_IP)) {
+ pether = (struct ether_header *)((char *)pether - 4);
+ goto retry;
}
- ip1dbg(("ip_rput: NO directed broadcast to 0x%x\n",
- ntohl(ire->ire_addr)));
}
+ mhip->mhi_daddr = (uchar_t *)&pether->ether_dhost;
+ mhip->mhi_saddr = (uchar_t *)&pether->ether_shost;
- /* Restore any hardware checksum flags */
- DB_CKSUMFLAGS(mp) = hcksumflags;
- return (ire);
-}
-
-/* ARGSUSED */
-static boolean_t
-ip_rput_process_multicast(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha,
- int *ll_multicast, ipaddr_t *dstp)
-{
- ip_stack_t *ipst = ill->ill_ipst;
-
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInMcastPkts);
- UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInMcastOctets,
- ntohs(ipha->ipha_length));
+ if (!(mhip->mhi_daddr[0] & 0x01))
+ return;
- /*
- * So that we don't end up with dups, only one ill in an IPMP group is
- * nominated to receive multicast traffic.
- */
- if (IS_UNDER_IPMP(ill) && !ill->ill_nom_cast)
- goto drop_pkt;
+ /* Multicast or broadcast */
+ mhip->mhi_dsttype = MAC_ADDRTYPE_MULTICAST;
- /*
- * Forward packets only if we have joined the allmulti
- * group on this interface.
- */
- if (ipst->ips_ip_g_mrouter && ill->ill_join_allmulti) {
- int retval;
+ if ((bmp = ill->ill_bcast_mp) != NULL) {
+ dl_unitdata_req_t *dlur;
+ uint8_t *bphys_addr;
+ uint_t addrlen;
- /*
- * Clear the indication that this may have hardware
- * checksum as we are not using it.
- */
- DB_CKSUMFLAGS(mp) = 0;
- retval = ip_mforward(ill, ipha, mp);
- /* ip_mforward updates mib variables if needed */
- /* clear b_prev - used by ip_mroute_decap */
- mp->b_prev = NULL;
-
- switch (retval) {
- case 0:
- /*
- * pkt is okay and arrived on phyint.
- *
- * If we are running as a multicast router
- * we need to see all IGMP and/or PIM packets.
- */
- if ((ipha->ipha_protocol == IPPROTO_IGMP) ||
- (ipha->ipha_protocol == IPPROTO_PIM)) {
- goto done;
- }
- break;
- case -1:
- /* pkt is mal-formed, toss it */
- goto drop_pkt;
- case 1:
- /* pkt is okay and arrived on a tunnel */
- /*
- * If we are running a multicast router
- * we need to see all igmp packets.
- */
- if (ipha->ipha_protocol == IPPROTO_IGMP) {
- *dstp = INADDR_BROADCAST;
- *ll_multicast = 1;
- return (B_FALSE);
- }
-
- goto drop_pkt;
+ dlur = (dl_unitdata_req_t *)bmp->b_rptr;
+ addrlen = dlur->dl_dest_addr_length;
+ if (ill->ill_sap_length < 0) {
+ bphys_addr = (uchar_t *)dlur +
+ dlur->dl_dest_addr_offset;
+ addrlen += ill->ill_sap_length;
+ } else {
+ bphys_addr = (uchar_t *)dlur +
+ dlur->dl_dest_addr_offset +
+ ill->ill_sap_length;
+ addrlen -= ill->ill_sap_length;
}
+ if (bcmp(mhip->mhi_daddr, bphys_addr, addrlen) == 0)
+ mhip->mhi_dsttype = MAC_ADDRTYPE_BROADCAST;
}
-
- if (ilm_lookup_ill(ill, *dstp, ALL_ZONES) == NULL) {
- /*
- * This might just be caused by the fact that
- * multiple IP Multicast addresses map to the same
- * link layer multicast - no need to increment counter!
- */
- freemsg(mp);
- return (B_TRUE);
- }
-done:
- ip2dbg(("ip_rput: multicast for us: 0x%x\n", ntohl(*dstp)));
- /*
- * This assumes the we deliver to all streams for multicast
- * and broadcast packets.
- */
- *dstp = INADDR_BROADCAST;
- *ll_multicast = 1;
- return (B_FALSE);
-drop_pkt:
- ip2dbg(("ip_rput: drop pkt\n"));
- freemsg(mp);
- return (B_TRUE);
}
/*
- * This function is used to both return an indication of whether or not
- * the packet received is a non-unicast packet (by way of the DL_UNITDATA_IND)
- * and in doing so, determine whether or not it is broadcast vs multicast.
- * For it to be a broadcast packet, we must have the appropriate mblk_t
- * hanging off the ill_t. If this is either not present or doesn't match
- * the destination mac address in the DL_UNITDATA_IND, the packet is deemed
- * to be multicast. Thus NICs that have no broadcast address (or no
- * capability for one, such as point to point links) cannot return as
- * the packet being broadcast. The use of HPE_BROADCAST/HPE_MULTICAST as
- * the return values simplifies the current use of the return value of this
- * function, which is to pass through the multicast/broadcast characteristic
- * to consumers of the netinfo/pfhooks API. While this is not cast in stone,
- * changing the return value to some other symbol demands the appropriate
- * "translation" when hpe_flags is set prior to calling hook_run() for
- * packet events.
+ * Handle anything but M_DATA messages
+ * We see the DL_UNITDATA_IND which are part
+ * of the data path, and also the other messages from the driver.
*/
-int
-ip_get_dlpi_mbcast(ill_t *ill, mblk_t *mb)
-{
- dl_unitdata_ind_t *ind = (dl_unitdata_ind_t *)mb->b_rptr;
- mblk_t *bmp;
-
- if (ind->dl_group_address) {
- if (ind->dl_dest_addr_offset > sizeof (*ind) &&
- ind->dl_dest_addr_offset + ind->dl_dest_addr_length <
- MBLKL(mb) &&
- (bmp = ill->ill_bcast_mp) != NULL) {
- dl_unitdata_req_t *dlur;
- uint8_t *bphys_addr;
-
- dlur = (dl_unitdata_req_t *)bmp->b_rptr;
- if (ill->ill_sap_length < 0)
- bphys_addr = (uchar_t *)dlur +
- dlur->dl_dest_addr_offset;
- else
- bphys_addr = (uchar_t *)dlur +
- dlur->dl_dest_addr_offset +
- ill->ill_sap_length;
-
- if (bcmp(mb->b_rptr + ind->dl_dest_addr_offset,
- bphys_addr, ind->dl_dest_addr_length) == 0) {
- return (HPE_BROADCAST);
- }
- return (HPE_MULTICAST);
- }
- return (HPE_MULTICAST);
- }
- return (0);
-}
-
-static boolean_t
-ip_rput_process_notdata(queue_t *q, mblk_t **first_mpp, ill_t *ill,
- int *ll_multicast, mblk_t **mpp)
+void
+ip_rput_notdata(ill_t *ill, mblk_t *mp)
{
- mblk_t *mp1, *from_mp, *to_mp, *mp, *first_mp;
- boolean_t must_copy = B_FALSE;
+ mblk_t *first_mp;
struct iocblk *iocp;
- ipha_t *ipha;
- ip_stack_t *ipst = ill->ill_ipst;
-
-#define rptr ((uchar_t *)ipha)
-
- first_mp = *first_mpp;
- mp = *mpp;
+ struct mac_header_info_s mhi;
- ASSERT(first_mp == mp);
-
- /*
- * if db_ref > 1 then copymsg and free original. Packet may be
- * changed and do not want other entity who has a reference to this
- * message to trip over the changes. This is a blind change because
- * trying to catch all places that might change packet is too
- * difficult (since it may be a module above this one)
- *
- * This corresponds to the non-fast path case. We walk down the full
- * chain in this case, and check the db_ref count of all the dblks,
- * and do a copymsg if required. It is possible that the db_ref counts
- * of the data blocks in the mblk chain can be different.
- * For Example, we can get a DL_UNITDATA_IND(M_PROTO) with a db_ref
- * count of 1, followed by a M_DATA block with a ref count of 2, if
- * 'snoop' is running.
- */
- for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) {
- if (mp1->b_datap->db_ref > 1) {
- must_copy = B_TRUE;
- break;
- }
- }
-
- if (must_copy) {
- mp1 = copymsg(mp);
- if (mp1 == NULL) {
- for (mp1 = mp; mp1 != NULL;
- mp1 = mp1->b_cont) {
- mp1->b_next = NULL;
- mp1->b_prev = NULL;
- }
- freemsg(mp);
- if (ill != NULL) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- } else {
- BUMP_MIB(&ipst->ips_ip_mib,
- ipIfStatsInDiscards);
- }
- return (B_TRUE);
- }
- for (from_mp = mp, to_mp = mp1; from_mp != NULL;
- from_mp = from_mp->b_cont, to_mp = to_mp->b_cont) {
- /* Copy b_prev - used by ip_mroute_decap */
- to_mp->b_prev = from_mp->b_prev;
- from_mp->b_prev = NULL;
- }
- *first_mpp = first_mp = mp1;
- freemsg(mp);
- mp = mp1;
- *mpp = mp1;
- }
-
- ipha = (ipha_t *)mp->b_rptr;
-
- /*
- * previous code has a case for M_DATA.
- * We want to check how that happens.
- */
- ASSERT(first_mp->b_datap->db_type != M_DATA);
- switch (first_mp->b_datap->db_type) {
+ switch (DB_TYPE(mp)) {
case M_PROTO:
- case M_PCPROTO:
- if (((dl_unitdata_ind_t *)rptr)->dl_primitive !=
+ case M_PCPROTO: {
+ if (((dl_unitdata_ind_t *)mp->b_rptr)->dl_primitive !=
DL_UNITDATA_IND) {
/* Go handle anything other than data elsewhere. */
- ip_rput_dlpi(q, mp);
- return (B_TRUE);
+ ip_rput_dlpi(ill, mp);
+ return;
}
- *ll_multicast = ip_get_dlpi_mbcast(ill, mp);
+ first_mp = mp;
+ mp = first_mp->b_cont;
+ first_mp->b_cont = NULL;
+
+ if (mp == NULL) {
+ freeb(first_mp);
+ return;
+ }
+ ip_dlur_to_mhi(ill, first_mp, &mhi);
+ if (ill->ill_isv6)
+ ip_input_v6(ill, NULL, mp, &mhi);
+ else
+ ip_input(ill, NULL, mp, &mhi);
+
/* Ditch the DLPI header. */
- mp1 = mp->b_cont;
- ASSERT(first_mp == mp);
- *first_mpp = mp1;
- freeb(mp);
- *mpp = mp1;
- return (B_FALSE);
+ freeb(first_mp);
+ return;
+ }
case M_IOCACK:
- ip1dbg(("got iocack "));
iocp = (struct iocblk *)mp->b_rptr;
switch (iocp->ioc_cmd) {
case DL_IOC_HDR_INFO:
- ill = (ill_t *)q->q_ptr;
ill_fastpath_ack(ill, mp);
- return (B_TRUE);
+ return;
default:
- putnext(q, mp);
- return (B_TRUE);
+ putnext(ill->ill_rq, mp);
+ return;
}
/* FALLTHRU */
case M_ERROR:
case M_HANGUP:
- /*
- * Since this is on the ill stream we unconditionally
- * bump up the refcount
- */
- ill_refhold(ill);
- qwriter_ip(ill, q, mp, ip_rput_other, CUR_OP, B_FALSE);
- return (B_TRUE);
- case M_CTL:
- if ((MBLKL(first_mp) >= sizeof (da_ipsec_t)) &&
- (((da_ipsec_t *)first_mp->b_rptr)->da_type ==
- IPHADA_M_CTL)) {
- /*
- * It's an IPsec accelerated packet.
- * Make sure that the ill from which we received the
- * packet has enabled IPsec hardware acceleration.
- */
- if (!(ill->ill_capabilities &
- (ILL_CAPAB_AH|ILL_CAPAB_ESP))) {
- /* IPsec kstats: bean counter */
- freemsg(mp);
- return (B_TRUE);
- }
-
- /*
- * Make mp point to the mblk following the M_CTL,
- * then process according to type of mp.
- * After this processing, first_mp will point to
- * the data-attributes and mp to the pkt following
- * the M_CTL.
- */
- mp = first_mp->b_cont;
- if (mp == NULL) {
- freemsg(first_mp);
- return (B_TRUE);
- }
- /*
- * A Hardware Accelerated packet can only be M_DATA
- * ESP or AH packet.
- */
- if (mp->b_datap->db_type != M_DATA) {
- /* non-M_DATA IPsec accelerated packet */
- IPSECHW_DEBUG(IPSECHW_PKT,
- ("non-M_DATA IPsec accelerated pkt\n"));
- freemsg(first_mp);
- return (B_TRUE);
- }
- ipha = (ipha_t *)mp->b_rptr;
- if (ipha->ipha_protocol != IPPROTO_AH &&
- ipha->ipha_protocol != IPPROTO_ESP) {
- IPSECHW_DEBUG(IPSECHW_PKT,
- ("non-M_DATA IPsec accelerated pkt\n"));
- freemsg(first_mp);
- return (B_TRUE);
- }
- *mpp = mp;
- return (B_FALSE);
+ mutex_enter(&ill->ill_lock);
+ if (ill->ill_state_flags & ILL_CONDEMNED) {
+ mutex_exit(&ill->ill_lock);
+ freemsg(mp);
+ return;
}
- putnext(q, mp);
- return (B_TRUE);
+ ill_refhold_locked(ill);
+ mutex_exit(&ill->ill_lock);
+ qwriter_ip(ill, ill->ill_rq, mp, ip_rput_other, CUR_OP,
+ B_FALSE);
+ return;
+ case M_CTL:
+ putnext(ill->ill_rq, mp);
+ return;
case M_IOCNAK:
ip1dbg(("got iocnak "));
iocp = (struct iocblk *)mp->b_rptr;
switch (iocp->ioc_cmd) {
case DL_IOC_HDR_INFO:
- ip_rput_other(NULL, q, mp, NULL);
- return (B_TRUE);
+ ip_rput_other(NULL, ill->ill_rq, mp, NULL);
+ return;
default:
break;
}
/* FALLTHRU */
default:
- putnext(q, mp);
- return (B_TRUE);
+ putnext(ill->ill_rq, mp);
+ return;
}
}
@@ -14692,8 +8140,6 @@ ip_rput(queue_t *q, mblk_t *mp)
ill_t *ill;
union DL_primitives *dl;
- TRACE_1(TR_FAC_IP, TR_IP_RPUT_START, "ip_rput_start: q %p", q);
-
ill = (ill_t *)q->q_ptr;
if (ill->ill_state_flags & (ILL_CONDEMNED | ILL_LL_SUBNET_PENDING)) {
@@ -14707,70 +8153,42 @@ ip_rput(queue_t *q, mblk_t *mp)
if (DB_TYPE(mp) != M_PCPROTO ||
dl->dl_primitive == DL_UNITDATA_IND) {
inet_freemsg(mp);
- TRACE_2(TR_FAC_IP, TR_IP_RPUT_END,
- "ip_rput_end: q %p (%S)", q, "uninit");
return;
}
}
+ if (DB_TYPE(mp) == M_DATA) {
+ struct mac_header_info_s mhi;
- TRACE_2(TR_FAC_IP, TR_IP_RPUT_END,
- "ip_rput_end: q %p (%S)", q, "end");
-
- ip_input(ill, NULL, mp, NULL);
+ ip_mdata_to_mhi(ill, mp, &mhi);
+ ip_input(ill, NULL, mp, &mhi);
+ } else {
+ ip_rput_notdata(ill, mp);
+ }
}
-static mblk_t *
-ip_fix_dbref(ill_t *ill, mblk_t *mp)
+/*
+ * Move the information to a copy.
+ */
+mblk_t *
+ip_fix_dbref(mblk_t *mp, ip_recv_attr_t *ira)
{
- mblk_t *mp1;
- boolean_t adjusted = B_FALSE;
- ip_stack_t *ipst = ill->ill_ipst;
+ mblk_t *mp1;
+ ill_t *ill = ira->ira_ill;
+ ip_stack_t *ipst = ill->ill_ipst;
IP_STAT(ipst, ip_db_ref);
- /*
- * The IP_RECVSLLA option depends on having the
- * link layer header. First check that:
- * a> the underlying device is of type ether,
- * since this option is currently supported only
- * over ethernet.
- * b> there is enough room to copy over the link
- * layer header.
- *
- * Once the checks are done, adjust rptr so that
- * the link layer header will be copied via
- * copymsg. Note that, IFT_ETHER may be returned
- * by some non-ethernet drivers but in this case
- * the second check will fail.
- */
- if (ill->ill_type == IFT_ETHER &&
- (mp->b_rptr - mp->b_datap->db_base) >=
- sizeof (struct ether_header)) {
- mp->b_rptr -= sizeof (struct ether_header);
- adjusted = B_TRUE;
- }
- mp1 = copymsg(mp);
+ /* Make sure we have ira_l2src before we loose the original mblk */
+ if (!(ira->ira_flags & IRAF_L2SRC_SET))
+ ip_setl2src(mp, ira, ira->ira_rill);
+
+ mp1 = copymsg(mp);
if (mp1 == NULL) {
- mp->b_next = NULL;
- /* clear b_prev - used by ip_mroute_decap */
- mp->b_prev = NULL;
- freemsg(mp);
BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards", mp, ill);
+ freemsg(mp);
return (NULL);
}
-
- if (adjusted) {
- /*
- * Copy is done. Restore the pointer in
- * the _new_ mblk
- */
- mp1->b_rptr += sizeof (struct ether_header);
- }
-
- /* Copy b_prev - used by ip_mroute_decap */
- mp1->b_prev = mp->b_prev;
- mp->b_prev = NULL;
-
/* preserve the hardware checksum flags and data, if present */
if (DB_CKSUMFLAGS(mp) != 0) {
DB_CKSUMFLAGS(mp1) = DB_CKSUMFLAGS(mp);
@@ -14779,888 +8197,10 @@ ip_fix_dbref(ill_t *ill, mblk_t *mp)
DB_CKSUMEND(mp1) = DB_CKSUMEND(mp);
DB_CKSUM16(mp1) = DB_CKSUM16(mp);
}
-
freemsg(mp);
return (mp1);
}
-#define ADD_TO_CHAIN(head, tail, cnt, mp) { \
- if (tail != NULL) \
- tail->b_next = mp; \
- else \
- head = mp; \
- tail = mp; \
- cnt++; \
-}
-
-/*
- * Direct read side procedure capable of dealing with chains. GLDv3 based
- * drivers call this function directly with mblk chains while STREAMS
- * read side procedure ip_rput() calls this for single packet with ip_ring
- * set to NULL to process one packet at a time.
- *
- * The ill will always be valid if this function is called directly from
- * the driver.
- *
- * If ip_input() is called from GLDv3:
- *
- * - This must be a non-VLAN IP stream.
- * - 'mp' is either an untagged or a special priority-tagged packet.
- * - Any VLAN tag that was in the MAC header has been stripped.
- *
- * If the IP header in packet is not 32-bit aligned, every message in the
- * chain will be aligned before further operations. This is required on SPARC
- * platform.
- */
-/* ARGSUSED */
-void
-ip_input(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain,
- struct mac_header_info_s *mhip)
-{
- ipaddr_t dst = NULL;
- ipaddr_t prev_dst;
- ire_t *ire = NULL;
- ipha_t *ipha;
- uint_t pkt_len;
- ssize_t len;
- uint_t opt_len;
- int ll_multicast;
- int cgtp_flt_pkt;
- queue_t *q = ill->ill_rq;
- squeue_t *curr_sqp = NULL;
- mblk_t *head = NULL;
- mblk_t *tail = NULL;
- mblk_t *first_mp;
- int cnt = 0;
- ip_stack_t *ipst = ill->ill_ipst;
- mblk_t *mp;
- mblk_t *dmp;
- uint8_t tag;
- ilb_stack_t *ilbs;
- ipaddr_t lb_dst;
-
- ASSERT(mp_chain != NULL);
- ASSERT(ill != NULL);
-
- TRACE_1(TR_FAC_IP, TR_IP_RPUT_START, "ip_input_start: q %p", q);
-
- tag = (ip_ring != NULL) ? SQTAG_IP_INPUT_RX_RING : SQTAG_IP_INPUT;
-
-#define rptr ((uchar_t *)ipha)
-
- ilbs = ipst->ips_netstack->netstack_ilb;
- while (mp_chain != NULL) {
- mp = mp_chain;
- mp_chain = mp_chain->b_next;
- mp->b_next = NULL;
- ll_multicast = 0;
-
- /*
- * We do ire caching from one iteration to
- * another. In the event the packet chain contains
- * all packets from the same dst, this caching saves
- * an ire_cache_lookup for each of the succeeding
- * packets in a packet chain.
- */
- prev_dst = dst;
-
- /*
- * if db_ref > 1 then copymsg and free original. Packet
- * may be changed and we do not want the other entity
- * who has a reference to this message to trip over the
- * changes. This is a blind change because trying to
- * catch all places that might change the packet is too
- * difficult.
- *
- * This corresponds to the fast path case, where we have
- * a chain of M_DATA mblks. We check the db_ref count
- * of only the 1st data block in the mblk chain. There
- * doesn't seem to be a reason why a device driver would
- * send up data with varying db_ref counts in the mblk
- * chain. In any case the Fast path is a private
- * interface, and our drivers don't do such a thing.
- * Given the above assumption, there is no need to walk
- * down the entire mblk chain (which could have a
- * potential performance problem)
- *
- * The "(DB_REF(mp) > 1)" check was moved from ip_rput()
- * to here because of exclusive ip stacks and vnics.
- * Packets transmitted from exclusive stack over vnic
- * can have db_ref > 1 and when it gets looped back to
- * another vnic in a different zone, you have ip_input()
- * getting dblks with db_ref > 1. So if someone
- * complains of TCP performance under this scenario,
- * take a serious look here on the impact of copymsg().
- */
-
- if (DB_REF(mp) > 1) {
- if ((mp = ip_fix_dbref(ill, mp)) == NULL)
- continue;
- }
-
- /*
- * Check and align the IP header.
- */
- first_mp = mp;
- if (DB_TYPE(mp) == M_DATA) {
- dmp = mp;
- } else if (DB_TYPE(mp) == M_PROTO &&
- *(t_uscalar_t *)mp->b_rptr == DL_UNITDATA_IND) {
- dmp = mp->b_cont;
- } else {
- dmp = NULL;
- }
- if (dmp != NULL) {
- /*
- * IP header ptr not aligned?
- * OR IP header not complete in first mblk
- */
- if (!OK_32PTR(dmp->b_rptr) ||
- MBLKL(dmp) < IP_SIMPLE_HDR_LENGTH) {
- if (!ip_check_and_align_header(q, dmp, ipst))
- continue;
- }
- }
-
- /*
- * ip_input fast path
- */
-
- /* mblk type is not M_DATA */
- if (DB_TYPE(mp) != M_DATA) {
- if (ip_rput_process_notdata(q, &first_mp, ill,
- &ll_multicast, &mp))
- continue;
-
- /*
- * The only way we can get here is if we had a
- * packet that was either a DL_UNITDATA_IND or
- * an M_CTL for an IPsec accelerated packet.
- *
- * In either case, the first_mp will point to
- * the leading M_PROTO or M_CTL.
- */
- ASSERT(first_mp != NULL);
- } else if (mhip != NULL) {
- /*
- * ll_multicast is set here so that it is ready
- * for easy use with FW_HOOKS(). ip_get_dlpi_mbcast
- * manipulates ll_multicast in the same fashion when
- * called from ip_rput_process_notdata.
- */
- switch (mhip->mhi_dsttype) {
- case MAC_ADDRTYPE_MULTICAST :
- ll_multicast = HPE_MULTICAST;
- break;
- case MAC_ADDRTYPE_BROADCAST :
- ll_multicast = HPE_BROADCAST;
- break;
- default :
- break;
- }
- }
-
- /* Only M_DATA can come here and it is always aligned */
- ASSERT(DB_TYPE(mp) == M_DATA);
- ASSERT(DB_REF(mp) == 1 && OK_32PTR(mp->b_rptr));
-
- ipha = (ipha_t *)mp->b_rptr;
- len = mp->b_wptr - rptr;
- pkt_len = ntohs(ipha->ipha_length);
-
- /*
- * We must count all incoming packets, even if they end
- * up being dropped later on.
- */
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives);
- UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, pkt_len);
-
- /* multiple mblk or too short */
- len -= pkt_len;
- if (len != 0) {
- /*
- * Make sure we have data length consistent
- * with the IP header.
- */
- if (mp->b_cont == NULL) {
- if (len < 0 || pkt_len < IP_SIMPLE_HDR_LENGTH) {
- BUMP_MIB(ill->ill_ip_mib,
- ipIfStatsInHdrErrors);
- ip2dbg(("ip_input: drop pkt\n"));
- freemsg(mp);
- continue;
- }
- mp->b_wptr = rptr + pkt_len;
- } else if ((len += msgdsize(mp->b_cont)) != 0) {
- if (len < 0 || pkt_len < IP_SIMPLE_HDR_LENGTH) {
- BUMP_MIB(ill->ill_ip_mib,
- ipIfStatsInHdrErrors);
- ip2dbg(("ip_input: drop pkt\n"));
- freemsg(mp);
- continue;
- }
- (void) adjmsg(mp, -len);
- /*
- * adjmsg may have freed an mblk from the chain,
- * hence invalidate any hw checksum here. This
- * will force IP to calculate the checksum in
- * sw, but only for this packet.
- */
- DB_CKSUMFLAGS(mp) = 0;
- IP_STAT(ipst, ip_multimblk3);
- }
- }
-
- /* Obtain the dst of the current packet */
- dst = ipha->ipha_dst;
-
- DTRACE_IP7(receive, mblk_t *, first_mp, conn_t *, NULL,
- void_ip_t *, ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *,
- ipha, ip6_t *, NULL, int, 0);
-
- /*
- * The following test for loopback is faster than
- * IP_LOOPBACK_ADDR(), because it avoids any bitwise
- * operations.
- * Note that these addresses are always in network byte order
- */
- if (((*(uchar_t *)&ipha->ipha_dst) == 127) ||
- ((*(uchar_t *)&ipha->ipha_src) == 127)) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors);
- freemsg(mp);
- continue;
- }
-
- /*
- * The event for packets being received from a 'physical'
- * interface is placed after validation of the source and/or
- * destination address as being local so that packets can be
- * redirected to loopback addresses using ipnat.
- */
- DTRACE_PROBE4(ip4__physical__in__start,
- ill_t *, ill, ill_t *, NULL,
- ipha_t *, ipha, mblk_t *, first_mp);
-
- FW_HOOKS(ipst->ips_ip4_physical_in_event,
- ipst->ips_ipv4firewall_physical_in,
- ill, NULL, ipha, first_mp, mp, ll_multicast, ipst);
-
- DTRACE_PROBE1(ip4__physical__in__end, mblk_t *, first_mp);
-
- if (first_mp == NULL) {
- continue;
- }
- dst = ipha->ipha_dst;
- /*
- * Attach any necessary label information to
- * this packet
- */
- if (is_system_labeled() &&
- !tsol_get_pkt_label(mp, IPV4_VERSION)) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- freemsg(mp);
- continue;
- }
-
- if (ipst->ips_ip4_observe.he_interested) {
- zoneid_t dzone;
-
- /*
- * On the inbound path the src zone will be unknown as
- * this packet has come from the wire.
- */
- dzone = ip_get_zoneid_v4(dst, mp, ipst, ALL_ZONES);
- ipobs_hook(mp, IPOBS_HOOK_INBOUND, ALL_ZONES, dzone,
- ill, ipst);
- }
-
- /*
- * Here we check to see if we machine is setup as
- * L3 loadbalancer and if the incoming packet is for a VIP
- *
- * Check the following:
- * - there is at least a rule
- * - protocol of the packet is supported
- */
- if (ilb_has_rules(ilbs) && ILB_SUPP_L4(ipha->ipha_protocol)) {
- int lb_ret;
-
- /* For convenience, we pull up the mblk. */
- if (mp->b_cont != NULL) {
- if (pullupmsg(mp, -1) == 0) {
- BUMP_MIB(ill->ill_ip_mib,
- ipIfStatsInDiscards);
- freemsg(first_mp);
- continue;
- }
- ipha = (ipha_t *)mp->b_rptr;
- }
-
- /*
- * We just drop all fragments going to any VIP, at
- * least for now....
- */
- if (ntohs(ipha->ipha_fragment_offset_and_flags) &
- (IPH_MF | IPH_OFFSET)) {
- if (!ilb_rule_match_vip_v4(ilbs,
- ipha->ipha_dst, NULL)) {
- goto after_ilb;
- }
-
- ILB_KSTAT_UPDATE(ilbs, ip_frag_in, 1);
- ILB_KSTAT_UPDATE(ilbs, ip_frag_dropped, 1);
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- freemsg(first_mp);
- continue;
- }
- lb_ret = ilb_check_v4(ilbs, ill, mp, ipha,
- ipha->ipha_protocol, (uint8_t *)ipha +
- IPH_HDR_LENGTH(ipha), &lb_dst);
-
- if (lb_ret == ILB_DROPPED) {
- /* Is this the right counter to increase? */
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- freemsg(first_mp);
- continue;
- } else if (lb_ret == ILB_BALANCED) {
- /* Set the dst to that of the chosen server */
- dst = lb_dst;
- DB_CKSUMFLAGS(mp) = 0;
- }
- }
-
-after_ilb:
- /*
- * Reuse the cached ire only if the ipha_dst of the previous
- * packet is the same as the current packet AND it is not
- * INADDR_ANY.
- */
- if (!(dst == prev_dst && dst != INADDR_ANY) &&
- (ire != NULL)) {
- ire_refrele(ire);
- ire = NULL;
- }
-
- opt_len = ipha->ipha_version_and_hdr_length -
- IP_SIMPLE_HDR_VERSION;
-
- /*
- * Check to see if we can take the fastpath.
- * That is possible if the following conditions are met
- * o Tsol disabled
- * o CGTP disabled
- * o ipp_action_count is 0
- * o no options in the packet
- * o not a RSVP packet
- * o not a multicast packet
- * o ill not in IP_DHCPINIT_IF mode
- */
- if (!is_system_labeled() &&
- !ipst->ips_ip_cgtp_filter && ipp_action_count == 0 &&
- opt_len == 0 && ipha->ipha_protocol != IPPROTO_RSVP &&
- !ll_multicast && !CLASSD(dst) && ill->ill_dhcpinit == 0) {
- if (ire == NULL)
- ire = ire_cache_lookup_simple(dst, ipst);
- /*
- * Unless forwarding is enabled, dont call
- * ip_fast_forward(). Incoming packet is for forwarding
- */
- if ((ill->ill_flags & ILLF_ROUTER) &&
- (ire == NULL || (ire->ire_type & IRE_CACHE))) {
- ire = ip_fast_forward(ire, dst, ill, mp);
- continue;
- }
- /* incoming packet is for local consumption */
- if ((ire != NULL) && (ire->ire_type & IRE_LOCAL))
- goto local;
- }
-
- /*
- * Disable ire caching for anything more complex
- * than the simple fast path case we checked for above.
- */
- if (ire != NULL) {
- ire_refrele(ire);
- ire = NULL;
- }
-
- /*
- * Brutal hack for DHCPv4 unicast: RFC2131 allows a DHCP
- * server to unicast DHCP packets to a DHCP client using the
- * IP address it is offering to the client. This can be
- * disabled through the "broadcast bit", but not all DHCP
- * servers honor that bit. Therefore, to interoperate with as
- * many DHCP servers as possible, the DHCP client allows the
- * server to unicast, but we treat those packets as broadcast
- * here. Note that we don't rewrite the packet itself since
- * (a) that would mess up the checksums and (b) the DHCP
- * client conn is bound to INADDR_ANY so ip_fanout_udp() will
- * hand it the packet regardless.
- */
- if (ill->ill_dhcpinit != 0 &&
- IS_SIMPLE_IPH(ipha) && ipha->ipha_protocol == IPPROTO_UDP &&
- pullupmsg(mp, sizeof (ipha_t) + sizeof (udpha_t)) == 1) {
- udpha_t *udpha;
-
- /*
- * Reload ipha since pullupmsg() can change b_rptr.
- */
- ipha = (ipha_t *)mp->b_rptr;
- udpha = (udpha_t *)&ipha[1];
-
- if (ntohs(udpha->uha_dst_port) == IPPORT_BOOTPC) {
- DTRACE_PROBE2(ip4__dhcpinit__pkt, ill_t *, ill,
- mblk_t *, mp);
- dst = INADDR_BROADCAST;
- }
- }
-
- /* Full-blown slow path */
- if (opt_len != 0) {
- if (len != 0)
- IP_STAT(ipst, ip_multimblk4);
- else
- IP_STAT(ipst, ip_ipoptions);
- if (!ip_rput_multimblk_ipoptions(q, ill, mp, &ipha,
- &dst, ipst))
- continue;
- }
-
- /*
- * Invoke the CGTP (multirouting) filtering module to process
- * the incoming packet. Packets identified as duplicates
- * must be discarded. Filtering is active only if the
- * the ip_cgtp_filter ndd variable is non-zero.
- */
- cgtp_flt_pkt = CGTP_IP_PKT_NOT_CGTP;
- if (ipst->ips_ip_cgtp_filter &&
- ipst->ips_ip_cgtp_filter_ops != NULL) {
- netstackid_t stackid;
-
- stackid = ipst->ips_netstack->netstack_stackid;
- cgtp_flt_pkt =
- ipst->ips_ip_cgtp_filter_ops->cfo_filter(stackid,
- ill->ill_phyint->phyint_ifindex, mp);
- if (cgtp_flt_pkt == CGTP_IP_PKT_DUPLICATE) {
- freemsg(first_mp);
- continue;
- }
- }
-
- /*
- * If rsvpd is running, let RSVP daemon handle its processing
- * and forwarding of RSVP multicast/unicast packets.
- * If rsvpd is not running but mrouted is running, RSVP
- * multicast packets are forwarded as multicast traffic
- * and RSVP unicast packets are forwarded by unicast router.
- * If neither rsvpd nor mrouted is running, RSVP multicast
- * packets are not forwarded, but the unicast packets are
- * forwarded like unicast traffic.
- */
- if (ipha->ipha_protocol == IPPROTO_RSVP &&
- ipst->ips_ipcl_proto_fanout[IPPROTO_RSVP].connf_head !=
- NULL) {
- /* RSVP packet and rsvpd running. Treat as ours */
- ip2dbg(("ip_input: RSVP for us: 0x%x\n", ntohl(dst)));
- /*
- * This assumes that we deliver to all streams for
- * multicast and broadcast packets.
- * We have to force ll_multicast to 1 to handle the
- * M_DATA messages passed in from ip_mroute_decap.
- */
- dst = INADDR_BROADCAST;
- ll_multicast = 1;
- } else if (CLASSD(dst)) {
- /* packet is multicast */
- mp->b_next = NULL;
- if (ip_rput_process_multicast(q, mp, ill, ipha,
- &ll_multicast, &dst))
- continue;
- }
-
- if (ire == NULL) {
- ire = ire_cache_lookup(dst, ALL_ZONES,
- msg_getlabel(mp), ipst);
- }
-
- if (ire != NULL && ire->ire_stq != NULL &&
- ire->ire_zoneid != GLOBAL_ZONEID &&
- ire->ire_zoneid != ALL_ZONES) {
- /*
- * Should only use IREs that are visible from the
- * global zone for forwarding.
- */
- ire_refrele(ire);
- ire = ire_cache_lookup(dst, GLOBAL_ZONEID,
- msg_getlabel(mp), ipst);
- }
-
- if (ire == NULL) {
- /*
- * No IRE for this destination, so it can't be for us.
- * Unless we are forwarding, drop the packet.
- * We have to let source routed packets through
- * since we don't yet know if they are 'ping -l'
- * packets i.e. if they will go out over the
- * same interface as they came in on.
- */
- ire = ip_rput_noire(q, mp, ll_multicast, dst);
- if (ire == NULL)
- continue;
- }
-
- /*
- * Broadcast IRE may indicate either broadcast or
- * multicast packet
- */
- if (ire->ire_type == IRE_BROADCAST) {
- /*
- * Skip broadcast checks if packet is UDP multicast;
- * we'd rather not enter ip_rput_process_broadcast()
- * unless the packet is broadcast for real, since
- * that routine is a no-op for multicast.
- */
- if (ipha->ipha_protocol != IPPROTO_UDP ||
- !CLASSD(ipha->ipha_dst)) {
- ire = ip_rput_process_broadcast(&q, mp,
- ire, ipha, ill, dst, cgtp_flt_pkt,
- ll_multicast);
- if (ire == NULL)
- continue;
- }
- } else if (ire->ire_stq != NULL) {
- /* fowarding? */
- ip_rput_process_forward(q, mp, ire, ipha, ill,
- ll_multicast, B_FALSE);
- /* ip_rput_process_forward consumed the packet */
- continue;
- }
-
-local:
- /*
- * If the queue in the ire is different to the ingress queue
- * then we need to check to see if we can accept the packet.
- * Note that for multicast packets and broadcast packets sent
- * to a broadcast address which is shared between multiple
- * interfaces we should not do this since we just got a random
- * broadcast ire.
- */
- if ((ire->ire_rfq != q) && (ire->ire_type != IRE_BROADCAST)) {
- ire = ip_check_multihome(&ipha->ipha_dst, ire, ill);
- if (ire == NULL) {
- /* Drop packet */
- BUMP_MIB(ill->ill_ip_mib,
- ipIfStatsForwProhibits);
- freemsg(mp);
- continue;
- }
- if (ire->ire_rfq != NULL)
- q = ire->ire_rfq;
- }
-
- switch (ipha->ipha_protocol) {
- case IPPROTO_TCP:
- ASSERT(first_mp == mp);
- if ((mp = ip_tcp_input(mp, ipha, ill, B_FALSE, ire,
- mp, 0, q, ip_ring)) != NULL) {
- if (curr_sqp == NULL) {
- curr_sqp = GET_SQUEUE(mp);
- ASSERT(cnt == 0);
- cnt++;
- head = tail = mp;
- } else if (curr_sqp == GET_SQUEUE(mp)) {
- ASSERT(tail != NULL);
- cnt++;
- tail->b_next = mp;
- tail = mp;
- } else {
- /*
- * A different squeue. Send the
- * chain for the previous squeue on
- * its way. This shouldn't happen
- * often unless interrupt binding
- * changes.
- */
- IP_STAT(ipst, ip_input_multi_squeue);
- SQUEUE_ENTER(curr_sqp, head,
- tail, cnt, SQ_PROCESS, tag);
- curr_sqp = GET_SQUEUE(mp);
- head = mp;
- tail = mp;
- cnt = 1;
- }
- }
- continue;
- case IPPROTO_UDP:
- ASSERT(first_mp == mp);
- ip_udp_input(q, mp, ipha, ire, ill);
- continue;
- case IPPROTO_SCTP:
- ASSERT(first_mp == mp);
- ip_sctp_input(mp, ipha, ill, B_FALSE, ire, mp, 0,
- q, dst);
- /* ire has been released by ip_sctp_input */
- ire = NULL;
- continue;
- case IPPROTO_ENCAP:
- case IPPROTO_IPV6:
- ASSERT(first_mp == mp);
- if (ip_iptun_input(NULL, mp, ipha, ill, ire, ipst))
- break;
- /*
- * If there was no IP tunnel data-link bound to
- * receive this packet, then we fall through to
- * allow potential raw sockets bound to either of
- * these protocols to pick it up.
- */
- default:
- ip_proto_input(q, first_mp, ipha, ire, ill, 0);
- continue;
- }
- }
-
- if (ire != NULL)
- ire_refrele(ire);
-
- if (head != NULL)
- SQUEUE_ENTER(curr_sqp, head, tail, cnt, SQ_PROCESS, tag);
-
- TRACE_2(TR_FAC_IP, TR_IP_RPUT_END,
- "ip_input_end: q %p (%S)", q, "end");
-#undef rptr
-}
-
-/*
- * ip_accept_tcp() - This function is called by the squeue when it retrieves
- * a chain of packets in the poll mode. The packets have gone through the
- * data link processing but not IP processing. For performance and latency
- * reasons, the squeue wants to process the chain in line instead of feeding
- * it back via ip_input path.
- *
- * So this is a light weight function which checks to see if the packets
- * retrived are indeed TCP packets (TCP squeue always polls TCP soft ring
- * but we still do the paranoid check) meant for local machine and we don't
- * have labels etc enabled. Packets that meet the criterion are returned to
- * the squeue and processed inline while the rest go via ip_input path.
- */
-/*ARGSUSED*/
-mblk_t *
-ip_accept_tcp(ill_t *ill, ill_rx_ring_t *ip_ring, squeue_t *target_sqp,
- mblk_t *mp_chain, mblk_t **last, uint_t *cnt)
-{
- mblk_t *mp;
- ipaddr_t dst = NULL;
- ipaddr_t prev_dst;
- ire_t *ire = NULL;
- ipha_t *ipha;
- uint_t pkt_len;
- ssize_t len;
- uint_t opt_len;
- queue_t *q = ill->ill_rq;
- squeue_t *curr_sqp;
- mblk_t *ahead = NULL; /* Accepted head */
- mblk_t *atail = NULL; /* Accepted tail */
- uint_t acnt = 0; /* Accepted count */
- mblk_t *utail = NULL; /* Unaccepted head */
- mblk_t *uhead = NULL; /* Unaccepted tail */
- uint_t ucnt = 0; /* Unaccepted cnt */
- ip_stack_t *ipst = ill->ill_ipst;
- ilb_stack_t *ilbs = ipst->ips_netstack->netstack_ilb;
-
- *cnt = 0;
-
- ASSERT(ill != NULL);
- ASSERT(ip_ring != NULL);
-
- TRACE_1(TR_FAC_IP, TR_IP_RPUT_START, "ip_accept_tcp: q %p", q);
-
- /* If ILB is enabled, don't do fast processing. */
- if (ilb_has_rules(ilbs)) {
- uhead = mp_chain;
- goto all_reject;
- }
-
-#define rptr ((uchar_t *)ipha)
-
- while (mp_chain != NULL) {
- mp = mp_chain;
- mp_chain = mp_chain->b_next;
- mp->b_next = NULL;
-
- /*
- * We do ire caching from one iteration to
- * another. In the event the packet chain contains
- * all packets from the same dst, this caching saves
- * an ire_cache_lookup for each of the succeeding
- * packets in a packet chain.
- */
- prev_dst = dst;
-
- ipha = (ipha_t *)mp->b_rptr;
- len = mp->b_wptr - rptr;
-
- ASSERT(!MBLK_RX_FANOUT_SLOWPATH(mp, ipha));
-
- /*
- * If it is a non TCP packet, or doesn't have H/W cksum,
- * or doesn't have min len, reject.
- */
- if ((ipha->ipha_protocol != IPPROTO_TCP) || (len <
- (IP_SIMPLE_HDR_LENGTH + TCP_MIN_HEADER_LENGTH))) {
- ADD_TO_CHAIN(uhead, utail, ucnt, mp);
- continue;
- }
-
- pkt_len = ntohs(ipha->ipha_length);
- if (len != pkt_len) {
- if (len > pkt_len) {
- mp->b_wptr = rptr + pkt_len;
- } else {
- ADD_TO_CHAIN(uhead, utail, ucnt, mp);
- continue;
- }
- }
-
- opt_len = ipha->ipha_version_and_hdr_length -
- IP_SIMPLE_HDR_VERSION;
- dst = ipha->ipha_dst;
-
- /* IP version bad or there are IP options */
- if (opt_len && (!ip_rput_multimblk_ipoptions(q, ill,
- mp, &ipha, &dst, ipst)))
- continue;
-
- if (is_system_labeled() || (ill->ill_dhcpinit != 0) ||
- (ipst->ips_ip_cgtp_filter &&
- ipst->ips_ip_cgtp_filter_ops != NULL)) {
- ADD_TO_CHAIN(uhead, utail, ucnt, mp);
- continue;
- }
-
- /*
- * Reuse the cached ire only if the ipha_dst of the previous
- * packet is the same as the current packet AND it is not
- * INADDR_ANY.
- */
- if (!(dst == prev_dst && dst != INADDR_ANY) &&
- (ire != NULL)) {
- ire_refrele(ire);
- ire = NULL;
- }
-
- if (ire == NULL)
- ire = ire_cache_lookup_simple(dst, ipst);
-
- /*
- * Unless forwarding is enabled, dont call
- * ip_fast_forward(). Incoming packet is for forwarding
- */
- if ((ill->ill_flags & ILLF_ROUTER) &&
- (ire == NULL || (ire->ire_type & IRE_CACHE))) {
-
- DTRACE_PROBE4(ip4__physical__in__start,
- ill_t *, ill, ill_t *, NULL,
- ipha_t *, ipha, mblk_t *, mp);
-
- FW_HOOKS(ipst->ips_ip4_physical_in_event,
- ipst->ips_ipv4firewall_physical_in,
- ill, NULL, ipha, mp, mp, 0, ipst);
-
- DTRACE_PROBE1(ip4__physical__in__end, mblk_t *, mp);
-
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives);
- UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets,
- pkt_len);
-
- if (mp != NULL)
- ire = ip_fast_forward(ire, dst, ill, mp);
- continue;
- }
-
- /* incoming packet is for local consumption */
- if ((ire != NULL) && (ire->ire_type & IRE_LOCAL))
- goto local_accept;
-
- /*
- * Disable ire caching for anything more complex
- * than the simple fast path case we checked for above.
- */
- if (ire != NULL) {
- ire_refrele(ire);
- ire = NULL;
- }
-
- ire = ire_cache_lookup(dst, ALL_ZONES, msg_getlabel(mp),
- ipst);
- if (ire == NULL || ire->ire_type == IRE_BROADCAST ||
- ire->ire_stq != NULL) {
- ADD_TO_CHAIN(uhead, utail, ucnt, mp);
- if (ire != NULL) {
- ire_refrele(ire);
- ire = NULL;
- }
- continue;
- }
-
-local_accept:
-
- if (ire->ire_rfq != q) {
- ADD_TO_CHAIN(uhead, utail, ucnt, mp);
- if (ire != NULL) {
- ire_refrele(ire);
- ire = NULL;
- }
- continue;
- }
-
- /*
- * The event for packets being received from a 'physical'
- * interface is placed after validation of the source and/or
- * destination address as being local so that packets can be
- * redirected to loopback addresses using ipnat.
- */
- DTRACE_PROBE4(ip4__physical__in__start,
- ill_t *, ill, ill_t *, NULL,
- ipha_t *, ipha, mblk_t *, mp);
-
- FW_HOOKS(ipst->ips_ip4_physical_in_event,
- ipst->ips_ipv4firewall_physical_in,
- ill, NULL, ipha, mp, mp, 0, ipst);
-
- DTRACE_PROBE1(ip4__physical__in__end, mblk_t *, mp);
-
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives);
- UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, pkt_len);
-
- if (mp != NULL &&
- (mp = ip_tcp_input(mp, ipha, ill, B_FALSE, ire, mp,
- 0, q, ip_ring)) != NULL) {
- if ((curr_sqp = GET_SQUEUE(mp)) == target_sqp) {
- ADD_TO_CHAIN(ahead, atail, acnt, mp);
- } else {
- SQUEUE_ENTER(curr_sqp, mp, mp, 1,
- SQ_FILL, SQTAG_IP_INPUT);
- }
- }
- }
-
- if (ire != NULL)
- ire_refrele(ire);
-
-all_reject:
- if (uhead != NULL)
- ip_input(ill, ip_ring, uhead, NULL);
-
- if (ahead != NULL) {
- *last = atail;
- *cnt = acnt;
- return (ahead);
- }
-
- return (NULL);
-#undef rptr
-}
-
static void
ip_dlpi_error(ill_t *ill, t_uscalar_t prim, t_uscalar_t dl_err,
t_uscalar_t err)
@@ -15684,14 +8224,16 @@ ip_dlpi_error(ill_t *ill, t_uscalar_t prim, t_uscalar_t dl_err,
* ill_refhold before that, since qwriter_ip does an ill_refrele.
*/
void
-ip_rput_dlpi(queue_t *q, mblk_t *mp)
+ip_rput_dlpi(ill_t *ill, mblk_t *mp)
{
dl_ok_ack_t *dloa = (dl_ok_ack_t *)mp->b_rptr;
dl_error_ack_t *dlea = (dl_error_ack_t *)dloa;
- ill_t *ill = q->q_ptr;
+ queue_t *q = ill->ill_rq;
t_uscalar_t prim = dloa->dl_primitive;
t_uscalar_t reqprim = DL_PRIM_INVAL;
+ DTRACE_PROBE3(ill__dlpi, char *, "ip_rput_dlpi",
+ char *, dl_primstr(prim), ill_t *, ill);
ip1dbg(("ip_rput_dlpi"));
/*
@@ -15721,9 +8263,6 @@ ip_rput_dlpi(queue_t *q, mblk_t *mp)
case DL_NOTIFY_ACK:
reqprim = DL_NOTIFY_REQ;
break;
- case DL_CONTROL_ACK:
- reqprim = DL_CONTROL_REQ;
- break;
case DL_CAPABILITY_ACK:
reqprim = DL_CAPABILITY_REQ;
break;
@@ -15781,7 +8320,7 @@ ip_rput_dlpi(queue_t *q, mblk_t *mp)
/*
* Handling of DLPI messages that require exclusive access to the ipsq.
*
- * Need to do ill_pending_mp_release on ioctl completion, which could
+ * Need to do ipsq_pending_mp_get on ioctl completion, which could
* happen here. (along with mi_copy_done)
*/
/* ARGSUSED */
@@ -15791,7 +8330,7 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
dl_ok_ack_t *dloa = (dl_ok_ack_t *)mp->b_rptr;
dl_error_ack_t *dlea = (dl_error_ack_t *)dloa;
int err = 0;
- ill_t *ill;
+ ill_t *ill = (ill_t *)q->q_ptr;
ipif_t *ipif = NULL;
mblk_t *mp1 = NULL;
conn_t *connp = NULL;
@@ -15800,15 +8339,14 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
boolean_t success;
boolean_t ioctl_aborted = B_FALSE;
boolean_t log = B_TRUE;
- ip_stack_t *ipst;
+
+ DTRACE_PROBE3(ill__dlpi, char *, "ip_rput_dlpi_writer",
+ char *, dl_primstr(dloa->dl_primitive), ill_t *, ill);
ip1dbg(("ip_rput_dlpi_writer .."));
- ill = (ill_t *)q->q_ptr;
ASSERT(ipsq->ipsq_xop == ill->ill_phyint->phyint_ipsq->ipsq_xop);
ASSERT(IAM_WRITER_ILL(ill));
- ipst = ill->ill_ipst;
-
ipif = ipsq->ipsq_xop->ipx_pending_ipif;
/*
* The current ioctl could have been aborted by the user and a new
@@ -15823,6 +8361,10 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
ip1dbg(("ip_rput_dlpi_writer: got DL_ERROR_ACK for %s\n",
dl_primstr(dlea->dl_error_primitive)));
+ DTRACE_PROBE3(ill__dlpi, char *, "ip_rput_dlpi_writer error",
+ char *, dl_primstr(dlea->dl_error_primitive),
+ ill_t *, ill);
+
switch (dlea->dl_error_primitive) {
case DL_DISABMULTI_REQ:
ill_dlpi_done(ill, dlea->dl_error_primitive);
@@ -15916,7 +8458,6 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
if (ill->ill_dlpi_multicast_state == IDS_INPROGRESS)
ill->ill_dlpi_multicast_state = IDS_FAILED;
if (ill->ill_dlpi_multicast_state == IDS_FAILED) {
- ipif_t *ipif;
printf("ip: joining multicasts failed (%d)"
" on %s - will use link layer "
@@ -15924,32 +8465,18 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
dlea->dl_errno, ill->ill_name);
/*
- * Set up the multicast mapping alone.
+ * Set up for multi_bcast; We are the
* writer, so ok to access ill->ill_ipif
* without any lock.
*/
- ipif = ill->ill_ipif;
mutex_enter(&ill->ill_phyint->phyint_lock);
ill->ill_phyint->phyint_flags |=
PHYI_MULTI_BCAST;
mutex_exit(&ill->ill_phyint->phyint_lock);
- if (!ill->ill_isv6) {
- (void) ipif_arp_setup_multicast(ipif,
- NULL);
- } else {
- (void) ipif_ndp_setup_multicast(ipif,
- NULL);
- }
}
freemsg(mp); /* Don't want to pass this up */
return;
- case DL_CONTROL_REQ:
- ip1dbg(("ip_rput_dlpi_writer: got DL_ERROR_ACK for "
- "DL_CONTROL_REQ\n"));
- ill_dlpi_done(ill, dlea->dl_error_primitive);
- freemsg(mp);
- return;
case DL_CAPABILITY_REQ:
ip1dbg(("ip_rput_dlpi_writer: got DL_ERROR_ACK for "
"DL_CAPABILITY REQ\n"));
@@ -16003,10 +8530,6 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
mp = NULL;
break;
- case DL_CONTROL_ACK:
- /* We treat all of these as "fire and forget" */
- ill_dlpi_done(ill, DL_CONTROL_REQ);
- break;
case DL_INFO_ACK:
/* Call a routine to handle this one. */
ill_dlpi_done(ill, DL_INFO_REQ);
@@ -16019,29 +8542,33 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
* sent by ill_dl_phys, in which case just return
*/
ill_dlpi_done(ill, DL_BIND_REQ);
- if (ill->ill_ifname_pending)
+ if (ill->ill_ifname_pending) {
+ DTRACE_PROBE2(ip__rput__dlpi__ifname__pending,
+ ill_t *, ill, mblk_t *, mp);
break;
-
+ }
if (!ioctl_aborted)
mp1 = ipsq_pending_mp_get(ipsq, &connp);
- if (mp1 == NULL)
+ if (mp1 == NULL) {
+ DTRACE_PROBE1(ip__rput__dlpi__no__mblk, ill_t *, ill);
break;
+ }
/*
* mp1 was added by ill_dl_up(). if that is a result of
* a DL_NOTE_REPLUMB notification, connp could be NULL.
*/
if (connp != NULL)
q = CONNP_TO_WQ(connp);
-
/*
* We are exclusive. So nothing can change even after
- * we get the pending mp. If need be we can put it back
- * and restart, as in calling ipif_arp_up() below.
+ * we get the pending mp.
*/
ip1dbg(("ip_rput_dlpi: bind_ack %s\n", ill->ill_name));
+ DTRACE_PROBE1(ip__rput__dlpi__bind__ack, ill_t *, ill);
mutex_enter(&ill->ill_lock);
ill->ill_dl_up = 1;
+ ill->ill_state_flags &= ~ILL_DOWN_IN_PROGRESS;
ill_nic_event_dispatch(ill, 0, NE_UP, NULL, 0);
mutex_exit(&ill->ill_lock);
@@ -16052,34 +8579,15 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
* ill_dl_up(), which stopped ipif_up()'s processing.
*/
if (ill->ill_isv6) {
- if (ill->ill_flags & ILLF_XRESOLV) {
- if (connp != NULL)
- mutex_enter(&connp->conn_lock);
- mutex_enter(&ill->ill_lock);
- success = ipsq_pending_mp_add(connp, ipif, q,
- mp1, 0);
- mutex_exit(&ill->ill_lock);
- if (connp != NULL)
- mutex_exit(&connp->conn_lock);
- if (success) {
- err = ipif_resolver_up(ipif,
- Res_act_initial);
- if (err == EINPROGRESS) {
- freemsg(mp);
- return;
- }
- ASSERT(err != 0);
- mp1 = ipsq_pending_mp_get(ipsq, &connp);
- ASSERT(mp1 != NULL);
- } else {
- /* conn has started closing */
- err = EINTR;
- }
- } else { /* Non XRESOLV interface */
- (void) ipif_resolver_up(ipif, Res_act_initial);
- if ((err = ipif_ndp_up(ipif, B_TRUE)) == 0)
- err = ipif_up_done_v6(ipif);
- }
+ /*
+ * v6 interfaces.
+ * Unlike ARP which has to do another bind
+ * and attach, once we get here we are
+ * done with NDP
+ */
+ (void) ipif_resolver_up(ipif, Res_act_initial);
+ if ((err = ipif_ndp_up(ipif, B_TRUE)) == 0)
+ err = ipif_up_done_v6(ipif);
} else if (ill->ill_net_type == IRE_IF_RESOLVER) {
/*
* ARP and other v4 external resolvers.
@@ -16099,7 +8607,7 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
freemsg(mp);
return;
}
- ASSERT(err != 0);
+ ASSERT(arp_no_defense || err != 0);
mp1 = ipsq_pending_mp_get(ipsq, &connp);
} else {
/* The conn has started closing */
@@ -16144,10 +8652,7 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
case DL_NOTIFY_IND: {
dl_notify_ind_t *notify = (dl_notify_ind_t *)mp->b_rptr;
- ire_t *ire;
uint_t orig_mtu;
- boolean_t need_ire_walk_v4 = B_FALSE;
- boolean_t need_ire_walk_v6 = B_FALSE;
switch (notify->dl_notification) {
case DL_NOTE_PHYS_ADDR:
@@ -16164,95 +8669,52 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
return;
case DL_NOTE_FASTPATH_FLUSH:
- ill_fastpath_flush(ill);
+ nce_flush(ill, B_FALSE);
break;
case DL_NOTE_SDU_SIZE:
/*
- * Change the MTU size of the interface, of all
- * attached ipif's, and of all relevant ire's. The
- * new value's a uint32_t at notify->dl_data.
- * Mtu change Vs. new ire creation - protocol below.
- *
- * a Mark the ipif as IPIF_CHANGING.
- * b Set the new mtu in the ipif.
- * c Change the ire_max_frag on all affected ires
- * d Unmark the IPIF_CHANGING
+ * The dce and fragmentation code can cope with
+ * this changing while packets are being sent.
+ * When packets are sent ip_output will discover
+ * a change.
*
- * To see how the protocol works, assume an interface
- * route is also being added simultaneously by
- * ip_rt_add and let 'ipif' be the ipif referenced by
- * the ire. If the ire is created before step a,
- * it will be cleaned up by step c. If the ire is
- * created after step d, it will see the new value of
- * ipif_mtu. Any attempt to create the ire between
- * steps a to d will fail because of the IPIF_CHANGING
- * flag. Note that ire_create() is passed a pointer to
- * the ipif_mtu, and not the value. During ire_add
- * under the bucket lock, the ire_max_frag of the
- * new ire being created is set from the ipif/ire from
- * which it is being derived.
+ * Change the MTU size of the interface.
*/
mutex_enter(&ill->ill_lock);
+ ill->ill_current_frag = (uint_t)notify->dl_data;
+ if (ill->ill_current_frag > ill->ill_max_frag)
+ ill->ill_max_frag = ill->ill_current_frag;
- orig_mtu = ill->ill_max_mtu;
- ill->ill_max_frag = (uint_t)notify->dl_data;
- ill->ill_max_mtu = (uint_t)notify->dl_data;
-
- /*
- * If ill_user_mtu was set (via SIOCSLIFLNKINFO),
- * clamp ill_max_mtu at it.
- */
- if (ill->ill_user_mtu != 0 &&
- ill->ill_user_mtu < ill->ill_max_mtu)
- ill->ill_max_mtu = ill->ill_user_mtu;
+ orig_mtu = ill->ill_mtu;
+ if (!(ill->ill_flags & ILLF_FIXEDMTU)) {
+ ill->ill_mtu = ill->ill_current_frag;
- /*
- * If the MTU is unchanged, we're done.
- */
- if (orig_mtu == ill->ill_max_mtu) {
- mutex_exit(&ill->ill_lock);
- break;
- }
-
- if (ill->ill_isv6) {
- if (ill->ill_max_mtu < IPV6_MIN_MTU)
- ill->ill_max_mtu = IPV6_MIN_MTU;
- } else {
- if (ill->ill_max_mtu < IP_MIN_MTU)
- ill->ill_max_mtu = IP_MIN_MTU;
- }
- for (ipif = ill->ill_ipif; ipif != NULL;
- ipif = ipif->ipif_next) {
/*
- * Don't override the mtu if the user
- * has explicitly set it.
+ * If ill_user_mtu was set (via
+ * SIOCSLIFLNKINFO), clamp ill_mtu at it.
*/
- if (ipif->ipif_flags & IPIF_FIXEDMTU)
- continue;
- ipif->ipif_mtu = (uint_t)notify->dl_data;
- if (ipif->ipif_isv6)
- ire = ipif_to_ire_v6(ipif);
- else
- ire = ipif_to_ire(ipif);
- if (ire != NULL) {
- ire->ire_max_frag = ipif->ipif_mtu;
- ire_refrele(ire);
- }
- if (ipif->ipif_flags & IPIF_UP) {
- if (ill->ill_isv6)
- need_ire_walk_v6 = B_TRUE;
- else
- need_ire_walk_v4 = B_TRUE;
+ if (ill->ill_user_mtu != 0 &&
+ ill->ill_user_mtu < ill->ill_mtu)
+ ill->ill_mtu = ill->ill_user_mtu;
+
+ if (ill->ill_isv6) {
+ if (ill->ill_mtu < IPV6_MIN_MTU)
+ ill->ill_mtu = IPV6_MIN_MTU;
+ } else {
+ if (ill->ill_mtu < IP_MIN_MTU)
+ ill->ill_mtu = IP_MIN_MTU;
}
}
mutex_exit(&ill->ill_lock);
- if (need_ire_walk_v4)
- ire_walk_v4(ill_mtu_change, (char *)ill,
- ALL_ZONES, ipst);
- if (need_ire_walk_v6)
- ire_walk_v6(ill_mtu_change, (char *)ill,
- ALL_ZONES, ipst);
+ /*
+ * Make sure all dce_generation checks find out
+ * that ill_mtu has changed.
+ */
+ if (orig_mtu != ill->ill_mtu) {
+ dce_increment_all_generations(ill->ill_isv6,
+ ill->ill_ipst);
+ }
/*
* Refresh IPMP meta-interface MTU if necessary.
@@ -16303,8 +8765,6 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
case DL_NOTE_PROMISC_ON_PHYS: {
phyint_t *phyint = ill->ill_phyint;
- IPSECHW_DEBUG(IPSECHW_PKT, ("ip_rput_dlpi_writer: "
- "got a DL_NOTE_PROMISC_ON_PHYS\n"));
mutex_enter(&phyint->phyint_lock);
phyint->phyint_flags |= PHYI_PROMISC;
mutex_exit(&phyint->phyint_lock);
@@ -16313,8 +8773,6 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
case DL_NOTE_PROMISC_OFF_PHYS: {
phyint_t *phyint = ill->ill_phyint;
- IPSECHW_DEBUG(IPSECHW_PKT, ("ip_rput_dlpi_writer: "
- "got a DL_NOTE_PROMISC_OFF_PHYS\n"));
mutex_enter(&phyint->phyint_lock);
phyint->phyint_flags &= ~PHYI_PROMISC;
mutex_exit(&phyint->phyint_lock);
@@ -16474,6 +8932,10 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
ip2dbg(("DL_OK_ACK %s (0x%x)\n",
dl_primstr((int)dloa->dl_correct_primitive),
dloa->dl_correct_primitive));
+ DTRACE_PROBE3(ill__dlpi, char *, "ip_rput_dlpi_writer ok",
+ char *, dl_primstr(dloa->dl_correct_primitive),
+ ill_t *, ill);
+
switch (dloa->dl_correct_primitive) {
case DL_ENABMULTI_REQ:
case DL_DISABMULTI_REQ:
@@ -16502,6 +8964,10 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
*/
ASSERT(err != EINPROGRESS);
+ DTRACE_PROBE4(ipif__ioctl, char *, "ip_rput_dlpi_writer finish",
+ int, ipsq->ipsq_xop->ipx_current_ioctl, ill_t *, ill,
+ ipif_t *, NULL);
+
switch (ipsq->ipsq_xop->ipx_current_ioctl) {
case 0:
ipsq_current_finish(ipsq);
@@ -16595,7 +9061,10 @@ ip_rput_other(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
if (ill->ill_dlpi_fastpath_state == IDS_INPROGRESS) {
ill->ill_dlpi_fastpath_state = IDS_FAILED;
mutex_exit(&ill->ill_lock);
- ill_fastpath_nack(ill);
+ /*
+ * don't flush the nce_t entries: we use them
+ * as an index to the ncec itself.
+ */
ip1dbg(("ip_rput: DLPI fastpath off on interface %s\n",
ill->ill_name));
} else {
@@ -16611,235 +9080,24 @@ ip_rput_other(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
}
/*
- * NOTE : This function does not ire_refrele the ire argument passed in.
- *
- * IPQoS notes
- * IP policy is invoked twice for a forwarded packet, once on the read side
- * and again on the write side if both, IPP_FWD_IN and IPP_FWD_OUT are
- * enabled. An additional parameter, in_ill, has been added for this purpose.
- * Note that in_ill could be NULL when called from ip_rput_forward_multicast
- * because ip_mroute drops this information.
- *
+ * Update any source route, record route or timestamp options
+ * When it fails it has consumed the message and BUMPed the MIB.
*/
-void
-ip_rput_forward(ire_t *ire, ipha_t *ipha, mblk_t *mp, ill_t *in_ill)
-{
- uint32_t old_pkt_len;
- uint32_t pkt_len;
- queue_t *q;
- uint32_t sum;
-#define rptr ((uchar_t *)ipha)
- uint32_t max_frag;
- uint32_t ill_index;
- ill_t *out_ill;
- mib2_ipIfStatsEntry_t *mibptr;
- ip_stack_t *ipst = ((ill_t *)(ire->ire_stq->q_ptr))->ill_ipst;
-
- /* Get the ill_index of the incoming ILL */
- ill_index = (in_ill != NULL) ? in_ill->ill_phyint->phyint_ifindex : 0;
- mibptr = (in_ill != NULL) ? in_ill->ill_ip_mib : &ipst->ips_ip_mib;
-
- /* Initiate Read side IPPF processing */
- if (IPP_ENABLED(IPP_FWD_IN, ipst)) {
- ip_process(IPP_FWD_IN, &mp, ill_index);
- if (mp == NULL) {
- ip2dbg(("ip_rput_forward: pkt dropped/deferred "\
- "during IPPF processing\n"));
- return;
- }
- }
-
- /* Adjust the checksum to reflect the ttl decrement. */
- sum = (int)ipha->ipha_hdr_checksum + IP_HDR_CSUM_TTL_ADJUST;
- ipha->ipha_hdr_checksum = (uint16_t)(sum + (sum >> 16));
-
- if (ipha->ipha_ttl-- <= 1) {
- if (ip_csum_hdr(ipha)) {
- BUMP_MIB(mibptr, ipIfStatsInCksumErrs);
- goto drop_pkt;
- }
- /*
- * Note: ire_stq this will be NULL for multicast
- * datagrams using the long path through arp (the IRE
- * is not an IRE_CACHE). This should not cause
- * problems since we don't generate ICMP errors for
- * multicast packets.
- */
- BUMP_MIB(mibptr, ipIfStatsForwProhibits);
- q = ire->ire_stq;
- if (q != NULL) {
- /* Sent by forwarding path, and router is global zone */
- icmp_time_exceeded(q, mp, ICMP_TTL_EXCEEDED,
- GLOBAL_ZONEID, ipst);
- } else
- freemsg(mp);
- return;
- }
-
- /*
- * Don't forward if the interface is down
- */
- if (ire->ire_ipif->ipif_ill->ill_ipif_up_count == 0) {
- BUMP_MIB(mibptr, ipIfStatsInDiscards);
- ip2dbg(("ip_rput_forward:interface is down\n"));
- goto drop_pkt;
- }
-
- /* Get the ill_index of the outgoing ILL */
- out_ill = ire_to_ill(ire);
- ill_index = out_ill->ill_phyint->phyint_ifindex;
-
- DTRACE_PROBE4(ip4__forwarding__start,
- ill_t *, in_ill, ill_t *, out_ill, ipha_t *, ipha, mblk_t *, mp);
-
- FW_HOOKS(ipst->ips_ip4_forwarding_event,
- ipst->ips_ipv4firewall_forwarding,
- in_ill, out_ill, ipha, mp, mp, 0, ipst);
-
- DTRACE_PROBE1(ip4__forwarding__end, mblk_t *, mp);
-
- if (mp == NULL)
- return;
- old_pkt_len = pkt_len = ntohs(ipha->ipha_length);
-
- if (is_system_labeled()) {
- mblk_t *mp1;
-
- if ((mp1 = tsol_ip_forward(ire, mp)) == NULL) {
- BUMP_MIB(mibptr, ipIfStatsForwProhibits);
- goto drop_pkt;
- }
- /* Size may have changed */
- mp = mp1;
- ipha = (ipha_t *)mp->b_rptr;
- pkt_len = ntohs(ipha->ipha_length);
- }
-
- /* Check if there are options to update */
- if (!IS_SIMPLE_IPH(ipha)) {
- if (ip_csum_hdr(ipha)) {
- BUMP_MIB(mibptr, ipIfStatsInCksumErrs);
- goto drop_pkt;
- }
- if (ip_rput_forward_options(mp, ipha, ire, ipst)) {
- BUMP_MIB(mibptr, ipIfStatsForwProhibits);
- return;
- }
-
- ipha->ipha_hdr_checksum = 0;
- ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
- }
- max_frag = ire->ire_max_frag;
- if (pkt_len > max_frag) {
- /*
- * It needs fragging on its way out. We haven't
- * verified the header checksum yet. Since we
- * are going to put a surely good checksum in the
- * outgoing header, we have to make sure that it
- * was good coming in.
- */
- if (ip_csum_hdr(ipha)) {
- BUMP_MIB(mibptr, ipIfStatsInCksumErrs);
- goto drop_pkt;
- }
- /* Initiate Write side IPPF processing */
- if (IPP_ENABLED(IPP_FWD_OUT, ipst)) {
- ip_process(IPP_FWD_OUT, &mp, ill_index);
- if (mp == NULL) {
- ip2dbg(("ip_rput_forward: pkt dropped/deferred"\
- " during IPPF processing\n"));
- return;
- }
- }
- /*
- * Handle labeled packet resizing.
- *
- * If we have added a label, inform ip_wput_frag() of its
- * effect on the MTU for ICMP messages.
- */
- if (pkt_len > old_pkt_len) {
- uint32_t secopt_size;
-
- secopt_size = pkt_len - old_pkt_len;
- if (secopt_size < max_frag)
- max_frag -= secopt_size;
- }
-
- ip_wput_frag(ire, mp, IB_PKT, max_frag, 0,
- GLOBAL_ZONEID, ipst, NULL);
- ip2dbg(("ip_rput_forward:sent to ip_wput_frag\n"));
- return;
- }
-
- DTRACE_PROBE4(ip4__physical__out__start, ill_t *, NULL,
- ill_t *, out_ill, ipha_t *, ipha, mblk_t *, mp);
- FW_HOOKS(ipst->ips_ip4_physical_out_event,
- ipst->ips_ipv4firewall_physical_out,
- NULL, out_ill, ipha, mp, mp, 0, ipst);
- DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp);
- if (mp == NULL)
- return;
-
- mp->b_prev = (mblk_t *)IPP_FWD_OUT;
- ip1dbg(("ip_rput_forward: Calling ip_xmit_v4\n"));
- (void) ip_xmit_v4(mp, ire, NULL, B_FALSE, NULL);
- /* ip_xmit_v4 always consumes the packet */
- return;
-
-drop_pkt:;
- ip1dbg(("ip_rput_forward: drop pkt\n"));
- freemsg(mp);
-#undef rptr
-}
-
-void
-ip_rput_forward_multicast(ipaddr_t dst, mblk_t *mp, ipif_t *ipif)
-{
- ire_t *ire;
- ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
-
- ASSERT(!ipif->ipif_isv6);
- /*
- * Find an IRE which matches the destination and the outgoing
- * queue in the cache table. All we need is an IRE_CACHE which
- * is pointing at ipif->ipif_ill.
- */
- if (ipif->ipif_flags & IPIF_POINTOPOINT)
- dst = ipif->ipif_pp_dst_addr;
-
- ire = ire_ctable_lookup(dst, 0, 0, ipif, ALL_ZONES, msg_getlabel(mp),
- MATCH_IRE_ILL | MATCH_IRE_SECATTR, ipst);
- if (ire == NULL) {
- /*
- * Mark this packet to make it be delivered to
- * ip_rput_forward after the new ire has been
- * created.
- */
- mp->b_prev = NULL;
- mp->b_next = mp;
- ip_newroute_ipif(ipif->ipif_ill->ill_wq, mp, ipif, dst,
- NULL, 0, GLOBAL_ZONEID, &zero_info);
- } else {
- ip_rput_forward(ire, (ipha_t *)mp->b_rptr, mp, NULL);
- IRE_REFRELE(ire);
- }
-}
-
-/* Update any source route, record route or timestamp options */
-static int
-ip_rput_forward_options(mblk_t *mp, ipha_t *ipha, ire_t *ire, ip_stack_t *ipst)
+boolean_t
+ip_forward_options(mblk_t *mp, ipha_t *ipha, ill_t *dst_ill,
+ ip_recv_attr_t *ira)
{
ipoptp_t opts;
uchar_t *opt;
uint8_t optval;
uint8_t optlen;
ipaddr_t dst;
+ ipaddr_t ifaddr;
uint32_t ts;
- ire_t *dst_ire = NULL;
- ire_t *tmp_ire = NULL;
timestruc_t now;
+ ip_stack_t *ipst = ira->ira_ill->ill_ipst;
- ip2dbg(("ip_rput_forward_options\n"));
+ ip2dbg(("ip_forward_options\n"));
dst = ipha->ipha_dst;
for (optval = ipoptp_first(&opts, ipha);
optval != IPOPT_EOL;
@@ -16847,7 +9105,7 @@ ip_rput_forward_options(mblk_t *mp, ipha_t *ipha, ire_t *ire, ip_stack_t *ipst)
ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0);
opt = opts.ipoptp_cur;
optlen = opts.ipoptp_len;
- ip2dbg(("ip_rput_forward_options: opt %d, len %d\n",
+ ip2dbg(("ip_forward_options: opt %d, len %d\n",
optval, opts.ipoptp_len));
switch (optval) {
uint32_t off;
@@ -16855,27 +9113,17 @@ ip_rput_forward_options(mblk_t *mp, ipha_t *ipha, ire_t *ire, ip_stack_t *ipst)
case IPOPT_LSRR:
/* Check if adminstratively disabled */
if (!ipst->ips_ip_forward_src_routed) {
- if (ire->ire_stq != NULL) {
- /*
- * Sent by forwarding path, and router
- * is global zone
- */
- icmp_unreachable(ire->ire_stq, mp,
- ICMP_SOURCE_ROUTE_FAILED,
- GLOBAL_ZONEID, ipst);
- } else {
- ip0dbg(("ip_rput_forward_options: "
- "unable to send unreach\n"));
- freemsg(mp);
- }
- return (-1);
+ BUMP_MIB(dst_ill->ill_ip_mib,
+ ipIfStatsForwProhibits);
+ ip_drop_input("ICMP_SOURCE_ROUTE_FAILED",
+ mp, dst_ill);
+ icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED,
+ ira);
+ return (B_FALSE);
}
-
- dst_ire = ire_ctable_lookup(dst, 0, IRE_LOCAL,
- NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
- if (dst_ire == NULL) {
+ if (ip_type_v4(dst, ipst) != IRE_LOCAL) {
/*
- * Must be partial since ip_rput_options
+ * Must be partial since ip_input_options
* checked for strict.
*/
break;
@@ -16887,31 +9135,33 @@ ip_rput_forward_options(mblk_t *mp, ipha_t *ipha, ire_t *ire, ip_stack_t *ipst)
off > optlen - IP_ADDR_LEN) {
/* End of source route */
ip1dbg((
- "ip_rput_forward_options: end of SR\n"));
- ire_refrele(dst_ire);
+ "ip_forward_options: end of SR\n"));
break;
}
+ /* Pick a reasonable address on the outbound if */
+ ASSERT(dst_ill != NULL);
+ if (ip_select_source_v4(dst_ill, INADDR_ANY, dst,
+ INADDR_ANY, ALL_ZONES, ipst, &ifaddr, NULL,
+ NULL) != 0) {
+ /* No source! Shouldn't happen */
+ ifaddr = INADDR_ANY;
+ }
bcopy((char *)opt + off, &dst, IP_ADDR_LEN);
- bcopy(&ire->ire_src_addr, (char *)opt + off,
- IP_ADDR_LEN);
- ip1dbg(("ip_rput_forward_options: next hop 0x%x\n",
+ bcopy(&ifaddr, (char *)opt + off, IP_ADDR_LEN);
+ ip1dbg(("ip_forward_options: next hop 0x%x\n",
ntohl(dst)));
/*
* Check if our address is present more than
* once as consecutive hops in source route.
*/
- tmp_ire = ire_ctable_lookup(dst, 0, IRE_LOCAL,
- NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
- if (tmp_ire != NULL) {
- ire_refrele(tmp_ire);
+ if (ip_type_v4(dst, ipst) == IRE_LOCAL) {
off += IP_ADDR_LEN;
opt[IPOPT_OFFSET] += IP_ADDR_LEN;
goto redo_srr;
}
ipha->ipha_dst = dst;
opt[IPOPT_OFFSET] += IP_ADDR_LEN;
- ire_refrele(dst_ire);
break;
case IPOPT_RR:
off = opt[IPOPT_OFFSET];
@@ -16920,11 +9170,18 @@ ip_rput_forward_options(mblk_t *mp, ipha_t *ipha, ire_t *ire, ip_stack_t *ipst)
off > optlen - IP_ADDR_LEN) {
/* No more room - ignore */
ip1dbg((
- "ip_rput_forward_options: end of RR\n"));
+ "ip_forward_options: end of RR\n"));
break;
}
- bcopy(&ire->ire_src_addr, (char *)opt + off,
- IP_ADDR_LEN);
+ /* Pick a reasonable address on the outbound if */
+ ASSERT(dst_ill != NULL);
+ if (ip_select_source_v4(dst_ill, INADDR_ANY, dst,
+ INADDR_ANY, ALL_ZONES, ipst, &ifaddr, NULL,
+ NULL) != 0) {
+ /* No source! Shouldn't happen */
+ ifaddr = INADDR_ANY;
+ }
+ bcopy(&ifaddr, (char *)opt + off, IP_ADDR_LEN);
opt[IPOPT_OFFSET] += IP_ADDR_LEN;
break;
case IPOPT_TS:
@@ -16938,14 +9195,10 @@ ip_rput_forward_options(mblk_t *mp, ipha_t *ipha, ire_t *ire, ip_stack_t *ipst)
/* Verify that the address matched */
off = opt[IPOPT_OFFSET] - 1;
bcopy((char *)opt + off, &dst, IP_ADDR_LEN);
- dst_ire = ire_ctable_lookup(dst, 0,
- IRE_LOCAL, NULL, ALL_ZONES, NULL,
- MATCH_IRE_TYPE, ipst);
- if (dst_ire == NULL) {
+ if (ip_type_v4(dst, ipst) != IRE_LOCAL) {
/* Not for us */
break;
}
- ire_refrele(dst_ire);
/* FALLTHRU */
case IPOPT_TS_TSANDADDR:
off = IP_ADDR_LEN + IPOPT_TS_TIMELEN;
@@ -16955,9 +9208,9 @@ ip_rput_forward_options(mblk_t *mp, ipha_t *ipha, ire_t *ire, ip_stack_t *ipst)
* ip_*put_options should have already
* dropped this packet.
*/
- cmn_err(CE_PANIC, "ip_rput_forward_options: "
- "unknown IT - bug in ip_rput_options?\n");
- return (0); /* Keep "lint" happy */
+ cmn_err(CE_PANIC, "ip_forward_options: "
+ "unknown IT - bug in ip_input_options?\n");
+ return (B_TRUE); /* Keep "lint" happy */
}
if (opt[IPOPT_OFFSET] - 1 + off > optlen) {
/* Increase overflow counter */
@@ -16972,8 +9225,15 @@ ip_rput_forward_options(mblk_t *mp, ipha_t *ipha, ire_t *ire, ip_stack_t *ipst)
case IPOPT_TS_PRESPEC:
case IPOPT_TS_PRESPEC_RFC791:
case IPOPT_TS_TSANDADDR:
- bcopy(&ire->ire_src_addr,
- (char *)opt + off, IP_ADDR_LEN);
+ /* Pick a reasonable addr on the outbound if */
+ ASSERT(dst_ill != NULL);
+ if (ip_select_source_v4(dst_ill, INADDR_ANY,
+ dst, INADDR_ANY, ALL_ZONES, ipst, &ifaddr,
+ NULL, NULL) != 0) {
+ /* No source! Shouldn't happen */
+ ifaddr = INADDR_ANY;
+ }
+ bcopy(&ifaddr, (char *)opt + off, IP_ADDR_LEN);
opt[IPOPT_OFFSET] += IP_ADDR_LEN;
/* FALLTHRU */
case IPOPT_TS_TSONLY:
@@ -16989,223 +9249,7 @@ ip_rput_forward_options(mblk_t *mp, ipha_t *ipha, ire_t *ire, ip_stack_t *ipst)
break;
}
}
- return (0);
-}
-
-/*
- * This is called after processing at least one of AH/ESP headers.
- *
- * NOTE: the ill corresponding to ipsec_in_ill_index may not be
- * the actual, physical interface on which the packet was received,
- * but, when ip_strict_dst_multihoming is set to 1, could be the
- * interface which had the ipha_dst configured when the packet went
- * through ip_rput. The ill_index corresponding to the recv_ill
- * is saved in ipsec_in_rill_index
- *
- * NOTE2: The "ire" argument is only used in IPv4 cases. This function
- * cannot assume "ire" points to valid data for any IPv6 cases.
- */
-void
-ip_fanout_proto_again(mblk_t *ipsec_mp, ill_t *ill, ill_t *recv_ill, ire_t *ire)
-{
- mblk_t *mp;
- ipaddr_t dst;
- in6_addr_t *v6dstp;
- ipha_t *ipha;
- ip6_t *ip6h;
- ipsec_in_t *ii;
- boolean_t ill_need_rele = B_FALSE;
- boolean_t rill_need_rele = B_FALSE;
- boolean_t ire_need_rele = B_FALSE;
- netstack_t *ns;
- ip_stack_t *ipst;
-
- ii = (ipsec_in_t *)ipsec_mp->b_rptr;
- ASSERT(ii->ipsec_in_ill_index != 0);
- ns = ii->ipsec_in_ns;
- ASSERT(ii->ipsec_in_ns != NULL);
- ipst = ns->netstack_ip;
-
- mp = ipsec_mp->b_cont;
- ASSERT(mp != NULL);
-
- if (ill == NULL) {
- ASSERT(recv_ill == NULL);
- /*
- * We need to get the original queue on which ip_rput_local
- * or ip_rput_data_v6 was called.
- */
- ill = ill_lookup_on_ifindex(ii->ipsec_in_ill_index,
- !ii->ipsec_in_v4, NULL, NULL, NULL, NULL, ipst);
- ill_need_rele = B_TRUE;
-
- if (ii->ipsec_in_ill_index != ii->ipsec_in_rill_index) {
- recv_ill = ill_lookup_on_ifindex(
- ii->ipsec_in_rill_index, !ii->ipsec_in_v4,
- NULL, NULL, NULL, NULL, ipst);
- rill_need_rele = B_TRUE;
- } else {
- recv_ill = ill;
- }
-
- if ((ill == NULL) || (recv_ill == NULL)) {
- ip0dbg(("ip_fanout_proto_again: interface "
- "disappeared\n"));
- if (ill != NULL)
- ill_refrele(ill);
- if (recv_ill != NULL)
- ill_refrele(recv_ill);
- freemsg(ipsec_mp);
- return;
- }
- }
-
- ASSERT(ill != NULL && recv_ill != NULL);
-
- if (mp->b_datap->db_type == M_CTL) {
- /*
- * AH/ESP is returning the ICMP message after
- * removing their headers. Fanout again till
- * it gets to the right protocol.
- */
- if (ii->ipsec_in_v4) {
- icmph_t *icmph;
- int iph_hdr_length;
- int hdr_length;
-
- ipha = (ipha_t *)mp->b_rptr;
- iph_hdr_length = IPH_HDR_LENGTH(ipha);
- icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
- ipha = (ipha_t *)&icmph[1];
- hdr_length = IPH_HDR_LENGTH(ipha);
- /*
- * icmp_inbound_error_fanout may need to do pullupmsg.
- * Reset the type to M_DATA.
- */
- mp->b_datap->db_type = M_DATA;
- icmp_inbound_error_fanout(ill->ill_rq, ill, ipsec_mp,
- icmph, ipha, iph_hdr_length, hdr_length, B_TRUE,
- B_FALSE, ill, ii->ipsec_in_zoneid);
- } else {
- icmp6_t *icmp6;
- int hdr_length;
-
- ip6h = (ip6_t *)mp->b_rptr;
- /* Don't call hdr_length_v6() unless you have to. */
- if (ip6h->ip6_nxt != IPPROTO_ICMPV6)
- hdr_length = ip_hdr_length_v6(mp, ip6h);
- else
- hdr_length = IPV6_HDR_LEN;
-
- icmp6 = (icmp6_t *)(&mp->b_rptr[hdr_length]);
- /*
- * icmp_inbound_error_fanout_v6 may need to do
- * pullupmsg. Reset the type to M_DATA.
- */
- mp->b_datap->db_type = M_DATA;
- icmp_inbound_error_fanout_v6(ill->ill_rq, ipsec_mp,
- ip6h, icmp6, ill, recv_ill, B_TRUE,
- ii->ipsec_in_zoneid);
- }
- if (ill_need_rele)
- ill_refrele(ill);
- if (rill_need_rele)
- ill_refrele(recv_ill);
- return;
- }
-
- if (ii->ipsec_in_v4) {
- ipha = (ipha_t *)mp->b_rptr;
- dst = ipha->ipha_dst;
- if (CLASSD(dst)) {
- /*
- * Multicast has to be delivered to all streams.
- */
- dst = INADDR_BROADCAST;
- }
-
- if (ire == NULL) {
- ire = ire_cache_lookup(dst, ii->ipsec_in_zoneid,
- msg_getlabel(mp), ipst);
- if (ire == NULL) {
- if (ill_need_rele)
- ill_refrele(ill);
- if (rill_need_rele)
- ill_refrele(recv_ill);
- ip1dbg(("ip_fanout_proto_again: "
- "IRE not found"));
- freemsg(ipsec_mp);
- return;
- }
- ire_need_rele = B_TRUE;
- }
-
- switch (ipha->ipha_protocol) {
- case IPPROTO_UDP:
- ip_udp_input(ill->ill_rq, ipsec_mp, ipha, ire,
- recv_ill);
- if (ire_need_rele)
- ire_refrele(ire);
- break;
- case IPPROTO_TCP:
- if (!ire_need_rele)
- IRE_REFHOLD(ire);
- mp = ip_tcp_input(mp, ipha, ill, B_TRUE,
- ire, ipsec_mp, 0, ill->ill_rq, NULL);
- IRE_REFRELE(ire);
- if (mp != NULL) {
- SQUEUE_ENTER(GET_SQUEUE(mp), mp,
- mp, 1, SQ_PROCESS,
- SQTAG_IP_PROTO_AGAIN);
- }
- break;
- case IPPROTO_SCTP:
- if (!ire_need_rele)
- IRE_REFHOLD(ire);
- ip_sctp_input(mp, ipha, ill, B_TRUE, ire,
- ipsec_mp, 0, ill->ill_rq, dst);
- break;
- case IPPROTO_ENCAP:
- case IPPROTO_IPV6:
- if (ip_iptun_input(ipsec_mp, mp, ipha, ill, ire,
- ill->ill_ipst)) {
- /*
- * If we made it here, we don't need to worry
- * about the raw-socket/protocol fanout.
- */
- if (ire_need_rele)
- ire_refrele(ire);
- break;
- }
- /* else FALLTHRU */
- default:
- ip_proto_input(ill->ill_rq, ipsec_mp, ipha, ire,
- recv_ill, 0);
- if (ire_need_rele)
- ire_refrele(ire);
- break;
- }
- } else {
- uint32_t rput_flags = 0;
-
- ip6h = (ip6_t *)mp->b_rptr;
- v6dstp = &ip6h->ip6_dst;
- /*
- * XXX Assumes ip_rput_v6 sets ll_multicast only for multicast
- * address.
- *
- * Currently, we don't store that state in the IPSEC_IN
- * message, and we may need to.
- */
- rput_flags |= (IN6_IS_ADDR_MULTICAST(v6dstp) ?
- IP6_IN_LLMCAST : 0);
- ip_rput_data_v6(ill->ill_rq, ill, ipsec_mp, ip6h, rput_flags,
- NULL, NULL);
- }
- if (ill_need_rele)
- ill_refrele(ill);
- if (rill_need_rele)
- ill_refrele(recv_ill);
+ return (B_TRUE);
}
/*
@@ -17290,609 +9334,25 @@ ill_frag_timer_start(ill_t *ill)
}
/*
- * This routine is needed for loopback when forwarding multicasts.
- *
- * IPQoS Notes:
- * IPPF processing is done in fanout routines.
- * Policy processing is done only if IPP_lOCAL_IN is enabled. Further,
- * processing for IPsec packets is done when it comes back in clear.
- * NOTE : The callers of this function need to do the ire_refrele for the
- * ire that is being passed in.
- */
-void
-ip_proto_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire,
- ill_t *recv_ill, uint32_t esp_udp_ports)
-{
- boolean_t esp_in_udp_packet = (esp_udp_ports != 0);
- ill_t *ill = (ill_t *)q->q_ptr;
- uint32_t sum;
- uint32_t u1;
- uint32_t u2;
- int hdr_length;
- boolean_t mctl_present;
- mblk_t *first_mp = mp;
- mblk_t *hada_mp = NULL;
- ipha_t *inner_ipha;
- ip_stack_t *ipst;
-
- ASSERT(recv_ill != NULL);
- ipst = recv_ill->ill_ipst;
-
- TRACE_1(TR_FAC_IP, TR_IP_RPUT_LOCL_START,
- "ip_rput_locl_start: q %p", q);
-
- ASSERT(ire->ire_ipversion == IPV4_VERSION);
- ASSERT(ill != NULL);
-
-#define rptr ((uchar_t *)ipha)
-#define iphs ((uint16_t *)ipha)
-
- /*
- * no UDP or TCP packet should come here anymore.
- */
- ASSERT(ipha->ipha_protocol != IPPROTO_TCP &&
- ipha->ipha_protocol != IPPROTO_UDP);
-
- EXTRACT_PKT_MP(mp, first_mp, mctl_present);
- if (mctl_present &&
- ((da_ipsec_t *)first_mp->b_rptr)->da_type == IPHADA_M_CTL) {
- ASSERT(MBLKL(first_mp) >= sizeof (da_ipsec_t));
-
- /*
- * It's an IPsec accelerated packet.
- * Keep a pointer to the data attributes around until
- * we allocate the ipsec_info_t.
- */
- IPSECHW_DEBUG(IPSECHW_PKT,
- ("ip_rput_local: inbound HW accelerated IPsec pkt\n"));
- hada_mp = first_mp;
- hada_mp->b_cont = NULL;
- /*
- * Since it is accelerated, it comes directly from
- * the ill and the data attributes is followed by
- * the packet data.
- */
- ASSERT(mp->b_datap->db_type != M_CTL);
- first_mp = mp;
- mctl_present = B_FALSE;
- }
-
- /*
- * IF M_CTL is not present, then ipsec_in_is_secure
- * should return B_TRUE. There is a case where loopback
- * packets has an M_CTL in the front with all the
- * IPsec options set to IPSEC_PREF_NEVER - which means
- * ipsec_in_is_secure will return B_FALSE. As loopback
- * packets never comes here, it is safe to ASSERT the
- * following.
- */
- ASSERT(!mctl_present || ipsec_in_is_secure(first_mp));
-
- /*
- * Also, we should never have an mctl_present if this is an
- * ESP-in-UDP packet.
- */
- ASSERT(!mctl_present || !esp_in_udp_packet);
-
- /* u1 is # words of IP options */
- u1 = ipha->ipha_version_and_hdr_length - (uchar_t)((IP_VERSION << 4) +
- IP_SIMPLE_HDR_LENGTH_IN_WORDS);
-
- /*
- * Don't verify header checksum if we just removed UDP header or
- * packet is coming back from AH/ESP.
- */
- if (!esp_in_udp_packet && !mctl_present) {
- if (u1) {
- if (!ip_options_cksum(q, ill, mp, ipha, ire, ipst)) {
- if (hada_mp != NULL)
- freemsg(hada_mp);
- return;
- }
- } else {
- /* Check the IP header checksum. */
-#define uph ((uint16_t *)ipha)
- sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] +
- uph[5] + uph[6] + uph[7] + uph[8] + uph[9];
-#undef uph
- /* finish doing IP checksum */
- sum = (sum & 0xFFFF) + (sum >> 16);
- sum = ~(sum + (sum >> 16)) & 0xFFFF;
- if (sum && sum != 0xFFFF) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs);
- goto drop_pkt;
- }
- }
- }
-
- /*
- * Count for SNMP of inbound packets for ire. As ip_proto_input
- * might be called more than once for secure packets, count only
- * the first time.
- */
- if (!mctl_present) {
- UPDATE_IB_PKT_COUNT(ire);
- ire->ire_last_used_time = lbolt;
- }
-
- /* Check for fragmentation offset. */
- u2 = ntohs(ipha->ipha_fragment_offset_and_flags);
- u1 = u2 & (IPH_MF | IPH_OFFSET);
- if (u1) {
- /*
- * We re-assemble fragments before we do the AH/ESP
- * processing. Thus, M_CTL should not be present
- * while we are re-assembling.
- */
- ASSERT(!mctl_present);
- ASSERT(first_mp == mp);
- if (!ip_rput_fragment(ill, recv_ill, &mp, ipha, NULL, NULL))
- return;
-
- /*
- * Make sure that first_mp points back to mp as
- * the mp we came in with could have changed in
- * ip_rput_fragment().
- */
- ipha = (ipha_t *)mp->b_rptr;
- first_mp = mp;
- }
-
- /*
- * Clear hardware checksumming flag as it is currently only
- * used by TCP and UDP.
- */
- DB_CKSUMFLAGS(mp) = 0;
-
- /* Now we have a complete datagram, destined for this machine. */
- u1 = IPH_HDR_LENGTH(ipha);
- switch (ipha->ipha_protocol) {
- case IPPROTO_ICMP: {
- ire_t *ire_zone;
- ilm_t *ilm;
- mblk_t *mp1;
- zoneid_t last_zoneid;
- ilm_walker_t ilw;
-
- if (CLASSD(ipha->ipha_dst) && !IS_LOOPBACK(recv_ill)) {
- ASSERT(ire->ire_type == IRE_BROADCAST);
-
- /*
- * In the multicast case, applications may have joined
- * the group from different zones, so we need to deliver
- * the packet to each of them. Loop through the
- * multicast memberships structures (ilm) on the receive
- * ill and send a copy of the packet up each matching
- * one. However, we don't do this for multicasts sent on
- * the loopback interface (PHYI_LOOPBACK flag set) as
- * they must stay in the sender's zone.
- *
- * ilm_add_v6() ensures that ilms in the same zone are
- * contiguous in the ill_ilm list. We use this property
- * to avoid sending duplicates needed when two
- * applications in the same zone join the same group on
- * different logical interfaces: we ignore the ilm if
- * its zoneid is the same as the last matching one.
- * In addition, the sending of the packet for
- * ire_zoneid is delayed until all of the other ilms
- * have been exhausted.
- */
- last_zoneid = -1;
- ilm = ilm_walker_start(&ilw, recv_ill);
- for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) {
- if (ipha->ipha_dst != ilm->ilm_addr ||
- ilm->ilm_zoneid == last_zoneid ||
- ilm->ilm_zoneid == ire->ire_zoneid ||
- ilm->ilm_zoneid == ALL_ZONES ||
- !(ilm->ilm_ipif->ipif_flags & IPIF_UP))
- continue;
- mp1 = ip_copymsg(first_mp);
- if (mp1 == NULL)
- continue;
- icmp_inbound(q, mp1, B_TRUE, ilw.ilw_walk_ill,
- 0, sum, mctl_present, B_TRUE,
- recv_ill, ilm->ilm_zoneid);
- last_zoneid = ilm->ilm_zoneid;
- }
- ilm_walker_finish(&ilw);
- } else if (ire->ire_type == IRE_BROADCAST) {
- /*
- * In the broadcast case, there may be many zones
- * which need a copy of the packet delivered to them.
- * There is one IRE_BROADCAST per broadcast address
- * and per zone; we walk those using a helper function.
- * In addition, the sending of the packet for ire is
- * delayed until all of the other ires have been
- * processed.
- */
- IRB_REFHOLD(ire->ire_bucket);
- ire_zone = NULL;
- while ((ire_zone = ire_get_next_bcast_ire(ire_zone,
- ire)) != NULL) {
- mp1 = ip_copymsg(first_mp);
- if (mp1 == NULL)
- continue;
-
- UPDATE_IB_PKT_COUNT(ire_zone);
- ire_zone->ire_last_used_time = lbolt;
- icmp_inbound(q, mp1, B_TRUE, ill,
- 0, sum, mctl_present, B_TRUE,
- recv_ill, ire_zone->ire_zoneid);
- }
- IRB_REFRELE(ire->ire_bucket);
- }
- icmp_inbound(q, first_mp, (ire->ire_type == IRE_BROADCAST),
- ill, 0, sum, mctl_present, B_TRUE, recv_ill,
- ire->ire_zoneid);
- TRACE_2(TR_FAC_IP, TR_IP_RPUT_LOCL_END,
- "ip_rput_locl_end: q %p (%S)", q, "icmp");
- return;
- }
- case IPPROTO_IGMP:
- /*
- * If we are not willing to accept IGMP packets in clear,
- * then check with global policy.
- */
- if (ipst->ips_igmp_accept_clear_messages == 0) {
- first_mp = ipsec_check_global_policy(first_mp, NULL,
- ipha, NULL, mctl_present, ipst->ips_netstack);
- if (first_mp == NULL)
- return;
- }
- if (is_system_labeled() && !tsol_can_accept_raw(mp, B_TRUE)) {
- freemsg(first_mp);
- ip1dbg(("ip_proto_input: zone all cannot accept raw"));
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- return;
- }
- if ((mp = igmp_input(q, mp, ill)) == NULL) {
- /* Bad packet - discarded by igmp_input */
- TRACE_2(TR_FAC_IP, TR_IP_RPUT_LOCL_END,
- "ip_rput_locl_end: q %p (%S)", q, "igmp");
- if (mctl_present)
- freeb(first_mp);
- return;
- }
- /*
- * igmp_input() may have returned the pulled up message.
- * So first_mp and ipha need to be reinitialized.
- */
- ipha = (ipha_t *)mp->b_rptr;
- if (mctl_present)
- first_mp->b_cont = mp;
- else
- first_mp = mp;
- if (ipst->ips_ipcl_proto_fanout[ipha->ipha_protocol].
- connf_head != NULL) {
- /* No user-level listener for IGMP packets */
- goto drop_pkt;
- }
- /* deliver to local raw users */
- break;
- case IPPROTO_PIM:
- /*
- * If we are not willing to accept PIM packets in clear,
- * then check with global policy.
- */
- if (ipst->ips_pim_accept_clear_messages == 0) {
- first_mp = ipsec_check_global_policy(first_mp, NULL,
- ipha, NULL, mctl_present, ipst->ips_netstack);
- if (first_mp == NULL)
- return;
- }
- if (is_system_labeled() && !tsol_can_accept_raw(mp, B_TRUE)) {
- freemsg(first_mp);
- ip1dbg(("ip_proto_input: zone all cannot accept PIM"));
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- return;
- }
- if (pim_input(q, mp, ill) != 0) {
- /* Bad packet - discarded by pim_input */
- TRACE_2(TR_FAC_IP, TR_IP_RPUT_LOCL_END,
- "ip_rput_locl_end: q %p (%S)", q, "pim");
- if (mctl_present)
- freeb(first_mp);
- return;
- }
-
- /*
- * pim_input() may have pulled up the message so ipha needs to
- * be reinitialized.
- */
- ipha = (ipha_t *)mp->b_rptr;
- if (ipst->ips_ipcl_proto_fanout[ipha->ipha_protocol].
- connf_head != NULL) {
- /* No user-level listener for PIM packets */
- goto drop_pkt;
- }
- /* deliver to local raw users */
- break;
- case IPPROTO_ENCAP:
- /*
- * Handle self-encapsulated packets (IP-in-IP where
- * the inner addresses == the outer addresses).
- */
- hdr_length = IPH_HDR_LENGTH(ipha);
- if ((uchar_t *)ipha + hdr_length + sizeof (ipha_t) >
- mp->b_wptr) {
- if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length +
- sizeof (ipha_t) - mp->b_rptr)) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- freemsg(first_mp);
- return;
- }
- ipha = (ipha_t *)mp->b_rptr;
- }
- inner_ipha = (ipha_t *)((uchar_t *)ipha + hdr_length);
- /*
- * Check the sanity of the inner IP header.
- */
- if ((IPH_HDR_VERSION(inner_ipha) != IPV4_VERSION)) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- freemsg(first_mp);
- return;
- }
- if (IPH_HDR_LENGTH(inner_ipha) < sizeof (ipha_t)) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- freemsg(first_mp);
- return;
- }
- if (inner_ipha->ipha_src == ipha->ipha_src &&
- inner_ipha->ipha_dst == ipha->ipha_dst) {
- ipsec_in_t *ii;
-
- /*
- * Self-encapsulated tunnel packet. Remove
- * the outer IP header and fanout again.
- * We also need to make sure that the inner
- * header is pulled up until options.
- */
- mp->b_rptr = (uchar_t *)inner_ipha;
- ipha = inner_ipha;
- hdr_length = IPH_HDR_LENGTH(ipha);
- if ((uchar_t *)ipha + hdr_length > mp->b_wptr) {
- if (!pullupmsg(mp, (uchar_t *)ipha +
- + hdr_length - mp->b_rptr)) {
- freemsg(first_mp);
- return;
- }
- ipha = (ipha_t *)mp->b_rptr;
- }
- if (hdr_length > sizeof (ipha_t)) {
- /* We got options on the inner packet. */
- ipaddr_t dst = ipha->ipha_dst;
-
- if (ip_rput_options(q, mp, ipha, &dst, ipst) ==
- -1) {
- /* Bad options! */
- return;
- }
- if (dst != ipha->ipha_dst) {
- /*
- * Someone put a source-route in
- * the inside header of a self-
- * encapsulated packet. Drop it
- * with extreme prejudice and let
- * the sender know.
- */
- icmp_unreachable(q, first_mp,
- ICMP_SOURCE_ROUTE_FAILED,
- recv_ill->ill_zoneid, ipst);
- return;
- }
- }
- if (!mctl_present) {
- ASSERT(first_mp == mp);
- /*
- * This means that somebody is sending
- * Self-encapsualted packets without AH/ESP.
- * If AH/ESP was present, we would have already
- * allocated the first_mp.
- *
- * Send this packet to find a tunnel endpoint.
- * if I can't find one, an ICMP
- * PROTOCOL_UNREACHABLE will get sent.
- */
- goto fanout;
- }
- /*
- * We generally store the ill_index if we need to
- * do IPsec processing as we lose the ill queue when
- * we come back. But in this case, we never should
- * have to store the ill_index here as it should have
- * been stored previously when we processed the
- * AH/ESP header in this routine or for non-ipsec
- * cases, we still have the queue. But for some bad
- * packets from the wire, we can get to IPsec after
- * this and we better store the index for that case.
- */
- ill = (ill_t *)q->q_ptr;
- ii = (ipsec_in_t *)first_mp->b_rptr;
- ii->ipsec_in_ill_index =
- ill->ill_phyint->phyint_ifindex;
- ii->ipsec_in_rill_index =
- recv_ill->ill_phyint->phyint_ifindex;
- if (ii->ipsec_in_decaps) {
- /*
- * This packet is self-encapsulated multiple
- * times. We don't want to recurse infinitely.
- * To keep it simple, drop the packet.
- */
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- freemsg(first_mp);
- return;
- }
- ii->ipsec_in_decaps = B_TRUE;
- ip_fanout_proto_again(first_mp, recv_ill, recv_ill,
- ire);
- return;
- }
- break;
- case IPPROTO_AH:
- case IPPROTO_ESP: {
- ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec;
-
- /*
- * Fast path for AH/ESP. If this is the first time
- * we are sending a datagram to AH/ESP, allocate
- * a IPSEC_IN message and prepend it. Otherwise,
- * just fanout.
- */
-
- int ipsec_rc;
- ipsec_in_t *ii;
- netstack_t *ns = ipst->ips_netstack;
-
- IP_STAT(ipst, ipsec_proto_ahesp);
- if (!mctl_present) {
- ASSERT(first_mp == mp);
- first_mp = ipsec_in_alloc(B_TRUE, ns);
- if (first_mp == NULL) {
- ip1dbg(("ip_proto_input: IPSEC_IN "
- "allocation failure.\n"));
- freemsg(hada_mp); /* okay ifnull */
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- freemsg(mp);
- return;
- }
- /*
- * Store the ill_index so that when we come back
- * from IPsec we ride on the same queue.
- */
- ill = (ill_t *)q->q_ptr;
- ii = (ipsec_in_t *)first_mp->b_rptr;
- ii->ipsec_in_ill_index =
- ill->ill_phyint->phyint_ifindex;
- ii->ipsec_in_rill_index =
- recv_ill->ill_phyint->phyint_ifindex;
- first_mp->b_cont = mp;
- /*
- * Cache hardware acceleration info.
- */
- if (hada_mp != NULL) {
- IPSECHW_DEBUG(IPSECHW_PKT,
- ("ip_rput_local: caching data attr.\n"));
- ii->ipsec_in_accelerated = B_TRUE;
- ii->ipsec_in_da = hada_mp;
- hada_mp = NULL;
- }
- } else {
- ii = (ipsec_in_t *)first_mp->b_rptr;
- }
-
- ii->ipsec_in_esp_udp_ports = esp_udp_ports;
-
- if (!ipsec_loaded(ipss)) {
- ip_proto_not_sup(q, first_mp, IP_FF_SEND_ICMP,
- ire->ire_zoneid, ipst);
- return;
- }
-
- ns = ipst->ips_netstack;
- /* select inbound SA and have IPsec process the pkt */
- if (ipha->ipha_protocol == IPPROTO_ESP) {
- esph_t *esph = ipsec_inbound_esp_sa(first_mp, ns);
- boolean_t esp_in_udp_sa;
- if (esph == NULL)
- return;
- ASSERT(ii->ipsec_in_esp_sa != NULL);
- ASSERT(ii->ipsec_in_esp_sa->ipsa_input_func != NULL);
- esp_in_udp_sa = ((ii->ipsec_in_esp_sa->ipsa_flags &
- IPSA_F_NATT) != 0);
- /*
- * The following is a fancy, but quick, way of saying:
- * ESP-in-UDP SA and Raw ESP packet --> drop
- * OR
- * ESP SA and ESP-in-UDP packet --> drop
- */
- if (esp_in_udp_sa != esp_in_udp_packet) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- ip_drop_packet(first_mp, B_TRUE, ill, NULL,
- DROPPER(ns->netstack_ipsec, ipds_esp_no_sa),
- &ns->netstack_ipsec->ipsec_dropper);
- return;
- }
- ipsec_rc = ii->ipsec_in_esp_sa->ipsa_input_func(
- first_mp, esph);
- } else {
- ah_t *ah = ipsec_inbound_ah_sa(first_mp, ns);
- if (ah == NULL)
- return;
- ASSERT(ii->ipsec_in_ah_sa != NULL);
- ASSERT(ii->ipsec_in_ah_sa->ipsa_input_func != NULL);
- ipsec_rc = ii->ipsec_in_ah_sa->ipsa_input_func(
- first_mp, ah);
- }
-
- switch (ipsec_rc) {
- case IPSEC_STATUS_SUCCESS:
- break;
- case IPSEC_STATUS_FAILED:
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- /* FALLTHRU */
- case IPSEC_STATUS_PENDING:
- return;
- }
- /* we're done with IPsec processing, send it up */
- ip_fanout_proto_again(first_mp, ill, recv_ill, ire);
- return;
- }
- default:
- break;
- }
- if (is_system_labeled() && !tsol_can_accept_raw(mp, B_FALSE)) {
- ip1dbg(("ip_proto_input: zone %d cannot accept raw IP",
- ire->ire_zoneid));
- goto drop_pkt;
- }
- /*
- * Handle protocols with which IP is less intimate. There
- * can be more than one stream bound to a particular
- * protocol. When this is the case, each one gets a copy
- * of any incoming packets.
- */
-fanout:
- ip_fanout_proto(q, first_mp, ill, ipha,
- IP_FF_SEND_ICMP | IP_FF_CKSUM | IP_FF_RAWIP, mctl_present,
- B_TRUE, recv_ill, ire->ire_zoneid);
- TRACE_2(TR_FAC_IP, TR_IP_RPUT_LOCL_END,
- "ip_rput_locl_end: q %p (%S)", q, "ip_fanout_proto");
- return;
-
-drop_pkt:
- freemsg(first_mp);
- if (hada_mp != NULL)
- freeb(hada_mp);
- TRACE_2(TR_FAC_IP, TR_IP_RPUT_LOCL_END,
- "ip_rput_locl_end: q %p (%S)", q, "droppkt");
-#undef rptr
-#undef iphs
-
-}
-
-/*
* Update any source route, record route or timestamp options.
* Check that we are at end of strict source route.
- * The options have already been checked for sanity in ip_rput_options().
+ * The options have already been checked for sanity in ip_input_options().
*/
-static boolean_t
-ip_rput_local_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire,
- ip_stack_t *ipst)
+boolean_t
+ip_input_local_options(mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
{
ipoptp_t opts;
uchar_t *opt;
uint8_t optval;
uint8_t optlen;
ipaddr_t dst;
+ ipaddr_t ifaddr;
uint32_t ts;
- ire_t *dst_ire;
timestruc_t now;
- zoneid_t zoneid;
- ill_t *ill;
-
- ASSERT(ire->ire_ipversion == IPV4_VERSION);
+ ill_t *ill = ira->ira_ill;
+ ip_stack_t *ipst = ill->ill_ipst;
- ip2dbg(("ip_rput_local_options\n"));
+ ip2dbg(("ip_input_local_options\n"));
for (optval = ipoptp_first(&opts, ipha);
optval != IPOPT_EOL;
@@ -17900,7 +9360,7 @@ ip_rput_local_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire,
ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0);
opt = opts.ipoptp_cur;
optlen = opts.ipoptp_len;
- ip2dbg(("ip_rput_local_options: opt %d, len %d\n",
+ ip2dbg(("ip_input_local_options: opt %d, len %d\n",
optval, optlen));
switch (optval) {
uint32_t off;
@@ -17911,7 +9371,7 @@ ip_rput_local_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire,
if (optlen < IP_ADDR_LEN ||
off > optlen - IP_ADDR_LEN) {
/* End of source route */
- ip1dbg(("ip_rput_local_options: end of SR\n"));
+ ip1dbg(("ip_input_local_options: end of SR\n"));
break;
}
/*
@@ -17920,7 +9380,7 @@ ip_rput_local_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire,
* it is a packet with a loose source route which
* reaches us before consuming the whole source route
*/
- ip1dbg(("ip_rput_local_options: not end of SR\n"));
+ ip1dbg(("ip_input_local_options: not end of SR\n"));
if (optval == IPOPT_SSRR) {
goto bad_src_route;
}
@@ -17941,11 +9401,17 @@ ip_rput_local_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire,
off > optlen - IP_ADDR_LEN) {
/* No more room - ignore */
ip1dbg((
- "ip_rput_local_options: end of RR\n"));
+ "ip_input_local_options: end of RR\n"));
break;
}
- bcopy(&ire->ire_src_addr, (char *)opt + off,
- IP_ADDR_LEN);
+ /* Pick a reasonable address on the outbound if */
+ if (ip_select_source_v4(ill, INADDR_ANY, ipha->ipha_dst,
+ INADDR_ANY, ALL_ZONES, ipst, &ifaddr, NULL,
+ NULL) != 0) {
+ /* No source! Shouldn't happen */
+ ifaddr = INADDR_ANY;
+ }
+ bcopy(&ifaddr, (char *)opt + off, IP_ADDR_LEN);
opt[IPOPT_OFFSET] += IP_ADDR_LEN;
break;
case IPOPT_TS:
@@ -17959,14 +9425,10 @@ ip_rput_local_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire,
/* Verify that the address matched */
off = opt[IPOPT_OFFSET] - 1;
bcopy((char *)opt + off, &dst, IP_ADDR_LEN);
- dst_ire = ire_ctable_lookup(dst, 0, IRE_LOCAL,
- NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE,
- ipst);
- if (dst_ire == NULL) {
+ if (ip_type_v4(dst, ipst) != IRE_LOCAL) {
/* Not for us */
break;
}
- ire_refrele(dst_ire);
/* FALLTHRU */
case IPOPT_TS_TSANDADDR:
off = IP_ADDR_LEN + IPOPT_TS_TIMELEN;
@@ -17976,8 +9438,8 @@ ip_rput_local_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire,
* ip_*put_options should have already
* dropped this packet.
*/
- cmn_err(CE_PANIC, "ip_rput_local_options: "
- "unknown IT - bug in ip_rput_options?\n");
+ cmn_err(CE_PANIC, "ip_input_local_options: "
+ "unknown IT - bug in ip_input_options?\n");
return (B_TRUE); /* Keep "lint" happy */
}
if (opt[IPOPT_OFFSET] - 1 + off > optlen) {
@@ -17993,8 +9455,14 @@ ip_rput_local_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire,
case IPOPT_TS_PRESPEC:
case IPOPT_TS_PRESPEC_RFC791:
case IPOPT_TS_TSANDADDR:
- bcopy(&ire->ire_src_addr, (char *)opt + off,
- IP_ADDR_LEN);
+ /* Pick a reasonable addr on the outbound if */
+ if (ip_select_source_v4(ill, INADDR_ANY,
+ ipha->ipha_dst, INADDR_ANY, ALL_ZONES, ipst,
+ &ifaddr, NULL, NULL) != 0) {
+ /* No source! Shouldn't happen */
+ ifaddr = INADDR_ANY;
+ }
+ bcopy(&ifaddr, (char *)opt + off, IP_ADDR_LEN);
opt[IPOPT_OFFSET] += IP_ADDR_LEN;
/* FALLTHRU */
case IPOPT_TS_TSONLY:
@@ -18013,51 +9481,41 @@ ip_rput_local_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire,
return (B_TRUE);
bad_src_route:
- q = WR(q);
- if (q->q_next != NULL)
- ill = q->q_ptr;
- else
- ill = NULL;
-
/* make sure we clear any indication of a hardware checksum */
DB_CKSUMFLAGS(mp) = 0;
- zoneid = ipif_lookup_addr_zoneid(ipha->ipha_dst, ill, ipst);
- if (zoneid == ALL_ZONES)
- freemsg(mp);
- else
- icmp_unreachable(q, mp, ICMP_SOURCE_ROUTE_FAILED, zoneid, ipst);
+ ip_drop_input("ICMP_SOURCE_ROUTE_FAILED", mp, ill);
+ icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, ira);
return (B_FALSE);
}
/*
- * Process IP options in an inbound packet. If an option affects the
- * effective destination address, return the next hop address via dstp.
- * Returns -1 if something fails in which case an ICMP error has been sent
+ * Process IP options in an inbound packet. Always returns the nexthop.
+ * Normally this is the passed in nexthop, but if there is an option
+ * that effects the nexthop (such as a source route) that will be returned.
+ * Sets *errorp if there is an error, in which case an ICMP error has been sent
* and mp freed.
*/
-static int
-ip_rput_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ipaddr_t *dstp,
- ip_stack_t *ipst)
+ipaddr_t
+ip_input_options(ipha_t *ipha, ipaddr_t dst, mblk_t *mp,
+ ip_recv_attr_t *ira, int *errorp)
{
+ ip_stack_t *ipst = ira->ira_ill->ill_ipst;
ipoptp_t opts;
uchar_t *opt;
uint8_t optval;
uint8_t optlen;
- ipaddr_t dst;
intptr_t code = 0;
- ire_t *ire = NULL;
- zoneid_t zoneid;
- ill_t *ill;
+ ire_t *ire;
- ip2dbg(("ip_rput_options\n"));
- dst = ipha->ipha_dst;
+ ip2dbg(("ip_input_options\n"));
+ *errorp = 0;
for (optval = ipoptp_first(&opts, ipha);
optval != IPOPT_EOL;
optval = ipoptp_next(&opts)) {
opt = opts.ipoptp_cur;
optlen = opts.ipoptp_len;
- ip2dbg(("ip_rput_options: opt %d, len %d\n",
+ ip2dbg(("ip_input_options: opt %d, len %d\n",
optval, optlen));
/*
* Note: we need to verify the checksum before we
@@ -18068,27 +9526,24 @@ ip_rput_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ipaddr_t *dstp,
uint32_t off;
case IPOPT_SSRR:
case IPOPT_LSRR:
- ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, NULL,
- ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
- if (ire == NULL) {
+ if (ip_type_v4(dst, ipst) != IRE_LOCAL) {
if (optval == IPOPT_SSRR) {
- ip1dbg(("ip_rput_options: not next"
+ ip1dbg(("ip_input_options: not next"
" strict source route 0x%x\n",
ntohl(dst)));
code = (char *)&ipha->ipha_dst -
(char *)ipha;
goto param_prob; /* RouterReq's */
}
- ip2dbg(("ip_rput_options: "
+ ip2dbg(("ip_input_options: "
"not next source route 0x%x\n",
ntohl(dst)));
break;
}
- ire_refrele(ire);
if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
ip1dbg((
- "ip_rput_options: bad option offset\n"));
+ "ip_input_options: bad option offset\n"));
code = (char *)&opt[IPOPT_OLEN] -
(char *)ipha;
goto param_prob;
@@ -18099,11 +9554,11 @@ ip_rput_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ipaddr_t *dstp,
if (optlen < IP_ADDR_LEN ||
off > optlen - IP_ADDR_LEN) {
/* End of source route */
- ip1dbg(("ip_rput_options: end of SR\n"));
+ ip1dbg(("ip_input_options: end of SR\n"));
break;
}
bcopy((char *)opt + off, &dst, IP_ADDR_LEN);
- ip1dbg(("ip_rput_options: next hop 0x%x\n",
+ ip1dbg(("ip_input_options: next hop 0x%x\n",
ntohl(dst)));
/*
@@ -18112,17 +9567,13 @@ ip_rput_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ipaddr_t *dstp,
* XXX verify per-interface ip_forwarding
* for source route?
*/
- ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, NULL,
- ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
-
- if (ire != NULL) {
- ire_refrele(ire);
+ if (ip_type_v4(dst, ipst) == IRE_LOCAL) {
off += IP_ADDR_LEN;
goto redo_srr;
}
if (dst == htonl(INADDR_LOOPBACK)) {
- ip1dbg(("ip_rput_options: loopback addr in "
+ ip1dbg(("ip_input_options: loopback addr in "
"source route!\n"));
goto bad_src_route;
}
@@ -18131,12 +9582,13 @@ ip_rput_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ipaddr_t *dstp,
* reachable.
*/
if (optval == IPOPT_SSRR) {
- ire = ire_ftable_lookup(dst, 0, 0,
- IRE_INTERFACE, NULL, NULL, ALL_ZONES, 0,
- msg_getlabel(mp),
- MATCH_IRE_TYPE | MATCH_IRE_SECATTR, ipst);
+ ire = ire_ftable_lookup_v4(dst, 0, 0,
+ IRE_IF_ALL, NULL, ALL_ZONES,
+ ira->ira_tsl,
+ MATCH_IRE_TYPE | MATCH_IRE_SECATTR, 0, ipst,
+ NULL);
if (ire == NULL) {
- ip1dbg(("ip_rput_options: SSRR not "
+ ip1dbg(("ip_input_options: SSRR not "
"directly reachable: 0x%x\n",
ntohl(dst)));
goto bad_src_route;
@@ -18151,7 +9603,7 @@ ip_rput_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ipaddr_t *dstp,
case IPOPT_RR:
if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
ip1dbg((
- "ip_rput_options: bad option offset\n"));
+ "ip_input_options: bad option offset\n"));
code = (char *)&opt[IPOPT_OLEN] -
(char *)ipha;
goto param_prob;
@@ -18169,7 +9621,7 @@ ip_rput_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ipaddr_t *dstp,
}
if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
ip1dbg((
- "ip_rput_options: bad option offset\n"));
+ "ip_input_options: bad option offset\n"));
code = (char *)&opt[IPOPT_OFFSET] -
(char *)ipha;
goto param_prob;
@@ -18201,45 +9653,27 @@ ip_rput_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ipaddr_t *dstp,
}
if ((opts.ipoptp_flags & IPOPTP_ERROR) == 0) {
- *dstp = dst;
- return (0);
+ return (dst);
}
- ip1dbg(("ip_rput_options: error processing IP options."));
+ ip1dbg(("ip_input_options: error processing IP options."));
code = (char *)&opt[IPOPT_OFFSET] - (char *)ipha;
param_prob:
- q = WR(q);
- if (q->q_next != NULL)
- ill = q->q_ptr;
- else
- ill = NULL;
-
/* make sure we clear any indication of a hardware checksum */
DB_CKSUMFLAGS(mp) = 0;
- /* Don't know whether this is for non-global or global/forwarding */
- zoneid = ipif_lookup_addr_zoneid(dst, ill, ipst);
- if (zoneid == ALL_ZONES)
- freemsg(mp);
- else
- icmp_param_problem(q, mp, (uint8_t)code, zoneid, ipst);
- return (-1);
+ ip_drop_input("ICMP_PARAM_PROBLEM", mp, ira->ira_ill);
+ icmp_param_problem(mp, (uint8_t)code, ira);
+ *errorp = -1;
+ return (dst);
bad_src_route:
- q = WR(q);
- if (q->q_next != NULL)
- ill = q->q_ptr;
- else
- ill = NULL;
-
/* make sure we clear any indication of a hardware checksum */
DB_CKSUMFLAGS(mp) = 0;
- zoneid = ipif_lookup_addr_zoneid(dst, ill, ipst);
- if (zoneid == ALL_ZONES)
- freemsg(mp);
- else
- icmp_unreachable(q, mp, ICMP_SOURCE_ROUTE_FAILED, zoneid, ipst);
- return (-1);
+ ip_drop_input("ICMP_SOURCE_ROUTE_FAILED", mp, ira->ira_ill);
+ icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, ira);
+ *errorp = -1;
+ return (dst);
}
/*
@@ -18248,7 +9682,7 @@ bad_src_route:
* - icmp fixed part (mib2_icmp_t)
* - ipAddrEntryTable (ip 20) all IPv4 ipifs
* - ipRouteEntryTable (ip 21) all IPv4 IREs
- * - ipNetToMediaEntryTable (ip 22) [filled in by the arp module]
+ * - ipNetToMediaEntryTable (ip 22) all IPv4 Neighbor Cache entries
* - ipRouteAttributeTable (ip 102) labeled routes
* - ip multicast membership (ip_member_t)
* - ip multicast source filtering (ip_grpsrc_t)
@@ -18262,13 +9696,11 @@ bad_src_route:
* One per ill plus one generic
* - ipv6RouteEntry all IPv6 IREs
* - ipv6RouteAttributeTable (ip6 102) labeled routes
- * - ipv6NetToMediaEntry all Neighbor Cache entries
+ * - ipv6NetToMediaEntry all IPv6 Neighbor Cache entries
* - ipv6AddrEntry all IPv6 ipifs
* - ipv6 multicast membership (ipv6_member_t)
* - ipv6 multicast source filtering (ipv6_grpsrc_t)
*
- * MIB2_IP_MEDIA is filled in by the arp module with ARP cache entries.
- *
* NOTE: original mpctl is copied for msg's 2..N, since its ctl part is
* already filled in by the caller.
* Return value of 0 indicates that no messages were sent and caller
@@ -18387,6 +9819,9 @@ ip_snmp_get(queue_t *q, mblk_t *mpctl, int level)
if ((mpctl = sctp_snmp_get_mib2(q, mpctl, sctps)) == NULL) {
return (1);
}
+ if ((mpctl = ip_snmp_get_mib2_ip_dce(q, mpctl, ipst)) == NULL) {
+ return (1);
+ }
freemsg(mpctl);
return (1);
}
@@ -18426,6 +9861,7 @@ ip_snmp_get_mib2_ip(queue_t *q, mblk_t *mpctl, mib2_ipIfStatsEntry_t *ipmib,
SET_MIB(old_ip_mib.ipRouteAttributeSize,
sizeof (mib2_ipAttributeEntry_t));
SET_MIB(old_ip_mib.transportMLPSize, sizeof (mib2_transportMLPEntry_t));
+ SET_MIB(old_ip_mib.ipDestEntrySize, sizeof (dest_cache_entry_t));
/*
* Grab the statistics from the new IP MIB
@@ -18681,9 +10117,14 @@ ip_snmp_get_mib2_ip_addr(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
if (ipif->ipif_zoneid != zoneid &&
ipif->ipif_zoneid != ALL_ZONES)
continue;
+ /* Sum of count from dead IRE_LO* and our current */
mae.ipAdEntInfo.ae_ibcnt = ipif->ipif_ib_pkt_count;
- mae.ipAdEntInfo.ae_obcnt = ipif->ipif_ob_pkt_count;
- mae.ipAdEntInfo.ae_focnt = ipif->ipif_fo_pkt_count;
+ if (ipif->ipif_ire_local != NULL) {
+ mae.ipAdEntInfo.ae_ibcnt +=
+ ipif->ipif_ire_local->ire_ib_pkt_count;
+ }
+ mae.ipAdEntInfo.ae_obcnt = 0;
+ mae.ipAdEntInfo.ae_focnt = 0;
ipif_get_name(ipif, mae.ipAdEntIfIndex.o_bytes,
OCTET_LENGTH);
@@ -18694,7 +10135,7 @@ ip_snmp_get_mib2_ip_addr(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
mae.ipAdEntInfo.ae_subnet = ipif->ipif_subnet;
mae.ipAdEntInfo.ae_subnet_len =
ip_mask_to_plen(ipif->ipif_net_mask);
- mae.ipAdEntInfo.ae_src_addr = ipif->ipif_src_addr;
+ mae.ipAdEntInfo.ae_src_addr = ipif->ipif_lcl_addr;
for (bitval = 1;
bitval &&
!(bitval & ipif->ipif_brd_addr);
@@ -18702,7 +10143,7 @@ ip_snmp_get_mib2_ip_addr(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
noop;
mae.ipAdEntBcastAddr = bitval;
mae.ipAdEntReasmMaxSize = IP_MAXPACKET;
- mae.ipAdEntInfo.ae_mtu = ipif->ipif_mtu;
+ mae.ipAdEntInfo.ae_mtu = ipif->ipif_ill->ill_mtu;
mae.ipAdEntInfo.ae_metric = ipif->ipif_metric;
mae.ipAdEntInfo.ae_broadcast_addr =
ipif->ipif_brd_addr;
@@ -18710,7 +10151,8 @@ ip_snmp_get_mib2_ip_addr(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
ipif->ipif_pp_dst_addr;
mae.ipAdEntInfo.ae_flags = ipif->ipif_flags |
ill->ill_flags | ill->ill_phyint->phyint_flags;
- mae.ipAdEntRetransmitTime = AR_EQ_DEFAULT_XMIT_INTERVAL;
+ mae.ipAdEntRetransmitTime =
+ ill->ill_reachable_retrans_time;
if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
(char *)&mae, (int)sizeof (mib2_ipAddrEntry_t))) {
@@ -18762,9 +10204,14 @@ ip_snmp_get_mib2_ip6_addr(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
if (ipif->ipif_zoneid != zoneid &&
ipif->ipif_zoneid != ALL_ZONES)
continue;
+ /* Sum of count from dead IRE_LO* and our current */
mae6.ipv6AddrInfo.ae_ibcnt = ipif->ipif_ib_pkt_count;
- mae6.ipv6AddrInfo.ae_obcnt = ipif->ipif_ob_pkt_count;
- mae6.ipv6AddrInfo.ae_focnt = ipif->ipif_fo_pkt_count;
+ if (ipif->ipif_ire_local != NULL) {
+ mae6.ipv6AddrInfo.ae_ibcnt +=
+ ipif->ipif_ire_local->ire_ib_pkt_count;
+ }
+ mae6.ipv6AddrInfo.ae_obcnt = 0;
+ mae6.ipv6AddrInfo.ae_focnt = 0;
ipif_get_name(ipif, mae6.ipv6AddrIfIndex.o_bytes,
OCTET_LENGTH);
@@ -18776,7 +10223,7 @@ ip_snmp_get_mib2_ip6_addr(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
mae6.ipv6AddrInfo.ae_subnet = ipif->ipif_v6subnet;
mae6.ipv6AddrInfo.ae_subnet_len =
mae6.ipv6AddrPfxLength;
- mae6.ipv6AddrInfo.ae_src_addr = ipif->ipif_v6src_addr;
+ mae6.ipv6AddrInfo.ae_src_addr = ipif->ipif_v6lcl_addr;
/* Type: stateless(1), stateful(2), unknown(3) */
if (ipif->ipif_flags & IPIF_ADDRCONF)
@@ -18799,7 +10246,7 @@ ip_snmp_get_mib2_ip6_addr(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
mae6.ipv6AddrStatus = 2;
else
mae6.ipv6AddrStatus = 1;
- mae6.ipv6AddrInfo.ae_mtu = ipif->ipif_mtu;
+ mae6.ipv6AddrInfo.ae_mtu = ipif->ipif_ill->ill_mtu;
mae6.ipv6AddrInfo.ae_metric = ipif->ipif_metric;
mae6.ipv6AddrInfo.ae_pp_dst_addr =
ipif->ipif_v6pp_dst_addr;
@@ -18842,7 +10289,6 @@ ip_snmp_get_mib2_ip_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
mblk_t *mp_tail = NULL;
ill_walk_context_t ctx;
zoneid_t zoneid;
- ilm_walker_t ilw;
/*
* make a copy of the original message
@@ -18859,36 +10305,49 @@ ip_snmp_get_mib2_ip_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
rw_enter(&ipst->ips_ill_g_lock, RW_READER);
ill = ILL_START_WALK_V4(&ctx, ipst);
for (; ill != NULL; ill = ill_next(&ctx, ill)) {
- if (IS_UNDER_IPMP(ill))
+ /* Make sure the ill isn't going away. */
+ if (!ill_check_and_refhold(ill))
continue;
+ rw_exit(&ipst->ips_ill_g_lock);
+ rw_enter(&ill->ill_mcast_lock, RW_READER);
+ for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
+ if (ilm->ilm_zoneid != zoneid &&
+ ilm->ilm_zoneid != ALL_ZONES)
+ continue;
- ilm = ilm_walker_start(&ilw, ill);
- for (ipif = ill->ill_ipif; ipif != NULL;
- ipif = ipif->ipif_next) {
- if (ipif->ipif_zoneid != zoneid &&
- ipif->ipif_zoneid != ALL_ZONES)
- continue; /* not this zone */
- ipif_get_name(ipif, ipm.ipGroupMemberIfIndex.o_bytes,
- OCTET_LENGTH);
+ /* Is there an ipif for ilm_ifaddr? */
+ for (ipif = ill->ill_ipif; ipif != NULL;
+ ipif = ipif->ipif_next) {
+ if (!IPIF_IS_CONDEMNED(ipif) &&
+ ipif->ipif_lcl_addr == ilm->ilm_ifaddr &&
+ ilm->ilm_ifaddr != INADDR_ANY)
+ break;
+ }
+ if (ipif != NULL) {
+ ipif_get_name(ipif,
+ ipm.ipGroupMemberIfIndex.o_bytes,
+ OCTET_LENGTH);
+ } else {
+ ill_get_name(ill,
+ ipm.ipGroupMemberIfIndex.o_bytes,
+ OCTET_LENGTH);
+ }
ipm.ipGroupMemberIfIndex.o_length =
mi_strlen(ipm.ipGroupMemberIfIndex.o_bytes);
- for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) {
- ASSERT(ilm->ilm_ipif != NULL);
- ASSERT(ilm->ilm_ill == NULL);
- if (ilm->ilm_ipif != ipif)
- continue;
- ipm.ipGroupMemberAddress = ilm->ilm_addr;
- ipm.ipGroupMemberRefCnt = ilm->ilm_refcnt;
- ipm.ipGroupMemberFilterMode = ilm->ilm_fmode;
- if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
- (char *)&ipm, (int)sizeof (ipm))) {
- ip1dbg(("ip_snmp_get_mib2_ip_group: "
- "failed to allocate %u bytes\n",
- (uint_t)sizeof (ipm)));
- }
+
+ ipm.ipGroupMemberAddress = ilm->ilm_addr;
+ ipm.ipGroupMemberRefCnt = ilm->ilm_refcnt;
+ ipm.ipGroupMemberFilterMode = ilm->ilm_fmode;
+ if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
+ (char *)&ipm, (int)sizeof (ipm))) {
+ ip1dbg(("ip_snmp_get_mib2_ip_group: "
+ "failed to allocate %u bytes\n",
+ (uint_t)sizeof (ipm)));
}
}
- ilm_walker_finish(&ilw);
+ rw_exit(&ill->ill_mcast_lock);
+ ill_refrele(ill);
+ rw_enter(&ipst->ips_ill_g_lock, RW_READER);
}
rw_exit(&ipst->ips_ill_g_lock);
optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
@@ -18910,7 +10369,6 @@ ip_snmp_get_mib2_ip6_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
mblk_t *mp_tail = NULL;
ill_walk_context_t ctx;
zoneid_t zoneid;
- ilm_walker_t ilw;
/*
* make a copy of the original message
@@ -18926,15 +10384,19 @@ ip_snmp_get_mib2_ip6_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
rw_enter(&ipst->ips_ill_g_lock, RW_READER);
ill = ILL_START_WALK_V6(&ctx, ipst);
for (; ill != NULL; ill = ill_next(&ctx, ill)) {
- if (IS_UNDER_IPMP(ill))
+ /* Make sure the ill isn't going away. */
+ if (!ill_check_and_refhold(ill))
continue;
-
- ilm = ilm_walker_start(&ilw, ill);
+ rw_exit(&ipst->ips_ill_g_lock);
+ /*
+ * Normally we don't have any members on under IPMP interfaces.
+ * We report them as a debugging aid.
+ */
+ rw_enter(&ill->ill_mcast_lock, RW_READER);
ipm6.ipv6GroupMemberIfIndex = ill->ill_phyint->phyint_ifindex;
- for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) {
- ASSERT(ilm->ilm_ipif == NULL);
- ASSERT(ilm->ilm_ill != NULL);
- if (ilm->ilm_zoneid != zoneid)
+ for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
+ if (ilm->ilm_zoneid != zoneid &&
+ ilm->ilm_zoneid != ALL_ZONES)
continue; /* not this zone */
ipm6.ipv6GroupMemberAddress = ilm->ilm_v6addr;
ipm6.ipv6GroupMemberRefCnt = ilm->ilm_refcnt;
@@ -18947,7 +10409,9 @@ ip_snmp_get_mib2_ip6_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
(uint_t)sizeof (ipm6)));
}
}
- ilm_walker_finish(&ilw);
+ rw_exit(&ill->ill_mcast_lock);
+ ill_refrele(ill);
+ rw_enter(&ipst->ips_ill_g_lock, RW_READER);
}
rw_exit(&ipst->ips_ill_g_lock);
@@ -18973,7 +10437,6 @@ ip_snmp_get_mib2_ip_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
zoneid_t zoneid;
int i;
slist_t *sl;
- ilm_walker_t ilw;
/*
* make a copy of the original message
@@ -18990,43 +10453,56 @@ ip_snmp_get_mib2_ip_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
rw_enter(&ipst->ips_ill_g_lock, RW_READER);
ill = ILL_START_WALK_V4(&ctx, ipst);
for (; ill != NULL; ill = ill_next(&ctx, ill)) {
- if (IS_UNDER_IPMP(ill))
+ /* Make sure the ill isn't going away. */
+ if (!ill_check_and_refhold(ill))
continue;
+ rw_exit(&ipst->ips_ill_g_lock);
+ rw_enter(&ill->ill_mcast_lock, RW_READER);
+ for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
+ sl = ilm->ilm_filter;
+ if (ilm->ilm_zoneid != zoneid &&
+ ilm->ilm_zoneid != ALL_ZONES)
+ continue;
+ if (SLIST_IS_EMPTY(sl))
+ continue;
- ilm = ilm_walker_start(&ilw, ill);
- for (ipif = ill->ill_ipif; ipif != NULL;
- ipif = ipif->ipif_next) {
- if (ipif->ipif_zoneid != zoneid)
- continue; /* not this zone */
- ipif_get_name(ipif, ips.ipGroupSourceIfIndex.o_bytes,
- OCTET_LENGTH);
+ /* Is there an ipif for ilm_ifaddr? */
+ for (ipif = ill->ill_ipif; ipif != NULL;
+ ipif = ipif->ipif_next) {
+ if (!IPIF_IS_CONDEMNED(ipif) &&
+ ipif->ipif_lcl_addr == ilm->ilm_ifaddr &&
+ ilm->ilm_ifaddr != INADDR_ANY)
+ break;
+ }
+ if (ipif != NULL) {
+ ipif_get_name(ipif,
+ ips.ipGroupSourceIfIndex.o_bytes,
+ OCTET_LENGTH);
+ } else {
+ ill_get_name(ill,
+ ips.ipGroupSourceIfIndex.o_bytes,
+ OCTET_LENGTH);
+ }
ips.ipGroupSourceIfIndex.o_length =
mi_strlen(ips.ipGroupSourceIfIndex.o_bytes);
- for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) {
- ASSERT(ilm->ilm_ipif != NULL);
- ASSERT(ilm->ilm_ill == NULL);
- sl = ilm->ilm_filter;
- if (ilm->ilm_ipif != ipif || SLIST_IS_EMPTY(sl))
+
+ ips.ipGroupSourceGroup = ilm->ilm_addr;
+ for (i = 0; i < sl->sl_numsrc; i++) {
+ if (!IN6_IS_ADDR_V4MAPPED(&sl->sl_addr[i]))
continue;
- ips.ipGroupSourceGroup = ilm->ilm_addr;
- for (i = 0; i < sl->sl_numsrc; i++) {
- if (!IN6_IS_ADDR_V4MAPPED(
- &sl->sl_addr[i]))
- continue;
- IN6_V4MAPPED_TO_IPADDR(&sl->sl_addr[i],
- ips.ipGroupSourceAddress);
- if (snmp_append_data2(mpctl->b_cont,
- &mp_tail, (char *)&ips,
- (int)sizeof (ips)) == 0) {
- ip1dbg(("ip_snmp_get_mib2_"
- "ip_group_src: failed to "
- "allocate %u bytes\n",
- (uint_t)sizeof (ips)));
- }
+ IN6_V4MAPPED_TO_IPADDR(&sl->sl_addr[i],
+ ips.ipGroupSourceAddress);
+ if (snmp_append_data2(mpctl->b_cont, &mp_tail,
+ (char *)&ips, (int)sizeof (ips)) == 0) {
+ ip1dbg(("ip_snmp_get_mib2_ip_group_src:"
+ " failed to allocate %u bytes\n",
+ (uint_t)sizeof (ips)));
}
}
}
- ilm_walker_finish(&ilw);
+ rw_exit(&ill->ill_mcast_lock);
+ ill_refrele(ill);
+ rw_enter(&ipst->ips_ill_g_lock, RW_READER);
}
rw_exit(&ipst->ips_ill_g_lock);
optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
@@ -19050,7 +10526,6 @@ ip_snmp_get_mib2_ip6_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
zoneid_t zoneid;
int i;
slist_t *sl;
- ilm_walker_t ilw;
/*
* make a copy of the original message
@@ -19066,16 +10541,22 @@ ip_snmp_get_mib2_ip6_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
rw_enter(&ipst->ips_ill_g_lock, RW_READER);
ill = ILL_START_WALK_V6(&ctx, ipst);
for (; ill != NULL; ill = ill_next(&ctx, ill)) {
- if (IS_UNDER_IPMP(ill))
+ /* Make sure the ill isn't going away. */
+ if (!ill_check_and_refhold(ill))
continue;
-
- ilm = ilm_walker_start(&ilw, ill);
+ rw_exit(&ipst->ips_ill_g_lock);
+ /*
+ * Normally we don't have any members on under IPMP interfaces.
+ * We report them as a debugging aid.
+ */
+ rw_enter(&ill->ill_mcast_lock, RW_READER);
ips6.ipv6GroupSourceIfIndex = ill->ill_phyint->phyint_ifindex;
- for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) {
- ASSERT(ilm->ilm_ipif == NULL);
- ASSERT(ilm->ilm_ill != NULL);
+ for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
sl = ilm->ilm_filter;
- if (ilm->ilm_zoneid != zoneid || SLIST_IS_EMPTY(sl))
+ if (ilm->ilm_zoneid != zoneid &&
+ ilm->ilm_zoneid != ALL_ZONES)
+ continue;
+ if (SLIST_IS_EMPTY(sl))
continue;
ips6.ipv6GroupSourceGroup = ilm->ilm_v6addr;
for (i = 0; i < sl->sl_numsrc; i++) {
@@ -19089,7 +10570,9 @@ ip_snmp_get_mib2_ip6_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
}
}
}
- ilm_walker_finish(&ilw);
+ rw_exit(&ill->ill_mcast_lock);
+ ill_refrele(ill);
+ rw_enter(&ipst->ips_ill_g_lock, RW_READER);
}
rw_exit(&ipst->ips_ill_g_lock);
@@ -19189,13 +10672,13 @@ ip_snmp_get_mib2_ip_route_media(queue_t *q, mblk_t *mpctl, int level,
ird.ird_netmedia.lp_head = mp3ctl->b_cont;
ird.ird_attrs.lp_head = mp4ctl->b_cont;
/*
- * If the level has been set the special EXPER_IP_AND_TESTHIDDEN
- * value, then also include IRE_MARK_TESTHIDDEN IREs. This is
+ * If the level has been set the special EXPER_IP_AND_ALL_IRES value,
+ * then also include ire_testhidden IREs and IRE_IF_CLONE. This is
* intended a temporary solution until a proper MIB API is provided
* that provides complete filtering/caller-opt-in.
*/
- if (level == EXPER_IP_AND_TESTHIDDEN)
- ird.ird_flags |= IRD_REPORT_TESTHIDDEN;
+ if (level == EXPER_IP_AND_ALL_IRES)
+ ird.ird_flags |= IRD_REPORT_ALL;
zoneid = Q_TO_CONN(q)->conn_zoneid;
ire_walk_v4(ip_snmp_get2_v4, &ird, zoneid, ipst);
@@ -19210,6 +10693,8 @@ ip_snmp_get_mib2_ip_route_media(queue_t *q, mblk_t *mpctl, int level,
qreply(q, mpctl);
/* ipNetToMediaEntryTable in mp3ctl */
+ ncec_walk(NULL, ip_snmp_get2_v4_media, &ird, ipst);
+
optp = (struct opthdr *)&mp3ctl->b_rptr[sizeof (struct T_optmgmt_ack)];
optp->level = MIB2_IP;
optp->name = MIB2_IP_MEDIA;
@@ -19272,13 +10757,13 @@ ip_snmp_get_mib2_ip6_route_media(queue_t *q, mblk_t *mpctl, int level,
ird.ird_netmedia.lp_head = mp3ctl->b_cont;
ird.ird_attrs.lp_head = mp4ctl->b_cont;
/*
- * If the level has been set the special EXPER_IP_AND_TESTHIDDEN
- * value, then also include IRE_MARK_TESTHIDDEN IREs. This is
+ * If the level has been set the special EXPER_IP_AND_ALL_IRES value,
+ * then also include ire_testhidden IREs and IRE_IF_CLONE. This is
* intended a temporary solution until a proper MIB API is provided
* that provides complete filtering/caller-opt-in.
*/
- if (level == EXPER_IP_AND_TESTHIDDEN)
- ird.ird_flags |= IRD_REPORT_TESTHIDDEN;
+ if (level == EXPER_IP_AND_ALL_IRES)
+ ird.ird_flags |= IRD_REPORT_ALL;
zoneid = Q_TO_CONN(q)->conn_zoneid;
ire_walk_v6(ip_snmp_get2_v6_route, &ird, zoneid, ipst);
@@ -19292,7 +10777,7 @@ ip_snmp_get_mib2_ip6_route_media(queue_t *q, mblk_t *mpctl, int level,
qreply(q, mpctl);
/* ipv6NetToMediaEntryTable in mp3ctl */
- ndp_walk(NULL, ip_snmp_get2_v6_media, &ird, ipst);
+ ncec_walk(NULL, ip_snmp_get2_v6_media, &ird, ipst);
optp = (struct opthdr *)&mp3ctl->b_rptr[sizeof (struct T_optmgmt_ack)];
optp->level = MIB2_IP6;
@@ -19487,21 +10972,20 @@ static void
ip_snmp_get2_v4(ire_t *ire, iproutedata_t *ird)
{
ill_t *ill;
- ipif_t *ipif;
mib2_ipRouteEntry_t *re;
- mib2_ipAttributeEntry_t *iae, *iaeptr;
- ipaddr_t gw_addr;
+ mib2_ipAttributeEntry_t iaes;
tsol_ire_gw_secattr_t *attrp;
tsol_gc_t *gc = NULL;
tsol_gcgrp_t *gcgrp = NULL;
- uint_t sacnt = 0;
- int i;
+ ip_stack_t *ipst = ire->ire_ipst;
ASSERT(ire->ire_ipversion == IPV4_VERSION);
- if (!(ird->ird_flags & IRD_REPORT_TESTHIDDEN) &&
- ire->ire_marks & IRE_MARK_TESTHIDDEN) {
- return;
+ if (!(ird->ird_flags & IRD_REPORT_ALL)) {
+ if (ire->ire_testhidden)
+ return;
+ if (ire->ire_type & IRE_IF_CLONE)
+ return;
}
if ((re = kmem_zalloc(sizeof (*re), KM_NOSLEEP)) == NULL)
@@ -19513,52 +10997,17 @@ ip_snmp_get2_v4(ire_t *ire, iproutedata_t *ird)
gcgrp = gc->gc_grp;
ASSERT(gcgrp != NULL);
rw_enter(&gcgrp->gcgrp_rwlock, RW_READER);
- sacnt = 1;
- } else if ((gcgrp = attrp->igsa_gcgrp) != NULL) {
- rw_enter(&gcgrp->gcgrp_rwlock, RW_READER);
- gc = gcgrp->gcgrp_head;
- sacnt = gcgrp->gcgrp_count;
}
mutex_exit(&attrp->igsa_lock);
-
- /* do nothing if there's no gc to report */
- if (gc == NULL) {
- ASSERT(sacnt == 0);
- if (gcgrp != NULL) {
- /* we might as well drop the lock now */
- rw_exit(&gcgrp->gcgrp_rwlock);
- gcgrp = NULL;
- }
- attrp = NULL;
- }
-
- ASSERT(gc == NULL || (gcgrp != NULL &&
- RW_LOCK_HELD(&gcgrp->gcgrp_rwlock)));
}
- ASSERT(sacnt == 0 || gc != NULL);
-
- if (sacnt != 0 &&
- (iae = kmem_alloc(sacnt * sizeof (*iae), KM_NOSLEEP)) == NULL) {
- kmem_free(re, sizeof (*re));
- rw_exit(&gcgrp->gcgrp_rwlock);
- return;
- }
-
/*
* Return all IRE types for route table... let caller pick and choose
*/
re->ipRouteDest = ire->ire_addr;
- ipif = ire->ire_ipif;
+ ill = ire->ire_ill;
re->ipRouteIfIndex.o_length = 0;
- if (ire->ire_type == IRE_CACHE) {
- ill = (ill_t *)ire->ire_stq->q_ptr;
- re->ipRouteIfIndex.o_length =
- ill->ill_name_length == 0 ? 0 :
- MIN(OCTET_LENGTH, ill->ill_name_length - 1);
- bcopy(ill->ill_name, re->ipRouteIfIndex.o_bytes,
- re->ipRouteIfIndex.o_length);
- } else if (ipif != NULL) {
- ipif_get_name(ipif, re->ipRouteIfIndex.o_bytes, OCTET_LENGTH);
+ if (ill != NULL) {
+ ill_get_name(ill, re->ipRouteIfIndex.o_bytes, OCTET_LENGTH);
re->ipRouteIfIndex.o_length =
mi_strlen(re->ipRouteIfIndex.o_bytes);
}
@@ -19567,30 +11016,45 @@ ip_snmp_get2_v4(ire_t *ire, iproutedata_t *ird)
re->ipRouteMetric3 = -1;
re->ipRouteMetric4 = -1;
- gw_addr = ire->ire_gateway_addr;
-
- if (ire->ire_type & (IRE_INTERFACE|IRE_LOOPBACK|IRE_BROADCAST))
- re->ipRouteNextHop = ire->ire_src_addr;
- else
- re->ipRouteNextHop = gw_addr;
+ re->ipRouteNextHop = ire->ire_gateway_addr;
/* indirect(4), direct(3), or invalid(2) */
if (ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE))
re->ipRouteType = 2;
+ else if (ire->ire_type & IRE_ONLINK)
+ re->ipRouteType = 3;
else
- re->ipRouteType = (gw_addr != 0) ? 4 : 3;
+ re->ipRouteType = 4;
+
re->ipRouteProto = -1;
re->ipRouteAge = gethrestime_sec() - ire->ire_create_time;
re->ipRouteMask = ire->ire_mask;
re->ipRouteMetric5 = -1;
- re->ipRouteInfo.re_max_frag = ire->ire_max_frag;
- re->ipRouteInfo.re_frag_flag = ire->ire_frag_flag;
- re->ipRouteInfo.re_rtt = ire->ire_uinfo.iulp_rtt;
+ re->ipRouteInfo.re_max_frag = ire->ire_metrics.iulp_mtu;
+ if (ire->ire_ill != NULL && re->ipRouteInfo.re_max_frag == 0)
+ re->ipRouteInfo.re_max_frag = ire->ire_ill->ill_mtu;
+
+ re->ipRouteInfo.re_frag_flag = 0;
+ re->ipRouteInfo.re_rtt = 0;
+ re->ipRouteInfo.re_src_addr = 0;
re->ipRouteInfo.re_ref = ire->ire_refcnt;
- re->ipRouteInfo.re_src_addr = ire->ire_src_addr;
re->ipRouteInfo.re_obpkt = ire->ire_ob_pkt_count;
re->ipRouteInfo.re_ibpkt = ire->ire_ib_pkt_count;
re->ipRouteInfo.re_flags = ire->ire_flags;
+ /* Add the IRE_IF_CLONE's counters to their parent IRE_INTERFACE */
+ if (ire->ire_type & IRE_INTERFACE) {
+ ire_t *child;
+
+ rw_enter(&ipst->ips_ire_dep_lock, RW_READER);
+ child = ire->ire_dep_children;
+ while (child != NULL) {
+ re->ipRouteInfo.re_obpkt += child->ire_ob_pkt_count;
+ re->ipRouteInfo.re_ibpkt += child->ire_ib_pkt_count;
+ child = child->ire_dep_sib_next;
+ }
+ rw_exit(&ipst->ips_ire_dep_lock);
+ }
+
if (ire->ire_flags & RTF_DYNAMIC) {
re->ipRouteInfo.re_ire_type = IRE_HOST_REDIRECT;
} else {
@@ -19603,25 +11067,22 @@ ip_snmp_get2_v4(ire_t *ire, iproutedata_t *ird)
(uint_t)sizeof (*re)));
}
- for (iaeptr = iae, i = 0; i < sacnt; i++, iaeptr++, gc = gc->gc_next) {
- iaeptr->iae_routeidx = ird->ird_idx;
- iaeptr->iae_doi = gc->gc_db->gcdb_doi;
- iaeptr->iae_slrange = gc->gc_db->gcdb_slrange;
- }
+ if (gc != NULL) {
+ iaes.iae_routeidx = ird->ird_idx;
+ iaes.iae_doi = gc->gc_db->gcdb_doi;
+ iaes.iae_slrange = gc->gc_db->gcdb_slrange;
- if (!snmp_append_data2(ird->ird_attrs.lp_head, &ird->ird_attrs.lp_tail,
- (char *)iae, sacnt * sizeof (*iae))) {
- ip1dbg(("ip_snmp_get2_v4: failed to allocate %u bytes\n",
- (unsigned)(sacnt * sizeof (*iae))));
+ if (!snmp_append_data2(ird->ird_attrs.lp_head,
+ &ird->ird_attrs.lp_tail, (char *)&iaes, sizeof (iaes))) {
+ ip1dbg(("ip_snmp_get2_v4: failed to allocate %u "
+ "bytes\n", (uint_t)sizeof (iaes)));
+ }
}
/* bump route index for next pass */
ird->ird_idx++;
kmem_free(re, sizeof (*re));
- if (sacnt != 0)
- kmem_free(iae, sacnt * sizeof (*iae));
-
if (gcgrp != NULL)
rw_exit(&gcgrp->gcgrp_rwlock);
}
@@ -19633,21 +11094,20 @@ static void
ip_snmp_get2_v6_route(ire_t *ire, iproutedata_t *ird)
{
ill_t *ill;
- ipif_t *ipif;
mib2_ipv6RouteEntry_t *re;
- mib2_ipAttributeEntry_t *iae, *iaeptr;
- in6_addr_t gw_addr_v6;
+ mib2_ipAttributeEntry_t iaes;
tsol_ire_gw_secattr_t *attrp;
tsol_gc_t *gc = NULL;
tsol_gcgrp_t *gcgrp = NULL;
- uint_t sacnt = 0;
- int i;
+ ip_stack_t *ipst = ire->ire_ipst;
ASSERT(ire->ire_ipversion == IPV6_VERSION);
- if (!(ird->ird_flags & IRD_REPORT_TESTHIDDEN) &&
- ire->ire_marks & IRE_MARK_TESTHIDDEN) {
- return;
+ if (!(ird->ird_flags & IRD_REPORT_ALL)) {
+ if (ire->ire_testhidden)
+ return;
+ if (ire->ire_type & IRE_IF_CLONE)
+ return;
}
if ((re = kmem_zalloc(sizeof (*re), KM_NOSLEEP)) == NULL)
@@ -19659,37 +11119,9 @@ ip_snmp_get2_v6_route(ire_t *ire, iproutedata_t *ird)
gcgrp = gc->gc_grp;
ASSERT(gcgrp != NULL);
rw_enter(&gcgrp->gcgrp_rwlock, RW_READER);
- sacnt = 1;
- } else if ((gcgrp = attrp->igsa_gcgrp) != NULL) {
- rw_enter(&gcgrp->gcgrp_rwlock, RW_READER);
- gc = gcgrp->gcgrp_head;
- sacnt = gcgrp->gcgrp_count;
}
mutex_exit(&attrp->igsa_lock);
-
- /* do nothing if there's no gc to report */
- if (gc == NULL) {
- ASSERT(sacnt == 0);
- if (gcgrp != NULL) {
- /* we might as well drop the lock now */
- rw_exit(&gcgrp->gcgrp_rwlock);
- gcgrp = NULL;
- }
- attrp = NULL;
- }
-
- ASSERT(gc == NULL || (gcgrp != NULL &&
- RW_LOCK_HELD(&gcgrp->gcgrp_rwlock)));
- }
- ASSERT(sacnt == 0 || gc != NULL);
-
- if (sacnt != 0 &&
- (iae = kmem_alloc(sacnt * sizeof (*iae), KM_NOSLEEP)) == NULL) {
- kmem_free(re, sizeof (*re));
- rw_exit(&gcgrp->gcgrp_rwlock);
- return;
}
-
/*
* Return all IRE types for route table... let caller pick and choose
*/
@@ -19697,16 +11129,9 @@ ip_snmp_get2_v6_route(ire_t *ire, iproutedata_t *ird)
re->ipv6RoutePfxLength = ip_mask_to_plen_v6(&ire->ire_mask_v6);
re->ipv6RouteIndex = 0; /* Unique when multiple with same dest/plen */
re->ipv6RouteIfIndex.o_length = 0;
- ipif = ire->ire_ipif;
- if (ire->ire_type == IRE_CACHE) {
- ill = (ill_t *)ire->ire_stq->q_ptr;
- re->ipv6RouteIfIndex.o_length =
- ill->ill_name_length == 0 ? 0 :
- MIN(OCTET_LENGTH, ill->ill_name_length - 1);
- bcopy(ill->ill_name, re->ipv6RouteIfIndex.o_bytes,
- re->ipv6RouteIfIndex.o_length);
- } else if (ipif != NULL) {
- ipif_get_name(ipif, re->ipv6RouteIfIndex.o_bytes, OCTET_LENGTH);
+ ill = ire->ire_ill;
+ if (ill != NULL) {
+ ill_get_name(ill, re->ipv6RouteIfIndex.o_bytes, OCTET_LENGTH);
re->ipv6RouteIfIndex.o_length =
mi_strlen(re->ipv6RouteIfIndex.o_bytes);
}
@@ -19714,18 +11139,13 @@ ip_snmp_get2_v6_route(ire_t *ire, iproutedata_t *ird)
ASSERT(!(ire->ire_type & IRE_BROADCAST));
mutex_enter(&ire->ire_lock);
- gw_addr_v6 = ire->ire_gateway_addr_v6;
+ re->ipv6RouteNextHop = ire->ire_gateway_addr_v6;
mutex_exit(&ire->ire_lock);
- if (ire->ire_type & (IRE_INTERFACE|IRE_LOOPBACK))
- re->ipv6RouteNextHop = ire->ire_src_addr_v6;
- else
- re->ipv6RouteNextHop = gw_addr_v6;
-
/* remote(4), local(3), or discard(2) */
if (ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE))
re->ipv6RouteType = 2;
- else if (IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6))
+ else if (ire->ire_type & IRE_ONLINK)
re->ipv6RouteType = 3;
else
re->ipv6RouteType = 4;
@@ -19736,15 +11156,31 @@ ip_snmp_get2_v6_route(ire_t *ire, iproutedata_t *ird)
re->ipv6RouteNextHopRDI = 0;
re->ipv6RouteWeight = 0;
re->ipv6RouteMetric = 0;
- re->ipv6RouteInfo.re_max_frag = ire->ire_max_frag;
- re->ipv6RouteInfo.re_frag_flag = ire->ire_frag_flag;
- re->ipv6RouteInfo.re_rtt = ire->ire_uinfo.iulp_rtt;
- re->ipv6RouteInfo.re_src_addr = ire->ire_src_addr_v6;
+ re->ipv6RouteInfo.re_max_frag = ire->ire_metrics.iulp_mtu;
+ if (ire->ire_ill != NULL && re->ipv6RouteInfo.re_max_frag == 0)
+ re->ipv6RouteInfo.re_max_frag = ire->ire_ill->ill_mtu;
+
+ re->ipv6RouteInfo.re_frag_flag = 0;
+ re->ipv6RouteInfo.re_rtt = 0;
+ re->ipv6RouteInfo.re_src_addr = ipv6_all_zeros;
re->ipv6RouteInfo.re_obpkt = ire->ire_ob_pkt_count;
re->ipv6RouteInfo.re_ibpkt = ire->ire_ib_pkt_count;
re->ipv6RouteInfo.re_ref = ire->ire_refcnt;
re->ipv6RouteInfo.re_flags = ire->ire_flags;
+ /* Add the IRE_IF_CLONE's counters to their parent IRE_INTERFACE */
+ if (ire->ire_type & IRE_INTERFACE) {
+ ire_t *child;
+
+ rw_enter(&ipst->ips_ire_dep_lock, RW_READER);
+ child = ire->ire_dep_children;
+ while (child != NULL) {
+ re->ipv6RouteInfo.re_obpkt += child->ire_ob_pkt_count;
+ re->ipv6RouteInfo.re_ibpkt += child->ire_ib_pkt_count;
+ child = child->ire_dep_sib_next;
+ }
+ rw_exit(&ipst->ips_ire_dep_lock);
+ }
if (ire->ire_flags & RTF_DYNAMIC) {
re->ipv6RouteInfo.re_ire_type = IRE_HOST_REDIRECT;
} else {
@@ -19757,79 +11193,67 @@ ip_snmp_get2_v6_route(ire_t *ire, iproutedata_t *ird)
(uint_t)sizeof (*re)));
}
- for (iaeptr = iae, i = 0; i < sacnt; i++, iaeptr++, gc = gc->gc_next) {
- iaeptr->iae_routeidx = ird->ird_idx;
- iaeptr->iae_doi = gc->gc_db->gcdb_doi;
- iaeptr->iae_slrange = gc->gc_db->gcdb_slrange;
- }
+ if (gc != NULL) {
+ iaes.iae_routeidx = ird->ird_idx;
+ iaes.iae_doi = gc->gc_db->gcdb_doi;
+ iaes.iae_slrange = gc->gc_db->gcdb_slrange;
- if (!snmp_append_data2(ird->ird_attrs.lp_head, &ird->ird_attrs.lp_tail,
- (char *)iae, sacnt * sizeof (*iae))) {
- ip1dbg(("ip_snmp_get2_v6: failed to allocate %u bytes\n",
- (unsigned)(sacnt * sizeof (*iae))));
+ if (!snmp_append_data2(ird->ird_attrs.lp_head,
+ &ird->ird_attrs.lp_tail, (char *)&iaes, sizeof (iaes))) {
+ ip1dbg(("ip_snmp_get2_v6: failed to allocate %u "
+ "bytes\n", (uint_t)sizeof (iaes)));
+ }
}
/* bump route index for next pass */
ird->ird_idx++;
kmem_free(re, sizeof (*re));
- if (sacnt != 0)
- kmem_free(iae, sacnt * sizeof (*iae));
-
if (gcgrp != NULL)
rw_exit(&gcgrp->gcgrp_rwlock);
}
/*
- * ndp_walk routine to create ipv6NetToMediaEntryTable
+ * ncec_walk routine to create ipv6NetToMediaEntryTable
*/
static int
-ip_snmp_get2_v6_media(nce_t *nce, iproutedata_t *ird)
+ip_snmp_get2_v6_media(ncec_t *ncec, iproutedata_t *ird)
{
ill_t *ill;
mib2_ipv6NetToMediaEntry_t ntme;
- dl_unitdata_req_t *dl;
- ill = nce->nce_ill;
- if (ill->ill_isv6 == B_FALSE) /* skip arpce entry */
+ ill = ncec->ncec_ill;
+ /* skip arpce entries, and loopback ncec entries */
+ if (ill->ill_isv6 == B_FALSE || ill->ill_net_type == IRE_LOOPBACK)
return (0);
-
/*
* Neighbor cache entry attached to IRE with on-link
* destination.
+ * We report all IPMP groups on ncec_ill which is normally the upper.
*/
ntme.ipv6NetToMediaIfIndex = ill->ill_phyint->phyint_ifindex;
- ntme.ipv6NetToMediaNetAddress = nce->nce_addr;
- if ((ill->ill_flags & ILLF_XRESOLV) &&
- (nce->nce_res_mp != NULL)) {
- dl = (dl_unitdata_req_t *)(nce->nce_res_mp->b_rptr);
- ntme.ipv6NetToMediaPhysAddress.o_length =
- dl->dl_dest_addr_length;
- } else {
- ntme.ipv6NetToMediaPhysAddress.o_length =
- ill->ill_phys_addr_length;
- }
- if (nce->nce_res_mp != NULL) {
- bcopy((char *)nce->nce_res_mp->b_rptr +
- NCE_LL_ADDR_OFFSET(ill),
- ntme.ipv6NetToMediaPhysAddress.o_bytes,
+ ntme.ipv6NetToMediaNetAddress = ncec->ncec_addr;
+ ntme.ipv6NetToMediaPhysAddress.o_length = ill->ill_phys_addr_length;
+ if (ncec->ncec_lladdr != NULL) {
+ bcopy(ncec->ncec_lladdr, ntme.ipv6NetToMediaPhysAddress.o_bytes,
ntme.ipv6NetToMediaPhysAddress.o_length);
- } else {
- bzero(ntme.ipv6NetToMediaPhysAddress.o_bytes,
- ill->ill_phys_addr_length);
}
/*
* Note: Returns ND_* states. Should be:
* reachable(1), stale(2), delay(3), probe(4),
* invalid(5), unknown(6)
*/
- ntme.ipv6NetToMediaState = nce->nce_state;
+ ntme.ipv6NetToMediaState = ncec->ncec_state;
ntme.ipv6NetToMediaLastUpdated = 0;
/* other(1), dynamic(2), static(3), local(4) */
- if (IN6_IS_ADDR_LOOPBACK(&nce->nce_addr)) {
+ if (NCE_MYADDR(ncec)) {
ntme.ipv6NetToMediaType = 4;
- } else if (IN6_IS_ADDR_MULTICAST(&nce->nce_addr)) {
+ } else if (ncec->ncec_flags & NCE_F_PUBLISH) {
+ ntme.ipv6NetToMediaType = 1; /* proxy */
+ } else if (ncec->ncec_flags & NCE_F_STATIC) {
+ ntme.ipv6NetToMediaType = 3;
+ } else if (ncec->ncec_flags & (NCE_F_MCAST|NCE_F_BCAST)) {
ntme.ipv6NetToMediaType = 1;
} else {
ntme.ipv6NetToMediaType = 2;
@@ -19843,6 +11267,93 @@ ip_snmp_get2_v6_media(nce_t *nce, iproutedata_t *ird)
return (0);
}
+int
+nce2ace(ncec_t *ncec)
+{
+ int flags = 0;
+
+ if (NCE_ISREACHABLE(ncec))
+ flags |= ACE_F_RESOLVED;
+ if (ncec->ncec_flags & NCE_F_AUTHORITY)
+ flags |= ACE_F_AUTHORITY;
+ if (ncec->ncec_flags & NCE_F_PUBLISH)
+ flags |= ACE_F_PUBLISH;
+ if ((ncec->ncec_flags & NCE_F_NONUD) != 0)
+ flags |= ACE_F_PERMANENT;
+ if (NCE_MYADDR(ncec))
+ flags |= (ACE_F_MYADDR | ACE_F_AUTHORITY);
+ if (ncec->ncec_flags & NCE_F_UNVERIFIED)
+ flags |= ACE_F_UNVERIFIED;
+ if (ncec->ncec_flags & NCE_F_AUTHORITY)
+ flags |= ACE_F_AUTHORITY;
+ if (ncec->ncec_flags & NCE_F_DELAYED)
+ flags |= ACE_F_DELAYED;
+ return (flags);
+}
+
+/*
+ * ncec_walk routine to create ipNetToMediaEntryTable
+ */
+static int
+ip_snmp_get2_v4_media(ncec_t *ncec, iproutedata_t *ird)
+{
+ ill_t *ill;
+ mib2_ipNetToMediaEntry_t ntme;
+ const char *name = "unknown";
+ ipaddr_t ncec_addr;
+
+ ill = ncec->ncec_ill;
+ if (ill->ill_isv6 || (ncec->ncec_flags & NCE_F_BCAST) ||
+ ill->ill_net_type == IRE_LOOPBACK)
+ return (0);
+
+ /* We report all IPMP groups on ncec_ill which is normally the upper. */
+ name = ill->ill_name;
+ /* Based on RFC 4293: other(1), inval(2), dyn(3), stat(4) */
+ if (NCE_MYADDR(ncec)) {
+ ntme.ipNetToMediaType = 4;
+ } else if (ncec->ncec_flags & (NCE_F_MCAST|NCE_F_BCAST|NCE_F_PUBLISH)) {
+ ntme.ipNetToMediaType = 1;
+ } else {
+ ntme.ipNetToMediaType = 3;
+ }
+ ntme.ipNetToMediaIfIndex.o_length = MIN(OCTET_LENGTH, strlen(name));
+ bcopy(name, ntme.ipNetToMediaIfIndex.o_bytes,
+ ntme.ipNetToMediaIfIndex.o_length);
+
+ IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
+ bcopy(&ncec_addr, &ntme.ipNetToMediaNetAddress, sizeof (ncec_addr));
+
+ ntme.ipNetToMediaInfo.ntm_mask.o_length = sizeof (ipaddr_t);
+ ncec_addr = INADDR_BROADCAST;
+ bcopy(&ncec_addr, ntme.ipNetToMediaInfo.ntm_mask.o_bytes,
+ sizeof (ncec_addr));
+ /*
+ * map all the flags to the ACE counterpart.
+ */
+ ntme.ipNetToMediaInfo.ntm_flags = nce2ace(ncec);
+
+ ntme.ipNetToMediaPhysAddress.o_length =
+ MIN(OCTET_LENGTH, ill->ill_phys_addr_length);
+
+ if (!NCE_ISREACHABLE(ncec))
+ ntme.ipNetToMediaPhysAddress.o_length = 0;
+ else {
+ if (ncec->ncec_lladdr != NULL) {
+ bcopy(ncec->ncec_lladdr,
+ ntme.ipNetToMediaPhysAddress.o_bytes,
+ ntme.ipNetToMediaPhysAddress.o_length);
+ }
+ }
+
+ if (!snmp_append_data2(ird->ird_netmedia.lp_head,
+ &ird->ird_netmedia.lp_tail, (char *)&ntme, sizeof (ntme))) {
+ ip1dbg(("ip_snmp_get2_v4_media: failed to allocate %u bytes\n",
+ (uint_t)sizeof (ntme)));
+ }
+ return (0);
+}
+
/*
* return (0) if invalid set request, 1 otherwise, including non-tcp requests
*/
@@ -19999,7 +11510,7 @@ ip_mib2_add_icmp6_stats(mib2_ipv6IfIcmpEntry_t *o1, mib2_ipv6IfIcmpEntry_t *o2)
* This routine assumes that the options are well formed i.e. that they
* have already been checked.
*/
-static boolean_t
+boolean_t
ip_source_routed(ipha_t *ipha, ip_stack_t *ipst)
{
ipoptp_t opts;
@@ -20007,7 +11518,6 @@ ip_source_routed(ipha_t *ipha, ip_stack_t *ipst)
uint8_t optval;
uint8_t optlen;
ipaddr_t dst;
- ire_t *ire;
if (IS_SIMPLE_IPH(ipha)) {
ip2dbg(("not source routed\n"));
@@ -20030,15 +11540,12 @@ ip_source_routed(ipha_t *ipha, ip_stack_t *ipst)
* If dst is one of our addresses and there are some
* entries left in the source route return (true).
*/
- ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, NULL,
- ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
- if (ire == NULL) {
+ if (ip_type_v4(dst, ipst) != IRE_LOCAL) {
ip2dbg(("ip_source_routed: not next"
" source route 0x%x\n",
ntohl(dst)));
return (B_FALSE);
}
- ire_refrele(ire);
off = opt[IPOPT_OFFSET];
off--;
if (optlen < IP_ADDR_LEN ||
@@ -20055,267 +11562,18 @@ ip_source_routed(ipha_t *ipha, ip_stack_t *ipst)
}
/*
- * Check if the packet contains any source route.
- */
-static boolean_t
-ip_source_route_included(ipha_t *ipha)
-{
- ipoptp_t opts;
- uint8_t optval;
-
- if (IS_SIMPLE_IPH(ipha))
- return (B_FALSE);
- for (optval = ipoptp_first(&opts, ipha);
- optval != IPOPT_EOL;
- optval = ipoptp_next(&opts)) {
- switch (optval) {
- case IPOPT_SSRR:
- case IPOPT_LSRR:
- return (B_TRUE);
- }
- }
- return (B_FALSE);
-}
-
-/*
- * Called when the IRE expiration timer fires.
- */
-void
-ip_trash_timer_expire(void *args)
-{
- int flush_flag = 0;
- ire_expire_arg_t iea;
- ip_stack_t *ipst = (ip_stack_t *)args;
-
- iea.iea_ipst = ipst; /* No netstack_hold */
-
- /*
- * ip_ire_expire_id is protected by ip_trash_timer_lock.
- * This lock makes sure that a new invocation of this function
- * that occurs due to an almost immediate timer firing will not
- * progress beyond this point until the current invocation is done
- */
- mutex_enter(&ipst->ips_ip_trash_timer_lock);
- ipst->ips_ip_ire_expire_id = 0;
- mutex_exit(&ipst->ips_ip_trash_timer_lock);
-
- /* Periodic timer */
- if (ipst->ips_ip_ire_arp_time_elapsed >=
- ipst->ips_ip_ire_arp_interval) {
- /*
- * Remove all IRE_CACHE entries since they might
- * contain arp information.
- */
- flush_flag |= FLUSH_ARP_TIME;
- ipst->ips_ip_ire_arp_time_elapsed = 0;
- IP_STAT(ipst, ip_ire_arp_timer_expired);
- }
- if (ipst->ips_ip_ire_rd_time_elapsed >=
- ipst->ips_ip_ire_redir_interval) {
- /* Remove all redirects */
- flush_flag |= FLUSH_REDIRECT_TIME;
- ipst->ips_ip_ire_rd_time_elapsed = 0;
- IP_STAT(ipst, ip_ire_redirect_timer_expired);
- }
- if (ipst->ips_ip_ire_pmtu_time_elapsed >=
- ipst->ips_ip_ire_pathmtu_interval) {
- /* Increase path mtu */
- flush_flag |= FLUSH_MTU_TIME;
- ipst->ips_ip_ire_pmtu_time_elapsed = 0;
- IP_STAT(ipst, ip_ire_pmtu_timer_expired);
- }
-
- /*
- * Optimize for the case when there are no redirects in the
- * ftable, that is, no need to walk the ftable in that case.
- */
- if (flush_flag & (FLUSH_MTU_TIME|FLUSH_ARP_TIME)) {
- iea.iea_flush_flag = flush_flag;
- ire_walk_ill_tables(MATCH_IRE_TYPE, IRE_CACHETABLE, ire_expire,
- (char *)(uintptr_t)&iea, IP_MASK_TABLE_SIZE, 0, NULL,
- ipst->ips_ip_cache_table_size, ipst->ips_ip_cache_table,
- NULL, ALL_ZONES, ipst);
- }
- if ((flush_flag & FLUSH_REDIRECT_TIME) &&
- ipst->ips_ip_redirect_cnt > 0) {
- iea.iea_flush_flag = flush_flag;
- ire_walk_ill_tables(MATCH_IRE_TYPE, IRE_FORWARDTABLE,
- ire_expire, (char *)(uintptr_t)&iea, IP_MASK_TABLE_SIZE,
- 0, NULL, 0, NULL, NULL, ALL_ZONES, ipst);
- }
- if (flush_flag & FLUSH_MTU_TIME) {
- /*
- * Walk all IPv6 IRE's and update them
- * Note that ARP and redirect timers are not
- * needed since NUD handles stale entries.
- */
- flush_flag = FLUSH_MTU_TIME;
- iea.iea_flush_flag = flush_flag;
- ire_walk_v6(ire_expire, (char *)(uintptr_t)&iea,
- ALL_ZONES, ipst);
- }
-
- ipst->ips_ip_ire_arp_time_elapsed += ipst->ips_ip_timer_interval;
- ipst->ips_ip_ire_rd_time_elapsed += ipst->ips_ip_timer_interval;
- ipst->ips_ip_ire_pmtu_time_elapsed += ipst->ips_ip_timer_interval;
-
- /*
- * Hold the lock to serialize timeout calls and prevent
- * stale values in ip_ire_expire_id. Otherwise it is possible
- * for the timer to fire and a new invocation of this function
- * to start before the return value of timeout has been stored
- * in ip_ire_expire_id by the current invocation.
- */
- mutex_enter(&ipst->ips_ip_trash_timer_lock);
- ipst->ips_ip_ire_expire_id = timeout(ip_trash_timer_expire,
- (void *)ipst, MSEC_TO_TICK(ipst->ips_ip_timer_interval));
- mutex_exit(&ipst->ips_ip_trash_timer_lock);
-}
-
-/*
- * Called by the memory allocator subsystem directly, when the system
- * is running low on memory.
- */
-/* ARGSUSED */
-void
-ip_trash_ire_reclaim(void *args)
-{
- netstack_handle_t nh;
- netstack_t *ns;
-
- netstack_next_init(&nh);
- while ((ns = netstack_next(&nh)) != NULL) {
- ip_trash_ire_reclaim_stack(ns->netstack_ip);
- netstack_rele(ns);
- }
- netstack_next_fini(&nh);
-}
-
-static void
-ip_trash_ire_reclaim_stack(ip_stack_t *ipst)
-{
- ire_cache_count_t icc;
- ire_cache_reclaim_t icr;
- ncc_cache_count_t ncc;
- nce_cache_reclaim_t ncr;
- uint_t delete_cnt;
- /*
- * Memory reclaim call back.
- * Count unused, offlink, pmtu, and onlink IRE_CACHE entries.
- * Then, with a target of freeing 1/Nth of IRE_CACHE
- * entries, determine what fraction to free for
- * each category of IRE_CACHE entries giving absolute priority
- * in the order of onlink, pmtu, offlink, unused (e.g. no pmtu
- * entry will be freed unless all offlink entries are freed).
- */
- icc.icc_total = 0;
- icc.icc_unused = 0;
- icc.icc_offlink = 0;
- icc.icc_pmtu = 0;
- icc.icc_onlink = 0;
- ire_walk(ire_cache_count, (char *)&icc, ipst);
-
- /*
- * Free NCEs for IPv6 like the onlink ires.
- */
- ncc.ncc_total = 0;
- ncc.ncc_host = 0;
- ndp_walk(NULL, (pfi_t)ndp_cache_count, (uchar_t *)&ncc, ipst);
-
- ASSERT(icc.icc_total == icc.icc_unused + icc.icc_offlink +
- icc.icc_pmtu + icc.icc_onlink);
- delete_cnt = icc.icc_total/ipst->ips_ip_ire_reclaim_fraction;
- IP_STAT(ipst, ip_trash_ire_reclaim_calls);
- if (delete_cnt == 0)
- return;
- IP_STAT(ipst, ip_trash_ire_reclaim_success);
- /* Always delete all unused offlink entries */
- icr.icr_ipst = ipst;
- icr.icr_unused = 1;
- if (delete_cnt <= icc.icc_unused) {
- /*
- * Only need to free unused entries. In other words,
- * there are enough unused entries to free to meet our
- * target number of freed ire cache entries.
- */
- icr.icr_offlink = icr.icr_pmtu = icr.icr_onlink = 0;
- ncr.ncr_host = 0;
- } else if (delete_cnt <= icc.icc_unused + icc.icc_offlink) {
- /*
- * Only need to free unused entries, plus a fraction of offlink
- * entries. It follows from the first if statement that
- * icc_offlink is non-zero, and that delete_cnt != icc_unused.
- */
- delete_cnt -= icc.icc_unused;
- /* Round up # deleted by truncating fraction */
- icr.icr_offlink = icc.icc_offlink / delete_cnt;
- icr.icr_pmtu = icr.icr_onlink = 0;
- ncr.ncr_host = 0;
- } else if (delete_cnt <=
- icc.icc_unused + icc.icc_offlink + icc.icc_pmtu) {
- /*
- * Free all unused and offlink entries, plus a fraction of
- * pmtu entries. It follows from the previous if statement
- * that icc_pmtu is non-zero, and that
- * delete_cnt != icc_unused + icc_offlink.
- */
- icr.icr_offlink = 1;
- delete_cnt -= icc.icc_unused + icc.icc_offlink;
- /* Round up # deleted by truncating fraction */
- icr.icr_pmtu = icc.icc_pmtu / delete_cnt;
- icr.icr_onlink = 0;
- ncr.ncr_host = 0;
- } else {
- /*
- * Free all unused, offlink, and pmtu entries, plus a fraction
- * of onlink entries. If we're here, then we know that
- * icc_onlink is non-zero, and that
- * delete_cnt != icc_unused + icc_offlink + icc_pmtu.
- */
- icr.icr_offlink = icr.icr_pmtu = 1;
- delete_cnt -= icc.icc_unused + icc.icc_offlink +
- icc.icc_pmtu;
- /* Round up # deleted by truncating fraction */
- icr.icr_onlink = icc.icc_onlink / delete_cnt;
- /* Using the same delete fraction as for onlink IREs */
- ncr.ncr_host = ncc.ncc_host / delete_cnt;
- }
-#ifdef DEBUG
- ip1dbg(("IP reclaim: target %d out of %d current %d/%d/%d/%d "
- "fractions %d/%d/%d/%d\n",
- icc.icc_total/ipst->ips_ip_ire_reclaim_fraction, icc.icc_total,
- icc.icc_unused, icc.icc_offlink,
- icc.icc_pmtu, icc.icc_onlink,
- icr.icr_unused, icr.icr_offlink,
- icr.icr_pmtu, icr.icr_onlink));
-#endif
- ire_walk(ire_cache_reclaim, (char *)&icr, ipst);
- if (ncr.ncr_host != 0)
- ndp_walk(NULL, (pfi_t)ndp_cache_reclaim,
- (uchar_t *)&ncr, ipst);
-#ifdef DEBUG
- icc.icc_total = 0; icc.icc_unused = 0; icc.icc_offlink = 0;
- icc.icc_pmtu = 0; icc.icc_onlink = 0;
- ire_walk(ire_cache_count, (char *)&icc, ipst);
- ip1dbg(("IP reclaim: result total %d %d/%d/%d/%d\n",
- icc.icc_total, icc.icc_unused, icc.icc_offlink,
- icc.icc_pmtu, icc.icc_onlink));
-#endif
-}
-
-/*
- * ip_unbind is called when a copy of an unbind request is received from the
- * upper level protocol. We remove this conn from any fanout hash list it is
- * on, and zero out the bind information. No reply is expected up above.
+ * ip_unbind is called by the transports to remove a conn from
+ * the fanout table.
*/
void
ip_unbind(conn_t *connp)
{
+
ASSERT(!MUTEX_HELD(&connp->conn_lock));
if (is_system_labeled() && connp->conn_anon_port) {
(void) tsol_mlp_anon(crgetzone(connp->conn_cred),
- connp->conn_mlp_type, connp->conn_ulp,
+ connp->conn_mlp_type, connp->conn_proto,
ntohs(connp->conn_lport), B_FALSE);
connp->conn_anon_port = 0;
}
@@ -20325,1489 +11583,6 @@ ip_unbind(conn_t *connp)
}
/*
- * Write side put procedure. Outbound data, IOCTLs, responses from
- * resolvers, etc, come down through here.
- *
- * arg2 is always a queue_t *.
- * When that queue is an ill_t (i.e. q_next != NULL), then arg must be
- * the zoneid.
- * When that queue is not an ill_t, then arg must be a conn_t pointer.
- */
-void
-ip_output(void *arg, mblk_t *mp, void *arg2, int caller)
-{
- ip_output_options(arg, mp, arg2, caller, &zero_info);
-}
-
-void
-ip_output_options(void *arg, mblk_t *mp, void *arg2, int caller,
- ip_opt_info_t *infop)
-{
- conn_t *connp = NULL;
- queue_t *q = (queue_t *)arg2;
- ipha_t *ipha;
-#define rptr ((uchar_t *)ipha)
- ire_t *ire = NULL;
- ire_t *sctp_ire = NULL;
- uint32_t v_hlen_tos_len;
- ipaddr_t dst;
- mblk_t *first_mp = NULL;
- boolean_t mctl_present;
- ipsec_out_t *io;
- int match_flags;
- ill_t *xmit_ill = NULL; /* IP_PKTINFO etc. */
- ipif_t *dst_ipif;
- boolean_t multirt_need_resolve = B_FALSE;
- mblk_t *copy_mp = NULL;
- int err = 0;
- zoneid_t zoneid;
- boolean_t need_decref = B_FALSE;
- boolean_t ignore_dontroute = B_FALSE;
- boolean_t ignore_nexthop = B_FALSE;
- boolean_t ip_nexthop = B_FALSE;
- ipaddr_t nexthop_addr;
- ip_stack_t *ipst;
-
-#ifdef _BIG_ENDIAN
-#define V_HLEN (v_hlen_tos_len >> 24)
-#else
-#define V_HLEN (v_hlen_tos_len & 0xFF)
-#endif
-
- TRACE_1(TR_FAC_IP, TR_IP_WPUT_START,
- "ip_wput_start: q %p", q);
-
- /*
- * ip_wput fast path
- */
-
- /* is packet from ARP ? */
- if (q->q_next != NULL) {
- zoneid = (zoneid_t)(uintptr_t)arg;
- goto qnext;
- }
-
- connp = (conn_t *)arg;
- ASSERT(connp != NULL);
- zoneid = connp->conn_zoneid;
- ipst = connp->conn_netstack->netstack_ip;
- ASSERT(ipst != NULL);
-
- /* is queue flow controlled? */
- if ((q->q_first != NULL || connp->conn_draining) &&
- (caller == IP_WPUT)) {
- ASSERT(!need_decref);
- ASSERT(!IP_FLOW_CONTROLLED_ULP(connp->conn_ulp));
- (void) putq(q, mp);
- return;
- }
-
- /* Multidata transmit? */
- if (DB_TYPE(mp) == M_MULTIDATA) {
- /*
- * We should never get here, since all Multidata messages
- * originating from tcp should have been directed over to
- * tcp_multisend() in the first place.
- */
- BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
- freemsg(mp);
- return;
- } else if (DB_TYPE(mp) != M_DATA)
- goto notdata;
-
- if (mp->b_flag & MSGHASREF) {
- ASSERT(connp->conn_ulp == IPPROTO_SCTP);
- mp->b_flag &= ~MSGHASREF;
- SCTP_EXTRACT_IPINFO(mp, sctp_ire);
- need_decref = B_TRUE;
- }
- ipha = (ipha_t *)mp->b_rptr;
-
- /* is IP header non-aligned or mblk smaller than basic IP header */
-#ifndef SAFETY_BEFORE_SPEED
- if (!OK_32PTR(rptr) ||
- (mp->b_wptr - rptr) < IP_SIMPLE_HDR_LENGTH)
- goto hdrtoosmall;
-#endif
-
- ASSERT(OK_32PTR(ipha));
-
- /*
- * This function assumes that mp points to an IPv4 packet. If it's the
- * wrong version, we'll catch it again in ip_output_v6.
- *
- * Note that this is *only* locally-generated output here, and never
- * forwarded data, and that we need to deal only with transports that
- * don't know how to label. (TCP, UDP, and ICMP/raw-IP all know how to
- * label.)
- */
- if (is_system_labeled() &&
- (ipha->ipha_version_and_hdr_length & 0xf0) == (IPV4_VERSION << 4) &&
- !connp->conn_ulp_labeled) {
- cred_t *credp;
- pid_t pid;
-
- credp = BEST_CRED(mp, connp, &pid);
- err = tsol_check_label(credp, &mp,
- connp->conn_mac_mode, ipst, pid);
- ipha = (ipha_t *)mp->b_rptr;
- if (err != 0) {
- first_mp = mp;
- if (err == EINVAL)
- goto icmp_parameter_problem;
- ip2dbg(("ip_wput: label check failed (%d)\n", err));
- goto discard_pkt;
- }
- }
-
- ASSERT(infop != NULL);
-
- if (infop->ip_opt_flags & IP_VERIFY_SRC) {
- /*
- * IP_PKTINFO ancillary option is present.
- * IPCL_ZONEID is used to honor IP_ALLZONES option which
- * allows using address of any zone as the source address.
- */
- ire = ire_ctable_lookup(ipha->ipha_src, 0,
- (IRE_LOCAL|IRE_LOOPBACK), NULL, IPCL_ZONEID(connp),
- NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, ipst);
- if (ire == NULL)
- goto drop_pkt;
- ire_refrele(ire);
- ire = NULL;
- }
-
- /*
- * IP_BOUND_IF has precedence over the ill index passed in IP_PKTINFO.
- */
- if (infop->ip_opt_ill_index != 0 && connp->conn_outgoing_ill == NULL) {
- xmit_ill = ill_lookup_on_ifindex(infop->ip_opt_ill_index,
- B_FALSE, NULL, NULL, NULL, NULL, ipst);
-
- if (xmit_ill == NULL || IS_VNI(xmit_ill))
- goto drop_pkt;
- /*
- * check that there is an ipif belonging
- * to our zone. IPCL_ZONEID is not used because
- * IP_ALLZONES option is valid only when the ill is
- * accessible from all zones i.e has a valid ipif in
- * all zones.
- */
- if (!ipif_lookup_zoneid(xmit_ill, zoneid, 0, NULL)) {
- goto drop_pkt;
- }
- }
-
- /*
- * If there is a policy, try to attach an ipsec_out in
- * the front. At the end, first_mp either points to a
- * M_DATA message or IPSEC_OUT message linked to a
- * M_DATA message. We have to do it now as we might
- * lose the "conn" if we go through ip_newroute.
- */
- if (connp->conn_out_enforce_policy || (connp->conn_latch != NULL)) {
- if (((mp = ipsec_attach_ipsec_out(&mp, connp, NULL,
- ipha->ipha_protocol, ipst->ips_netstack)) == NULL)) {
- BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
- if (need_decref)
- CONN_DEC_REF(connp);
- return;
- }
- ASSERT(mp->b_datap->db_type == M_CTL);
- first_mp = mp;
- mp = mp->b_cont;
- mctl_present = B_TRUE;
- } else {
- first_mp = mp;
- mctl_present = B_FALSE;
- }
-
- v_hlen_tos_len = ((uint32_t *)ipha)[0];
-
- /* is wrong version or IP options present */
- if (V_HLEN != IP_SIMPLE_HDR_VERSION)
- goto version_hdrlen_check;
- dst = ipha->ipha_dst;
-
- /* If IP_BOUND_IF has been set, use that ill. */
- if (connp->conn_outgoing_ill != NULL) {
- xmit_ill = conn_get_held_ill(connp,
- &connp->conn_outgoing_ill, &err);
- if (err == ILL_LOOKUP_FAILED)
- goto drop_pkt;
-
- goto send_from_ill;
- }
-
- /* is packet multicast? */
- if (CLASSD(dst))
- goto multicast;
-
- /*
- * If xmit_ill is set above due to index passed in ip_pkt_info. It
- * takes precedence over conn_dontroute and conn_nexthop_set
- */
- if (xmit_ill != NULL)
- goto send_from_ill;
-
- if (connp->conn_dontroute || connp->conn_nexthop_set) {
- /*
- * If the destination is a broadcast, local, or loopback
- * address, SO_DONTROUTE and IP_NEXTHOP go through the
- * standard path.
- */
- ire = ire_cache_lookup(dst, zoneid, msg_getlabel(mp), ipst);
- if ((ire == NULL) || (ire->ire_type &
- (IRE_BROADCAST | IRE_LOCAL | IRE_LOOPBACK)) == 0) {
- if (ire != NULL) {
- ire_refrele(ire);
- /* No more access to ire */
- ire = NULL;
- }
- /*
- * bypass routing checks and go directly to interface.
- */
- if (connp->conn_dontroute)
- goto dontroute;
-
- ASSERT(connp->conn_nexthop_set);
- ip_nexthop = B_TRUE;
- nexthop_addr = connp->conn_nexthop_v4;
- goto send_from_ill;
- }
-
- /* Must be a broadcast, a loopback or a local ire */
- ire_refrele(ire);
- /* No more access to ire */
- ire = NULL;
- }
-
- /*
- * We cache IRE_CACHEs to avoid lookups. We don't do
- * this for the tcp global queue and listen end point
- * as it does not really have a real destination to
- * talk to. This is also true for SCTP.
- */
- if (IP_FLOW_CONTROLLED_ULP(connp->conn_ulp) &&
- !connp->conn_fully_bound) {
- ire = ire_cache_lookup(dst, zoneid, msg_getlabel(mp), ipst);
- if (ire == NULL)
- goto noirefound;
- TRACE_2(TR_FAC_IP, TR_IP_WPUT_END,
- "ip_wput_end: q %p (%S)", q, "end");
-
- /*
- * Check if the ire has the RTF_MULTIRT flag, inherited
- * from an IRE_OFFSUBNET ire entry in ip_newroute().
- */
- if (ire->ire_flags & RTF_MULTIRT) {
-
- /*
- * Force the TTL of multirouted packets if required.
- * The TTL of such packets is bounded by the
- * ip_multirt_ttl ndd variable.
- */
- if ((ipst->ips_ip_multirt_ttl > 0) &&
- (ipha->ipha_ttl > ipst->ips_ip_multirt_ttl)) {
- ip2dbg(("ip_wput: forcing multirt TTL to %d "
- "(was %d), dst 0x%08x\n",
- ipst->ips_ip_multirt_ttl, ipha->ipha_ttl,
- ntohl(ire->ire_addr)));
- ipha->ipha_ttl = ipst->ips_ip_multirt_ttl;
- }
- /*
- * We look at this point if there are pending
- * unresolved routes. ire_multirt_resolvable()
- * checks in O(n) that all IRE_OFFSUBNET ire
- * entries for the packet's destination and
- * flagged RTF_MULTIRT are currently resolved.
- * If some remain unresolved, we make a copy
- * of the current message. It will be used
- * to initiate additional route resolutions.
- */
- multirt_need_resolve =
- ire_multirt_need_resolve(ire->ire_addr,
- msg_getlabel(first_mp), ipst);
- ip2dbg(("ip_wput[TCP]: ire %p, "
- "multirt_need_resolve %d, first_mp %p\n",
- (void *)ire, multirt_need_resolve,
- (void *)first_mp));
- if (multirt_need_resolve) {
- copy_mp = copymsg(first_mp);
- if (copy_mp != NULL) {
- MULTIRT_DEBUG_TAG(copy_mp);
- }
- }
- }
-
- ip_wput_ire(q, first_mp, ire, connp, caller, zoneid);
-
- /*
- * Try to resolve another multiroute if
- * ire_multirt_need_resolve() deemed it necessary.
- */
- if (copy_mp != NULL)
- ip_newroute(q, copy_mp, dst, connp, zoneid, ipst);
- if (need_decref)
- CONN_DEC_REF(connp);
- return;
- }
-
- /*
- * Access to conn_ire_cache. (protected by conn_lock)
- *
- * IRE_MARK_CONDEMNED is marked in ire_delete. We don't grab
- * the ire bucket lock here to check for CONDEMNED as it is okay to
- * send a packet or two with the IRE_CACHE that is going away.
- * Access to the ire requires an ire refhold on the ire prior to
- * its use since an interface unplumb thread may delete the cached
- * ire and release the refhold at any time.
- *
- * Caching an ire in the conn_ire_cache
- *
- * o Caching an ire pointer in the conn requires a strict check for
- * IRE_MARK_CONDEMNED. An interface unplumb thread deletes all relevant
- * ires before cleaning up the conns. So the caching of an ire pointer
- * in the conn is done after making sure under the bucket lock that the
- * ire has not yet been marked CONDEMNED. Otherwise we will end up
- * caching an ire after the unplumb thread has cleaned up the conn.
- * If the conn does not send a packet subsequently the unplumb thread
- * will be hanging waiting for the ire count to drop to zero.
- *
- * o We also need to atomically test for a null conn_ire_cache and
- * set the conn_ire_cache under the the protection of the conn_lock
- * to avoid races among concurrent threads trying to simultaneously
- * cache an ire in the conn_ire_cache.
- */
- mutex_enter(&connp->conn_lock);
- ire = sctp_ire != NULL ? sctp_ire : connp->conn_ire_cache;
-
- if (ire != NULL && ire->ire_addr == dst &&
- !(ire->ire_marks & IRE_MARK_CONDEMNED)) {
-
- IRE_REFHOLD(ire);
- mutex_exit(&connp->conn_lock);
-
- } else {
- boolean_t cached = B_FALSE;
- connp->conn_ire_cache = NULL;
- mutex_exit(&connp->conn_lock);
- /* Release the old ire */
- if (ire != NULL && sctp_ire == NULL)
- IRE_REFRELE_NOTR(ire);
-
- ire = ire_cache_lookup(dst, zoneid, msg_getlabel(mp), ipst);
- if (ire == NULL)
- goto noirefound;
- IRE_REFHOLD_NOTR(ire);
-
- mutex_enter(&connp->conn_lock);
- if (CONN_CACHE_IRE(connp) && connp->conn_ire_cache == NULL) {
- rw_enter(&ire->ire_bucket->irb_lock, RW_READER);
- if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) {
- if (connp->conn_ulp == IPPROTO_TCP)
- TCP_CHECK_IREINFO(connp->conn_tcp, ire);
- connp->conn_ire_cache = ire;
- cached = B_TRUE;
- }
- rw_exit(&ire->ire_bucket->irb_lock);
- }
- mutex_exit(&connp->conn_lock);
-
- /*
- * We can continue to use the ire but since it was
- * not cached, we should drop the extra reference.
- */
- if (!cached)
- IRE_REFRELE_NOTR(ire);
- }
-
- TRACE_2(TR_FAC_IP, TR_IP_WPUT_END,
- "ip_wput_end: q %p (%S)", q, "end");
-
- /*
- * Check if the ire has the RTF_MULTIRT flag, inherited
- * from an IRE_OFFSUBNET ire entry in ip_newroute().
- */
- if (ire->ire_flags & RTF_MULTIRT) {
- /*
- * Force the TTL of multirouted packets if required.
- * The TTL of such packets is bounded by the
- * ip_multirt_ttl ndd variable.
- */
- if ((ipst->ips_ip_multirt_ttl > 0) &&
- (ipha->ipha_ttl > ipst->ips_ip_multirt_ttl)) {
- ip2dbg(("ip_wput: forcing multirt TTL to %d "
- "(was %d), dst 0x%08x\n",
- ipst->ips_ip_multirt_ttl, ipha->ipha_ttl,
- ntohl(ire->ire_addr)));
- ipha->ipha_ttl = ipst->ips_ip_multirt_ttl;
- }
-
- /*
- * At this point, we check to see if there are any pending
- * unresolved routes. ire_multirt_resolvable()
- * checks in O(n) that all IRE_OFFSUBNET ire
- * entries for the packet's destination and
- * flagged RTF_MULTIRT are currently resolved.
- * If some remain unresolved, we make a copy
- * of the current message. It will be used
- * to initiate additional route resolutions.
- */
- multirt_need_resolve = ire_multirt_need_resolve(ire->ire_addr,
- msg_getlabel(first_mp), ipst);
- ip2dbg(("ip_wput[not TCP]: ire %p, "
- "multirt_need_resolve %d, first_mp %p\n",
- (void *)ire, multirt_need_resolve, (void *)first_mp));
- if (multirt_need_resolve) {
- copy_mp = copymsg(first_mp);
- if (copy_mp != NULL) {
- MULTIRT_DEBUG_TAG(copy_mp);
- }
- }
- }
-
- ip_wput_ire(q, first_mp, ire, connp, caller, zoneid);
-
- /*
- * Try to resolve another multiroute if
- * ire_multirt_resolvable() deemed it necessary
- */
- if (copy_mp != NULL)
- ip_newroute(q, copy_mp, dst, connp, zoneid, ipst);
- if (need_decref)
- CONN_DEC_REF(connp);
- return;
-
-qnext:
- /*
- * Upper Level Protocols pass down complete IP datagrams
- * as M_DATA messages. Everything else is a sideshow.
- *
- * 1) We could be re-entering ip_wput because of ip_neworute
- * in which case we could have a IPSEC_OUT message. We
- * need to pass through ip_wput like other datagrams and
- * hence cannot branch to ip_wput_nondata.
- *
- * 2) ARP, AH, ESP, and other clients who are on the module
- * instance of IP stream, give us something to deal with.
- * We will handle AH and ESP here and rest in ip_wput_nondata.
- *
- * 3) ICMP replies also could come here.
- */
- ipst = ILLQ_TO_IPST(q);
-
- if (DB_TYPE(mp) != M_DATA) {
-notdata:
- if (DB_TYPE(mp) == M_CTL) {
- /*
- * M_CTL messages are used by ARP, AH and ESP to
- * communicate with IP. We deal with IPSEC_IN and
- * IPSEC_OUT here. ip_wput_nondata handles other
- * cases.
- */
- ipsec_info_t *ii = (ipsec_info_t *)mp->b_rptr;
- if (mp->b_cont && (mp->b_cont->b_flag & MSGHASREF)) {
- first_mp = mp->b_cont;
- first_mp->b_flag &= ~MSGHASREF;
- ASSERT(connp->conn_ulp == IPPROTO_SCTP);
- SCTP_EXTRACT_IPINFO(first_mp, sctp_ire);
- CONN_DEC_REF(connp);
- connp = NULL;
- }
- if (ii->ipsec_info_type == IPSEC_IN) {
- /*
- * Either this message goes back to
- * IPsec for further processing or to
- * ULP after policy checks.
- */
- ip_fanout_proto_again(mp, NULL, NULL, NULL);
- return;
- } else if (ii->ipsec_info_type == IPSEC_OUT) {
- io = (ipsec_out_t *)ii;
- if (io->ipsec_out_proc_begin) {
- /*
- * IPsec processing has already started.
- * Complete it.
- * IPQoS notes: We don't care what is
- * in ipsec_out_ill_index since this
- * won't be processed for IPQoS policies
- * in ipsec_out_process.
- */
- ipsec_out_process(q, mp, NULL,
- io->ipsec_out_ill_index);
- return;
- } else {
- connp = (q->q_next != NULL) ?
- NULL : Q_TO_CONN(q);
- first_mp = mp;
- mp = mp->b_cont;
- mctl_present = B_TRUE;
- }
- zoneid = io->ipsec_out_zoneid;
- ASSERT(zoneid != ALL_ZONES);
- } else if (ii->ipsec_info_type == IPSEC_CTL) {
- /*
- * It's an IPsec control message requesting
- * an SADB update to be sent to the IPsec
- * hardware acceleration capable ills.
- */
- ipsec_ctl_t *ipsec_ctl =
- (ipsec_ctl_t *)mp->b_rptr;
- ipsa_t *sa = (ipsa_t *)ipsec_ctl->ipsec_ctl_sa;
- uint_t satype = ipsec_ctl->ipsec_ctl_sa_type;
- mblk_t *cmp = mp->b_cont;
-
- ASSERT(MBLKL(mp) >= sizeof (ipsec_ctl_t));
- ASSERT(cmp != NULL);
-
- freeb(mp);
- ill_ipsec_capab_send_all(satype, cmp, sa,
- ipst->ips_netstack);
- return;
- } else {
- /*
- * This must be ARP or special TSOL signaling.
- */
- ip_wput_nondata(NULL, q, mp, NULL);
- TRACE_2(TR_FAC_IP, TR_IP_WPUT_END,
- "ip_wput_end: q %p (%S)", q, "nondata");
- return;
- }
- } else {
- /*
- * This must be non-(ARP/AH/ESP) messages.
- */
- ASSERT(!need_decref);
- ip_wput_nondata(NULL, q, mp, NULL);
- TRACE_2(TR_FAC_IP, TR_IP_WPUT_END,
- "ip_wput_end: q %p (%S)", q, "nondata");
- return;
- }
- } else {
- first_mp = mp;
- mctl_present = B_FALSE;
- }
-
- ASSERT(first_mp != NULL);
-
- if (mctl_present) {
- io = (ipsec_out_t *)first_mp->b_rptr;
- if (io->ipsec_out_ip_nexthop) {
- /*
- * We may have lost the conn context if we are
- * coming here from ip_newroute(). Copy the
- * nexthop information.
- */
- ip_nexthop = B_TRUE;
- nexthop_addr = io->ipsec_out_nexthop_addr;
-
- ipha = (ipha_t *)mp->b_rptr;
- dst = ipha->ipha_dst;
- goto send_from_ill;
- }
- }
-
- ASSERT(xmit_ill == NULL);
-
- /* We have a complete IP datagram heading outbound. */
- ipha = (ipha_t *)mp->b_rptr;
-
-#ifndef SPEED_BEFORE_SAFETY
- /*
- * Make sure we have a full-word aligned message and that at least
- * a simple IP header is accessible in the first message. If not,
- * try a pullup. For labeled systems we need to always take this
- * path as M_CTLs are "notdata" but have trailing data to process.
- */
- if (!OK_32PTR(rptr) ||
- (mp->b_wptr - rptr) < IP_SIMPLE_HDR_LENGTH || is_system_labeled()) {
-hdrtoosmall:
- if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) {
- TRACE_2(TR_FAC_IP, TR_IP_WPUT_END,
- "ip_wput_end: q %p (%S)", q, "pullupfailed");
- if (first_mp == NULL)
- first_mp = mp;
- goto discard_pkt;
- }
-
- /* This function assumes that mp points to an IPv4 packet. */
- if (is_system_labeled() &&
- (*mp->b_rptr & 0xf0) == (IPV4_VERSION << 4) &&
- (connp == NULL || !connp->conn_ulp_labeled)) {
- cred_t *credp;
- pid_t pid;
-
- if (connp != NULL) {
- credp = BEST_CRED(mp, connp, &pid);
- err = tsol_check_label(credp, &mp,
- connp->conn_mac_mode, ipst, pid);
- } else if ((credp = msg_getcred(mp, &pid)) != NULL) {
- err = tsol_check_label(credp, &mp,
- CONN_MAC_DEFAULT, ipst, pid);
- }
- ipha = (ipha_t *)mp->b_rptr;
- if (mctl_present)
- first_mp->b_cont = mp;
- else
- first_mp = mp;
- if (err != 0) {
- if (err == EINVAL)
- goto icmp_parameter_problem;
- ip2dbg(("ip_wput: label check failed (%d)\n",
- err));
- goto discard_pkt;
- }
- }
-
- ipha = (ipha_t *)mp->b_rptr;
- if (first_mp == NULL) {
- ASSERT(xmit_ill == NULL);
- /*
- * If we got here because of "goto hdrtoosmall"
- * We need to attach a IPSEC_OUT.
- */
- if (connp->conn_out_enforce_policy) {
- if (((mp = ipsec_attach_ipsec_out(&mp, connp,
- NULL, ipha->ipha_protocol,
- ipst->ips_netstack)) == NULL)) {
- BUMP_MIB(&ipst->ips_ip_mib,
- ipIfStatsOutDiscards);
- if (need_decref)
- CONN_DEC_REF(connp);
- return;
- } else {
- ASSERT(mp->b_datap->db_type == M_CTL);
- first_mp = mp;
- mp = mp->b_cont;
- mctl_present = B_TRUE;
- }
- } else {
- first_mp = mp;
- mctl_present = B_FALSE;
- }
- }
- }
-#endif
-
- /* Most of the code below is written for speed, not readability */
- v_hlen_tos_len = ((uint32_t *)ipha)[0];
-
- /*
- * If ip_newroute() fails, we're going to need a full
- * header for the icmp wraparound.
- */
- if (V_HLEN != IP_SIMPLE_HDR_VERSION) {
- uint_t v_hlen;
-version_hdrlen_check:
- ASSERT(first_mp != NULL);
- v_hlen = V_HLEN;
- /*
- * siphon off IPv6 packets coming down from transport
- * layer modules here.
- * Note: high-order bit carries NUD reachability confirmation
- */
- if (((v_hlen >> 4) & 0x7) == IPV6_VERSION) {
- /*
- * FIXME: assume that callers of ip_output* call
- * the right version?
- */
- BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutWrongIPVersion);
- ASSERT(xmit_ill == NULL);
- if (need_decref)
- mp->b_flag |= MSGHASREF;
- (void) ip_output_v6(arg, first_mp, arg2, caller);
- return;
- }
-
- if ((v_hlen >> 4) != IP_VERSION) {
- TRACE_2(TR_FAC_IP, TR_IP_WPUT_END,
- "ip_wput_end: q %p (%S)", q, "badvers");
- goto discard_pkt;
- }
- /*
- * Is the header length at least 20 bytes?
- *
- * Are there enough bytes accessible in the header? If
- * not, try a pullup.
- */
- v_hlen &= 0xF;
- v_hlen <<= 2;
- if (v_hlen < IP_SIMPLE_HDR_LENGTH) {
- TRACE_2(TR_FAC_IP, TR_IP_WPUT_END,
- "ip_wput_end: q %p (%S)", q, "badlen");
- goto discard_pkt;
- }
- if (v_hlen > (mp->b_wptr - rptr)) {
- if (!pullupmsg(mp, v_hlen)) {
- TRACE_2(TR_FAC_IP, TR_IP_WPUT_END,
- "ip_wput_end: q %p (%S)", q, "badpullup2");
- goto discard_pkt;
- }
- ipha = (ipha_t *)mp->b_rptr;
- }
- /*
- * Move first entry from any source route into ipha_dst and
- * verify the options
- */
- if (ip_wput_options(q, first_mp, ipha, mctl_present,
- zoneid, ipst)) {
- ASSERT(xmit_ill == NULL);
- BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
- TRACE_2(TR_FAC_IP, TR_IP_WPUT_END,
- "ip_wput_end: q %p (%S)", q, "badopts");
- if (need_decref)
- CONN_DEC_REF(connp);
- return;
- }
- }
- dst = ipha->ipha_dst;
-
- /*
- * Try to get an IRE_CACHE for the destination address. If we can't,
- * we have to run the packet through ip_newroute which will take
- * the appropriate action to arrange for an IRE_CACHE, such as querying
- * a resolver, or assigning a default gateway, etc.
- */
- if (CLASSD(dst)) {
- ipif_t *ipif;
- uint32_t setsrc = 0;
-
-multicast:
- ASSERT(first_mp != NULL);
- ip2dbg(("ip_wput: CLASSD\n"));
- if (connp == NULL) {
- /*
- * Use the first good ipif on the ill.
- * XXX Should this ever happen? (Appears
- * to show up with just ppp and no ethernet due
- * to in.rdisc.)
- * However, ire_send should be able to
- * call ip_wput_ire directly.
- *
- * XXX Also, this can happen for ICMP and other packets
- * with multicast source addresses. Perhaps we should
- * fix things so that we drop the packet in question,
- * but for now, just run with it.
- */
- ill_t *ill = (ill_t *)q->q_ptr;
-
- ipif = ipif_select_source(ill, dst, GLOBAL_ZONEID);
- if (ipif == NULL) {
- if (need_decref)
- CONN_DEC_REF(connp);
- freemsg(first_mp);
- return;
- }
- ip1dbg(("ip_wput: CLASSD no CONN: dst 0x%x on %s\n",
- ntohl(dst), ill->ill_name));
- } else {
- /*
- * The order of precedence is IP_BOUND_IF, IP_PKTINFO
- * and IP_MULTICAST_IF. The block comment above this
- * function explains the locking mechanism used here.
- */
- if (xmit_ill == NULL) {
- xmit_ill = conn_get_held_ill(connp,
- &connp->conn_outgoing_ill, &err);
- if (err == ILL_LOOKUP_FAILED) {
- ip1dbg(("ip_wput: No ill for "
- "IP_BOUND_IF\n"));
- BUMP_MIB(&ipst->ips_ip_mib,
- ipIfStatsOutNoRoutes);
- goto drop_pkt;
- }
- }
-
- if (xmit_ill == NULL) {
- ipif = conn_get_held_ipif(connp,
- &connp->conn_multicast_ipif, &err);
- if (err == IPIF_LOOKUP_FAILED) {
- ip1dbg(("ip_wput: No ipif for "
- "multicast\n"));
- BUMP_MIB(&ipst->ips_ip_mib,
- ipIfStatsOutNoRoutes);
- goto drop_pkt;
- }
- }
- if (xmit_ill != NULL) {
- ipif = ipif_get_next_ipif(NULL, xmit_ill);
- if (ipif == NULL) {
- ip1dbg(("ip_wput: No ipif for "
- "xmit_ill\n"));
- BUMP_MIB(&ipst->ips_ip_mib,
- ipIfStatsOutNoRoutes);
- goto drop_pkt;
- }
- } else if (ipif == NULL || ipif->ipif_isv6) {
- /*
- * We must do this ipif determination here
- * else we could pass through ip_newroute
- * and come back here without the conn context.
- *
- * Note: we do late binding i.e. we bind to
- * the interface when the first packet is sent.
- * For performance reasons we do not rebind on
- * each packet but keep the binding until the
- * next IP_MULTICAST_IF option.
- *
- * conn_multicast_{ipif,ill} are shared between
- * IPv4 and IPv6 and AF_INET6 sockets can
- * send both IPv4 and IPv6 packets. Hence
- * we have to check that "isv6" matches above.
- */
- if (ipif != NULL)
- ipif_refrele(ipif);
- ipif = ipif_lookup_group(dst, zoneid, ipst);
- if (ipif == NULL) {
- ip1dbg(("ip_wput: No ipif for "
- "multicast\n"));
- BUMP_MIB(&ipst->ips_ip_mib,
- ipIfStatsOutNoRoutes);
- goto drop_pkt;
- }
- err = conn_set_held_ipif(connp,
- &connp->conn_multicast_ipif, ipif);
- if (err == IPIF_LOOKUP_FAILED) {
- ipif_refrele(ipif);
- ip1dbg(("ip_wput: No ipif for "
- "multicast\n"));
- BUMP_MIB(&ipst->ips_ip_mib,
- ipIfStatsOutNoRoutes);
- goto drop_pkt;
- }
- }
- }
- ASSERT(!ipif->ipif_isv6);
- /*
- * As we may lose the conn by the time we reach ip_wput_ire,
- * we copy conn_multicast_loop and conn_dontroute on to an
- * ipsec_out. In case if this datagram goes out secure,
- * we need the ill_index also. Copy that also into the
- * ipsec_out.
- */
- if (mctl_present) {
- io = (ipsec_out_t *)first_mp->b_rptr;
- ASSERT(first_mp->b_datap->db_type == M_CTL);
- ASSERT(io->ipsec_out_type == IPSEC_OUT);
- } else {
- ASSERT(mp == first_mp);
- if ((first_mp = allocb(sizeof (ipsec_info_t),
- BPRI_HI)) == NULL) {
- ipif_refrele(ipif);
- first_mp = mp;
- goto discard_pkt;
- }
- first_mp->b_datap->db_type = M_CTL;
- first_mp->b_wptr += sizeof (ipsec_info_t);
- /* ipsec_out_secure is B_FALSE now */
- bzero(first_mp->b_rptr, sizeof (ipsec_info_t));
- io = (ipsec_out_t *)first_mp->b_rptr;
- io->ipsec_out_type = IPSEC_OUT;
- io->ipsec_out_len = sizeof (ipsec_out_t);
- io->ipsec_out_use_global_policy = B_TRUE;
- io->ipsec_out_ns = ipst->ips_netstack;
- first_mp->b_cont = mp;
- mctl_present = B_TRUE;
- }
-
- match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR;
- io->ipsec_out_ill_index =
- ipif->ipif_ill->ill_phyint->phyint_ifindex;
-
- if (connp != NULL) {
- io->ipsec_out_multicast_loop =
- connp->conn_multicast_loop;
- io->ipsec_out_dontroute = connp->conn_dontroute;
- io->ipsec_out_zoneid = connp->conn_zoneid;
- }
- /*
- * If the application uses IP_MULTICAST_IF with
- * different logical addresses of the same ILL, we
- * need to make sure that the soruce address of
- * the packet matches the logical IP address used
- * in the option. We do it by initializing ipha_src
- * here. This should keep IPsec also happy as
- * when we return from IPsec processing, we don't
- * have to worry about getting the right address on
- * the packet. Thus it is sufficient to look for
- * IRE_CACHE using MATCH_IRE_ILL rathen than
- * MATCH_IRE_IPIF.
- *
- * NOTE : We need to do it for non-secure case also as
- * this might go out secure if there is a global policy
- * match in ip_wput_ire.
- *
- * As we do not have the ire yet, it is possible that
- * we set the source address here and then later discover
- * that the ire implies the source address to be assigned
- * through the RTF_SETSRC flag.
- * In that case, the setsrc variable will remind us
- * that overwritting the source address by the one
- * of the RTF_SETSRC-flagged ire is allowed.
- */
- if (ipha->ipha_src == INADDR_ANY &&
- (connp == NULL || !connp->conn_unspec_src)) {
- ipha->ipha_src = ipif->ipif_src_addr;
- setsrc = RTF_SETSRC;
- }
- /*
- * Find an IRE which matches the destination and the outgoing
- * queue (i.e. the outgoing interface.)
- * For loopback use a unicast IP address for
- * the ire lookup.
- */
- if (IS_LOOPBACK(ipif->ipif_ill))
- dst = ipif->ipif_lcl_addr;
-
- /*
- * If xmit_ill is set, we branch out to ip_newroute_ipif.
- * We don't need to lookup ire in ctable as the packet
- * needs to be sent to the destination through the specified
- * ill irrespective of ires in the cache table.
- */
- ire = NULL;
- if (xmit_ill == NULL) {
- ire = ire_ctable_lookup(dst, 0, 0, ipif,
- zoneid, msg_getlabel(mp), match_flags, ipst);
- }
-
- if (ire == NULL) {
- /*
- * Multicast loopback and multicast forwarding is
- * done in ip_wput_ire.
- *
- * Mark this packet to make it be delivered to
- * ip_wput_ire after the new ire has been
- * created.
- *
- * The call to ip_newroute_ipif takes into account
- * the setsrc reminder. In any case, we take care
- * of the RTF_MULTIRT flag.
- */
- mp->b_prev = mp->b_next = NULL;
- if (xmit_ill == NULL ||
- xmit_ill->ill_ipif_up_count > 0) {
- ip_newroute_ipif(q, first_mp, ipif, dst, connp,
- setsrc | RTF_MULTIRT, zoneid, infop);
- TRACE_2(TR_FAC_IP, TR_IP_WPUT_END,
- "ip_wput_end: q %p (%S)", q, "noire");
- } else {
- freemsg(first_mp);
- }
- ipif_refrele(ipif);
- if (xmit_ill != NULL)
- ill_refrele(xmit_ill);
- if (need_decref)
- CONN_DEC_REF(connp);
- return;
- }
-
- ipif_refrele(ipif);
- ipif = NULL;
- ASSERT(xmit_ill == NULL);
-
- /*
- * Honor the RTF_SETSRC flag for multicast packets,
- * if allowed by the setsrc reminder.
- */
- if ((ire->ire_flags & RTF_SETSRC) && setsrc) {
- ipha->ipha_src = ire->ire_src_addr;
- }
-
- /*
- * Unconditionally force the TTL to 1 for
- * multirouted multicast packets:
- * multirouted multicast should not cross
- * multicast routers.
- */
- if (ire->ire_flags & RTF_MULTIRT) {
- if (ipha->ipha_ttl > 1) {
- ip2dbg(("ip_wput: forcing multicast "
- "multirt TTL to 1 (was %d), dst 0x%08x\n",
- ipha->ipha_ttl, ntohl(ire->ire_addr)));
- ipha->ipha_ttl = 1;
- }
- }
- } else {
- ire = ire_cache_lookup(dst, zoneid, msg_getlabel(mp), ipst);
- if ((ire != NULL) && (ire->ire_type &
- (IRE_BROADCAST | IRE_LOCAL | IRE_LOOPBACK))) {
- ignore_dontroute = B_TRUE;
- ignore_nexthop = B_TRUE;
- }
- if (ire != NULL) {
- ire_refrele(ire);
- ire = NULL;
- }
- /*
- * Guard against coming in from arp in which case conn is NULL.
- * Also guard against non M_DATA with dontroute set but
- * destined to local, loopback or broadcast addresses.
- */
- if (connp != NULL && connp->conn_dontroute &&
- !ignore_dontroute) {
-dontroute:
- /*
- * Set TTL to 1 if SO_DONTROUTE is set to prevent
- * routing protocols from seeing false direct
- * connectivity.
- */
- ipha->ipha_ttl = 1;
- /* If suitable ipif not found, drop packet */
- dst_ipif = ipif_lookup_onlink_addr(dst, zoneid, ipst);
- if (dst_ipif == NULL) {
-noroute:
- ip1dbg(("ip_wput: no route for dst using"
- " SO_DONTROUTE\n"));
- BUMP_MIB(&ipst->ips_ip_mib,
- ipIfStatsOutNoRoutes);
- mp->b_prev = mp->b_next = NULL;
- if (first_mp == NULL)
- first_mp = mp;
- goto drop_pkt;
- } else {
- /*
- * If suitable ipif has been found, set
- * xmit_ill to the corresponding
- * ipif_ill because we'll be using the
- * send_from_ill logic below.
- */
- ASSERT(xmit_ill == NULL);
- xmit_ill = dst_ipif->ipif_ill;
- mutex_enter(&xmit_ill->ill_lock);
- if (!ILL_CAN_LOOKUP(xmit_ill)) {
- mutex_exit(&xmit_ill->ill_lock);
- xmit_ill = NULL;
- ipif_refrele(dst_ipif);
- goto noroute;
- }
- ill_refhold_locked(xmit_ill);
- mutex_exit(&xmit_ill->ill_lock);
- ipif_refrele(dst_ipif);
- }
- }
-
-send_from_ill:
- if (xmit_ill != NULL) {
- ipif_t *ipif;
-
- /*
- * Mark this packet as originated locally
- */
- mp->b_prev = mp->b_next = NULL;
-
- /*
- * Could be SO_DONTROUTE case also.
- * Verify that at least one ipif is up on the ill.
- */
- if (xmit_ill->ill_ipif_up_count == 0) {
- ip1dbg(("ip_output: xmit_ill %s is down\n",
- xmit_ill->ill_name));
- goto drop_pkt;
- }
-
- ipif = ipif_get_next_ipif(NULL, xmit_ill);
- if (ipif == NULL) {
- ip1dbg(("ip_output: xmit_ill %s NULL ipif\n",
- xmit_ill->ill_name));
- goto drop_pkt;
- }
-
- match_flags = 0;
- if (IS_UNDER_IPMP(xmit_ill))
- match_flags |= MATCH_IRE_MARK_TESTHIDDEN;
-
- /*
- * Look for a ire that is part of the group,
- * if found use it else call ip_newroute_ipif.
- * IPCL_ZONEID is not used for matching because
- * IP_ALLZONES option is valid only when the
- * ill is accessible from all zones i.e has a
- * valid ipif in all zones.
- */
- match_flags |= MATCH_IRE_ILL | MATCH_IRE_SECATTR;
- ire = ire_ctable_lookup(dst, 0, 0, ipif, zoneid,
- msg_getlabel(mp), match_flags, ipst);
- /*
- * If an ire exists use it or else create
- * an ire but don't add it to the cache.
- * Adding an ire may cause issues with
- * asymmetric routing.
- * In case of multiroute always act as if
- * ire does not exist.
- */
- if (ire == NULL || ire->ire_flags & RTF_MULTIRT) {
- if (ire != NULL)
- ire_refrele(ire);
- ip_newroute_ipif(q, first_mp, ipif,
- dst, connp, 0, zoneid, infop);
- ipif_refrele(ipif);
- ip1dbg(("ip_output: xmit_ill via %s\n",
- xmit_ill->ill_name));
- ill_refrele(xmit_ill);
- if (need_decref)
- CONN_DEC_REF(connp);
- return;
- }
- ipif_refrele(ipif);
- } else if (ip_nexthop || (connp != NULL &&
- (connp->conn_nexthop_set)) && !ignore_nexthop) {
- if (!ip_nexthop) {
- ip_nexthop = B_TRUE;
- nexthop_addr = connp->conn_nexthop_v4;
- }
- match_flags = MATCH_IRE_MARK_PRIVATE_ADDR |
- MATCH_IRE_GW;
- ire = ire_ctable_lookup(dst, nexthop_addr, 0,
- NULL, zoneid, msg_getlabel(mp), match_flags, ipst);
- } else {
- ire = ire_cache_lookup(dst, zoneid, msg_getlabel(mp),
- ipst);
- }
- if (!ire) {
- if (ip_nexthop && !ignore_nexthop) {
- if (mctl_present) {
- io = (ipsec_out_t *)first_mp->b_rptr;
- ASSERT(first_mp->b_datap->db_type ==
- M_CTL);
- ASSERT(io->ipsec_out_type == IPSEC_OUT);
- } else {
- ASSERT(mp == first_mp);
- first_mp = allocb(
- sizeof (ipsec_info_t), BPRI_HI);
- if (first_mp == NULL) {
- first_mp = mp;
- goto discard_pkt;
- }
- first_mp->b_datap->db_type = M_CTL;
- first_mp->b_wptr +=
- sizeof (ipsec_info_t);
- /* ipsec_out_secure is B_FALSE now */
- bzero(first_mp->b_rptr,
- sizeof (ipsec_info_t));
- io = (ipsec_out_t *)first_mp->b_rptr;
- io->ipsec_out_type = IPSEC_OUT;
- io->ipsec_out_len =
- sizeof (ipsec_out_t);
- io->ipsec_out_use_global_policy =
- B_TRUE;
- io->ipsec_out_ns = ipst->ips_netstack;
- first_mp->b_cont = mp;
- mctl_present = B_TRUE;
- }
- io->ipsec_out_ip_nexthop = ip_nexthop;
- io->ipsec_out_nexthop_addr = nexthop_addr;
- }
-noirefound:
- /*
- * Mark this packet as having originated on
- * this machine. This will be noted in
- * ire_add_then_send, which needs to know
- * whether to run it back through ip_wput or
- * ip_rput following successful resolution.
- */
- mp->b_prev = NULL;
- mp->b_next = NULL;
- ip_newroute(q, first_mp, dst, connp, zoneid, ipst);
- TRACE_2(TR_FAC_IP, TR_IP_WPUT_END,
- "ip_wput_end: q %p (%S)", q, "newroute");
- if (xmit_ill != NULL)
- ill_refrele(xmit_ill);
- if (need_decref)
- CONN_DEC_REF(connp);
- return;
- }
- }
-
- /* We now know where we are going with it. */
-
- TRACE_2(TR_FAC_IP, TR_IP_WPUT_END,
- "ip_wput_end: q %p (%S)", q, "end");
-
- /*
- * Check if the ire has the RTF_MULTIRT flag, inherited
- * from an IRE_OFFSUBNET ire entry in ip_newroute.
- */
- if (ire->ire_flags & RTF_MULTIRT) {
- /*
- * Force the TTL of multirouted packets if required.
- * The TTL of such packets is bounded by the
- * ip_multirt_ttl ndd variable.
- */
- if ((ipst->ips_ip_multirt_ttl > 0) &&
- (ipha->ipha_ttl > ipst->ips_ip_multirt_ttl)) {
- ip2dbg(("ip_wput: forcing multirt TTL to %d "
- "(was %d), dst 0x%08x\n",
- ipst->ips_ip_multirt_ttl, ipha->ipha_ttl,
- ntohl(ire->ire_addr)));
- ipha->ipha_ttl = ipst->ips_ip_multirt_ttl;
- }
- /*
- * At this point, we check to see if there are any pending
- * unresolved routes. ire_multirt_resolvable()
- * checks in O(n) that all IRE_OFFSUBNET ire
- * entries for the packet's destination and
- * flagged RTF_MULTIRT are currently resolved.
- * If some remain unresolved, we make a copy
- * of the current message. It will be used
- * to initiate additional route resolutions.
- */
- multirt_need_resolve = ire_multirt_need_resolve(ire->ire_addr,
- msg_getlabel(first_mp), ipst);
- ip2dbg(("ip_wput[noirefound]: ire %p, "
- "multirt_need_resolve %d, first_mp %p\n",
- (void *)ire, multirt_need_resolve, (void *)first_mp));
- if (multirt_need_resolve) {
- copy_mp = copymsg(first_mp);
- if (copy_mp != NULL) {
- MULTIRT_DEBUG_TAG(copy_mp);
- }
- }
- }
-
- ip_wput_ire(q, first_mp, ire, connp, caller, zoneid);
- /*
- * Try to resolve another multiroute if
- * ire_multirt_resolvable() deemed it necessary.
- * At this point, we need to distinguish
- * multicasts from other packets. For multicasts,
- * we call ip_newroute_ipif() and request that both
- * multirouting and setsrc flags are checked.
- */
- if (copy_mp != NULL) {
- if (CLASSD(dst)) {
- ipif_t *ipif = ipif_lookup_group(dst, zoneid, ipst);
- if (ipif) {
- ASSERT(infop->ip_opt_ill_index == 0);
- ip_newroute_ipif(q, copy_mp, ipif, dst, connp,
- RTF_SETSRC | RTF_MULTIRT, zoneid, infop);
- ipif_refrele(ipif);
- } else {
- MULTIRT_DEBUG_UNTAG(copy_mp);
- freemsg(copy_mp);
- copy_mp = NULL;
- }
- } else {
- ip_newroute(q, copy_mp, dst, connp, zoneid, ipst);
- }
- }
- if (xmit_ill != NULL)
- ill_refrele(xmit_ill);
- if (need_decref)
- CONN_DEC_REF(connp);
- return;
-
-icmp_parameter_problem:
- /* could not have originated externally */
- ASSERT(mp->b_prev == NULL);
- if (ip_hdr_complete(ipha, zoneid, ipst) == 0) {
- BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes);
- /* it's the IP header length that's in trouble */
- icmp_param_problem(q, first_mp, 0, zoneid, ipst);
- first_mp = NULL;
- }
-
-discard_pkt:
- BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
-drop_pkt:
- ip1dbg(("ip_wput: dropped packet\n"));
- if (ire != NULL)
- ire_refrele(ire);
- if (need_decref)
- CONN_DEC_REF(connp);
- freemsg(first_mp);
- if (xmit_ill != NULL)
- ill_refrele(xmit_ill);
- TRACE_2(TR_FAC_IP, TR_IP_WPUT_END,
- "ip_wput_end: q %p (%S)", q, "droppkt");
-}
-
-/*
- * If this is a conn_t queue, then we pass in the conn. This includes the
- * zoneid.
- * Otherwise, this is a message coming back from ARP or for an ill_t queue,
- * in which case we use the global zoneid since those are all part of
- * the global zone.
- */
-void
-ip_wput(queue_t *q, mblk_t *mp)
-{
- if (CONN_Q(q))
- ip_output(Q_TO_CONN(q), mp, q, IP_WPUT);
- else
- ip_output(GLOBAL_ZONEID, mp, q, IP_WPUT);
-}
-
-/*
- *
- * The following rules must be observed when accessing any ipif or ill
- * that has been cached in the conn. Typically conn_outgoing_ill,
- * conn_multicast_ipif and conn_multicast_ill.
- *
- * Access: The ipif or ill pointed to from the conn can be accessed under
- * the protection of the conn_lock or after it has been refheld under the
- * protection of the conn lock. In addition the IPIF_CAN_LOOKUP or
- * ILL_CAN_LOOKUP macros must be used before actually doing the refhold.
- * The reason for this is that a concurrent unplumb could actually be
- * cleaning up these cached pointers by walking the conns and might have
- * finished cleaning up the conn in question. The macros check that an
- * unplumb has not yet started on the ipif or ill.
- *
- * Caching: An ipif or ill pointer may be cached in the conn only after
- * making sure that an unplumb has not started. So the caching is done
- * while holding both the conn_lock and the ill_lock and after using the
- * ILL_CAN_LOOKUP/IPIF_CAN_LOOKUP macro. An unplumb will set the ILL_CONDEMNED
- * flag before starting the cleanup of conns.
- *
- * The list of ipifs hanging off the ill is protected by ill_g_lock and ill_lock
- * On the other hand to access ipif->ipif_ill, we need one of either ill_g_lock
- * or a reference to the ipif or a reference to an ire that references the
- * ipif. An ipif only changes its ill when migrating from an underlying ill
- * to an IPMP ill in ipif_up().
- */
-ipif_t *
-conn_get_held_ipif(conn_t *connp, ipif_t **ipifp, int *err)
-{
- ipif_t *ipif;
- ill_t *ill;
- ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
-
- *err = 0;
- rw_enter(&ipst->ips_ill_g_lock, RW_READER);
- mutex_enter(&connp->conn_lock);
- ipif = *ipifp;
- if (ipif != NULL) {
- ill = ipif->ipif_ill;
- mutex_enter(&ill->ill_lock);
- if (IPIF_CAN_LOOKUP(ipif)) {
- ipif_refhold_locked(ipif);
- mutex_exit(&ill->ill_lock);
- mutex_exit(&connp->conn_lock);
- rw_exit(&ipst->ips_ill_g_lock);
- return (ipif);
- } else {
- *err = IPIF_LOOKUP_FAILED;
- }
- mutex_exit(&ill->ill_lock);
- }
- mutex_exit(&connp->conn_lock);
- rw_exit(&ipst->ips_ill_g_lock);
- return (NULL);
-}
-
-ill_t *
-conn_get_held_ill(conn_t *connp, ill_t **illp, int *err)
-{
- ill_t *ill;
-
- *err = 0;
- mutex_enter(&connp->conn_lock);
- ill = *illp;
- if (ill != NULL) {
- mutex_enter(&ill->ill_lock);
- if (ILL_CAN_LOOKUP(ill)) {
- ill_refhold_locked(ill);
- mutex_exit(&ill->ill_lock);
- mutex_exit(&connp->conn_lock);
- return (ill);
- } else {
- *err = ILL_LOOKUP_FAILED;
- }
- mutex_exit(&ill->ill_lock);
- }
- mutex_exit(&connp->conn_lock);
- return (NULL);
-}
-
-static int
-conn_set_held_ipif(conn_t *connp, ipif_t **ipifp, ipif_t *ipif)
-{
- ill_t *ill;
-
- ill = ipif->ipif_ill;
- mutex_enter(&connp->conn_lock);
- mutex_enter(&ill->ill_lock);
- if (IPIF_CAN_LOOKUP(ipif)) {
- *ipifp = ipif;
- mutex_exit(&ill->ill_lock);
- mutex_exit(&connp->conn_lock);
- return (0);
- }
- mutex_exit(&ill->ill_lock);
- mutex_exit(&connp->conn_lock);
- return (IPIF_LOOKUP_FAILED);
-}
-
-/*
- * This is called if the outbound datagram needs fragmentation.
- *
- * NOTE : This function does not ire_refrele the ire argument passed in.
- */
-static void
-ip_wput_ire_fragmentit(mblk_t *ipsec_mp, ire_t *ire, zoneid_t zoneid,
- ip_stack_t *ipst, conn_t *connp)
-{
- ipha_t *ipha;
- mblk_t *mp;
- uint32_t v_hlen_tos_len;
- uint32_t max_frag;
- uint32_t frag_flag;
- boolean_t dont_use;
-
- if (ipsec_mp->b_datap->db_type == M_CTL) {
- mp = ipsec_mp->b_cont;
- } else {
- mp = ipsec_mp;
- }
-
- ipha = (ipha_t *)mp->b_rptr;
- v_hlen_tos_len = ((uint32_t *)ipha)[0];
-
-#ifdef _BIG_ENDIAN
-#define V_HLEN (v_hlen_tos_len >> 24)
-#define LENGTH (v_hlen_tos_len & 0xFFFF)
-#else
-#define V_HLEN (v_hlen_tos_len & 0xFF)
-#define LENGTH ((v_hlen_tos_len >> 24) | ((v_hlen_tos_len >> 8) & 0xFF00))
-#endif
-
-#ifndef SPEED_BEFORE_SAFETY
- /*
- * Check that ipha_length is consistent with
- * the mblk length
- */
- if (LENGTH != (mp->b_cont ? msgdsize(mp) : mp->b_wptr - rptr)) {
- ip0dbg(("Packet length mismatch: %d, %ld\n",
- LENGTH, msgdsize(mp)));
- freemsg(ipsec_mp);
- TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END,
- "ip_wput_ire_fragmentit: mp %p (%S)", mp,
- "packet length mismatch");
- return;
- }
-#endif
- /*
- * Don't use frag_flag if pre-built packet or source
- * routed or if multicast (since multicast packets do not solicit
- * ICMP "packet too big" messages). Get the values of
- * max_frag and frag_flag atomically by acquiring the
- * ire_lock.
- */
- mutex_enter(&ire->ire_lock);
- max_frag = ire->ire_max_frag;
- frag_flag = ire->ire_frag_flag;
- mutex_exit(&ire->ire_lock);
-
- dont_use = ((ipha->ipha_ident == IP_HDR_INCLUDED) ||
- (V_HLEN != IP_SIMPLE_HDR_VERSION &&
- ip_source_route_included(ipha)) || CLASSD(ipha->ipha_dst));
-
- ip_wput_frag(ire, ipsec_mp, OB_PKT, max_frag,
- (dont_use ? 0 : frag_flag), zoneid, ipst, connp);
-}
-
-/*
* Used for deciding the MSS size for the upper layer. Thus
* we need to check the outbound policy values in the conn.
*/
@@ -21820,10 +11595,10 @@ conn_ipsec_length(conn_t *connp)
if (ipl == NULL)
return (0);
- if (ipl->ipl_out_policy == NULL)
+ if (connp->conn_ixa->ixa_ipsec_policy == NULL)
return (0);
- return (ipl->ipl_out_policy->ipsp_act->ipa_ovhd);
+ return (connp->conn_ixa->ixa_ipsec_policy->ipsp_act->ipa_ovhd);
}
/*
@@ -21831,20 +11606,17 @@ conn_ipsec_length(conn_t *connp)
* we don't want to call into IPsec to get the exact size.
*/
int
-ipsec_out_extra_length(mblk_t *ipsec_mp)
+ipsec_out_extra_length(ip_xmit_attr_t *ixa)
{
- ipsec_out_t *io = (ipsec_out_t *)ipsec_mp->b_rptr;
ipsec_action_t *a;
- ASSERT(io->ipsec_out_type == IPSEC_OUT);
- if (!io->ipsec_out_secure)
+ if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE))
return (0);
- a = io->ipsec_out_act;
-
+ a = ixa->ixa_ipsec_action;
if (a == NULL) {
- ASSERT(io->ipsec_out_policy != NULL);
- a = io->ipsec_out_policy->ipsp_act;
+ ASSERT(ixa->ixa_ipsec_policy != NULL);
+ a = ixa->ixa_ipsec_policy->ipsp_act;
}
ASSERT(a != NULL);
@@ -21852,22 +11624,6 @@ ipsec_out_extra_length(mblk_t *ipsec_mp)
}
/*
- * Returns an estimate of the IPsec headers size. This is used if
- * we don't want to call into IPsec to get the exact size.
- */
-int
-ipsec_in_extra_length(mblk_t *ipsec_mp)
-{
- ipsec_in_t *ii = (ipsec_in_t *)ipsec_mp->b_rptr;
- ipsec_action_t *a;
-
- ASSERT(ii->ipsec_in_type == IPSEC_IN);
-
- a = ii->ipsec_in_action;
- return (a == NULL ? 0 : a->ipa_ovhd);
-}
-
-/*
* If there are any source route options, return the true final
* destination. Otherwise, return the destination.
*/
@@ -21914,2257 +11670,70 @@ ip_get_dst(ipha_t *ipha)
return (dst);
}
-mblk_t *
-ip_wput_ire_parse_ipsec_out(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h, ire_t *ire,
- conn_t *connp, boolean_t unspec_src, zoneid_t zoneid)
-{
- ipsec_out_t *io;
- mblk_t *first_mp;
- boolean_t policy_present;
- ip_stack_t *ipst;
- ipsec_stack_t *ipss;
-
- ASSERT(ire != NULL);
- ipst = ire->ire_ipst;
- ipss = ipst->ips_netstack->netstack_ipsec;
-
- first_mp = mp;
- if (mp->b_datap->db_type == M_CTL) {
- io = (ipsec_out_t *)first_mp->b_rptr;
- /*
- * ip_wput[_v6] attaches an IPSEC_OUT in two cases.
- *
- * 1) There is per-socket policy (including cached global
- * policy) or a policy on the IP-in-IP tunnel.
- * 2) There is no per-socket policy, but it is
- * a multicast packet that needs to go out
- * on a specific interface. This is the case
- * where (ip_wput and ip_wput_multicast) attaches
- * an IPSEC_OUT and sets ipsec_out_secure B_FALSE.
- *
- * In case (2) we check with global policy to
- * see if there is a match and set the ill_index
- * appropriately so that we can lookup the ire
- * properly in ip_wput_ipsec_out.
- */
-
- /*
- * ipsec_out_use_global_policy is set to B_FALSE
- * in ipsec_in_to_out(). Refer to that function for
- * details.
- */
- if ((io->ipsec_out_latch == NULL) &&
- (io->ipsec_out_use_global_policy)) {
- return (ip_wput_attach_policy(first_mp, ipha, ip6h,
- ire, connp, unspec_src, zoneid));
- }
- if (!io->ipsec_out_secure) {
- /*
- * If this is not a secure packet, drop
- * the IPSEC_OUT mp and treat it as a clear
- * packet. This happens when we are sending
- * a ICMP reply back to a clear packet. See
- * ipsec_in_to_out() for details.
- */
- mp = first_mp->b_cont;
- freeb(first_mp);
- }
- return (mp);
- }
- /*
- * See whether we need to attach a global policy here. We
- * don't depend on the conn (as it could be null) for deciding
- * what policy this datagram should go through because it
- * should have happened in ip_wput if there was some
- * policy. This normally happens for connections which are not
- * fully bound preventing us from caching policies in
- * ip_bind. Packets coming from the TCP listener/global queue
- * - which are non-hard_bound - could also be affected by
- * applying policy here.
- *
- * If this packet is coming from tcp global queue or listener,
- * we will be applying policy here. This may not be *right*
- * if these packets are coming from the detached connection as
- * it could have gone in clear before. This happens only if a
- * TCP connection started when there is no policy and somebody
- * added policy before it became detached. Thus packets of the
- * detached connection could go out secure and the other end
- * would drop it because it will be expecting in clear. The
- * converse is not true i.e if somebody starts a TCP
- * connection and deletes the policy, all the packets will
- * still go out with the policy that existed before deleting
- * because ip_unbind sends up policy information which is used
- * by TCP on subsequent ip_wputs. The right solution is to fix
- * TCP to attach a dummy IPSEC_OUT and set
- * ipsec_out_use_global_policy to B_FALSE. As this might
- * affect performance for normal cases, we are not doing it.
- * Thus, set policy before starting any TCP connections.
- *
- * NOTE - We might apply policy even for a hard bound connection
- * - for which we cached policy in ip_bind - if somebody added
- * global policy after we inherited the policy in ip_bind.
- * This means that the packets that were going out in clear
- * previously would start going secure and hence get dropped
- * on the other side. To fix this, TCP attaches a dummy
- * ipsec_out and make sure that we don't apply global policy.
- */
- if (ipha != NULL)
- policy_present = ipss->ipsec_outbound_v4_policy_present;
- else
- policy_present = ipss->ipsec_outbound_v6_policy_present;
- if (!policy_present)
- return (mp);
-
- return (ip_wput_attach_policy(mp, ipha, ip6h, ire, connp, unspec_src,
- zoneid));
-}
-
-/*
- * This function does the ire_refrele of the ire passed in as the
- * argument. As this function looks up more ires i.e broadcast ires,
- * it needs to REFRELE them. Currently, for simplicity we don't
- * differentiate the one passed in and looked up here. We always
- * REFRELE.
- * IPQoS Notes:
- * IP policy is invoked if IPP_LOCAL_OUT is enabled. Processing for
- * IPsec packets are done in ipsec_out_process.
- */
-void
-ip_wput_ire(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, int caller,
- zoneid_t zoneid)
-{
- ipha_t *ipha;
-#define rptr ((uchar_t *)ipha)
- queue_t *stq;
-#define Q_TO_INDEX(stq) (((ill_t *)stq->q_ptr)->ill_phyint->phyint_ifindex)
- uint32_t v_hlen_tos_len;
- uint32_t ttl_protocol;
- ipaddr_t src;
- ipaddr_t dst;
- uint32_t cksum;
- ipaddr_t orig_src;
- ire_t *ire1;
- mblk_t *next_mp;
- uint_t hlen;
- uint16_t *up;
- uint32_t max_frag = ire->ire_max_frag;
- ill_t *ill = ire_to_ill(ire);
- int clusterwide;
- uint16_t ip_hdr_included; /* IP header included by ULP? */
- int ipsec_len;
- mblk_t *first_mp;
- ipsec_out_t *io;
- boolean_t conn_dontroute; /* conn value for multicast */
- boolean_t conn_multicast_loop; /* conn value for multicast */
- boolean_t multicast_forward; /* Should we forward ? */
- boolean_t unspec_src;
- ill_t *conn_outgoing_ill = NULL;
- ill_t *ire_ill;
- ill_t *ire1_ill;
- ill_t *out_ill;
- uint32_t ill_index = 0;
- boolean_t multirt_send = B_FALSE;
- int err;
- ipxmit_state_t pktxmit_state;
- ip_stack_t *ipst = ire->ire_ipst;
- ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec;
-
- TRACE_1(TR_FAC_IP, TR_IP_WPUT_IRE_START,
- "ip_wput_ire_start: q %p", q);
-
- multicast_forward = B_FALSE;
- unspec_src = (connp != NULL && connp->conn_unspec_src);
-
- if (ire->ire_flags & RTF_MULTIRT) {
- /*
- * Multirouting case. The bucket where ire is stored
- * probably holds other RTF_MULTIRT flagged ire
- * to the destination. In this call to ip_wput_ire,
- * we attempt to send the packet through all
- * those ires. Thus, we first ensure that ire is the
- * first RTF_MULTIRT ire in the bucket,
- * before walking the ire list.
- */
- ire_t *first_ire;
- irb_t *irb = ire->ire_bucket;
- ASSERT(irb != NULL);
-
- /* Make sure we do not omit any multiroute ire. */
- IRB_REFHOLD(irb);
- for (first_ire = irb->irb_ire;
- first_ire != NULL;
- first_ire = first_ire->ire_next) {
- if ((first_ire->ire_flags & RTF_MULTIRT) &&
- (first_ire->ire_addr == ire->ire_addr) &&
- !(first_ire->ire_marks &
- (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN)))
- break;
- }
-
- if ((first_ire != NULL) && (first_ire != ire)) {
- IRE_REFHOLD(first_ire);
- ire_refrele(ire);
- ire = first_ire;
- ill = ire_to_ill(ire);
- }
- IRB_REFRELE(irb);
- }
-
- /*
- * conn_outgoing_ill variable is used only in the broadcast loop.
- * for performance we don't grab the mutexs in the fastpath
- */
- if (ire->ire_type == IRE_BROADCAST && connp != NULL &&
- connp->conn_outgoing_ill != NULL) {
- conn_outgoing_ill = conn_get_held_ill(connp,
- &connp->conn_outgoing_ill, &err);
- if (err == ILL_LOOKUP_FAILED) {
- ire_refrele(ire);
- freemsg(mp);
- return;
- }
- }
-
- if (mp->b_datap->db_type != M_CTL) {
- ipha = (ipha_t *)mp->b_rptr;
- } else {
- io = (ipsec_out_t *)mp->b_rptr;
- ASSERT(io->ipsec_out_type == IPSEC_OUT);
- ASSERT(zoneid == io->ipsec_out_zoneid);
- ASSERT(zoneid != ALL_ZONES);
- ipha = (ipha_t *)mp->b_cont->b_rptr;
- dst = ipha->ipha_dst;
- /*
- * For the multicast case, ipsec_out carries conn_dontroute and
- * conn_multicast_loop as conn may not be available here. We
- * need this for multicast loopback and forwarding which is done
- * later in the code.
- */
- if (CLASSD(dst)) {
- conn_dontroute = io->ipsec_out_dontroute;
- conn_multicast_loop = io->ipsec_out_multicast_loop;
- /*
- * If conn_dontroute is not set or conn_multicast_loop
- * is set, we need to do forwarding/loopback. For
- * datagrams from ip_wput_multicast, conn_dontroute is
- * set to B_TRUE and conn_multicast_loop is set to
- * B_FALSE so that we neither do forwarding nor
- * loopback.
- */
- if (!conn_dontroute || conn_multicast_loop)
- multicast_forward = B_TRUE;
- }
- }
-
- if (ire->ire_type == IRE_LOCAL && ire->ire_zoneid != zoneid &&
- ire->ire_zoneid != ALL_ZONES) {
- /*
- * When a zone sends a packet to another zone, we try to deliver
- * the packet under the same conditions as if the destination
- * was a real node on the network. To do so, we look for a
- * matching route in the forwarding table.
- * RTF_REJECT and RTF_BLACKHOLE are handled just like
- * ip_newroute() does.
- * Note that IRE_LOCAL are special, since they are used
- * when the zoneid doesn't match in some cases. This means that
- * we need to handle ipha_src differently since ire_src_addr
- * belongs to the receiving zone instead of the sending zone.
- * When ip_restrict_interzone_loopback is set, then
- * ire_cache_lookup() ensures that IRE_LOCAL are only used
- * for loopback between zones when the logical "Ethernet" would
- * have looped them back.
- */
- ire_t *src_ire;
-
- src_ire = ire_ftable_lookup(ipha->ipha_dst, 0, 0, 0,
- NULL, NULL, zoneid, 0, NULL, (MATCH_IRE_RECURSIVE |
- MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE), ipst);
- if (src_ire != NULL &&
- !(src_ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) &&
- (!ipst->ips_ip_restrict_interzone_loopback ||
- ire_local_same_lan(ire, src_ire))) {
- if (ipha->ipha_src == INADDR_ANY && !unspec_src)
- ipha->ipha_src = src_ire->ire_src_addr;
- ire_refrele(src_ire);
- } else {
- ire_refrele(ire);
- if (conn_outgoing_ill != NULL)
- ill_refrele(conn_outgoing_ill);
- BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes);
- if (src_ire != NULL) {
- if (src_ire->ire_flags & RTF_BLACKHOLE) {
- ire_refrele(src_ire);
- freemsg(mp);
- return;
- }
- ire_refrele(src_ire);
- }
- if (ip_hdr_complete(ipha, zoneid, ipst)) {
- /* Failed */
- freemsg(mp);
- return;
- }
- icmp_unreachable(q, mp, ICMP_HOST_UNREACHABLE, zoneid,
- ipst);
- return;
- }
- }
-
- if (mp->b_datap->db_type == M_CTL ||
- ipss->ipsec_outbound_v4_policy_present) {
- mp = ip_wput_ire_parse_ipsec_out(mp, ipha, NULL, ire, connp,
- unspec_src, zoneid);
- if (mp == NULL) {
- ire_refrele(ire);
- if (conn_outgoing_ill != NULL)
- ill_refrele(conn_outgoing_ill);
- return;
- }
- /*
- * Trusted Extensions supports all-zones interfaces, so
- * zoneid == ALL_ZONES is valid, but IPsec maps ALL_ZONES to
- * the global zone.
- */
- if (zoneid == ALL_ZONES && mp->b_datap->db_type == M_CTL) {
- io = (ipsec_out_t *)mp->b_rptr;
- ASSERT(io->ipsec_out_type == IPSEC_OUT);
- zoneid = io->ipsec_out_zoneid;
- }
- }
-
- first_mp = mp;
- ipsec_len = 0;
-
- if (first_mp->b_datap->db_type == M_CTL) {
- io = (ipsec_out_t *)first_mp->b_rptr;
- ASSERT(io->ipsec_out_type == IPSEC_OUT);
- mp = first_mp->b_cont;
- ipsec_len = ipsec_out_extra_length(first_mp);
- ASSERT(ipsec_len >= 0);
- if (zoneid == ALL_ZONES)
- zoneid = GLOBAL_ZONEID;
- /* We already picked up the zoneid from the M_CTL above */
- ASSERT(zoneid == io->ipsec_out_zoneid);
-
- /*
- * Drop M_CTL here if IPsec processing is not needed.
- * (Non-IPsec use of M_CTL extracted any information it
- * needed above).
- */
- if (ipsec_len == 0) {
- freeb(first_mp);
- first_mp = mp;
- }
- }
-
- /*
- * Fast path for ip_wput_ire
- */
-
- ipha = (ipha_t *)mp->b_rptr;
- v_hlen_tos_len = ((uint32_t *)ipha)[0];
- dst = ipha->ipha_dst;
-
- /*
- * ICMP(RAWIP) module should set the ipha_ident to IP_HDR_INCLUDED
- * if the socket is a SOCK_RAW type. The transport checksum should
- * be provided in the pre-built packet, so we don't need to compute it.
- * Also, other application set flags, like DF, should not be altered.
- * Other transport MUST pass down zero.
- */
- ip_hdr_included = ipha->ipha_ident;
- ASSERT(ipha->ipha_ident == 0 || ipha->ipha_ident == IP_HDR_INCLUDED);
-
- if (CLASSD(dst)) {
- ip1dbg(("ip_wput_ire: to 0x%x ire %s addr 0x%x\n",
- ntohl(dst),
- ip_nv_lookup(ire_nv_tbl, ire->ire_type),
- ntohl(ire->ire_addr)));
- }
-
-/* Macros to extract header fields from data already in registers */
-#ifdef _BIG_ENDIAN
-#define V_HLEN (v_hlen_tos_len >> 24)
-#define LENGTH (v_hlen_tos_len & 0xFFFF)
-#define PROTO (ttl_protocol & 0xFF)
-#else
-#define V_HLEN (v_hlen_tos_len & 0xFF)
-#define LENGTH ((v_hlen_tos_len >> 24) | ((v_hlen_tos_len >> 8) & 0xFF00))
-#define PROTO (ttl_protocol >> 8)
-#endif
-
- orig_src = src = ipha->ipha_src;
- /* (The loop back to "another" is explained down below.) */
-another:;
- /*
- * Assign an ident value for this packet. We assign idents on
- * a per destination basis out of the IRE. There could be
- * other threads targeting the same destination, so we have to
- * arrange for a atomic increment. Note that we use a 32-bit
- * atomic add because it has better performance than its
- * 16-bit sibling.
- *
- * If running in cluster mode and if the source address
- * belongs to a replicated service then vector through
- * cl_inet_ipident vector to allocate ip identifier
- * NOTE: This is a contract private interface with the
- * clustering group.
- */
- clusterwide = 0;
- if (cl_inet_ipident) {
- ASSERT(cl_inet_isclusterwide);
- netstackid_t stack_id = ipst->ips_netstack->netstack_stackid;
-
- if ((*cl_inet_isclusterwide)(stack_id, IPPROTO_IP,
- AF_INET, (uint8_t *)(uintptr_t)src, NULL)) {
- ipha->ipha_ident = (*cl_inet_ipident)(stack_id,
- IPPROTO_IP, AF_INET, (uint8_t *)(uintptr_t)src,
- (uint8_t *)(uintptr_t)dst, NULL);
- clusterwide = 1;
- }
- }
- if (!clusterwide) {
- ipha->ipha_ident =
- (uint16_t)atomic_add_32_nv(&ire->ire_ident, 1);
- }
-
-#ifndef _BIG_ENDIAN
- ipha->ipha_ident = (ipha->ipha_ident << 8) | (ipha->ipha_ident >> 8);
-#endif
-
- /*
- * Set source address unless sent on an ill or conn_unspec_src is set.
- * This is needed to obey conn_unspec_src when packets go through
- * ip_newroute + arp.
- * Assumes ip_newroute{,_multi} sets the source address as well.
- */
- if (src == INADDR_ANY && !unspec_src) {
- /*
- * Assign the appropriate source address from the IRE if none
- * was specified.
- */
- ASSERT(ire->ire_ipversion == IPV4_VERSION);
-
- src = ire->ire_src_addr;
- if (connp == NULL) {
- ip1dbg(("ip_wput_ire: no connp and no src "
- "address for dst 0x%x, using src 0x%x\n",
- ntohl(dst),
- ntohl(src)));
- }
- ipha->ipha_src = src;
- }
- stq = ire->ire_stq;
-
- /*
- * We only allow ire chains for broadcasts since there will
- * be multiple IRE_CACHE entries for the same multicast
- * address (one per ipif).
- */
- next_mp = NULL;
-
- /* broadcast packet */
- if (ire->ire_type == IRE_BROADCAST)
- goto broadcast;
-
- /* loopback ? */
- if (stq == NULL)
- goto nullstq;
-
- /* The ill_index for outbound ILL */
- ill_index = Q_TO_INDEX(stq);
-
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests);
- ttl_protocol = ((uint16_t *)ipha)[4];
-
- /* pseudo checksum (do it in parts for IP header checksum) */
- cksum = (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF);
-
- if (!IP_FLOW_CONTROLLED_ULP(PROTO)) {
- queue_t *dev_q = stq->q_next;
-
- /*
- * For DIRECT_CAPABLE, we do flow control at
- * the time of sending the packet. See
- * ILL_SEND_TX().
- */
- if (!ILL_DIRECT_CAPABLE((ill_t *)stq->q_ptr) &&
- (DEV_Q_FLOW_BLOCKED(dev_q)))
- goto blocked;
-
- if ((PROTO == IPPROTO_UDP) &&
- (ip_hdr_included != IP_HDR_INCLUDED)) {
- hlen = (V_HLEN & 0xF) << 2;
- up = IPH_UDPH_CHECKSUMP(ipha, hlen);
- if (*up != 0) {
- IP_CKSUM_XMIT(ill, ire, mp, ipha, up, PROTO,
- hlen, LENGTH, max_frag, ipsec_len, cksum);
- /* Software checksum? */
- if (DB_CKSUMFLAGS(mp) == 0) {
- IP_STAT(ipst, ip_out_sw_cksum);
- IP_STAT_UPDATE(ipst,
- ip_udp_out_sw_cksum_bytes,
- LENGTH - hlen);
- }
- }
- }
- } else if (ip_hdr_included != IP_HDR_INCLUDED) {
- hlen = (V_HLEN & 0xF) << 2;
- if (PROTO == IPPROTO_TCP) {
- up = IPH_TCPH_CHECKSUMP(ipha, hlen);
- /*
- * The packet header is processed once and for all, even
- * in the multirouting case. We disable hardware
- * checksum if the packet is multirouted, as it will be
- * replicated via several interfaces, and not all of
- * them may have this capability.
- */
- IP_CKSUM_XMIT(ill, ire, mp, ipha, up, PROTO, hlen,
- LENGTH, max_frag, ipsec_len, cksum);
- /* Software checksum? */
- if (DB_CKSUMFLAGS(mp) == 0) {
- IP_STAT(ipst, ip_out_sw_cksum);
- IP_STAT_UPDATE(ipst, ip_tcp_out_sw_cksum_bytes,
- LENGTH - hlen);
- }
- } else {
- sctp_hdr_t *sctph;
-
- ASSERT(PROTO == IPPROTO_SCTP);
- ASSERT(MBLKL(mp) >= (hlen + sizeof (*sctph)));
- sctph = (sctp_hdr_t *)(mp->b_rptr + hlen);
- /*
- * Zero out the checksum field to ensure proper
- * checksum calculation.
- */
- sctph->sh_chksum = 0;
-#ifdef DEBUG
- if (!skip_sctp_cksum)
-#endif
- sctph->sh_chksum = sctp_cksum(mp, hlen);
- }
- }
-
- /*
- * If this is a multicast packet and originated from ip_wput
- * we need to do loopback and forwarding checks. If it comes
- * from ip_wput_multicast, we SHOULD not do this.
- */
- if (CLASSD(ipha->ipha_dst) && multicast_forward) goto multi_loopback;
-
- /* checksum */
- cksum += ttl_protocol;
-
- /* fragment the packet */
- if (max_frag < (uint_t)(LENGTH + ipsec_len))
- goto fragmentit;
- /*
- * Don't use frag_flag if packet is pre-built or source
- * routed or if multicast (since multicast packets do
- * not solicit ICMP "packet too big" messages).
- */
- if ((ip_hdr_included != IP_HDR_INCLUDED) &&
- (V_HLEN == IP_SIMPLE_HDR_VERSION ||
- !ip_source_route_included(ipha)) &&
- !CLASSD(ipha->ipha_dst))
- ipha->ipha_fragment_offset_and_flags |=
- htons(ire->ire_frag_flag);
-
- if (!(DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM)) {
- /* calculate IP header checksum */
- cksum += ipha->ipha_ident;
- cksum += (v_hlen_tos_len >> 16)+(v_hlen_tos_len & 0xFFFF);
- cksum += ipha->ipha_fragment_offset_and_flags;
-
- /* IP options present */
- hlen = (V_HLEN & 0xF) - IP_SIMPLE_HDR_LENGTH_IN_WORDS;
- if (hlen)
- goto checksumoptions;
-
- /* calculate hdr checksum */
- cksum = ((cksum & 0xFFFF) + (cksum >> 16));
- cksum = ~(cksum + (cksum >> 16));
- ipha->ipha_hdr_checksum = (uint16_t)cksum;
- }
- if (ipsec_len != 0) {
- /*
- * We will do the rest of the processing after
- * we come back from IPsec in ip_wput_ipsec_out().
- */
- ASSERT(MBLKL(first_mp) >= sizeof (ipsec_out_t));
-
- io = (ipsec_out_t *)first_mp->b_rptr;
- io->ipsec_out_ill_index =
- ire->ire_ipif->ipif_ill->ill_phyint->phyint_ifindex;
- ipsec_out_process(q, first_mp, ire, 0);
- ire_refrele(ire);
- if (conn_outgoing_ill != NULL)
- ill_refrele(conn_outgoing_ill);
- return;
- }
-
- /*
- * In most cases, the emission loop below is entered only
- * once. Only in the case where the ire holds the
- * RTF_MULTIRT flag, do we loop to process all RTF_MULTIRT
- * flagged ires in the bucket, and send the packet
- * through all crossed RTF_MULTIRT routes.
- */
- if (ire->ire_flags & RTF_MULTIRT) {
- multirt_send = B_TRUE;
- }
- do {
- if (multirt_send) {
- irb_t *irb;
- /*
- * We are in a multiple send case, need to get
- * the next ire and make a duplicate of the packet.
- * ire1 holds here the next ire to process in the
- * bucket. If multirouting is expected,
- * any non-RTF_MULTIRT ire that has the
- * right destination address is ignored.
- */
- irb = ire->ire_bucket;
- ASSERT(irb != NULL);
-
- IRB_REFHOLD(irb);
- for (ire1 = ire->ire_next;
- ire1 != NULL;
- ire1 = ire1->ire_next) {
- if ((ire1->ire_flags & RTF_MULTIRT) == 0)
- continue;
- if (ire1->ire_addr != ire->ire_addr)
- continue;
- if (ire1->ire_marks &
- (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN))
- continue;
-
- /* Got one */
- IRE_REFHOLD(ire1);
- break;
- }
- IRB_REFRELE(irb);
-
- if (ire1 != NULL) {
- next_mp = copyb(mp);
- if ((next_mp == NULL) ||
- ((mp->b_cont != NULL) &&
- ((next_mp->b_cont =
- dupmsg(mp->b_cont)) == NULL))) {
- freemsg(next_mp);
- next_mp = NULL;
- ire_refrele(ire1);
- ire1 = NULL;
- }
- }
-
- /* Last multiroute ire; don't loop anymore. */
- if (ire1 == NULL) {
- multirt_send = B_FALSE;
- }
- }
-
- DTRACE_PROBE4(ip4__physical__out__start, ill_t *, NULL,
- ill_t *, ire->ire_ipif->ipif_ill, ipha_t *, ipha,
- mblk_t *, mp);
- FW_HOOKS(ipst->ips_ip4_physical_out_event,
- ipst->ips_ipv4firewall_physical_out,
- NULL, ire->ire_ipif->ipif_ill, ipha, mp, mp, 0, ipst);
- DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp);
-
- if (mp == NULL)
- goto release_ire_and_ill;
-
- if (ipst->ips_ip4_observe.he_interested) {
- zoneid_t szone;
-
- /*
- * On the outbound path the destination zone will be
- * unknown as we're sending this packet out on the
- * wire.
- */
- szone = ip_get_zoneid_v4(ipha->ipha_src, mp, ipst,
- ALL_ZONES);
- ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone, ALL_ZONES,
- ire->ire_ipif->ipif_ill, ipst);
- }
- mp->b_prev = SET_BPREV_FLAG(IPP_LOCAL_OUT);
- DTRACE_PROBE2(ip__xmit__1, mblk_t *, mp, ire_t *, ire);
-
- pktxmit_state = ip_xmit_v4(mp, ire, NULL, B_TRUE, connp);
-
- if ((pktxmit_state == SEND_FAILED) ||
- (pktxmit_state == LLHDR_RESLV_FAILED)) {
- ip2dbg(("ip_wput_ire: ip_xmit_v4 failed"
- "- packet dropped\n"));
-release_ire_and_ill:
- ire_refrele(ire);
- if (next_mp != NULL) {
- freemsg(next_mp);
- ire_refrele(ire1);
- }
- if (conn_outgoing_ill != NULL)
- ill_refrele(conn_outgoing_ill);
- return;
- }
-
- if (CLASSD(dst)) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutMcastPkts);
- UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutMcastOctets,
- LENGTH);
- }
-
- TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END,
- "ip_wput_ire_end: q %p (%S)",
- q, "last copy out");
- IRE_REFRELE(ire);
-
- if (multirt_send) {
- ASSERT(ire1);
- /*
- * Proceed with the next RTF_MULTIRT ire,
- * Also set up the send-to queue accordingly.
- */
- ire = ire1;
- ire1 = NULL;
- stq = ire->ire_stq;
- mp = next_mp;
- next_mp = NULL;
- ipha = (ipha_t *)mp->b_rptr;
- ill_index = Q_TO_INDEX(stq);
- ill = (ill_t *)stq->q_ptr;
- }
- } while (multirt_send);
- if (conn_outgoing_ill != NULL)
- ill_refrele(conn_outgoing_ill);
- return;
-
- /*
- * ire->ire_type == IRE_BROADCAST (minimize diffs)
- */
-broadcast:
- {
- /*
- * To avoid broadcast storms, we usually set the TTL to 1 for
- * broadcasts. However, if SO_DONTROUTE isn't set, this value
- * can be overridden stack-wide through the ip_broadcast_ttl
- * ndd tunable, or on a per-connection basis through the
- * IP_BROADCAST_TTL socket option.
- *
- * In the event that we are replying to incoming ICMP packets,
- * connp could be NULL.
- */
- ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl;
- if (connp != NULL) {
- if (connp->conn_dontroute)
- ipha->ipha_ttl = 1;
- else if (connp->conn_broadcast_ttl != 0)
- ipha->ipha_ttl = connp->conn_broadcast_ttl;
- }
-
- /*
- * Note that we are not doing a IRB_REFHOLD here.
- * Actually we don't care if the list changes i.e
- * if somebody deletes an IRE from the list while
- * we drop the lock, the next time we come around
- * ire_next will be NULL and hence we won't send
- * out multiple copies which is fine.
- */
- rw_enter(&ire->ire_bucket->irb_lock, RW_READER);
- ire1 = ire->ire_next;
- if (conn_outgoing_ill != NULL) {
- while (ire->ire_ipif->ipif_ill != conn_outgoing_ill) {
- ASSERT(ire1 == ire->ire_next);
- if (ire1 != NULL && ire1->ire_addr == dst) {
- ire_refrele(ire);
- ire = ire1;
- IRE_REFHOLD(ire);
- ire1 = ire->ire_next;
- continue;
- }
- rw_exit(&ire->ire_bucket->irb_lock);
- /* Did not find a matching ill */
- ip1dbg(("ip_wput_ire: broadcast with no "
- "matching IP_BOUND_IF ill %s dst %x\n",
- conn_outgoing_ill->ill_name, dst));
- freemsg(first_mp);
- if (ire != NULL)
- ire_refrele(ire);
- ill_refrele(conn_outgoing_ill);
- return;
- }
- } else if (ire1 != NULL && ire1->ire_addr == dst) {
- /*
- * If the next IRE has the same address and is not one
- * of the two copies that we need to send, try to see
- * whether this copy should be sent at all. This
- * assumes that we insert loopbacks first and then
- * non-loopbacks. This is acheived by inserting the
- * loopback always before non-loopback.
- * This is used to send a single copy of a broadcast
- * packet out all physical interfaces that have an
- * matching IRE_BROADCAST while also looping
- * back one copy (to ip_wput_local) for each
- * matching physical interface. However, we avoid
- * sending packets out different logical that match by
- * having ipif_up/ipif_down supress duplicate
- * IRE_BROADCASTS.
- *
- * This feature is currently used to get broadcasts
- * sent to multiple interfaces, when the broadcast
- * address being used applies to multiple interfaces.
- * For example, a whole net broadcast will be
- * replicated on every connected subnet of
- * the target net.
- *
- * Each zone has its own set of IRE_BROADCASTs, so that
- * we're able to distribute inbound packets to multiple
- * zones who share a broadcast address. We avoid looping
- * back outbound packets in different zones but on the
- * same ill, as the application would see duplicates.
- *
- * This logic assumes that ire_add_v4() groups the
- * IRE_BROADCAST entries so that those with the same
- * ire_addr are kept together.
- */
- ire_ill = ire->ire_ipif->ipif_ill;
- if (ire->ire_stq != NULL || ire1->ire_stq == NULL) {
- while (ire1 != NULL && ire1->ire_addr == dst) {
- ire1_ill = ire1->ire_ipif->ipif_ill;
- if (ire1_ill != ire_ill)
- break;
- ire1 = ire1->ire_next;
- }
- }
- }
- ASSERT(multirt_send == B_FALSE);
- if (ire1 != NULL && ire1->ire_addr == dst) {
- if ((ire->ire_flags & RTF_MULTIRT) &&
- (ire1->ire_flags & RTF_MULTIRT)) {
- /*
- * We are in the multirouting case.
- * The message must be sent at least
- * on both ires. These ires have been
- * inserted AFTER the standard ones
- * in ip_rt_add(). There are thus no
- * other ire entries for the destination
- * address in the rest of the bucket
- * that do not have the RTF_MULTIRT
- * flag. We don't process a copy
- * of the message here. This will be
- * done in the final sending loop.
- */
- multirt_send = B_TRUE;
- } else {
- next_mp = ip_copymsg(first_mp);
- if (next_mp != NULL)
- IRE_REFHOLD(ire1);
- }
- }
- rw_exit(&ire->ire_bucket->irb_lock);
- }
-
- if (stq) {
- /*
- * A non-NULL send-to queue means this packet is going
- * out of this machine.
- */
- out_ill = (ill_t *)stq->q_ptr;
-
- BUMP_MIB(out_ill->ill_ip_mib, ipIfStatsHCOutRequests);
- ttl_protocol = ((uint16_t *)ipha)[4];
- /*
- * We accumulate the pseudo header checksum in cksum.
- * This is pretty hairy code, so watch close. One
- * thing to keep in mind is that UDP and TCP have
- * stored their respective datagram lengths in their
- * checksum fields. This lines things up real nice.
- */
- cksum = (dst >> 16) + (dst & 0xFFFF) +
- (src >> 16) + (src & 0xFFFF);
- /*
- * We assume the udp checksum field contains the
- * length, so to compute the pseudo header checksum,
- * all we need is the protocol number and src/dst.
- */
- /* Provide the checksums for UDP and TCP. */
- if ((PROTO == IPPROTO_TCP) &&
- (ip_hdr_included != IP_HDR_INCLUDED)) {
- /* hlen gets the number of uchar_ts in the IP header */
- hlen = (V_HLEN & 0xF) << 2;
- up = IPH_TCPH_CHECKSUMP(ipha, hlen);
- IP_STAT(ipst, ip_out_sw_cksum);
- IP_STAT_UPDATE(ipst, ip_tcp_out_sw_cksum_bytes,
- LENGTH - hlen);
- *up = IP_CSUM(mp, hlen, cksum + IP_TCP_CSUM_COMP);
- } else if (PROTO == IPPROTO_SCTP &&
- (ip_hdr_included != IP_HDR_INCLUDED)) {
- sctp_hdr_t *sctph;
-
- hlen = (V_HLEN & 0xF) << 2;
- ASSERT(MBLKL(mp) >= (hlen + sizeof (*sctph)));
- sctph = (sctp_hdr_t *)(mp->b_rptr + hlen);
- sctph->sh_chksum = 0;
-#ifdef DEBUG
- if (!skip_sctp_cksum)
-#endif
- sctph->sh_chksum = sctp_cksum(mp, hlen);
- } else {
- queue_t *dev_q = stq->q_next;
-
- if (!ILL_DIRECT_CAPABLE((ill_t *)stq->q_ptr) &&
- (DEV_Q_FLOW_BLOCKED(dev_q))) {
-blocked:
- ipha->ipha_ident = ip_hdr_included;
- /*
- * If we don't have a conn to apply
- * backpressure, free the message.
- * In the ire_send path, we don't know
- * the position to requeue the packet. Rather
- * than reorder packets, we just drop this
- * packet.
- */
- if (ipst->ips_ip_output_queue &&
- connp != NULL &&
- caller != IRE_SEND) {
- if (caller == IP_WSRV) {
- idl_tx_list_t *idl_txl;
-
- idl_txl =
- &ipst->ips_idl_tx_list[0];
- connp->conn_did_putbq = 1;
- (void) putbq(connp->conn_wq,
- first_mp);
- conn_drain_insert(connp,
- idl_txl);
- /*
- * This is the service thread,
- * and the queue is already
- * noenabled. The check for
- * canput and the putbq is not
- * atomic. So we need to check
- * again.
- */
- if (canput(stq->q_next))
- connp->conn_did_putbq
- = 0;
- IP_STAT(ipst, ip_conn_flputbq);
- } else {
- /*
- * We are not the service proc.
- * ip_wsrv will be scheduled or
- * is already running.
- */
-
- (void) putq(connp->conn_wq,
- first_mp);
- }
- } else {
- out_ill = (ill_t *)stq->q_ptr;
- BUMP_MIB(out_ill->ill_ip_mib,
- ipIfStatsOutDiscards);
- freemsg(first_mp);
- TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END,
- "ip_wput_ire_end: q %p (%S)",
- q, "discard");
- }
- ire_refrele(ire);
- if (next_mp) {
- ire_refrele(ire1);
- freemsg(next_mp);
- }
- if (conn_outgoing_ill != NULL)
- ill_refrele(conn_outgoing_ill);
- return;
- }
- if ((PROTO == IPPROTO_UDP) &&
- (ip_hdr_included != IP_HDR_INCLUDED)) {
- /*
- * hlen gets the number of uchar_ts in the
- * IP header
- */
- hlen = (V_HLEN & 0xF) << 2;
- up = IPH_UDPH_CHECKSUMP(ipha, hlen);
- max_frag = ire->ire_max_frag;
- if (*up != 0) {
- IP_CKSUM_XMIT(out_ill, ire, mp, ipha,
- up, PROTO, hlen, LENGTH, max_frag,
- ipsec_len, cksum);
- /* Software checksum? */
- if (DB_CKSUMFLAGS(mp) == 0) {
- IP_STAT(ipst, ip_out_sw_cksum);
- IP_STAT_UPDATE(ipst,
- ip_udp_out_sw_cksum_bytes,
- LENGTH - hlen);
- }
- }
- }
- }
- /*
- * Need to do this even when fragmenting. The local
- * loopback can be done without computing checksums
- * but forwarding out other interface must be done
- * after the IP checksum (and ULP checksums) have been
- * computed.
- *
- * NOTE : multicast_forward is set only if this packet
- * originated from ip_wput. For packets originating from
- * ip_wput_multicast, it is not set.
- */
- if (CLASSD(ipha->ipha_dst) && multicast_forward) {
-multi_loopback:
- ip2dbg(("ip_wput: multicast, loop %d\n",
- conn_multicast_loop));
-
- /* Forget header checksum offload */
- DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM;
-
- /*
- * Local loopback of multicasts? Check the
- * ill.
- *
- * Note that the loopback function will not come
- * in through ip_rput - it will only do the
- * client fanout thus we need to do an mforward
- * as well. The is different from the BSD
- * logic.
- */
- if (ill != NULL) {
- if (ilm_lookup_ill(ill, ipha->ipha_dst,
- ALL_ZONES) != NULL) {
- /*
- * Pass along the virtual output q.
- * ip_wput_local() will distribute the
- * packet to all the matching zones,
- * except the sending zone when
- * IP_MULTICAST_LOOP is false.
- */
- ip_multicast_loopback(q, ill, first_mp,
- conn_multicast_loop ? 0 :
- IP_FF_NO_MCAST_LOOP, zoneid);
- }
- }
- if (ipha->ipha_ttl == 0) {
- /*
- * 0 => only to this host i.e. we are
- * done. We are also done if this was the
- * loopback interface since it is sufficient
- * to loopback one copy of a multicast packet.
- */
- freemsg(first_mp);
- TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END,
- "ip_wput_ire_end: q %p (%S)",
- q, "loopback");
- ire_refrele(ire);
- if (conn_outgoing_ill != NULL)
- ill_refrele(conn_outgoing_ill);
- return;
- }
- /*
- * ILLF_MULTICAST is checked in ip_newroute
- * i.e. we don't need to check it here since
- * all IRE_CACHEs come from ip_newroute.
- * For multicast traffic, SO_DONTROUTE is interpreted
- * to mean only send the packet out the interface
- * (optionally specified with IP_MULTICAST_IF)
- * and do not forward it out additional interfaces.
- * RSVP and the rsvp daemon is an example of a
- * protocol and user level process that
- * handles it's own routing. Hence, it uses the
- * SO_DONTROUTE option to accomplish this.
- */
-
- if (ipst->ips_ip_g_mrouter && !conn_dontroute &&
- ill != NULL) {
- /* Unconditionally redo the checksum */
- ipha->ipha_hdr_checksum = 0;
- ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
-
- /*
- * If this needs to go out secure, we need
- * to wait till we finish the IPsec
- * processing.
- */
- if (ipsec_len == 0 &&
- ip_mforward(ill, ipha, mp)) {
- freemsg(first_mp);
- ip1dbg(("ip_wput: mforward failed\n"));
- TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END,
- "ip_wput_ire_end: q %p (%S)",
- q, "mforward failed");
- ire_refrele(ire);
- if (conn_outgoing_ill != NULL)
- ill_refrele(conn_outgoing_ill);
- return;
- }
- }
- }
- max_frag = ire->ire_max_frag;
- cksum += ttl_protocol;
- if (max_frag >= (uint_t)(LENGTH + ipsec_len)) {
- /* No fragmentation required for this one. */
- /*
- * Don't use frag_flag if packet is pre-built or source
- * routed or if multicast (since multicast packets do
- * not solicit ICMP "packet too big" messages).
- */
- if ((ip_hdr_included != IP_HDR_INCLUDED) &&
- (V_HLEN == IP_SIMPLE_HDR_VERSION ||
- !ip_source_route_included(ipha)) &&
- !CLASSD(ipha->ipha_dst))
- ipha->ipha_fragment_offset_and_flags |=
- htons(ire->ire_frag_flag);
-
- if (!(DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM)) {
- /* Complete the IP header checksum. */
- cksum += ipha->ipha_ident;
- cksum += (v_hlen_tos_len >> 16)+
- (v_hlen_tos_len & 0xFFFF);
- cksum += ipha->ipha_fragment_offset_and_flags;
- hlen = (V_HLEN & 0xF) -
- IP_SIMPLE_HDR_LENGTH_IN_WORDS;
- if (hlen) {
-checksumoptions:
- /*
- * Account for the IP Options in the IP
- * header checksum.
- */
- up = (uint16_t *)(rptr+
- IP_SIMPLE_HDR_LENGTH);
- do {
- cksum += up[0];
- cksum += up[1];
- up += 2;
- } while (--hlen);
- }
- cksum = ((cksum & 0xFFFF) + (cksum >> 16));
- cksum = ~(cksum + (cksum >> 16));
- ipha->ipha_hdr_checksum = (uint16_t)cksum;
- }
- if (ipsec_len != 0) {
- ipsec_out_process(q, first_mp, ire, ill_index);
- if (!next_mp) {
- ire_refrele(ire);
- if (conn_outgoing_ill != NULL)
- ill_refrele(conn_outgoing_ill);
- return;
- }
- goto next;
- }
-
- /*
- * multirt_send has already been handled
- * for broadcast, but not yet for multicast
- * or IP options.
- */
- if (next_mp == NULL) {
- if (ire->ire_flags & RTF_MULTIRT) {
- multirt_send = B_TRUE;
- }
- }
-
- /*
- * In most cases, the emission loop below is
- * entered only once. Only in the case where
- * the ire holds the RTF_MULTIRT flag, do we loop
- * to process all RTF_MULTIRT ires in the bucket,
- * and send the packet through all crossed
- * RTF_MULTIRT routes.
- */
- do {
- if (multirt_send) {
- irb_t *irb;
-
- irb = ire->ire_bucket;
- ASSERT(irb != NULL);
- /*
- * We are in a multiple send case,
- * need to get the next IRE and make
- * a duplicate of the packet.
- */
- IRB_REFHOLD(irb);
- for (ire1 = ire->ire_next;
- ire1 != NULL;
- ire1 = ire1->ire_next) {
- if (!(ire1->ire_flags &
- RTF_MULTIRT))
- continue;
-
- if (ire1->ire_addr !=
- ire->ire_addr)
- continue;
-
- if (ire1->ire_marks &
- (IRE_MARK_CONDEMNED |
- IRE_MARK_TESTHIDDEN))
- continue;
-
- /* Got one */
- IRE_REFHOLD(ire1);
- break;
- }
- IRB_REFRELE(irb);
-
- if (ire1 != NULL) {
- next_mp = copyb(mp);
- if ((next_mp == NULL) ||
- ((mp->b_cont != NULL) &&
- ((next_mp->b_cont =
- dupmsg(mp->b_cont))
- == NULL))) {
- freemsg(next_mp);
- next_mp = NULL;
- ire_refrele(ire1);
- ire1 = NULL;
- }
- }
-
- /*
- * Last multiroute ire; don't loop
- * anymore. The emission is over
- * and next_mp is NULL.
- */
- if (ire1 == NULL) {
- multirt_send = B_FALSE;
- }
- }
-
- out_ill = ire_to_ill(ire);
- DTRACE_PROBE4(ip4__physical__out__start,
- ill_t *, NULL,
- ill_t *, out_ill,
- ipha_t *, ipha, mblk_t *, mp);
- FW_HOOKS(ipst->ips_ip4_physical_out_event,
- ipst->ips_ipv4firewall_physical_out,
- NULL, out_ill, ipha, mp, mp, 0, ipst);
- DTRACE_PROBE1(ip4__physical__out__end,
- mblk_t *, mp);
- if (mp == NULL)
- goto release_ire_and_ill_2;
-
- ASSERT(ipsec_len == 0);
- mp->b_prev =
- SET_BPREV_FLAG(IPP_LOCAL_OUT);
- DTRACE_PROBE2(ip__xmit__2,
- mblk_t *, mp, ire_t *, ire);
- pktxmit_state = ip_xmit_v4(mp, ire,
- NULL, B_TRUE, connp);
- if ((pktxmit_state == SEND_FAILED) ||
- (pktxmit_state == LLHDR_RESLV_FAILED)) {
-release_ire_and_ill_2:
- if (next_mp) {
- freemsg(next_mp);
- ire_refrele(ire1);
- }
- ire_refrele(ire);
- TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END,
- "ip_wput_ire_end: q %p (%S)",
- q, "discard MDATA");
- if (conn_outgoing_ill != NULL)
- ill_refrele(conn_outgoing_ill);
- return;
- }
-
- if (CLASSD(dst)) {
- BUMP_MIB(out_ill->ill_ip_mib,
- ipIfStatsHCOutMcastPkts);
- UPDATE_MIB(out_ill->ill_ip_mib,
- ipIfStatsHCOutMcastOctets,
- LENGTH);
- } else if (ire->ire_type == IRE_BROADCAST) {
- BUMP_MIB(out_ill->ill_ip_mib,
- ipIfStatsHCOutBcastPkts);
- }
-
- if (multirt_send) {
- /*
- * We are in a multiple send case,
- * need to re-enter the sending loop
- * using the next ire.
- */
- ire_refrele(ire);
- ire = ire1;
- stq = ire->ire_stq;
- mp = next_mp;
- next_mp = NULL;
- ipha = (ipha_t *)mp->b_rptr;
- ill_index = Q_TO_INDEX(stq);
- }
- } while (multirt_send);
-
- if (!next_mp) {
- /*
- * Last copy going out (the ultra-common
- * case). Note that we intentionally replicate
- * the putnext rather than calling it before
- * the next_mp check in hopes of a little
- * tail-call action out of the compiler.
- */
- TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END,
- "ip_wput_ire_end: q %p (%S)",
- q, "last copy out(1)");
- ire_refrele(ire);
- if (conn_outgoing_ill != NULL)
- ill_refrele(conn_outgoing_ill);
- return;
- }
- /* More copies going out below. */
- } else {
- int offset;
-fragmentit:
- offset = ntohs(ipha->ipha_fragment_offset_and_flags);
- /*
- * If this would generate a icmp_frag_needed message,
- * we need to handle it before we do the IPsec
- * processing. Otherwise, we need to strip the IPsec
- * headers before we send up the message to the ULPs
- * which becomes messy and difficult.
- */
- if (ipsec_len != 0) {
- if ((max_frag < (unsigned int)(LENGTH +
- ipsec_len)) && (offset & IPH_DF)) {
- out_ill = (ill_t *)stq->q_ptr;
- BUMP_MIB(out_ill->ill_ip_mib,
- ipIfStatsOutFragFails);
- BUMP_MIB(out_ill->ill_ip_mib,
- ipIfStatsOutFragReqds);
- ipha->ipha_hdr_checksum = 0;
- ipha->ipha_hdr_checksum =
- (uint16_t)ip_csum_hdr(ipha);
- icmp_frag_needed(ire->ire_stq, first_mp,
- max_frag, zoneid, ipst);
- if (!next_mp) {
- ire_refrele(ire);
- if (conn_outgoing_ill != NULL) {
- ill_refrele(
- conn_outgoing_ill);
- }
- return;
- }
- } else {
- /*
- * This won't cause a icmp_frag_needed
- * message. to be generated. Send it on
- * the wire. Note that this could still
- * cause fragmentation and all we
- * do is the generation of the message
- * to the ULP if needed before IPsec.
- */
- if (!next_mp) {
- ipsec_out_process(q, first_mp,
- ire, ill_index);
- TRACE_2(TR_FAC_IP,
- TR_IP_WPUT_IRE_END,
- "ip_wput_ire_end: q %p "
- "(%S)", q,
- "last ipsec_out_process");
- ire_refrele(ire);
- if (conn_outgoing_ill != NULL) {
- ill_refrele(
- conn_outgoing_ill);
- }
- return;
- }
- ipsec_out_process(q, first_mp,
- ire, ill_index);
- }
- } else {
- /*
- * Initiate IPPF processing. For
- * fragmentable packets we finish
- * all QOS packet processing before
- * calling:
- * ip_wput_ire_fragmentit->ip_wput_frag
- */
-
- if (IPP_ENABLED(IPP_LOCAL_OUT, ipst)) {
- ip_process(IPP_LOCAL_OUT, &mp,
- ill_index);
- if (mp == NULL) {
- out_ill = (ill_t *)stq->q_ptr;
- BUMP_MIB(out_ill->ill_ip_mib,
- ipIfStatsOutDiscards);
- if (next_mp != NULL) {
- freemsg(next_mp);
- ire_refrele(ire1);
- }
- ire_refrele(ire);
- TRACE_2(TR_FAC_IP,
- TR_IP_WPUT_IRE_END,
- "ip_wput_ire: q %p (%S)",
- q, "discard MDATA");
- if (conn_outgoing_ill != NULL) {
- ill_refrele(
- conn_outgoing_ill);
- }
- return;
- }
- }
- if (!next_mp) {
- TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END,
- "ip_wput_ire_end: q %p (%S)",
- q, "last fragmentation");
- ip_wput_ire_fragmentit(mp, ire,
- zoneid, ipst, connp);
- ire_refrele(ire);
- if (conn_outgoing_ill != NULL)
- ill_refrele(conn_outgoing_ill);
- return;
- }
- ip_wput_ire_fragmentit(mp, ire,
- zoneid, ipst, connp);
- }
- }
- } else {
-nullstq:
- /* A NULL stq means the destination address is local. */
- UPDATE_OB_PKT_COUNT(ire);
- ire->ire_last_used_time = lbolt;
- ASSERT(ire->ire_ipif != NULL);
- if (!next_mp) {
- /*
- * Is there an "in" and "out" for traffic local
- * to a host (loopback)? The code in Solaris doesn't
- * explicitly draw a line in its code for in vs out,
- * so we've had to draw a line in the sand: ip_wput_ire
- * is considered to be the "output" side and
- * ip_wput_local to be the "input" side.
- */
- out_ill = ire_to_ill(ire);
-
- /*
- * DTrace this as ip:::send. A blocked packet will
- * fire the send probe, but not the receive probe.
- */
- DTRACE_IP7(send, mblk_t *, first_mp, conn_t *, NULL,
- void_ip_t *, ipha, __dtrace_ipsr_ill_t *, out_ill,
- ipha_t *, ipha, ip6_t *, NULL, int, 1);
-
- DTRACE_PROBE4(ip4__loopback__out__start,
- ill_t *, NULL, ill_t *, out_ill,
- ipha_t *, ipha, mblk_t *, first_mp);
-
- FW_HOOKS(ipst->ips_ip4_loopback_out_event,
- ipst->ips_ipv4firewall_loopback_out,
- NULL, out_ill, ipha, first_mp, mp, 0, ipst);
-
- DTRACE_PROBE1(ip4__loopback__out_end,
- mblk_t *, first_mp);
-
- TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END,
- "ip_wput_ire_end: q %p (%S)",
- q, "local address");
-
- if (first_mp != NULL)
- ip_wput_local(q, out_ill, ipha,
- first_mp, ire, 0, ire->ire_zoneid);
- ire_refrele(ire);
- if (conn_outgoing_ill != NULL)
- ill_refrele(conn_outgoing_ill);
- return;
- }
-
- out_ill = ire_to_ill(ire);
-
- /*
- * DTrace this as ip:::send. A blocked packet will fire the
- * send probe, but not the receive probe.
- */
- DTRACE_IP7(send, mblk_t *, first_mp, conn_t *, NULL,
- void_ip_t *, ipha, __dtrace_ipsr_ill_t *, out_ill,
- ipha_t *, ipha, ip6_t *, NULL, int, 1);
-
- DTRACE_PROBE4(ip4__loopback__out__start,
- ill_t *, NULL, ill_t *, out_ill,
- ipha_t *, ipha, mblk_t *, first_mp);
-
- FW_HOOKS(ipst->ips_ip4_loopback_out_event,
- ipst->ips_ipv4firewall_loopback_out,
- NULL, out_ill, ipha, first_mp, mp, 0, ipst);
-
- DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, first_mp);
-
- if (first_mp != NULL)
- ip_wput_local(q, out_ill, ipha,
- first_mp, ire, 0, ire->ire_zoneid);
- }
-next:
- /*
- * More copies going out to additional interfaces.
- * ire1 has already been held. We don't need the
- * "ire" anymore.
- */
- ire_refrele(ire);
- ire = ire1;
- ASSERT(ire != NULL && ire->ire_refcnt >= 1 && next_mp != NULL);
- mp = next_mp;
- ASSERT(ire->ire_ipversion == IPV4_VERSION);
- ill = ire_to_ill(ire);
- first_mp = mp;
- if (ipsec_len != 0) {
- ASSERT(first_mp->b_datap->db_type == M_CTL);
- mp = mp->b_cont;
- }
- dst = ire->ire_addr;
- ipha = (ipha_t *)mp->b_rptr;
- /*
- * Restore src so that we will pick up ire->ire_src_addr if src was 0.
- * Restore ipha_ident "no checksum" flag.
- */
- src = orig_src;
- ipha->ipha_ident = ip_hdr_included;
- goto another;
-
-#undef rptr
-#undef Q_TO_INDEX
-}
-
-/*
- * Routine to allocate a message that is used to notify the ULP about MDT.
- * The caller may provide a pointer to the link-layer MDT capabilities,
- * or NULL if MDT is to be disabled on the stream.
- */
-mblk_t *
-ip_mdinfo_alloc(ill_mdt_capab_t *isrc)
-{
- mblk_t *mp;
- ip_mdt_info_t *mdti;
- ill_mdt_capab_t *idst;
-
- if ((mp = allocb(sizeof (*mdti), BPRI_HI)) != NULL) {
- DB_TYPE(mp) = M_CTL;
- mp->b_wptr = mp->b_rptr + sizeof (*mdti);
- mdti = (ip_mdt_info_t *)mp->b_rptr;
- mdti->mdt_info_id = MDT_IOC_INFO_UPDATE;
- idst = &(mdti->mdt_capab);
-
- /*
- * If the caller provides us with the capability, copy
- * it over into our notification message; otherwise
- * we zero out the capability portion.
- */
- if (isrc != NULL)
- bcopy((caddr_t)isrc, (caddr_t)idst, sizeof (*idst));
- else
- bzero((caddr_t)idst, sizeof (*idst));
- }
- return (mp);
-}
-
-/*
- * Routine which determines whether MDT can be enabled on the destination
- * IRE and IPC combination, and if so, allocates and returns the MDT
- * notification mblk that may be used by ULP. We also check if we need to
- * turn MDT back to 'on' when certain restrictions prohibiting us to allow
- * MDT usage in the past have been lifted. This gets called during IP
- * and ULP binding.
- */
-mblk_t *
-ip_mdinfo_return(ire_t *dst_ire, conn_t *connp, char *ill_name,
- ill_mdt_capab_t *mdt_cap)
-{
- mblk_t *mp;
- boolean_t rc = B_FALSE;
- ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
-
- ASSERT(dst_ire != NULL);
- ASSERT(connp != NULL);
- ASSERT(mdt_cap != NULL);
-
- /*
- * Currently, we only support simple TCP/{IPv4,IPv6} with
- * Multidata, which is handled in tcp_multisend(). This
- * is the reason why we do all these checks here, to ensure
- * that we don't enable Multidata for the cases which we
- * can't handle at the moment.
- */
- do {
- /* Only do TCP at the moment */
- if (connp->conn_ulp != IPPROTO_TCP)
- break;
-
- /*
- * IPsec outbound policy present? Note that we get here
- * after calling ipsec_conn_cache_policy() where the global
- * policy checking is performed. conn_latch will be
- * non-NULL as long as there's a policy defined,
- * i.e. conn_out_enforce_policy may be NULL in such case
- * when the connection is non-secure, and hence we check
- * further if the latch refers to an outbound policy.
- */
- if (CONN_IPSEC_OUT_ENCAPSULATED(connp))
- break;
-
- /* CGTP (multiroute) is enabled? */
- if (dst_ire->ire_flags & RTF_MULTIRT)
- break;
-
- /* Outbound IPQoS enabled? */
- if (IPP_ENABLED(IPP_LOCAL_OUT, ipst)) {
- /*
- * In this case, we disable MDT for this and all
- * future connections going over the interface.
- */
- mdt_cap->ill_mdt_on = 0;
- break;
- }
-
- /* socket option(s) present? */
- if (!CONN_IS_LSO_MD_FASTPATH(connp))
- break;
-
- rc = B_TRUE;
- /* CONSTCOND */
- } while (0);
-
- /* Remember the result */
- connp->conn_mdt_ok = rc;
-
- if (!rc)
- return (NULL);
- else if (!mdt_cap->ill_mdt_on) {
- /*
- * If MDT has been previously turned off in the past, and we
- * currently can do MDT (due to IPQoS policy removal, etc.)
- * then enable it for this interface.
- */
- mdt_cap->ill_mdt_on = 1;
- ip1dbg(("ip_mdinfo_return: reenabling MDT for "
- "interface %s\n", ill_name));
- }
-
- /* Allocate the MDT info mblk */
- if ((mp = ip_mdinfo_alloc(mdt_cap)) == NULL) {
- ip0dbg(("ip_mdinfo_return: can't enable Multidata for "
- "conn %p on %s (ENOMEM)\n", (void *)connp, ill_name));
- return (NULL);
- }
- return (mp);
-}
-
-/*
- * Routine to allocate a message that is used to notify the ULP about LSO.
- * The caller may provide a pointer to the link-layer LSO capabilities,
- * or NULL if LSO is to be disabled on the stream.
- */
-mblk_t *
-ip_lsoinfo_alloc(ill_lso_capab_t *isrc)
-{
- mblk_t *mp;
- ip_lso_info_t *lsoi;
- ill_lso_capab_t *idst;
-
- if ((mp = allocb(sizeof (*lsoi), BPRI_HI)) != NULL) {
- DB_TYPE(mp) = M_CTL;
- mp->b_wptr = mp->b_rptr + sizeof (*lsoi);
- lsoi = (ip_lso_info_t *)mp->b_rptr;
- lsoi->lso_info_id = LSO_IOC_INFO_UPDATE;
- idst = &(lsoi->lso_capab);
-
- /*
- * If the caller provides us with the capability, copy
- * it over into our notification message; otherwise
- * we zero out the capability portion.
- */
- if (isrc != NULL)
- bcopy((caddr_t)isrc, (caddr_t)idst, sizeof (*idst));
- else
- bzero((caddr_t)idst, sizeof (*idst));
- }
- return (mp);
-}
-
-/*
- * Routine which determines whether LSO can be enabled on the destination
- * IRE and IPC combination, and if so, allocates and returns the LSO
- * notification mblk that may be used by ULP. We also check if we need to
- * turn LSO back to 'on' when certain restrictions prohibiting us to allow
- * LSO usage in the past have been lifted. This gets called during IP
- * and ULP binding.
- */
-mblk_t *
-ip_lsoinfo_return(ire_t *dst_ire, conn_t *connp, char *ill_name,
- ill_lso_capab_t *lso_cap)
-{
- mblk_t *mp;
- ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
-
- ASSERT(dst_ire != NULL);
- ASSERT(connp != NULL);
- ASSERT(lso_cap != NULL);
-
- connp->conn_lso_ok = B_TRUE;
-
- if ((connp->conn_ulp != IPPROTO_TCP) ||
- CONN_IPSEC_OUT_ENCAPSULATED(connp) ||
- (dst_ire->ire_flags & RTF_MULTIRT) ||
- !CONN_IS_LSO_MD_FASTPATH(connp) ||
- (IPP_ENABLED(IPP_LOCAL_OUT, ipst))) {
- connp->conn_lso_ok = B_FALSE;
- if (IPP_ENABLED(IPP_LOCAL_OUT, ipst)) {
- /*
- * Disable LSO for this and all future connections going
- * over the interface.
- */
- lso_cap->ill_lso_on = 0;
- }
- }
-
- if (!connp->conn_lso_ok)
- return (NULL);
- else if (!lso_cap->ill_lso_on) {
- /*
- * If LSO has been previously turned off in the past, and we
- * currently can do LSO (due to IPQoS policy removal, etc.)
- * then enable it for this interface.
- */
- lso_cap->ill_lso_on = 1;
- ip1dbg(("ip_mdinfo_return: reenabling LSO for interface %s\n",
- ill_name));
- }
-
- /* Allocate the LSO info mblk */
- if ((mp = ip_lsoinfo_alloc(lso_cap)) == NULL)
- ip0dbg(("ip_lsoinfo_return: can't enable LSO for "
- "conn %p on %s (ENOMEM)\n", (void *)connp, ill_name));
-
- return (mp);
-}
-
-/*
- * Create destination address attribute, and fill it with the physical
- * destination address and SAP taken from the template DL_UNITDATA_REQ
- * message block.
- */
-boolean_t
-ip_md_addr_attr(multidata_t *mmd, pdesc_t *pd, const mblk_t *dlmp)
-{
- dl_unitdata_req_t *dlurp;
- pattr_t *pa;
- pattrinfo_t pa_info;
- pattr_addr_t **das = (pattr_addr_t **)&pa_info.buf;
- uint_t das_len, das_off;
-
- ASSERT(dlmp != NULL);
-
- dlurp = (dl_unitdata_req_t *)dlmp->b_rptr;
- das_len = dlurp->dl_dest_addr_length;
- das_off = dlurp->dl_dest_addr_offset;
-
- pa_info.type = PATTR_DSTADDRSAP;
- pa_info.len = sizeof (**das) + das_len - 1;
-
- /* create and associate the attribute */
- pa = mmd_addpattr(mmd, pd, &pa_info, B_TRUE, KM_NOSLEEP);
- if (pa != NULL) {
- ASSERT(*das != NULL);
- (*das)->addr_is_group = 0;
- (*das)->addr_len = (uint8_t)das_len;
- bcopy((caddr_t)dlurp + das_off, (*das)->addr, das_len);
- }
-
- return (pa != NULL);
-}
-
-/*
- * Create hardware checksum attribute and fill it with the values passed.
- */
-boolean_t
-ip_md_hcksum_attr(multidata_t *mmd, pdesc_t *pd, uint32_t start_offset,
- uint32_t stuff_offset, uint32_t end_offset, uint32_t flags)
-{
- pattr_t *pa;
- pattrinfo_t pa_info;
-
- ASSERT(mmd != NULL);
-
- pa_info.type = PATTR_HCKSUM;
- pa_info.len = sizeof (pattr_hcksum_t);
-
- /* create and associate the attribute */
- pa = mmd_addpattr(mmd, pd, &pa_info, B_TRUE, KM_NOSLEEP);
- if (pa != NULL) {
- pattr_hcksum_t *hck = (pattr_hcksum_t *)pa_info.buf;
-
- hck->hcksum_start_offset = start_offset;
- hck->hcksum_stuff_offset = stuff_offset;
- hck->hcksum_end_offset = end_offset;
- hck->hcksum_flags = flags;
- }
- return (pa != NULL);
-}
-
-/*
- * Create zerocopy attribute and fill it with the specified flags
- */
-boolean_t
-ip_md_zcopy_attr(multidata_t *mmd, pdesc_t *pd, uint_t flags)
-{
- pattr_t *pa;
- pattrinfo_t pa_info;
-
- ASSERT(mmd != NULL);
- pa_info.type = PATTR_ZCOPY;
- pa_info.len = sizeof (pattr_zcopy_t);
-
- /* create and associate the attribute */
- pa = mmd_addpattr(mmd, pd, &pa_info, B_TRUE, KM_NOSLEEP);
- if (pa != NULL) {
- pattr_zcopy_t *zcopy = (pattr_zcopy_t *)pa_info.buf;
-
- zcopy->zcopy_flags = flags;
- }
- return (pa != NULL);
-}
-
-/*
- * Check if ip_wput_frag_mdt() and ip_wput_frag_mdt_v6() can handle a message
- * block chain. We could rewrite to handle arbitrary message block chains but
- * that would make the code complicated and slow. Right now there three
- * restrictions:
- *
- * 1. The first message block must contain the complete IP header and
- * at least 1 byte of payload data.
- * 2. At most MULTIDATA_MAX_PBUFS non-empty message blocks are allowed
- * so that we can use a single Multidata message.
- * 3. No frag must be distributed over two or more message blocks so
- * that we don't need more than two packet descriptors per frag.
- *
- * The above restrictions allow us to support userland applications (which
- * will send down a single message block) and NFS over UDP (which will
- * send down a chain of at most three message blocks).
- *
- * We also don't use MDT for payloads with less than or equal to
- * ip_wput_frag_mdt_min bytes because it would cause too much overhead.
- */
-boolean_t
-ip_can_frag_mdt(mblk_t *mp, ssize_t hdr_len, ssize_t len)
-{
- int blocks;
- ssize_t total, missing, size;
-
- ASSERT(mp != NULL);
- ASSERT(hdr_len > 0);
-
- size = MBLKL(mp) - hdr_len;
- if (size <= 0)
- return (B_FALSE);
-
- /* The first mblk contains the header and some payload. */
- blocks = 1;
- total = size;
- size %= len;
- missing = (size == 0) ? 0 : (len - size);
- mp = mp->b_cont;
-
- while (mp != NULL) {
- /*
- * Give up if we encounter a zero length message block.
- * In practice, this should rarely happen and therefore
- * not worth the trouble of freeing and re-linking the
- * mblk from the chain to handle such case.
- */
- if ((size = MBLKL(mp)) == 0)
- return (B_FALSE);
-
- /* Too many payload buffers for a single Multidata message? */
- if (++blocks > MULTIDATA_MAX_PBUFS)
- return (B_FALSE);
-
- total += size;
- /* Is a frag distributed over two or more message blocks? */
- if (missing > size)
- return (B_FALSE);
- size -= missing;
-
- size %= len;
- missing = (size == 0) ? 0 : (len - size);
-
- mp = mp->b_cont;
- }
-
- return (total > ip_wput_frag_mdt_min);
-}
-
-/*
- * Outbound IPv4 fragmentation routine using MDT.
- */
-static void
-ip_wput_frag_mdt(ire_t *ire, mblk_t *mp, ip_pkt_t pkt_type, int len,
- uint32_t frag_flag, int offset)
-{
- ipha_t *ipha_orig;
- int i1, ip_data_end;
- uint_t pkts, wroff, hdr_chunk_len, pbuf_idx;
- mblk_t *hdr_mp, *md_mp = NULL;
- unsigned char *hdr_ptr, *pld_ptr;
- multidata_t *mmd;
- ip_pdescinfo_t pdi;
- ill_t *ill;
- ip_stack_t *ipst = ire->ire_ipst;
-
- ASSERT(DB_TYPE(mp) == M_DATA);
- ASSERT(MBLKL(mp) > sizeof (ipha_t));
-
- ill = ire_to_ill(ire);
- ASSERT(ill != NULL);
-
- ipha_orig = (ipha_t *)mp->b_rptr;
- mp->b_rptr += sizeof (ipha_t);
-
- /* Calculate how many packets we will send out */
- i1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgsize(mp);
- pkts = (i1 + len - 1) / len;
- ASSERT(pkts > 1);
-
- /* Allocate a message block which will hold all the IP Headers. */
- wroff = ipst->ips_ip_wroff_extra;
- hdr_chunk_len = wroff + IP_SIMPLE_HDR_LENGTH;
-
- i1 = pkts * hdr_chunk_len;
- /*
- * Create the header buffer, Multidata and destination address
- * and SAP attribute that should be associated with it.
- */
- if ((hdr_mp = allocb(i1, BPRI_HI)) == NULL ||
- ((hdr_mp->b_wptr += i1),
- (mmd = mmd_alloc(hdr_mp, &md_mp, KM_NOSLEEP)) == NULL) ||
- !ip_md_addr_attr(mmd, NULL, ire->ire_nce->nce_res_mp)) {
- freemsg(mp);
- if (md_mp == NULL) {
- freemsg(hdr_mp);
- } else {
-free_mmd: IP_STAT(ipst, ip_frag_mdt_discarded);
- freemsg(md_mp);
- }
- IP_STAT(ipst, ip_frag_mdt_allocfail);
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
- return;
- }
- IP_STAT(ipst, ip_frag_mdt_allocd);
-
- /*
- * Add a payload buffer to the Multidata; this operation must not
- * fail, or otherwise our logic in this routine is broken. There
- * is no memory allocation done by the routine, so any returned
- * failure simply tells us that we've done something wrong.
- *
- * A failure tells us that either we're adding the same payload
- * buffer more than once, or we're trying to add more buffers than
- * allowed. None of the above cases should happen, and we panic
- * because either there's horrible heap corruption, and/or
- * programming mistake.
- */
- if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0)
- goto pbuf_panic;
-
- hdr_ptr = hdr_mp->b_rptr;
- pld_ptr = mp->b_rptr;
-
- /* Establish the ending byte offset, based on the starting offset. */
- offset <<= 3;
- ip_data_end = offset + ntohs(ipha_orig->ipha_length) -
- IP_SIMPLE_HDR_LENGTH;
-
- pdi.flags = PDESC_HBUF_REF | PDESC_PBUF_REF;
-
- while (pld_ptr < mp->b_wptr) {
- ipha_t *ipha;
- uint16_t offset_and_flags;
- uint16_t ip_len;
- int error;
-
- ASSERT((hdr_ptr + hdr_chunk_len) <= hdr_mp->b_wptr);
- ipha = (ipha_t *)(hdr_ptr + wroff);
- ASSERT(OK_32PTR(ipha));
- *ipha = *ipha_orig;
-
- if (ip_data_end - offset > len) {
- offset_and_flags = IPH_MF;
- } else {
- /*
- * Last frag. Set len to the length of this last piece.
- */
- len = ip_data_end - offset;
- /* A frag of a frag might have IPH_MF non-zero */
- offset_and_flags =
- ntohs(ipha->ipha_fragment_offset_and_flags) &
- IPH_MF;
- }
- offset_and_flags |= (uint16_t)(offset >> 3);
- offset_and_flags |= (uint16_t)frag_flag;
- /* Store the offset and flags in the IP header. */
- ipha->ipha_fragment_offset_and_flags = htons(offset_and_flags);
-
- /* Store the length in the IP header. */
- ip_len = (uint16_t)(len + IP_SIMPLE_HDR_LENGTH);
- ipha->ipha_length = htons(ip_len);
-
- /*
- * Set the IP header checksum. Note that mp is just
- * the header, so this is easy to pass to ip_csum.
- */
- ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
-
- DTRACE_IP7(send, mblk_t *, md_mp, conn_t *, NULL, void_ip_t *,
- ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *,
- NULL, int, 0);
-
- /*
- * Record offset and size of header and data of the next packet
- * in the multidata message.
- */
- PDESC_HDR_ADD(&pdi, hdr_ptr, wroff, IP_SIMPLE_HDR_LENGTH, 0);
- PDESC_PLD_INIT(&pdi);
- i1 = MIN(mp->b_wptr - pld_ptr, len);
- ASSERT(i1 > 0);
- PDESC_PLD_SPAN_ADD(&pdi, pbuf_idx, pld_ptr, i1);
- if (i1 == len) {
- pld_ptr += len;
- } else {
- i1 = len - i1;
- mp = mp->b_cont;
- ASSERT(mp != NULL);
- ASSERT(MBLKL(mp) >= i1);
- /*
- * Attach the next payload message block to the
- * multidata message.
- */
- if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0)
- goto pbuf_panic;
- PDESC_PLD_SPAN_ADD(&pdi, pbuf_idx, mp->b_rptr, i1);
- pld_ptr = mp->b_rptr + i1;
- }
-
- if ((mmd_addpdesc(mmd, (pdescinfo_t *)&pdi, &error,
- KM_NOSLEEP)) == NULL) {
- /*
- * Any failure other than ENOMEM indicates that we
- * have passed in invalid pdesc info or parameters
- * to mmd_addpdesc, which must not happen.
- *
- * EINVAL is a result of failure on boundary checks
- * against the pdesc info contents. It should not
- * happen, and we panic because either there's
- * horrible heap corruption, and/or programming
- * mistake.
- */
- if (error != ENOMEM) {
- cmn_err(CE_PANIC, "ip_wput_frag_mdt: "
- "pdesc logic error detected for "
- "mmd %p pinfo %p (%d)\n",
- (void *)mmd, (void *)&pdi, error);
- /* NOTREACHED */
- }
- IP_STAT(ipst, ip_frag_mdt_addpdescfail);
- /* Free unattached payload message blocks as well */
- md_mp->b_cont = mp->b_cont;
- goto free_mmd;
- }
-
- /* Advance fragment offset. */
- offset += len;
-
- /* Advance to location for next header in the buffer. */
- hdr_ptr += hdr_chunk_len;
-
- /* Did we reach the next payload message block? */
- if (pld_ptr == mp->b_wptr && mp->b_cont != NULL) {
- mp = mp->b_cont;
- /*
- * Attach the next message block with payload
- * data to the multidata message.
- */
- if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0)
- goto pbuf_panic;
- pld_ptr = mp->b_rptr;
- }
- }
-
- ASSERT(hdr_mp->b_wptr == hdr_ptr);
- ASSERT(mp->b_wptr == pld_ptr);
-
- /* Update IP statistics */
- IP_STAT_UPDATE(ipst, ip_frag_mdt_pkt_out, pkts);
-
- UPDATE_MIB(ill->ill_ip_mib, ipIfStatsOutFragCreates, pkts);
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragOKs);
-
- len = ntohs(ipha_orig->ipha_length) + (pkts - 1) * IP_SIMPLE_HDR_LENGTH;
- UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutTransmits, pkts);
- UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets, len);
-
- if (pkt_type == OB_PKT) {
- ire->ire_ob_pkt_count += pkts;
- if (ire->ire_ipif != NULL)
- atomic_add_32(&ire->ire_ipif->ipif_ob_pkt_count, pkts);
- } else {
- /* The type is IB_PKT in the forwarding path. */
- ire->ire_ib_pkt_count += pkts;
- ASSERT(!IRE_IS_LOCAL(ire));
- if (ire->ire_type & IRE_BROADCAST) {
- atomic_add_32(&ire->ire_ipif->ipif_ib_pkt_count, pkts);
- } else {
- UPDATE_MIB(ill->ill_ip_mib,
- ipIfStatsHCOutForwDatagrams, pkts);
- atomic_add_32(&ire->ire_ipif->ipif_fo_pkt_count, pkts);
- }
- }
- ire->ire_last_used_time = lbolt;
- /* Send it down */
- putnext(ire->ire_stq, md_mp);
- return;
-
-pbuf_panic:
- cmn_err(CE_PANIC, "ip_wput_frag_mdt: payload buffer logic "
- "error for mmd %p pbuf %p (%d)", (void *)mmd, (void *)mp,
- pbuf_idx);
- /* NOTREACHED */
-}
-
/*
* Outbound IP fragmentation routine.
- *
- * NOTE : This routine does not ire_refrele the ire that is passed in
- * as the argument.
+ * Assumes the caller has checked whether or not fragmentation should
+ * be allowed. Here we copy the DF bit from the header to all the generated
+ * fragments.
*/
-static void
-ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag,
- uint32_t frag_flag, zoneid_t zoneid, ip_stack_t *ipst, conn_t *connp)
+int
+ip_fragment_v4(mblk_t *mp_orig, nce_t *nce, iaflags_t ixaflags,
+ uint_t pkt_len, uint32_t max_frag, uint32_t xmit_hint, zoneid_t szone,
+ zoneid_t nolzid, pfirepostfrag_t postfragfn, uintptr_t *ixa_cookie)
{
int i1;
- mblk_t *ll_hdr_mp;
- int ll_hdr_len;
int hdr_len;
mblk_t *hdr_mp;
ipha_t *ipha;
int ip_data_end;
int len;
- mblk_t *mp = mp_orig, *mp1;
+ mblk_t *mp = mp_orig;
int offset;
- queue_t *q;
- uint32_t v_hlen_tos_len;
- mblk_t *first_mp;
- boolean_t mctl_present;
- ill_t *ill;
- ill_t *out_ill;
- mblk_t *xmit_mp;
+ ill_t *ill = nce->nce_ill;
+ ip_stack_t *ipst = ill->ill_ipst;
mblk_t *carve_mp;
- ire_t *ire1 = NULL;
- ire_t *save_ire = NULL;
- mblk_t *next_mp = NULL;
- boolean_t last_frag = B_FALSE;
- boolean_t multirt_send = B_FALSE;
- ire_t *first_ire = NULL;
- irb_t *irb = NULL;
- mib2_ipIfStatsEntry_t *mibptr = NULL;
-
- ill = ire_to_ill(ire);
- mibptr = (ill != NULL) ? ill->ill_ip_mib : &ipst->ips_ip_mib;
+ uint32_t frag_flag;
+ uint_t priority = mp->b_band;
+ int error = 0;
- BUMP_MIB(mibptr, ipIfStatsOutFragReqds);
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragReqds);
- if (max_frag == 0) {
- ip1dbg(("ip_wput_frag: ire frag size is 0"
- " - dropping packet\n"));
- BUMP_MIB(mibptr, ipIfStatsOutFragFails);
+ if (pkt_len != msgdsize(mp)) {
+ ip0dbg(("Packet length mismatch: %d, %ld\n",
+ pkt_len, msgdsize(mp)));
freemsg(mp);
- return;
+ return (EINVAL);
}
- /*
- * IPsec does not allow hw accelerated packets to be fragmented
- * This check is made in ip_wput_ipsec_out prior to coming here
- * via ip_wput_ire_fragmentit.
- *
- * If at this point we have an ire whose ARP request has not
- * been sent out, we call ip_xmit_v4->ire_arpresolve to trigger
- * sending of ARP query and change ire's state to ND_INCOMPLETE.
- * This packet and all fragmentable packets for this ire will
- * continue to get dropped while ire_nce->nce_state remains in
- * ND_INCOMPLETE. Post-ARP resolution, after ire's nce_state changes to
- * ND_REACHABLE, all subsquent large packets for this ire will
- * get fragemented and sent out by this function.
- */
- if (ire->ire_nce && ire->ire_nce->nce_state != ND_REACHABLE) {
- /* If nce_state is ND_INITIAL, trigger ARP query */
- (void) ip_xmit_v4(NULL, ire, NULL, B_FALSE, NULL);
- ip1dbg(("ip_wput_frag: mac address for ire is unresolved"
- " - dropping packet\n"));
- BUMP_MIB(mibptr, ipIfStatsOutFragFails);
+ if (max_frag == 0) {
+ ip1dbg(("ip_fragment_v4: max_frag is zero. Dropping packet\n"));
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
+ ip_drop_output("FragFails: zero max_frag", mp, ill);
freemsg(mp);
- return;
- }
-
- TRACE_0(TR_FAC_IP, TR_IP_WPUT_FRAG_START,
- "ip_wput_frag_start:");
-
- if (mp->b_datap->db_type == M_CTL) {
- first_mp = mp;
- mp_orig = mp = mp->b_cont;
- mctl_present = B_TRUE;
- } else {
- first_mp = mp;
- mctl_present = B_FALSE;
+ return (EINVAL);
}
ASSERT(MBLKL(mp) >= sizeof (ipha_t));
ipha = (ipha_t *)mp->b_rptr;
+ ASSERT(ntohs(ipha->ipha_length) == pkt_len);
+ frag_flag = ntohs(ipha->ipha_fragment_offset_and_flags) & IPH_DF;
/*
- * If the Don't Fragment flag is on, generate an ICMP destination
- * unreachable, fragmentation needed.
- */
- offset = ntohs(ipha->ipha_fragment_offset_and_flags);
- if (offset & IPH_DF) {
- BUMP_MIB(mibptr, ipIfStatsOutFragFails);
- if (is_system_labeled()) {
- max_frag = tsol_pmtu_adjust(mp, ire->ire_max_frag,
- ire->ire_max_frag - max_frag, AF_INET);
- }
- /*
- * Need to compute hdr checksum if called from ip_wput_ire.
- * Note that ip_rput_forward verifies the checksum before
- * calling this routine so in that case this is a noop.
- */
- ipha->ipha_hdr_checksum = 0;
- ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
- icmp_frag_needed(ire->ire_stq, first_mp, max_frag, zoneid,
- ipst);
- TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END,
- "ip_wput_frag_end:(%S)",
- "don't fragment");
- return;
- }
- /*
- * Labeled systems adjust max_frag if they add a label
- * to send the correct path mtu. We need the real mtu since we
- * are fragmenting the packet after label adjustment.
- */
- if (is_system_labeled())
- max_frag = ire->ire_max_frag;
- if (mctl_present)
- freeb(first_mp);
- /*
* Establish the starting offset. May not be zero if we are fragging
* a fragment that is being forwarded.
*/
- offset = offset & IPH_OFFSET;
+ offset = ntohs(ipha->ipha_fragment_offset_and_flags) & IPH_OFFSET;
/* TODO why is this test needed? */
- v_hlen_tos_len = ((uint32_t *)ipha)[0];
- if (((max_frag - LENGTH) & ~7) < 8) {
+ if (((max_frag - ntohs(ipha->ipha_length)) & ~7) < 8) {
/* TODO: notify ulp somehow */
- BUMP_MIB(mibptr, ipIfStatsOutFragFails);
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
+ ip_drop_output("FragFails: bad starting offset", mp, ill);
freemsg(mp);
- TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END,
- "ip_wput_frag_end:(%S)",
- "len < 8");
- return;
+ return (EINVAL);
}
- hdr_len = (V_HLEN & 0xF) << 2;
-
+ hdr_len = IPH_HDR_LENGTH(ipha);
ipha->ipha_hdr_checksum = 0;
/*
@@ -24173,40 +11742,14 @@ ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag,
*/
len = (max_frag - hdr_len) & ~7;
- /* Check if we can use MDT to send out the frags. */
- ASSERT(!IRE_IS_LOCAL(ire));
- if (hdr_len == IP_SIMPLE_HDR_LENGTH &&
- ipst->ips_ip_multidata_outbound &&
- !(ire->ire_flags & RTF_MULTIRT) &&
- !IPP_ENABLED(IPP_LOCAL_OUT, ipst) &&
- ill != NULL && ILL_MDT_CAPABLE(ill) &&
- IP_CAN_FRAG_MDT(mp, IP_SIMPLE_HDR_LENGTH, len)) {
- ASSERT(ill->ill_mdt_capab != NULL);
- if (!ill->ill_mdt_capab->ill_mdt_on) {
- /*
- * If MDT has been previously turned off in the past,
- * and we currently can do MDT (due to IPQoS policy
- * removal, etc.) then enable it for this interface.
- */
- ill->ill_mdt_capab->ill_mdt_on = 1;
- ip1dbg(("ip_wput_frag: enabled MDT for interface %s\n",
- ill->ill_name));
- }
- ip_wput_frag_mdt(ire, mp, pkt_type, len, frag_flag,
- offset);
- return;
- }
-
/* Get a copy of the header for the trailing frags */
- hdr_mp = ip_wput_frag_copyhdr((uchar_t *)ipha, hdr_len, offset, ipst,
+ hdr_mp = ip_fragment_copyhdr((uchar_t *)ipha, hdr_len, offset, ipst,
mp);
- if (!hdr_mp) {
- BUMP_MIB(mibptr, ipIfStatsOutFragFails);
+ if (hdr_mp == NULL) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
+ ip_drop_output("FragFails: no hdr_mp", mp, ill);
freemsg(mp);
- TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END,
- "ip_wput_frag_end:(%S)",
- "couldn't copy hdr");
- return;
+ return (ENOBUFS);
}
/* Store the starting offset, with the MoreFrags flag. */
@@ -24233,279 +11776,28 @@ ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag,
* original IP header.
*/
if (!(mp = ip_carve_mp(&mp_orig, i1))) {
- BUMP_MIB(mibptr, ipIfStatsOutFragFails);
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
+ ip_drop_output("FragFails: could not carve mp", mp_orig, ill);
freeb(hdr_mp);
freemsg(mp_orig);
- TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END,
- "ip_wput_frag_end:(%S)",
- "couldn't carve first");
- return;
+ return (ENOBUFS);
}
- /*
- * Multirouting case. Each fragment is replicated
- * via all non-condemned RTF_MULTIRT routes
- * currently resolved.
- * We ensure that first_ire is the first RTF_MULTIRT
- * ire in the bucket.
- */
- if (ire->ire_flags & RTF_MULTIRT) {
- irb = ire->ire_bucket;
- ASSERT(irb != NULL);
-
- multirt_send = B_TRUE;
-
- /* Make sure we do not omit any multiroute ire. */
- IRB_REFHOLD(irb);
- for (first_ire = irb->irb_ire;
- first_ire != NULL;
- first_ire = first_ire->ire_next) {
- if ((first_ire->ire_flags & RTF_MULTIRT) &&
- (first_ire->ire_addr == ire->ire_addr) &&
- !(first_ire->ire_marks &
- (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN)))
- break;
- }
-
- if (first_ire != NULL) {
- if (first_ire != ire) {
- IRE_REFHOLD(first_ire);
- /*
- * Do not release the ire passed in
- * as the argument.
- */
- ire = first_ire;
- } else {
- first_ire = NULL;
- }
- }
- IRB_REFRELE(irb);
-
- /*
- * Save the first ire; we will need to restore it
- * for the trailing frags.
- * We REFHOLD save_ire, as each iterated ire will be
- * REFRELEd.
- */
- save_ire = ire;
- IRE_REFHOLD(save_ire);
- }
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragCreates);
- /*
- * First fragment emission loop.
- * In most cases, the emission loop below is entered only
- * once. Only in the case where the ire holds the RTF_MULTIRT
- * flag, do we loop to process all RTF_MULTIRT ires in the
- * bucket, and send the fragment through all crossed
- * RTF_MULTIRT routes.
- */
- do {
- if (ire->ire_flags & RTF_MULTIRT) {
- /*
- * We are in a multiple send case, need to get
- * the next ire and make a copy of the packet.
- * ire1 holds here the next ire to process in the
- * bucket. If multirouting is expected,
- * any non-RTF_MULTIRT ire that has the
- * right destination address is ignored.
- *
- * We have to take into account the MTU of
- * each walked ire. max_frag is set by the
- * the caller and generally refers to
- * the primary ire entry. Here we ensure that
- * no route with a lower MTU will be used, as
- * fragments are carved once for all ires,
- * then replicated.
- */
- ASSERT(irb != NULL);
- IRB_REFHOLD(irb);
- for (ire1 = ire->ire_next;
- ire1 != NULL;
- ire1 = ire1->ire_next) {
- if ((ire1->ire_flags & RTF_MULTIRT) == 0)
- continue;
- if (ire1->ire_addr != ire->ire_addr)
- continue;
- if (ire1->ire_marks &
- (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN))
- continue;
- /*
- * Ensure we do not exceed the MTU
- * of the next route.
- */
- if (ire1->ire_max_frag < max_frag) {
- ip_multirt_bad_mtu(ire1, max_frag);
- continue;
- }
-
- /* Got one. */
- IRE_REFHOLD(ire1);
- break;
- }
- IRB_REFRELE(irb);
-
- if (ire1 != NULL) {
- next_mp = copyb(mp);
- if ((next_mp == NULL) ||
- ((mp->b_cont != NULL) &&
- ((next_mp->b_cont =
- dupmsg(mp->b_cont)) == NULL))) {
- freemsg(next_mp);
- next_mp = NULL;
- ire_refrele(ire1);
- ire1 = NULL;
- }
- }
-
- /* Last multiroute ire; don't loop anymore. */
- if (ire1 == NULL) {
- multirt_send = B_FALSE;
- }
- }
-
- ll_hdr_len = 0;
- LOCK_IRE_FP_MP(ire);
- ll_hdr_mp = ire->ire_nce->nce_fp_mp;
- if (ll_hdr_mp != NULL) {
- ASSERT(ll_hdr_mp->b_datap->db_type == M_DATA);
- ll_hdr_len = ll_hdr_mp->b_wptr - ll_hdr_mp->b_rptr;
- } else {
- ll_hdr_mp = ire->ire_nce->nce_res_mp;
- }
-
- /* If there is a transmit header, get a copy for this frag. */
- /*
- * TODO: should check db_ref before calling ip_carve_mp since
- * it might give us a dup.
- */
- if (!ll_hdr_mp) {
- /* No xmit header. */
- xmit_mp = mp;
-
- /* We have a link-layer header that can fit in our mblk. */
- } else if (mp->b_datap->db_ref == 1 &&
- ll_hdr_len != 0 &&
- ll_hdr_len <= mp->b_rptr - mp->b_datap->db_base) {
- /* M_DATA fastpath */
- mp->b_rptr -= ll_hdr_len;
- bcopy(ll_hdr_mp->b_rptr, mp->b_rptr, ll_hdr_len);
- xmit_mp = mp;
-
- /* Corner case if copyb has failed */
- } else if (!(xmit_mp = copyb(ll_hdr_mp))) {
- UNLOCK_IRE_FP_MP(ire);
- BUMP_MIB(mibptr, ipIfStatsOutFragFails);
- freeb(hdr_mp);
- freemsg(mp);
- freemsg(mp_orig);
- TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END,
- "ip_wput_frag_end:(%S)",
- "discard");
-
- if (multirt_send) {
- ASSERT(ire1);
- ASSERT(next_mp);
-
- freemsg(next_mp);
- ire_refrele(ire1);
- }
- if (save_ire != NULL)
- IRE_REFRELE(save_ire);
-
- if (first_ire != NULL)
- ire_refrele(first_ire);
- return;
-
- /*
- * Case of res_mp OR the fastpath mp can't fit
- * in the mblk
- */
- } else {
- xmit_mp->b_cont = mp;
-
- /*
- * Get priority marking, if any.
- * We propagate the CoS marking from the
- * original packet that went to QoS processing
- * in ip_wput_ire to the newly carved mp.
- */
- if (DB_TYPE(xmit_mp) == M_DATA)
- xmit_mp->b_band = mp->b_band;
- }
- UNLOCK_IRE_FP_MP(ire);
-
- q = ire->ire_stq;
- out_ill = (ill_t *)q->q_ptr;
-
- BUMP_MIB(out_ill->ill_ip_mib, ipIfStatsOutFragCreates);
-
- DTRACE_PROBE4(ip4__physical__out__start,
- ill_t *, NULL, ill_t *, out_ill,
- ipha_t *, ipha, mblk_t *, xmit_mp);
-
- FW_HOOKS(ipst->ips_ip4_physical_out_event,
- ipst->ips_ipv4firewall_physical_out,
- NULL, out_ill, ipha, xmit_mp, mp, 0, ipst);
-
- DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, xmit_mp);
-
- if (xmit_mp != NULL) {
- DTRACE_IP7(send, mblk_t *, xmit_mp, conn_t *, NULL,
- void_ip_t *, ipha, __dtrace_ipsr_ill_t *, out_ill,
- ipha_t *, ipha, ip6_t *, NULL, int, 0);
-
- ILL_SEND_TX(out_ill, ire, connp, xmit_mp, 0, connp);
-
- BUMP_MIB(out_ill->ill_ip_mib, ipIfStatsHCOutTransmits);
- UPDATE_MIB(out_ill->ill_ip_mib,
- ipIfStatsHCOutOctets, i1);
-
- if (pkt_type != OB_PKT) {
- /*
- * Update the packet count and MIB stats
- * of trailing RTF_MULTIRT ires.
- */
- UPDATE_OB_PKT_COUNT(ire);
- BUMP_MIB(out_ill->ill_ip_mib,
- ipIfStatsOutFragReqds);
- }
- }
-
- if (multirt_send) {
- /*
- * We are in a multiple send case; look for
- * the next ire and re-enter the loop.
- */
- ASSERT(ire1);
- ASSERT(next_mp);
- /* REFRELE the current ire before looping */
- ire_refrele(ire);
- ire = ire1;
- ire1 = NULL;
- mp = next_mp;
- next_mp = NULL;
- }
- } while (multirt_send);
-
- ASSERT(ire1 == NULL);
-
- /* Restore the original ire; we need it for the trailing frags */
- if (save_ire != NULL) {
- /* REFRELE the last iterated ire */
- ire_refrele(ire);
- /* save_ire has been REFHOLDed */
- ire = save_ire;
- save_ire = NULL;
- q = ire->ire_stq;
+ error = postfragfn(mp, nce, ixaflags, i1, xmit_hint, szone, nolzid,
+ ixa_cookie);
+ if (error != 0 && error != EWOULDBLOCK) {
+ /* No point in sending the other fragments */
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
+ ip_drop_output("FragFails: postfragfn failed", mp_orig, ill);
+ freeb(hdr_mp);
+ freemsg(mp_orig);
+ return (error);
}
- if (pkt_type == OB_PKT) {
- UPDATE_OB_PKT_COUNT(ire);
- } else {
- out_ill = (ill_t *)q->q_ptr;
- BUMP_MIB(out_ill->ill_ip_mib, ipIfStatsHCOutForwDatagrams);
- UPDATE_IB_PKT_COUNT(ire);
- }
+ /* No need to redo state machine in loop */
+ ixaflags &= ~IXAF_REACH_CONF;
/* Advance the offset to the second frag starting point. */
offset += len;
@@ -24547,7 +11839,7 @@ ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag,
break;
}
/* Get priority marking, if any. */
- mp->b_band = carve_mp->b_band;
+ mp->b_band = priority;
mp->b_cont = carve_mp;
}
ipha = (ipha_t *)mp->b_rptr;
@@ -24581,7 +11873,7 @@ ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag,
} else {
mp = hdr_mp;
/* Get priority marking, if any. */
- mp->b_band = carve_mp->b_band;
+ mp->b_band = priority;
mp->b_cont = carve_mp;
}
ipha = (ipha_t *)mp->b_rptr;
@@ -24605,254 +11897,40 @@ ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag,
*/
ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
- /* Attach a transmit header, if any, and ship it. */
- if (pkt_type == OB_PKT) {
- UPDATE_OB_PKT_COUNT(ire);
- } else {
- out_ill = (ill_t *)q->q_ptr;
- BUMP_MIB(out_ill->ill_ip_mib,
- ipIfStatsHCOutForwDatagrams);
- UPDATE_IB_PKT_COUNT(ire);
- }
-
- if (ire->ire_flags & RTF_MULTIRT) {
- irb = ire->ire_bucket;
- ASSERT(irb != NULL);
-
- multirt_send = B_TRUE;
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragCreates);
- /*
- * Save the original ire; we will need to restore it
- * for the tailing frags.
- */
- save_ire = ire;
- IRE_REFHOLD(save_ire);
+ error = postfragfn(mp, nce, ixaflags, ip_len, xmit_hint, szone,
+ nolzid, ixa_cookie);
+ /* All done if we just consumed the hdr_mp. */
+ if (mp == hdr_mp) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragOKs);
+ return (error);
}
- /*
- * Emission loop for this fragment, similar
- * to what is done for the first fragment.
- */
- do {
- if (multirt_send) {
- /*
- * We are in a multiple send case, need to get
- * the next ire and make a copy of the packet.
- */
- ASSERT(irb != NULL);
- IRB_REFHOLD(irb);
- for (ire1 = ire->ire_next;
- ire1 != NULL;
- ire1 = ire1->ire_next) {
- if (!(ire1->ire_flags & RTF_MULTIRT))
- continue;
- if (ire1->ire_addr != ire->ire_addr)
- continue;
- if (ire1->ire_marks &
- (IRE_MARK_CONDEMNED |
- IRE_MARK_TESTHIDDEN))
- continue;
- /*
- * Ensure we do not exceed the MTU
- * of the next route.
- */
- if (ire1->ire_max_frag < max_frag) {
- ip_multirt_bad_mtu(ire1,
- max_frag);
- continue;
- }
-
- /* Got one. */
- IRE_REFHOLD(ire1);
- break;
- }
- IRB_REFRELE(irb);
-
- if (ire1 != NULL) {
- next_mp = copyb(mp);
- if ((next_mp == NULL) ||
- ((mp->b_cont != NULL) &&
- ((next_mp->b_cont =
- dupmsg(mp->b_cont)) == NULL))) {
- freemsg(next_mp);
- next_mp = NULL;
- ire_refrele(ire1);
- ire1 = NULL;
- }
- }
-
- /* Last multiroute ire; don't loop anymore. */
- if (ire1 == NULL) {
- multirt_send = B_FALSE;
- }
- }
-
- /* Update transmit header */
- ll_hdr_len = 0;
- LOCK_IRE_FP_MP(ire);
- ll_hdr_mp = ire->ire_nce->nce_fp_mp;
- if (ll_hdr_mp != NULL) {
- ASSERT(ll_hdr_mp->b_datap->db_type == M_DATA);
- ll_hdr_len = MBLKL(ll_hdr_mp);
- } else {
- ll_hdr_mp = ire->ire_nce->nce_res_mp;
- }
-
- if (!ll_hdr_mp) {
- xmit_mp = mp;
-
- /*
- * We have link-layer header that can fit in
- * our mblk.
- */
- } else if (mp->b_datap->db_ref == 1 &&
- ll_hdr_len != 0 &&
- ll_hdr_len <= mp->b_rptr - mp->b_datap->db_base) {
- /* M_DATA fastpath */
- mp->b_rptr -= ll_hdr_len;
- bcopy(ll_hdr_mp->b_rptr, mp->b_rptr,
- ll_hdr_len);
- xmit_mp = mp;
-
- /*
- * Case of res_mp OR the fastpath mp can't fit
- * in the mblk
- */
- } else if ((xmit_mp = copyb(ll_hdr_mp)) != NULL) {
- xmit_mp->b_cont = mp;
- /* Get priority marking, if any. */
- if (DB_TYPE(xmit_mp) == M_DATA)
- xmit_mp->b_band = mp->b_band;
-
- /* Corner case if copyb failed */
- } else {
- /*
- * Exit both the replication and
- * fragmentation loops.
- */
- UNLOCK_IRE_FP_MP(ire);
- goto drop_pkt;
- }
- UNLOCK_IRE_FP_MP(ire);
-
- mp1 = mp;
- out_ill = (ill_t *)q->q_ptr;
-
- BUMP_MIB(out_ill->ill_ip_mib, ipIfStatsOutFragCreates);
-
- DTRACE_PROBE4(ip4__physical__out__start,
- ill_t *, NULL, ill_t *, out_ill,
- ipha_t *, ipha, mblk_t *, xmit_mp);
-
- FW_HOOKS(ipst->ips_ip4_physical_out_event,
- ipst->ips_ipv4firewall_physical_out,
- NULL, out_ill, ipha, xmit_mp, mp, 0, ipst);
-
- DTRACE_PROBE1(ip4__physical__out__end,
- mblk_t *, xmit_mp);
-
- if (mp != mp1 && hdr_mp == mp1)
- hdr_mp = mp;
- if (mp != mp1 && mp_orig == mp1)
- mp_orig = mp;
-
- if (xmit_mp != NULL) {
- DTRACE_IP7(send, mblk_t *, xmit_mp, conn_t *,
- NULL, void_ip_t *, ipha,
- __dtrace_ipsr_ill_t *, out_ill, ipha_t *,
- ipha, ip6_t *, NULL, int, 0);
-
- ILL_SEND_TX(out_ill, ire, connp,
- xmit_mp, 0, connp);
-
- BUMP_MIB(out_ill->ill_ip_mib,
- ipIfStatsHCOutTransmits);
- UPDATE_MIB(out_ill->ill_ip_mib,
- ipIfStatsHCOutOctets, ip_len);
-
- if (pkt_type != OB_PKT) {
- /*
- * Update the packet count of trailing
- * RTF_MULTIRT ires.
- */
- UPDATE_OB_PKT_COUNT(ire);
- }
- }
-
- /* All done if we just consumed the hdr_mp. */
- if (mp == hdr_mp) {
- last_frag = B_TRUE;
- BUMP_MIB(out_ill->ill_ip_mib,
- ipIfStatsOutFragOKs);
- }
-
- if (multirt_send) {
- /*
- * We are in a multiple send case; look for
- * the next ire and re-enter the loop.
- */
- ASSERT(ire1);
- ASSERT(next_mp);
- /* REFRELE the current ire before looping */
- ire_refrele(ire);
- ire = ire1;
- ire1 = NULL;
- q = ire->ire_stq;
- mp = next_mp;
- next_mp = NULL;
- }
- } while (multirt_send);
- /*
- * Restore the original ire; we need it for the
- * trailing frags
- */
- if (save_ire != NULL) {
- ASSERT(ire1 == NULL);
- /* REFRELE the last iterated ire */
- ire_refrele(ire);
- /* save_ire has been REFHOLDed */
- ire = save_ire;
- q = ire->ire_stq;
- save_ire = NULL;
+ if (error != 0 && error != EWOULDBLOCK) {
+ DTRACE_PROBE2(ip__xmit__frag__fail, ill_t *, ill,
+ mblk_t *, hdr_mp);
+ /* No point in sending the other fragments */
+ break;
}
- if (last_frag) {
- TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END,
- "ip_wput_frag_end:(%S)",
- "consumed hdr_mp");
-
- if (first_ire != NULL)
- ire_refrele(first_ire);
- return;
- }
/* Otherwise, advance and loop. */
offset += len;
}
-
-drop_pkt:
/* Clean up following allocation failure. */
- BUMP_MIB(mibptr, ipIfStatsOutFragFails);
- freemsg(mp);
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
+ ip_drop_output("FragFails: loop ended", NULL, ill);
if (mp != hdr_mp)
freeb(hdr_mp);
if (mp != mp_orig)
freemsg(mp_orig);
-
- if (save_ire != NULL)
- IRE_REFRELE(save_ire);
- if (first_ire != NULL)
- ire_refrele(first_ire);
-
- TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END,
- "ip_wput_frag_end:(%S)",
- "end--alloc failure");
+ return (error);
}
/*
* Copy the header plus those options which have the copy bit set
- * src is the template to make sure we preserve the cred for TX purposes.
*/
static mblk_t *
-ip_wput_frag_copyhdr(uchar_t *rptr, int hdr_len, int offset, ip_stack_t *ipst,
+ip_fragment_copyhdr(uchar_t *rptr, int hdr_len, int offset, ip_stack_t *ipst,
mblk_t *src)
{
mblk_t *mp;
@@ -24908,310 +11986,13 @@ ip_wput_frag_copyhdr(uchar_t *rptr, int hdr_len, int offset, ip_stack_t *ipst,
}
/*
- * Delivery to local recipients including fanout to multiple recipients.
- * Does not do checksumming of UDP/TCP.
- * Note: q should be the read side queue for either the ill or conn.
- * Note: rq should be the read side q for the lower (ill) stream.
- * We don't send packets to IPPF processing, thus the last argument
- * to all the fanout calls are B_FALSE.
- */
-void
-ip_wput_local(queue_t *q, ill_t *ill, ipha_t *ipha, mblk_t *mp, ire_t *ire,
- int fanout_flags, zoneid_t zoneid)
-{
- uint32_t protocol;
- mblk_t *first_mp;
- boolean_t mctl_present;
- int ire_type;
-#define rptr ((uchar_t *)ipha)
- ip_stack_t *ipst = ill->ill_ipst;
-
- TRACE_1(TR_FAC_IP, TR_IP_WPUT_LOCAL_START,
- "ip_wput_local_start: q %p", q);
-
- if (ire != NULL) {
- ire_type = ire->ire_type;
- } else {
- /*
- * Only ip_multicast_loopback() calls us with a NULL ire. If the
- * packet is not multicast, we can't tell the ire type.
- */
- ASSERT(CLASSD(ipha->ipha_dst));
- ire_type = IRE_BROADCAST;
- }
-
- first_mp = mp;
- if (first_mp->b_datap->db_type == M_CTL) {
- ipsec_out_t *io = (ipsec_out_t *)first_mp->b_rptr;
- if (!io->ipsec_out_secure) {
- /*
- * This ipsec_out_t was allocated in ip_wput
- * for multicast packets to store the ill_index.
- * As this is being delivered locally, we don't
- * need this anymore.
- */
- mp = first_mp->b_cont;
- freeb(first_mp);
- first_mp = mp;
- mctl_present = B_FALSE;
- } else {
- /*
- * Convert IPSEC_OUT to IPSEC_IN, preserving all
- * security properties for the looped-back packet.
- */
- mctl_present = B_TRUE;
- mp = first_mp->b_cont;
- ASSERT(mp != NULL);
- ipsec_out_to_in(first_mp);
- }
- } else {
- mctl_present = B_FALSE;
- }
-
- DTRACE_PROBE4(ip4__loopback__in__start,
- ill_t *, ill, ill_t *, NULL,
- ipha_t *, ipha, mblk_t *, first_mp);
-
- FW_HOOKS(ipst->ips_ip4_loopback_in_event,
- ipst->ips_ipv4firewall_loopback_in,
- ill, NULL, ipha, first_mp, mp, 0, ipst);
-
- DTRACE_PROBE1(ip4__loopback__in__end, mblk_t *, first_mp);
-
- if (first_mp == NULL)
- return;
-
- if (ipst->ips_ip4_observe.he_interested) {
- zoneid_t szone, dzone, lookup_zoneid = ALL_ZONES;
- zoneid_t stackzoneid = netstackid_to_zoneid(
- ipst->ips_netstack->netstack_stackid);
-
- dzone = (stackzoneid == GLOBAL_ZONEID) ? zoneid : stackzoneid;
- /*
- * 127.0.0.1 is special, as we cannot lookup its zoneid by
- * address. Restrict the lookup below to the destination zone.
- */
- if (ipha->ipha_src == ntohl(INADDR_LOOPBACK))
- lookup_zoneid = zoneid;
- szone = ip_get_zoneid_v4(ipha->ipha_src, mp, ipst,
- lookup_zoneid);
- ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill, ipst);
- }
-
- DTRACE_IP7(receive, mblk_t *, first_mp, conn_t *, NULL, void_ip_t *,
- ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL,
- int, 1);
-
- ipst->ips_loopback_packets++;
-
- ip2dbg(("ip_wput_local: from 0x%x to 0x%x in zone %d\n",
- ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst), zoneid));
- if (!IS_SIMPLE_IPH(ipha)) {
- ip_wput_local_options(ipha, ipst);
- }
-
- protocol = ipha->ipha_protocol;
- switch (protocol) {
- case IPPROTO_ICMP: {
- ire_t *ire_zone;
- ilm_t *ilm;
- mblk_t *mp1;
- zoneid_t last_zoneid;
- ilm_walker_t ilw;
-
- if (CLASSD(ipha->ipha_dst) && !IS_LOOPBACK(ill)) {
- ASSERT(ire_type == IRE_BROADCAST);
- /*
- * In the multicast case, applications may have joined
- * the group from different zones, so we need to deliver
- * the packet to each of them. Loop through the
- * multicast memberships structures (ilm) on the receive
- * ill and send a copy of the packet up each matching
- * one. However, we don't do this for multicasts sent on
- * the loopback interface (PHYI_LOOPBACK flag set) as
- * they must stay in the sender's zone.
- *
- * ilm_add_v6() ensures that ilms in the same zone are
- * contiguous in the ill_ilm list. We use this property
- * to avoid sending duplicates needed when two
- * applications in the same zone join the same group on
- * different logical interfaces: we ignore the ilm if
- * it's zoneid is the same as the last matching one.
- * In addition, the sending of the packet for
- * ire_zoneid is delayed until all of the other ilms
- * have been exhausted.
- */
- last_zoneid = -1;
- ilm = ilm_walker_start(&ilw, ill);
- for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) {
- if (ipha->ipha_dst != ilm->ilm_addr ||
- ilm->ilm_zoneid == last_zoneid ||
- ilm->ilm_zoneid == zoneid ||
- !(ilm->ilm_ipif->ipif_flags & IPIF_UP))
- continue;
- mp1 = ip_copymsg(first_mp);
- if (mp1 == NULL)
- continue;
- icmp_inbound(q, mp1, B_TRUE, ilw.ilw_walk_ill,
- 0, 0, mctl_present, B_FALSE, ill,
- ilm->ilm_zoneid);
- last_zoneid = ilm->ilm_zoneid;
- }
- ilm_walker_finish(&ilw);
- /*
- * Loopback case: the sending endpoint has
- * IP_MULTICAST_LOOP disabled, therefore we don't
- * dispatch the multicast packet to the sending zone.
- */
- if (fanout_flags & IP_FF_NO_MCAST_LOOP) {
- freemsg(first_mp);
- return;
- }
- } else if (ire_type == IRE_BROADCAST) {
- /*
- * In the broadcast case, there may be many zones
- * which need a copy of the packet delivered to them.
- * There is one IRE_BROADCAST per broadcast address
- * and per zone; we walk those using a helper function.
- * In addition, the sending of the packet for zoneid is
- * delayed until all of the other ires have been
- * processed.
- */
- IRB_REFHOLD(ire->ire_bucket);
- ire_zone = NULL;
- while ((ire_zone = ire_get_next_bcast_ire(ire_zone,
- ire)) != NULL) {
- mp1 = ip_copymsg(first_mp);
- if (mp1 == NULL)
- continue;
-
- UPDATE_IB_PKT_COUNT(ire_zone);
- ire_zone->ire_last_used_time = lbolt;
- icmp_inbound(q, mp1, B_TRUE, ill, 0, 0,
- mctl_present, B_FALSE, ill,
- ire_zone->ire_zoneid);
- }
- IRB_REFRELE(ire->ire_bucket);
- }
- icmp_inbound(q, first_mp, (ire_type == IRE_BROADCAST), ill, 0,
- 0, mctl_present, B_FALSE, ill, zoneid);
- TRACE_2(TR_FAC_IP, TR_IP_WPUT_LOCAL_END,
- "ip_wput_local_end: q %p (%S)",
- q, "icmp");
- return;
- }
- case IPPROTO_IGMP:
- if ((mp = igmp_input(q, mp, ill)) == NULL) {
- /* Bad packet - discarded by igmp_input */
- TRACE_2(TR_FAC_IP, TR_IP_WPUT_LOCAL_END,
- "ip_wput_local_end: q %p (%S)",
- q, "igmp_input--bad packet");
- if (mctl_present)
- freeb(first_mp);
- return;
- }
- /*
- * igmp_input() may have returned the pulled up message.
- * So first_mp and ipha need to be reinitialized.
- */
- ipha = (ipha_t *)mp->b_rptr;
- if (mctl_present)
- first_mp->b_cont = mp;
- else
- first_mp = mp;
- /* deliver to local raw users */
- break;
- case IPPROTO_ENCAP:
- /*
- * This case is covered by either ip_fanout_proto, or by
- * the above security processing for self-tunneled packets.
- */
- break;
- case IPPROTO_UDP: {
- uint16_t *up;
- uint32_t ports;
-
- up = (uint16_t *)(rptr + IPH_HDR_LENGTH(ipha) +
- UDP_PORTS_OFFSET);
- /* Force a 'valid' checksum. */
- up[3] = 0;
-
- ports = *(uint32_t *)up;
- ip_fanout_udp(q, first_mp, ill, ipha, ports,
- (ire_type == IRE_BROADCAST),
- fanout_flags | IP_FF_SEND_ICMP | IP_FF_HDR_COMPLETE |
- IP_FF_SEND_SLLA | IP_FF_IPINFO, mctl_present, B_FALSE,
- ill, zoneid);
- TRACE_2(TR_FAC_IP, TR_IP_WPUT_LOCAL_END,
- "ip_wput_local_end: q %p (%S)", q, "ip_fanout_udp");
- return;
- }
- case IPPROTO_TCP: {
-
- /*
- * For TCP, discard broadcast packets.
- */
- if ((ushort_t)ire_type == IRE_BROADCAST) {
- freemsg(first_mp);
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- ip2dbg(("ip_wput_local: discard broadcast\n"));
- return;
- }
-
- if (mp->b_datap->db_type == M_DATA) {
- /*
- * M_DATA mblk, so init mblk (chain) for no struio().
- */
- mblk_t *mp1 = mp;
-
- do {
- mp1->b_datap->db_struioflag = 0;
- } while ((mp1 = mp1->b_cont) != NULL);
- }
- ASSERT((rptr + IPH_HDR_LENGTH(ipha) + TCP_PORTS_OFFSET + 4)
- <= mp->b_wptr);
- ip_fanout_tcp(q, first_mp, ill, ipha,
- fanout_flags | IP_FF_SEND_ICMP | IP_FF_HDR_COMPLETE |
- IP_FF_SYN_ADDIRE | IP_FF_IPINFO,
- mctl_present, B_FALSE, zoneid);
- TRACE_2(TR_FAC_IP, TR_IP_WPUT_LOCAL_END,
- "ip_wput_local_end: q %p (%S)", q, "ip_fanout_tcp");
- return;
- }
- case IPPROTO_SCTP:
- {
- uint32_t ports;
-
- bcopy(rptr + IPH_HDR_LENGTH(ipha), &ports, sizeof (ports));
- ip_fanout_sctp(first_mp, ill, ipha, ports,
- fanout_flags | IP_FF_SEND_ICMP | IP_FF_HDR_COMPLETE |
- IP_FF_IPINFO, mctl_present, B_FALSE, zoneid);
- return;
- }
-
- default:
- break;
- }
- /*
- * Find a client for some other protocol. We give
- * copies to multiple clients, if more than one is
- * bound.
- */
- ip_fanout_proto(q, first_mp, ill, ipha,
- fanout_flags | IP_FF_SEND_ICMP | IP_FF_HDR_COMPLETE | IP_FF_RAWIP,
- mctl_present, B_FALSE, ill, zoneid);
- TRACE_2(TR_FAC_IP, TR_IP_WPUT_LOCAL_END,
- "ip_wput_local_end: q %p (%S)", q, "ip_fanout_proto");
-#undef rptr
-}
-
-/*
- * Update any source route, record route, or timestamp options.
+ * Update any source route, record route, or timestamp options when
+ * sending a packet back to ourselves.
* Check that we are at end of strict source route.
- * The options have been sanity checked by ip_wput_options().
+ * The options have been sanity checked by ip_output_options().
*/
-static void
-ip_wput_local_options(ipha_t *ipha, ip_stack_t *ipst)
+void
+ip_output_local_options(ipha_t *ipha, ip_stack_t *ipst)
{
ipoptp_t opts;
uchar_t *opt;
@@ -25219,10 +12000,8 @@ ip_wput_local_options(ipha_t *ipha, ip_stack_t *ipst)
uint8_t optlen;
ipaddr_t dst;
uint32_t ts;
- ire_t *ire;
timestruc_t now;
- ip2dbg(("ip_wput_local_options\n"));
for (optval = ipoptp_first(&opts, ipha);
optval != IPOPT_EOL;
optval = ipoptp_next(&opts)) {
@@ -25246,7 +12025,7 @@ ip_wput_local_options(ipha_t *ipha, ip_stack_t *ipst)
* it is a packet with a loose source route which
* reaches us before consuming the whole source route
*/
- ip1dbg(("ip_wput_local_options: not end of SR\n"));
+
if (optval == IPOPT_SSRR) {
return;
}
@@ -25267,7 +12046,7 @@ ip_wput_local_options(ipha_t *ipha, ip_stack_t *ipst)
off > optlen - IP_ADDR_LEN) {
/* No more room - ignore */
ip1dbg((
- "ip_wput_forward_options: end of RR\n"));
+ "ip_output_local_options: end of RR\n"));
break;
}
dst = htonl(INADDR_LOOPBACK);
@@ -25285,14 +12064,10 @@ ip_wput_local_options(ipha_t *ipha, ip_stack_t *ipst)
/* Verify that the address matched */
off = opt[IPOPT_OFFSET] - 1;
bcopy((char *)opt + off, &dst, IP_ADDR_LEN);
- ire = ire_ctable_lookup(dst, 0, IRE_LOCAL,
- NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE,
- ipst);
- if (ire == NULL) {
+ if (ip_type_v4(dst, ipst) != IRE_LOCAL) {
/* Not for us */
break;
}
- ire_refrele(ire);
/* FALLTHRU */
case IPOPT_TS_TSANDADDR:
off = IP_ADDR_LEN + IPOPT_TS_TIMELEN;
@@ -25302,8 +12077,8 @@ ip_wput_local_options(ipha_t *ipha, ip_stack_t *ipst)
* ip_*put_options should have already
* dropped this packet.
*/
- cmn_err(CE_PANIC, "ip_wput_local_options: "
- "unknown IT - bug in ip_wput_options?\n");
+ cmn_err(CE_PANIC, "ip_output_local_options: "
+ "unknown IT - bug in ip_output_options?\n");
return; /* Keep "lint" happy */
}
if (opt[IPOPT_OFFSET] - 1 + off > optlen) {
@@ -25339,1098 +12114,240 @@ ip_wput_local_options(ipha_t *ipha, ip_stack_t *ipst)
}
/*
- * Send out a multicast packet on interface ipif.
- * The sender does not have an conn.
- * Caller verifies that this isn't a PHYI_LOOPBACK.
- */
-void
-ip_wput_multicast(queue_t *q, mblk_t *mp, ipif_t *ipif, zoneid_t zoneid)
-{
- ipha_t *ipha;
- ire_t *ire;
- ipaddr_t dst;
- mblk_t *first_mp;
- ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
-
- /* igmp_sendpkt always allocates a ipsec_out_t */
- ASSERT(mp->b_datap->db_type == M_CTL);
- ASSERT(!ipif->ipif_isv6);
- ASSERT(!IS_LOOPBACK(ipif->ipif_ill));
-
- first_mp = mp;
- mp = first_mp->b_cont;
- ASSERT(mp->b_datap->db_type == M_DATA);
- ipha = (ipha_t *)mp->b_rptr;
-
- /*
- * Find an IRE which matches the destination and the outgoing
- * queue (i.e. the outgoing interface.)
- */
- if (ipif->ipif_flags & IPIF_POINTOPOINT)
- dst = ipif->ipif_pp_dst_addr;
- else
- dst = ipha->ipha_dst;
- /*
- * The source address has already been initialized by the
- * caller and hence matching on ILL (MATCH_IRE_ILL) would
- * be sufficient rather than MATCH_IRE_IPIF.
- *
- * This function is used for sending IGMP packets. For IPMP,
- * we sidestep IGMP snooping issues by sending all multicast
- * traffic on a single interface in the IPMP group.
- */
- ire = ire_ctable_lookup(dst, 0, 0, ipif, zoneid, NULL,
- MATCH_IRE_ILL, ipst);
- if (!ire) {
- /*
- * Mark this packet to make it be delivered to
- * ip_wput_ire after the new ire has been
- * created.
- */
- mp->b_prev = NULL;
- mp->b_next = NULL;
- ip_newroute_ipif(q, first_mp, ipif, dst, NULL, RTF_SETSRC,
- zoneid, &zero_info);
- return;
- }
-
- /*
- * Honor the RTF_SETSRC flag; this is the only case
- * where we force this addr whatever the current src addr is,
- * because this address is set by igmp_sendpkt(), and
- * cannot be specified by any user.
- */
- if (ire->ire_flags & RTF_SETSRC) {
- ipha->ipha_src = ire->ire_src_addr;
- }
-
- ip_wput_ire(q, first_mp, ire, NULL, B_FALSE, zoneid);
-}
-
-/*
- * NOTE : This function does not ire_refrele the ire argument passed in.
+ * Prepend an M_DATA fastpath header, and if none present prepend a
+ * DL_UNITDATA_REQ. Frees the mblk on failure.
+ *
+ * nce_dlur_mp and nce_fp_mp can not disappear once they have been set.
+ * If there is a change to them, the nce will be deleted (condemned) and
+ * a new nce_t will be created when packets are sent. Thus we need no locks
+ * to access those fields.
*
- * Copy the link layer header and do IPQoS if needed. Frees the mblk on
- * failure. The nce_fp_mp can vanish any time in the case of
- * IRE_BROADCAST due to DL_NOTE_FASTPATH_FLUSH. Hence we have to hold
- * the ire_lock to access the nce_fp_mp in this case.
- * IPQoS assumes that the first M_DATA contains the IP header. So, if we are
- * prepending a fastpath message IPQoS processing must precede it, we also set
- * the b_band of the fastpath message to that of the mblk returned by IPQoS
- * (IPQoS might have set the b_band for CoS marking).
- * However, if we are prepending DL_UNITDATA_REQ message, IPQoS processing
- * must follow it so that IPQoS can mark the dl_priority field for CoS
- * marking, if needed.
+ * We preserve b_band to support IPQoS. If a DL_UNITDATA_REQ is prepended
+ * we place b_band in dl_priority.dl_max.
*/
static mblk_t *
-ip_wput_attach_llhdr(mblk_t *mp, ire_t *ire, ip_proc_t proc,
- uint32_t ill_index, ipha_t **iphap)
+ip_xmit_attach_llhdr(mblk_t *mp, nce_t *nce)
{
uint_t hlen;
- ipha_t *ipha;
mblk_t *mp1;
- boolean_t qos_done = B_FALSE;
- uchar_t *ll_hdr;
- ip_stack_t *ipst = ire->ire_ipst;
+ uint_t priority;
+ uchar_t *rptr;
-#define rptr ((uchar_t *)ipha)
+ rptr = mp->b_rptr;
- ipha = (ipha_t *)mp->b_rptr;
- hlen = 0;
- LOCK_IRE_FP_MP(ire);
- if ((mp1 = ire->ire_nce->nce_fp_mp) != NULL) {
- ASSERT(DB_TYPE(mp1) == M_DATA);
- /* Initiate IPPF processing */
- if ((proc != 0) && IPP_ENABLED(proc, ipst)) {
- UNLOCK_IRE_FP_MP(ire);
- ip_process(proc, &mp, ill_index);
- if (mp == NULL)
- return (NULL);
+ ASSERT(DB_TYPE(mp) == M_DATA);
+ priority = mp->b_band;
- ipha = (ipha_t *)mp->b_rptr;
- LOCK_IRE_FP_MP(ire);
- if ((mp1 = ire->ire_nce->nce_fp_mp) == NULL) {
- qos_done = B_TRUE;
- goto no_fp_mp;
- }
- ASSERT(DB_TYPE(mp1) == M_DATA);
- }
+ ASSERT(nce != NULL);
+ if ((mp1 = nce->nce_fp_mp) != NULL) {
hlen = MBLKL(mp1);
/*
* Check if we have enough room to prepend fastpath
* header
*/
if (hlen != 0 && (rptr - mp->b_datap->db_base) >= hlen) {
- ll_hdr = rptr - hlen;
- bcopy(mp1->b_rptr, ll_hdr, hlen);
+ rptr -= hlen;
+ bcopy(mp1->b_rptr, rptr, hlen);
/*
* Set the b_rptr to the start of the link layer
* header
*/
- mp->b_rptr = ll_hdr;
- mp1 = mp;
- } else {
- mp1 = copyb(mp1);
- if (mp1 == NULL)
- goto unlock_err;
- mp1->b_band = mp->b_band;
- mp1->b_cont = mp;
- /*
- * XXX disable ICK_VALID and compute checksum
- * here; can happen if nce_fp_mp changes and
- * it can't be copied now due to insufficient
- * space. (unlikely, fp mp can change, but it
- * does not increase in length)
- */
+ mp->b_rptr = rptr;
+ return (mp);
}
- UNLOCK_IRE_FP_MP(ire);
- } else {
-no_fp_mp:
- mp1 = copyb(ire->ire_nce->nce_res_mp);
+ mp1 = copyb(mp1);
if (mp1 == NULL) {
-unlock_err:
- UNLOCK_IRE_FP_MP(ire);
+ ill_t *ill = nce->nce_ill;
+
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
+ ip_drop_output("ipIfStatsOutDiscards", mp, ill);
freemsg(mp);
return (NULL);
}
- UNLOCK_IRE_FP_MP(ire);
+ mp1->b_band = priority;
mp1->b_cont = mp;
- if (!qos_done && (proc != 0) && IPP_ENABLED(proc, ipst)) {
- ip_process(proc, &mp1, ill_index);
- if (mp1 == NULL)
- return (NULL);
-
- if (mp1->b_cont == NULL)
- ipha = NULL;
- else
- ipha = (ipha_t *)mp1->b_cont->b_rptr;
- }
- }
-
- *iphap = ipha;
- return (mp1);
-#undef rptr
-}
-
-/*
- * Finish the outbound IPsec processing for an IPv6 packet. This function
- * is called from ipsec_out_process() if the IPsec packet was processed
- * synchronously, or from {ah,esp}_kcf_callback() if it was processed
- * asynchronously.
- */
-void
-ip_wput_ipsec_out_v6(queue_t *q, mblk_t *ipsec_mp, ip6_t *ip6h, ill_t *ill,
- ire_t *ire_arg)
-{
- in6_addr_t *v6dstp;
- ire_t *ire;
- mblk_t *mp;
- ip6_t *ip6h1;
- uint_t ill_index;
- ipsec_out_t *io;
- boolean_t hwaccel;
- uint32_t flags = IP6_NO_IPPOLICY;
- int match_flags;
- zoneid_t zoneid;
- boolean_t ill_need_rele = B_FALSE;
- boolean_t ire_need_rele = B_FALSE;
- ip_stack_t *ipst;
-
- mp = ipsec_mp->b_cont;
- ip6h1 = (ip6_t *)mp->b_rptr;
- io = (ipsec_out_t *)ipsec_mp->b_rptr;
- ASSERT(io->ipsec_out_ns != NULL);
- ipst = io->ipsec_out_ns->netstack_ip;
- ill_index = io->ipsec_out_ill_index;
- if (io->ipsec_out_reachable) {
- flags |= IPV6_REACHABILITY_CONFIRMATION;
- }
- hwaccel = io->ipsec_out_accelerated;
- zoneid = io->ipsec_out_zoneid;
- ASSERT(zoneid != ALL_ZONES);
- ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION);
- match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR;
- /* Multicast addresses should have non-zero ill_index. */
- v6dstp = &ip6h->ip6_dst;
- ASSERT(ip6h->ip6_nxt != IPPROTO_RAW);
- ASSERT(!IN6_IS_ADDR_MULTICAST(v6dstp) || ill_index != 0);
-
- if (ill == NULL && ill_index != 0) {
- ill = ip_grab_ill(ipsec_mp, ill_index, B_TRUE, ipst);
- /* Failure case frees things for us. */
- if (ill == NULL)
- return;
-
- ill_need_rele = B_TRUE;
- }
- ASSERT(mp != NULL);
-
- if (IN6_IS_ADDR_MULTICAST(v6dstp)) {
- boolean_t unspec_src;
- ipif_t *ipif;
-
- /*
- * Use the ill_index to get the right ill.
- */
- unspec_src = io->ipsec_out_unspec_src;
- (void) ipif_lookup_zoneid(ill, zoneid, 0, &ipif);
- if (ipif == NULL) {
- if (ill_need_rele)
- ill_refrele(ill);
- freemsg(ipsec_mp);
- return;
- }
-
- if (ire_arg != NULL) {
- ire = ire_arg;
- } else {
- ire = ire_ctable_lookup_v6(v6dstp, 0, 0, ipif,
- zoneid, msg_getlabel(mp), match_flags, ipst);
- ire_need_rele = B_TRUE;
- }
- if (ire != NULL) {
- ipif_refrele(ipif);
- /*
- * XXX Do the multicast forwarding now, as the IPsec
- * processing has been done.
- */
- goto send;
- }
-
- ip0dbg(("ip_wput_ipsec_out_v6: multicast: IRE disappeared\n"));
- mp->b_prev = NULL;
- mp->b_next = NULL;
-
- /*
- * If the IPsec packet was processed asynchronously,
- * drop it now.
- */
- if (q == NULL) {
- if (ill_need_rele)
- ill_refrele(ill);
- freemsg(ipsec_mp);
- ipif_refrele(ipif);
- return;
- }
-
- ip_newroute_ipif_v6(q, ipsec_mp, ipif, v6dstp, &ip6h->ip6_src,
- unspec_src, zoneid);
- ipif_refrele(ipif);
- } else {
- if (ire_arg != NULL) {
- ire = ire_arg;
- } else {
- ire = ire_cache_lookup_v6(v6dstp, zoneid, NULL, ipst);
- ire_need_rele = B_TRUE;
- }
- if (ire != NULL)
- goto send;
- /*
- * ire disappeared underneath.
- *
- * What we need to do here is the ip_newroute
- * logic to get the ire without doing the IPsec
- * processing. Follow the same old path. But this
- * time, ip_wput or ire_add_then_send will call us
- * directly as all the IPsec operations are done.
- */
- ip1dbg(("ip_wput_ipsec_out_v6: IRE disappeared\n"));
- mp->b_prev = NULL;
- mp->b_next = NULL;
-
- /*
- * If the IPsec packet was processed asynchronously,
- * drop it now.
- */
- if (q == NULL) {
- if (ill_need_rele)
- ill_refrele(ill);
- freemsg(ipsec_mp);
- return;
- }
-
- ip_newroute_v6(q, ipsec_mp, v6dstp, &ip6h->ip6_src, ill,
- zoneid, ipst);
- }
- if (ill != NULL && ill_need_rele)
- ill_refrele(ill);
- return;
-send:
- if (ill != NULL && ill_need_rele)
- ill_refrele(ill);
-
- /* Local delivery */
- if (ire->ire_stq == NULL) {
- ill_t *out_ill;
- ASSERT(q != NULL);
-
- /* PFHooks: LOOPBACK_OUT */
- out_ill = ire_to_ill(ire);
-
+ DB_CKSUMSTART(mp1) = DB_CKSUMSTART(mp);
+ DB_CKSUMSTUFF(mp1) = DB_CKSUMSTUFF(mp);
+ DB_CKSUMEND(mp1) = DB_CKSUMEND(mp);
+ DB_CKSUMFLAGS(mp1) = DB_CKSUMFLAGS(mp);
+ DB_LSOMSS(mp1) = DB_LSOMSS(mp);
+ DTRACE_PROBE1(ip__xmit__copyb, (mblk_t *), mp1);
/*
- * DTrace this as ip:::send. A blocked packet will fire the
- * send probe, but not the receive probe.
+ * XXX disable ICK_VALID and compute checksum
+ * here; can happen if nce_fp_mp changes and
+ * it can't be copied now due to insufficient
+ * space. (unlikely, fp mp can change, but it
+ * does not increase in length)
*/
- DTRACE_IP7(send, mblk_t *, ipsec_mp, conn_t *, NULL,
- void_ip_t *, ip6h, __dtrace_ipsr_ill_t *, out_ill,
- ipha_t *, NULL, ip6_t *, ip6h, int, 1);
-
- DTRACE_PROBE4(ip6__loopback__out__start,
- ill_t *, NULL, ill_t *, out_ill,
- ip6_t *, ip6h1, mblk_t *, ipsec_mp);
-
- FW_HOOKS6(ipst->ips_ip6_loopback_out_event,
- ipst->ips_ipv6firewall_loopback_out,
- NULL, out_ill, ip6h1, ipsec_mp, mp, 0, ipst);
-
- DTRACE_PROBE1(ip6__loopback__out__end, mblk_t *, ipsec_mp);
-
- if (ipsec_mp != NULL) {
- ip_wput_local_v6(RD(q), out_ill,
- ip6h, ipsec_mp, ire, 0, zoneid);
- }
- if (ire_need_rele)
- ire_refrele(ire);
- return;
- }
- /*
- * Everything is done. Send it out on the wire.
- * We force the insertion of a fragment header using the
- * IPH_FRAG_HDR flag in two cases:
- * - after reception of an ICMPv6 "packet too big" message
- * with a MTU < 1280 (cf. RFC 2460 section 5)
- * - for multirouted IPv6 packets, so that the receiver can
- * discard duplicates according to their fragment identifier
- */
- /* XXX fix flow control problems. */
- if (ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN > ire->ire_max_frag ||
- (ire->ire_frag_flag & IPH_FRAG_HDR)) {
- if (hwaccel) {
- /*
- * hardware acceleration does not handle these
- * "slow path" cases.
- */
- /* IPsec KSTATS: should bump bean counter here. */
- if (ire_need_rele)
- ire_refrele(ire);
- freemsg(ipsec_mp);
- return;
- }
- if (ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN !=
- (mp->b_cont ? msgdsize(mp) :
- mp->b_wptr - (uchar_t *)ip6h)) {
- /* IPsec KSTATS: should bump bean counter here. */
- ip0dbg(("Packet length mismatch: %d, %ld\n",
- ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN,
- msgdsize(mp)));
- if (ire_need_rele)
- ire_refrele(ire);
- freemsg(ipsec_mp);
- return;
- }
- ASSERT(mp->b_prev == NULL);
- ip2dbg(("Fragmenting Size = %d, mtu = %d\n",
- ntohs(ip6h->ip6_plen) +
- IPV6_HDR_LEN, ire->ire_max_frag));
- ip_wput_frag_v6(mp, ire, flags, NULL, B_FALSE,
- ire->ire_max_frag);
- } else {
- UPDATE_OB_PKT_COUNT(ire);
- ire->ire_last_used_time = lbolt;
- ip_xmit_v6(mp, ire, flags, NULL, B_FALSE, hwaccel ? io : NULL);
+ return (mp1);
}
- if (ire_need_rele)
- ire_refrele(ire);
- freeb(ipsec_mp);
-}
+ mp1 = copyb(nce->nce_dlur_mp);
-void
-ipsec_hw_putnext(queue_t *q, mblk_t *mp)
-{
- mblk_t *hada_mp; /* attributes M_CTL mblk */
- da_ipsec_t *hada; /* data attributes */
- ill_t *ill = (ill_t *)q->q_ptr;
-
- IPSECHW_DEBUG(IPSECHW_PKT, ("ipsec_hw_putnext: accelerated packet\n"));
+ if (mp1 == NULL) {
+ ill_t *ill = nce->nce_ill;
- if ((ill->ill_capabilities & (ILL_CAPAB_AH | ILL_CAPAB_ESP)) == 0) {
- /* IPsec KSTATS: Bump lose counter here! */
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
+ ip_drop_output("ipIfStatsOutDiscards", mp, ill);
freemsg(mp);
- return;
+ return (NULL);
}
-
- /*
- * It's an IPsec packet that must be
- * accelerated by the Provider, and the
- * outbound ill is IPsec acceleration capable.
- * Prepends the mblk with an IPHADA_M_CTL, and ship it
- * to the ill.
- * IPsec KSTATS: should bump packet counter here.
- */
-
- hada_mp = allocb(sizeof (da_ipsec_t), BPRI_HI);
- if (hada_mp == NULL) {
- /* IPsec KSTATS: should bump packet counter here. */
- freemsg(mp);
- return;
+ mp1->b_cont = mp;
+ if (priority != 0) {
+ mp1->b_band = priority;
+ ((dl_unitdata_req_t *)(mp1->b_rptr))->dl_priority.dl_max =
+ priority;
}
-
- hada_mp->b_datap->db_type = M_CTL;
- hada_mp->b_wptr = hada_mp->b_rptr + sizeof (*hada);
- hada_mp->b_cont = mp;
-
- hada = (da_ipsec_t *)hada_mp->b_rptr;
- bzero(hada, sizeof (da_ipsec_t));
- hada->da_type = IPHADA_M_CTL;
-
- putnext(q, hada_mp);
+ return (mp1);
+#undef rptr
}
/*
* Finish the outbound IPsec processing. This function is called from
* ipsec_out_process() if the IPsec packet was processed
- * synchronously, or from {ah,esp}_kcf_callback() if it was processed
+ * synchronously, or from {ah,esp}_kcf_callback_outbound() if it was processed
* asynchronously.
+ *
+ * This is common to IPv4 and IPv6.
*/
-void
-ip_wput_ipsec_out(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha, ill_t *ill,
- ire_t *ire_arg)
+int
+ip_output_post_ipsec(mblk_t *mp, ip_xmit_attr_t *ixa)
{
- uint32_t v_hlen_tos_len;
- ipaddr_t dst;
- ipif_t *ipif = NULL;
- ire_t *ire;
- ire_t *ire1 = NULL;
- mblk_t *next_mp = NULL;
- uint32_t max_frag;
- boolean_t multirt_send = B_FALSE;
- mblk_t *mp;
- ipha_t *ipha1;
- uint_t ill_index;
- ipsec_out_t *io;
- int match_flags;
- irb_t *irb = NULL;
- boolean_t ill_need_rele = B_FALSE, ire_need_rele = B_TRUE;
- zoneid_t zoneid;
- ipxmit_state_t pktxmit_state;
- ip_stack_t *ipst;
-
-#ifdef _BIG_ENDIAN
-#define LENGTH (v_hlen_tos_len & 0xFFFF)
-#else
-#define LENGTH ((v_hlen_tos_len >> 24) | ((v_hlen_tos_len >> 8) & 0xFF00))
-#endif
+ iaflags_t ixaflags = ixa->ixa_flags;
+ uint_t pktlen;
- mp = ipsec_mp->b_cont;
- ipha1 = (ipha_t *)mp->b_rptr;
- ASSERT(mp != NULL);
- v_hlen_tos_len = ((uint32_t *)ipha)[0];
- dst = ipha->ipha_dst;
- io = (ipsec_out_t *)ipsec_mp->b_rptr;
- ill_index = io->ipsec_out_ill_index;
- zoneid = io->ipsec_out_zoneid;
- ASSERT(zoneid != ALL_ZONES);
- ipst = io->ipsec_out_ns->netstack_ip;
- ASSERT(io->ipsec_out_ns != NULL);
-
- match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR;
- if (ill == NULL && ill_index != 0) {
- ill = ip_grab_ill(ipsec_mp, ill_index, B_FALSE, ipst);
- /* Failure case frees things for us. */
- if (ill == NULL)
- return;
+ /* AH/ESP don't update ixa_pktlen when they modify the packet */
+ if (ixaflags & IXAF_IS_IPV4) {
+ ipha_t *ipha = (ipha_t *)mp->b_rptr;
- ill_need_rele = B_TRUE;
- }
-
- if (CLASSD(dst)) {
- boolean_t conn_dontroute;
- /*
- * Use the ill_index to get the right ipif.
- */
- conn_dontroute = io->ipsec_out_dontroute;
- if (ill_index == 0)
- ipif = ipif_lookup_group(dst, zoneid, ipst);
- else
- (void) ipif_lookup_zoneid(ill, zoneid, 0, &ipif);
- if (ipif == NULL) {
- ip1dbg(("ip_wput_ipsec_out: No ipif for"
- " multicast\n"));
- BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes);
- freemsg(ipsec_mp);
- goto done;
- }
- /*
- * ipha_src has already been intialized with the
- * value of the ipif in ip_wput. All we need now is
- * an ire to send this downstream.
- */
- ire = ire_ctable_lookup(dst, 0, 0, ipif, zoneid,
- msg_getlabel(mp), match_flags, ipst);
- if (ire != NULL) {
- ill_t *ill1;
- /*
- * Do the multicast forwarding now, as the IPsec
- * processing has been done.
- */
- if (ipst->ips_ip_g_mrouter && !conn_dontroute &&
- (ill1 = ire_to_ill(ire))) {
- if (ip_mforward(ill1, ipha, mp)) {
- freemsg(ipsec_mp);
- ip1dbg(("ip_wput_ipsec_out: mforward "
- "failed\n"));
- ire_refrele(ire);
- goto done;
- }
- }
- goto send;
- }
-
- ip0dbg(("ip_wput_ipsec_out: multicast: IRE disappeared\n"));
- mp->b_prev = NULL;
- mp->b_next = NULL;
-
- /*
- * If the IPsec packet was processed asynchronously,
- * drop it now.
- */
- if (q == NULL) {
- freemsg(ipsec_mp);
- goto done;
- }
-
- /*
- * We may be using a wrong ipif to create the ire.
- * But it is okay as the source address is assigned
- * for the packet already. Next outbound packet would
- * create the IRE with the right IPIF in ip_wput.
- *
- * Also handle RTF_MULTIRT routes.
- */
- ip_newroute_ipif(q, ipsec_mp, ipif, dst, NULL, RTF_MULTIRT,
- zoneid, &zero_info);
+ ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);
+ pktlen = ntohs(ipha->ipha_length);
} else {
- if (ire_arg != NULL) {
- ire = ire_arg;
- ire_need_rele = B_FALSE;
- } else {
- ire = ire_cache_lookup(dst, zoneid,
- msg_getlabel(mp), ipst);
- }
- if (ire != NULL) {
- goto send;
- }
-
- /*
- * ire disappeared underneath.
- *
- * What we need to do here is the ip_newroute
- * logic to get the ire without doing the IPsec
- * processing. Follow the same old path. But this
- * time, ip_wput or ire_add_then_put will call us
- * directly as all the IPsec operations are done.
- */
- ip1dbg(("ip_wput_ipsec_out: IRE disappeared\n"));
- mp->b_prev = NULL;
- mp->b_next = NULL;
+ ip6_t *ip6h = (ip6_t *)mp->b_rptr;
- /*
- * If the IPsec packet was processed asynchronously,
- * drop it now.
- */
- if (q == NULL) {
- freemsg(ipsec_mp);
- goto done;
- }
-
- /*
- * Since we're going through ip_newroute() again, we
- * need to make sure we don't:
- *
- * 1.) Trigger the ASSERT() with the ipha_ident
- * overloading.
- * 2.) Redo transport-layer checksumming, since we've
- * already done all that to get this far.
- *
- * The easiest way not do either of the above is to set
- * the ipha_ident field to IP_HDR_INCLUDED.
- */
- ipha->ipha_ident = IP_HDR_INCLUDED;
- ip_newroute(q, ipsec_mp, dst, (CONN_Q(q) ? Q_TO_CONN(q) : NULL),
- zoneid, ipst);
- }
- goto done;
-send:
- if (ire->ire_stq == NULL) {
- ill_t *out_ill;
- /*
- * Loopbacks go through ip_wput_local except for one case.
- * We come here if we generate a icmp_frag_needed message
- * after IPsec processing is over. When this function calls
- * ip_wput_ire_fragmentit, ip_wput_frag might end up calling
- * icmp_frag_needed. The message generated comes back here
- * through icmp_frag_needed -> icmp_pkt -> ip_wput ->
- * ipsec_out_process -> ip_wput_ipsec_out. We need to set the
- * source address as it is usually set in ip_wput_ire. As
- * ipsec_out_proc_begin is set, ip_wput calls ipsec_out_process
- * and we end up here. We can't enter ip_wput_ire once the
- * IPsec processing is over and hence we need to do it here.
- */
- ASSERT(q != NULL);
- UPDATE_OB_PKT_COUNT(ire);
- ire->ire_last_used_time = lbolt;
- if (ipha->ipha_src == 0)
- ipha->ipha_src = ire->ire_src_addr;
-
- /* PFHooks: LOOPBACK_OUT */
- out_ill = ire_to_ill(ire);
-
- /*
- * DTrace this as ip:::send. A blocked packet will fire the
- * send probe, but not the receive probe.
- */
- DTRACE_IP7(send, mblk_t *, ipsec_mp, conn_t *, NULL,
- void_ip_t *, ipha, __dtrace_ipsr_ill_t *, out_ill,
- ipha_t *, ipha, ip6_t *, NULL, int, 1);
-
- DTRACE_PROBE4(ip4__loopback__out__start,
- ill_t *, NULL, ill_t *, out_ill,
- ipha_t *, ipha1, mblk_t *, ipsec_mp);
-
- FW_HOOKS(ipst->ips_ip4_loopback_out_event,
- ipst->ips_ipv4firewall_loopback_out,
- NULL, out_ill, ipha1, ipsec_mp, mp, 0, ipst);
-
- DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, ipsec_mp);
-
- if (ipsec_mp != NULL)
- ip_wput_local(RD(q), out_ill,
- ipha, ipsec_mp, ire, 0, zoneid);
- if (ire_need_rele)
- ire_refrele(ire);
- goto done;
+ ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV6_VERSION);
+ pktlen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
}
- if (ire->ire_max_frag < (unsigned int)LENGTH) {
- /*
- * We are through with IPsec processing.
- * Fragment this and send it on the wire.
- */
- if (io->ipsec_out_accelerated) {
- /*
- * The packet has been accelerated but must
- * be fragmented. This should not happen
- * since AH and ESP must not accelerate
- * packets that need fragmentation, however
- * the configuration could have changed
- * since the AH or ESP processing.
- * Drop packet.
- * IPsec KSTATS: bump bean counter here.
- */
- IPSECHW_DEBUG(IPSECHW_PKT, ("ipsec_wput_ipsec_out: "
- "fragmented accelerated packet!\n"));
- freemsg(ipsec_mp);
- } else {
- ip_wput_ire_fragmentit(ipsec_mp, ire,
- zoneid, ipst, NULL);
- }
- if (ire_need_rele)
- ire_refrele(ire);
- goto done;
- }
-
- ip2dbg(("ip_wput_ipsec_out: ipsec_mp %p, ire %p, ire_ipif %p, "
- "ipif %p\n", (void *)ipsec_mp, (void *)ire,
- (void *)ire->ire_ipif, (void *)ipif));
-
/*
- * Multiroute the secured packet.
+ * We release any hard reference on the SAs here to make
+ * sure the SAs can be garbage collected. ipsr_sa has a soft reference
+ * on the SAs.
+ * If in the future we want the hard latching of the SAs in the
+ * ip_xmit_attr_t then we should remove this.
*/
- if (ire->ire_flags & RTF_MULTIRT) {
- ire_t *first_ire;
- irb = ire->ire_bucket;
- ASSERT(irb != NULL);
- /*
- * This ire has been looked up as the one that
- * goes through the given ipif;
- * make sure we do not omit any other multiroute ire
- * that may be present in the bucket before this one.
- */
- IRB_REFHOLD(irb);
- for (first_ire = irb->irb_ire;
- first_ire != NULL;
- first_ire = first_ire->ire_next) {
- if ((first_ire->ire_flags & RTF_MULTIRT) &&
- (first_ire->ire_addr == ire->ire_addr) &&
- !(first_ire->ire_marks &
- (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN)))
- break;
- }
-
- if ((first_ire != NULL) && (first_ire != ire)) {
- /*
- * Don't change the ire if the packet must
- * be fragmented if sent via this new one.
- */
- if (first_ire->ire_max_frag >= (unsigned int)LENGTH) {
- IRE_REFHOLD(first_ire);
- if (ire_need_rele)
- ire_refrele(ire);
- else
- ire_need_rele = B_TRUE;
- ire = first_ire;
- }
- }
- IRB_REFRELE(irb);
-
- multirt_send = B_TRUE;
- max_frag = ire->ire_max_frag;
+ if (ixa->ixa_ipsec_esp_sa != NULL) {
+ IPSA_REFRELE(ixa->ixa_ipsec_esp_sa);
+ ixa->ixa_ipsec_esp_sa = NULL;
+ }
+ if (ixa->ixa_ipsec_ah_sa != NULL) {
+ IPSA_REFRELE(ixa->ixa_ipsec_ah_sa);
+ ixa->ixa_ipsec_ah_sa = NULL;
}
- /*
- * In most cases, the emission loop below is entered only once.
- * Only in the case where the ire holds the RTF_MULTIRT
- * flag, we loop to process all RTF_MULTIRT ires in the
- * bucket, and send the packet through all crossed
- * RTF_MULTIRT routes.
- */
- do {
- if (multirt_send) {
+ /* Do we need to fragment? */
+ if ((ixa->ixa_flags & IXAF_IPV6_ADD_FRAGHDR) ||
+ pktlen > ixa->ixa_fragsize) {
+ if (ixaflags & IXAF_IS_IPV4) {
+ ASSERT(!(ixa->ixa_flags & IXAF_IPV6_ADD_FRAGHDR));
/*
- * ire1 holds here the next ire to process in the
- * bucket. If multirouting is expected,
- * any non-RTF_MULTIRT ire that has the
- * right destination address is ignored.
+ * We check for the DF case in ipsec_out_process
+ * hence this only handles the non-DF case.
*/
- ASSERT(irb != NULL);
- IRB_REFHOLD(irb);
- for (ire1 = ire->ire_next;
- ire1 != NULL;
- ire1 = ire1->ire_next) {
- if ((ire1->ire_flags & RTF_MULTIRT) == 0)
- continue;
- if (ire1->ire_addr != ire->ire_addr)
- continue;
- if (ire1->ire_marks &
- (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN))
- continue;
- /* No loopback here */
- if (ire1->ire_stq == NULL)
- continue;
- /*
- * Ensure we do not exceed the MTU
- * of the next route.
- */
- if (ire1->ire_max_frag < (unsigned int)LENGTH) {
- ip_multirt_bad_mtu(ire1, max_frag);
- continue;
- }
-
- IRE_REFHOLD(ire1);
- break;
- }
- IRB_REFRELE(irb);
- if (ire1 != NULL) {
- /*
- * We are in a multiple send case, need to
- * make a copy of the packet.
- */
- next_mp = copymsg(ipsec_mp);
- if (next_mp == NULL) {
- ire_refrele(ire1);
- ire1 = NULL;
- }
+ return (ip_fragment_v4(mp, ixa->ixa_nce, ixa->ixa_flags,
+ pktlen, ixa->ixa_fragsize,
+ ixa->ixa_xmit_hint, ixa->ixa_zoneid,
+ ixa->ixa_no_loop_zoneid, ixa->ixa_postfragfn,
+ &ixa->ixa_cookie));
+ } else {
+ mp = ip_fraghdr_add_v6(mp, ixa->ixa_ident, ixa);
+ if (mp == NULL) {
+ /* MIB and ip_drop_output already done */
+ return (ENOMEM);
}
- }
- /*
- * Everything is done. Send it out on the wire
- *
- * ip_xmit_v4 will call ip_wput_attach_llhdr and then
- * either send it on the wire or, in the case of
- * HW acceleration, call ipsec_hw_putnext.
- */
- if (ire->ire_nce &&
- ire->ire_nce->nce_state != ND_REACHABLE) {
- DTRACE_PROBE2(ip__wput__ipsec__bail,
- (ire_t *), ire, (mblk_t *), ipsec_mp);
- /*
- * If ire's link-layer is unresolved (this
- * would only happen if the incomplete ire
- * was added to cachetable via forwarding path)
- * don't bother going to ip_xmit_v4. Just drop the
- * packet.
- * There is a slight risk here, in that, if we
- * have the forwarding path create an incomplete
- * IRE, then until the IRE is completed, any
- * transmitted IPsec packets will be dropped
- * instead of being queued waiting for resolution.
- *
- * But the likelihood of a forwarding packet and a wput
- * packet sending to the same dst at the same time
- * and there not yet be an ARP entry for it is small.
- * Furthermore, if this actually happens, it might
- * be likely that wput would generate multiple
- * packets (and forwarding would also have a train
- * of packets) for that destination. If this is
- * the case, some of them would have been dropped
- * anyway, since ARP only queues a few packets while
- * waiting for resolution
- *
- * NOTE: We should really call ip_xmit_v4,
- * and let it queue the packet and send the
- * ARP query and have ARP come back thus:
- * <ARP> ip_wput->ip_output->ip-wput_nondata->
- * ip_xmit_v4->ip_wput_attach_llhdr + ipsec
- * hw accel work. But it's too complex to get
- * the IPsec hw acceleration approach to fit
- * well with ip_xmit_v4 doing ARP without
- * doing IPsec simplification. For now, we just
- * poke ip_xmit_v4 to trigger the arp resolve, so
- * that we can continue with the send on the next
- * attempt.
- *
- * XXX THis should be revisited, when
- * the IPsec/IP interaction is cleaned up
- */
- ip1dbg(("ip_wput_ipsec_out: ire is incomplete"
- " - dropping packet\n"));
- freemsg(ipsec_mp);
- /*
- * Call ip_xmit_v4() to trigger ARP query
- * in case the nce_state is ND_INITIAL
- */
- (void) ip_xmit_v4(NULL, ire, NULL, B_FALSE, NULL);
- goto drop_pkt;
- }
-
- DTRACE_PROBE4(ip4__physical__out__start, ill_t *, NULL,
- ill_t *, ire->ire_ipif->ipif_ill, ipha_t *, ipha1,
- mblk_t *, ipsec_mp);
- FW_HOOKS(ipst->ips_ip4_physical_out_event,
- ipst->ips_ipv4firewall_physical_out, NULL,
- ire->ire_ipif->ipif_ill, ipha1, ipsec_mp, mp, 0, ipst);
- DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, ipsec_mp);
- if (ipsec_mp == NULL)
- goto drop_pkt;
-
- ip1dbg(("ip_wput_ipsec_out: calling ip_xmit_v4\n"));
- pktxmit_state = ip_xmit_v4(mp, ire,
- (io->ipsec_out_accelerated ? io : NULL), B_FALSE, NULL);
-
- if ((pktxmit_state == SEND_FAILED) ||
- (pktxmit_state == LLHDR_RESLV_FAILED)) {
-
- freeb(ipsec_mp); /* ip_xmit_v4 frees the mp */
-drop_pkt:
- BUMP_MIB(((ill_t *)ire->ire_stq->q_ptr)->ill_ip_mib,
- ipIfStatsOutDiscards);
- if (ire_need_rele)
- ire_refrele(ire);
- if (ire1 != NULL) {
- ire_refrele(ire1);
- freemsg(next_mp);
+ pktlen += sizeof (ip6_frag_t);
+ if (pktlen > ixa->ixa_fragsize) {
+ return (ip_fragment_v6(mp, ixa->ixa_nce,
+ ixa->ixa_flags, pktlen,
+ ixa->ixa_fragsize, ixa->ixa_xmit_hint,
+ ixa->ixa_zoneid, ixa->ixa_no_loop_zoneid,
+ ixa->ixa_postfragfn, &ixa->ixa_cookie));
}
- goto done;
}
-
- freeb(ipsec_mp);
- if (ire_need_rele)
- ire_refrele(ire);
-
- if (ire1 != NULL) {
- ire = ire1;
- ire_need_rele = B_TRUE;
- ASSERT(next_mp);
- ipsec_mp = next_mp;
- mp = ipsec_mp->b_cont;
- ire1 = NULL;
- next_mp = NULL;
- io = (ipsec_out_t *)ipsec_mp->b_rptr;
- } else {
- multirt_send = B_FALSE;
- }
- } while (multirt_send);
-done:
- if (ill != NULL && ill_need_rele)
- ill_refrele(ill);
- if (ipif != NULL)
- ipif_refrele(ipif);
+ }
+ return ((ixa->ixa_postfragfn)(mp, ixa->ixa_nce, ixa->ixa_flags,
+ pktlen, ixa->ixa_xmit_hint, ixa->ixa_zoneid,
+ ixa->ixa_no_loop_zoneid, NULL));
}
/*
- * Get the ill corresponding to the specified ire, and compare its
- * capabilities with the protocol and algorithms specified by the
- * the SA obtained from ipsec_out. If they match, annotate the
- * ipsec_out structure to indicate that the packet needs acceleration.
- *
- *
- * A packet is eligible for outbound hardware acceleration if the
- * following conditions are satisfied:
- *
- * 1. the packet will not be fragmented
- * 2. the provider supports the algorithm
- * 3. there is no pending control message being exchanged
- * 4. snoop is not attached
- * 5. the destination address is not a broadcast or multicast address.
- *
- * Rationale:
- * - Hardware drivers do not support fragmentation with
- * the current interface.
- * - snoop, multicast, and broadcast may result in exposure of
- * a cleartext datagram.
- * We check all five of these conditions here.
 * Finish the inbound IPsec processing. This function is called from
 * the inbound IPsec processing path if the IPsec packet was processed
 * synchronously, or from {ah,esp}_kcf_callback_inbound() if it was processed
 * asynchronously.
*
- * XXX would like to nuke "ire_t *" parameter here; problem is that
- * IRE is only way to figure out if a v4 address is a broadcast and
- * thus ineligible for acceleration...
+ * This is common to IPv4 and IPv6.
*/
-static void
-ipsec_out_is_accelerated(mblk_t *ipsec_mp, ipsa_t *sa, ill_t *ill, ire_t *ire)
+void
+ip_input_post_ipsec(mblk_t *mp, ip_recv_attr_t *ira)
{
- ipsec_out_t *io;
- mblk_t *data_mp;
- uint_t plen, overhead;
- ip_stack_t *ipst;
- phyint_t *phyint;
-
- if ((sa->ipsa_flags & IPSA_F_HW) == 0)
- return;
-
- if (ill == NULL)
- return;
- ipst = ill->ill_ipst;
- phyint = ill->ill_phyint;
-
- /*
- * Destination address is a broadcast or multicast. Punt.
- */
- if ((ire != NULL) && (ire->ire_type & (IRE_BROADCAST|IRE_LOOPBACK|
- IRE_LOCAL)))
- return;
-
- data_mp = ipsec_mp->b_cont;
+ iaflags_t iraflags = ira->ira_flags;
- if (ill->ill_isv6) {
- ip6_t *ip6h = (ip6_t *)data_mp->b_rptr;
+ /* Length might have changed */
+ if (iraflags & IRAF_IS_IPV4) {
+ ipha_t *ipha = (ipha_t *)mp->b_rptr;
- if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst))
- return;
+ ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);
+ ira->ira_pktlen = ntohs(ipha->ipha_length);
+ ira->ira_ip_hdr_length = IPH_HDR_LENGTH(ipha);
+ ira->ira_protocol = ipha->ipha_protocol;
- plen = ip6h->ip6_plen;
+ ip_fanout_v4(mp, ipha, ira);
} else {
- ipha_t *ipha = (ipha_t *)data_mp->b_rptr;
-
- if (CLASSD(ipha->ipha_dst))
+ ip6_t *ip6h = (ip6_t *)mp->b_rptr;
+ uint8_t *nexthdrp;
+
+ ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV6_VERSION);
+ ira->ira_pktlen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
+ if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &ira->ira_ip_hdr_length,
+ &nexthdrp)) {
+ /* Malformed packet */
+ BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards", mp, ira->ira_ill);
+ freemsg(mp);
return;
-
- plen = ipha->ipha_length;
- }
- /*
- * Is there a pending DLPI control message being exchanged
- * between IP/IPsec and the DLS Provider? If there is, it
- * could be a SADB update, and the state of the DLS Provider
- * SADB might not be in sync with the SADB maintained by
- * IPsec. To avoid dropping packets or using the wrong keying
- * material, we do not accelerate this packet.
- */
- if (ill->ill_dlpi_pending != DL_PRIM_INVAL) {
- IPSECHW_DEBUG(IPSECHW_PKT, ("ipsec_out_check_is_accelerated: "
- "ill_dlpi_pending! don't accelerate packet\n"));
- return;
- }
-
- /*
- * Is the Provider in promiscous mode? If it does, we don't
- * accelerate the packet since it will bounce back up to the
- * listeners in the clear.
- */
- if (phyint->phyint_flags & PHYI_PROMISC) {
- IPSECHW_DEBUG(IPSECHW_PKT, ("ipsec_out_check_is_accelerated: "
- "ill in promiscous mode, don't accelerate packet\n"));
- return;
- }
-
- /*
- * Will the packet require fragmentation?
- */
-
- /*
- * IPsec ESP note: this is a pessimistic estimate, but the same
- * as is used elsewhere.
- * SPI + sequence + MAC + IV(blocksize) + padding(blocksize-1)
- * + 2-byte trailer
- */
- overhead = (sa->ipsa_type == SADB_SATYPE_AH) ? IPSEC_MAX_AH_HDR_SIZE :
- IPSEC_BASE_ESP_HDR_SIZE(sa);
-
- if ((plen + overhead) > ill->ill_max_mtu)
- return;
-
- io = (ipsec_out_t *)ipsec_mp->b_rptr;
-
- /*
- * Can the ill accelerate this IPsec protocol and algorithm
- * specified by the SA?
- */
- if (!ipsec_capab_match(ill, io->ipsec_out_capab_ill_index,
- ill->ill_isv6, sa, ipst->ips_netstack)) {
- return;
+ }
+ ira->ira_protocol = *nexthdrp;
+ ip_fanout_v6(mp, ip6h, ira);
}
-
- /*
- * Tell AH or ESP that the outbound ill is capable of
- * accelerating this packet.
- */
- io->ipsec_out_is_capab_ill = B_TRUE;
}
/*
* Select which AH & ESP SA's to use (if any) for the outbound packet.
*
* If this function returns B_TRUE, the requested SA's have been filled
- * into the ipsec_out_*_sa pointers.
+ * into the ixa_ipsec_*_sa pointers.
*
* If the function returns B_FALSE, the packet has been "consumed", most
* likely by an ACQUIRE sent up via PF_KEY to a key management daemon.
*
* The SA references created by the protocol-specific "select"
- * function will be released when the ipsec_mp is freed, thanks to the
- * ipsec_out_free destructor -- see spd.c.
+ * function will be released in ip_output_post_ipsec.
*/
static boolean_t
-ipsec_out_select_sa(mblk_t *ipsec_mp)
+ipsec_out_select_sa(mblk_t *mp, ip_xmit_attr_t *ixa)
{
boolean_t need_ah_acquire = B_FALSE, need_esp_acquire = B_FALSE;
- ipsec_out_t *io;
ipsec_policy_t *pp;
ipsec_action_t *ap;
- io = (ipsec_out_t *)ipsec_mp->b_rptr;
- ASSERT(io->ipsec_out_type == IPSEC_OUT);
- ASSERT(io->ipsec_out_len == sizeof (ipsec_out_t));
- if (!io->ipsec_out_secure) {
- /*
- * We came here by mistake.
- * Don't bother with ipsec processing
- * We should "discourage" this path in the future.
- */
- ASSERT(io->ipsec_out_proc_begin == B_FALSE);
- return (B_FALSE);
- }
- ASSERT(io->ipsec_out_need_policy == B_FALSE);
- ASSERT((io->ipsec_out_policy != NULL) ||
- (io->ipsec_out_act != NULL));
+ ASSERT(ixa->ixa_flags & IXAF_IPSEC_SECURE);
+ ASSERT((ixa->ixa_ipsec_policy != NULL) ||
+ (ixa->ixa_ipsec_action != NULL));
- ASSERT(io->ipsec_out_failed == B_FALSE);
-
- /*
- * IPsec processing has started.
- */
- io->ipsec_out_proc_begin = B_TRUE;
- ap = io->ipsec_out_act;
+ ap = ixa->ixa_ipsec_action;
if (ap == NULL) {
- pp = io->ipsec_out_policy;
+ pp = ixa->ixa_ipsec_policy;
ASSERT(pp != NULL);
ap = pp->ipsp_act;
ASSERT(ap != NULL);
@@ -26438,22 +12355,23 @@ ipsec_out_select_sa(mblk_t *ipsec_mp)
/*
* We have an action. now, let's select SA's.
- * (In the future, we can cache this in the conn_t..)
+ * A side effect of setting ixa_ipsec_*_sa is that it will
+ * be cached in the conn_t.
*/
if (ap->ipa_want_esp) {
- if (io->ipsec_out_esp_sa == NULL) {
- need_esp_acquire = !ipsec_outbound_sa(ipsec_mp,
+ if (ixa->ixa_ipsec_esp_sa == NULL) {
+ need_esp_acquire = !ipsec_outbound_sa(mp, ixa,
IPPROTO_ESP);
}
- ASSERT(need_esp_acquire || io->ipsec_out_esp_sa != NULL);
+ ASSERT(need_esp_acquire || ixa->ixa_ipsec_esp_sa != NULL);
}
if (ap->ipa_want_ah) {
- if (io->ipsec_out_ah_sa == NULL) {
- need_ah_acquire = !ipsec_outbound_sa(ipsec_mp,
+ if (ixa->ixa_ipsec_ah_sa == NULL) {
+ need_ah_acquire = !ipsec_outbound_sa(mp, ixa,
IPPROTO_AH);
}
- ASSERT(need_ah_acquire || io->ipsec_out_ah_sa != NULL);
+ ASSERT(need_ah_acquire || ixa->ixa_ipsec_ah_sa != NULL);
/*
* The ESP and AH processing order needs to be preserved
* when both protocols are required (ESP should be applied
@@ -26471,16 +12389,16 @@ ipsec_out_select_sa(mblk_t *ipsec_mp)
* acquire _all_ of the SAs we need.
*/
if (need_ah_acquire || need_esp_acquire) {
- if (io->ipsec_out_ah_sa != NULL) {
- IPSA_REFRELE(io->ipsec_out_ah_sa);
- io->ipsec_out_ah_sa = NULL;
+ if (ixa->ixa_ipsec_ah_sa != NULL) {
+ IPSA_REFRELE(ixa->ixa_ipsec_ah_sa);
+ ixa->ixa_ipsec_ah_sa = NULL;
}
- if (io->ipsec_out_esp_sa != NULL) {
- IPSA_REFRELE(io->ipsec_out_esp_sa);
- io->ipsec_out_esp_sa = NULL;
+ if (ixa->ixa_ipsec_esp_sa != NULL) {
+ IPSA_REFRELE(ixa->ixa_ipsec_esp_sa);
+ ixa->ixa_ipsec_esp_sa = NULL;
}
- sadb_acquire(ipsec_mp, io, need_ah_acquire, need_esp_acquire);
+ sadb_acquire(mp, ixa, need_ah_acquire, need_esp_acquire);
return (B_FALSE);
}
@@ -26488,110 +12406,64 @@ ipsec_out_select_sa(mblk_t *ipsec_mp)
}
/*
- * Process an IPSEC_OUT message and see what you can
- * do with it.
- * IPQoS Notes:
- * We do IPPF processing if IPP_LOCAL_OUT is enabled before processing for
- * IPsec.
- * XXX would like to nuke ire_t.
- * XXX ill_index better be "real"
+ * Handle IPsec output processing.
+ * This function is only entered once for a given packet.
+ * We try to do things synchronously, but if we need to have user-level
+ * set up SAs, or ESP or AH uses asynchronous kEF, then the operation
+ * will be completed
+ * - when the SAs are added in esp_add_sa_finish/ah_add_sa_finish
+ * - when asynchronous ESP is done it will do AH
+ *
+ * In all cases we come back in ip_output_post_ipsec() to fragment and
+ * send out the packet.
*/
-void
-ipsec_out_process(queue_t *q, mblk_t *ipsec_mp, ire_t *ire, uint_t ill_index)
+int
+ipsec_out_process(mblk_t *mp, ip_xmit_attr_t *ixa)
{
- ipsec_out_t *io;
- ipsec_policy_t *pp;
- ipsec_action_t *ap;
- ipha_t *ipha;
- ip6_t *ip6h;
- mblk_t *mp;
- ill_t *ill;
- zoneid_t zoneid;
- ipsec_status_t ipsec_rc;
- boolean_t ill_need_rele = B_FALSE;
- ip_stack_t *ipst;
+ ill_t *ill = ixa->ixa_nce->nce_ill;
+ ip_stack_t *ipst = ixa->ixa_ipst;
ipsec_stack_t *ipss;
+ ipsec_policy_t *pp;
+ ipsec_action_t *ap;
- io = (ipsec_out_t *)ipsec_mp->b_rptr;
- ASSERT(io->ipsec_out_type == IPSEC_OUT);
- ASSERT(io->ipsec_out_len == sizeof (ipsec_out_t));
- ipst = io->ipsec_out_ns->netstack_ip;
- mp = ipsec_mp->b_cont;
-
- /*
- * Initiate IPPF processing. We do it here to account for packets
- * coming here that don't have any policy (i.e. !io->ipsec_out_secure).
- * We can check for ipsec_out_proc_begin even for such packets, as
- * they will always be false (asserted below).
- */
- if (IPP_ENABLED(IPP_LOCAL_OUT, ipst) && !io->ipsec_out_proc_begin) {
- ip_process(IPP_LOCAL_OUT, &mp, io->ipsec_out_ill_index != 0 ?
- io->ipsec_out_ill_index : ill_index);
- if (mp == NULL) {
- ip2dbg(("ipsec_out_process: packet dropped "\
- "during IPPF processing\n"));
- freeb(ipsec_mp);
- BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
- return;
- }
- }
+ ASSERT(ixa->ixa_flags & IXAF_IPSEC_SECURE);
- if (!io->ipsec_out_secure) {
- /*
- * We came here by mistake.
- * Don't bother with ipsec processing
- * Should "discourage" this path in the future.
- */
- ASSERT(io->ipsec_out_proc_begin == B_FALSE);
- goto done;
- }
- ASSERT(io->ipsec_out_need_policy == B_FALSE);
- ASSERT((io->ipsec_out_policy != NULL) ||
- (io->ipsec_out_act != NULL));
- ASSERT(io->ipsec_out_failed == B_FALSE);
+ ASSERT((ixa->ixa_ipsec_policy != NULL) ||
+ (ixa->ixa_ipsec_action != NULL));
ipss = ipst->ips_netstack->netstack_ipsec;
if (!ipsec_loaded(ipss)) {
- ipha = (ipha_t *)ipsec_mp->b_cont->b_rptr;
- if (IPH_HDR_VERSION(ipha) == IP_VERSION) {
- BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
- } else {
- BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
- }
- ip_drop_packet(ipsec_mp, B_FALSE, NULL, ire,
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
+ ip_drop_packet(mp, B_TRUE, ill,
DROPPER(ipss, ipds_ip_ipsec_not_loaded),
&ipss->ipsec_dropper);
- return;
+ return (ENOTSUP);
}
- /*
- * IPsec processing has started.
- */
- io->ipsec_out_proc_begin = B_TRUE;
- ap = io->ipsec_out_act;
+ ap = ixa->ixa_ipsec_action;
if (ap == NULL) {
- pp = io->ipsec_out_policy;
+ pp = ixa->ixa_ipsec_policy;
ASSERT(pp != NULL);
ap = pp->ipsp_act;
ASSERT(ap != NULL);
}
- /*
- * Save the outbound ill index. When the packet comes back
- * from IPsec, we make sure the ill hasn't changed or disappeared
- * before sending it the accelerated packet.
- */
- if ((ire != NULL) && (io->ipsec_out_capab_ill_index == 0)) {
- ill = ire_to_ill(ire);
- io->ipsec_out_capab_ill_index = ill->ill_phyint->phyint_ifindex;
+ /* Handle explicit drop action and bypass. */
+ switch (ap->ipa_act.ipa_type) {
+ case IPSEC_ACT_DISCARD:
+ case IPSEC_ACT_REJECT:
+ ip_drop_packet(mp, B_FALSE, ill,
+ DROPPER(ipss, ipds_spd_explicit), &ipss->ipsec_spd_dropper);
+ return (EHOSTUNREACH); /* IPsec policy failure */
+ case IPSEC_ACT_BYPASS:
+ return (ip_output_post_ipsec(mp, ixa));
}
/*
* The order of processing is first insert a IP header if needed.
* Then insert the ESP header and then the AH header.
*/
- if ((io->ipsec_out_se_done == B_FALSE) &&
- (ap->ipa_want_se)) {
+ if ((ixa->ixa_flags & IXAF_IS_IPV4) && ap->ipa_want_se) {
/*
* First get the outer IP header before sending
* it to ESP.
@@ -26600,19 +12472,16 @@ ipsec_out_process(queue_t *q, mblk_t *ipsec_mp, ire_t *ire, uint_t ill_index)
mblk_t *outer_mp, *inner_mp;
if ((outer_mp = allocb(sizeof (ipha_t), BPRI_HI)) == NULL) {
- (void) mi_strlog(q, 0, SL_ERROR|SL_TRACE|SL_CONSOLE,
+ (void) mi_strlog(ill->ill_rq, 0,
+ SL_ERROR|SL_TRACE|SL_CONSOLE,
"ipsec_out_process: "
"Self-Encapsulation failed: Out of memory\n");
- freemsg(ipsec_mp);
- if (ill != NULL) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
- } else {
- BUMP_MIB(&ipst->ips_ip_mib,
- ipIfStatsOutDiscards);
- }
- return;
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
+ ip_drop_output("ipIfStatsOutDiscards", mp, ill);
+ freemsg(mp);
+ return (ENOBUFS);
}
- inner_mp = ipsec_mp->b_cont;
+ inner_mp = mp;
ASSERT(inner_mp->b_datap->db_type == M_DATA);
oipha = (ipha_t *)outer_mp->b_rptr;
iipha = (ipha_t *)inner_mp->b_rptr;
@@ -26626,139 +12495,51 @@ ipsec_out_process(queue_t *q, mblk_t *ipsec_mp, ire_t *ire, uint_t ill_index)
oipha->ipha_hdr_checksum = 0;
oipha->ipha_hdr_checksum = ip_csum_hdr(oipha);
outer_mp->b_cont = inner_mp;
- ipsec_mp->b_cont = outer_mp;
+ mp = outer_mp;
- io->ipsec_out_se_done = B_TRUE;
- io->ipsec_out_tunnel = B_TRUE;
+ ixa->ixa_flags |= IXAF_IPSEC_TUNNEL;
}
- if (((ap->ipa_want_ah && (io->ipsec_out_ah_sa == NULL)) ||
- (ap->ipa_want_esp && (io->ipsec_out_esp_sa == NULL))) &&
- !ipsec_out_select_sa(ipsec_mp))
- return;
+ /* If we need to wait for a SA then we can't return any errno */
+ if (((ap->ipa_want_ah && (ixa->ixa_ipsec_ah_sa == NULL)) ||
+ (ap->ipa_want_esp && (ixa->ixa_ipsec_esp_sa == NULL))) &&
+ !ipsec_out_select_sa(mp, ixa))
+ return (0);
/*
* By now, we know what SA's to use. Toss over to ESP & AH
* to do the heavy lifting.
*/
- zoneid = io->ipsec_out_zoneid;
- ASSERT(zoneid != ALL_ZONES);
- if ((io->ipsec_out_esp_done == B_FALSE) && (ap->ipa_want_esp)) {
- ASSERT(io->ipsec_out_esp_sa != NULL);
- io->ipsec_out_esp_done = B_TRUE;
- /*
- * Note that since hw accel can only apply one transform,
- * not two, we skip hw accel for ESP if we also have AH
- * This is an design limitation of the interface
- * which should be revisited.
- */
- ASSERT(ire != NULL);
- if (io->ipsec_out_ah_sa == NULL) {
- ill = (ill_t *)ire->ire_stq->q_ptr;
- ipsec_out_is_accelerated(ipsec_mp,
- io->ipsec_out_esp_sa, ill, ire);
- }
+ if (ap->ipa_want_esp) {
+ ASSERT(ixa->ixa_ipsec_esp_sa != NULL);
- ipsec_rc = io->ipsec_out_esp_sa->ipsa_output_func(ipsec_mp);
- switch (ipsec_rc) {
- case IPSEC_STATUS_SUCCESS:
- break;
- case IPSEC_STATUS_FAILED:
- if (ill != NULL) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
- } else {
- BUMP_MIB(&ipst->ips_ip_mib,
- ipIfStatsOutDiscards);
- }
- /* FALLTHRU */
- case IPSEC_STATUS_PENDING:
- return;
+ mp = ixa->ixa_ipsec_esp_sa->ipsa_output_func(mp, ixa);
+ if (mp == NULL) {
+ /*
+			 * Either it failed or is pending. In the former case
+			 * ipIfStatsOutDiscards was increased.
+ */
+ return (0);
}
}
- if ((io->ipsec_out_ah_done == B_FALSE) && (ap->ipa_want_ah)) {
- ASSERT(io->ipsec_out_ah_sa != NULL);
- io->ipsec_out_ah_done = B_TRUE;
- if (ire == NULL) {
- int idx = io->ipsec_out_capab_ill_index;
- ill = ill_lookup_on_ifindex(idx, B_FALSE,
- NULL, NULL, NULL, NULL, ipst);
- ill_need_rele = B_TRUE;
- } else {
- ill = (ill_t *)ire->ire_stq->q_ptr;
- }
- ipsec_out_is_accelerated(ipsec_mp, io->ipsec_out_ah_sa, ill,
- ire);
+ if (ap->ipa_want_ah) {
+ ASSERT(ixa->ixa_ipsec_ah_sa != NULL);
- ipsec_rc = io->ipsec_out_ah_sa->ipsa_output_func(ipsec_mp);
- switch (ipsec_rc) {
- case IPSEC_STATUS_SUCCESS:
- break;
- case IPSEC_STATUS_FAILED:
- if (ill != NULL) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
- } else {
- BUMP_MIB(&ipst->ips_ip_mib,
- ipIfStatsOutDiscards);
- }
- /* FALLTHRU */
- case IPSEC_STATUS_PENDING:
- if (ill != NULL && ill_need_rele)
- ill_refrele(ill);
- return;
+ mp = ixa->ixa_ipsec_ah_sa->ipsa_output_func(mp, ixa);
+ if (mp == NULL) {
+ /*
+			 * Either it failed or is pending. In the former case
+			 * ipIfStatsOutDiscards was increased.
+ */
+ return (0);
}
}
/*
- * We are done with IPsec processing. Send it over the wire.
- */
-done:
- mp = ipsec_mp->b_cont;
- ipha = (ipha_t *)mp->b_rptr;
- if (IPH_HDR_VERSION(ipha) == IP_VERSION) {
- ip_wput_ipsec_out(q, ipsec_mp, ipha, ire->ire_ipif->ipif_ill,
- ire);
- } else {
- ip6h = (ip6_t *)ipha;
- ip_wput_ipsec_out_v6(q, ipsec_mp, ip6h, ire->ire_ipif->ipif_ill,
- ire);
- }
- if (ill != NULL && ill_need_rele)
- ill_refrele(ill);
-}
-
-/* ARGSUSED */
-void
-ip_restart_optmgmt(ipsq_t *dummy_sq, queue_t *q, mblk_t *first_mp, void *dummy)
-{
- opt_restart_t *or;
- int err;
- conn_t *connp;
- cred_t *cr;
-
- ASSERT(CONN_Q(q));
- connp = Q_TO_CONN(q);
-
- ASSERT(first_mp->b_datap->db_type == M_CTL);
- or = (opt_restart_t *)first_mp->b_rptr;
- /*
- * We checked for a db_credp the first time svr4_optcom_req
- * was called (from ip_wput_nondata). So we can just ASSERT here.
+ * We are done with IPsec processing. Send it over
+ * the wire.
*/
- cr = msg_getcred(first_mp, NULL);
- ASSERT(cr != NULL);
-
- if (or->or_type == T_SVR4_OPTMGMT_REQ) {
- err = svr4_optcom_req(q, first_mp, cr,
- &ip_opt_obj, B_FALSE);
- } else {
- ASSERT(or->or_type == T_OPTMGMT_REQ);
- err = tpi_optcom_req(q, first_mp, cr,
- &ip_opt_obj, B_FALSE);
- }
- if (err != EINPROGRESS) {
- /* operation is done */
- CONN_OPER_PENDING_DONE(connp);
- }
+ return (ip_output_post_ipsec(mp, ixa));
}
/*
@@ -26811,6 +12592,11 @@ ip_reprocess_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
err = (*ipip->ipi_func_restart)(ipsq->ipsq_xop->ipx_current_ipif, sin,
q, mp, ipip, mp1->b_rptr);
+ DTRACE_PROBE4(ipif__ioctl, char *, "ip_reprocess_ioctl finish",
+ int, ipip->ipi_cmd,
+ ill_t *, ipsq->ipsq_xop->ipx_current_ipif->ipif_ill,
+ ipif_t *, ipsq->ipsq_xop->ipx_current_ipif);
+
ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), ipsq);
}
@@ -26865,12 +12651,16 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
*/
if (ipip->ipi_cmd == SIOCLIFADDIF) {
err = ip_sioctl_addif(NULL, NULL, q, mp, NULL, NULL);
+ DTRACE_PROBE4(ipif__ioctl, char *, "ip_process_ioctl finish",
+ int, ipip->ipi_cmd, ill_t *, NULL, ipif_t *, NULL);
ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), NULL);
return;
}
ci.ci_ipif = NULL;
- if (ipip->ipi_cmd_type == MISC_CMD) {
+ switch (ipip->ipi_cmd_type) {
+ case MISC_CMD:
+ case MSFILT_CMD:
/*
* All MISC_CMD ioctls come in here -- e.g. SIOCGLIFCONF.
*/
@@ -26883,28 +12673,29 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
ci.ci_sin = NULL;
ci.ci_sin6 = NULL;
ci.ci_lifr = NULL;
- } else {
- switch (ipip->ipi_cmd_type) {
- case IF_CMD:
- case LIF_CMD:
- extract_funcp = ip_extract_lifreq;
- break;
+ extract_funcp = NULL;
+ break;
- case ARP_CMD:
- case XARP_CMD:
- extract_funcp = ip_extract_arpreq;
- break;
+ case IF_CMD:
+ case LIF_CMD:
+ extract_funcp = ip_extract_lifreq;
+ break;
- case MSFILT_CMD:
- extract_funcp = ip_extract_msfilter;
- break;
+ case ARP_CMD:
+ case XARP_CMD:
+ extract_funcp = ip_extract_arpreq;
+ break;
- default:
- ASSERT(0);
- }
+ default:
+ ASSERT(0);
+ }
- err = (*extract_funcp)(q, mp, ipip, &ci, ip_process_ioctl);
+ if (extract_funcp != NULL) {
+ err = (*extract_funcp)(q, mp, ipip, &ci);
if (err != 0) {
+ DTRACE_PROBE4(ipif__ioctl,
+ char *, "ip_process_ioctl finish err",
+ int, ipip->ipi_cmd, ill_t *, NULL, ipif_t *, NULL);
ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), NULL);
return;
}
@@ -26923,8 +12714,17 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
*/
err = (*ipip->ipi_func)(ci.ci_ipif, ci.ci_sin, q, mp, ipip,
ci.ci_lifr);
- if (ci.ci_ipif != NULL)
+ if (ci.ci_ipif != NULL) {
+ DTRACE_PROBE4(ipif__ioctl,
+ char *, "ip_process_ioctl finish RD",
+ int, ipip->ipi_cmd, ill_t *, ci.ci_ipif->ipif_ill,
+ ipif_t *, ci.ci_ipif);
ipif_refrele(ci.ci_ipif);
+ } else {
+ DTRACE_PROBE4(ipif__ioctl,
+ char *, "ip_process_ioctl finish RD",
+ int, ipip->ipi_cmd, ill_t *, NULL, ipif_t *, NULL);
+ }
ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), NULL);
return;
}
@@ -26932,7 +12732,7 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
ASSERT(ci.ci_ipif != NULL);
/*
- * If ipsq is non-NULL, we are already being called exclusively.
+ * If ipsq is non-NULL, we are already being called exclusively
*/
ASSERT(ipsq == NULL || IAM_WRITER_IPSQ(ipsq));
if (ipsq == NULL) {
@@ -26944,7 +12744,6 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
}
entered_ipsq = B_TRUE;
}
-
/*
* Release the ipif so that ipif_down and friends that wait for
* references to go away are not misled about the current ipif_refcnt
@@ -26962,6 +12761,10 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
*/
err = (*ipip->ipi_func)(ci.ci_ipif, ci.ci_sin, q, mp, ipip, ci.ci_lifr);
+ DTRACE_PROBE4(ipif__ioctl, char *, "ip_process_ioctl finish WR",
+ int, ipip->ipi_cmd,
+ ill_t *, ci.ci_ipif == NULL ? NULL : ci.ci_ipif->ipif_ill,
+ ipif_t *, ci.ci_ipif);
ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), ipsq);
if (entered_ipsq)
@@ -27012,31 +12815,21 @@ ip_ioctl_finish(queue_t *q, mblk_t *mp, int err, int mode, ipsq_t *ipsq)
ipsq_current_finish(ipsq);
}
-/* Called from ip_wput for all non data messages */
-/* ARGSUSED */
+/* Handles all non data messages */
void
-ip_wput_nondata(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
+ip_wput_nondata(queue_t *q, mblk_t *mp)
{
mblk_t *mp1;
- ire_t *ire, *fake_ire;
- ill_t *ill;
struct iocblk *iocp;
ip_ioctl_cmd_t *ipip;
- cred_t *cr;
conn_t *connp;
- int err;
- nce_t *nce;
- ipif_t *ipif;
- ip_stack_t *ipst;
+ cred_t *cr;
char *proto_str;
- if (CONN_Q(q)) {
+ if (CONN_Q(q))
connp = Q_TO_CONN(q);
- ipst = connp->conn_netstack->netstack_ip;
- } else {
+ else
connp = NULL;
- ipst = ILLQ_TO_IPST(q);
- }
switch (DB_TYPE(mp)) {
case M_IOCTL:
@@ -27064,17 +12857,10 @@ ip_wput_nondata(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
}
if ((q->q_next != NULL) && !(ipip->ipi_flags & IPI_MODOK)) {
/*
- * the ioctl is one we recognise, but is not
- * consumed by IP as a module, pass M_IOCDATA
- * for processing downstream, but only for
- * common Streams ioctls.
+ * The ioctl is one we recognise, but is not consumed
+ * by IP as a module and we are a module, so we drop
*/
- if (ipip->ipi_flags & IPI_PASS_DOWN) {
- putnext(q, mp);
- return;
- } else {
- goto nak;
- }
+ goto nak;
}
/* IOCTL continuation following copyin or copyout. */
@@ -27110,8 +12896,8 @@ ip_wput_nondata(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
/*
* Refhold the conn, till the ioctl completes. This is
* needed in case the ioctl ends up in the pending mp
- * list. Every mp in the ill_pending_mp list and
- * the ipx_pending_mp must have a refhold on the conn
+ * list. Every mp in the ipx_pending_mp list
+ * must have a refhold on the conn
* to resume processing. The refhold is released when
* the ioctl completes. (normally or abnormally)
* In all cases ip_ioctl_finish is called to finish
@@ -27119,7 +12905,6 @@ ip_wput_nondata(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
*/
if (connp != NULL) {
/* This is not a reentry */
- ASSERT(ipsq == NULL);
CONN_INC_REF(connp);
} else {
if (!(ipip->ipi_flags & IPI_MODOK)) {
@@ -27128,18 +12913,12 @@ ip_wput_nondata(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
}
}
- ip_process_ioctl(ipsq, q, mp, ipip);
+ ip_process_ioctl(NULL, q, mp, ipip);
} else {
mi_copyout(q, mp);
}
return;
-nak:
- iocp->ioc_error = EINVAL;
- mp->b_datap->db_type = M_IOCNAK;
- iocp->ioc_count = 0;
- qreply(q, mp);
- return;
case M_IOCNAK:
/*
@@ -27147,35 +12926,13 @@ nak:
* an IOCTL we sent it. This shouldn't happen.
*/
(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
- "ip_wput: unexpected M_IOCNAK, ioc_cmd 0x%x",
+ "ip_wput_nondata: unexpected M_IOCNAK, ioc_cmd 0x%x",
((struct iocblk *)mp->b_rptr)->ioc_cmd);
freemsg(mp);
return;
case M_IOCACK:
/* /dev/ip shouldn't see this */
- if (CONN_Q(q))
- goto nak;
-
- /*
- * Finish socket ioctls passed through to ARP. We use the
- * ioc_cmd values we set in ip_sioctl_arp() to decide whether
- * we need to become writer before calling ip_sioctl_iocack().
- * Note that qwriter_ip() will release the refhold, and that a
- * refhold is OK without ILL_CAN_LOOKUP() since we're on the
- * ill stream.
- */
- iocp = (struct iocblk *)mp->b_rptr;
- if (iocp->ioc_cmd == AR_ENTRY_SQUERY) {
- ip_sioctl_iocack(NULL, q, mp, NULL);
- return;
- }
-
- ASSERT(iocp->ioc_cmd == AR_ENTRY_DELETE ||
- iocp->ioc_cmd == AR_ENTRY_ADD);
- ill = q->q_ptr;
- ill_refhold(ill);
- qwriter_ip(ill, q, mp, ip_sioctl_iocack, CUR_OP, B_FALSE);
- return;
+ goto nak;
case M_FLUSH:
if (*mp->b_rptr & FLUSHW)
flushq(q, FLUSHALL);
@@ -27190,117 +12947,17 @@ nak:
}
freemsg(mp);
return;
- case IRE_DB_REQ_TYPE:
- if (connp == NULL) {
- proto_str = "IRE_DB_REQ_TYPE";
- goto protonak;
- }
- /* An Upper Level Protocol wants a copy of an IRE. */
- ip_ire_req(q, mp);
- return;
case M_CTL:
- if (mp->b_wptr - mp->b_rptr < sizeof (uint32_t))
- break;
-
- /* M_CTL messages are used by ARP to tell us things. */
- if ((mp->b_wptr - mp->b_rptr) < sizeof (arc_t))
- break;
- switch (((arc_t *)mp->b_rptr)->arc_cmd) {
- case AR_ENTRY_SQUERY:
- putnext(q, mp);
- return;
- case AR_CLIENT_NOTIFY:
- ip_arp_news(q, mp);
- return;
- case AR_DLPIOP_DONE:
- ASSERT(q->q_next != NULL);
- ill = (ill_t *)q->q_ptr;
- /* qwriter_ip releases the refhold */
- /* refhold on ill stream is ok without ILL_CAN_LOOKUP */
- ill_refhold(ill);
- qwriter_ip(ill, q, mp, ip_arp_done, CUR_OP, B_FALSE);
- return;
- case AR_ARP_CLOSING:
- /*
- * ARP (above us) is closing. If no ARP bringup is
- * currently pending, ack the message so that ARP
- * can complete its close. Also mark ill_arp_closing
- * so that new ARP bringups will fail. If any
- * ARP bringup is currently in progress, we will
- * ack this when the current ARP bringup completes.
- */
- ASSERT(q->q_next != NULL);
- ill = (ill_t *)q->q_ptr;
- mutex_enter(&ill->ill_lock);
- ill->ill_arp_closing = 1;
- if (!ill->ill_arp_bringup_pending) {
- mutex_exit(&ill->ill_lock);
- qreply(q, mp);
- } else {
- mutex_exit(&ill->ill_lock);
- freemsg(mp);
- }
- return;
- case AR_ARP_EXTEND:
- /*
- * The ARP module above us is capable of duplicate
- * address detection. Old ATM drivers will not send
- * this message.
- */
- ASSERT(q->q_next != NULL);
- ill = (ill_t *)q->q_ptr;
- ill->ill_arp_extend = B_TRUE;
- freemsg(mp);
- return;
- default:
- break;
- }
break;
case M_PROTO:
case M_PCPROTO:
/*
- * The only PROTO messages we expect are copies of option
- * negotiation acknowledgements, AH and ESP bind requests
- * are also expected.
+ * The only PROTO messages we expect are SNMP-related.
*/
switch (((union T_primitives *)mp->b_rptr)->type) {
- case O_T_BIND_REQ:
- case T_BIND_REQ: {
- /* Request can get queued in bind */
- if (connp == NULL) {
- proto_str = "O_T_BIND_REQ/T_BIND_REQ";
- goto protonak;
- }
- /*
- * The transports except SCTP call ip_bind_{v4,v6}()
- * directly instead of a a putnext. SCTP doesn't
- * generate any T_BIND_REQ since it has its own
- * fanout data structures. However, ESP and AH
- * come in for regular binds; all other cases are
- * bind retries.
- */
- ASSERT(!IPCL_IS_SCTP(connp));
-
- /* Don't increment refcnt if this is a re-entry */
- if (ipsq == NULL)
- CONN_INC_REF(connp);
-
- mp = connp->conn_af_isv6 ? ip_bind_v6(q, mp,
- connp, NULL) : ip_bind_v4(q, mp, connp);
- ASSERT(mp != NULL);
-
- ASSERT(!IPCL_IS_TCP(connp));
- ASSERT(!IPCL_IS_UDP(connp));
- ASSERT(!IPCL_IS_RAWIP(connp));
- ASSERT(!IPCL_IS_IPTUN(connp));
-
- /* The case of AH and ESP */
- qreply(q, mp);
- CONN_OPER_PENDING_DONE(connp);
- return;
- }
case T_SVR4_OPTMGMT_REQ:
- ip2dbg(("ip_wput: T_SVR4_OPTMGMT_REQ flags %x\n",
+ ip2dbg(("ip_wput_nondata: T_SVR4_OPTMGMT_REQ "
+ "flags %x\n",
((struct T_optmgmt_req *)mp->b_rptr)->MGMT_flags));
if (connp == NULL) {
@@ -27324,460 +12981,17 @@ nak:
return;
}
- if (!snmpcom_req(q, mp, ip_snmp_set,
- ip_snmp_get, cr)) {
- /*
- * Call svr4_optcom_req so that it can
- * generate the ack. We don't come here
- * if this operation is being restarted.
- * ip_restart_optmgmt will drop the conn ref.
- * In the case of ipsec option after the ipsec
- * load is complete conn_restart_ipsec_waiter
- * drops the conn ref.
- */
- ASSERT(ipsq == NULL);
- CONN_INC_REF(connp);
- if (ip_check_for_ipsec_opt(q, mp))
- return;
- err = svr4_optcom_req(q, mp, cr, &ip_opt_obj,
- B_FALSE);
- if (err != EINPROGRESS) {
- /* Operation is done */
- CONN_OPER_PENDING_DONE(connp);
- }
- }
- return;
- case T_OPTMGMT_REQ:
- ip2dbg(("ip_wput: T_OPTMGMT_REQ\n"));
- /*
- * Note: No snmpcom_req support through new
- * T_OPTMGMT_REQ.
- * Call tpi_optcom_req so that it can
- * generate the ack.
- */
- if (connp == NULL) {
- proto_str = "T_OPTMGMT_REQ";
- goto protonak;
- }
-
- /*
- * All Solaris components should pass a db_credp
- * for this TPI message, hence we ASSERT.
- * But in case there is some other M_PROTO that looks
- * like a TPI message sent by some other kernel
- * component, we check and return an error.
- */
- cr = msg_getcred(mp, NULL);
- ASSERT(cr != NULL);
- if (cr == NULL) {
- mp = mi_tpi_err_ack_alloc(mp, TSYSERR, EINVAL);
- if (mp != NULL)
- qreply(q, mp);
- return;
- }
- ASSERT(ipsq == NULL);
- /*
- * We don't come here for restart. ip_restart_optmgmt
- * will drop the conn ref. In the case of ipsec option
- * after the ipsec load is complete
- * conn_restart_ipsec_waiter drops the conn ref.
- */
- CONN_INC_REF(connp);
- if (ip_check_for_ipsec_opt(q, mp))
- return;
- err = tpi_optcom_req(q, mp, cr, &ip_opt_obj, B_FALSE);
- if (err != EINPROGRESS) {
- /* Operation is done */
- CONN_OPER_PENDING_DONE(connp);
- }
- return;
- case T_UNBIND_REQ:
- if (connp == NULL) {
- proto_str = "T_UNBIND_REQ";
+ if (!snmpcom_req(q, mp, ip_snmp_set, ip_snmp_get, cr)) {
+ proto_str = "Bad SNMPCOM request?";
goto protonak;
}
- ip_unbind(Q_TO_CONN(q));
- mp = mi_tpi_ok_ack_alloc(mp);
- qreply(q, mp);
return;
default:
- /*
- * Have to drop any DLPI messages coming down from
- * arp (such as an info_req which would cause ip
- * to receive an extra info_ack if it was passed
- * through.
- */
- ip1dbg(("ip_wput_nondata: dropping M_PROTO %d\n",
+ ip1dbg(("ip_wput_nondata: dropping M_PROTO prim %u\n",
(int)*(uint_t *)mp->b_rptr));
freemsg(mp);
return;
}
- /* NOTREACHED */
- case IRE_DB_TYPE: {
- nce_t *nce;
- ill_t *ill;
- in6_addr_t gw_addr_v6;
-
- /*
- * This is a response back from a resolver. It
- * consists of a message chain containing:
- * IRE_MBLK-->LL_HDR_MBLK->pkt
- * The IRE_MBLK is the one we allocated in ip_newroute.
- * The LL_HDR_MBLK is the DLPI header to use to get
- * the attached packet, and subsequent ones for the
- * same destination, transmitted.
- */
- if ((mp->b_wptr - mp->b_rptr) != sizeof (ire_t)) /* ire */
- break;
- /*
- * First, check to make sure the resolution succeeded.
- * If it failed, the second mblk will be empty.
- * If it is, free the chain, dropping the packet.
- * (We must ire_delete the ire; that frees the ire mblk)
- * We're doing this now to support PVCs for ATM; it's
- * a partial xresolv implementation. When we fully implement
- * xresolv interfaces, instead of freeing everything here
- * we'll initiate neighbor discovery.
- *
- * For v4 (ARP and other external resolvers) the resolver
- * frees the message, so no check is needed. This check
- * is required, though, for a full xresolve implementation.
- * Including this code here now both shows how external
- * resolvers can NACK a resolution request using an
- * existing design that has no specific provisions for NACKs,
- * and also takes into account that the current non-ARP
- * external resolver has been coded to use this method of
- * NACKing for all IPv6 (xresolv) cases,
- * whether our xresolv implementation is complete or not.
- *
- */
- ire = (ire_t *)mp->b_rptr;
- ill = ire_to_ill(ire);
- mp1 = mp->b_cont; /* dl_unitdata_req */
- if (mp1->b_rptr == mp1->b_wptr) {
- if (ire->ire_ipversion == IPV6_VERSION) {
- /*
- * XRESOLV interface.
- */
- ASSERT(ill->ill_flags & ILLF_XRESOLV);
- mutex_enter(&ire->ire_lock);
- gw_addr_v6 = ire->ire_gateway_addr_v6;
- mutex_exit(&ire->ire_lock);
- if (IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6)) {
- nce = ndp_lookup_v6(ill, B_FALSE,
- &ire->ire_addr_v6, B_FALSE);
- } else {
- nce = ndp_lookup_v6(ill, B_FALSE,
- &gw_addr_v6, B_FALSE);
- }
- if (nce != NULL) {
- nce_resolv_failed(nce);
- ndp_delete(nce);
- NCE_REFRELE(nce);
- }
- }
- mp->b_cont = NULL;
- freemsg(mp1); /* frees the pkt as well */
- ASSERT(ire->ire_nce == NULL);
- ire_delete((ire_t *)mp->b_rptr);
- return;
- }
-
- /*
- * Split them into IRE_MBLK and pkt and feed it into
- * ire_add_then_send. Then in ire_add_then_send
- * the IRE will be added, and then the packet will be
- * run back through ip_wput. This time it will make
- * it to the wire.
- */
- mp->b_cont = NULL;
- mp = mp1->b_cont; /* now, mp points to pkt */
- mp1->b_cont = NULL;
- ip1dbg(("ip_wput_nondata: reply from external resolver \n"));
- if (ire->ire_ipversion == IPV6_VERSION) {
- /*
- * XRESOLV interface. Find the nce and put a copy
- * of the dl_unitdata_req in nce_res_mp
- */
- ASSERT(ill->ill_flags & ILLF_XRESOLV);
- mutex_enter(&ire->ire_lock);
- gw_addr_v6 = ire->ire_gateway_addr_v6;
- mutex_exit(&ire->ire_lock);
- if (IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6)) {
- nce = ndp_lookup_v6(ill, B_FALSE,
- &ire->ire_addr_v6, B_FALSE);
- } else {
- nce = ndp_lookup_v6(ill, B_FALSE,
- &gw_addr_v6, B_FALSE);
- }
- if (nce != NULL) {
- /*
- * We have to protect nce_res_mp here
- * from being accessed by other threads
- * while we change the mblk pointer.
- * Other functions will also lock the nce when
- * accessing nce_res_mp.
- *
- * The reason we change the mblk pointer
- * here rather than copying the resolved address
- * into the template is that, unlike with
- * ethernet, we have no guarantee that the
- * resolved address length will be
- * smaller than or equal to the lla length
- * with which the template was allocated,
- * (for ethernet, they're equal)
- * so we have to use the actual resolved
- * address mblk - which holds the real
- * dl_unitdata_req with the resolved address.
- *
- * Doing this is the same behavior as was
- * previously used in the v4 ARP case.
- */
- mutex_enter(&nce->nce_lock);
- if (nce->nce_res_mp != NULL)
- freemsg(nce->nce_res_mp);
- nce->nce_res_mp = mp1;
- mutex_exit(&nce->nce_lock);
- /*
- * We do a fastpath probe here because
- * we have resolved the address without
- * using Neighbor Discovery.
- * In the non-XRESOLV v6 case, the fastpath
- * probe is done right after neighbor
- * discovery completes.
- */
- if (nce->nce_res_mp != NULL) {
- int res;
- nce_fastpath_list_add(nce);
- res = ill_fastpath_probe(ill,
- nce->nce_res_mp);
- if (res != 0 && res != EAGAIN)
- nce_fastpath_list_delete(nce);
- }
-
- ire_add_then_send(q, ire, mp);
- /*
- * Now we have to clean out any packets
- * that may have been queued on the nce
- * while it was waiting for address resolution
- * to complete.
- */
- mutex_enter(&nce->nce_lock);
- mp1 = nce->nce_qd_mp;
- nce->nce_qd_mp = NULL;
- mutex_exit(&nce->nce_lock);
- while (mp1 != NULL) {
- mblk_t *nxt_mp;
- queue_t *fwdq = NULL;
- ill_t *inbound_ill;
- uint_t ifindex;
-
- nxt_mp = mp1->b_next;
- mp1->b_next = NULL;
- /*
- * Retrieve ifindex stored in
- * ip_rput_data_v6()
- */
- ifindex =
- (uint_t)(uintptr_t)mp1->b_prev;
- inbound_ill =
- ill_lookup_on_ifindex(ifindex,
- B_TRUE, NULL, NULL, NULL,
- NULL, ipst);
- mp1->b_prev = NULL;
- if (inbound_ill != NULL)
- fwdq = inbound_ill->ill_rq;
-
- if (fwdq != NULL) {
- put(fwdq, mp1);
- ill_refrele(inbound_ill);
- } else
- put(WR(ill->ill_rq), mp1);
- mp1 = nxt_mp;
- }
- NCE_REFRELE(nce);
- } else { /* nce is NULL; clean up */
- ire_delete(ire);
- freemsg(mp);
- freemsg(mp1);
- return;
- }
- } else {
- nce_t *arpce;
- /*
- * Link layer resolution succeeded. Recompute the
- * ire_nce.
- */
- ASSERT(ire->ire_type & (IRE_CACHE|IRE_BROADCAST));
- if ((arpce = ndp_lookup_v4(ill,
- (ire->ire_gateway_addr != INADDR_ANY ?
- &ire->ire_gateway_addr : &ire->ire_addr),
- B_FALSE)) == NULL) {
- freeb(ire->ire_mp);
- freeb(mp1);
- freemsg(mp);
- return;
- }
- mutex_enter(&arpce->nce_lock);
- arpce->nce_last = TICK_TO_MSEC(lbolt64);
- if (arpce->nce_state == ND_REACHABLE) {
- /*
- * Someone resolved this before us;
- * cleanup the res_mp. Since ire has
- * not been added yet, the call to ire_add_v4
- * from ire_add_then_send (when a dup is
- * detected) will clean up the ire.
- */
- freeb(mp1);
- } else {
- ASSERT(arpce->nce_res_mp == NULL);
- arpce->nce_res_mp = mp1;
- arpce->nce_state = ND_REACHABLE;
- }
- mutex_exit(&arpce->nce_lock);
- if (ire->ire_marks & IRE_MARK_NOADD) {
- /*
- * this ire will not be added to the ire
- * cache table, so we can set the ire_nce
- * here, as there are no atomicity constraints.
- */
- ire->ire_nce = arpce;
- /*
- * We are associating this nce with the ire
- * so change the nce ref taken in
- * ndp_lookup_v4() from
- * NCE_REFHOLD to NCE_REFHOLD_NOTR
- */
- NCE_REFHOLD_TO_REFHOLD_NOTR(ire->ire_nce);
- } else {
- NCE_REFRELE(arpce);
- }
- ire_add_then_send(q, ire, mp);
- }
- return; /* All is well, the packet has been sent. */
- }
- case IRE_ARPRESOLVE_TYPE: {
-
- if ((mp->b_wptr - mp->b_rptr) != sizeof (ire_t)) /* fake_ire */
- break;
- mp1 = mp->b_cont; /* dl_unitdata_req */
- mp->b_cont = NULL;
- /*
- * First, check to make sure the resolution succeeded.
- * If it failed, the second mblk will be empty.
- */
- if (mp1->b_rptr == mp1->b_wptr) {
- /* cleanup the incomplete ire, free queued packets */
- freemsg(mp); /* fake ire */
- freeb(mp1); /* dl_unitdata response */
- return;
- }
-
- /*
- * Update any incomplete nce_t found. We search the ctable
- * and find the nce from the ire->ire_nce because we need
- * to pass the ire to ip_xmit_v4 later, and can find both
- * ire and nce in one lookup.
- */
- fake_ire = (ire_t *)mp->b_rptr;
-
- /*
- * By the time we come back here from ARP the logical outgoing
- * interface of the incomplete ire we added in ire_forward()
- * could have disappeared, causing the incomplete ire to also
- * disappear. So we need to retreive the proper ipif for the
- * ire before looking in ctable. In the case of IPMP, the
- * ipif may be on the IPMP ill, so look it up based on the
- * ire_ipif_ifindex we stashed back in ire_init_common().
- * Then, we can verify that ire_ipif_seqid still exists.
- */
- ill = ill_lookup_on_ifindex(fake_ire->ire_ipif_ifindex, B_FALSE,
- NULL, NULL, NULL, NULL, ipst);
- if (ill == NULL) {
- ip1dbg(("ill for incomplete ire vanished\n"));
- freemsg(mp); /* fake ire */
- freeb(mp1); /* dl_unitdata response */
- return;
- }
-
- /* Get the outgoing ipif */
- mutex_enter(&ill->ill_lock);
- ipif = ipif_lookup_seqid(ill, fake_ire->ire_ipif_seqid);
- if (ipif == NULL) {
- mutex_exit(&ill->ill_lock);
- ill_refrele(ill);
- ip1dbg(("logical intrf to incomplete ire vanished\n"));
- freemsg(mp); /* fake_ire */
- freeb(mp1); /* dl_unitdata response */
- return;
- }
-
- ipif_refhold_locked(ipif);
- mutex_exit(&ill->ill_lock);
- ill_refrele(ill);
- ire = ire_arpresolve_lookup(fake_ire->ire_addr,
- fake_ire->ire_gateway_addr, ipif, fake_ire->ire_zoneid,
- ipst, ((ill_t *)q->q_ptr)->ill_wq);
- ipif_refrele(ipif);
- if (ire == NULL) {
- /*
- * no ire was found; check if there is an nce
- * for this lookup; if it has no ire's pointing at it
- * cleanup.
- */
- if ((nce = ndp_lookup_v4(q->q_ptr,
- (fake_ire->ire_gateway_addr != INADDR_ANY ?
- &fake_ire->ire_gateway_addr : &fake_ire->ire_addr),
- B_FALSE)) != NULL) {
- /*
- * cleanup:
- * We check for refcnt 2 (one for the nce
- * hash list + 1 for the ref taken by
- * ndp_lookup_v4) to check that there are
- * no ire's pointing at the nce.
- */
- if (nce->nce_refcnt == 2)
- ndp_delete(nce);
- NCE_REFRELE(nce);
- }
- freeb(mp1); /* dl_unitdata response */
- freemsg(mp); /* fake ire */
- return;
- }
-
- nce = ire->ire_nce;
- DTRACE_PROBE2(ire__arpresolve__type,
- ire_t *, ire, nce_t *, nce);
- mutex_enter(&nce->nce_lock);
- nce->nce_last = TICK_TO_MSEC(lbolt64);
- if (nce->nce_state == ND_REACHABLE) {
- /*
- * Someone resolved this before us;
- * our response is not needed any more.
- */
- mutex_exit(&nce->nce_lock);
- freeb(mp1); /* dl_unitdata response */
- } else {
- ASSERT(nce->nce_res_mp == NULL);
- nce->nce_res_mp = mp1;
- nce->nce_state = ND_REACHABLE;
- mutex_exit(&nce->nce_lock);
- nce_fastpath(nce);
- }
- /*
- * The cached nce_t has been updated to be reachable;
- * Clear the IRE_MARK_UNCACHED flag and free the fake_ire.
- */
- fake_ire->ire_marks &= ~IRE_MARK_UNCACHED;
- freemsg(mp);
- /*
- * send out queued packets.
- */
- (void) ip_xmit_v4(NULL, ire, NULL, B_FALSE, NULL);
-
- IRE_REFRELE(ire);
- return;
- }
default:
break;
}
@@ -27787,6 +13001,13 @@ nak:
freemsg(mp);
return;
+nak:
+ iocp->ioc_error = EINVAL;
+ mp->b_datap->db_type = M_IOCNAK;
+ iocp->ioc_count = 0;
+ qreply(q, mp);
+ return;
+
protonak:
cmn_err(CE_NOTE, "IP doesn't process %s as a module", proto_str);
if ((mp = mi_tpi_err_ack_alloc(mp, TPROTO, EINVAL)) != NULL)
@@ -27794,14 +13015,15 @@ protonak:
}
/*
- * Process IP options in an outbound packet. Modify the destination if there
- * is a source route option.
+ * Process IP options in an outbound packet. Verify that the nexthop in a
+ * strict source route is onlink.
* Returns non-zero if something fails in which case an ICMP error has been
* sent and mp freed.
+ *
+ * Assumes the ULP has called ip_massage_options to move nexthop into ipha_dst.
*/
-static int
-ip_wput_options(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha,
- boolean_t mctl_present, zoneid_t zoneid, ip_stack_t *ipst)
+int
+ip_output_options(mblk_t *mp, ipha_t *ipha, ip_xmit_attr_t *ixa, ill_t *ill)
{
ipoptp_t opts;
uchar_t *opt;
@@ -27809,14 +13031,11 @@ ip_wput_options(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha,
uint8_t optlen;
ipaddr_t dst;
intptr_t code = 0;
- mblk_t *mp;
- ire_t *ire = NULL;
+ ire_t *ire;
+ ip_stack_t *ipst = ixa->ixa_ipst;
+ ip_recv_attr_t iras;
- ip2dbg(("ip_wput_options\n"));
- mp = ipsec_mp;
- if (mctl_present) {
- mp = ipsec_mp->b_cont;
- }
+ ip2dbg(("ip_output_options\n"));
dst = ipha->ipha_dst;
for (optval = ipoptp_first(&opts, ipha);
@@ -27824,7 +13043,7 @@ ip_wput_options(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha,
optval = ipoptp_next(&opts)) {
opt = opts.ipoptp_cur;
optlen = opts.ipoptp_len;
- ip2dbg(("ip_wput_options: opt %d, len %d\n",
+ ip2dbg(("ip_output_options: opt %d, len %d\n",
optval, optlen));
switch (optval) {
uint32_t off;
@@ -27832,25 +13051,25 @@ ip_wput_options(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha,
case IPOPT_LSRR:
if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
ip1dbg((
- "ip_wput_options: bad option offset\n"));
+ "ip_output_options: bad option offset\n"));
code = (char *)&opt[IPOPT_OLEN] -
(char *)ipha;
goto param_prob;
}
off = opt[IPOPT_OFFSET];
- ip1dbg(("ip_wput_options: next hop 0x%x\n",
+ ip1dbg(("ip_output_options: next hop 0x%x\n",
ntohl(dst)));
/*
* For strict: verify that dst is directly
* reachable.
*/
if (optval == IPOPT_SSRR) {
- ire = ire_ftable_lookup(dst, 0, 0,
- IRE_INTERFACE, NULL, NULL, ALL_ZONES, 0,
- msg_getlabel(mp),
- MATCH_IRE_TYPE | MATCH_IRE_SECATTR, ipst);
+ ire = ire_ftable_lookup_v4(dst, 0, 0,
+ IRE_IF_ALL, NULL, ALL_ZONES, ixa->ixa_tsl,
+ MATCH_IRE_TYPE | MATCH_IRE_SECATTR, 0, ipst,
+ NULL);
if (ire == NULL) {
- ip1dbg(("ip_wput_options: SSRR not"
+ ip1dbg(("ip_output_options: SSRR not"
" directly reachable: 0x%x\n",
ntohl(dst)));
goto bad_src_route;
@@ -27861,7 +13080,7 @@ ip_wput_options(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha,
case IPOPT_RR:
if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
ip1dbg((
- "ip_wput_options: bad option offset\n"));
+ "ip_output_options: bad option offset\n"));
code = (char *)&opt[IPOPT_OLEN] -
(char *)ipha;
goto param_prob;
@@ -27879,7 +13098,7 @@ ip_wput_options(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha,
}
if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
ip1dbg((
- "ip_wput_options: bad option offset\n"));
+ "ip_output_options: bad option offset\n"));
code = (char *)&opt[IPOPT_OFFSET] -
(char *)ipha;
goto param_prob;
@@ -27913,33 +13132,31 @@ ip_wput_options(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha,
if ((opts.ipoptp_flags & IPOPTP_ERROR) == 0)
return (0);
- ip1dbg(("ip_wput_options: error processing IP options."));
+ ip1dbg(("ip_output_options: error processing IP options."));
code = (char *)&opt[IPOPT_OFFSET] - (char *)ipha;
param_prob:
- /*
- * Since ip_wput() isn't close to finished, we fill
- * in enough of the header for credible error reporting.
- */
- if (ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid, ipst)) {
- /* Failed */
- freemsg(ipsec_mp);
- return (-1);
- }
- icmp_param_problem(q, ipsec_mp, (uint8_t)code, zoneid, ipst);
+ bzero(&iras, sizeof (iras));
+ iras.ira_ill = iras.ira_rill = ill;
+ iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
+ iras.ira_rifindex = iras.ira_ruifindex;
+ iras.ira_flags = IRAF_IS_IPV4;
+
+ ip_drop_output("ip_output_options", mp, ill);
+ icmp_param_problem(mp, (uint8_t)code, &iras);
+ ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
return (-1);
bad_src_route:
- /*
- * Since ip_wput() isn't close to finished, we fill
- * in enough of the header for credible error reporting.
- */
- if (ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid, ipst)) {
- /* Failed */
- freemsg(ipsec_mp);
- return (-1);
- }
- icmp_unreachable(q, ipsec_mp, ICMP_SOURCE_ROUTE_FAILED, zoneid, ipst);
+ bzero(&iras, sizeof (iras));
+ iras.ira_ill = iras.ira_rill = ill;
+ iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
+ iras.ira_rifindex = iras.ira_ruifindex;
+ iras.ira_flags = IRAF_IS_IPV4;
+
+ ip_drop_input("ICMP_SOURCE_ROUTE_FAILED", mp, ill);
+ icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, &iras);
+ ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
return (-1);
}
@@ -28082,29 +13299,60 @@ conn_drain_insert(conn_t *connp, idl_tx_list_t *tx_list)
/*
* For non streams based sockets assert flow control.
*/
- if (IPCL_IS_NONSTR(connp)) {
- DTRACE_PROBE1(su__txq__full, conn_t *, connp);
- (*connp->conn_upcalls->su_txq_full)
- (connp->conn_upper_handle, B_TRUE);
- } else {
- conn_setqfull(connp);
- noenable(connp->conn_wq);
- }
+ conn_setqfull(connp, NULL);
mutex_exit(CONN_DRAIN_LIST_LOCK(connp));
}
+static void
+conn_idl_remove(conn_t *connp)
+{
+ idl_t *idl = connp->conn_idl;
+
+ if (idl != NULL) {
+ /*
+ * Remove ourself from the drain list, if we did not do
+ * a putq, or if the conn is closing.
+ * Note: It is possible that q->q_first is non-null. It means
+ * that these messages landed after we did a enableok() in
+ * ip_wsrv. Thus STREAMS will call ip_wsrv once again to
+ * service them.
+ */
+ if (connp->conn_drain_next == connp) {
+ /* Singleton in the list */
+ ASSERT(connp->conn_drain_prev == connp);
+ idl->idl_conn = NULL;
+ } else {
+ connp->conn_drain_prev->conn_drain_next =
+ connp->conn_drain_next;
+ connp->conn_drain_next->conn_drain_prev =
+ connp->conn_drain_prev;
+ if (idl->idl_conn == connp)
+ idl->idl_conn = connp->conn_drain_next;
+ }
+ }
+ connp->conn_drain_next = NULL;
+ connp->conn_drain_prev = NULL;
+
+ conn_clrqfull(connp, NULL);
+ /*
+ * For streams based sockets open up flow control.
+ */
+ if (!IPCL_IS_NONSTR(connp))
+ enableok(connp->conn_wq);
+}
+
/*
* This conn is closing, and we are called from ip_close. OR
- * This conn has been serviced by ip_wsrv, and we need to do the tail
- * processing.
- * If this conn is part of the drain list, we may need to sustain the drain
- * process by qenabling the next conn in the drain list. We may also need to
- * remove this conn from the list, if it is done.
+ * this conn is draining because flow-control on the ill has been relieved.
+ *
+ * We must also need to remove conn's on this idl from the list, and also
+ * inform the sockfs upcalls about the change in flow-control.
*/
static void
conn_drain_tail(conn_t *connp, boolean_t closing)
{
idl_t *idl;
+ conn_t *next_connp;
/*
* connp->conn_idl is stable at this point, and no lock is needed
@@ -28116,24 +13364,21 @@ conn_drain_tail(conn_t *connp, boolean_t closing)
* instance of service trying to call conn_drain_insert on this conn
* now.
*/
- ASSERT(!closing || (connp->conn_idl != NULL));
+ ASSERT(!closing || connp == NULL || connp->conn_idl != NULL);
/*
* If connp->conn_idl is null, the conn has not been inserted into any
* drain list even once since creation of the conn. Just return.
*/
- if (connp->conn_idl == NULL)
+ if (connp == NULL || connp->conn_idl == NULL)
return;
- mutex_enter(CONN_DRAIN_LIST_LOCK(connp));
-
if (connp->conn_drain_prev == NULL) {
/* This conn is currently not in the drain list. */
- mutex_exit(CONN_DRAIN_LIST_LOCK(connp));
return;
}
idl = connp->conn_idl;
- if (idl->idl_conn_draining == connp) {
+ if (!closing) {
/*
* This conn is the current drainer. If this is the last conn
* in the drain list, we need to do more checks, in the 'if'
@@ -28141,186 +13386,45 @@ conn_drain_tail(conn_t *connp, boolean_t closing)
* to sustain the draining, and is handled in the 'else'
* below.
*/
- if (connp->conn_drain_next == idl->idl_conn) {
- /*
- * This conn is the last in this list. This round
- * of draining is complete. If idl_repeat is set,
- * it means another flow enabling has happened from
- * the driver/streams and we need to another round
- * of draining.
- * If there are more than 2 conns in the drain list,
- * do a left rotate by 1, so that all conns except the
- * conn at the head move towards the head by 1, and the
- * the conn at the head goes to the tail. This attempts
- * a more even share for all queues that are being
- * drained.
- */
- if ((connp->conn_drain_next != connp) &&
- (idl->idl_conn->conn_drain_next != connp)) {
- idl->idl_conn = idl->idl_conn->conn_drain_next;
- }
- if (idl->idl_repeat) {
- qenable(idl->idl_conn->conn_wq);
- idl->idl_conn_draining = idl->idl_conn;
- idl->idl_repeat = 0;
- } else {
- idl->idl_conn_draining = NULL;
- }
- } else {
- /*
- * If the next queue that we are now qenable'ing,
- * is closing, it will remove itself from this list
- * and qenable the subsequent queue in ip_close().
- * Serialization is acheived thru idl_lock.
- */
- qenable(connp->conn_drain_next->conn_wq);
- idl->idl_conn_draining = connp->conn_drain_next;
- }
- }
- if (!connp->conn_did_putbq || closing) {
- /*
- * Remove ourself from the drain list, if we did not do
- * a putbq, or if the conn is closing.
- * Note: It is possible that q->q_first is non-null. It means
- * that these messages landed after we did a enableok() in
- * ip_wsrv. Thus STREAMS will call ip_wsrv once again to
- * service them.
- */
- if (connp->conn_drain_next == connp) {
- /* Singleton in the list */
- ASSERT(connp->conn_drain_prev == connp);
- idl->idl_conn = NULL;
- idl->idl_conn_draining = NULL;
- } else {
- connp->conn_drain_prev->conn_drain_next =
- connp->conn_drain_next;
- connp->conn_drain_next->conn_drain_prev =
- connp->conn_drain_prev;
- if (idl->idl_conn == connp)
- idl->idl_conn = connp->conn_drain_next;
- ASSERT(idl->idl_conn_draining != connp);
-
- }
- connp->conn_drain_next = NULL;
- connp->conn_drain_prev = NULL;
+ next_connp = connp->conn_drain_next;
+ while (next_connp != connp) {
+ conn_t *delconnp = next_connp;
- /*
- * For non streams based sockets open up flow control.
- */
- if (IPCL_IS_NONSTR(connp)) {
- (*connp->conn_upcalls->su_txq_full)
- (connp->conn_upper_handle, B_FALSE);
- } else {
- conn_clrqfull(connp);
- enableok(connp->conn_wq);
+ next_connp = next_connp->conn_drain_next;
+ conn_idl_remove(delconnp);
}
+ ASSERT(connp->conn_drain_next == idl->idl_conn);
}
+ conn_idl_remove(connp);
- mutex_exit(CONN_DRAIN_LIST_LOCK(connp));
}
/*
* Write service routine. Shared perimeter entry point.
- * ip_wsrv can be called in any of the following ways.
- * 1. The device queue's messages has fallen below the low water mark
- * and STREAMS has backenabled the ill_wq. We walk thru all the
- * the drain lists and backenable the first conn in each list.
- * 2. The above causes STREAMS to run ip_wsrv on the conn_wq of the
- * qenabled non-tcp upper layers. We start dequeing messages and call
- * ip_wput for each message.
+ * The device queue's messages has fallen below the low water mark and STREAMS
+ * has backenabled the ill_wq. Send sockfs notification about flow-control onx
+ * each waiting conn.
*/
-
void
ip_wsrv(queue_t *q)
{
- conn_t *connp;
ill_t *ill;
- mblk_t *mp;
-
- if (q->q_next) {
- ill = (ill_t *)q->q_ptr;
- if (ill->ill_state_flags == 0) {
- ip_stack_t *ipst = ill->ill_ipst;
- /*
- * The device flow control has opened up.
- * Walk through conn drain lists and qenable the
- * first conn in each list. This makes sense only
- * if the stream is fully plumbed and setup.
- * Hence the if check above.
- */
- ip1dbg(("ip_wsrv: walking\n"));
- conn_walk_drain(ipst, &ipst->ips_idl_tx_list[0]);
- }
- return;
- }
-
- connp = Q_TO_CONN(q);
- ip1dbg(("ip_wsrv: %p %p\n", (void *)q, (void *)connp));
+ ill = (ill_t *)q->q_ptr;
+ if (ill->ill_state_flags == 0) {
+ ip_stack_t *ipst = ill->ill_ipst;
- /*
- * 1. Set conn_draining flag to signal that service is active.
- *
- * 2. ip_output determines whether it has been called from service,
- * based on the last parameter. If it is IP_WSRV it concludes it
- * has been called from service.
- *
- * 3. Message ordering is preserved by the following logic.
- * i. A directly called ip_output (i.e. not thru service) will queue
- * the message at the tail, if conn_draining is set (i.e. service
- * is running) or if q->q_first is non-null.
- *
- * ii. If ip_output is called from service, and if ip_output cannot
- * putnext due to flow control, it does a putbq.
- *
- * 4. noenable the queue so that a putbq from ip_wsrv does not reenable
- * (causing an infinite loop).
- */
- ASSERT(!connp->conn_did_putbq);
-
- while ((q->q_first != NULL) && !connp->conn_did_putbq) {
- connp->conn_draining = 1;
- noenable(q);
- while ((mp = getq(q)) != NULL) {
- ASSERT(CONN_Q(q));
-
- DTRACE_PROBE1(ip__wsrv__ip__output, conn_t *, connp);
- ip_output(Q_TO_CONN(q), mp, q, IP_WSRV);
- if (connp->conn_did_putbq) {
- /* ip_wput did a putbq */
- break;
- }
- }
/*
- * At this point, a thread coming down from top, calling
- * ip_wput, may end up queueing the message. We have not yet
- * enabled the queue, so ip_wsrv won't be called again.
- * To avoid this race, check q->q_first again (in the loop)
- * If the other thread queued the message before we call
- * enableok(), we will catch it in the q->q_first check.
- * If the other thread queues the message after we call
- * enableok(), ip_wsrv will be called again by STREAMS.
+ * The device flow control has opened up.
+ * Walk through conn drain lists and qenable the
+ * first conn in each list. This makes sense only
+ * if the stream is fully plumbed and setup.
+ * Hence the ill_state_flags check above.
*/
- connp->conn_draining = 0;
- enableok(q);
+ ip1dbg(("ip_wsrv: walking\n"));
+ conn_walk_drain(ipst, &ipst->ips_idl_tx_list[0]);
+ enableok(ill->ill_wq);
}
-
- /* Enable the next conn for draining */
- conn_drain_tail(connp, B_FALSE);
-
- /*
- * conn_direct_blocked is used to indicate blocked
- * condition for direct path (ILL_DIRECT_CAPABLE()).
- * This is the only place where it is set without
- * checking for ILL_DIRECT_CAPABLE() and setting it
- * to 0 is ok even if it is not ILL_DIRECT_CAPABLE().
- */
- if (!connp->conn_did_putbq && connp->conn_direct_blocked) {
- DTRACE_PROBE1(ip__wsrv__direct__blocked, conn_t *, connp);
- connp->conn_direct_blocked = B_FALSE;
- }
-
- connp->conn_did_putbq = 0;
}
/*
@@ -28369,21 +13473,7 @@ conn_walk_drain(ip_stack_t *ipst, idl_tx_list_t *tx_list)
for (i = 0; i < ipst->ips_conn_drain_list_cnt; i++) {
idl = &tx_list->txl_drain_list[i];
mutex_enter(&idl->idl_lock);
- if (idl->idl_conn == NULL) {
- mutex_exit(&idl->idl_lock);
- continue;
- }
- /*
- * If this list is not being drained currently by
- * an ip_wsrv thread, start the process.
- */
- if (idl->idl_conn_draining == NULL) {
- ASSERT(idl->idl_repeat == 0);
- qenable(idl->idl_conn->conn_wq);
- idl->idl_conn_draining = idl->idl_conn;
- } else {
- idl->idl_repeat = 1;
- }
+ conn_drain_tail(idl->idl_conn, B_FALSE);
mutex_exit(&idl->idl_lock);
}
}
@@ -28393,240 +13483,190 @@ conn_walk_drain(ip_stack_t *ipst, idl_tx_list_t *tx_list)
* "matches" the conn.
*/
boolean_t
-conn_wantpacket(conn_t *connp, ill_t *ill, ipha_t *ipha, int fanout_flags,
- zoneid_t zoneid)
+conn_wantpacket(conn_t *connp, ip_recv_attr_t *ira, ipha_t *ipha)
{
- ill_t *bound_ill;
- boolean_t found;
- ipif_t *ipif;
- ire_t *ire;
- ipaddr_t dst, src;
- ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
+ ill_t *ill = ira->ira_rill;
+ zoneid_t zoneid = ira->ira_zoneid;
+ uint_t in_ifindex;
+ ipaddr_t dst, src;
dst = ipha->ipha_dst;
src = ipha->ipha_src;
/*
- * conn_incoming_ill is set by IP_BOUND_IF which limits
+ * conn_incoming_ifindex is set by IP_BOUND_IF which limits
* unicast, broadcast and multicast reception to
- * conn_incoming_ill. conn_wantpacket itself is called
- * only for BROADCAST and multicast.
+ * conn_incoming_ifindex.
+ * conn_wantpacket is called for unicast, broadcast and
+ * multicast packets.
*/
- bound_ill = connp->conn_incoming_ill;
- if (bound_ill != NULL) {
- if (IS_IPMP(bound_ill)) {
- if (bound_ill->ill_grp != ill->ill_grp)
- return (B_FALSE);
- } else {
- if (bound_ill != ill)
- return (B_FALSE);
- }
- }
+ in_ifindex = connp->conn_incoming_ifindex;
- if (!CLASSD(dst)) {
- if (IPCL_ZONE_MATCH(connp, zoneid))
- return (B_TRUE);
- /*
- * The conn is in a different zone; we need to check that this
- * broadcast address is configured in the application's zone.
- */
- ipif = ipif_get_next_ipif(NULL, ill);
- if (ipif == NULL)
+ /* mpathd can bind to the under IPMP interface, which we allow */
+ if (in_ifindex != 0 && in_ifindex != ill->ill_phyint->phyint_ifindex) {
+ if (!IS_UNDER_IPMP(ill))
return (B_FALSE);
- ire = ire_ctable_lookup(dst, 0, IRE_BROADCAST, ipif,
- connp->conn_zoneid, NULL,
- (MATCH_IRE_TYPE | MATCH_IRE_ILL), ipst);
- ipif_refrele(ipif);
- if (ire != NULL) {
- ire_refrele(ire);
- return (B_TRUE);
- } else {
+
+ if (in_ifindex != ipmp_ill_get_ipmp_ifindex(ill))
return (B_FALSE);
- }
}
- if ((fanout_flags & IP_FF_NO_MCAST_LOOP) &&
- connp->conn_zoneid == zoneid) {
- /*
- * Loopback case: the sending endpoint has IP_MULTICAST_LOOP
- * disabled, therefore we don't dispatch the multicast packet to
- * the sending zone.
- */
+ if (!IPCL_ZONE_MATCH(connp, zoneid))
return (B_FALSE);
- }
- if (IS_LOOPBACK(ill) && connp->conn_zoneid != zoneid) {
- /*
- * Multicast packet on the loopback interface: we only match
- * conns who joined the group in the specified zone.
- */
- return (B_FALSE);
- }
+ if (!(ira->ira_flags & IRAF_MULTICAST))
+ return (B_TRUE);
if (connp->conn_multi_router) {
/* multicast packet and multicast router socket: send up */
return (B_TRUE);
}
- mutex_enter(&connp->conn_lock);
- found = (ilg_lookup_ill_withsrc(connp, dst, src, ill) != NULL);
- mutex_exit(&connp->conn_lock);
- return (found);
+ if (ipha->ipha_protocol == IPPROTO_PIM ||
+ ipha->ipha_protocol == IPPROTO_RSVP)
+ return (B_TRUE);
+
+ return (conn_hasmembers_ill_withsrc_v4(connp, dst, src, ira->ira_ill));
}
-static void
-conn_setqfull(conn_t *connp)
+void
+conn_setqfull(conn_t *connp, boolean_t *flow_stopped)
{
- queue_t *q = connp->conn_wq;
+ if (IPCL_IS_NONSTR(connp)) {
+ (*connp->conn_upcalls->su_txq_full)
+ (connp->conn_upper_handle, B_TRUE);
+ if (flow_stopped != NULL)
+ *flow_stopped = B_TRUE;
+ } else {
+ queue_t *q = connp->conn_wq;
- if (!(q->q_flag & QFULL)) {
- mutex_enter(QLOCK(q));
+ ASSERT(q != NULL);
if (!(q->q_flag & QFULL)) {
- /* still need to set QFULL */
- q->q_flag |= QFULL;
- mutex_exit(QLOCK(q));
- } else {
- mutex_exit(QLOCK(q));
+ mutex_enter(QLOCK(q));
+ if (!(q->q_flag & QFULL)) {
+ /* still need to set QFULL */
+ q->q_flag |= QFULL;
+ /* set flow_stopped to true under QLOCK */
+ if (flow_stopped != NULL)
+ *flow_stopped = B_TRUE;
+ mutex_exit(QLOCK(q));
+ } else {
+ /* flow_stopped is left unchanged */
+ mutex_exit(QLOCK(q));
+ }
}
}
}
-static void
-conn_clrqfull(conn_t *connp)
+void
+conn_clrqfull(conn_t *connp, boolean_t *flow_stopped)
{
- queue_t *q = connp->conn_wq;
+ if (IPCL_IS_NONSTR(connp)) {
+ (*connp->conn_upcalls->su_txq_full)
+ (connp->conn_upper_handle, B_FALSE);
+ if (flow_stopped != NULL)
+ *flow_stopped = B_FALSE;
+ } else {
+ queue_t *q = connp->conn_wq;
- if (q->q_flag & QFULL) {
- mutex_enter(QLOCK(q));
+ ASSERT(q != NULL);
if (q->q_flag & QFULL) {
- q->q_flag &= ~QFULL;
- mutex_exit(QLOCK(q));
- if (q->q_flag & QWANTW)
- qbackenable(q, 0);
- } else {
- mutex_exit(QLOCK(q));
+ mutex_enter(QLOCK(q));
+ if (q->q_flag & QFULL) {
+ q->q_flag &= ~QFULL;
+ /* set flow_stopped to false under QLOCK */
+ if (flow_stopped != NULL)
+ *flow_stopped = B_FALSE;
+ mutex_exit(QLOCK(q));
+ if (q->q_flag & QWANTW)
+ qbackenable(q, 0);
+ } else {
+ /* flow_stopped is left unchanged */
+ mutex_exit(QLOCK(q));
+ }
}
}
+ connp->conn_direct_blocked = B_FALSE;
}
/*
- * Finish processing of "arp_up" when AR_DLPIOP_DONE is received from arp.
+ * Return the length in bytes of the IPv4 headers (base header, label, and
+ * other IP options) that will be needed based on the
+ * ip_pkt_t structure passed by the caller.
+ *
+ * The returned length does not include the length of the upper level
+ * protocol (ULP) header.
+ * The caller needs to check that the length doesn't exceed the max for IPv4.
*/
-/* ARGSUSED */
-static void
-ip_arp_done(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp, void *dummy_arg)
+int
+ip_total_hdrs_len_v4(const ip_pkt_t *ipp)
{
- ill_t *ill = (ill_t *)q->q_ptr;
- mblk_t *mp1, *mp2;
- ipif_t *ipif;
- int err = 0;
- conn_t *connp = NULL;
- ipsq_t *ipsq;
- arc_t *arc;
-
- ip1dbg(("ip_arp_done(%s)\n", ill->ill_name));
-
- ASSERT((mp->b_wptr - mp->b_rptr) >= sizeof (arc_t));
- ASSERT(((arc_t *)mp->b_rptr)->arc_cmd == AR_DLPIOP_DONE);
-
- ASSERT(IAM_WRITER_ILL(ill));
- mp2 = mp->b_cont;
- mp->b_cont = NULL;
+ int len;
- /*
- * We have now received the arp bringup completion message
- * from ARP. Mark the arp bringup as done. Also if the arp
- * stream has already started closing, send up the AR_ARP_CLOSING
- * ack now since ARP is waiting in close for this ack.
- */
- mutex_enter(&ill->ill_lock);
- ill->ill_arp_bringup_pending = 0;
- if (ill->ill_arp_closing) {
- mutex_exit(&ill->ill_lock);
- /* Let's reuse the mp for sending the ack */
- arc = (arc_t *)mp->b_rptr;
- mp->b_wptr = mp->b_rptr + sizeof (arc_t);
- arc->arc_cmd = AR_ARP_CLOSING;
- qreply(q, mp);
- } else {
- mutex_exit(&ill->ill_lock);
- freeb(mp);
+ len = IP_SIMPLE_HDR_LENGTH;
+ if (ipp->ipp_fields & IPPF_LABEL_V4) {
+ ASSERT(ipp->ipp_label_len_v4 != 0);
+ /* We need to round up here */
+ len += (ipp->ipp_label_len_v4 + 3) & ~3;
}
- ipsq = ill->ill_phyint->phyint_ipsq;
- ipif = ipsq->ipsq_xop->ipx_pending_ipif;
- mp1 = ipsq_pending_mp_get(ipsq, &connp);
- ASSERT(!((mp1 != NULL) ^ (ipif != NULL)));
- if (mp1 == NULL) {
- /* bringup was aborted by the user */
- freemsg(mp2);
- return;
+ if (ipp->ipp_fields & IPPF_IPV4_OPTIONS) {
+ ASSERT(ipp->ipp_ipv4_options_len != 0);
+ ASSERT((ipp->ipp_ipv4_options_len & 3) == 0);
+ len += ipp->ipp_ipv4_options_len;
}
+ return (len);
+}
- /*
- * If an IOCTL is waiting on this (ipx_current_ioctl != 0), then we
- * must have an associated conn_t. Otherwise, we're bringing this
- * interface back up as part of handling an asynchronous event (e.g.,
- * physical address change).
- */
- if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
- ASSERT(connp != NULL);
- q = CONNP_TO_WQ(connp);
- } else {
- ASSERT(connp == NULL);
- q = ill->ill_rq;
- }
+/*
+ * All-purpose routine to build an IPv4 header with options based
+ * on the abstract ip_pkt_t.
+ *
+ * The caller has to set the source and destination address as well as
+ * ipha_length. The caller has to massage any source route and compensate
+ * for the ULP pseudo-header checksum due to the source route.
+ */
+void
+ip_build_hdrs_v4(uchar_t *buf, uint_t buf_len, const ip_pkt_t *ipp,
+ uint8_t protocol)
+{
+ ipha_t *ipha = (ipha_t *)buf;
+ uint8_t *cp;
- /*
- * If the DL_BIND_REQ fails, it is noted
- * in arc_name_offset.
- */
- err = *((int *)mp2->b_rptr);
- if (err == 0) {
- if (ipif->ipif_isv6) {
- if ((err = ipif_up_done_v6(ipif)) != 0)
- ip0dbg(("ip_arp_done: init failed\n"));
- } else {
- if ((err = ipif_up_done(ipif)) != 0)
- ip0dbg(("ip_arp_done: init failed\n"));
- }
- } else {
- ip0dbg(("ip_arp_done: DL_BIND_REQ failed\n"));
- }
+ /* Initialize IPv4 header */
+ ipha->ipha_type_of_service = ipp->ipp_type_of_service;
+ ipha->ipha_length = 0; /* Caller will set later */
+ ipha->ipha_ident = 0;
+ ipha->ipha_fragment_offset_and_flags = 0;
+ ipha->ipha_ttl = ipp->ipp_unicast_hops;
+ ipha->ipha_protocol = protocol;
+ ipha->ipha_hdr_checksum = 0;
- freemsg(mp2);
+ if ((ipp->ipp_fields & IPPF_ADDR) &&
+ IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
+ ipha->ipha_src = ipp->ipp_addr_v4;
- if ((err == 0) && (ill->ill_up_ipifs)) {
- err = ill_up_ipifs(ill, q, mp1);
- if (err == EINPROGRESS)
- return;
+ cp = (uint8_t *)&ipha[1];
+ if (ipp->ipp_fields & IPPF_LABEL_V4) {
+ ASSERT(ipp->ipp_label_len_v4 != 0);
+ bcopy(ipp->ipp_label_v4, cp, ipp->ipp_label_len_v4);
+ cp += ipp->ipp_label_len_v4;
+ /* We need to round up here */
+ while ((uintptr_t)cp & 0x3) {
+ *cp++ = IPOPT_NOP;
+ }
}
- /*
- * If we have a moved ipif to bring up, and everything has succeeded
- * to this point, bring it up on the IPMP ill. Otherwise, leave it
- * down -- the admin can try to bring it up by hand if need be.
- */
- if (ill->ill_move_ipif != NULL) {
- ipif = ill->ill_move_ipif;
- ill->ill_move_ipif = NULL;
- if (err == 0) {
- err = ipif_up(ipif, q, mp1);
- if (err == EINPROGRESS)
- return;
- }
+ if (ipp->ipp_fields & IPPF_IPV4_OPTIONS) {
+ ASSERT(ipp->ipp_ipv4_options_len != 0);
+ ASSERT((ipp->ipp_ipv4_options_len & 3) == 0);
+ bcopy(ipp->ipp_ipv4_options, cp, ipp->ipp_ipv4_options_len);
+ cp += ipp->ipp_ipv4_options_len;
}
+ ipha->ipha_version_and_hdr_length =
+ (uint8_t)((IP_VERSION << 4) + buf_len / 4);
- /*
- * The operation must complete without EINPROGRESS since
- * ipsq_pending_mp_get() has removed the mblk. Otherwise, the
- * operation will be stuck forever in the ipsq.
- */
- ASSERT(err != EINPROGRESS);
- if (ipsq->ipsq_xop->ipx_current_ioctl != 0)
- ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq);
- else
- ipsq_current_finish(ipsq);
+ ASSERT((int)(cp - buf) == buf_len);
}
/* Allocate the private structure */
@@ -28659,47 +13699,43 @@ ip_priv_free(void *buf)
* which holds the state information for this packet and invokes the
* the classifier (via ipp_packet_process). The classification, depending on
* configured filters, results in a list of actions for this packet. Invoking
- * an action may cause the packet to be dropped, in which case the resulting
- * mblk (*mpp) is NULL. proc indicates the callout position for
- * this packet and ill_index is the interface this packet on or will leave
+ * an action may cause the packet to be dropped, in which case we return NULL.
+ * proc indicates the callout position for
+ * this packet and ill is the interface this packet arrived on or will leave
* on (inbound and outbound resp.).
+ *
+ * We do the processing on the rill (mapped to the upper if ipmp), but MIB
+ * on the ill corrsponding to the destination IP address.
*/
-void
-ip_process(ip_proc_t proc, mblk_t **mpp, uint32_t ill_index)
+mblk_t *
+ip_process(ip_proc_t proc, mblk_t *mp, ill_t *rill, ill_t *ill)
{
- mblk_t *mp;
ip_priv_t *priv;
ipp_action_id_t aid;
int rc = 0;
ipp_packet_t *pp;
-#define IP_CLASS "ip"
/* If the classifier is not loaded, return */
if ((aid = ipp_action_lookup(IPGPC_CLASSIFY)) == IPP_ACTION_INVAL) {
- return;
+ return (mp);
}
- mp = *mpp;
ASSERT(mp != NULL);
/* Allocate the packet structure */
- rc = ipp_packet_alloc(&pp, IP_CLASS, aid);
- if (rc != 0) {
- *mpp = NULL;
- freemsg(mp);
- return;
- }
+ rc = ipp_packet_alloc(&pp, "ip", aid);
+ if (rc != 0)
+ goto drop;
/* Allocate the private structure */
rc = ip_priv_alloc((void **)&priv);
if (rc != 0) {
- *mpp = NULL;
- freemsg(mp);
ipp_packet_free(pp);
- return;
+ goto drop;
}
priv->proc = proc;
- priv->ill_index = ill_index;
+ priv->ill_index = ill_get_upper_ifindex(rill);
+
ipp_packet_set_private(pp, priv, ip_priv_free);
ipp_packet_set_data(pp, mp);
@@ -28708,14 +13744,23 @@ ip_process(ip_proc_t proc, mblk_t **mpp, uint32_t ill_index)
if (pp != NULL) {
mp = ipp_packet_get_data(pp);
ipp_packet_free(pp);
- if (rc != 0) {
- freemsg(mp);
- *mpp = NULL;
- }
+ if (rc != 0)
+ goto drop;
+ return (mp);
} else {
- *mpp = NULL;
+ /* No mp to trace in ip_drop_input/ip_drop_output */
+ mp = NULL;
}
-#undef IP_CLASS
+drop:
+ if (proc == IPP_LOCAL_IN || proc == IPP_FWD_IN) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ip_process", mp, ill);
+ } else {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
+ ip_drop_output("ip_process", mp, ill);
+ }
+ freemsg(mp);
+ return (NULL);
}
/*
@@ -28723,102 +13768,92 @@ ip_process(ip_proc_t proc, mblk_t **mpp, uint32_t ill_index)
* all the interfaces crossed by the related multirt routes.
* The call is considered successful if the operation succeeds
* on at least one interface.
+ *
+ * This assumes that a set of IRE_HOST/RTF_MULTIRT has been created for the
+ * multicast addresses with the ire argument being the first one.
+ * We walk the bucket to find all the of those.
+ *
+ * Common to IPv4 and IPv6.
*/
static int
-ip_multirt_apply_membership(int (*fn)(conn_t *, boolean_t, ipaddr_t, ipaddr_t,
- uint_t *, mcast_record_t, ipaddr_t, mblk_t *), ire_t *ire, conn_t *connp,
- boolean_t checkonly, ipaddr_t group, mcast_record_t fmode, ipaddr_t src,
- mblk_t *first_mp)
+ip_multirt_apply_membership(int (*fn)(conn_t *, boolean_t,
+ const in6_addr_t *, ipaddr_t, uint_t, mcast_record_t, const in6_addr_t *),
+ ire_t *ire, conn_t *connp, boolean_t checkonly, const in6_addr_t *v6group,
+ mcast_record_t fmode, const in6_addr_t *v6src)
{
ire_t *ire_gw;
irb_t *irb;
+ int ifindex;
int error = 0;
- opt_restart_t *or;
+ int result;
ip_stack_t *ipst = ire->ire_ipst;
+ ipaddr_t group;
+ boolean_t isv6;
+ int match_flags;
+
+ if (IN6_IS_ADDR_V4MAPPED(v6group)) {
+ IN6_V4MAPPED_TO_IPADDR(v6group, group);
+ isv6 = B_FALSE;
+ } else {
+ isv6 = B_TRUE;
+ }
irb = ire->ire_bucket;
ASSERT(irb != NULL);
- ASSERT(DB_TYPE(first_mp) == M_CTL);
-
- or = (opt_restart_t *)first_mp->b_rptr;
- IRB_REFHOLD(irb);
+ result = 0;
+ irb_refhold(irb);
for (; ire != NULL; ire = ire->ire_next) {
if ((ire->ire_flags & RTF_MULTIRT) == 0)
continue;
- if (ire->ire_addr != group)
- continue;
- ire_gw = ire_ftable_lookup(ire->ire_gateway_addr, 0, 0,
- IRE_INTERFACE, NULL, NULL, ALL_ZONES, 0, NULL,
- MATCH_IRE_RECURSIVE | MATCH_IRE_TYPE, ipst);
- /* No resolver exists for the gateway; skip this ire. */
+ /* We handle -ifp routes by matching on the ill if set */
+ match_flags = MATCH_IRE_TYPE;
+ if (ire->ire_ill != NULL)
+ match_flags |= MATCH_IRE_ILL;
+
+ if (isv6) {
+ if (!IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, v6group))
+ continue;
+
+ ire_gw = ire_ftable_lookup_v6(&ire->ire_gateway_addr_v6,
+ 0, 0, IRE_INTERFACE, ire->ire_ill, ALL_ZONES, NULL,
+ match_flags, 0, ipst, NULL);
+ } else {
+ if (ire->ire_addr != group)
+ continue;
+
+ ire_gw = ire_ftable_lookup_v4(ire->ire_gateway_addr,
+ 0, 0, IRE_INTERFACE, ire->ire_ill, ALL_ZONES, NULL,
+ match_flags, 0, ipst, NULL);
+ }
+ /* No interface route exists for the gateway; skip this ire. */
if (ire_gw == NULL)
continue;
+ if (ire_gw->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
+ ire_refrele(ire_gw);
+ continue;
+ }
+ ASSERT(ire_gw->ire_ill != NULL); /* IRE_INTERFACE */
+ ifindex = ire_gw->ire_ill->ill_phyint->phyint_ifindex;
/*
- * This function can return EINPROGRESS. If so the operation
- * will be restarted from ip_restart_optmgmt which will
- * call ip_opt_set and option processing will restart for
- * this option. So we may end up calling 'fn' more than once.
- * This requires that 'fn' is idempotent except for the
- * return value. The operation is considered a success if
+ * The operation is considered a success if
* it succeeds at least once on any one interface.
*/
- error = fn(connp, checkonly, group, ire_gw->ire_src_addr,
- NULL, fmode, src, first_mp);
+ error = fn(connp, checkonly, v6group, INADDR_ANY, ifindex,
+ fmode, v6src);
if (error == 0)
- or->or_private = CGTP_MCAST_SUCCESS;
-
- if (ip_debug > 0) {
- ulong_t off;
- char *ksym;
- ksym = kobj_getsymname((uintptr_t)fn, &off);
- ip2dbg(("ip_multirt_apply_membership: "
- "called %s, multirt group 0x%08x via itf 0x%08x, "
- "error %d [success %u]\n",
- ksym ? ksym : "?",
- ntohl(group), ntohl(ire_gw->ire_src_addr),
- error, or->or_private));
- }
+ result = CGTP_MCAST_SUCCESS;
ire_refrele(ire_gw);
- if (error == EINPROGRESS) {
- IRB_REFRELE(irb);
- return (error);
- }
}
- IRB_REFRELE(irb);
+ irb_refrele(irb);
/*
* Consider the call as successful if we succeeded on at least
* one interface. Otherwise, return the last encountered error.
*/
- return (or->or_private == CGTP_MCAST_SUCCESS ? 0 : error);
-}
-
-/*
- * Issue a warning regarding a route crossing an interface with an
- * incorrect MTU. Only one message every 'ip_multirt_log_interval'
- * amount of time is logged.
- */
-static void
-ip_multirt_bad_mtu(ire_t *ire, uint32_t max_frag)
-{
- hrtime_t current = gethrtime();
- char buf[INET_ADDRSTRLEN];
- ip_stack_t *ipst = ire->ire_ipst;
-
- /* Convert interval in ms to hrtime in ns */
- if (ipst->ips_multirt_bad_mtu_last_time +
- ((hrtime_t)ipst->ips_ip_multirt_log_interval * (hrtime_t)1000000) <=
- current) {
- cmn_err(CE_WARN, "ip: ignoring multiroute "
- "to %s, incorrect MTU %u (expected %u)\n",
- ip_dot_addr(ire->ire_addr, buf),
- ire->ire_max_frag, max_frag);
-
- ipst->ips_multirt_bad_mtu_last_time = current;
- }
+ return (result == CGTP_MCAST_SUCCESS ? 0 : error);
}
/*
@@ -28882,6 +13917,7 @@ ip_cgtp_filter_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
*ip_cgtp_filter_value = (boolean_t)new_value;
+ ill_set_inputfn_all(ipst);
return (0);
}
@@ -28919,6 +13955,9 @@ ip_cgtp_filter_register(netstackid_t stackid, cgtp_filter_ops_t *ops)
}
ipst->ips_ip_cgtp_filter_ops = ops;
+
+ ill_set_inputfn_all(ipst);
+
netstack_rele(ns);
return (0);
}
@@ -28950,6 +13989,9 @@ ip_cgtp_filter_unregister(netstackid_t stackid)
return (ENXIO);
}
ipst->ips_ip_cgtp_filter_ops = NULL;
+
+ ill_set_inputfn_all(ipst);
+
netstack_rele(ns);
return (0);
}
@@ -28984,7 +14026,7 @@ ip_cgtp_filter_is_registered(netstackid_t stackid)
static int
ip_squeue_switch(int val)
{
- int rval = SQ_FILL;
+ int rval;
switch (val) {
case IP_SQUEUE_ENTER_NODRAIN:
@@ -28993,7 +14035,9 @@ ip_squeue_switch(int val)
case IP_SQUEUE_ENTER:
rval = SQ_PROCESS;
break;
+ case IP_SQUEUE_FILL:
default:
+ rval = SQ_FILL;
break;
}
return (rval);
@@ -29046,52 +14090,45 @@ ip_kstat2_init(netstackid_t stackid, ip_stat_t *ip_statisticsp)
kstat_t *ksp;
ip_stat_t template = {
- { "ipsec_fanout_proto", KSTAT_DATA_UINT64 },
{ "ip_udp_fannorm", KSTAT_DATA_UINT64 },
{ "ip_udp_fanmb", KSTAT_DATA_UINT64 },
- { "ip_udp_fanothers", KSTAT_DATA_UINT64 },
- { "ip_udp_fast_path", KSTAT_DATA_UINT64 },
- { "ip_udp_slow_path", KSTAT_DATA_UINT64 },
- { "ip_udp_input_err", KSTAT_DATA_UINT64 },
- { "ip_tcppullup", KSTAT_DATA_UINT64 },
- { "ip_tcpoptions", KSTAT_DATA_UINT64 },
- { "ip_multipkttcp", KSTAT_DATA_UINT64 },
- { "ip_tcp_fast_path", KSTAT_DATA_UINT64 },
- { "ip_tcp_slow_path", KSTAT_DATA_UINT64 },
- { "ip_tcp_input_error", KSTAT_DATA_UINT64 },
+ { "ip_recv_pullup", KSTAT_DATA_UINT64 },
{ "ip_db_ref", KSTAT_DATA_UINT64 },
- { "ip_notaligned1", KSTAT_DATA_UINT64 },
- { "ip_notaligned2", KSTAT_DATA_UINT64 },
- { "ip_multimblk3", KSTAT_DATA_UINT64 },
- { "ip_multimblk4", KSTAT_DATA_UINT64 },
- { "ip_ipoptions", KSTAT_DATA_UINT64 },
- { "ip_classify_fail", KSTAT_DATA_UINT64 },
+ { "ip_notaligned", KSTAT_DATA_UINT64 },
+ { "ip_multimblk", KSTAT_DATA_UINT64 },
{ "ip_opt", KSTAT_DATA_UINT64 },
- { "ip_udp_rput_local", KSTAT_DATA_UINT64 },
{ "ipsec_proto_ahesp", KSTAT_DATA_UINT64 },
{ "ip_conn_flputbq", KSTAT_DATA_UINT64 },
{ "ip_conn_walk_drain", KSTAT_DATA_UINT64 },
{ "ip_out_sw_cksum", KSTAT_DATA_UINT64 },
+ { "ip_out_sw_cksum_bytes", KSTAT_DATA_UINT64 },
{ "ip_in_sw_cksum", KSTAT_DATA_UINT64 },
- { "ip_trash_ire_reclaim_calls", KSTAT_DATA_UINT64 },
- { "ip_trash_ire_reclaim_success", KSTAT_DATA_UINT64 },
- { "ip_ire_arp_timer_expired", KSTAT_DATA_UINT64 },
- { "ip_ire_redirect_timer_expired", KSTAT_DATA_UINT64 },
- { "ip_ire_pmtu_timer_expired", KSTAT_DATA_UINT64 },
- { "ip_input_multi_squeue", KSTAT_DATA_UINT64 },
+ { "ip_ire_reclaim_calls", KSTAT_DATA_UINT64 },
+ { "ip_ire_reclaim_deleted", KSTAT_DATA_UINT64 },
+ { "ip_nce_reclaim_calls", KSTAT_DATA_UINT64 },
+ { "ip_nce_reclaim_deleted", KSTAT_DATA_UINT64 },
+ { "ip_dce_reclaim_calls", KSTAT_DATA_UINT64 },
+ { "ip_dce_reclaim_deleted", KSTAT_DATA_UINT64 },
{ "ip_tcp_in_full_hw_cksum_err", KSTAT_DATA_UINT64 },
{ "ip_tcp_in_part_hw_cksum_err", KSTAT_DATA_UINT64 },
{ "ip_tcp_in_sw_cksum_err", KSTAT_DATA_UINT64 },
- { "ip_tcp_out_sw_cksum_bytes", KSTAT_DATA_UINT64 },
{ "ip_udp_in_full_hw_cksum_err", KSTAT_DATA_UINT64 },
{ "ip_udp_in_part_hw_cksum_err", KSTAT_DATA_UINT64 },
- { "ip_udp_in_sw_cksum_err", KSTAT_DATA_UINT64 },
- { "ip_udp_out_sw_cksum_bytes", KSTAT_DATA_UINT64 },
- { "ip_frag_mdt_pkt_out", KSTAT_DATA_UINT64 },
- { "ip_frag_mdt_discarded", KSTAT_DATA_UINT64 },
- { "ip_frag_mdt_allocfail", KSTAT_DATA_UINT64 },
- { "ip_frag_mdt_addpdescfail", KSTAT_DATA_UINT64 },
- { "ip_frag_mdt_allocd", KSTAT_DATA_UINT64 },
+ { "ip_udp_in_sw_cksum_err", KSTAT_DATA_UINT64 },
+ { "conn_in_recvdstaddr", KSTAT_DATA_UINT64 },
+ { "conn_in_recvopts", KSTAT_DATA_UINT64 },
+ { "conn_in_recvif", KSTAT_DATA_UINT64 },
+ { "conn_in_recvslla", KSTAT_DATA_UINT64 },
+ { "conn_in_recvucred", KSTAT_DATA_UINT64 },
+ { "conn_in_recvttl", KSTAT_DATA_UINT64 },
+ { "conn_in_recvhopopts", KSTAT_DATA_UINT64 },
+ { "conn_in_recvhoplimit", KSTAT_DATA_UINT64 },
+ { "conn_in_recvdstopts", KSTAT_DATA_UINT64 },
+ { "conn_in_recvrthdrdstopts", KSTAT_DATA_UINT64 },
+ { "conn_in_recvrthdr", KSTAT_DATA_UINT64 },
+ { "conn_in_recvpktinfo", KSTAT_DATA_UINT64 },
+ { "conn_in_recvtclass", KSTAT_DATA_UINT64 },
+ { "conn_in_timestamp", KSTAT_DATA_UINT64 },
};
ksp = kstat_create_netstack("ip", 0, "ipstat", "net",
@@ -29420,323 +14457,457 @@ icmp_kstat_update(kstat_t *kp, int rw)
* a port. This is assured in ipcl_sctp_hash_insert();
*/
void
-ip_fanout_sctp_raw(mblk_t *mp, ill_t *recv_ill, ipha_t *ipha, boolean_t isv4,
- uint32_t ports, boolean_t mctl_present, uint_t flags, boolean_t ip_policy,
- zoneid_t zoneid)
+ip_fanout_sctp_raw(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h, uint32_t ports,
+ ip_recv_attr_t *ira)
{
conn_t *connp;
queue_t *rq;
- mblk_t *first_mp;
boolean_t secure;
- ip6_t *ip6h;
- ip_stack_t *ipst = recv_ill->ill_ipst;
+ ill_t *ill = ira->ira_ill;
+ ip_stack_t *ipst = ill->ill_ipst;
ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec;
sctp_stack_t *sctps = ipst->ips_netstack->netstack_sctp;
- boolean_t sctp_csum_err = B_FALSE;
+ iaflags_t iraflags = ira->ira_flags;
+ ill_t *rill = ira->ira_rill;
- if (flags & IP_FF_SCTP_CSUM_ERR) {
- sctp_csum_err = B_TRUE;
- flags &= ~IP_FF_SCTP_CSUM_ERR;
- }
+ secure = iraflags & IRAF_IPSEC_SECURE;
- first_mp = mp;
- if (mctl_present) {
- mp = first_mp->b_cont;
- secure = ipsec_in_is_secure(first_mp);
- ASSERT(mp != NULL);
- } else {
- secure = B_FALSE;
- }
- ip6h = (isv4) ? NULL : (ip6_t *)ipha;
-
- connp = ipcl_classify_raw(mp, IPPROTO_SCTP, zoneid, ports, ipha, ipst);
+ connp = ipcl_classify_raw(mp, IPPROTO_SCTP, ports, ipha, ip6h,
+ ira, ipst);
if (connp == NULL) {
/*
* Although raw sctp is not summed, OOB chunks must be.
* Drop the packet here if the sctp checksum failed.
*/
- if (sctp_csum_err) {
+ if (iraflags & IRAF_SCTP_CSUM_ERR) {
BUMP_MIB(&sctps->sctps_mib, sctpChecksumError);
- freemsg(first_mp);
+ freemsg(mp);
return;
}
- sctp_ootb_input(first_mp, recv_ill, zoneid, mctl_present);
+ ira->ira_ill = ira->ira_rill = NULL;
+ sctp_ootb_input(mp, ira, ipst);
+ ira->ira_ill = ill;
+ ira->ira_rill = rill;
return;
}
rq = connp->conn_rq;
- if (!canputnext(rq)) {
+ if (IPCL_IS_NONSTR(connp) ? connp->conn_flow_cntrld : !canputnext(rq)) {
CONN_DEC_REF(connp);
- BUMP_MIB(recv_ill->ill_ip_mib, rawipIfStatsInOverflows);
- freemsg(first_mp);
+ BUMP_MIB(ill->ill_ip_mib, rawipIfStatsInOverflows);
+ freemsg(mp);
return;
}
- if ((isv4 ? CONN_INBOUND_POLICY_PRESENT(connp, ipss) :
- CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss)) || secure) {
- first_mp = ipsec_check_inbound_policy(first_mp, connp,
- (isv4 ? ipha : NULL), ip6h, mctl_present);
- if (first_mp == NULL) {
- BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsInDiscards);
+ if (((iraflags & IRAF_IS_IPV4) ?
+ CONN_INBOUND_POLICY_PRESENT(connp, ipss) :
+ CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss)) ||
+ secure) {
+ mp = ipsec_check_inbound_policy(mp, connp, ipha,
+ ip6h, ira);
+ if (mp == NULL) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ /* Note that mp is NULL */
+ ip_drop_input("ipIfStatsInDiscards", mp, ill);
CONN_DEC_REF(connp);
return;
}
}
- /*
- * We probably should not send M_CTL message up to
- * raw socket.
- */
- if (mctl_present)
- freeb(first_mp);
- /* Initiate IPPF processing here if needed. */
- if ((isv4 && IPP_ENABLED(IPP_LOCAL_IN, ipst) && ip_policy) ||
- (!isv4 && IP6_IN_IPP(flags, ipst))) {
- ip_process(IPP_LOCAL_IN, &mp,
- recv_ill->ill_phyint->phyint_ifindex);
- if (mp == NULL) {
- CONN_DEC_REF(connp);
- return;
- }
+ if (iraflags & IRAF_ICMP_ERROR) {
+ (connp->conn_recvicmp)(connp, mp, NULL, ira);
+ } else {
+ ill_t *rill = ira->ira_rill;
+
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
+ /* This is the SOCK_RAW, IPPROTO_SCTP case. */
+ ira->ira_ill = ira->ira_rill = NULL;
+ (connp->conn_recv)(connp, mp, NULL, ira);
+ ira->ira_ill = ill;
+ ira->ira_rill = rill;
}
+ CONN_DEC_REF(connp);
+}
- if (connp->conn_recvif || connp->conn_recvslla ||
- ((connp->conn_ip_recvpktinfo ||
- (!isv4 && IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src))) &&
- (flags & IP_FF_IPINFO))) {
- int in_flags = 0;
+/*
+ * Free a packet that has the link-layer dl_unitdata_req_t or fast-path
+ * header before the ip payload.
+ */
+static void
+ip_xmit_flowctl_drop(ill_t *ill, mblk_t *mp, boolean_t is_fp_mp, int fp_mp_len)
+{
+ int len = (mp->b_wptr - mp->b_rptr);
+ mblk_t *ip_mp;
- /*
- * Since sctp does not support IP_RECVPKTINFO for v4, only pass
- * IPF_RECVIF.
- */
- if (connp->conn_recvif || connp->conn_ip_recvpktinfo) {
- in_flags = IPF_RECVIF;
- }
- if (connp->conn_recvslla) {
- in_flags |= IPF_RECVSLLA;
- }
- if (isv4) {
- mp = ip_add_info(mp, recv_ill, in_flags,
- IPCL_ZONEID(connp), ipst);
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
+ if (is_fp_mp || len != fp_mp_len) {
+ if (len > fp_mp_len) {
+ /*
+ * fastpath header and ip header in the first mblk
+ */
+ mp->b_rptr += fp_mp_len;
} else {
- mp = ip_add_info_v6(mp, recv_ill, &ip6h->ip6_dst);
- if (mp == NULL) {
- BUMP_MIB(recv_ill->ill_ip_mib,
- ipIfStatsInDiscards);
- CONN_DEC_REF(connp);
- return;
- }
+ /*
+ * ip_xmit_attach_llhdr had to prepend an mblk to
+ * attach the fastpath header before ip header.
+ */
+ ip_mp = mp->b_cont;
+ freeb(mp);
+ mp = ip_mp;
+ mp->b_rptr += (fp_mp_len - len);
}
+ } else {
+ ip_mp = mp->b_cont;
+ freeb(mp);
+ mp = ip_mp;
}
-
- BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsHCInDelivers);
- /*
- * We are sending the IPSEC_IN message also up. Refer
- * to comments above this function.
- * This is the SOCK_RAW, IPPROTO_SCTP case.
- */
- (connp->conn_recv)(connp, mp, NULL);
- CONN_DEC_REF(connp);
+ ip_drop_output("ipIfStatsOutDiscards - flow ctl", mp, ill);
+ freemsg(mp);
}
-#define UPDATE_IP_MIB_OB_COUNTERS(ill, len) \
-{ \
- BUMP_MIB((ill)->ill_ip_mib, ipIfStatsHCOutTransmits); \
- UPDATE_MIB((ill)->ill_ip_mib, ipIfStatsHCOutOctets, (len)); \
-}
/*
- * This function should be called only if all packet processing
- * including fragmentation is complete. Callers of this function
- * must set mp->b_prev to one of these values:
- * {0, IPP_FWD_OUT, IPP_LOCAL_OUT}
- * prior to handing over the mp as first argument to this function.
+ * Normal post fragmentation function.
+ *
+ * Send a packet using the passed in nce. This handles both IPv4 and IPv6
+ * using the same state machine.
*
- * If the ire passed by caller is incomplete, this function
+ * We return an error on failure. In particular we return EWOULDBLOCK
+ * when the driver flow controls. In that case this ensures that ip_wsrv runs
+ * (currently by canputnext failure resulting in backenabling from GLD.)
+ * This allows the callers of conn_ip_output() to use EWOULDBLOCK as an
+ * indication that they can flow control until ip_wsrv() tells then to restart.
+ *
+ * If the nce passed by caller is incomplete, this function
* queues the packet and if necessary, sends ARP request and bails.
- * If the ire passed is fully resolved, we simply prepend
+ * If the Neighbor Cache passed is fully resolved, we simply prepend
* the link-layer header to the packet, do ipsec hw acceleration
* work if necessary, and send the packet out on the wire.
- *
- * NOTE: IPsec will only call this function with fully resolved
- * ires if hw acceleration is involved.
- * TODO list :
- * a Handle M_MULTIDATA so that
- * tcp_multisend->tcp_multisend_data can
- * call ip_xmit_v4 directly
- * b Handle post-ARP work for fragments so that
- * ip_wput_frag can call this function.
*/
-ipxmit_state_t
-ip_xmit_v4(mblk_t *mp, ire_t *ire, ipsec_out_t *io,
- boolean_t flow_ctl_enabled, conn_t *connp)
+/* ARGSUSED6 */
+int
+ip_xmit(mblk_t *mp, nce_t *nce, iaflags_t ixaflags, uint_t pkt_len,
+ uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid, uintptr_t *ixacookie)
{
- nce_t *arpce;
- ipha_t *ipha;
- queue_t *q;
- int ill_index;
- mblk_t *nxt_mp, *first_mp;
- boolean_t xmit_drop = B_FALSE;
- ip_proc_t proc;
- ill_t *out_ill;
- int pkt_len;
+ queue_t *wq;
+ ill_t *ill = nce->nce_ill;
+ ip_stack_t *ipst = ill->ill_ipst;
+ uint64_t delta;
+ boolean_t isv6 = ill->ill_isv6;
+ boolean_t fp_mp;
+ ncec_t *ncec = nce->nce_common;
- arpce = ire->ire_nce;
- ASSERT(arpce != NULL);
+ DTRACE_PROBE1(ip__xmit, nce_t *, nce);
- DTRACE_PROBE2(ip__xmit__v4, ire_t *, ire, nce_t *, arpce);
+ ASSERT(mp != NULL);
+ ASSERT(mp->b_datap->db_type == M_DATA);
+ ASSERT(pkt_len == msgdsize(mp));
- mutex_enter(&arpce->nce_lock);
- switch (arpce->nce_state) {
- case ND_REACHABLE:
- /* If there are other queued packets, queue this packet */
- if (arpce->nce_qd_mp != NULL) {
- if (mp != NULL)
- nce_queue_mp_common(arpce, mp, B_FALSE);
- mp = arpce->nce_qd_mp;
+ /*
+ * If we have already been here and are coming back after ARP/ND.
+ * the IXAF_NO_TRACE flag is set. We skip FW_HOOKS, DTRACE and ipobs
+ * in that case since they have seen the packet when it came here
+ * the first time.
+ */
+ if (ixaflags & IXAF_NO_TRACE)
+ goto sendit;
+
+ if (ixaflags & IXAF_IS_IPV4) {
+ ipha_t *ipha = (ipha_t *)mp->b_rptr;
+
+ ASSERT(!isv6);
+ ASSERT(pkt_len == ntohs(((ipha_t *)mp->b_rptr)->ipha_length));
+ if (HOOKS4_INTERESTED_PHYSICAL_OUT(ipst) &&
+ !(ixaflags & IXAF_NO_PFHOOK)) {
+ int error;
+
+ FW_HOOKS(ipst->ips_ip4_physical_out_event,
+ ipst->ips_ipv4firewall_physical_out,
+ NULL, ill, ipha, mp, mp, 0, ipst, error);
+ DTRACE_PROBE1(ip4__physical__out__end,
+ mblk_t *, mp);
+ if (mp == NULL)
+ return (error);
+
+ /* The length could have changed */
+ pkt_len = msgdsize(mp);
+ }
+ if (ipst->ips_ip4_observe.he_interested) {
+ /*
+ * Note that for TX the zoneid is the sending
+ * zone, whether or not MLP is in play.
+ * Since the szone argument is the IP zoneid (i.e.,
+ * zero for exclusive-IP zones) and ipobs wants
+ * the system zoneid, we map it here.
+ */
+ szone = IP_REAL_ZONEID(szone, ipst);
+
+ /*
+ * On the outbound path the destination zone will be
+ * unknown as we're sending this packet out on the
+ * wire.
+ */
+ ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone, ALL_ZONES,
+ ill, ipst);
+ }
+ DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL,
+ void_ip_t *, ipha, __dtrace_ipsr_ill_t *, ill,
+ ipha_t *, ipha, ip6_t *, NULL, int, 0);
+ } else {
+ ip6_t *ip6h = (ip6_t *)mp->b_rptr;
+
+ ASSERT(isv6);
+ ASSERT(pkt_len ==
+ ntohs(((ip6_t *)mp->b_rptr)->ip6_plen) + IPV6_HDR_LEN);
+ if (HOOKS6_INTERESTED_PHYSICAL_OUT(ipst) &&
+ !(ixaflags & IXAF_NO_PFHOOK)) {
+ int error;
+
+ FW_HOOKS6(ipst->ips_ip6_physical_out_event,
+ ipst->ips_ipv6firewall_physical_out,
+ NULL, ill, ip6h, mp, mp, 0, ipst, error);
+ DTRACE_PROBE1(ip6__physical__out__end,
+ mblk_t *, mp);
+ if (mp == NULL)
+ return (error);
+
+ /* The length could have changed */
+ pkt_len = msgdsize(mp);
+ }
+ if (ipst->ips_ip6_observe.he_interested) {
+ /* See above */
+ szone = IP_REAL_ZONEID(szone, ipst);
+
+ ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone, ALL_ZONES,
+ ill, ipst);
}
- arpce->nce_qd_mp = NULL;
- mutex_exit(&arpce->nce_lock);
+ DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL,
+ void_ip_t *, ip6h, __dtrace_ipsr_ill_t *, ill,
+ ipha_t *, NULL, ip6_t *, ip6h, int, 0);
+ }
+sendit:
+ /*
+ * We check the state without a lock because the state can never
+ * move "backwards" to initial or incomplete.
+ */
+ switch (ncec->ncec_state) {
+ case ND_REACHABLE:
+ case ND_STALE:
+ case ND_DELAY:
+ case ND_PROBE:
+ mp = ip_xmit_attach_llhdr(mp, nce);
+ if (mp == NULL) {
+ /*
+ * ip_xmit_attach_llhdr has increased
+ * ipIfStatsOutDiscards and called ip_drop_output()
+ */
+ return (ENOBUFS);
+ }
/*
- * Flush the queue. In the common case, where the
- * ARP is already resolved, it will go through the
- * while loop only once.
+ * check if nce_fastpath completed and we tagged on a
+ * copy of nce_fp_mp in ip_xmit_attach_llhdr().
*/
- while (mp != NULL) {
+ fp_mp = (mp->b_datap->db_type == M_DATA);
- nxt_mp = mp->b_next;
- mp->b_next = NULL;
- ASSERT(mp->b_datap->db_type != M_CTL);
- pkt_len = ntohs(((ipha_t *)mp->b_rptr)->ipha_length);
+ if (fp_mp &&
+ (ill->ill_capabilities & ILL_CAPAB_DLD_DIRECT)) {
+ ill_dld_direct_t *idd;
+
+ idd = &ill->ill_dld_capab->idc_direct;
/*
- * This info is needed for IPQOS to do COS marking
- * in ip_wput_attach_llhdr->ip_process.
+ * Send the packet directly to DLD, where it
+ * may be queued depending on the availability
+ * of transmit resources at the media layer.
+ * The return value should be taken into
+ * account to flow-control the TCP.
*/
- proc = (ip_proc_t)(uintptr_t)mp->b_prev;
- mp->b_prev = NULL;
-
- /* set up ill index for outbound qos processing */
- out_ill = ire_to_ill(ire);
- ill_index = out_ill->ill_phyint->phyint_ifindex;
- first_mp = ip_wput_attach_llhdr(mp, ire, proc,
- ill_index, &ipha);
- if (first_mp == NULL) {
- xmit_drop = B_TRUE;
- BUMP_MIB(out_ill->ill_ip_mib,
- ipIfStatsOutDiscards);
- goto next_mp;
- }
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutTransmits);
+ UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets,
+ pkt_len);
- /* non-ipsec hw accel case */
- if (io == NULL || !io->ipsec_out_accelerated) {
- /* send it */
- q = ire->ire_stq;
- if (proc == IPP_FWD_OUT) {
- UPDATE_IB_PKT_COUNT(ire);
- } else {
- UPDATE_OB_PKT_COUNT(ire);
- }
- ire->ire_last_used_time = lbolt;
+ if (ixaflags & IXAF_NO_DEV_FLOW_CTL) {
+ (void) idd->idd_tx_df(idd->idd_tx_dh, mp,
+ (uintptr_t)xmit_hint, IP_DROP_ON_NO_DESC);
+ } else {
+ uintptr_t cookie;
- if (flow_ctl_enabled || canputnext(q)) {
- if (proc == IPP_FWD_OUT) {
+ if ((cookie = idd->idd_tx_df(idd->idd_tx_dh,
+ mp, (uintptr_t)xmit_hint, 0)) != 0) {
+ if (ixacookie != NULL)
+ *ixacookie = cookie;
+ return (EWOULDBLOCK);
+ }
+ }
+ } else {
+ wq = ill->ill_wq;
+
+ if (!(ixaflags & IXAF_NO_DEV_FLOW_CTL) &&
+ !canputnext(wq)) {
+ if (ixacookie != NULL)
+ *ixacookie = 0;
+ ip_xmit_flowctl_drop(ill, mp, fp_mp,
+ nce->nce_fp_mp != NULL ?
+ MBLKL(nce->nce_fp_mp) : 0);
+ return (EWOULDBLOCK);
+ }
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutTransmits);
+ UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets,
+ pkt_len);
+ putnext(wq, mp);
+ }
- BUMP_MIB(out_ill->ill_ip_mib,
- ipIfStatsHCOutForwDatagrams);
+ /*
+ * The rest of this function implements Neighbor Unreachability
+ * detection. Determine if the ncec is eligible for NUD.
+ */
+ if (ncec->ncec_flags & NCE_F_NONUD)
+ return (0);
- }
- UPDATE_IP_MIB_OB_COUNTERS(out_ill,
- pkt_len);
+ ASSERT(ncec->ncec_state != ND_INCOMPLETE);
- DTRACE_IP7(send, mblk_t *, first_mp,
- conn_t *, NULL, void_ip_t *, ipha,
- __dtrace_ipsr_ill_t *, out_ill,
- ipha_t *, ipha, ip6_t *, NULL, int,
- 0);
+ /*
+ * Check for upper layer advice
+ */
+ if (ixaflags & IXAF_REACH_CONF) {
+ timeout_id_t tid;
- ILL_SEND_TX(out_ill,
- ire, connp, first_mp, 0, connp);
- } else {
- BUMP_MIB(out_ill->ill_ip_mib,
- ipIfStatsOutDiscards);
- xmit_drop = B_TRUE;
- freemsg(first_mp);
+ /*
+ * It should be o.k. to check the state without
+ * a lock here, at most we lose an advice.
+ */
+ ncec->ncec_last = TICK_TO_MSEC(lbolt64);
+ if (ncec->ncec_state != ND_REACHABLE) {
+ mutex_enter(&ncec->ncec_lock);
+ ncec->ncec_state = ND_REACHABLE;
+ tid = ncec->ncec_timeout_id;
+ ncec->ncec_timeout_id = 0;
+ mutex_exit(&ncec->ncec_lock);
+ (void) untimeout(tid);
+ if (ip_debug > 2) {
+ /* ip1dbg */
+ pr_addr_dbg("ip_xmit: state"
+ " for %s changed to"
+ " REACHABLE\n", AF_INET6,
+ &ncec->ncec_addr);
}
- } else {
+ }
+ return (0);
+ }
+
+ delta = TICK_TO_MSEC(lbolt64) - ncec->ncec_last;
+ ip1dbg(("ip_xmit: delta = %" PRId64
+ " ill_reachable_time = %d \n", delta,
+ ill->ill_reachable_time));
+ if (delta > (uint64_t)ill->ill_reachable_time) {
+ mutex_enter(&ncec->ncec_lock);
+ switch (ncec->ncec_state) {
+ case ND_REACHABLE:
+ ASSERT((ncec->ncec_flags & NCE_F_NONUD) == 0);
+ /* FALLTHROUGH */
+ case ND_STALE:
/*
- * Safety Pup says: make sure this
- * is going to the right interface!
+ * ND_REACHABLE is identical to
+ * ND_STALE in this specific case. If
+ * reachable time has expired for this
+ * neighbor (delta is greater than
+ * reachable time), conceptually, the
+ * neighbor cache is no longer in
+ * REACHABLE state, but already in
+ * STALE state. So the correct
+ * transition here is to ND_DELAY.
*/
- ill_t *ill1 =
- (ill_t *)ire->ire_stq->q_ptr;
- int ifindex =
- ill1->ill_phyint->phyint_ifindex;
- if (ifindex !=
- io->ipsec_out_capab_ill_index) {
- xmit_drop = B_TRUE;
- freemsg(mp);
- } else {
- UPDATE_IP_MIB_OB_COUNTERS(ill1,
- pkt_len);
-
- DTRACE_IP7(send, mblk_t *, first_mp,
- conn_t *, NULL, void_ip_t *, ipha,
- __dtrace_ipsr_ill_t *, ill1,
- ipha_t *, ipha, ip6_t *, NULL,
- int, 0);
-
- ipsec_hw_putnext(ire->ire_stq, mp);
+ ncec->ncec_state = ND_DELAY;
+ mutex_exit(&ncec->ncec_lock);
+ nce_restart_timer(ncec,
+ ipst->ips_delay_first_probe_time);
+ if (ip_debug > 3) {
+ /* ip2dbg */
+ pr_addr_dbg("ip_xmit: state"
+ " for %s changed to"
+ " DELAY\n", AF_INET6,
+ &ncec->ncec_addr);
}
+ break;
+ case ND_DELAY:
+ case ND_PROBE:
+ mutex_exit(&ncec->ncec_lock);
+ /* Timers have already started */
+ break;
+ case ND_UNREACHABLE:
+ /*
+ * nce_timer has detected that this ncec
+ * is unreachable and initiated deleting
+ * this ncec.
+ * This is a harmless race where we found the
+ * ncec before it was deleted and have
+ * just sent out a packet using this
+ * unreachable ncec.
+ */
+ mutex_exit(&ncec->ncec_lock);
+ break;
+ default:
+ ASSERT(0);
+ mutex_exit(&ncec->ncec_lock);
}
-next_mp:
- mp = nxt_mp;
- } /* while (mp != NULL) */
- if (xmit_drop)
- return (SEND_FAILED);
- else
- return (SEND_PASSED);
+ }
+ return (0);
- case ND_INITIAL:
case ND_INCOMPLETE:
-
/*
- * While we do send off packets to dests that
- * use fully-resolved CGTP routes, we do not
- * handle unresolved CGTP routes.
+ * the state could have changed since we didn't hold the lock.
+ * Re-verify state under lock.
*/
- ASSERT(!(ire->ire_flags & RTF_MULTIRT));
- ASSERT(io == NULL || !io->ipsec_out_accelerated);
-
- if (mp != NULL) {
- /* queue the packet */
- nce_queue_mp_common(arpce, mp, B_FALSE);
+ mutex_enter(&ncec->ncec_lock);
+ if (NCE_ISREACHABLE(ncec)) {
+ mutex_exit(&ncec->ncec_lock);
+ goto sendit;
}
+ /* queue the packet */
+ nce_queue_mp(ncec, mp, ipmp_packet_is_probe(mp, nce->nce_ill));
+ mutex_exit(&ncec->ncec_lock);
+ DTRACE_PROBE2(ip__xmit__incomplete,
+ (ncec_t *), ncec, (mblk_t *), mp);
+ return (0);
- if (arpce->nce_state == ND_INCOMPLETE) {
- mutex_exit(&arpce->nce_lock);
- DTRACE_PROBE3(ip__xmit__incomplete,
- (ire_t *), ire, (mblk_t *), mp,
- (ipsec_out_t *), io);
- return (LOOKUP_IN_PROGRESS);
+ case ND_INITIAL:
+ /*
+ * State could have changed since we didn't hold the lock, so
+ * re-verify state.
+ */
+ mutex_enter(&ncec->ncec_lock);
+ if (NCE_ISREACHABLE(ncec)) {
+ mutex_exit(&ncec->ncec_lock);
+ goto sendit;
+ }
+ nce_queue_mp(ncec, mp, ipmp_packet_is_probe(mp, nce->nce_ill));
+ if (ncec->ncec_state == ND_INITIAL) {
+ ncec->ncec_state = ND_INCOMPLETE;
+ mutex_exit(&ncec->ncec_lock);
+ /*
+ * figure out the source we want to use
+ * and resolve it.
+ */
+ ip_ndp_resolve(ncec);
+ } else {
+ mutex_exit(&ncec->ncec_lock);
}
+ return (0);
- arpce->nce_state = ND_INCOMPLETE;
- mutex_exit(&arpce->nce_lock);
+ case ND_UNREACHABLE:
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
+ ip_drop_output("ipIfStatsOutDiscards - ND_UNREACHABLE",
+ mp, ill);
+ freemsg(mp);
+ return (0);
- /*
- * Note that ire_add() (called from ire_forward())
- * holds a ref on the ire until ARP is completed.
- */
- ire_arpresolve(ire);
- return (LOOKUP_IN_PROGRESS);
default:
ASSERT(0);
- mutex_exit(&arpce->nce_lock);
- return (LLHDR_RESLV_FAILED);
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
+ ip_drop_output("ipIfStatsOutDiscards - ND_other",
+ mp, ill);
+ freemsg(mp);
+ return (ENETUNREACH);
}
}
-#undef UPDATE_IP_MIB_OB_COUNTERS
-
/*
* Return B_TRUE if the buffers differ in length or content.
* This is used for comparing extension header buffers.
@@ -29803,52 +14974,300 @@ ip_savebuf(void **dstp, uint_t *dstlenp, boolean_t src_valid,
}
/*
- * Free the storage pointed to by the members of an ip6_pkt_t.
+ * Free the storage pointed to by the members of an ip_pkt_t.
*/
void
-ip6_pkt_free(ip6_pkt_t *ipp)
+ip_pkt_free(ip_pkt_t *ipp)
{
- ASSERT(ipp->ipp_pathmtu == NULL && !(ipp->ipp_fields & IPPF_PATHMTU));
+ uint_t fields = ipp->ipp_fields;
- if (ipp->ipp_fields & IPPF_HOPOPTS) {
+ if (fields & IPPF_HOPOPTS) {
kmem_free(ipp->ipp_hopopts, ipp->ipp_hopoptslen);
ipp->ipp_hopopts = NULL;
ipp->ipp_hopoptslen = 0;
}
- if (ipp->ipp_fields & IPPF_RTDSTOPTS) {
- kmem_free(ipp->ipp_rtdstopts, ipp->ipp_rtdstoptslen);
- ipp->ipp_rtdstopts = NULL;
- ipp->ipp_rtdstoptslen = 0;
+ if (fields & IPPF_RTHDRDSTOPTS) {
+ kmem_free(ipp->ipp_rthdrdstopts, ipp->ipp_rthdrdstoptslen);
+ ipp->ipp_rthdrdstopts = NULL;
+ ipp->ipp_rthdrdstoptslen = 0;
}
- if (ipp->ipp_fields & IPPF_DSTOPTS) {
+ if (fields & IPPF_DSTOPTS) {
kmem_free(ipp->ipp_dstopts, ipp->ipp_dstoptslen);
ipp->ipp_dstopts = NULL;
ipp->ipp_dstoptslen = 0;
}
- if (ipp->ipp_fields & IPPF_RTHDR) {
+ if (fields & IPPF_RTHDR) {
kmem_free(ipp->ipp_rthdr, ipp->ipp_rthdrlen);
ipp->ipp_rthdr = NULL;
ipp->ipp_rthdrlen = 0;
}
- ipp->ipp_fields &= ~(IPPF_HOPOPTS | IPPF_RTDSTOPTS | IPPF_DSTOPTS |
- IPPF_RTHDR);
+ if (fields & IPPF_IPV4_OPTIONS) {
+ kmem_free(ipp->ipp_ipv4_options, ipp->ipp_ipv4_options_len);
+ ipp->ipp_ipv4_options = NULL;
+ ipp->ipp_ipv4_options_len = 0;
+ }
+ if (fields & IPPF_LABEL_V4) {
+ kmem_free(ipp->ipp_label_v4, ipp->ipp_label_len_v4);
+ ipp->ipp_label_v4 = NULL;
+ ipp->ipp_label_len_v4 = 0;
+ }
+ if (fields & IPPF_LABEL_V6) {
+ kmem_free(ipp->ipp_label_v6, ipp->ipp_label_len_v6);
+ ipp->ipp_label_v6 = NULL;
+ ipp->ipp_label_len_v6 = 0;
+ }
+ ipp->ipp_fields &= ~(IPPF_HOPOPTS | IPPF_RTHDRDSTOPTS | IPPF_DSTOPTS |
+ IPPF_RTHDR | IPPF_IPV4_OPTIONS | IPPF_LABEL_V4 | IPPF_LABEL_V6);
+}
+
+/*
+ * Copy from src to dst and allocate as needed.
+ * Returns zero or ENOMEM.
+ *
+ * The caller must initialize dst to zero.
+ */
+int
+ip_pkt_copy(ip_pkt_t *src, ip_pkt_t *dst, int kmflag)
+{
+ uint_t fields = src->ipp_fields;
+
+ /* Start with fields that don't require memory allocation */
+ dst->ipp_fields = fields &
+ ~(IPPF_HOPOPTS | IPPF_RTHDRDSTOPTS | IPPF_DSTOPTS |
+ IPPF_RTHDR | IPPF_IPV4_OPTIONS | IPPF_LABEL_V4 | IPPF_LABEL_V6);
+
+ dst->ipp_addr = src->ipp_addr;
+ dst->ipp_unicast_hops = src->ipp_unicast_hops;
+ dst->ipp_hoplimit = src->ipp_hoplimit;
+ dst->ipp_tclass = src->ipp_tclass;
+ dst->ipp_type_of_service = src->ipp_type_of_service;
+
+ if (fields & IPPF_HOPOPTS) {
+ dst->ipp_hopopts = kmem_alloc(src->ipp_hopoptslen, kmflag);
+ if (dst->ipp_hopopts == NULL) {
+ ip_pkt_free(dst);
+ return (ENOMEM);
+ }
+ dst->ipp_fields |= IPPF_HOPOPTS;
+ bcopy(src->ipp_hopopts, dst->ipp_hopopts,
+ src->ipp_hopoptslen);
+ dst->ipp_hopoptslen = src->ipp_hopoptslen;
+ }
+ if (fields & IPPF_RTHDRDSTOPTS) {
+ dst->ipp_rthdrdstopts = kmem_alloc(src->ipp_rthdrdstoptslen,
+ kmflag);
+ if (dst->ipp_rthdrdstopts == NULL) {
+ ip_pkt_free(dst);
+ return (ENOMEM);
+ }
+ dst->ipp_fields |= IPPF_RTHDRDSTOPTS;
+ bcopy(src->ipp_rthdrdstopts, dst->ipp_rthdrdstopts,
+ src->ipp_rthdrdstoptslen);
+ dst->ipp_rthdrdstoptslen = src->ipp_rthdrdstoptslen;
+ }
+ if (fields & IPPF_DSTOPTS) {
+ dst->ipp_dstopts = kmem_alloc(src->ipp_dstoptslen, kmflag);
+ if (dst->ipp_dstopts == NULL) {
+ ip_pkt_free(dst);
+ return (ENOMEM);
+ }
+ dst->ipp_fields |= IPPF_DSTOPTS;
+ bcopy(src->ipp_dstopts, dst->ipp_dstopts,
+ src->ipp_dstoptslen);
+ dst->ipp_dstoptslen = src->ipp_dstoptslen;
+ }
+ if (fields & IPPF_RTHDR) {
+ dst->ipp_rthdr = kmem_alloc(src->ipp_rthdrlen, kmflag);
+ if (dst->ipp_rthdr == NULL) {
+ ip_pkt_free(dst);
+ return (ENOMEM);
+ }
+ dst->ipp_fields |= IPPF_RTHDR;
+ bcopy(src->ipp_rthdr, dst->ipp_rthdr,
+ src->ipp_rthdrlen);
+ dst->ipp_rthdrlen = src->ipp_rthdrlen;
+ }
+ if (fields & IPPF_IPV4_OPTIONS) {
+ dst->ipp_ipv4_options = kmem_alloc(src->ipp_ipv4_options_len,
+ kmflag);
+ if (dst->ipp_ipv4_options == NULL) {
+ ip_pkt_free(dst);
+ return (ENOMEM);
+ }
+ dst->ipp_fields |= IPPF_IPV4_OPTIONS;
+ bcopy(src->ipp_ipv4_options, dst->ipp_ipv4_options,
+ src->ipp_ipv4_options_len);
+ dst->ipp_ipv4_options_len = src->ipp_ipv4_options_len;
+ }
+ if (fields & IPPF_LABEL_V4) {
+ dst->ipp_label_v4 = kmem_alloc(src->ipp_label_len_v4, kmflag);
+ if (dst->ipp_label_v4 == NULL) {
+ ip_pkt_free(dst);
+ return (ENOMEM);
+ }
+ dst->ipp_fields |= IPPF_LABEL_V4;
+ bcopy(src->ipp_label_v4, dst->ipp_label_v4,
+ src->ipp_label_len_v4);
+ dst->ipp_label_len_v4 = src->ipp_label_len_v4;
+ }
+ if (fields & IPPF_LABEL_V6) {
+ dst->ipp_label_v6 = kmem_alloc(src->ipp_label_len_v6, kmflag);
+ if (dst->ipp_label_v6 == NULL) {
+ ip_pkt_free(dst);
+ return (ENOMEM);
+ }
+ dst->ipp_fields |= IPPF_LABEL_V6;
+ bcopy(src->ipp_label_v6, dst->ipp_label_v6,
+ src->ipp_label_len_v6);
+ dst->ipp_label_len_v6 = src->ipp_label_len_v6;
+ }
+ if (fields & IPPF_FRAGHDR) {
+ dst->ipp_fraghdr = kmem_alloc(src->ipp_fraghdrlen, kmflag);
+ if (dst->ipp_fraghdr == NULL) {
+ ip_pkt_free(dst);
+ return (ENOMEM);
+ }
+ dst->ipp_fields |= IPPF_FRAGHDR;
+ bcopy(src->ipp_fraghdr, dst->ipp_fraghdr,
+ src->ipp_fraghdrlen);
+ dst->ipp_fraghdrlen = src->ipp_fraghdrlen;
+ }
+ return (0);
+}
+
+/*
+ * Returns INADDR_ANY if no source route
+ */
+ipaddr_t
+ip_pkt_source_route_v4(const ip_pkt_t *ipp)
+{
+ ipaddr_t nexthop = INADDR_ANY;
+ ipoptp_t opts;
+ uchar_t *opt;
+ uint8_t optval;
+ uint8_t optlen;
+ uint32_t totallen;
+
+ if (!(ipp->ipp_fields & IPPF_IPV4_OPTIONS))
+ return (INADDR_ANY);
+
+ totallen = ipp->ipp_ipv4_options_len;
+ if (totallen & 0x3)
+ return (INADDR_ANY);
+
+ for (optval = ipoptp_first2(&opts, totallen, ipp->ipp_ipv4_options);
+ optval != IPOPT_EOL;
+ optval = ipoptp_next(&opts)) {
+ opt = opts.ipoptp_cur;
+ switch (optval) {
+ uint8_t off;
+ case IPOPT_SSRR:
+ case IPOPT_LSRR:
+ if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
+ break;
+ }
+ optlen = opts.ipoptp_len;
+ off = opt[IPOPT_OFFSET];
+ off--;
+ if (optlen < IP_ADDR_LEN ||
+ off > optlen - IP_ADDR_LEN) {
+ /* End of source route */
+ break;
+ }
+ bcopy((char *)opt + off, &nexthop, IP_ADDR_LEN);
+ if (nexthop == htonl(INADDR_LOOPBACK)) {
+ /* Ignore */
+ nexthop = INADDR_ANY;
+ break;
+ }
+ break;
+ }
+ }
+ return (nexthop);
+}
+
+/*
+ * Reverse a source route.
+ */
+void
+ip_pkt_source_route_reverse_v4(ip_pkt_t *ipp)
+{
+ ipaddr_t tmp;
+ ipoptp_t opts;
+ uchar_t *opt;
+ uint8_t optval;
+ uint32_t totallen;
+
+ if (!(ipp->ipp_fields & IPPF_IPV4_OPTIONS))
+ return;
+
+ totallen = ipp->ipp_ipv4_options_len;
+ if (totallen & 0x3)
+ return;
+
+ for (optval = ipoptp_first2(&opts, totallen, ipp->ipp_ipv4_options);
+ optval != IPOPT_EOL;
+ optval = ipoptp_next(&opts)) {
+ uint8_t off1, off2;
+
+ opt = opts.ipoptp_cur;
+ switch (optval) {
+ case IPOPT_SSRR:
+ case IPOPT_LSRR:
+ if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
+ break;
+ }
+ off1 = IPOPT_MINOFF_SR - 1;
+ off2 = opt[IPOPT_OFFSET] - IP_ADDR_LEN - 1;
+ while (off2 > off1) {
+ bcopy(opt + off2, &tmp, IP_ADDR_LEN);
+ bcopy(opt + off1, opt + off2, IP_ADDR_LEN);
+ bcopy(&tmp, opt + off2, IP_ADDR_LEN);
+ off2 -= IP_ADDR_LEN;
+ off1 += IP_ADDR_LEN;
+ }
+ opt[IPOPT_OFFSET] = IPOPT_MINOFF_SR;
+ break;
+ }
+ }
+}
+
+/*
+ * Returns NULL if no routing header
+ */
+in6_addr_t *
+ip_pkt_source_route_v6(const ip_pkt_t *ipp)
+{
+ in6_addr_t *nexthop = NULL;
+ ip6_rthdr0_t *rthdr;
+
+ if (!(ipp->ipp_fields & IPPF_RTHDR))
+ return (NULL);
+
+ rthdr = (ip6_rthdr0_t *)ipp->ipp_rthdr;
+ if (rthdr->ip6r0_segleft == 0)
+ return (NULL);
+
+ nexthop = (in6_addr_t *)((char *)rthdr + sizeof (*rthdr));
+ return (nexthop);
}
zoneid_t
-ip_get_zoneid_v4(ipaddr_t addr, mblk_t *mp, ip_stack_t *ipst,
+ip_get_zoneid_v4(ipaddr_t addr, mblk_t *mp, ip_recv_attr_t *ira,
zoneid_t lookup_zoneid)
{
+ ip_stack_t *ipst = ira->ira_ill->ill_ipst;
ire_t *ire;
int ire_flags = MATCH_IRE_TYPE;
zoneid_t zoneid = ALL_ZONES;
- if (is_system_labeled() && !tsol_can_accept_raw(mp, B_FALSE))
+ if (is_system_labeled() && !tsol_can_accept_raw(mp, ira, B_FALSE))
return (ALL_ZONES);
if (lookup_zoneid != ALL_ZONES)
ire_flags |= MATCH_IRE_ZONEONLY;
- ire = ire_ctable_lookup(addr, NULL, IRE_LOCAL | IRE_LOOPBACK, NULL,
- lookup_zoneid, NULL, ire_flags, ipst);
+ ire = ire_ftable_lookup_v4(addr, NULL, NULL, IRE_LOCAL | IRE_LOOPBACK,
+ NULL, lookup_zoneid, NULL, ire_flags, 0, ipst, NULL);
if (ire != NULL) {
zoneid = IP_REAL_ZONEID(ire->ire_zoneid, ipst);
ire_refrele(ire);
@@ -29858,24 +15277,23 @@ ip_get_zoneid_v4(ipaddr_t addr, mblk_t *mp, ip_stack_t *ipst,
zoneid_t
ip_get_zoneid_v6(in6_addr_t *addr, mblk_t *mp, const ill_t *ill,
- ip_stack_t *ipst, zoneid_t lookup_zoneid)
+ ip_recv_attr_t *ira, zoneid_t lookup_zoneid)
{
+ ip_stack_t *ipst = ira->ira_ill->ill_ipst;
ire_t *ire;
int ire_flags = MATCH_IRE_TYPE;
zoneid_t zoneid = ALL_ZONES;
- ipif_t *ipif_arg = NULL;
- if (is_system_labeled() && !tsol_can_accept_raw(mp, B_FALSE))
+ if (is_system_labeled() && !tsol_can_accept_raw(mp, ira, B_FALSE))
return (ALL_ZONES);
- if (IN6_IS_ADDR_LINKLOCAL(addr)) {
+ if (IN6_IS_ADDR_LINKLOCAL(addr))
ire_flags |= MATCH_IRE_ILL;
- ipif_arg = ill->ill_ipif;
- }
+
if (lookup_zoneid != ALL_ZONES)
ire_flags |= MATCH_IRE_ZONEONLY;
- ire = ire_ctable_lookup_v6(addr, NULL, IRE_LOCAL | IRE_LOOPBACK,
- ipif_arg, lookup_zoneid, NULL, ire_flags, ipst);
+ ire = ire_ftable_lookup_v6(addr, NULL, NULL, IRE_LOCAL | IRE_LOOPBACK,
+ ill, lookup_zoneid, NULL, ire_flags, 0, ipst, NULL);
if (ire != NULL) {
zoneid = IP_REAL_ZONEID(ire->ire_zoneid, ipst);
ire_refrele(ire);
@@ -29964,3 +15382,29 @@ ipobs_hook(mblk_t *mp, int htype, zoneid_t zsrc, zoneid_t zdst,
imp->b_cont = NULL;
freemsg(imp);
}
+
+/*
+ * Utility routine that checks if `v4srcp' is a valid address on underlying
+ * interface `ill'. If `ipifp' is non-NULL, it's set to a held ipif
+ * associated with `v4srcp' on success. NOTE: if this is not called from
+ * inside the IPSQ (ill_g_lock is not held), `ill' may be removed from the
+ * group during or after this lookup.
+ */
+boolean_t
+ipif_lookup_testaddr_v4(ill_t *ill, const in_addr_t *v4srcp, ipif_t **ipifp)
+{
+ ipif_t *ipif;
+
+ ipif = ipif_lookup_addr_exact(*v4srcp, ill, ill->ill_ipst);
+ if (ipif != NULL) {
+ if (ipifp != NULL)
+ *ipifp = ipif;
+ else
+ ipif_refrele(ipif);
+ return (B_TRUE);
+ }
+
+ ip1dbg(("ipif_lookup_testaddr_v4: cannot find ipif for src %x\n",
+ *v4srcp));
+ return (B_FALSE);
+}
diff --git a/usr/src/uts/common/inet/ip/ip2mac.c b/usr/src/uts/common/inet/ip/ip2mac.c
index e232a5bb63..55a17f762a 100644
--- a/usr/src/uts/common/inet/ip/ip2mac.c
+++ b/usr/src/uts/common/inet/ip/ip2mac.c
@@ -18,6 +18,7 @@
*
* CDDL HEADER END
*/
+
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
@@ -29,7 +30,6 @@
#include <inet/ip2mac.h>
#include <inet/ip2mac_impl.h>
#include <sys/zone.h>
-#include <sys/dlpi.h>
#include <inet/ip_ndp.h>
#include <inet/ip_if.h>
#include <inet/ip6.h>
@@ -38,18 +38,18 @@
* dispatch pending callbacks.
*/
void
-nce_cb_dispatch(nce_t *nce)
+ncec_cb_dispatch(ncec_t *ncec)
{
- nce_cb_t *nce_cb = list_head(&nce->nce_cb);
+ ncec_cb_t *ncec_cb;
ip2mac_t ip2m;
- mutex_enter(&nce->nce_lock);
- if (list_is_empty(&nce->nce_cb)) {
- mutex_exit(&nce->nce_lock);
+ mutex_enter(&ncec->ncec_lock);
+ if (list_is_empty(&ncec->ncec_cb)) {
+ mutex_exit(&ncec->ncec_lock);
return;
}
- nce_ip2mac_response(&ip2m, nce);
- nce_cb_refhold_locked(nce);
+ ncec_ip2mac_response(&ip2m, ncec);
+ ncec_cb_refhold_locked(ncec);
/*
* IP does not hold internal locks like nce_lock across calls to
* other subsystems for fear of recursive lock entry and lock
@@ -58,75 +58,82 @@ nce_cb_dispatch(nce_t *nce)
* across calls into another subsystem, especially if calls can
* happen in either direction).
*/
- nce_cb = list_head(&nce->nce_cb);
- for (; nce_cb != NULL; nce_cb = list_next(&nce->nce_cb, nce_cb)) {
- if (nce_cb->nce_cb_flags & NCE_CB_DISPATCHED)
+ ncec_cb = list_head(&ncec->ncec_cb);
+ for (; ncec_cb != NULL; ncec_cb = list_next(&ncec->ncec_cb, ncec_cb)) {
+ if (ncec_cb->ncec_cb_flags & NCE_CB_DISPATCHED)
continue;
- nce_cb->nce_cb_flags |= NCE_CB_DISPATCHED;
- mutex_exit(&nce->nce_lock);
- (*nce_cb->nce_cb_func)(&ip2m, nce_cb->nce_cb_arg);
- mutex_enter(&nce->nce_lock);
+ ncec_cb->ncec_cb_flags |= NCE_CB_DISPATCHED;
+ mutex_exit(&ncec->ncec_lock);
+ (*ncec_cb->ncec_cb_func)(&ip2m, ncec_cb->ncec_cb_arg);
+ mutex_enter(&ncec->ncec_lock);
}
- nce_cb_refrele(nce);
- mutex_exit(&nce->nce_lock);
+ ncec_cb_refrele(ncec);
+ mutex_exit(&ncec->ncec_lock);
}
/*
 * fill in the ip2m response fields with information from the nce.
*/
void
-nce_ip2mac_response(ip2mac_t *ip2m, nce_t *nce)
+ncec_ip2mac_response(ip2mac_t *ip2m, ncec_t *ncec)
{
- boolean_t isv6 = (nce->nce_ipversion == IPV6_VERSION);
+ boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
+ sin_t *sin;
sin6_t *sin6;
struct sockaddr_dl *sdl;
- uchar_t *nce_lladdr;
- ASSERT(MUTEX_HELD(&nce->nce_lock));
+ ASSERT(MUTEX_HELD(&ncec->ncec_lock));
bzero(ip2m, sizeof (*ip2m));
- if (NCE_ISREACHABLE(nce) && (nce->nce_flags & NCE_F_CONDEMNED) == 0)
+ if (NCE_ISREACHABLE(ncec) && !NCE_ISCONDEMNED(ncec))
ip2m->ip2mac_err = 0;
else
ip2m->ip2mac_err = ESRCH;
if (isv6) {
sin6 = (sin6_t *)&ip2m->ip2mac_pa;
sin6->sin6_family = AF_INET6;
- sin6->sin6_addr = nce->nce_addr;
+ sin6->sin6_addr = ncec->ncec_addr;
+ } else {
+ sin = (sin_t *)&ip2m->ip2mac_pa;
+ sin->sin_family = AF_INET;
+ IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &sin->sin_addr);
}
if (ip2m->ip2mac_err == 0) {
sdl = &ip2m->ip2mac_ha;
sdl->sdl_family = AF_LINK;
- sdl->sdl_type = nce->nce_ill->ill_type;
+ sdl->sdl_type = ncec->ncec_ill->ill_type;
+ /*
+ * should we put ncec_ill->ill_name in there? why?
+ * likewise for the sdl_index
+ */
sdl->sdl_nlen = 0;
- sdl->sdl_alen = nce->nce_ill->ill_phys_addr_length;
- nce_lladdr = nce->nce_res_mp->b_rptr +
- NCE_LL_ADDR_OFFSET(nce->nce_ill);
- bcopy(nce_lladdr, LLADDR(sdl), sdl->sdl_alen);
+ sdl->sdl_alen = ncec->ncec_ill->ill_phys_addr_length;
+ if (ncec->ncec_lladdr != NULL)
+ bcopy(ncec->ncec_lladdr, LLADDR(sdl), sdl->sdl_alen);
}
}
void
-nce_cb_refhold_locked(nce_t *nce)
+ncec_cb_refhold_locked(ncec_t *ncec)
{
- ASSERT(MUTEX_HELD(&nce->nce_lock));
- nce->nce_cb_walker_cnt++;
+ ASSERT(MUTEX_HELD(&ncec->ncec_lock));
+ ncec->ncec_cb_walker_cnt++;
}
void
-nce_cb_refrele(nce_t *nce)
+ncec_cb_refrele(ncec_t *ncec)
{
- nce_cb_t *nce_cb, *nce_cb_next = NULL;
+ ncec_cb_t *ncec_cb, *ncec_cb_next = NULL;
- ASSERT(MUTEX_HELD(&nce->nce_lock));
- if (--nce->nce_cb_walker_cnt == 0) {
- for (nce_cb = list_head(&nce->nce_cb); nce_cb != NULL;
- nce_cb = nce_cb_next) {
+ ASSERT(MUTEX_HELD(&ncec->ncec_lock));
+ if (--ncec->ncec_cb_walker_cnt == 0) {
+ for (ncec_cb = list_head(&ncec->ncec_cb); ncec_cb != NULL;
+ ncec_cb = ncec_cb_next) {
- nce_cb_next = list_next(&nce->nce_cb, nce_cb);
- if ((nce_cb->nce_cb_flags & NCE_CB_DISPATCHED) == 0)
+ ncec_cb_next = list_next(&ncec->ncec_cb, ncec_cb);
+ if ((ncec_cb->ncec_cb_flags & NCE_CB_DISPATCHED) == 0)
continue;
- list_remove(&nce->nce_cb, nce_cb);
- kmem_free(nce_cb, sizeof (*nce_cb));
+ list_remove(&ncec->ncec_cb, ncec_cb);
+ kmem_free(ncec_cb, sizeof (*ncec_cb));
}
}
}
@@ -136,25 +143,25 @@ nce_cb_refrele(nce_t *nce)
* after address resolution succeeds/fails.
*/
static ip2mac_id_t
-nce_add_cb(nce_t *nce, ip2mac_callback_t *cb, void *cbarg)
+ncec_add_cb(ncec_t *ncec, ip2mac_callback_t *cb, void *cbarg)
{
- nce_cb_t *nce_cb;
+ ncec_cb_t *nce_cb;
ip2mac_id_t ip2mid = NULL;
- ASSERT(MUTEX_HELD(&nce->nce_lock));
+ ASSERT(MUTEX_HELD(&ncec->ncec_lock));
if ((nce_cb = kmem_zalloc(sizeof (*nce_cb), KM_NOSLEEP)) == NULL)
return (ip2mid);
- nce_cb->nce_cb_func = cb;
- nce_cb->nce_cb_arg = cbarg;
+ nce_cb->ncec_cb_func = cb;
+ nce_cb->ncec_cb_arg = cbarg;
/*
- * We identify the nce_cb_t during cancellation by the address
+ * We identify the ncec_cb_t during cancellation by the address
* of the nce_cb_t itself, and, as a short-cut for eliminating
- * clear mismatches, only look in the callback list of nce's
+ * clear mismatches, only look in the callback list of ncec's
* whose address is equal to the nce_cb_id.
*/
- nce_cb->nce_cb_id = nce; /* no refs! just an address */
- list_insert_tail(&nce->nce_cb, nce_cb);
- ip2mid = nce; /* this is the id to be used in ip2mac_cancel */
+ nce_cb->ncec_cb_id = ncec; /* no refs! just an address */
+ list_insert_tail(&ncec->ncec_cb, nce_cb);
+ ip2mid = ncec; /* this is the id to be used in ip2mac_cancel */
return (nce_cb);
}
@@ -167,29 +174,24 @@ nce_add_cb(nce_t *nce, ip2mac_callback_t *cb, void *cbarg)
* the resolution completes.
*/
ip2mac_id_t
-ip2mac(uint_t flags, ip2mac_t *ip2m, ip2mac_callback_t *cb, void *cbarg,
+ip2mac(uint_t op, ip2mac_t *ip2m, ip2mac_callback_t *cb, void *cbarg,
zoneid_t zoneid)
{
- nce_t *nce;
+ ncec_t *ncec;
+ nce_t *nce = NULL;
boolean_t isv6;
ill_t *ill;
netstack_t *ns;
ip_stack_t *ipst;
ip2mac_id_t ip2mid = NULL;
+ sin_t *sin;
sin6_t *sin6;
int err;
uint64_t delta;
+ boolean_t need_resolve = B_FALSE;
isv6 = (ip2m->ip2mac_pa.ss_family == AF_INET6);
- if (!isv6) {
- /*
- * IPv4 is not currently supported.
- */
- ip2m->ip2mac_err = ENOTSUP;
- return (NULL);
- }
-
ns = netstack_find_by_zoneid(zoneid);
if (ns == NULL) {
ip2m->ip2mac_err = EINVAL;
@@ -205,8 +207,7 @@ ip2mac(uint_t flags, ip2mac_t *ip2m, ip2mac_callback_t *cb, void *cbarg,
/*
* find the ill from the ip2m->ip2mac_ifindex
*/
- ill = ill_lookup_on_ifindex(ip2m->ip2mac_ifindex, isv6, NULL,
- NULL, NULL, NULL, ipst);
+ ill = ill_lookup_on_ifindex(ip2m->ip2mac_ifindex, isv6, ipst);
if (ill == NULL) {
ip2m->ip2mac_err = ENXIO;
netstack_rele(ns);
@@ -214,32 +215,39 @@ ip2mac(uint_t flags, ip2mac_t *ip2m, ip2mac_callback_t *cb, void *cbarg,
}
if (isv6) {
sin6 = (sin6_t *)&ip2m->ip2mac_pa;
- if (flags == IP2MAC_LOOKUP) {
- nce = ndp_lookup_v6(ill, B_FALSE, &sin6->sin6_addr,
- B_FALSE);
+ if (op == IP2MAC_LOOKUP) {
+ nce = nce_lookup_v6(ill, &sin6->sin6_addr);
} else {
- err = ndp_lookup_then_add_v6(ill, B_FALSE, NULL,
- &sin6->sin6_addr, &ipv6_all_ones, &ipv6_all_zeros,
- 0, 0, ND_INCOMPLETE, &nce);
+ err = nce_lookup_then_add_v6(ill, NULL,
+ ill->ill_phys_addr_length,
+ &sin6->sin6_addr, 0, ND_UNCHANGED, &nce);
}
} else {
- ip2m->ip2mac_err = ENOTSUP; /* yet. */
- goto done;
+ sin = (sin_t *)&ip2m->ip2mac_pa;
+ if (op == IP2MAC_LOOKUP) {
+ nce = nce_lookup_v4(ill, &sin->sin_addr.s_addr);
+ } else {
+ err = nce_lookup_then_add_v4(ill, NULL,
+ ill->ill_phys_addr_length,
+ &sin->sin_addr.s_addr, 0, ND_UNCHANGED, &nce);
+ }
}
- if (flags == IP2MAC_LOOKUP) {
+ if (op == IP2MAC_LOOKUP) {
if (nce == NULL) {
ip2m->ip2mac_err = ESRCH;
goto done;
}
- mutex_enter(&nce->nce_lock);
- if (NCE_ISREACHABLE(nce)) {
- nce_ip2mac_response(ip2m, nce);
+ ncec = nce->nce_common;
+ delta = TICK_TO_MSEC(lbolt64) - ncec->ncec_last;
+ mutex_enter(&ncec->ncec_lock);
+ if (NCE_ISREACHABLE(ncec) &&
+ delta < (uint64_t)ill->ill_reachable_time) {
+ ncec_ip2mac_response(ip2m, ncec);
ip2m->ip2mac_err = 0;
} else {
ip2m->ip2mac_err = ESRCH;
}
- mutex_exit(&nce->nce_lock);
- NCE_REFRELE(nce);
+ mutex_exit(&ncec->ncec_lock);
goto done;
} else {
if (err != 0 && err != EEXIST) {
@@ -247,13 +255,20 @@ ip2mac(uint_t flags, ip2mac_t *ip2m, ip2mac_callback_t *cb, void *cbarg,
goto done;
}
}
- delta = TICK_TO_MSEC(lbolt64) - nce->nce_last;
- mutex_enter(&nce->nce_lock);
- if (nce->nce_flags & NCE_F_CONDEMNED) {
+ ncec = nce->nce_common;
+ delta = TICK_TO_MSEC(lbolt64) - ncec->ncec_last;
+ mutex_enter(&ncec->ncec_lock);
+ if (NCE_ISCONDEMNED(ncec)) {
ip2m->ip2mac_err = ESRCH;
- } else if (!NCE_ISREACHABLE(nce) ||
- delta > (uint64_t)ill->ill_reachable_time) {
- if (NCE_ISREACHABLE(nce)) {
+ } else {
+ if (NCE_ISREACHABLE(ncec)) {
+ if (NCE_MYADDR(ncec) ||
+ delta < (uint64_t)ill->ill_reachable_time) {
+ ncec_ip2mac_response(ip2m, ncec);
+ ip2m->ip2mac_err = 0;
+ mutex_exit(&ncec->ncec_lock);
+ goto done;
+ }
/*
* Since we do not control the packet output
* path for ip2mac() callers, we need to verify
@@ -268,39 +283,48 @@ ip2mac(uint_t flags, ip2mac_t *ip2m, ip2mac_callback_t *cb, void *cbarg,
* so that we can return the stale information but
* also update the caller if the lladdr changes.
*/
- nce->nce_rcnt = ill->ill_xmit_count;
- nce->nce_state = ND_PROBE;
- err = 0; /* treat this nce as a new one */
+ ncec->ncec_rcnt = ill->ill_xmit_count;
+ ncec->ncec_state = ND_PROBE;
+ need_resolve = B_TRUE; /* reachable but very old nce */
+ } else if (ncec->ncec_state == ND_INITIAL) {
+ need_resolve = B_TRUE; /* ND_INITIAL nce */
+ ncec->ncec_state = ND_INCOMPLETE;
}
- if (nce->nce_rcnt > 0) {
+ /*
+ * NCE not known to be reachable in the recent past. We must
+ * reconfirm the information before returning it to the caller
+ */
+ if (ncec->ncec_rcnt > 0) {
/*
- * Still resolving this nce, so we can
- * queue the callback information in nce->nce_cb
+ * Still resolving this ncec, so we can queue the
+ * callback information in ncec->ncec_cb
*/
- ip2mid = nce_add_cb(nce, cb, cbarg);
+ ip2mid = ncec_add_cb(ncec, cb, cbarg);
ip2m->ip2mac_err = EINPROGRESS;
} else {
/*
- * Resolution failed.
+ * No more retransmits allowed -- resolution failed.
*/
ip2m->ip2mac_err = ESRCH;
}
- } else {
- nce_ip2mac_response(ip2m, nce);
- ip2m->ip2mac_err = 0;
}
- if (ip2m->ip2mac_err == EINPROGRESS && err != EEXIST)
- ip_ndp_resolve(nce);
- mutex_exit(&nce->nce_lock);
- NCE_REFRELE(nce);
+ mutex_exit(&ncec->ncec_lock);
done:
+ /*
+ * if NCE_ISREACHABLE(ncec) but very old, or if it is ND_INITIAL,
+ * trigger resolve.
+ */
+ if (need_resolve)
+ ip_ndp_resolve(ncec);
+ if (nce != NULL)
+ nce_refrele(nce);
netstack_rele(ns);
ill_refrele(ill);
return (ip2mid);
}
/*
- * data passed to nce_walk for canceling outstanding callbacks.
+ * data passed to ncec_walk for canceling outstanding callbacks.
*/
typedef struct ip2mac_cancel_data_s {
ip2mac_id_t ip2m_cancel_id;
@@ -308,23 +332,23 @@ typedef struct ip2mac_cancel_data_s {
} ip2mac_cancel_data_t;
/*
- * callback invoked for each active nce. If the ip2mac_id_t corresponds
- * to an active nce_cb_t in the nce's callback list, we want to remove
+ * callback invoked for each active ncec. If the ip2mac_id_t corresponds
+ * to an active nce_cb_t in the ncec's callback list, we want to remove
* the callback (if there are no walkers) or return EBUSY to the caller
*/
static int
-ip2mac_cancel_callback(nce_t *nce, void *arg)
+ip2mac_cancel_callback(ncec_t *ncec, void *arg)
{
ip2mac_cancel_data_t *ip2m_wdata = arg;
- nce_cb_t *ip2m_nce_cb = ip2m_wdata->ip2m_cancel_id;
- nce_cb_t *nce_cb;
+ ncec_cb_t *ip2m_nce_cb = ip2m_wdata->ip2m_cancel_id;
+ ncec_cb_t *ncec_cb;
- if (ip2m_nce_cb->nce_cb_id != nce)
+ if (ip2m_nce_cb->ncec_cb_id != ncec)
return (0);
- mutex_enter(&nce->nce_lock);
- if (list_is_empty(&nce->nce_cb)) {
- mutex_exit(&nce->nce_lock);
+ mutex_enter(&ncec->ncec_lock);
+ if (list_is_empty(&ncec->ncec_cb)) {
+ mutex_exit(&ncec->ncec_lock);
return (0);
}
/*
@@ -335,22 +359,22 @@ ip2mac_cancel_callback(nce_t *nce, void *arg)
* across calls into another subsystem, especially if calls can
* happen in either direction).
*/
- nce_cb = list_head(&nce->nce_cb);
- for (; nce_cb != NULL; nce_cb = list_next(&nce->nce_cb, nce_cb)) {
- if (nce_cb != ip2m_nce_cb)
+ ncec_cb = list_head(&ncec->ncec_cb);
+ for (; ncec_cb != NULL; ncec_cb = list_next(&ncec->ncec_cb, ncec_cb)) {
+ if (ncec_cb != ip2m_nce_cb)
continue;
/*
* If there are no walkers we can remove the nce_cb.
* Otherwise the exiting walker will clean up.
*/
- if (nce->nce_cb_walker_cnt == 0) {
- list_remove(&nce->nce_cb, nce_cb);
+ if (ncec->ncec_cb_walker_cnt == 0) {
+ list_remove(&ncec->ncec_cb, ncec_cb);
} else {
ip2m_wdata->ip2m_cancel_err = EBUSY;
}
break;
}
- mutex_exit(&nce->nce_lock);
+ mutex_exit(&ncec->ncec_lock);
return (0);
}
@@ -379,7 +403,7 @@ ip2mac_cancel(ip2mac_id_t ip2mid, zoneid_t zoneid)
ip2m_wdata.ip2m_cancel_id = ip2mid;
ip2m_wdata.ip2m_cancel_err = 0;
- ndp_walk(NULL, ip2mac_cancel_callback, &ip2m_wdata, ipst);
+ ncec_walk(NULL, ip2mac_cancel_callback, &ip2m_wdata, ipst);
/*
* We may return EBUSY if a walk to dispatch callbacks is
* in progress, in which case the caller needs to synchronize
diff --git a/usr/src/uts/common/inet/ip/ip6.c b/usr/src/uts/common/inet/ip/ip6.c
index 38fe7b2562..ed54c08884 100644
--- a/usr/src/uts/common/inet/ip/ip6.c
+++ b/usr/src/uts/common/inet/ip/ip6.c
@@ -53,8 +53,8 @@
#include <sys/vtrace.h>
#include <sys/isa_defs.h>
#include <sys/atomic.h>
-#include <sys/iphada.h>
#include <sys/policy.h>
+#include <sys/mac.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/route.h>
@@ -79,9 +79,7 @@
#include <inet/tcp.h>
#include <inet/tcp_impl.h>
#include <inet/udp_impl.h>
-#include <inet/sctp/sctp_impl.h>
#include <inet/ipp_common.h>
-#include <inet/ilb_ip.h>
#include <inet/ip_multi.h>
#include <inet/ip_if.h>
@@ -89,7 +87,6 @@
#include <inet/ip_rts.h>
#include <inet/ip_ndp.h>
#include <net/pfkeyv2.h>
-#include <inet/ipsec_info.h>
#include <inet/sadb.h>
#include <inet/ipsec_impl.h>
#include <inet/iptun/iptun_impl.h>
@@ -110,8 +107,6 @@
/* Temporary; for CR 6451644 work-around */
#include <sys/ethernet.h>
-extern int ip_squeue_flag;
-
/*
* Naming conventions:
* These rules should be judiciously applied
@@ -179,154 +174,75 @@ const in6_addr_t ipv6_solicited_node_mcast =
{ 0x000002ffU, 0, 0x01000000U, 0x000000ffU };
#endif /* _BIG_ENDIAN */
-/* Leave room for ip_newroute to tack on the src and target addresses */
-#define OK_RESOLVER_MP_V6(mp) \
- ((mp) && ((mp)->b_wptr - (mp)->b_rptr) >= (2 * IPV6_ADDR_LEN))
-
-#define IP6_MBLK_OK 0
-#define IP6_MBLK_HDR_ERR 1
-#define IP6_MBLK_LEN_ERR 2
-
-static void icmp_inbound_too_big_v6(queue_t *, mblk_t *, ill_t *, ill_t *,
- boolean_t, zoneid_t);
-static void icmp_pkt_v6(queue_t *, mblk_t *, void *, size_t,
- const in6_addr_t *, boolean_t, zoneid_t, ip_stack_t *);
-static void icmp_redirect_v6(queue_t *, mblk_t *, ill_t *ill);
-static int ip_bind_connected_v6(conn_t *, mblk_t **, uint8_t, in6_addr_t *,
- uint16_t, const in6_addr_t *, ip6_pkt_t *, uint16_t,
- boolean_t, boolean_t, cred_t *);
-static boolean_t ip_bind_get_ire_v6(mblk_t **, ire_t *, const in6_addr_t *,
- iulp_t *, ip_stack_t *);
-static int ip_bind_laddr_v6(conn_t *, mblk_t **, uint8_t,
- const in6_addr_t *, uint16_t, boolean_t);
-static void ip_fanout_proto_v6(queue_t *, mblk_t *, ip6_t *, ill_t *,
- ill_t *, uint8_t, uint_t, uint_t, boolean_t, zoneid_t);
-static void ip_fanout_tcp_v6(queue_t *, mblk_t *, ip6_t *, ill_t *,
- ill_t *, uint_t, uint_t, boolean_t, zoneid_t);
-static void ip_fanout_udp_v6(queue_t *, mblk_t *, ip6_t *, uint32_t,
- ill_t *, ill_t *, uint_t, boolean_t, zoneid_t);
-static int ip_process_options_v6(queue_t *, mblk_t *, ip6_t *,
- uint8_t *, uint_t, uint8_t, ip_stack_t *);
-static mblk_t *ip_rput_frag_v6(ill_t *, ill_t *, mblk_t *, ip6_t *,
- ip6_frag_t *, uint_t, uint_t *, uint32_t *, uint16_t *);
+static boolean_t icmp_inbound_verify_v6(mblk_t *, icmp6_t *, ip_recv_attr_t *);
+static void icmp_inbound_too_big_v6(icmp6_t *, ip_recv_attr_t *);
+static void icmp_pkt_v6(mblk_t *, void *, size_t, const in6_addr_t *,
+ ip_recv_attr_t *);
+static void icmp_redirect_v6(mblk_t *, ip6_t *, nd_redirect_t *,
+ ip_recv_attr_t *);
+static void icmp_send_redirect_v6(mblk_t *, in6_addr_t *,
+ in6_addr_t *, ip_recv_attr_t *);
+static void icmp_send_reply_v6(mblk_t *, ip6_t *, icmp6_t *,
+ ip_recv_attr_t *);
static boolean_t ip_source_routed_v6(ip6_t *, mblk_t *, ip_stack_t *);
-static void ip_wput_ire_v6(queue_t *, mblk_t *, ire_t *, int, int,
- conn_t *, int, int, zoneid_t);
-static boolean_t ipif_lookup_testaddr_v6(ill_t *, const in6_addr_t *,
- ipif_t **);
-
-/*
- * A template for an IPv6 AR_ENTRY_QUERY
- */
-static areq_t ipv6_areq_template = {
- AR_ENTRY_QUERY, /* cmd */
- sizeof (areq_t)+(2*IPV6_ADDR_LEN), /* name offset */
- sizeof (areq_t), /* name len (filled by ill_arp_alloc) */
- ETHERTYPE_IPV6, /* protocol, from arps perspective */
- sizeof (areq_t), /* target addr offset */
- IPV6_ADDR_LEN, /* target addr_length */
- 0, /* flags */
- sizeof (areq_t) + IPV6_ADDR_LEN, /* sender addr offset */
- IPV6_ADDR_LEN, /* sender addr length */
- 6, /* xmit_count */
- 1000, /* (re)xmit_interval in milliseconds */
- 4 /* max # of requests to buffer */
- /* anything else filled in by the code */
-};
/*
- * Handle IPv6 ICMP packets sent to us. Consume the mblk passed in.
- * The message has already been checksummed and if needed,
- * a copy has been made to be sent any interested ICMP client (conn)
- * Note that this is different than icmp_inbound() which does the fanout
- * to conn's as well as local processing of the ICMP packets.
+ * icmp_inbound_v6 deals with ICMP messages that are handled by IP.
+ * If the ICMP message is consumed by IP, i.e., it should not be delivered
+ * to any IPPROTO_ICMP raw sockets, then it returns NULL.
+ * Likewise, if the ICMP error is misformed (too short, etc), then it
+ * returns NULL. The caller uses this to determine whether or not to send
+ * to raw sockets.
*
* All error messages are passed to the matching transport stream.
*
- * Zones notes:
- * The packet is only processed in the context of the specified zone: typically
- * only this zone will reply to an echo request. This means that the caller must
- * call icmp_inbound_v6() for each relevant zone.
+ * See comment for icmp_inbound_v4() on how IPsec is handled.
*/
-static void
-icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, ill_t *inill,
- uint_t hdr_length, boolean_t mctl_present, uint_t flags, zoneid_t zoneid,
- mblk_t *dl_mp)
+mblk_t *
+icmp_inbound_v6(mblk_t *mp, ip_recv_attr_t *ira)
{
icmp6_t *icmp6;
- ip6_t *ip6h;
+ ip6_t *ip6h; /* Outer header */
+ int ip_hdr_length; /* Outer header length */
boolean_t interested;
- in6_addr_t origsrc;
- mblk_t *first_mp;
- ipsec_in_t *ii;
+ ill_t *ill = ira->ira_ill;
ip_stack_t *ipst = ill->ill_ipst;
-
- ASSERT(ill != NULL);
- first_mp = mp;
- if (mctl_present) {
- mp = first_mp->b_cont;
- ASSERT(mp != NULL);
-
- ii = (ipsec_in_t *)first_mp->b_rptr;
- ASSERT(ii->ipsec_in_type == IPSEC_IN);
- }
+ mblk_t *mp_ret = NULL;
ip6h = (ip6_t *)mp->b_rptr;
BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInMsgs);
- if ((mp->b_wptr - mp->b_rptr) < (hdr_length + ICMP6_MINLEN)) {
- if (!pullupmsg(mp, hdr_length + ICMP6_MINLEN)) {
- ip1dbg(("icmp_inbound_v6: pullupmsg failed\n"));
- BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
- freemsg(first_mp);
- return;
- }
- ip6h = (ip6_t *)mp->b_rptr;
- }
- if (ipst->ips_icmp_accept_clear_messages == 0) {
- first_mp = ipsec_check_global_policy(first_mp, NULL,
- NULL, ip6h, mctl_present, ipst->ips_netstack);
- if (first_mp == NULL)
- return;
- }
+ /* Make sure ira_l2src is set for ndp_input */
+ if (!(ira->ira_flags & IRAF_L2SRC_SET))
+ ip_setl2src(mp, ira, ira->ira_rill);
- /*
- * On a labeled system, we have to check whether the zone itself is
- * permitted to receive raw traffic.
- */
- if (is_system_labeled()) {
- if (zoneid == ALL_ZONES)
- zoneid = tsol_packet_to_zoneid(mp);
- if (!tsol_can_accept_raw(mp, B_FALSE)) {
- ip1dbg(("icmp_inbound_v6: zone %d can't receive raw",
- zoneid));
+ ip_hdr_length = ira->ira_ip_hdr_length;
+ if ((mp->b_wptr - mp->b_rptr) < (ip_hdr_length + ICMP6_MINLEN)) {
+ if (ira->ira_pktlen < (ip_hdr_length + ICMP6_MINLEN)) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
+ ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
+ freemsg(mp);
+ return (NULL);
+ }
+ ip6h = ip_pullup(mp, ip_hdr_length + ICMP6_MINLEN, ira);
+ if (ip6h == NULL) {
BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
- freemsg(first_mp);
- return;
+ freemsg(mp);
+ return (NULL);
}
}
- icmp6 = (icmp6_t *)(&mp->b_rptr[hdr_length]);
+ icmp6 = (icmp6_t *)(&mp->b_rptr[ip_hdr_length]);
+ DTRACE_PROBE2(icmp__inbound__v6, ip6_t *, ip6h, icmp6_t *, icmp6);
ip2dbg(("icmp_inbound_v6: type %d code %d\n", icmp6->icmp6_type,
icmp6->icmp6_code));
- interested = !(icmp6->icmp6_type & ICMP6_INFOMSG_MASK);
- /* Initiate IPPF processing here */
- if (IP6_IN_IPP(flags, ipst)) {
-
- /*
- * If the ifindex changes due to SIOCSLIFINDEX
- * packet may return to IP on the wrong ill.
- */
- ip_process(IPP_LOCAL_IN, &mp, ill->ill_phyint->phyint_ifindex);
- if (mp == NULL) {
- if (mctl_present) {
- freeb(first_mp);
- }
- return;
- }
- }
+ /*
+ * We will set "interested" to "true" if we should pass a copy to
+ * the transport i.e., if it is an error message.
+ */
+ interested = !(icmp6->icmp6_type & ICMP6_INFOMSG_MASK);
switch (icmp6->icmp6_type) {
case ICMP6_DST_UNREACH:
@@ -344,9 +260,9 @@ icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, ill_t *inill,
break;
case ICMP6_PACKET_TOO_BIG:
- icmp_inbound_too_big_v6(q, first_mp, ill, inill, mctl_present,
- zoneid);
- return;
+ BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInPktTooBigs);
+ break;
+
case ICMP6_ECHO_REQUEST:
BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInEchos);
if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
@@ -362,93 +278,22 @@ icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, ill_t *inill,
mblk_t *mp1;
mp1 = copymsg(mp);
- freemsg(mp);
if (mp1 == NULL) {
- BUMP_MIB(ill->ill_icmp6_mib,
- ipv6IfIcmpInErrors);
- if (mctl_present)
- freeb(first_mp);
- return;
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards - copymsg",
+ mp, ill);
+ freemsg(mp);
+ return (NULL);
}
+ freemsg(mp);
mp = mp1;
ip6h = (ip6_t *)mp->b_rptr;
- icmp6 = (icmp6_t *)(&mp->b_rptr[hdr_length]);
- if (mctl_present)
- first_mp->b_cont = mp;
- else
- first_mp = mp;
+ icmp6 = (icmp6_t *)(&mp->b_rptr[ip_hdr_length]);
}
- /*
- * Turn the echo into an echo reply.
- * Remove any extension headers (do not reverse a source route)
- * and clear the flow id (keep traffic class for now).
- */
- if (hdr_length != IPV6_HDR_LEN) {
- int i;
-
- for (i = 0; i < IPV6_HDR_LEN; i++)
- mp->b_rptr[hdr_length - i - 1] =
- mp->b_rptr[IPV6_HDR_LEN - i - 1];
- mp->b_rptr += (hdr_length - IPV6_HDR_LEN);
- ip6h = (ip6_t *)mp->b_rptr;
- ip6h->ip6_nxt = IPPROTO_ICMPV6;
- hdr_length = IPV6_HDR_LEN;
- }
- ip6h->ip6_vcf &= ~IPV6_FLOWINFO_FLOWLABEL;
icmp6->icmp6_type = ICMP6_ECHO_REPLY;
-
- ip6h->ip6_plen =
- htons((uint16_t)(msgdsize(mp) - IPV6_HDR_LEN));
- origsrc = ip6h->ip6_src;
- /*
- * Reverse the source and destination addresses.
- * If the return address is a multicast, zero out the source
- * (ip_wput_v6 will set an address).
- */
- if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
- ip6h->ip6_src = ipv6_all_zeros;
- ip6h->ip6_dst = origsrc;
- } else {
- ip6h->ip6_src = ip6h->ip6_dst;
- ip6h->ip6_dst = origsrc;
- }
-
- /* set the hop limit */
- ip6h->ip6_hops = ipst->ips_ipv6_def_hops;
-
- /*
- * Prepare for checksum by putting icmp length in the icmp
- * checksum field. The checksum is calculated in ip_wput_v6.
- */
- icmp6->icmp6_cksum = ip6h->ip6_plen;
-
- if (!mctl_present) {
- /*
- * This packet should go out the same way as it
- * came in i.e in clear. To make sure that global
- * policy will not be applied to this in ip_wput,
- * we attach a IPSEC_IN mp and clear ipsec_in_secure.
- */
- ASSERT(first_mp == mp);
- first_mp = ipsec_in_alloc(B_FALSE, ipst->ips_netstack);
- if (first_mp == NULL) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- freemsg(mp);
- return;
- }
- ii = (ipsec_in_t *)first_mp->b_rptr;
-
- /* This is not a secure packet */
- ii->ipsec_in_secure = B_FALSE;
- first_mp->b_cont = mp;
- }
- if (!ipsec_in_to_out(first_mp, NULL, ip6h, zoneid)) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- return;
- }
- put(WR(q), first_mp);
- return;
+ icmp_send_reply_v6(mp, ip6h, icmp6, ira);
+ return (NULL);
case ICMP6_ECHO_REPLY:
BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInEchoReplies);
@@ -464,343 +309,478 @@ icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, ill_t *inill,
case ND_NEIGHBOR_SOLICIT:
BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInNeighborSolicits);
- if (mctl_present)
- freeb(first_mp);
- /* XXX may wish to pass first_mp up to ndp_input someday. */
- ndp_input(inill, mp, dl_mp);
- return;
+ ndp_input(mp, ira);
+ return (NULL);
case ND_NEIGHBOR_ADVERT:
BUMP_MIB(ill->ill_icmp6_mib,
ipv6IfIcmpInNeighborAdvertisements);
- if (mctl_present)
- freeb(first_mp);
- /* XXX may wish to pass first_mp up to ndp_input someday. */
- ndp_input(inill, mp, dl_mp);
- return;
+ ndp_input(mp, ira);
+ return (NULL);
- case ND_REDIRECT: {
+ case ND_REDIRECT:
BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInRedirects);
if (ipst->ips_ipv6_ignore_redirect)
break;
- /*
- * As there is no upper client to deliver, we don't
- * need the first_mp any more.
- */
- if (mctl_present)
- freeb(first_mp);
- if (!pullupmsg(mp, -1)) {
- BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInBadRedirects);
- break;
- }
- icmp_redirect_v6(q, mp, ill);
- return;
- }
+ /* We now allow a RAW socket to receive this. */
+ interested = B_TRUE;
+ break;
/*
* The next three icmp messages will be handled by MLD.
* Pass all valid MLD packets up to any process(es)
- * listening on a raw ICMP socket. MLD messages are
- * freed by mld_input function.
+ * listening on a raw ICMP socket.
*/
case MLD_LISTENER_QUERY:
case MLD_LISTENER_REPORT:
case MLD_LISTENER_REDUCTION:
- if (mctl_present)
- freeb(first_mp);
- mld_input(q, mp, ill);
- return;
+ mp = mld_input(mp, ira);
+ return (mp);
default:
break;
}
- if (interested) {
- icmp_inbound_error_fanout_v6(q, first_mp, ip6h, icmp6, ill,
- inill, mctl_present, zoneid);
- } else {
- freemsg(first_mp);
- }
-}
+ /*
+ * See if there is an ICMP client to avoid an extra copymsg/freemsg
+ * if there isn't one.
+ */
+ if (ipst->ips_ipcl_proto_fanout_v6[IPPROTO_ICMPV6].connf_head != NULL) {
+ /* If there is an ICMP client and we want one too, copy it. */
-/*
- * Process received IPv6 ICMP Packet too big.
- * After updating any IRE it does the fanout to any matching transport streams.
- * Assumes the IPv6 plus ICMPv6 headers have been pulled up but nothing else.
- */
-/* ARGSUSED */
-static void
-icmp_inbound_too_big_v6(queue_t *q, mblk_t *mp, ill_t *ill, ill_t *inill,
- boolean_t mctl_present, zoneid_t zoneid)
-{
- ip6_t *ip6h;
- ip6_t *inner_ip6h;
- icmp6_t *icmp6;
- uint16_t hdr_length;
- uint32_t mtu;
- ire_t *ire, *first_ire;
- mblk_t *first_mp;
- ip_stack_t *ipst = ill->ill_ipst;
+ if (!interested) {
+ /* Caller will deliver to RAW sockets */
+ return (mp);
+ }
+ mp_ret = copymsg(mp);
+ if (mp_ret == NULL) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards - copymsg", mp, ill);
+ }
+ } else if (!interested) {
+ /* Neither we nor raw sockets are interested. Drop packet now */
+ freemsg(mp);
+ return (NULL);
+ }
- first_mp = mp;
- if (mctl_present)
- mp = first_mp->b_cont;
/*
- * We must have exclusive use of the mblk to update the MTU
- * in the packet.
- * If not, we copy it.
- *
- * If there's an M_CTL present, we know that allocated first_mp
- * earlier in this function, so we know first_mp has refcnt of one.
+ * ICMP error or redirect packet. Make sure we have enough of
+ * the header and that db_ref == 1 since we might end up modifying
+ * the packet.
*/
- ASSERT(!mctl_present || first_mp->b_datap->db_ref == 1);
+ if (mp->b_cont != NULL) {
+ if (ip_pullup(mp, -1, ira) == NULL) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards - ip_pullup",
+ mp, ill);
+ freemsg(mp);
+ return (mp_ret);
+ }
+ }
+
if (mp->b_datap->db_ref > 1) {
mblk_t *mp1;
mp1 = copymsg(mp);
- freemsg(mp);
if (mp1 == NULL) {
BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- if (mctl_present)
- freeb(first_mp);
- return;
+ ip_drop_input("ipIfStatsInDiscards - copymsg", mp, ill);
+ freemsg(mp);
+ return (mp_ret);
}
+ freemsg(mp);
mp = mp1;
- if (mctl_present)
- first_mp->b_cont = mp;
- else
- first_mp = mp;
}
+
+ /*
+ * In case mp has changed, verify the message before any further
+ * processes.
+ */
ip6h = (ip6_t *)mp->b_rptr;
- if (ip6h->ip6_nxt != IPPROTO_ICMPV6)
- hdr_length = ip_hdr_length_v6(mp, ip6h);
- else
- hdr_length = IPV6_HDR_LEN;
+ icmp6 = (icmp6_t *)(&mp->b_rptr[ip_hdr_length]);
+ if (!icmp_inbound_verify_v6(mp, icmp6, ira)) {
+ freemsg(mp);
+ return (mp_ret);
+ }
- icmp6 = (icmp6_t *)(&mp->b_rptr[hdr_length]);
- ASSERT((size_t)(mp->b_wptr - mp->b_rptr) >= hdr_length + ICMP6_MINLEN);
- inner_ip6h = (ip6_t *)&icmp6[1]; /* Packet in error */
- if ((uchar_t *)&inner_ip6h[1] > mp->b_wptr) {
- if (!pullupmsg(mp, (uchar_t *)&inner_ip6h[1] - mp->b_rptr)) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- freemsg(first_mp);
- return;
+ switch (icmp6->icmp6_type) {
+ case ND_REDIRECT:
+ icmp_redirect_v6(mp, ip6h, (nd_redirect_t *)icmp6, ira);
+ break;
+ case ICMP6_PACKET_TOO_BIG:
+ /* Update DCE and adjust MTU is icmp header if needed */
+ icmp_inbound_too_big_v6(icmp6, ira);
+ /* FALLTHRU */
+ default:
+ icmp_inbound_error_fanout_v6(mp, icmp6, ira);
+ break;
+ }
+
+ return (mp_ret);
+}
+
+/*
+ * Send an ICMP echo reply.
+ * The caller has already updated the payload part of the packet.
+ * We handle the ICMP checksum, IP source address selection and feed
+ * the packet into ip_output_simple.
+ */
+static void
+icmp_send_reply_v6(mblk_t *mp, ip6_t *ip6h, icmp6_t *icmp6,
+ ip_recv_attr_t *ira)
+{
+ uint_t ip_hdr_length = ira->ira_ip_hdr_length;
+ ill_t *ill = ira->ira_ill;
+ ip_stack_t *ipst = ill->ill_ipst;
+ ip_xmit_attr_t ixas;
+ in6_addr_t origsrc;
+
+ /*
+ * Remove any extension headers (do not reverse a source route)
+ * and clear the flow id (keep traffic class for now).
+ */
+ if (ip_hdr_length != IPV6_HDR_LEN) {
+ int i;
+
+ for (i = 0; i < IPV6_HDR_LEN; i++) {
+ mp->b_rptr[ip_hdr_length - i - 1] =
+ mp->b_rptr[IPV6_HDR_LEN - i - 1];
}
+ mp->b_rptr += (ip_hdr_length - IPV6_HDR_LEN);
ip6h = (ip6_t *)mp->b_rptr;
- icmp6 = (icmp6_t *)&mp->b_rptr[hdr_length];
- inner_ip6h = (ip6_t *)&icmp6[1];
+ ip6h->ip6_nxt = IPPROTO_ICMPV6;
+ i = ntohs(ip6h->ip6_plen);
+ i -= (ip_hdr_length - IPV6_HDR_LEN);
+ ip6h->ip6_plen = htons(i);
+ ip_hdr_length = IPV6_HDR_LEN;
+ ASSERT(ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN == msgdsize(mp));
}
+ ip6h->ip6_vcf &= ~IPV6_FLOWINFO_FLOWLABEL;
+
+ /* Reverse the source and destination addresses. */
+ origsrc = ip6h->ip6_src;
+ ip6h->ip6_src = ip6h->ip6_dst;
+ ip6h->ip6_dst = origsrc;
+
+ /* set the hop limit */
+ ip6h->ip6_hops = ipst->ips_ipv6_def_hops;
/*
- * For link local destinations matching simply on IRE type is not
- * sufficient. Same link local addresses for different ILL's is
- * possible.
+ * Prepare for checksum by putting icmp length in the icmp
+ * checksum field. The checksum is calculated in ip_output
*/
- if (IN6_IS_ADDR_LINKLOCAL(&inner_ip6h->ip6_dst)) {
- first_ire = ire_ctable_lookup_v6(&inner_ip6h->ip6_dst, NULL,
- IRE_CACHE, ill->ill_ipif, ALL_ZONES, NULL,
- MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst);
-
- if (first_ire == NULL) {
- if (ip_debug > 2) {
- /* ip1dbg */
- pr_addr_dbg("icmp_inbound_too_big_v6:"
- "no ire for dst %s\n", AF_INET6,
- &inner_ip6h->ip6_dst);
- }
- freemsg(first_mp);
- return;
- }
+ icmp6->icmp6_cksum = ip6h->ip6_plen;
- mtu = ntohl(icmp6->icmp6_mtu);
- rw_enter(&first_ire->ire_bucket->irb_lock, RW_READER);
- for (ire = first_ire; ire != NULL &&
- IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, &inner_ip6h->ip6_dst);
- ire = ire->ire_next) {
- mutex_enter(&ire->ire_lock);
- if (mtu < IPV6_MIN_MTU) {
- ip1dbg(("Received mtu less than IPv6 "
- "min mtu %d: %d\n", IPV6_MIN_MTU, mtu));
- mtu = IPV6_MIN_MTU;
- /*
- * If an mtu less than IPv6 min mtu is received,
- * we must include a fragment header in
- * subsequent packets.
- */
- ire->ire_frag_flag |= IPH_FRAG_HDR;
- }
- ip1dbg(("Received mtu from router: %d\n", mtu));
- ire->ire_max_frag = MIN(ire->ire_max_frag, mtu);
- if (ire->ire_max_frag == mtu) {
- /* Decreased it */
- ire->ire_marks |= IRE_MARK_PMTU;
- }
- /* Record the new max frag size for the ULP. */
- if (ire->ire_frag_flag & IPH_FRAG_HDR) {
- /*
- * If we need a fragment header in every packet
- * (above case or multirouting), make sure the
- * ULP takes it into account when computing the
- * payload size.
- */
- icmp6->icmp6_mtu = htonl(ire->ire_max_frag -
- sizeof (ip6_frag_t));
- } else {
- icmp6->icmp6_mtu = htonl(ire->ire_max_frag);
- }
- mutex_exit(&ire->ire_lock);
- }
- rw_exit(&first_ire->ire_bucket->irb_lock);
- ire_refrele(first_ire);
- } else {
- irb_t *irb = NULL;
+ bzero(&ixas, sizeof (ixas));
+ ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6;
+ ixas.ixa_zoneid = ira->ira_zoneid;
+ ixas.ixa_cred = kcred;
+ ixas.ixa_cpid = NOPID;
+ ixas.ixa_tsl = ira->ira_tsl; /* Behave as a multi-level responder */
+ ixas.ixa_ifindex = 0;
+ ixas.ixa_ipst = ipst;
+ ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
+
+ if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) {
/*
- * for non-link local destinations we match only on the IRE type
+ * This packet should go out the same way as it
+ * came in i.e in clear, independent of the IPsec
+ * policy for transmitting packets.
*/
- ire = ire_ctable_lookup_v6(&inner_ip6h->ip6_dst, NULL,
- IRE_CACHE, ill->ill_ipif, ALL_ZONES, NULL, MATCH_IRE_TYPE,
- ipst);
- if (ire == NULL) {
- if (ip_debug > 2) {
- /* ip1dbg */
- pr_addr_dbg("icmp_inbound_too_big_v6:"
- "no ire for dst %s\n",
- AF_INET6, &inner_ip6h->ip6_dst);
- }
- freemsg(first_mp);
+ ixas.ixa_flags |= IXAF_NO_IPSEC;
+ } else {
+ if (!ipsec_in_to_out(ira, &ixas, mp, NULL, ip6h)) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ /* Note: mp already consumed and ip_drop_packet done */
return;
}
- irb = ire->ire_bucket;
- ire_refrele(ire);
- rw_enter(&irb->irb_lock, RW_READER);
- for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
- if (IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6,
- &inner_ip6h->ip6_dst)) {
- mtu = ntohl(icmp6->icmp6_mtu);
- mutex_enter(&ire->ire_lock);
- if (mtu < IPV6_MIN_MTU) {
- ip1dbg(("Received mtu less than IPv6"
- "min mtu %d: %d\n",
- IPV6_MIN_MTU, mtu));
- mtu = IPV6_MIN_MTU;
- /*
- * If an mtu less than IPv6 min mtu is
- * received, we must include a fragment
- * header in subsequent packets.
- */
- ire->ire_frag_flag |= IPH_FRAG_HDR;
- }
+ }
- ip1dbg(("Received mtu from router: %d\n", mtu));
- ire->ire_max_frag = MIN(ire->ire_max_frag, mtu);
- if (ire->ire_max_frag == mtu) {
- /* Decreased it */
- ire->ire_marks |= IRE_MARK_PMTU;
- }
- /* Record the new max frag size for the ULP. */
- if (ire->ire_frag_flag & IPH_FRAG_HDR) {
- /*
- * If we need a fragment header in
- * every packet (above case or
- * multirouting), make sure the ULP
- * takes it into account when computing
- * the payload size.
- */
- icmp6->icmp6_mtu =
- htonl(ire->ire_max_frag -
- sizeof (ip6_frag_t));
- } else {
- icmp6->icmp6_mtu =
- htonl(ire->ire_max_frag);
- }
- mutex_exit(&ire->ire_lock);
- }
- }
- rw_exit(&irb->irb_lock);
+ /* Was the destination (now source) link-local? Send out same group */
+ if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) {
+ ixas.ixa_flags |= IXAF_SCOPEID_SET;
+ if (IS_UNDER_IPMP(ill))
+ ixas.ixa_scopeid = ill_get_upper_ifindex(ill);
+ else
+ ixas.ixa_scopeid = ill->ill_phyint->phyint_ifindex;
+ }
+
+ if (ira->ira_flags & IRAF_MULTIBROADCAST) {
+ /*
+ * Not one or our addresses (IRE_LOCALs), thus we let
+ * ip_output_simple pick the source.
+ */
+ ip6h->ip6_src = ipv6_all_zeros;
+ ixas.ixa_flags |= IXAF_SET_SOURCE;
}
- icmp_inbound_error_fanout_v6(q, first_mp, ip6h, icmp6, ill, inill,
- mctl_present, zoneid);
+
+ /* Should we send using dce_pmtu? */
+ if (ipst->ips_ipv6_icmp_return_pmtu)
+ ixas.ixa_flags |= IXAF_PMTU_DISCOVERY;
+
+ (void) ip_output_simple(mp, &ixas);
+ ixa_cleanup(&ixas);
+
}
/*
- * Fanout for ICMPv6 errors containing IP-in-IPv6 packets. Returns B_TRUE if a
- * tunnel consumed the message, and B_FALSE otherwise.
+ * Verify the ICMP messages for either for ICMP error or redirect packet.
+ * The caller should have fully pulled up the message. If it's a redirect
+ * packet, only basic checks on IP header will be done; otherwise, verify
+ * the packet by looking at the included ULP header.
+ *
+ * Called before icmp_inbound_error_fanout_v6 is called.
*/
static boolean_t
-icmp_inbound_iptun_fanout_v6(mblk_t *first_mp, ip6_t *rip6h, ill_t *ill,
- ip_stack_t *ipst)
+icmp_inbound_verify_v6(mblk_t *mp, icmp6_t *icmp6, ip_recv_attr_t *ira)
{
- conn_t *connp;
+ ill_t *ill = ira->ira_ill;
+ uint16_t hdr_length;
+ uint8_t *nexthdrp;
+ uint8_t nexthdr;
+ ip_stack_t *ipst = ill->ill_ipst;
+ conn_t *connp;
+ ip6_t *ip6h; /* Inner header */
- if ((connp = ipcl_iptun_classify_v6(&rip6h->ip6_src, &rip6h->ip6_dst,
- ipst)) == NULL)
- return (B_FALSE);
+ ip6h = (ip6_t *)&icmp6[1];
+ if ((uchar_t *)ip6h + IPV6_HDR_LEN > mp->b_wptr)
+ goto truncated;
+
+ if (icmp6->icmp6_type == ND_REDIRECT) {
+ hdr_length = sizeof (nd_redirect_t);
+ } else {
+ if ((IPH_HDR_VERSION(ip6h) != IPV6_VERSION))
+ goto discard_pkt;
+ hdr_length = IPV6_HDR_LEN;
+ }
+
+ if ((uchar_t *)ip6h + hdr_length > mp->b_wptr)
+ goto truncated;
+
+ /*
+ * Stop here for ICMP_REDIRECT.
+ */
+ if (icmp6->icmp6_type == ND_REDIRECT)
+ return (B_TRUE);
+
+ /*
+ * ICMP errors only.
+ */
+ if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &hdr_length, &nexthdrp))
+ goto discard_pkt;
+ nexthdr = *nexthdrp;
+
+ /* Try to pass the ICMP message to clients who need it */
+ switch (nexthdr) {
+ case IPPROTO_UDP:
+ /*
+ * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
+ * transport header.
+ */
+ if ((uchar_t *)ip6h + hdr_length + ICMP_MIN_TP_HDR_LEN >
+ mp->b_wptr)
+ goto truncated;
+ break;
+ case IPPROTO_TCP: {
+ tcpha_t *tcpha;
+
+ /*
+ * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
+ * transport header.
+ */
+ if ((uchar_t *)ip6h + hdr_length + ICMP_MIN_TP_HDR_LEN >
+ mp->b_wptr)
+ goto truncated;
+
+ tcpha = (tcpha_t *)((uchar_t *)ip6h + hdr_length);
+ /*
+ * With IPMP we need to match across group, which we do
+ * since we have the upper ill from ira_ill.
+ */
+ connp = ipcl_tcp_lookup_reversed_ipv6(ip6h, tcpha, TCPS_LISTEN,
+ ill->ill_phyint->phyint_ifindex, ipst);
+ if (connp == NULL)
+ goto discard_pkt;
+
+ if ((connp->conn_verifyicmp != NULL) &&
+ !connp->conn_verifyicmp(connp, tcpha, NULL, icmp6, ira)) {
+ CONN_DEC_REF(connp);
+ goto discard_pkt;
+ }
+ CONN_DEC_REF(connp);
+ break;
+ }
+ case IPPROTO_SCTP:
+ /*
+ * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
+ * transport header.
+ */
+ if ((uchar_t *)ip6h + hdr_length + ICMP_MIN_TP_HDR_LEN >
+ mp->b_wptr)
+ goto truncated;
+ break;
+ case IPPROTO_ESP:
+ case IPPROTO_AH:
+ break;
+ case IPPROTO_ENCAP:
+ case IPPROTO_IPV6: {
+ /* Look for self-encapsulated packets that caused an error */
+ ip6_t *in_ip6h;
+
+ in_ip6h = (ip6_t *)((uint8_t *)ip6h + hdr_length);
+ if ((uint8_t *)in_ip6h + (nexthdr == IPPROTO_ENCAP ?
+ sizeof (ipha_t) : sizeof (ip6_t)) > mp->b_wptr)
+ goto truncated;
+ break;
+ }
+ default:
+ break;
+ }
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
- connp->conn_recv(connp, first_mp, NULL);
- CONN_DEC_REF(connp);
return (B_TRUE);
+
+discard_pkt:
+ /* Bogus ICMP error. */
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ return (B_FALSE);
+
+truncated:
+ /* We pulled up everthing already. Must be truncated */
+ BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
+ return (B_FALSE);
}
/*
- * Fanout received ICMPv6 error packets to the transports.
- * Assumes the IPv6 plus ICMPv6 headers have been pulled up but nothing else.
+ * Process received IPv6 ICMP Packet too big.
+ * The caller is responsible for validating the packet before passing it in
+ * and also to fanout the ICMP error to any matching transport conns. Assumes
+ * the message has been fully pulled up.
+ *
+ * Before getting here, the caller has called icmp_inbound_verify_v6()
+ * that should have verified with ULP to prevent undoing the changes we're
+ * going to make to DCE. For example, TCP might have verified that the packet
+ * which generated error is in the send window.
+ *
+ * In some cases modified this MTU in the ICMP header packet; the caller
+ * should pass to the matching ULP after this returns.
*/
-void
-icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h,
- icmp6_t *icmp6, ill_t *ill, ill_t *inill, boolean_t mctl_present,
- zoneid_t zoneid)
+static void
+icmp_inbound_too_big_v6(icmp6_t *icmp6, ip_recv_attr_t *ira)
{
- uint16_t *up; /* Pointer to ports in ULP header */
- uint32_t ports; /* reversed ports for fanout */
- ip6_t rip6h; /* With reversed addresses */
- uint16_t hdr_length;
- uint8_t *nexthdrp;
- uint8_t nexthdr;
- mblk_t *first_mp;
- ipsec_in_t *ii;
- tcpha_t *tcpha;
- conn_t *connp;
+ uint32_t mtu;
+ dce_t *dce;
+ ill_t *ill = ira->ira_ill; /* Upper ill if IPMP */
ip_stack_t *ipst = ill->ill_ipst;
+ int old_max_frag;
+ in6_addr_t final_dst;
+ ip6_t *ip6h; /* Inner IP header */
- first_mp = mp;
- if (mctl_present) {
- mp = first_mp->b_cont;
- ASSERT(mp != NULL);
+ /* Caller has already pulled up everything. */
+ ip6h = (ip6_t *)&icmp6[1];
+ final_dst = ip_get_dst_v6(ip6h, NULL, NULL);
- ii = (ipsec_in_t *)first_mp->b_rptr;
- ASSERT(ii->ipsec_in_type == IPSEC_IN);
+ /*
+ * For link local destinations matching simply on address is not
+ * sufficient. Same link local addresses for different ILL's is
+ * possible.
+ */
+ if (IN6_IS_ADDR_LINKSCOPE(&final_dst)) {
+ dce = dce_lookup_and_add_v6(&final_dst,
+ ill->ill_phyint->phyint_ifindex, ipst);
} else {
- ii = NULL;
+ dce = dce_lookup_and_add_v6(&final_dst, 0, ipst);
+ }
+ if (dce == NULL) {
+ /* Couldn't add a unique one - ENOMEM */
+ if (ip_debug > 2) {
+ /* ip1dbg */
+ pr_addr_dbg("icmp_inbound_too_big_v6:"
+ "no dce for dst %s\n", AF_INET6,
+ &final_dst);
+ }
+ return;
}
- hdr_length = (uint16_t)((uchar_t *)icmp6 - (uchar_t *)ip6h);
- ASSERT((size_t)(mp->b_wptr - (uchar_t *)icmp6) >= ICMP6_MINLEN);
+ mtu = ntohl(icmp6->icmp6_mtu);
+ mutex_enter(&dce->dce_lock);
+ if (dce->dce_flags & DCEF_PMTU)
+ old_max_frag = dce->dce_pmtu;
+ else
+ old_max_frag = ill->ill_mtu;
+
+ if (mtu < IPV6_MIN_MTU) {
+ ip1dbg(("Received mtu less than IPv6 "
+ "min mtu %d: %d\n", IPV6_MIN_MTU, mtu));
+ mtu = IPV6_MIN_MTU;
+ /*
+ * If an mtu less than IPv6 min mtu is received,
+ * we must include a fragment header in
+ * subsequent packets.
+ */
+ dce->dce_flags |= DCEF_TOO_SMALL_PMTU;
+ } else {
+ dce->dce_flags &= ~DCEF_TOO_SMALL_PMTU;
+ }
+ ip1dbg(("Received mtu from router: %d\n", mtu));
+ dce->dce_pmtu = MIN(old_max_frag, mtu);
+
+ /* Prepare to send the new max frag size for the ULP. */
+ if (dce->dce_flags & DCEF_TOO_SMALL_PMTU) {
+ /*
+ * If we need a fragment header in every packet
+ * (above case or multirouting), make sure the
+ * ULP takes it into account when computing the
+ * payload size.
+ */
+ icmp6->icmp6_mtu = htonl(dce->dce_pmtu - sizeof (ip6_frag_t));
+ } else {
+ icmp6->icmp6_mtu = htonl(dce->dce_pmtu);
+ }
+ /* We now have a PMTU for sure */
+ dce->dce_flags |= DCEF_PMTU;
+ dce->dce_last_change_time = TICK_TO_SEC(lbolt64);
+ mutex_exit(&dce->dce_lock);
/*
- * Need to pullup everything in order to use
- * ip_hdr_length_nexthdr_v6()
+ * After dropping the lock the new value is visible to everyone.
+ * Then we bump the generation number so any cached values reinspect
+ * the dce_t.
*/
- if (mp->b_cont != NULL) {
- if (!pullupmsg(mp, -1)) {
- ip1dbg(("icmp_inbound_error_fanout_v6: "
- "pullupmsg failed\n"));
- goto drop_pkt;
- }
- ip6h = (ip6_t *)mp->b_rptr;
- icmp6 = (icmp6_t *)(&mp->b_rptr[hdr_length]);
- }
+ dce_increment_generation(dce);
+ dce_refrele(dce);
+}
- ip6h = (ip6_t *)&icmp6[1]; /* Packet in error */
- if ((uchar_t *)&ip6h[1] > mp->b_wptr)
- goto drop_pkt;
+/*
+ * Fanout received ICMPv6 error packets to the transports.
+ * Assumes the IPv6 plus ICMPv6 headers have been pulled up but nothing else.
+ *
+ * The caller must have called icmp_inbound_verify_v6.
+ */
+void
+icmp_inbound_error_fanout_v6(mblk_t *mp, icmp6_t *icmp6, ip_recv_attr_t *ira)
+{
+ uint16_t *up; /* Pointer to ports in ULP header */
+ uint32_t ports; /* reversed ports for fanout */
+ ip6_t rip6h; /* With reversed addresses */
+ ip6_t *ip6h; /* Inner IP header */
+ uint16_t hdr_length; /* Inner IP header length */
+ uint8_t *nexthdrp;
+ uint8_t nexthdr;
+ tcpha_t *tcpha;
+ conn_t *connp;
+ ill_t *ill = ira->ira_ill; /* Upper in the case of IPMP */
+ ip_stack_t *ipst = ill->ill_ipst;
+ ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec;
+
+ /* Caller has already pulled up everything. */
+ ip6h = (ip6_t *)&icmp6[1];
+ ASSERT(mp->b_cont == NULL);
+ ASSERT((uchar_t *)&ip6h[1] <= mp->b_wptr);
if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &hdr_length, &nexthdrp))
goto drop_pkt;
nexthdr = *nexthdrp;
-
- /* Set message type, must be done after pullups */
- mp->b_datap->db_type = M_CTL;
+ ira->ira_protocol = nexthdr;
/*
* We need a separate IP header with the source and destination
@@ -814,174 +794,128 @@ icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h,
/* Try to pass the ICMP message to clients who need it */
switch (nexthdr) {
case IPPROTO_UDP: {
- /*
- * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
- * UDP header to get the port information.
- */
- if ((uchar_t *)ip6h + hdr_length + ICMP_MIN_TP_HDR_LEN >
- mp->b_wptr) {
- break;
- }
/* Attempt to find a client stream based on port. */
up = (uint16_t *)((uchar_t *)ip6h + hdr_length);
- ((uint16_t *)&ports)[0] = up[1];
- ((uint16_t *)&ports)[1] = up[0];
- ip_fanout_udp_v6(q, first_mp, &rip6h, ports, ill, inill,
- IP6_NO_IPPOLICY, mctl_present, zoneid);
+ /* Note that we send error to all matches. */
+ ira->ira_flags |= IRAF_ICMP_ERROR;
+ ip_fanout_udp_multi_v6(mp, &rip6h, up[0], up[1], ira);
+ ira->ira_flags &= ~IRAF_ICMP_ERROR;
return;
}
case IPPROTO_TCP: {
/*
- * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
- * the TCP header to get the port information.
- */
- if ((uchar_t *)ip6h + hdr_length + ICMP_MIN_TP_HDR_LEN >
- mp->b_wptr) {
- break;
- }
-
- /*
* Attempt to find a client stream based on port.
* Note that we do a reverse lookup since the header is
* in the form we sent it out.
*/
- tcpha = (tcpha_t *)((char *)ip6h + hdr_length);
+ tcpha = (tcpha_t *)((uchar_t *)ip6h + hdr_length);
+ /*
+ * With IPMP we need to match across group, which we do
+ * since we have the upper ill from ira_ill.
+ */
connp = ipcl_tcp_lookup_reversed_ipv6(ip6h, tcpha,
TCPS_LISTEN, ill->ill_phyint->phyint_ifindex, ipst);
if (connp == NULL) {
goto drop_pkt;
}
- SQUEUE_ENTER_ONE(connp->conn_sqp, first_mp, tcp_input, connp,
- SQ_FILL, SQTAG_TCP6_INPUT_ICMP_ERR);
+ if (CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss) ||
+ (ira->ira_flags & IRAF_IPSEC_SECURE)) {
+ mp = ipsec_check_inbound_policy(mp, connp,
+ NULL, ip6h, ira);
+ if (mp == NULL) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ /* Note that mp is NULL */
+ ip_drop_input("ipIfStatsInDiscards", mp, ill);
+ CONN_DEC_REF(connp);
+ return;
+ }
+ }
+
+ ira->ira_flags |= IRAF_ICMP_ERROR;
+ if (IPCL_IS_TCP(connp)) {
+ SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
+ connp->conn_recvicmp, connp, ira, SQ_FILL,
+ SQTAG_TCP6_INPUT_ICMP_ERR);
+ } else {
+ /* Not TCP; must be SOCK_RAW, IPPROTO_TCP */
+ ill_t *rill = ira->ira_rill;
+
+ ira->ira_ill = ira->ira_rill = NULL;
+ (connp->conn_recv)(connp, mp, NULL, ira);
+ CONN_DEC_REF(connp);
+ ira->ira_ill = ill;
+ ira->ira_rill = rill;
+ }
+ ira->ira_flags &= ~IRAF_ICMP_ERROR;
return;
}
case IPPROTO_SCTP:
- /*
- * Verify we have at least ICMP_MIN_SCTP_HDR_LEN bytes of
- * transport header to get the port information.
- */
- if ((uchar_t *)ip6h + hdr_length + ICMP_MIN_SCTP_HDR_LEN >
- mp->b_wptr) {
- if (!pullupmsg(mp, (uchar_t *)ip6h + hdr_length +
- ICMP_MIN_SCTP_HDR_LEN - mp->b_rptr)) {
- goto drop_pkt;
- }
- }
-
up = (uint16_t *)((uchar_t *)ip6h + hdr_length);
+ /* Find a SCTP client stream for this packet. */
((uint16_t *)&ports)[0] = up[1];
((uint16_t *)&ports)[1] = up[0];
- ip_fanout_sctp(first_mp, inill, (ipha_t *)ip6h, ports, 0,
- mctl_present, IP6_NO_IPPOLICY, zoneid);
- return;
- case IPPROTO_ESP:
- case IPPROTO_AH: {
- int ipsec_rc;
- ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec;
- /*
- * We need a IPSEC_IN in the front to fanout to AH/ESP.
- * We will re-use the IPSEC_IN if it is already present as
- * AH/ESP will not affect any fields in the IPSEC_IN for
- * ICMP errors. If there is no IPSEC_IN, allocate a new
- * one and attach it in the front.
- */
- if (ii != NULL) {
- /*
- * ip_fanout_proto_again converts the ICMP errors
- * that come back from AH/ESP to M_DATA so that
- * if it is non-AH/ESP and we do a pullupmsg in
- * this function, it would work. Convert it back
- * to M_CTL before we send up as this is a ICMP
- * error. This could have been generated locally or
- * by some router. Validate the inner IPSEC
- * headers.
- *
- * NOTE : ill_index is used by ip_fanout_proto_again
- * to locate the ill.
- */
- ASSERT(ill != NULL);
- ii->ipsec_in_ill_index =
- ill->ill_phyint->phyint_ifindex;
- ii->ipsec_in_rill_index =
- inill->ill_phyint->phyint_ifindex;
- first_mp->b_cont->b_datap->db_type = M_CTL;
- } else {
- /*
- * IPSEC_IN is not present. We attach a ipsec_in
- * message and send up to IPSEC for validating
- * and removing the IPSEC headers. Clear
- * ipsec_in_secure so that when we return
- * from IPSEC, we don't mistakenly think that this
- * is a secure packet came from the network.
- *
- * NOTE : ill_index is used by ip_fanout_proto_again
- * to locate the ill.
- */
- ASSERT(first_mp == mp);
- first_mp = ipsec_in_alloc(B_FALSE, ipst->ips_netstack);
- ASSERT(ill != NULL);
- if (first_mp == NULL) {
- freemsg(mp);
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- return;
- }
- ii = (ipsec_in_t *)first_mp->b_rptr;
-
- /* This is not a secure packet */
- ii->ipsec_in_secure = B_FALSE;
- first_mp->b_cont = mp;
- mp->b_datap->db_type = M_CTL;
- ii->ipsec_in_ill_index =
- ill->ill_phyint->phyint_ifindex;
- ii->ipsec_in_rill_index =
- inill->ill_phyint->phyint_ifindex;
- }
+ ira->ira_flags |= IRAF_ICMP_ERROR;
+ ip_fanout_sctp(mp, NULL, &rip6h, ports, ira);
+ ira->ira_flags &= ~IRAF_ICMP_ERROR;
+ return;
+ case IPPROTO_ESP:
+ case IPPROTO_AH:
if (!ipsec_loaded(ipss)) {
- ip_proto_not_sup(q, first_mp, 0, zoneid, ipst);
+ ip_proto_not_sup(mp, ira);
return;
}
if (nexthdr == IPPROTO_ESP)
- ipsec_rc = ipsecesp_icmp_error(first_mp);
+ mp = ipsecesp_icmp_error(mp, ira);
else
- ipsec_rc = ipsecah_icmp_error(first_mp);
- if (ipsec_rc == IPSEC_STATUS_FAILED)
+ mp = ipsecah_icmp_error(mp, ira);
+ if (mp == NULL)
return;
- ip_fanout_proto_again(first_mp, ill, inill, NULL);
- return;
- }
- case IPPROTO_ENCAP:
- case IPPROTO_IPV6:
- if ((uint8_t *)ip6h + hdr_length +
- (nexthdr == IPPROTO_ENCAP ? sizeof (ipha_t) :
- sizeof (ip6_t)) > mp->b_wptr) {
+ /* Just in case ipsec didn't preserve the NULL b_cont */
+ if (mp->b_cont != NULL) {
+ if (!pullupmsg(mp, -1))
+ goto drop_pkt;
+ }
+
+ /*
+	 * If successful, the mp has been modified to not include
+ * the ESP/AH header so we can fanout to the ULP's icmp
+ * error handler.
+ */
+ if (mp->b_wptr - mp->b_rptr < IPV6_HDR_LEN)
goto drop_pkt;
+
+ ip6h = (ip6_t *)mp->b_rptr;
+ /* Don't call hdr_length_v6() unless you have to. */
+ if (ip6h->ip6_nxt != IPPROTO_ICMPV6)
+ hdr_length = ip_hdr_length_v6(mp, ip6h);
+ else
+ hdr_length = IPV6_HDR_LEN;
+
+		/* Verify the modified message before any further processing. */
+ icmp6 = (icmp6_t *)(&mp->b_rptr[hdr_length]);
+ if (!icmp_inbound_verify_v6(mp, icmp6, ira)) {
+ freemsg(mp);
+ return;
}
- if (nexthdr == IPPROTO_ENCAP ||
- !IN6_ARE_ADDR_EQUAL(
- &((ip6_t *)(((uint8_t *)ip6h) + hdr_length))->ip6_src,
- &ip6h->ip6_src) ||
- !IN6_ARE_ADDR_EQUAL(
- &((ip6_t *)(((uint8_t *)ip6h) + hdr_length))->ip6_dst,
- &ip6h->ip6_dst)) {
- /*
- * For tunnels that have used IPsec protection,
- * we need to adjust the MTU to take into account
- * the IPsec overhead.
- */
- if (ii != NULL) {
- icmp6->icmp6_mtu = htonl(
- ntohl(icmp6->icmp6_mtu) -
- ipsec_in_extra_length(first_mp));
- }
- } else {
+ icmp_inbound_error_fanout_v6(mp, icmp6, ira);
+ return;
+
+ case IPPROTO_IPV6: {
+ /* Look for self-encapsulated packets that caused an error */
+ ip6_t *in_ip6h;
+
+ in_ip6h = (ip6_t *)((uint8_t *)ip6h + hdr_length);
+
+ if (IN6_ARE_ADDR_EQUAL(&in_ip6h->ip6_src, &ip6h->ip6_src) &&
+ IN6_ARE_ADDR_EQUAL(&in_ip6h->ip6_dst, &ip6h->ip6_dst)) {
/*
* Self-encapsulated case. As in the ipv4 case,
* we need to strip the 2nd IP header. Since mp
@@ -989,126 +923,124 @@ icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h,
* the 3rd header + data over the 2nd header.
*/
uint16_t unused_len;
- ip6_t *inner_ip6h = (ip6_t *)
- ((uchar_t *)ip6h + hdr_length);
/*
* Make sure we don't do recursion more than once.
*/
- if (!ip_hdr_length_nexthdr_v6(mp, inner_ip6h,
+ if (!ip_hdr_length_nexthdr_v6(mp, in_ip6h,
&unused_len, &nexthdrp) ||
*nexthdrp == IPPROTO_IPV6) {
goto drop_pkt;
}
/*
- * We are about to modify the packet. Make a copy if
- * someone else has a reference to it.
- */
- if (DB_REF(mp) > 1) {
- mblk_t *mp1;
- uint16_t icmp6_offset;
-
- mp1 = copymsg(mp);
- if (mp1 == NULL) {
- goto drop_pkt;
- }
- icmp6_offset = (uint16_t)
- ((uchar_t *)icmp6 - mp->b_rptr);
- freemsg(mp);
- mp = mp1;
-
- icmp6 = (icmp6_t *)(mp->b_rptr + icmp6_offset);
- ip6h = (ip6_t *)&icmp6[1];
- inner_ip6h = (ip6_t *)
- ((uchar_t *)ip6h + hdr_length);
-
- if (mctl_present)
- first_mp->b_cont = mp;
- else
- first_mp = mp;
- }
-
- /*
- * Need to set db_type back to M_DATA before
- * refeeding mp into this function.
- */
- DB_TYPE(mp) = M_DATA;
-
- /*
* Copy the 3rd header + remaining data on top
* of the 2nd header.
*/
- bcopy(inner_ip6h, ip6h,
- mp->b_wptr - (uchar_t *)inner_ip6h);
+ bcopy(in_ip6h, ip6h, mp->b_wptr - (uchar_t *)in_ip6h);
/*
* Subtract length of the 2nd header.
*/
mp->b_wptr -= hdr_length;
+ ip6h = (ip6_t *)mp->b_rptr;
+ /* Don't call hdr_length_v6() unless you have to. */
+ if (ip6h->ip6_nxt != IPPROTO_ICMPV6)
+ hdr_length = ip_hdr_length_v6(mp, ip6h);
+ else
+ hdr_length = IPV6_HDR_LEN;
+
+ /*
+ * Verify the modified message before any further
+			 * processing.
+ */
+ icmp6 = (icmp6_t *)(&mp->b_rptr[hdr_length]);
+ if (!icmp_inbound_verify_v6(mp, icmp6, ira)) {
+ freemsg(mp);
+ return;
+ }
+
/*
* Now recurse, and see what I _really_ should be
* doing here.
*/
- icmp_inbound_error_fanout_v6(q, first_mp,
- (ip6_t *)mp->b_rptr, icmp6, ill, inill,
- mctl_present, zoneid);
+ icmp_inbound_error_fanout_v6(mp, icmp6, ira);
return;
}
- if (icmp_inbound_iptun_fanout_v6(first_mp, &rip6h, ill, ipst))
+ /* FALLTHRU */
+ }
+ case IPPROTO_ENCAP:
+ if ((connp = ipcl_iptun_classify_v6(&rip6h.ip6_src,
+ &rip6h.ip6_dst, ipst)) != NULL) {
+ ira->ira_flags |= IRAF_ICMP_ERROR;
+ connp->conn_recvicmp(connp, mp, NULL, ira);
+ CONN_DEC_REF(connp);
+ ira->ira_flags &= ~IRAF_ICMP_ERROR;
return;
+ }
/*
- * No IP tunnel is associated with this error. Perhaps a raw
- * socket will want it.
+ * No IP tunnel is interested, fallthrough and see
+ * if a raw socket will want it.
*/
/* FALLTHRU */
default:
- ip_fanout_proto_v6(q, first_mp, &rip6h, ill, inill, nexthdr, 0,
- IP6_NO_IPPOLICY, mctl_present, zoneid);
+ ira->ira_flags |= IRAF_ICMP_ERROR;
+ ASSERT(ira->ira_protocol == nexthdr);
+ ip_fanout_proto_v6(mp, &rip6h, ira);
+ ira->ira_flags &= ~IRAF_ICMP_ERROR;
return;
}
/* NOTREACHED */
drop_pkt:
BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
ip1dbg(("icmp_inbound_error_fanout_v6: drop pkt\n"));
- freemsg(first_mp);
+ freemsg(mp);
}
/*
* Process received IPv6 ICMP Redirect messages.
+ * Assumes the caller has verified that the headers are in the pulled up mblk.
+ * Consumes mp.
*/
/* ARGSUSED */
static void
-icmp_redirect_v6(queue_t *q, mblk_t *mp, ill_t *ill)
+icmp_redirect_v6(mblk_t *mp, ip6_t *ip6h, nd_redirect_t *rd,
+ ip_recv_attr_t *ira)
{
- ip6_t *ip6h;
- uint16_t hdr_length;
- nd_redirect_t *rd;
- ire_t *ire;
- ire_t *prev_ire;
+ ire_t *ire, *nire;
+ ire_t *prev_ire = NULL;
ire_t *redir_ire;
in6_addr_t *src, *dst, *gateway;
nd_opt_hdr_t *opt;
nce_t *nce;
- int nce_flags = 0;
+ int ncec_flags = 0;
int err = 0;
boolean_t redirect_to_router = B_FALSE;
int len;
int optlen;
- iulp_t ulp_info = { 0 };
- ill_t *prev_ire_ill;
- ipif_t *ipif;
+ ill_t *ill = ira->ira_rill;
+ ill_t *rill = ira->ira_rill;
ip_stack_t *ipst = ill->ill_ipst;
- ip6h = (ip6_t *)mp->b_rptr;
- if (ip6h->ip6_nxt != IPPROTO_ICMPV6)
- hdr_length = ip_hdr_length_v6(mp, ip6h);
- else
- hdr_length = IPV6_HDR_LEN;
+ /*
+ * Since ira_ill is where the IRE_LOCAL was hosted we use ira_rill
+	 * and make it be the IPMP upper so as to avoid being confused by a packet
+ * addressed to a unicast address on a different ill.
+ */
+ if (IS_UNDER_IPMP(rill)) {
+ rill = ipmp_ill_hold_ipmp_ill(rill);
+ if (rill == NULL) {
+ BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInBadRedirects);
+ ip_drop_input("ipv6IfIcmpInBadRedirects - IPMP ill",
+ mp, ill);
+ freemsg(mp);
+ return;
+ }
+ ASSERT(rill != ira->ira_rill);
+ }
- rd = (nd_redirect_t *)&mp->b_rptr[hdr_length];
- len = mp->b_wptr - mp->b_rptr - hdr_length;
+ len = mp->b_wptr - (uchar_t *)rd;
src = &ip6h->ip6_src;
dst = &rd->nd_rd_dst;
gateway = &rd->nd_rd_target;
@@ -1121,37 +1053,35 @@ icmp_redirect_v6(queue_t *q, mblk_t *mp, ill_t *ill)
(IN6_IS_ADDR_V4MAPPED(dst)) ||
(IN6_IS_ADDR_MULTICAST(dst))) {
BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInBadRedirects);
- freemsg(mp);
- return;
+ ip_drop_input("ipv6IfIcmpInBadRedirects - addr/len", mp, ill);
+ goto fail_redirect;
}
if (!(IN6_IS_ADDR_LINKLOCAL(gateway) ||
IN6_ARE_ADDR_EQUAL(gateway, dst))) {
BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInBadRedirects);
- freemsg(mp);
- return;
+ ip_drop_input("ipv6IfIcmpInBadRedirects - bad gateway",
+ mp, ill);
+ goto fail_redirect;
}
- if (len > sizeof (nd_redirect_t)) {
- if (!ndp_verify_optlen((nd_opt_hdr_t *)&rd[1],
- len - sizeof (nd_redirect_t))) {
+ optlen = len - sizeof (nd_redirect_t);
+ if (optlen != 0) {
+ if (!ndp_verify_optlen((nd_opt_hdr_t *)&rd[1], optlen)) {
BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInBadRedirects);
- freemsg(mp);
- return;
+ ip_drop_input("ipv6IfIcmpInBadRedirects - options",
+ mp, ill);
+ goto fail_redirect;
}
}
if (!IN6_ARE_ADDR_EQUAL(gateway, dst)) {
redirect_to_router = B_TRUE;
- nce_flags |= NCE_F_ISROUTER;
+ ncec_flags |= NCE_F_ISROUTER;
+ } else {
+ gateway = dst; /* Add nce for dst */
}
- /* ipif will be refreleased afterwards */
- ipif = ipif_get_next_ipif(NULL, ill);
- if (ipif == NULL) {
- freemsg(mp);
- return;
- }
/*
* Verify that the IP source address of the redirect is
@@ -1160,10 +1090,11 @@ icmp_redirect_v6(queue_t *q, mblk_t *mp, ill_t *ill)
* Also, Make sure we had a route for the dest in question and
* that route was pointing to the old gateway (the source of the
* redirect packet.)
+ * Note: this merely says that there is some IRE which matches that
+ * gateway; not that the longest match matches that gateway.
*/
-
- prev_ire = ire_route_lookup_v6(dst, 0, src, 0, ipif, NULL, ALL_ZONES,
- NULL, MATCH_IRE_GW | MATCH_IRE_ILL | MATCH_IRE_DEFAULT, ipst);
+ prev_ire = ire_ftable_lookup_v6(dst, 0, src, 0, rill,
+ ALL_ZONES, NULL, MATCH_IRE_GW | MATCH_IRE_ILL, 0, ipst, NULL);
/*
* Check that
@@ -1171,92 +1102,44 @@ icmp_redirect_v6(queue_t *q, mblk_t *mp, ill_t *ill)
* old gateway is still directly reachable
*/
if (prev_ire == NULL ||
- prev_ire->ire_type == IRE_LOCAL) {
+ (prev_ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK)) ||
+ (prev_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInBadRedirects);
- ipif_refrele(ipif);
+ ip_drop_input("ipv6IfIcmpInBadRedirects - ire", mp, ill);
goto fail_redirect;
}
- prev_ire_ill = ire_to_ill(prev_ire);
- ASSERT(prev_ire_ill != NULL);
- if (prev_ire_ill->ill_flags & ILLF_NONUD)
- nce_flags |= NCE_F_NONUD;
-
- /*
- * Should we use the old ULP info to create the new gateway? From
- * a user's perspective, we should inherit the info so that it
- * is a "smooth" transition. If we do not do that, then new
- * connections going thru the new gateway will have no route metrics,
- * which is counter-intuitive to user. From a network point of
- * view, this may or may not make sense even though the new gateway
- * is still directly connected to us so the route metrics should not
- * change much.
- *
- * But if the old ire_uinfo is not initialized, we do another
- * recursive lookup on the dest using the new gateway. There may
- * be a route to that. If so, use it to initialize the redirect
- * route.
- */
- if (prev_ire->ire_uinfo.iulp_set) {
- bcopy(&prev_ire->ire_uinfo, &ulp_info, sizeof (iulp_t));
- } else if (redirect_to_router) {
- /*
- * Only do the following if the redirection is really to
- * a router.
- */
- ire_t *tmp_ire;
- ire_t *sire;
- tmp_ire = ire_ftable_lookup_v6(dst, 0, gateway, 0, NULL, &sire,
- ALL_ZONES, 0, NULL,
- (MATCH_IRE_RECURSIVE | MATCH_IRE_GW | MATCH_IRE_DEFAULT),
- ipst);
- if (sire != NULL) {
- bcopy(&sire->ire_uinfo, &ulp_info, sizeof (iulp_t));
- ASSERT(tmp_ire != NULL);
- ire_refrele(tmp_ire);
- ire_refrele(sire);
- } else if (tmp_ire != NULL) {
- bcopy(&tmp_ire->ire_uinfo, &ulp_info,
- sizeof (iulp_t));
- ire_refrele(tmp_ire);
- }
- }
+ ASSERT(prev_ire->ire_ill != NULL);
+ if (prev_ire->ire_ill->ill_flags & ILLF_NONUD)
+ ncec_flags |= NCE_F_NONUD;
- optlen = mp->b_wptr - mp->b_rptr - hdr_length - sizeof (nd_redirect_t);
opt = (nd_opt_hdr_t *)&rd[1];
opt = ndp_get_option(opt, optlen, ND_OPT_TARGET_LINKADDR);
if (opt != NULL) {
- err = ndp_lookup_then_add_v6(ill,
- B_FALSE, /* don't match across illgrp */
+ err = nce_lookup_then_add_v6(rill,
(uchar_t *)&opt[1], /* Link layer address */
- gateway,
- &ipv6_all_ones, /* prefix mask */
- &ipv6_all_zeros, /* Mapping mask */
- 0,
- nce_flags,
- ND_STALE,
- &nce);
+ rill->ill_phys_addr_length,
+ gateway, ncec_flags, ND_STALE, &nce);
switch (err) {
case 0:
- NCE_REFRELE(nce);
+ nce_refrele(nce);
break;
case EEXIST:
/*
* Check to see if link layer address has changed and
- * process the nce_state accordingly.
+ * process the ncec_state accordingly.
*/
- ndp_process(nce, (uchar_t *)&opt[1], 0, B_FALSE);
- NCE_REFRELE(nce);
+ nce_process(nce->nce_common,
+ (uchar_t *)&opt[1], 0, B_FALSE);
+ nce_refrele(nce);
break;
default:
ip1dbg(("icmp_redirect_v6: NCE create failed %d\n",
err));
- ipif_refrele(ipif);
goto fail_redirect;
}
}
if (redirect_to_router) {
- /* icmp_redirect_ok_v6() must have already verified this */
ASSERT(IN6_IS_ADDR_LINKLOCAL(gateway));
/*
@@ -1266,65 +1149,68 @@ icmp_redirect_v6(queue_t *q, mblk_t *mp, ill_t *ill)
ire = ire_create_v6(
dst,
&ipv6_all_ones, /* mask */
- &prev_ire->ire_src_addr_v6, /* source addr */
gateway, /* gateway addr */
- &prev_ire->ire_max_frag, /* max frag */
- NULL, /* no src nce */
- NULL, /* no rfq */
- NULL, /* no stq */
IRE_HOST,
- prev_ire->ire_ipif,
- NULL,
- 0,
- 0,
+ prev_ire->ire_ill,
+ ALL_ZONES,
(RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST),
- &ulp_info,
- NULL,
NULL,
ipst);
} else {
- queue_t *stq;
-
- stq = (ipif->ipif_net_type == IRE_IF_RESOLVER)
- ? ipif->ipif_rq : ipif->ipif_wq;
+ ipif_t *ipif;
+ in6_addr_t gw;
/*
* Just create an on link entry, i.e. interface route.
+ * The gateway field is our link-local on the ill.
*/
+ mutex_enter(&rill->ill_lock);
+ for (ipif = rill->ill_ipif; ipif != NULL;
+ ipif = ipif->ipif_next) {
+ if (!(ipif->ipif_state_flags & IPIF_CONDEMNED) &&
+ IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6lcl_addr))
+ break;
+ }
+ if (ipif == NULL) {
+ /* We have no link-local address! */
+ mutex_exit(&rill->ill_lock);
+ goto fail_redirect;
+ }
+ gw = ipif->ipif_v6lcl_addr;
+ mutex_exit(&rill->ill_lock);
+
ire = ire_create_v6(
dst, /* gateway == dst */
&ipv6_all_ones, /* mask */
- &prev_ire->ire_src_addr_v6, /* source addr */
- &ipv6_all_zeros, /* gateway addr */
- &prev_ire->ire_max_frag, /* max frag */
- NULL, /* no src nce */
- NULL, /* ire rfq */
- stq, /* ire stq */
- ipif->ipif_net_type, /* IF_[NO]RESOLVER */
- prev_ire->ire_ipif,
- &ipv6_all_ones,
- 0,
- 0,
+ &gw, /* gateway addr */
+ rill->ill_net_type, /* IF_[NO]RESOLVER */
+ prev_ire->ire_ill,
+ ALL_ZONES,
(RTF_DYNAMIC | RTF_HOST),
- &ulp_info,
- NULL,
NULL,
ipst);
}
- /* Release reference from earlier ipif_get_next_ipif() */
- ipif_refrele(ipif);
-
if (ire == NULL)
goto fail_redirect;
- if (ire_add(&ire, NULL, NULL, NULL, B_FALSE) == 0) {
+ nire = ire_add(ire);
+ /* Check if it was a duplicate entry */
+ if (nire != NULL && nire != ire) {
+ ASSERT(nire->ire_identical_ref > 1);
+ ire_delete(nire);
+ ire_refrele(nire);
+ nire = NULL;
+ }
+ ire = nire;
+ if (ire != NULL) {
+ ire_refrele(ire); /* Held in ire_add */
/* tell routing sockets that we received a redirect */
ip_rts_change_v6(RTM_REDIRECT,
&rd->nd_rd_dst,
&rd->nd_rd_target,
- &ipv6_all_ones, 0, &ire->ire_src_addr_v6,
+ &ipv6_all_ones, 0, src,
(RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST), 0,
(RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_AUTHOR), ipst);
@@ -1334,10 +1220,9 @@ icmp_redirect_v6(queue_t *q, mblk_t *mp, ill_t *ill)
* modifying an existing redirect.
*/
redir_ire = ire_ftable_lookup_v6(dst, 0, src, IRE_HOST,
- ire->ire_ipif, NULL, ALL_ZONES, 0, NULL,
- (MATCH_IRE_GW | MATCH_IRE_TYPE | MATCH_IRE_ILL), ipst);
-
- ire_refrele(ire); /* Held in ire_add_v6 */
+ prev_ire->ire_ill, ALL_ZONES, NULL,
+ (MATCH_IRE_GW | MATCH_IRE_TYPE | MATCH_IRE_ILL), 0, ipst,
+ NULL);
if (redir_ire != NULL) {
if (redir_ire->ire_flags & RTF_DYNAMIC)
@@ -1346,8 +1231,6 @@ icmp_redirect_v6(queue_t *q, mblk_t *mp, ill_t *ill)
}
}
- if (prev_ire->ire_type == IRE_CACHE)
- ire_delete(prev_ire);
ire_refrele(prev_ire);
prev_ire = NULL;
@@ -1355,101 +1238,8 @@ fail_redirect:
if (prev_ire != NULL)
ire_refrele(prev_ire);
freemsg(mp);
-}
-
-static ill_t *
-ip_queue_to_ill_v6(queue_t *q, ip_stack_t *ipst)
-{
- ill_t *ill;
-
- ASSERT(WR(q) == q);
-
- if (q->q_next != NULL) {
- ill = (ill_t *)q->q_ptr;
- if (ILL_CAN_LOOKUP(ill))
- ill_refhold(ill);
- else
- ill = NULL;
- } else {
- ill = ill_lookup_on_name(ipif_loopback_name, B_FALSE, B_TRUE,
- NULL, NULL, NULL, NULL, NULL, ipst);
- }
- if (ill == NULL)
- ip0dbg(("ip_queue_to_ill_v6: no ill\n"));
- return (ill);
-}
-
-/*
- * Assigns an appropriate source address to the packet.
- * If origdst is one of our IP addresses that use it as the source.
- * If the queue is an ill queue then select a source from that ill.
- * Otherwise pick a source based on a route lookup back to the origsrc.
- *
- * src is the return parameter. Returns a pointer to src or NULL if failure.
- */
-static in6_addr_t *
-icmp_pick_source_v6(queue_t *wq, in6_addr_t *origsrc, in6_addr_t *origdst,
- in6_addr_t *src, zoneid_t zoneid, ip_stack_t *ipst)
-{
- ill_t *ill;
- ire_t *ire;
- ipif_t *ipif;
-
- ASSERT(!(wq->q_flag & QREADR));
- if (wq->q_next != NULL) {
- ill = (ill_t *)wq->q_ptr;
- } else {
- ill = NULL;
- }
-
- ire = ire_route_lookup_v6(origdst, 0, 0, (IRE_LOCAL|IRE_LOOPBACK),
- NULL, NULL, zoneid, NULL, (MATCH_IRE_TYPE|MATCH_IRE_ZONEONLY),
- ipst);
- if (ire != NULL) {
- /* Destined to one of our addresses */
- *src = *origdst;
- ire_refrele(ire);
- return (src);
- }
- if (ire != NULL) {
- ire_refrele(ire);
- ire = NULL;
- }
- if (ill == NULL) {
- /* What is the route back to the original source? */
- ire = ire_route_lookup_v6(origsrc, 0, 0, 0,
- NULL, NULL, zoneid, NULL,
- (MATCH_IRE_DEFAULT|MATCH_IRE_RECURSIVE), ipst);
- if (ire == NULL) {
- BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutNoRoutes);
- return (NULL);
- }
- ASSERT(ire->ire_ipif != NULL);
- ill = ire->ire_ipif->ipif_ill;
- ire_refrele(ire);
- }
- ipif = ipif_select_source_v6(ill, origsrc, B_FALSE,
- IPV6_PREFER_SRC_DEFAULT, zoneid);
- if (ipif != NULL) {
- *src = ipif->ipif_v6src_addr;
- ipif_refrele(ipif);
- return (src);
- }
- /*
- * Unusual case - can't find a usable source address to reach the
- * original source. Use what in the route to the source.
- */
- ire = ire_route_lookup_v6(origsrc, 0, 0, 0,
- NULL, NULL, zoneid, NULL,
- (MATCH_IRE_DEFAULT|MATCH_IRE_RECURSIVE), ipst);
- if (ire == NULL) {
- BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutNoRoutes);
- return (NULL);
- }
- ASSERT(ire != NULL);
- *src = ire->ire_src_addr_v6;
- ire_refrele(ire);
- return (src);
+ if (rill != ira->ira_rill)
+ ill_refrele(rill);
}
/*
@@ -1459,17 +1249,12 @@ icmp_pick_source_v6(queue_t *wq, in6_addr_t *origsrc, in6_addr_t *origdst,
* Note: assumes that icmp_pkt_err_ok_v6 has been called to
* verify that an icmp error packet can be sent.
*
- * If q is an ill write side queue (which is the case when packets
- * arrive from ip_rput) then ip_wput code will ensure that packets to
- * link-local destinations are sent out that ill.
- *
* If v6src_ptr is set use it as a source. Otherwise select a reasonable
* source address (see above function).
*/
static void
-icmp_pkt_v6(queue_t *q, mblk_t *mp, void *stuff, size_t len,
- const in6_addr_t *v6src_ptr, boolean_t mctl_present, zoneid_t zoneid,
- ip_stack_t *ipst)
+icmp_pkt_v6(mblk_t *mp, void *stuff, size_t len,
+ const in6_addr_t *v6src_ptr, ip_recv_attr_t *ira)
{
ip6_t *ip6h;
in6_addr_t v6dst;
@@ -1477,98 +1262,82 @@ icmp_pkt_v6(queue_t *q, mblk_t *mp, void *stuff, size_t len,
size_t msg_len;
mblk_t *mp1;
icmp6_t *icmp6;
- ill_t *ill;
in6_addr_t v6src;
- mblk_t *ipsec_mp;
- ipsec_out_t *io;
+ ill_t *ill = ira->ira_ill;
+ ip_stack_t *ipst = ill->ill_ipst;
+ ip_xmit_attr_t ixas;
- ill = ip_queue_to_ill_v6(q, ipst);
- if (ill == NULL) {
- freemsg(mp);
- return;
+ ip6h = (ip6_t *)mp->b_rptr;
+
+ bzero(&ixas, sizeof (ixas));
+ ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6;
+ ixas.ixa_zoneid = ira->ira_zoneid;
+ ixas.ixa_ifindex = 0;
+ ixas.ixa_ipst = ipst;
+ ixas.ixa_cred = kcred;
+ ixas.ixa_cpid = NOPID;
+ ixas.ixa_tsl = ira->ira_tsl; /* Behave as a multi-level responder */
+ ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
+
+ /*
+ * If the source of the original packet was link-local, then
+ * make sure we send on the same ill (group) as we received it on.
+ */
+ if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) {
+ ixas.ixa_flags |= IXAF_SCOPEID_SET;
+ if (IS_UNDER_IPMP(ill))
+ ixas.ixa_scopeid = ill_get_upper_ifindex(ill);
+ else
+ ixas.ixa_scopeid = ill->ill_phyint->phyint_ifindex;
}
- if (mctl_present) {
+ if (ira->ira_flags & IRAF_IPSEC_SECURE) {
/*
- * If it is :
- *
- * 1) a IPSEC_OUT, then this is caused by outbound
- * datagram originating on this host. IPSEC processing
- * may or may not have been done. Refer to comments above
- * icmp_inbound_error_fanout for details.
+ * Apply IPsec based on how IPsec was applied to
+ * the packet that had the error.
*
- * 2) a IPSEC_IN if we are generating a icmp_message
- * for an incoming datagram destined for us i.e called
- * from ip_fanout_send_icmp.
+ * If it was an outbound packet that caused the ICMP
+ * error, then the caller will have setup the IRA
+ * appropriately.
*/
- ipsec_info_t *in;
-
- ipsec_mp = mp;
- mp = ipsec_mp->b_cont;
-
- in = (ipsec_info_t *)ipsec_mp->b_rptr;
- ip6h = (ip6_t *)mp->b_rptr;
-
- ASSERT(in->ipsec_info_type == IPSEC_OUT ||
- in->ipsec_info_type == IPSEC_IN);
-
- if (in->ipsec_info_type == IPSEC_IN) {
- /*
- * Convert the IPSEC_IN to IPSEC_OUT.
- */
- if (!ipsec_in_to_out(ipsec_mp, NULL, ip6h, zoneid)) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- ill_refrele(ill);
- return;
- }
- } else {
- ASSERT(in->ipsec_info_type == IPSEC_OUT);
- io = (ipsec_out_t *)in;
- /*
- * Clear out ipsec_out_proc_begin, so we do a fresh
- * ire lookup.
- */
- io->ipsec_out_proc_begin = B_FALSE;
+ if (!ipsec_in_to_out(ira, &ixas, mp, NULL, ip6h)) {
+ BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
+ /* Note: mp already consumed and ip_drop_packet done */
+ return;
}
} else {
/*
* This is in clear. The icmp message we are building
- * here should go out in clear.
- */
- ipsec_in_t *ii;
- ASSERT(mp->b_datap->db_type == M_DATA);
- ipsec_mp = ipsec_in_alloc(B_FALSE, ipst->ips_netstack);
- if (ipsec_mp == NULL) {
- freemsg(mp);
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- ill_refrele(ill);
- return;
- }
- ii = (ipsec_in_t *)ipsec_mp->b_rptr;
-
- /* This is not a secure packet */
- ii->ipsec_in_secure = B_FALSE;
- ipsec_mp->b_cont = mp;
- ip6h = (ip6_t *)mp->b_rptr;
- /*
- * Convert the IPSEC_IN to IPSEC_OUT.
+ * here should go out in clear, independent of our policy.
*/
- if (!ipsec_in_to_out(ipsec_mp, NULL, ip6h, zoneid)) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- ill_refrele(ill);
- return;
- }
+ ixas.ixa_flags |= IXAF_NO_IPSEC;
}
- io = (ipsec_out_t *)ipsec_mp->b_rptr;
+ /*
+ * If the caller specified the source we use that.
+ * Otherwise, if the packet was for one of our unicast addresses, make
+ * sure we respond with that as the source. Otherwise
+ * have ip_output_simple pick the source address.
+ */
if (v6src_ptr != NULL) {
v6src = *v6src_ptr;
} else {
- if (icmp_pick_source_v6(q, &ip6h->ip6_src, &ip6h->ip6_dst,
- &v6src, zoneid, ipst) == NULL) {
- freemsg(ipsec_mp);
- ill_refrele(ill);
- return;
+ ire_t *ire;
+ uint_t match_flags = MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY;
+
+ if (IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src) ||
+ IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_dst))
+ match_flags |= MATCH_IRE_ILL;
+
+ ire = ire_ftable_lookup_v6(&ip6h->ip6_dst, 0, 0,
+ (IRE_LOCAL|IRE_LOOPBACK), ill, ira->ira_zoneid, NULL,
+ match_flags, 0, ipst, NULL);
+ if (ire != NULL) {
+ v6src = ip6h->ip6_dst;
+ ire_refrele(ire);
+ } else {
+ v6src = ipv6_all_zeros;
+ ixas.ixa_flags |= IXAF_SET_SOURCE;
}
}
v6dst = ip6h->ip6_src;
@@ -1577,34 +1346,28 @@ icmp_pkt_v6(queue_t *q, mblk_t *mp, void *stuff, size_t len,
if (msg_len > len_needed) {
if (!adjmsg(mp, len_needed - msg_len)) {
BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutErrors);
- freemsg(ipsec_mp);
- ill_refrele(ill);
+ freemsg(mp);
return;
}
msg_len = len_needed;
}
- mp1 = allocb_tmpl(IPV6_HDR_LEN + len, mp);
+ mp1 = allocb(IPV6_HDR_LEN + len, BPRI_MED);
if (mp1 == NULL) {
BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutErrors);
- freemsg(ipsec_mp);
- ill_refrele(ill);
+ freemsg(mp);
return;
}
- ill_refrele(ill);
mp1->b_cont = mp;
mp = mp1;
- ASSERT(ipsec_mp->b_datap->db_type == M_CTL &&
- io->ipsec_out_type == IPSEC_OUT);
- ipsec_mp->b_cont = mp;
/*
- * Set ipsec_out_icmp_loopback so we can let the ICMP messages this
+ * Set IXAF_TRUSTED_ICMP so we can let the ICMP messages this
* node generates be accepted in peace by all on-host destinations.
* If we do NOT assume that all on-host destinations trust
- * self-generated ICMP messages, then rework here, ip.c, and spd.c.
- * (Look for ipsec_out_icmp_loopback).
+ * self-generated ICMP messages, then rework here, ip6.c, and spd.c.
+ * (Look for IXAF_TRUSTED_ICMP).
*/
- io->ipsec_out_icmp_loopback = B_TRUE;
+ ixas.ixa_flags |= IXAF_TRUSTED_ICMP;
ip6h = (ip6_t *)mp->b_rptr;
mp1->b_wptr = (uchar_t *)ip6h + (IPV6_HDR_LEN + len);
@@ -1624,20 +1387,21 @@ icmp_pkt_v6(queue_t *q, mblk_t *mp, void *stuff, size_t len,
bcopy(stuff, (char *)icmp6, len);
/*
* Prepare for checksum by putting icmp length in the icmp
- * checksum field. The checksum is calculated in ip_wput_v6.
+ * checksum field. The checksum is calculated in ip_output_wire_v6.
*/
icmp6->icmp6_cksum = ip6h->ip6_plen;
if (icmp6->icmp6_type == ND_REDIRECT) {
ip6h->ip6_hops = IPV6_MAX_HOPS;
}
- /* Send to V6 writeside put routine */
- put(q, ipsec_mp);
+
+ (void) ip_output_simple(mp, &ixas);
+ ixa_cleanup(&ixas);
}
/*
* Update the output mib when ICMPv6 packets are sent.
*/
-static void
+void
icmp_update_out_mib_v6(ill_t *ill, icmp6_t *icmp6)
{
BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutMsgs);
@@ -1712,14 +1476,19 @@ icmp_update_out_mib_v6(ill_t *ill, icmp6_t *icmp6)
* ICMP error packet should be sent.
*/
static mblk_t *
-icmp_pkt_err_ok_v6(queue_t *q, mblk_t *mp,
- boolean_t llbcast, boolean_t mcast_ok, ip_stack_t *ipst)
+icmp_pkt_err_ok_v6(mblk_t *mp, boolean_t mcast_ok, ip_recv_attr_t *ira)
{
- ip6_t *ip6h;
+ ill_t *ill = ira->ira_ill;
+ ip_stack_t *ipst = ill->ill_ipst;
+ boolean_t llbcast;
+ ip6_t *ip6h;
if (!mp)
return (NULL);
+ /* We view multicast and broadcast as the same.. */
+ llbcast = (ira->ira_flags &
+ (IRAF_L2DST_MULTICAST|IRAF_L2DST_BROADCAST)) != 0;
ip6h = (ip6_t *)mp->b_rptr;
/* Check if source address uniquely identifies the host */
@@ -1737,17 +1506,8 @@ icmp_pkt_err_ok_v6(queue_t *q, mblk_t *mp,
if (mp->b_wptr - mp->b_rptr < len_needed) {
if (!pullupmsg(mp, len_needed)) {
- ill_t *ill;
-
- ill = ip_queue_to_ill_v6(q, ipst);
- if (ill == NULL) {
- BUMP_MIB(&ipst->ips_icmp6_mib,
- ipv6IfIcmpInErrors);
- } else {
- BUMP_MIB(ill->ill_icmp6_mib,
- ipv6IfIcmpInErrors);
- ill_refrele(ill);
- }
+ BUMP_MIB(ill->ill_icmp6_mib,
+ ipv6IfIcmpInErrors);
freemsg(mp);
return (NULL);
}
@@ -1771,6 +1531,16 @@ icmp_pkt_err_ok_v6(queue_t *q, mblk_t *mp,
freemsg(mp);
return (NULL);
}
+ /*
+ * If this is a labeled system, then check to see if we're allowed to
+ * send a response to this particular sender. If not, then just drop.
+ */
+ if (is_system_labeled() && !tsol_can_reply_error(mp, ira)) {
+ BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutErrors);
+ freemsg(mp);
+ return (NULL);
+ }
+
if (icmp_err_rate_limit(ipst)) {
/*
* Only send ICMP error packets every so often.
@@ -1784,37 +1554,117 @@ icmp_pkt_err_ok_v6(queue_t *q, mblk_t *mp,
}
/*
+ * Called when a packet was sent out the same link that it arrived on.
+ * Check if it is ok to send a redirect and then send it.
+ */
+void
+ip_send_potential_redirect_v6(mblk_t *mp, ip6_t *ip6h, ire_t *ire,
+ ip_recv_attr_t *ira)
+{
+ ill_t *ill = ira->ira_ill;
+ ip_stack_t *ipst = ill->ill_ipst;
+ in6_addr_t *v6targ;
+ ire_t *src_ire_v6 = NULL;
+ mblk_t *mp1;
+ ire_t *nhop_ire = NULL;
+
+ /*
+ * Don't send a redirect when forwarding a source
+ * routed packet.
+ */
+ if (ip_source_routed_v6(ip6h, mp, ipst))
+ return;
+
+ if (ire->ire_type & IRE_ONLINK) {
+ /* Target is directly connected */
+ v6targ = &ip6h->ip6_dst;
+ } else {
+ /* Determine the most specific IRE used to send the packets */
+ nhop_ire = ire_nexthop(ire);
+ if (nhop_ire == NULL)
+ return;
+
+ /*
+ * We won't send redirects to a router
+ * that doesn't have a link local
+ * address, but will forward.
+ */
+ if (!IN6_IS_ADDR_LINKLOCAL(&nhop_ire->ire_addr_v6)) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors);
+ ip_drop_input("ipIfStatsInAddrErrors", mp, ill);
+ ire_refrele(nhop_ire);
+ return;
+ }
+ v6targ = &nhop_ire->ire_addr_v6;
+ }
+ src_ire_v6 = ire_ftable_lookup_v6(&ip6h->ip6_src,
+ NULL, NULL, IRE_INTERFACE, ire->ire_ill, ALL_ZONES, NULL,
+ MATCH_IRE_ILL | MATCH_IRE_TYPE, 0, ipst, NULL);
+
+ if (src_ire_v6 == NULL) {
+ if (nhop_ire != NULL)
+ ire_refrele(nhop_ire);
+ return;
+ }
+
+ /*
+ * The source is directly connected.
+ */
+ mp1 = copymsg(mp);
+ if (mp1 != NULL)
+ icmp_send_redirect_v6(mp1, v6targ, &ip6h->ip6_dst, ira);
+
+ if (nhop_ire != NULL)
+ ire_refrele(nhop_ire);
+ ire_refrele(src_ire_v6);
+}
+
+/*
* Generate an ICMPv6 redirect message.
* Include target link layer address option if it exits.
* Always include redirect header.
*/
static void
-icmp_send_redirect_v6(queue_t *q, mblk_t *mp, in6_addr_t *targetp,
- in6_addr_t *dest, ill_t *ill, boolean_t llbcast)
+icmp_send_redirect_v6(mblk_t *mp, in6_addr_t *targetp, in6_addr_t *dest,
+ ip_recv_attr_t *ira)
{
nd_redirect_t *rd;
nd_opt_rd_hdr_t *rdh;
uchar_t *buf;
- nce_t *nce = NULL;
+ ncec_t *ncec = NULL;
nd_opt_hdr_t *opt;
int len;
int ll_opt_len = 0;
int max_redir_hdr_data_len;
int pkt_len;
in6_addr_t *srcp;
- ip_stack_t *ipst = ill->ill_ipst;
-
- /*
- * We are called from ip_rput where we could
- * not have attached an IPSEC_IN.
- */
- ASSERT(mp->b_datap->db_type == M_DATA);
+ ill_t *ill;
+ boolean_t need_refrele;
+ ip_stack_t *ipst = ira->ira_ill->ill_ipst;
- mp = icmp_pkt_err_ok_v6(q, mp, llbcast, B_FALSE, ipst);
+ mp = icmp_pkt_err_ok_v6(mp, B_FALSE, ira);
if (mp == NULL)
return;
- nce = ndp_lookup_v6(ill, B_TRUE, targetp, B_FALSE);
- if (nce != NULL && nce->nce_state != ND_INCOMPLETE) {
+
+ if (IS_UNDER_IPMP(ira->ira_ill)) {
+ ill = ipmp_ill_hold_ipmp_ill(ira->ira_ill);
+ if (ill == NULL) {
+ ill = ira->ira_ill;
+ BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInBadRedirects);
+ ip_drop_output("no IPMP ill for sending redirect",
+ mp, ill);
+ freemsg(mp);
+ return;
+ }
+ need_refrele = B_TRUE;
+ } else {
+ ill = ira->ira_ill;
+ need_refrele = B_FALSE;
+ }
+
+ ncec = ncec_lookup_illgrp_v6(ill, targetp);
+ if (ncec != NULL && ncec->ncec_state != ND_INCOMPLETE &&
+ ncec->ncec_lladdr != NULL) {
ll_opt_len = (sizeof (nd_opt_hdr_t) +
ill->ill_phys_addr_length + 7)/8 * 8;
}
@@ -1822,8 +1672,10 @@ icmp_send_redirect_v6(queue_t *q, mblk_t *mp, in6_addr_t *targetp,
ASSERT(len % 4 == 0);
buf = kmem_alloc(len, KM_NOSLEEP);
if (buf == NULL) {
- if (nce != NULL)
- NCE_REFRELE(nce);
+ if (ncec != NULL)
+ ncec_refrele(ncec);
+ if (need_refrele)
+ ill_refrele(ill);
freemsg(mp);
return;
}
@@ -1836,15 +1688,14 @@ icmp_send_redirect_v6(queue_t *q, mblk_t *mp, in6_addr_t *targetp,
rd->nd_rd_dst = *dest;
opt = (nd_opt_hdr_t *)(buf + sizeof (nd_redirect_t));
- if (nce != NULL && ll_opt_len != 0) {
+ if (ncec != NULL && ll_opt_len != 0) {
opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
opt->nd_opt_len = ll_opt_len/8;
- bcopy((char *)nce->nce_res_mp->b_rptr +
- NCE_LL_ADDR_OFFSET(ill), &opt[1],
+ bcopy((char *)ncec->ncec_lladdr, &opt[1],
ill->ill_phys_addr_length);
}
- if (nce != NULL)
- NCE_REFRELE(nce);
+ if (ncec != NULL)
+ ncec_refrele(ncec);
rdh = (nd_opt_rd_hdr_t *)(buf + sizeof (nd_redirect_t) + ll_opt_len);
rdh->nd_opt_rh_type = (uint8_t)ND_OPT_REDIRECTED_HEADER;
/* max_redir_hdr_data_len and nd_opt_rh_len must be multiple of 8 */
@@ -1862,321 +1713,136 @@ icmp_send_redirect_v6(queue_t *q, mblk_t *mp, in6_addr_t *targetp,
}
rdh->nd_opt_rh_reserved1 = 0;
rdh->nd_opt_rh_reserved2 = 0;
- /* ipif_v6src_addr contains the link-local source address */
- srcp = &ill->ill_ipif->ipif_v6src_addr;
+ /* ipif_v6lcl_addr contains the link-local source address */
+ srcp = &ill->ill_ipif->ipif_v6lcl_addr;
/* Redirects sent by router, and router is global zone */
- icmp_pkt_v6(q, mp, buf, len, srcp, B_FALSE, GLOBAL_ZONEID, ipst);
+ ASSERT(ira->ira_zoneid == ALL_ZONES);
+ ira->ira_zoneid = GLOBAL_ZONEID;
+ icmp_pkt_v6(mp, buf, len, srcp, ira);
kmem_free(buf, len);
+ if (need_refrele)
+ ill_refrele(ill);
}
/* Generate an ICMP time exceeded message. (May be called as writer.) */
void
-icmp_time_exceeded_v6(queue_t *q, mblk_t *mp, uint8_t code,
- boolean_t llbcast, boolean_t mcast_ok, zoneid_t zoneid,
- ip_stack_t *ipst)
+icmp_time_exceeded_v6(mblk_t *mp, uint8_t code, boolean_t mcast_ok,
+ ip_recv_attr_t *ira)
{
icmp6_t icmp6;
- boolean_t mctl_present;
- mblk_t *first_mp;
- EXTRACT_PKT_MP(mp, first_mp, mctl_present);
-
- mp = icmp_pkt_err_ok_v6(q, mp, llbcast, mcast_ok, ipst);
- if (mp == NULL) {
- if (mctl_present)
- freeb(first_mp);
+ mp = icmp_pkt_err_ok_v6(mp, mcast_ok, ira);
+ if (mp == NULL)
return;
- }
+
bzero(&icmp6, sizeof (icmp6_t));
icmp6.icmp6_type = ICMP6_TIME_EXCEEDED;
icmp6.icmp6_code = code;
- icmp_pkt_v6(q, first_mp, &icmp6, sizeof (icmp6_t), NULL, mctl_present,
- zoneid, ipst);
+ icmp_pkt_v6(mp, &icmp6, sizeof (icmp6_t), NULL, ira);
}
/*
* Generate an ICMP unreachable message.
+ * When called from ip_output side a minimal ip_recv_attr_t needs to be
+ * constructed by the caller.
*/
void
-icmp_unreachable_v6(queue_t *q, mblk_t *mp, uint8_t code,
- boolean_t llbcast, boolean_t mcast_ok, zoneid_t zoneid,
- ip_stack_t *ipst)
+icmp_unreachable_v6(mblk_t *mp, uint8_t code, boolean_t mcast_ok,
+ ip_recv_attr_t *ira)
{
icmp6_t icmp6;
- boolean_t mctl_present;
- mblk_t *first_mp;
- EXTRACT_PKT_MP(mp, first_mp, mctl_present);
-
- mp = icmp_pkt_err_ok_v6(q, mp, llbcast, mcast_ok, ipst);
- if (mp == NULL) {
- if (mctl_present)
- freeb(first_mp);
+ mp = icmp_pkt_err_ok_v6(mp, mcast_ok, ira);
+ if (mp == NULL)
return;
- }
+
bzero(&icmp6, sizeof (icmp6_t));
icmp6.icmp6_type = ICMP6_DST_UNREACH;
icmp6.icmp6_code = code;
- icmp_pkt_v6(q, first_mp, &icmp6, sizeof (icmp6_t), NULL, mctl_present,
- zoneid, ipst);
+ icmp_pkt_v6(mp, &icmp6, sizeof (icmp6_t), NULL, ira);
}
/*
* Generate an ICMP pkt too big message.
+ * When called from ip_output side a minimal ip_recv_attr_t needs to be
+ * constructed by the caller.
*/
-static void
-icmp_pkt2big_v6(queue_t *q, mblk_t *mp, uint32_t mtu,
- boolean_t llbcast, boolean_t mcast_ok, zoneid_t zoneid, ip_stack_t *ipst)
+void
+icmp_pkt2big_v6(mblk_t *mp, uint32_t mtu, boolean_t mcast_ok,
+ ip_recv_attr_t *ira)
{
icmp6_t icmp6;
- mblk_t *first_mp;
- boolean_t mctl_present;
- EXTRACT_PKT_MP(mp, first_mp, mctl_present);
-
- mp = icmp_pkt_err_ok_v6(q, mp, llbcast, mcast_ok, ipst);
- if (mp == NULL) {
- if (mctl_present)
- freeb(first_mp);
+ mp = icmp_pkt_err_ok_v6(mp, mcast_ok, ira);
+ if (mp == NULL)
return;
- }
+
bzero(&icmp6, sizeof (icmp6_t));
icmp6.icmp6_type = ICMP6_PACKET_TOO_BIG;
icmp6.icmp6_code = 0;
icmp6.icmp6_mtu = htonl(mtu);
- icmp_pkt_v6(q, first_mp, &icmp6, sizeof (icmp6_t), NULL, mctl_present,
- zoneid, ipst);
+ icmp_pkt_v6(mp, &icmp6, sizeof (icmp6_t), NULL, ira);
}
/*
* Generate an ICMP parameter problem message. (May be called as writer.)
* 'offset' is the offset from the beginning of the packet in error.
+ * When called from ip_output side a minimal ip_recv_attr_t needs to be
+ * constructed by the caller.
*/
static void
-icmp_param_problem_v6(queue_t *q, mblk_t *mp, uint8_t code,
- uint32_t offset, boolean_t llbcast, boolean_t mcast_ok, zoneid_t zoneid,
- ip_stack_t *ipst)
+icmp_param_problem_v6(mblk_t *mp, uint8_t code, uint32_t offset,
+ boolean_t mcast_ok, ip_recv_attr_t *ira)
{
icmp6_t icmp6;
- boolean_t mctl_present;
- mblk_t *first_mp;
-
- EXTRACT_PKT_MP(mp, first_mp, mctl_present);
- mp = icmp_pkt_err_ok_v6(q, mp, llbcast, mcast_ok, ipst);
- if (mp == NULL) {
- if (mctl_present)
- freeb(first_mp);
+ mp = icmp_pkt_err_ok_v6(mp, mcast_ok, ira);
+ if (mp == NULL)
return;
- }
+
bzero((char *)&icmp6, sizeof (icmp6_t));
icmp6.icmp6_type = ICMP6_PARAM_PROB;
icmp6.icmp6_code = code;
icmp6.icmp6_pptr = htonl(offset);
- icmp_pkt_v6(q, first_mp, &icmp6, sizeof (icmp6_t), NULL, mctl_present,
- zoneid, ipst);
+ icmp_pkt_v6(mp, &icmp6, sizeof (icmp6_t), NULL, ira);
}
-/*
- * This code will need to take into account the possibility of binding
- * to a link local address on a multi-homed host, in which case the
- * outgoing interface (from the conn) will need to be used when getting
- * an ire for the dst. Going through proper outgoing interface and
- * choosing the source address corresponding to the outgoing interface
- * is necessary when the destination address is a link-local address and
- * IPV6_BOUND_IF or IPV6_PKTINFO or scope_id has been set.
- * This can happen when active connection is setup; thus ipp pointer
- * is passed here from tcp_connect_*() routines, in non-TCP cases NULL
- * pointer is passed as ipp pointer.
- */
-mblk_t *
-ip_bind_v6(queue_t *q, mblk_t *mp, conn_t *connp, ip6_pkt_t *ipp)
+void
+icmp_param_problem_nexthdr_v6(mblk_t *mp, boolean_t mcast_ok,
+ ip_recv_attr_t *ira)
{
- ssize_t len;
- int protocol;
- struct T_bind_req *tbr;
- sin6_t *sin6;
- ipa6_conn_t *ac6;
- in6_addr_t *v6srcp;
- in6_addr_t *v6dstp;
- uint16_t lport;
- uint16_t fport;
- uchar_t *ucp;
- int error = 0;
- boolean_t local_bind;
- ipa6_conn_x_t *acx6;
- boolean_t verify_dst;
- ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
- cred_t *cr;
-
- /*
- * All Solaris components should pass a db_credp
- * for this TPI message, hence we ASSERT.
- * But in case there is some other M_PROTO that looks
- * like a TPI message sent by some other kernel
- * component, we check and return an error.
- */
- cr = msg_getcred(mp, NULL);
- ASSERT(cr != NULL);
- if (cr == NULL) {
- error = EINVAL;
- goto bad_addr;
- }
-
- ASSERT(connp->conn_af_isv6);
- len = mp->b_wptr - mp->b_rptr;
- if (len < (sizeof (*tbr) + 1)) {
- (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
- "ip_bind_v6: bogus msg, len %ld", len);
- goto bad_addr;
- }
- /* Back up and extract the protocol identifier. */
- mp->b_wptr--;
- tbr = (struct T_bind_req *)mp->b_rptr;
- /* Reset the message type in preparation for shipping it back. */
- mp->b_datap->db_type = M_PCPROTO;
-
- protocol = *mp->b_wptr & 0xFF;
- connp->conn_ulp = (uint8_t)protocol;
-
- /*
- * Check for a zero length address. This is from a protocol that
- * wants to register to receive all packets of its type.
- */
- if (tbr->ADDR_length == 0) {
- if ((protocol == IPPROTO_TCP || protocol == IPPROTO_SCTP ||
- protocol == IPPROTO_ESP || protocol == IPPROTO_AH) &&
- ipst->ips_ipcl_proto_fanout_v6[protocol].connf_head !=
- NULL) {
- /*
- * TCP, SCTP, AH, and ESP have single protocol fanouts.
- * Do not allow others to bind to these.
- */
- goto bad_addr;
- }
-
- /*
- *
- * The udp module never sends down a zero-length address,
- * and allowing this on a labeled system will break MLP
- * functionality.
- */
- if (is_system_labeled() && protocol == IPPROTO_UDP)
- goto bad_addr;
-
- /* Allow ipsec plumbing */
- if ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
- (protocol != IPPROTO_AH) && (protocol != IPPROTO_ESP))
- goto bad_addr;
-
- connp->conn_srcv6 = ipv6_all_zeros;
- ipcl_proto_insert_v6(connp, protocol);
-
- tbr->PRIM_type = T_BIND_ACK;
- return (mp);
- }
-
- /* Extract the address pointer from the message. */
- ucp = (uchar_t *)mi_offset_param(mp, tbr->ADDR_offset,
- tbr->ADDR_length);
- if (ucp == NULL) {
- ip1dbg(("ip_bind_v6: no address\n"));
- goto bad_addr;
- }
- if (!OK_32PTR(ucp)) {
- ip1dbg(("ip_bind_v6: unaligned address\n"));
- goto bad_addr;
- }
-
- switch (tbr->ADDR_length) {
- default:
- ip1dbg(("ip_bind_v6: bad address length %d\n",
- (int)tbr->ADDR_length));
- goto bad_addr;
-
- case IPV6_ADDR_LEN:
- /* Verification of local address only */
- v6srcp = (in6_addr_t *)ucp;
- lport = 0;
- local_bind = B_TRUE;
- break;
-
- case sizeof (sin6_t):
- sin6 = (sin6_t *)ucp;
- v6srcp = &sin6->sin6_addr;
- lport = sin6->sin6_port;
- local_bind = B_TRUE;
- break;
-
- case sizeof (ipa6_conn_t):
- /*
- * Verify that both the source and destination addresses
- * are valid.
- */
- ac6 = (ipa6_conn_t *)ucp;
- v6srcp = &ac6->ac6_laddr;
- v6dstp = &ac6->ac6_faddr;
- fport = ac6->ac6_fport;
- /* For raw socket, the local port is not set. */
- lport = ac6->ac6_lport != 0 ? ac6->ac6_lport :
- connp->conn_lport;
- local_bind = B_FALSE;
- /* Always verify destination reachability. */
- verify_dst = B_TRUE;
- break;
-
- case sizeof (ipa6_conn_x_t):
- /*
- * Verify that the source address is valid.
- */
- acx6 = (ipa6_conn_x_t *)ucp;
- ac6 = &acx6->ac6x_conn;
- v6srcp = &ac6->ac6_laddr;
- v6dstp = &ac6->ac6_faddr;
- fport = ac6->ac6_fport;
- lport = ac6->ac6_lport;
- local_bind = B_FALSE;
- /*
- * Client that passed ipa6_conn_x_t to us specifies whether to
- * verify destination reachability.
- */
- verify_dst = (acx6->ac6x_flags & ACX_VERIFY_DST) != 0;
- break;
- }
- if (local_bind) {
- error = ip_proto_bind_laddr_v6(connp, &mp->b_cont, protocol,
- v6srcp, lport, tbr->ADDR_length != IPV6_ADDR_LEN);
- } else {
- error = ip_proto_bind_connected_v6(connp, &mp->b_cont, protocol,
- v6srcp, lport, v6dstp, ipp, fport, B_TRUE, verify_dst, cr);
- }
+ ip6_t *ip6h = (ip6_t *)mp->b_rptr;
+ uint16_t hdr_length;
+ uint8_t *nexthdrp;
+ uint32_t offset;
+ ill_t *ill = ira->ira_ill;
- if (error == 0) {
- /* Send it home. */
- mp->b_datap->db_type = M_PCPROTO;
- tbr->PRIM_type = T_BIND_ACK;
- return (mp);
+ /* Determine the offset of the bad nexthdr value */
+ if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &hdr_length, &nexthdrp)) {
+ /* Malformed packet */
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards", mp, ill);
+ freemsg(mp);
+ return;
}
-bad_addr:
- ASSERT(error != EINPROGRESS);
- if (error > 0)
- mp = mi_tpi_err_ack_alloc(mp, TSYSERR, error);
- else
- mp = mi_tpi_err_ack_alloc(mp, TBADADDR, 0);
- return (mp);
+ offset = nexthdrp - mp->b_rptr;
+ icmp_param_problem_v6(mp, ICMP6_PARAMPROB_NEXTHEADER, offset,
+ mcast_ok, ira);
}
/*
- * Here address is verified to be a valid local address.
- * If the IRE_DB_REQ_TYPE mp is present, a multicast
- * address is also considered a valid local address.
+ * Verify whether or not the IP address is a valid local address.
+ * Could be a unicast, including one for a down interface.
+ * If allow_mcbc then a multicast or broadcast address is also
+ * acceptable.
+ *
* In the case of a multicast address, however, the
* upper protocol is expected to reset the src address
- * to 0 if it sees an ire with IN6_IS_ADDR_MULTICAST returned so that
+ * to zero when we return IPVL_MCAST so that
* no packets are emitted with multicast address as
* source address.
* The addresses valid for bind are:
@@ -2193,855 +1859,418 @@ bad_addr:
* When the address is loopback or multicast, there might be many matching IREs
* so bind has to look up based on the zone.
*/
-/*
- * Verify the local IP address. Does not change the conn_t except
- * conn_fully_bound and conn_policy_cached.
- */
-static int
-ip_bind_laddr_v6(conn_t *connp, mblk_t **mpp, uint8_t protocol,
- const in6_addr_t *v6src, uint16_t lport, boolean_t fanout_insert)
+ip_laddr_t
+ip_laddr_verify_v6(const in6_addr_t *v6src, zoneid_t zoneid,
+ ip_stack_t *ipst, boolean_t allow_mcbc, uint_t scopeid)
{
- int error = 0;
- ire_t *src_ire = NULL;
- zoneid_t zoneid;
- mblk_t *mp = NULL;
- boolean_t ire_requested;
- boolean_t ipsec_policy_set;
- ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
-
- if (mpp)
- mp = *mpp;
-
- ire_requested = (mp != NULL && DB_TYPE(mp) == IRE_DB_REQ_TYPE);
- ipsec_policy_set = (mp != NULL && DB_TYPE(mp) == IPSEC_POLICY_SET);
-
- /*
- * If it was previously connected, conn_fully_bound would have
- * been set.
- */
- connp->conn_fully_bound = B_FALSE;
-
- zoneid = IPCL_ZONEID(connp);
+ ire_t *src_ire;
+ uint_t match_flags;
+ ill_t *ill = NULL;
- if (!IN6_IS_ADDR_UNSPECIFIED(v6src)) {
- src_ire = ire_route_lookup_v6(v6src, 0, 0,
- 0, NULL, NULL, zoneid, NULL, MATCH_IRE_ZONEONLY, ipst);
- /*
- * If an address other than in6addr_any is requested,
- * we verify that it is a valid address for bind
- * Note: Following code is in if-else-if form for
- * readability compared to a condition check.
- */
- ASSERT(src_ire == NULL || !(src_ire->ire_type & IRE_BROADCAST));
- /* LINTED - statement has no consequent */
- if (IRE_IS_LOCAL(src_ire)) {
- /*
- * (2) Bind to address of local UP interface
- */
- } else if (IN6_IS_ADDR_MULTICAST(v6src)) {
- ipif_t *multi_ipif = NULL;
- ire_t *save_ire;
- /*
- * (4) bind to multicast address.
- * Fake out the IRE returned to upper
- * layer to be a broadcast IRE in
- * ip_bind_insert_ire_v6().
- * Pass other information that matches
- * the ipif (e.g. the source address).
- * conn_multicast_ill is only used for
- * IPv6 packets
- */
- mutex_enter(&connp->conn_lock);
- if (connp->conn_multicast_ill != NULL) {
- (void) ipif_lookup_zoneid(
- connp->conn_multicast_ill, zoneid, 0,
- &multi_ipif);
- } else {
- /*
- * Look for default like
- * ip_wput_v6
- */
- multi_ipif = ipif_lookup_group_v6(
- &ipv6_unspecified_group, zoneid, ipst);
- }
- mutex_exit(&connp->conn_lock);
- save_ire = src_ire;
- src_ire = NULL;
- if (multi_ipif == NULL || !ire_requested ||
- (src_ire = ipif_to_ire_v6(multi_ipif)) == NULL) {
- src_ire = save_ire;
- error = EADDRNOTAVAIL;
- } else {
- ASSERT(src_ire != NULL);
- if (save_ire != NULL)
- ire_refrele(save_ire);
- }
- if (multi_ipif != NULL)
- ipif_refrele(multi_ipif);
- } else {
- if (!ip_addr_exists_v6(v6src, zoneid, ipst)) {
- /*
- * Not a valid address for bind
- */
- error = EADDRNOTAVAIL;
- }
- }
+ ASSERT(!IN6_IS_ADDR_V4MAPPED(v6src));
+ ASSERT(!IN6_IS_ADDR_UNSPECIFIED(v6src));
- if (error != 0) {
- /* Red Alert! Attempting to be a bogon! */
- if (ip_debug > 2) {
- /* ip1dbg */
- pr_addr_dbg("ip_bind_laddr_v6: bad src"
- " address %s\n", AF_INET6, v6src);
- }
- goto bad_addr;
- }
+ match_flags = MATCH_IRE_ZONEONLY;
+ if (scopeid != 0) {
+ ill = ill_lookup_on_ifindex(scopeid, B_TRUE, ipst);
+ if (ill == NULL)
+ return (IPVL_BAD);
+ match_flags |= MATCH_IRE_ILL;
}
+ src_ire = ire_ftable_lookup_v6(v6src, NULL, NULL, 0,
+ ill, zoneid, NULL, match_flags, 0, ipst, NULL);
+ if (ill != NULL)
+ ill_refrele(ill);
+
/*
- * Allow setting new policies. For example, disconnects come
- * down as ipa_t bind. As we would have set conn_policy_cached
- * to B_TRUE before, we should set it to B_FALSE, so that policy
- * can change after the disconnect.
+ * If an address other than in6addr_any is requested,
+ * we verify that it is a valid address for bind
+ * Note: Following code is in if-else-if form for
+ * readability compared to a condition check.
*/
- connp->conn_policy_cached = B_FALSE;
-
- /* If not fanout_insert this was just an address verification */
- if (fanout_insert) {
+ if (src_ire != NULL && (src_ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK))) {
/*
- * The addresses have been verified. Time to insert in
- * the correct fanout list.
+ * (2) Bind to address of local UP interface
*/
- connp->conn_srcv6 = *v6src;
- connp->conn_remv6 = ipv6_all_zeros;
- connp->conn_lport = lport;
- connp->conn_fport = 0;
- error = ipcl_bind_insert_v6(connp, protocol, v6src, lport);
- }
- if (error == 0) {
- if (ire_requested) {
- if (!ip_bind_get_ire_v6(mpp, src_ire, v6src, NULL,
- ipst)) {
- error = -1;
- goto bad_addr;
- }
- mp = *mpp;
- } else if (ipsec_policy_set) {
- if (!ip_bind_ipsec_policy_set(connp, mp)) {
- error = -1;
- goto bad_addr;
- }
- }
- }
-bad_addr:
- if (error != 0) {
- if (connp->conn_anon_port) {
- (void) tsol_mlp_anon(crgetzone(connp->conn_cred),
- connp->conn_mlp_type, connp->conn_ulp, ntohs(lport),
- B_FALSE);
- }
- connp->conn_mlp_type = mlptSingle;
- }
-
- if (src_ire != NULL)
ire_refrele(src_ire);
+ return (IPVL_UNICAST_UP);
+ } else if (IN6_IS_ADDR_MULTICAST(v6src)) {
+ /* (4) bind to multicast address. */
+ if (src_ire != NULL)
+ ire_refrele(src_ire);
- if (ipsec_policy_set) {
- ASSERT(mp != NULL);
- freeb(mp);
/*
- * As of now assume that nothing else accompanies
- * IPSEC_POLICY_SET.
+ * Note: caller should take IPV6_MULTICAST_IF
+ * into account when selecting a real source address.
*/
- *mpp = NULL;
- }
-
- return (error);
-}
-int
-ip_proto_bind_laddr_v6(conn_t *connp, mblk_t **mpp, uint8_t protocol,
- const in6_addr_t *v6srcp, uint16_t lport, boolean_t fanout_insert)
-{
- int error;
- boolean_t orig_pkt_isv6 = connp->conn_pkt_isv6;
- ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
-
- ASSERT(connp->conn_af_isv6);
- connp->conn_ulp = protocol;
+ if (allow_mcbc)
+ return (IPVL_MCAST);
+ else
+ return (IPVL_BAD);
+ } else {
+ ipif_t *ipif;
- if (IN6_IS_ADDR_V4MAPPED(v6srcp) && !connp->conn_ipv6_v6only) {
- /* Bind to IPv4 address */
- ipaddr_t v4src;
+ /*
+ * (3) Bind to address of local DOWN interface?
+ * (ipif_lookup_addr() looks up all interfaces
+ * but we do not get here for UP interfaces
+ * - case (2) above)
+ */
+ if (src_ire != NULL)
+ ire_refrele(src_ire);
- IN6_V4MAPPED_TO_IPADDR(v6srcp, v4src);
+ ipif = ipif_lookup_addr_v6(v6src, NULL, zoneid, ipst);
+ if (ipif == NULL)
+ return (IPVL_BAD);
- error = ip_bind_laddr_v4(connp, mpp, protocol, v4src, lport,
- fanout_insert);
- if (error != 0)
- goto bad_addr;
- connp->conn_pkt_isv6 = B_FALSE;
- } else {
- if (IN6_IS_ADDR_V4MAPPED(v6srcp)) {
- error = 0;
- goto bad_addr;
+ /* Not a useful source? */
+ if (ipif->ipif_flags & (IPIF_NOLOCAL | IPIF_ANYCAST)) {
+ ipif_refrele(ipif);
+ return (IPVL_BAD);
}
- error = ip_bind_laddr_v6(connp, mpp, protocol, v6srcp,
- lport, fanout_insert);
- if (error != 0)
- goto bad_addr;
- connp->conn_pkt_isv6 = B_TRUE;
+ ipif_refrele(ipif);
+ return (IPVL_UNICAST_DOWN);
}
-
- if (orig_pkt_isv6 != connp->conn_pkt_isv6)
- ip_setpktversion(connp, connp->conn_pkt_isv6, B_TRUE, ipst);
- return (0);
-
-bad_addr:
- if (error < 0)
- error = -TBADADDR;
- return (error);
}
/*
- * Verify that both the source and destination addresses
- * are valid. If verify_dst, then destination address must also be reachable,
- * i.e. have a route. Protocols like TCP want this. Tunnels do not.
- * It takes ip6_pkt_t * as one of the arguments to determine correct
- * source address when IPV6_PKTINFO or scope_id is set along with a link-local
- * destination address. Note that parameter ipp is only useful for TCP connect
- * when scope_id is set or IPV6_PKTINFO option is set with an ifindex. For all
- * non-TCP cases, it is NULL and for all other tcp cases it is not useful.
+ * Verify that both the source and destination addresses are valid. If
+ * IPDF_VERIFY_DST is not set, then the destination address may be unreachable,
+ * i.e. have no route to it. Protocols like TCP want to verify destination
+ * reachability, while tunnels do not.
+ *
+ * Determine the route, the interface, and (optionally) the source address
+ * to use to reach a given destination.
+ * Note that we allow connect to broadcast and multicast addresses when
+ * IPDF_ALLOW_MCBC is set.
+ * first_hop and dst_addr are normally the same, but if source routing
+ * they will differ; in that case the first_hop is what we'll use for the
+ * routing lookup but the dce and label checks will be done on dst_addr,
+ *
+ * If uinfo is set, then we fill in the best available information
+ * we have for the destination. This is based on (in priority order) any
+ * metrics and path MTU stored in a dce_t, route metrics, and finally the
+ * ill_mtu.
+ *
+ * Tsol note: If we have a source route then dst_addr != firsthop. But we
+ * always do the label check on dst_addr.
*
+ * Assumes that the caller has set ixa_scopeid for link-local communication.
*/
int
-ip_bind_connected_v6(conn_t *connp, mblk_t **mpp, uint8_t protocol,
- in6_addr_t *v6src, uint16_t lport, const in6_addr_t *v6dst,
- ip6_pkt_t *ipp, uint16_t fport, boolean_t fanout_insert,
- boolean_t verify_dst, cred_t *cr)
+ip_set_destination_v6(in6_addr_t *src_addrp, const in6_addr_t *dst_addr,
+ const in6_addr_t *firsthop, ip_xmit_attr_t *ixa, iulp_t *uinfo,
+ uint32_t flags, uint_t mac_mode)
{
- ire_t *src_ire;
- ire_t *dst_ire;
+ ire_t *ire;
int error = 0;
- ire_t *sire = NULL;
- ire_t *md_dst_ire = NULL;
- ill_t *md_ill = NULL;
- ill_t *dst_ill = NULL;
- ipif_t *src_ipif = NULL;
- zoneid_t zoneid;
- boolean_t ill_held = B_FALSE;
- mblk_t *mp = NULL;
- boolean_t ire_requested = B_FALSE;
- boolean_t ipsec_policy_set = B_FALSE;
- ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
- ts_label_t *tsl = NULL;
- cred_t *effective_cred = NULL;
-
- if (mpp)
- mp = *mpp;
-
- if (mp != NULL) {
- ire_requested = (DB_TYPE(mp) == IRE_DB_REQ_TYPE);
- ipsec_policy_set = (DB_TYPE(mp) == IPSEC_POLICY_SET);
- }
-
- src_ire = dst_ire = NULL;
- /*
- * If we never got a disconnect before, clear it now.
- */
- connp->conn_fully_bound = B_FALSE;
+ in6_addr_t setsrc; /* RTF_SETSRC */
+ zoneid_t zoneid = ixa->ixa_zoneid; /* Honors SO_ALLZONES */
+ ip_stack_t *ipst = ixa->ixa_ipst;
+ dce_t *dce;
+ uint_t pmtu;
+ uint_t ifindex;
+ uint_t generation;
+ nce_t *nce;
+ ill_t *ill = NULL;
+ boolean_t multirt = B_FALSE;
+
+ ASSERT(!IN6_IS_ADDR_V4MAPPED(dst_addr));
- zoneid = connp->conn_zoneid;
+ ASSERT(!(ixa->ixa_flags & IXAF_IS_IPV4));
/*
- * Check whether Trusted Solaris policy allows communication with this
- * host, and pretend that the destination is unreachable if not.
- *
- * This is never a problem for TCP, since that transport is known to
- * compute the label properly as part of the tcp_rput_other T_BIND_ACK
- * handling. If the remote is unreachable, it will be detected at that
- * point, so there's no reason to check it here.
- *
- * Note that for sendto (and other datagram-oriented friends), this
- * check is done as part of the data path label computation instead.
- * The check here is just to make non-TCP connect() report the right
- * error.
+ * We never send to zero; the ULPs map it to the loopback address.
+ * We can't allow it since we use zero to mean unitialized in some
+ * places.
*/
- if (is_system_labeled() && !IPCL_IS_TCP(connp)) {
- if ((error = tsol_check_dest(cr, v6dst, IPV6_VERSION,
- connp->conn_mac_mode, &effective_cred)) != 0) {
- if (ip_debug > 2) {
- pr_addr_dbg(
- "ip_bind_connected: no label for dst %s\n",
- AF_INET6, v6dst);
- }
- goto bad_addr;
- }
+ ASSERT(!IN6_IS_ADDR_UNSPECIFIED(dst_addr));
- /*
- * tsol_check_dest() may have created a new cred with
- * a modified security label. Use that cred if it exists
- * for ire lookups.
- */
- if (effective_cred == NULL) {
- tsl = crgetlabel(cr);
- } else {
- tsl = crgetlabel(effective_cred);
+ if (is_system_labeled()) {
+ ts_label_t *tsl = NULL;
+
+ error = tsol_check_dest(ixa->ixa_tsl, dst_addr, IPV6_VERSION,
+ mac_mode, (flags & IPDF_ZONE_IS_GLOBAL) != 0, &tsl);
+ if (error != 0)
+ return (error);
+ if (tsl != NULL) {
+ /* Update the label */
+ ip_xmit_attr_replace_tsl(ixa, tsl);
}
}
- if (IN6_IS_ADDR_MULTICAST(v6dst)) {
- ipif_t *ipif;
+ setsrc = ipv6_all_zeros;
+ /*
+ * Select a route; For IPMP interfaces, we would only select
+ * a "hidden" route (i.e., going through a specific under_ill)
+ * if ixa_ifindex has been specified.
+ */
+ ire = ip_select_route_v6(firsthop, ixa, &generation, &setsrc, &error,
+ &multirt);
+ ASSERT(ire != NULL); /* IRE_NOROUTE if none found */
+ if (error != 0)
+ goto bad_addr;
+ /*
+ * ire can't be a broadcast or multicast unless IPDF_ALLOW_MCBC is set.
+ * If IPDF_VERIFY_DST is set, the destination must be reachable.
+ * Otherwise the destination needn't be reachable.
+ *
+ * If we match on a reject or black hole, then we've got a
+ * local failure. May as well fail out the connect() attempt,
+ * since it's never going to succeed.
+ */
+ if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
/*
- * Use an "emulated" IRE_BROADCAST to tell the transport it
- * is a multicast.
- * Pass other information that matches
- * the ipif (e.g. the source address).
+ * If we're verifying destination reachability, we always want
+ * to complain here.
*
- * conn_multicast_ill is only used for IPv6 packets
- */
- mutex_enter(&connp->conn_lock);
- if (connp->conn_multicast_ill != NULL) {
- (void) ipif_lookup_zoneid(connp->conn_multicast_ill,
- zoneid, 0, &ipif);
- } else {
- /* Look for default like ip_wput_v6 */
- ipif = ipif_lookup_group_v6(v6dst, zoneid, ipst);
- }
- mutex_exit(&connp->conn_lock);
- if (ipif == NULL || ire_requested ||
- (dst_ire = ipif_to_ire_v6(ipif)) == NULL) {
- if (ipif != NULL)
- ipif_refrele(ipif);
- if (ip_debug > 2) {
- /* ip1dbg */
- pr_addr_dbg("ip_bind_connected_v6: bad "
- "connected multicast %s\n", AF_INET6,
- v6dst);
- }
- error = ENETUNREACH;
- goto bad_addr;
- }
- if (ipif != NULL)
- ipif_refrele(ipif);
- } else {
- dst_ire = ire_route_lookup_v6(v6dst, NULL, NULL, 0,
- NULL, &sire, zoneid, tsl,
- MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT |
- MATCH_IRE_PARENT | MATCH_IRE_RJ_BHOLE | MATCH_IRE_SECATTR,
- ipst);
- /*
- * We also prevent ire's with src address INADDR_ANY to
- * be used, which are created temporarily for
- * sending out packets from endpoints that have
- * conn_unspec_src set.
+ * If we're not verifying destination reachability but the
+ * destination has a route, we still want to fail on the
+ * temporary address and broadcast address tests.
+ *
+ * In both cases do we let the code continue so some reasonable
+ * information is returned to the caller. That enables the
+ * caller to use (and even cache) the IRE. conn_ip_ouput will
+ * use the generation mismatch path to check for the unreachable
+ * case thereby avoiding any specific check in the main path.
*/
- if (dst_ire == NULL ||
- (dst_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
- IN6_IS_ADDR_UNSPECIFIED(&dst_ire->ire_src_addr_v6)) {
+ ASSERT(generation == IRE_GENERATION_VERIFY);
+ if (flags & IPDF_VERIFY_DST) {
/*
- * When verifying destination reachability, we always
- * complain.
- *
- * When not verifying destination reachability but we
- * found an IRE, i.e. the destination is reachable,
- * then the other tests still apply and we complain.
+ * Set errno but continue to set up ixa_ire to be
+ * the RTF_REJECT|RTF_BLACKHOLE IRE.
+ * That allows callers to use ip_output to get an
+ * ICMP error back.
*/
- if (verify_dst || (dst_ire != NULL)) {
- if (ip_debug > 2) {
- /* ip1dbg */
- pr_addr_dbg("ip_bind_connected_v6: bad"
- " connected dst %s\n", AF_INET6,
- v6dst);
- }
- if (dst_ire == NULL ||
- !(dst_ire->ire_type & IRE_HOST)) {
- error = ENETUNREACH;
- } else {
- error = EHOSTUNREACH;
- }
- goto bad_addr;
- }
+ if (!(ire->ire_type & IRE_HOST))
+ error = ENETUNREACH;
+ else
+ error = EHOSTUNREACH;
}
}
- /*
- * If the app does a connect(), it means that it will most likely
- * send more than 1 packet to the destination. It makes sense
- * to clear the temporary flag.
- */
- if (dst_ire != NULL && dst_ire->ire_type == IRE_CACHE &&
- (dst_ire->ire_marks & IRE_MARK_TEMPORARY)) {
- irb_t *irb = dst_ire->ire_bucket;
-
- rw_enter(&irb->irb_lock, RW_WRITER);
- /*
- * We need to recheck for IRE_MARK_TEMPORARY after acquiring
- * the lock in order to guarantee irb_tmp_ire_cnt.
- */
- if (dst_ire->ire_marks & IRE_MARK_TEMPORARY) {
- dst_ire->ire_marks &= ~IRE_MARK_TEMPORARY;
- irb->irb_tmp_ire_cnt--;
- }
- rw_exit(&irb->irb_lock);
+ if ((ire->ire_type & (IRE_BROADCAST|IRE_MULTICAST)) &&
+ !(flags & IPDF_ALLOW_MCBC)) {
+ ire_refrele(ire);
+ ire = ire_reject(ipst, B_FALSE);
+ generation = IRE_GENERATION_VERIFY;
+ error = ENETUNREACH;
}
- ASSERT(dst_ire == NULL || dst_ire->ire_ipversion == IPV6_VERSION);
+ /* Cache things */
+ if (ixa->ixa_ire != NULL)
+ ire_refrele_notr(ixa->ixa_ire);
+#ifdef DEBUG
+ ire_refhold_notr(ire);
+ ire_refrele(ire);
+#endif
+ ixa->ixa_ire = ire;
+ ixa->ixa_ire_generation = generation;
/*
- * See if we should notify ULP about MDT; we do this whether or not
- * ire_requested is TRUE, in order to handle active connects; MDT
- * eligibility tests for passive connects are handled separately
- * through tcp_adapt_ire(). We do this before the source address
- * selection, because dst_ire may change after a call to
- * ipif_select_source_v6(). This is a best-effort check, as the
- * packet for this connection may not actually go through
- * dst_ire->ire_stq, and the exact IRE can only be known after
- * calling ip_newroute_v6(). This is why we further check on the
- * IRE during Multidata packet transmission in tcp_multisend().
+ * For multicast with multirt we have a flag passed back from
+ * ire_lookup_multi_ill_v6 since we don't have an IRE for each
+ * possible multicast address.
+ * We also need a flag for multicast since we can't check
+ * whether RTF_MULTIRT is set in ixa_ire for multicast.
*/
- if (ipst->ips_ip_multidata_outbound && !ipsec_policy_set &&
- dst_ire != NULL &&
- !(dst_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK | IRE_BROADCAST)) &&
- (md_ill = ire_to_ill(dst_ire), md_ill != NULL) &&
- ILL_MDT_CAPABLE(md_ill)) {
- md_dst_ire = dst_ire;
- IRE_REFHOLD(md_dst_ire);
- }
-
- if (dst_ire != NULL &&
- dst_ire->ire_type == IRE_LOCAL &&
- dst_ire->ire_zoneid != zoneid &&
- dst_ire->ire_zoneid != ALL_ZONES) {
- src_ire = ire_ftable_lookup_v6(v6dst, 0, 0, 0, NULL, NULL,
- zoneid, 0, NULL,
- MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT |
- MATCH_IRE_RJ_BHOLE, ipst);
- if (src_ire == NULL) {
- error = EHOSTUNREACH;
- goto bad_addr;
- } else if (src_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
- if (!(src_ire->ire_type & IRE_HOST))
- error = ENETUNREACH;
- else
- error = EHOSTUNREACH;
- goto bad_addr;
- }
- if (IN6_IS_ADDR_UNSPECIFIED(v6src)) {
- src_ipif = src_ire->ire_ipif;
- ipif_refhold(src_ipif);
- *v6src = src_ipif->ipif_v6lcl_addr;
- }
- ire_refrele(src_ire);
- src_ire = NULL;
- } else if (IN6_IS_ADDR_UNSPECIFIED(v6src) && dst_ire != NULL) {
- if ((sire != NULL) && (sire->ire_flags & RTF_SETSRC)) {
- *v6src = sire->ire_src_addr_v6;
- ire_refrele(dst_ire);
- dst_ire = sire;
- sire = NULL;
- } else if (dst_ire->ire_type == IRE_CACHE &&
- (dst_ire->ire_flags & RTF_SETSRC)) {
- ASSERT(dst_ire->ire_zoneid == zoneid ||
- dst_ire->ire_zoneid == ALL_ZONES);
- *v6src = dst_ire->ire_src_addr_v6;
+ if (multirt) {
+ ixa->ixa_postfragfn = ip_postfrag_multirt_v6;
+ ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST;
+ } else {
+ ixa->ixa_postfragfn = ire->ire_postfragfn;
+ ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST;
+ }
+ if (!(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
+ /* Get an nce to cache. */
+ nce = ire_to_nce(ire, NULL, firsthop);
+ if (nce == NULL) {
+ /* Allocation failure? */
+ ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
} else {
- /*
- * Pick a source address so that a proper inbound load
- * spreading would happen. Use dst_ill specified by the
- * app. when socket option or scopeid is set.
- */
- int err;
-
- if (ipp != NULL && ipp->ipp_ifindex != 0) {
- uint_t if_index;
-
- /*
- * Scope id or IPV6_PKTINFO
- */
-
- if_index = ipp->ipp_ifindex;
- dst_ill = ill_lookup_on_ifindex(
- if_index, B_TRUE, NULL, NULL, NULL, NULL,
- ipst);
- if (dst_ill == NULL) {
- ip1dbg(("ip_bind_connected_v6:"
- " bad ifindex %d\n", if_index));
- error = EADDRNOTAVAIL;
- goto bad_addr;
- }
- ill_held = B_TRUE;
- } else if (connp->conn_outgoing_ill != NULL) {
- /*
- * For IPV6_BOUND_IF socket option,
- * conn_outgoing_ill should be set
- * already in TCP or UDP/ICMP.
- */
- dst_ill = conn_get_held_ill(connp,
- &connp->conn_outgoing_ill, &err);
- if (err == ILL_LOOKUP_FAILED) {
- ip1dbg(("ip_bind_connected_v6:"
- "no ill for bound_if\n"));
- error = EADDRNOTAVAIL;
- goto bad_addr;
- }
- ill_held = B_TRUE;
- } else if (dst_ire->ire_stq != NULL) {
- /* No need to hold ill here */
- dst_ill = (ill_t *)dst_ire->ire_stq->q_ptr;
- } else {
- /* No need to hold ill here */
- dst_ill = dst_ire->ire_ipif->ipif_ill;
- }
- if (ip6_asp_can_lookup(ipst)) {
- src_ipif = ipif_select_source_v6(dst_ill,
- v6dst, B_FALSE, connp->conn_src_preferences,
- zoneid);
- ip6_asp_table_refrele(ipst);
- if (src_ipif == NULL) {
- pr_addr_dbg("ip_bind_connected_v6: "
- "no usable source address for "
- "connection to %s\n",
- AF_INET6, v6dst);
- error = EADDRNOTAVAIL;
- goto bad_addr;
- }
- *v6src = src_ipif->ipif_v6lcl_addr;
- } else {
- error = EADDRNOTAVAIL;
- goto bad_addr;
- }
+ if (ixa->ixa_nce != NULL)
+ nce_refrele(ixa->ixa_nce);
+ ixa->ixa_nce = nce;
}
}
/*
- * We do ire_route_lookup_v6() here (and not an interface lookup)
- * as we assert that v6src should only come from an
- * UP interface for hard binding.
+	 * We use ire_nexthop_ill to avoid the under ipmp
+	 * interface for source address selection. Note that for ipmp
+	 * probe packets, ixa_ifindex would have been specified, and
+	 * the ip_select_route() invocation would have picked an ire
+	 * with ire_ill pointing at an under interface.
*/
- src_ire = ire_route_lookup_v6(v6src, 0, 0, 0, NULL,
- NULL, zoneid, NULL, MATCH_IRE_ZONEONLY, ipst);
-
- /* src_ire must be a local|loopback */
- if (!IRE_IS_LOCAL(src_ire)) {
- if (ip_debug > 2) {
- /* ip1dbg */
- pr_addr_dbg("ip_bind_connected_v6: bad "
- "connected src %s\n", AF_INET6, v6src);
- }
- error = EADDRNOTAVAIL;
- goto bad_addr;
- }
+ ill = ire_nexthop_ill(ire);
/*
* If the source address is a loopback address, the
* destination had best be local or multicast.
- * The transports that can't handle multicast will reject
- * those addresses.
+ * If we are sending to an IRE_LOCAL using a loopback source then
+ * it had better be the same zoneid.
*/
- if (src_ire->ire_type == IRE_LOOPBACK &&
- !(IRE_IS_LOCAL(dst_ire) || IN6_IS_ADDR_MULTICAST(v6dst) ||
- IN6_IS_ADDR_V4MAPPED_CLASSD(v6dst))) {
- ip1dbg(("ip_bind_connected_v6: bad connected loopback\n"));
- error = -1;
- goto bad_addr;
+ if (IN6_IS_ADDR_LOOPBACK(src_addrp)) {
+ if ((ire->ire_type & IRE_LOCAL) && ire->ire_zoneid != zoneid) {
+ ire = NULL; /* Stored in ixa_ire */
+ error = EADDRNOTAVAIL;
+ goto bad_addr;
+ }
+ if (!(ire->ire_type & (IRE_LOOPBACK|IRE_LOCAL|IRE_MULTICAST))) {
+ ire = NULL; /* Stored in ixa_ire */
+ error = EADDRNOTAVAIL;
+ goto bad_addr;
+ }
}
- /*
- * Allow setting new policies. For example, disconnects come
- * down as ipa_t bind. As we would have set conn_policy_cached
- * to B_TRUE before, we should set it to B_FALSE, so that policy
- * can change after the disconnect.
- */
- connp->conn_policy_cached = B_FALSE;
/*
- * The addresses have been verified. Initialize the conn
- * before calling the policy as they expect the conns
- * initialized.
+ * Does the caller want us to pick a source address?
*/
- connp->conn_srcv6 = *v6src;
- connp->conn_remv6 = *v6dst;
- connp->conn_lport = lport;
- connp->conn_fport = fport;
-
- ASSERT(!(ipsec_policy_set && ire_requested));
- if (ire_requested) {
- iulp_t *ulp_info = NULL;
+ if (flags & IPDF_SELECT_SRC) {
+ in6_addr_t src_addr;
+
+ /* If unreachable we have no ill but need some source */
+ if (ill == NULL) {
+ src_addr = ipv6_loopback;
+ /* Make sure we look for a better source address */
+ generation = SRC_GENERATION_VERIFY;
+ } else {
+ error = ip_select_source_v6(ill, &setsrc, dst_addr,
+ zoneid, ipst, B_FALSE, ixa->ixa_src_preferences,
+ &src_addr, &generation, NULL);
+ if (error != 0) {
+ ire = NULL; /* Stored in ixa_ire */
+ goto bad_addr;
+ }
+ }
/*
- * Note that sire will not be NULL if this is an off-link
- * connection and there is not cache for that dest yet.
- *
- * XXX Because of an existing bug, if there are multiple
- * default routes, the IRE returned now may not be the actual
- * default route used (default routes are chosen in a
- * round robin fashion). So if the metrics for different
- * default routes are different, we may return the wrong
- * metrics. This will not be a problem if the existing
- * bug is fixed.
+	 * We allow the source address to be down.
+ * However, we check that we don't use the loopback address
+ * as a source when sending out on the wire.
*/
- if (sire != NULL)
- ulp_info = &(sire->ire_uinfo);
-
- if (!ip_bind_get_ire_v6(mpp, dst_ire, v6dst, ulp_info,
- ipst)) {
- error = -1;
- goto bad_addr;
- }
- } else if (ipsec_policy_set) {
- if (!ip_bind_ipsec_policy_set(connp, mp)) {
- error = -1;
+ if (IN6_IS_ADDR_LOOPBACK(&src_addr) &&
+ !(ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK|IRE_MULTICAST)) &&
+ !(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
+ ire = NULL; /* Stored in ixa_ire */
+ error = EADDRNOTAVAIL;
goto bad_addr;
}
+
+ *src_addrp = src_addr;
+ ixa->ixa_src_generation = generation;
}
/*
- * Cache IPsec policy in this conn. If we have per-socket policy,
- * we'll cache that. If we don't, we'll inherit global policy.
- *
- * We can't insert until the conn reflects the policy. Note that
- * conn_policy_cached is set by ipsec_conn_cache_policy() even for
- * connections where we don't have a policy. This is to prevent
- * global policy lookups in the inbound path.
- *
- * If we insert before we set conn_policy_cached,
- * CONN_INBOUND_POLICY_PRESENT_V6() check can still evaluate true
- * because global policy cound be non-empty. We normally call
- * ipsec_check_policy() for conn_policy_cached connections only if
- * conn_in_enforce_policy is set. But in this case,
- * conn_policy_cached can get set anytime since we made the
- * CONN_INBOUND_POLICY_PRESENT_V6() check and ipsec_check_policy()
- * is called, which will make the above assumption false. Thus, we
- * need to insert after we set conn_policy_cached.
+ * Make sure we don't leave an unreachable ixa_nce in place
+ * since ip_select_route is used when we unplumb i.e., remove
+ * references on ixa_ire, ixa_nce, and ixa_dce.
*/
- if ((error = ipsec_conn_cache_policy(connp, B_FALSE)) != 0)
- goto bad_addr;
+ nce = ixa->ixa_nce;
+ if (nce != NULL && nce->nce_is_condemned) {
+ nce_refrele(nce);
+ ixa->ixa_nce = NULL;
+ ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
+ }
- /* If not fanout_insert this was just an address verification */
- if (fanout_insert) {
- /*
- * The addresses have been verified. Time to insert in
- * the correct fanout list.
- */
- error = ipcl_conn_insert_v6(connp, protocol, v6src, v6dst,
- connp->conn_ports,
- IPCL_IS_TCP(connp) ? connp->conn_tcp->tcp_bound_if : 0);
+
+ ifindex = 0;
+ if (IN6_IS_ADDR_LINKSCOPE(dst_addr)) {
+ /* If we are creating a DCE we'd better have an ifindex */
+ if (ill != NULL)
+ ifindex = ill->ill_phyint->phyint_ifindex;
+ else
+ flags &= ~IPDF_UNIQUE_DCE;
}
- if (error == 0) {
- connp->conn_fully_bound = B_TRUE;
- /*
- * Our initial checks for MDT have passed; the IRE is not
- * LOCAL/LOOPBACK/BROADCAST, and the link layer seems to
- * be supporting MDT. Pass the IRE, IPC and ILL into
- * ip_mdinfo_return(), which performs further checks
- * against them and upon success, returns the MDT info
- * mblk which we will attach to the bind acknowledgment.
- */
- if (md_dst_ire != NULL) {
- mblk_t *mdinfo_mp;
-
- ASSERT(md_ill != NULL);
- ASSERT(md_ill->ill_mdt_capab != NULL);
- if ((mdinfo_mp = ip_mdinfo_return(md_dst_ire, connp,
- md_ill->ill_name, md_ill->ill_mdt_capab)) != NULL) {
- if (mp == NULL) {
- *mpp = mdinfo_mp;
- } else {
- linkb(mp, mdinfo_mp);
- }
- }
+
+ if (flags & IPDF_UNIQUE_DCE) {
+ /* Fallback to the default dce if allocation fails */
+ dce = dce_lookup_and_add_v6(dst_addr, ifindex, ipst);
+ if (dce != NULL) {
+ generation = dce->dce_generation;
+ } else {
+ dce = dce_lookup_v6(dst_addr, ifindex, ipst,
+ &generation);
}
+ } else {
+ dce = dce_lookup_v6(dst_addr, ifindex, ipst, &generation);
}
-bad_addr:
- if (ipsec_policy_set) {
- ASSERT(mp != NULL);
- freeb(mp);
- /*
- * As of now assume that nothing else accompanies
- * IPSEC_POLICY_SET.
- */
- *mpp = NULL;
- }
-refrele_and_quit:
- if (src_ire != NULL)
- IRE_REFRELE(src_ire);
- if (dst_ire != NULL)
- IRE_REFRELE(dst_ire);
- if (sire != NULL)
- IRE_REFRELE(sire);
- if (src_ipif != NULL)
- ipif_refrele(src_ipif);
- if (md_dst_ire != NULL)
- IRE_REFRELE(md_dst_ire);
- if (ill_held && dst_ill != NULL)
- ill_refrele(dst_ill);
- if (effective_cred != NULL)
- crfree(effective_cred);
- return (error);
-}
-
-/* ARGSUSED */
-int
-ip_proto_bind_connected_v6(conn_t *connp, mblk_t **mpp, uint8_t protocol,
- in6_addr_t *v6srcp, uint16_t lport, const in6_addr_t *v6dstp,
- ip6_pkt_t *ipp, uint16_t fport, boolean_t fanout_insert,
- boolean_t verify_dst, cred_t *cr)
-{
- int error = 0;
- boolean_t orig_pkt_isv6 = connp->conn_pkt_isv6;
- ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
-
- ASSERT(connp->conn_af_isv6);
- connp->conn_ulp = protocol;
+ ASSERT(dce != NULL);
+ if (ixa->ixa_dce != NULL)
+ dce_refrele_notr(ixa->ixa_dce);
+#ifdef DEBUG
+ dce_refhold_notr(dce);
+ dce_refrele(dce);
+#endif
+ ixa->ixa_dce = dce;
+ ixa->ixa_dce_generation = generation;
- /* For raw socket, the local port is not set. */
- lport = lport != 0 ? lport : connp->conn_lport;
+ /*
+ * Note that IPv6 multicast supports PMTU discovery unlike IPv4
+ * multicast. But pmtu discovery is only enabled for connected
+ * sockets in general.
+ */
/*
- * Bind to local and remote address. Local might be
- * unspecified in which case it will be extracted from
- * ire_src_addr_v6
+ * Set initial value for fragmentation limit. Either conn_ip_output
+	 * or ULP might update it when there are routing changes.
+ * Handles a NULL ixa_ire->ire_ill or a NULL ixa_nce for RTF_REJECT.
*/
- if (IN6_IS_ADDR_V4MAPPED(v6dstp) && !connp->conn_ipv6_v6only) {
- /* Connect to IPv4 address */
- ipaddr_t v4src;
- ipaddr_t v4dst;
-
- /* Is the source unspecified or mapped? */
- if (!IN6_IS_ADDR_V4MAPPED(v6srcp) &&
- !IN6_IS_ADDR_UNSPECIFIED(v6srcp)) {
- ip1dbg(("ip_proto_bind_connected_v6: "
- "dst is mapped, but not the src\n"));
- goto bad_addr;
- }
- IN6_V4MAPPED_TO_IPADDR(v6srcp, v4src);
- IN6_V4MAPPED_TO_IPADDR(v6dstp, v4dst);
+ pmtu = ip_get_pmtu(ixa);
+ ixa->ixa_fragsize = pmtu;
+ /* Make sure ixa_fragsize and ixa_pmtu remain identical */
+ if (ixa->ixa_flags & IXAF_VERIFY_PMTU)
+ ixa->ixa_pmtu = pmtu;
- /* Always verify destination reachability. */
- error = ip_bind_connected_v4(connp, mpp, protocol, &v4src,
- lport, v4dst, fport, B_TRUE, B_TRUE, cr);
- if (error != 0)
- goto bad_addr;
- IN6_IPADDR_TO_V4MAPPED(v4src, v6srcp);
- connp->conn_pkt_isv6 = B_FALSE;
- } else if (IN6_IS_ADDR_V4MAPPED(v6srcp)) {
- ip1dbg(("ip_proto_bind_connected_v6: "
- "src is mapped, but not the dst\n"));
- goto bad_addr;
- } else {
- error = ip_bind_connected_v6(connp, mpp, protocol, v6srcp,
- lport, v6dstp, ipp, fport, B_TRUE, verify_dst, cr);
- if (error != 0)
- goto bad_addr;
- connp->conn_pkt_isv6 = B_TRUE;
- }
+ /*
+ * Extract information useful for some transports.
+ * First we look for DCE metrics. Then we take what we have in
+ * the metrics in the route, where the offlink is used if we have
+ * one.
+ */
+ if (uinfo != NULL) {
+ bzero(uinfo, sizeof (*uinfo));
- if (orig_pkt_isv6 != connp->conn_pkt_isv6)
- ip_setpktversion(connp, connp->conn_pkt_isv6, B_TRUE, ipst);
+ if (dce->dce_flags & DCEF_UINFO)
+ *uinfo = dce->dce_uinfo;
- /* Send it home. */
- return (0);
+ rts_merge_metrics(uinfo, &ire->ire_metrics);
-bad_addr:
- if (error == 0)
- error = -TBADADDR;
- return (error);
-}
+ /* Allow ire_metrics to decrease the path MTU from above */
+ if (uinfo->iulp_mtu == 0 || uinfo->iulp_mtu > pmtu)
+ uinfo->iulp_mtu = pmtu;
-/*
- * Get the ire in *mpp. Returns false if it fails (due to lack of space).
- * Makes the IRE be IRE_BROADCAST if dst is a multicast address.
- */
-/* ARGSUSED4 */
-static boolean_t
-ip_bind_get_ire_v6(mblk_t **mpp, ire_t *ire, const in6_addr_t *dst,
- iulp_t *ulp_info, ip_stack_t *ipst)
-{
- mblk_t *mp = *mpp;
- ire_t *ret_ire;
+ uinfo->iulp_localnet = (ire->ire_type & IRE_ONLINK) != 0;
+ uinfo->iulp_loopback = (ire->ire_type & IRE_LOOPBACK) != 0;
+ uinfo->iulp_local = (ire->ire_type & IRE_LOCAL) != 0;
+ }
- ASSERT(mp != NULL);
+ if (ill != NULL)
+ ill_refrele(ill);
- if (ire != NULL) {
- /*
- * mp initialized above to IRE_DB_REQ_TYPE
- * appended mblk. Its <upper protocol>'s
- * job to make sure there is room.
- */
- if ((mp->b_datap->db_lim - mp->b_rptr) < sizeof (ire_t))
- return (B_FALSE);
+ return (error);
- mp->b_datap->db_type = IRE_DB_TYPE;
- mp->b_wptr = mp->b_rptr + sizeof (ire_t);
- bcopy(ire, mp->b_rptr, sizeof (ire_t));
- ret_ire = (ire_t *)mp->b_rptr;
- if (IN6_IS_ADDR_MULTICAST(dst) ||
- IN6_IS_ADDR_V4MAPPED_CLASSD(dst)) {
- ret_ire->ire_type = IRE_BROADCAST;
- ret_ire->ire_addr_v6 = *dst;
- }
- if (ulp_info != NULL) {
- bcopy(ulp_info, &(ret_ire->ire_uinfo),
- sizeof (iulp_t));
- }
- ret_ire->ire_mp = mp;
- } else {
- /*
- * No IRE was found. Remove IRE mblk.
- */
- *mpp = mp->b_cont;
- freeb(mp);
- }
- return (B_TRUE);
-}
+bad_addr:
+ if (ire != NULL)
+ ire_refrele(ire);
-/*
- * Add an ip6i_t header to the front of the mblk.
- * Inline if possible else allocate a separate mblk containing only the ip6i_t.
- * Returns NULL if allocation fails (and frees original message).
- * Used in outgoing path when going through ip_newroute_*v6().
- * Used in incoming path to pass ifindex to transports.
- */
-mblk_t *
-ip_add_info_v6(mblk_t *mp, ill_t *ill, const in6_addr_t *dst)
-{
- mblk_t *mp1;
- ip6i_t *ip6i;
- ip6_t *ip6h;
+ if (ill != NULL)
+ ill_refrele(ill);
- ip6h = (ip6_t *)mp->b_rptr;
- ip6i = (ip6i_t *)(mp->b_rptr - sizeof (ip6i_t));
- if ((uchar_t *)ip6i < mp->b_datap->db_base ||
- mp->b_datap->db_ref > 1) {
- mp1 = allocb(sizeof (ip6i_t), BPRI_MED);
- if (mp1 == NULL) {
- freemsg(mp);
- return (NULL);
- }
- mp1->b_wptr = mp1->b_rptr = mp1->b_datap->db_lim;
- mp1->b_cont = mp;
- mp = mp1;
- ip6i = (ip6i_t *)(mp->b_rptr - sizeof (ip6i_t));
- }
- mp->b_rptr = (uchar_t *)ip6i;
- ip6i->ip6i_vcf = ip6h->ip6_vcf;
- ip6i->ip6i_nxt = IPPROTO_RAW;
- if (ill != NULL) {
- ip6i->ip6i_flags = IP6I_IFINDEX;
- /*
- * If `ill' is in an IPMP group, make sure we use the IPMP
- * interface index so that e.g. IPV6_RECVPKTINFO will get the
- * IPMP interface index and not an underlying interface index.
- */
- if (IS_UNDER_IPMP(ill))
- ip6i->ip6i_ifindex = ipmp_ill_get_ipmp_ifindex(ill);
- else
- ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex;
- } else {
- ip6i->ip6i_flags = 0;
+ /*
+ * Make sure we don't leave an unreachable ixa_nce in place
+ * since ip_select_route is used when we unplumb i.e., remove
+ * references on ixa_ire, ixa_nce, and ixa_dce.
+ */
+ nce = ixa->ixa_nce;
+ if (nce != NULL && nce->nce_is_condemned) {
+ nce_refrele(nce);
+ ixa->ixa_nce = NULL;
+ ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
}
- ip6i->ip6i_nexthop = *dst;
- return (mp);
+
+ return (error);
}
/*
@@ -3051,53 +2280,29 @@ ip_add_info_v6(mblk_t *mp, ill_t *ill, const in6_addr_t *dst)
* of any incoming packets.
*
* Zones notes:
- * Packets will be distributed to streams in all zones. This is really only
+ * Packets will be distributed to conns in all zones. This is really only
* useful for ICMPv6 as only applications in the global zone can create raw
* sockets for other protocols.
*/
-static void
-ip_fanout_proto_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, ill_t *ill,
- ill_t *inill, uint8_t nexthdr, uint_t nexthdr_offset, uint_t flags,
- boolean_t mctl_present, zoneid_t zoneid)
+void
+ip_fanout_proto_v6(mblk_t *mp, ip6_t *ip6h, ip_recv_attr_t *ira)
{
- queue_t *rq;
- mblk_t *mp1, *first_mp1;
- in6_addr_t dst = ip6h->ip6_dst;
- in6_addr_t src = ip6h->ip6_src;
- mblk_t *first_mp = mp;
- boolean_t secure, shared_addr;
- conn_t *connp, *first_connp, *next_connp;
- connf_t *connfp;
- ip_stack_t *ipst = inill->ill_ipst;
- ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec;
-
- if (mctl_present) {
- mp = first_mp->b_cont;
- secure = ipsec_in_is_secure(first_mp);
- ASSERT(mp != NULL);
- } else {
- secure = B_FALSE;
- }
-
- shared_addr = (zoneid == ALL_ZONES);
- if (shared_addr) {
- /*
- * We don't allow multilevel ports for raw IP, so no need to
- * check for that here.
- */
- zoneid = tsol_packet_to_zoneid(mp);
- }
+ mblk_t *mp1;
+ in6_addr_t laddr = ip6h->ip6_dst;
+ conn_t *connp, *first_connp, *next_connp;
+ connf_t *connfp;
+ ill_t *ill = ira->ira_ill;
+ ip_stack_t *ipst = ill->ill_ipst;
- connfp = &ipst->ips_ipcl_proto_fanout_v6[nexthdr];
+ connfp = &ipst->ips_ipcl_proto_fanout_v6[ira->ira_protocol];
mutex_enter(&connfp->connf_lock);
connp = connfp->connf_head;
for (connp = connfp->connf_head; connp != NULL;
connp = connp->conn_next) {
- if (IPCL_PROTO_MATCH_V6(connp, nexthdr, ip6h, ill, flags,
- zoneid) &&
- (!is_system_labeled() ||
- tsol_receive_local(mp, &dst, IPV6_VERSION, shared_addr,
- connp)))
+ /* Note: IPCL_PROTO_MATCH_V6 includes conn_wantpacket */
+ if (IPCL_PROTO_MATCH_V6(connp, ira, ip6h) &&
+ (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
+ tsol_receive_local(mp, &laddr, IPV6_VERSION, ira, connp)))
break;
}
@@ -3108,96 +2313,52 @@ ip_fanout_proto_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, ill_t *ill,
* unclaimed datagrams?
*/
mutex_exit(&connfp->connf_lock);
- if (ip_fanout_send_icmp_v6(q, first_mp, flags,
- ICMP6_PARAM_PROB, ICMP6_PARAMPROB_NEXTHEADER,
- nexthdr_offset, mctl_present, zoneid, ipst)) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInUnknownProtos);
- }
-
+ ip_fanout_send_icmp_v6(mp, ICMP6_PARAM_PROB,
+ ICMP6_PARAMPROB_NEXTHEADER, ira);
return;
}
- ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_upq != NULL);
+ ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_rq != NULL);
CONN_INC_REF(connp);
first_connp = connp;
/*
* XXX: Fix the multiple protocol listeners case. We should not
- * be walking the conn->next list here.
+ * be walking the conn->conn_next list here.
*/
connp = connp->conn_next;
for (;;) {
while (connp != NULL) {
- if (IPCL_PROTO_MATCH_V6(connp, nexthdr, ip6h, ill,
- flags, zoneid) &&
- (!is_system_labeled() ||
- tsol_receive_local(mp, &dst, IPV6_VERSION,
- shared_addr, connp)))
+ /* Note: IPCL_PROTO_MATCH_V6 includes conn_wantpacket */
+ if (IPCL_PROTO_MATCH_V6(connp, ira, ip6h) &&
+ (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
+ tsol_receive_local(mp, &laddr, IPV6_VERSION,
+ ira, connp)))
break;
connp = connp->conn_next;
}
- /*
- * Just copy the data part alone. The mctl part is
- * needed just for verifying policy and it is never
- * sent up.
- */
- if (connp == NULL ||
- (((first_mp1 = dupmsg(first_mp)) == NULL) &&
- ((first_mp1 = ip_copymsg(first_mp)) == NULL))) {
- /*
- * No more intested clients or memory
- * allocation failed
- */
+ if (connp == NULL) {
+ /* No more interested clients */
+ connp = first_connp;
+ break;
+ }
+ if (((mp1 = dupmsg(mp)) == NULL) &&
+ ((mp1 = copymsg(mp)) == NULL)) {
+ /* Memory allocation failed */
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards", mp, ill);
connp = first_connp;
break;
}
- ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_rq != NULL);
- mp1 = mctl_present ? first_mp1->b_cont : first_mp1;
+
CONN_INC_REF(connp);
mutex_exit(&connfp->connf_lock);
- rq = connp->conn_rq;
- /*
- * For link-local always add ifindex so that transport can set
- * sin6_scope_id. Avoid it for ICMP error fanout.
- */
- if ((connp->conn_ip_recvpktinfo ||
- IN6_IS_ADDR_LINKLOCAL(&src)) &&
- (flags & IP_FF_IPINFO)) {
- /* Add header */
- mp1 = ip_add_info_v6(mp1, inill, &dst);
- }
- if (mp1 == NULL) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- } else if (
- (IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) ||
- (!IPCL_IS_NONSTR(connp) && !canputnext(rq))) {
- if (flags & IP_FF_RAWIP) {
- BUMP_MIB(ill->ill_ip_mib,
- rawipIfStatsInOverflows);
- } else {
- BUMP_MIB(ill->ill_icmp6_mib,
- ipv6IfIcmpInOverflows);
- }
- freemsg(mp1);
- } else {
- ASSERT(!IPCL_IS_IPTUN(connp));
+ ip_fanout_proto_conn(connp, mp1, NULL, (ip6_t *)mp1->b_rptr,
+ ira);
- if (CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss) ||
- secure) {
- first_mp1 = ipsec_check_inbound_policy(
- first_mp1, connp, NULL, ip6h, mctl_present);
- }
- if (first_mp1 != NULL) {
- if (mctl_present)
- freeb(first_mp1);
- BUMP_MIB(ill->ill_ip_mib,
- ipIfStatsHCInDelivers);
- (connp->conn_recv)(connp, mp1, NULL);
- }
- }
mutex_enter(&connfp->connf_lock);
/* Follow the next pointer before releasing the conn. */
next_connp = connp->conn_next;
@@ -3208,105 +2369,33 @@ ip_fanout_proto_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, ill_t *ill,
/* Last one. Send it upstream. */
mutex_exit(&connfp->connf_lock);
- /* Initiate IPPF processing */
- if (IP6_IN_IPP(flags, ipst)) {
- uint_t ifindex;
-
- mutex_enter(&ill->ill_lock);
- ifindex = ill->ill_phyint->phyint_ifindex;
- mutex_exit(&ill->ill_lock);
- ip_process(IPP_LOCAL_IN, &mp, ifindex);
- if (mp == NULL) {
- CONN_DEC_REF(connp);
- if (mctl_present)
- freeb(first_mp);
- return;
- }
- }
-
- /*
- * For link-local always add ifindex so that transport can set
- * sin6_scope_id. Avoid it for ICMP error fanout.
- */
- if ((connp->conn_ip_recvpktinfo || IN6_IS_ADDR_LINKLOCAL(&src)) &&
- (flags & IP_FF_IPINFO)) {
- /* Add header */
- mp = ip_add_info_v6(mp, inill, &dst);
- if (mp == NULL) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- CONN_DEC_REF(connp);
- if (mctl_present)
- freeb(first_mp);
- return;
- } else if (mctl_present) {
- first_mp->b_cont = mp;
- } else {
- first_mp = mp;
- }
- }
-
- rq = connp->conn_rq;
- if ((IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) ||
- (!IPCL_IS_NONSTR(connp) && !canputnext(rq))) {
-
- if (flags & IP_FF_RAWIP) {
- BUMP_MIB(ill->ill_ip_mib, rawipIfStatsInOverflows);
- } else {
- BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInOverflows);
- }
-
- freemsg(first_mp);
- } else {
- ASSERT(!IPCL_IS_IPTUN(connp));
+ ip_fanout_proto_conn(connp, mp, NULL, ip6h, ira);
- if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || secure) {
- first_mp = ipsec_check_inbound_policy(first_mp, connp,
- NULL, ip6h, mctl_present);
- if (first_mp == NULL) {
- CONN_DEC_REF(connp);
- return;
- }
- }
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
- (connp->conn_recv)(connp, mp, NULL);
- if (mctl_present)
- freeb(first_mp);
- }
CONN_DEC_REF(connp);
}
/*
- * Send an ICMP error after patching up the packet appropriately. Returns
- * non-zero if the appropriate MIB should be bumped; zero otherwise.
+ * Called when it is conceptually a ULP that would sent the packet
+ * e.g., port unreachable and nexthdr unknown. Check that the packet
+ * would have passed the IPsec global policy before sending the error.
+ *
+ * Send an ICMP error after patching up the packet appropriately.
+ * Uses ip_drop_input and bumps the appropriate MIB.
+ * For ICMP6_PARAMPROB_NEXTHEADER we determine the offset to use.
*/
-int
-ip_fanout_send_icmp_v6(queue_t *q, mblk_t *mp, uint_t flags,
- uint_t icmp_type, uint8_t icmp_code, uint_t nexthdr_offset,
- boolean_t mctl_present, zoneid_t zoneid, ip_stack_t *ipst)
+void
+ip_fanout_send_icmp_v6(mblk_t *mp, uint_t icmp_type, uint8_t icmp_code,
+ ip_recv_attr_t *ira)
{
- ip6_t *ip6h;
- mblk_t *first_mp;
- boolean_t secure;
- unsigned char db_type;
- ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec;
+ ip6_t *ip6h;
+ boolean_t secure;
+ ill_t *ill = ira->ira_ill;
+ ip_stack_t *ipst = ill->ill_ipst;
+ netstack_t *ns = ipst->ips_netstack;
+ ipsec_stack_t *ipss = ns->netstack_ipsec;
+
+ secure = ira->ira_flags & IRAF_IPSEC_SECURE;
- first_mp = mp;
- if (mctl_present) {
- mp = mp->b_cont;
- secure = ipsec_in_is_secure(first_mp);
- ASSERT(mp != NULL);
- } else {
- /*
- * If this is an ICMP error being reported - which goes
- * up as M_CTLs, we need to convert them to M_DATA till
- * we finish checking with global policy because
- * ipsec_check_global_policy() assumes M_DATA as clear
- * and M_CTL as secure.
- */
- db_type = mp->b_datap->db_type;
- mp->b_datap->db_type = M_DATA;
- secure = B_FALSE;
- }
/*
* We are generating an icmp error for some inbound packet.
* Called from all ip_fanout_(udp, tcp, proto) functions.
@@ -3316,572 +2405,155 @@ ip_fanout_send_icmp_v6(queue_t *q, mblk_t *mp, uint_t flags,
*/
ip6h = (ip6_t *)mp->b_rptr;
if (secure || ipss->ipsec_inbound_v6_policy_present) {
- first_mp = ipsec_check_global_policy(first_mp, NULL,
- NULL, ip6h, mctl_present, ipst->ips_netstack);
- if (first_mp == NULL)
- return (0);
- }
-
- if (!mctl_present)
- mp->b_datap->db_type = db_type;
-
- if (flags & IP_FF_SEND_ICMP) {
- if (flags & IP_FF_HDR_COMPLETE) {
- if (ip_hdr_complete_v6(ip6h, zoneid, ipst)) {
- freemsg(first_mp);
- return (1);
- }
- }
- switch (icmp_type) {
- case ICMP6_DST_UNREACH:
- icmp_unreachable_v6(WR(q), first_mp, icmp_code,
- B_FALSE, B_FALSE, zoneid, ipst);
- break;
- case ICMP6_PARAM_PROB:
- icmp_param_problem_v6(WR(q), first_mp, icmp_code,
- nexthdr_offset, B_FALSE, B_FALSE, zoneid, ipst);
- break;
- default:
-#ifdef DEBUG
- panic("ip_fanout_send_icmp_v6: wrong type");
- /*NOTREACHED*/
-#else
- freemsg(first_mp);
- break;
-#endif
- }
- } else {
- freemsg(first_mp);
- return (0);
- }
-
- return (1);
-}
-
-/*
- * Fanout for TCP packets
- * The caller puts <fport, lport> in the ports parameter.
- */
-static void
-ip_fanout_tcp_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, ill_t *ill, ill_t *inill,
- uint_t flags, uint_t hdr_len, boolean_t mctl_present, zoneid_t zoneid)
-{
- mblk_t *first_mp;
- boolean_t secure;
- conn_t *connp;
- tcph_t *tcph;
- boolean_t syn_present = B_FALSE;
- ip_stack_t *ipst = inill->ill_ipst;
- ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec;
-
- first_mp = mp;
- if (mctl_present) {
- mp = first_mp->b_cont;
- secure = ipsec_in_is_secure(first_mp);
- ASSERT(mp != NULL);
- } else {
- secure = B_FALSE;
- }
-
- connp = ipcl_classify_v6(mp, IPPROTO_TCP, hdr_len, zoneid, ipst);
-
- if (connp == NULL ||
- !conn_wantpacket_v6(connp, ill, ip6h, flags, zoneid)) {
- /*
- * No hard-bound match. Send Reset.
- */
- dblk_t *dp = mp->b_datap;
- uint32_t ill_index;
-
- ASSERT((dp->db_struioflag & STRUIO_IP) == 0);
-
- /* Initiate IPPf processing, if needed. */
- if (IPP_ENABLED(IPP_LOCAL_IN, ipst) &&
- (flags & IP6_NO_IPPOLICY)) {
- ill_index = ill->ill_phyint->phyint_ifindex;
- ip_process(IPP_LOCAL_IN, &first_mp, ill_index);
- if (first_mp == NULL) {
- if (connp != NULL)
- CONN_DEC_REF(connp);
- return;
- }
- }
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
- if (connp != NULL) {
- ip_xmit_reset_serialize(first_mp, hdr_len, zoneid,
- ipst->ips_netstack->netstack_tcp, connp);
- CONN_DEC_REF(connp);
- } else {
- tcp_xmit_listeners_reset(first_mp, hdr_len, zoneid,
- ipst->ips_netstack->netstack_tcp, NULL);
- }
-
- return;
- }
-
- tcph = (tcph_t *)&mp->b_rptr[hdr_len];
- if ((tcph->th_flags[0] & (TH_SYN|TH_ACK|TH_RST|TH_URG)) == TH_SYN) {
- if (IPCL_IS_TCP(connp)) {
- squeue_t *sqp;
-
- /*
- * If the queue belongs to a conn, and fused tcp
- * loopback is enabled, assign the eager's squeue
- * to be that of the active connect's.
- */
- if ((flags & IP_FF_LOOPBACK) && do_tcp_fusion &&
- CONN_Q(q) && IPCL_IS_TCP(Q_TO_CONN(q)) &&
- !CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss) &&
- !secure &&
- !IP6_IN_IPP(flags, ipst)) {
- ASSERT(Q_TO_CONN(q)->conn_sqp != NULL);
- sqp = Q_TO_CONN(q)->conn_sqp;
- } else {
- sqp = IP_SQUEUE_GET(lbolt);
- }
-
- mp->b_datap->db_struioflag |= STRUIO_EAGER;
- DB_CKSUMSTART(mp) = (intptr_t)sqp;
-
- /*
- * db_cksumstuff is unused in the incoming
- * path; Thus store the ifindex here. It will
- * be cleared in tcp_conn_create_v6().
- */
- DB_CKSUMSTUFF(mp) =
- (intptr_t)ill->ill_phyint->phyint_ifindex;
- syn_present = B_TRUE;
- }
- }
-
- if (IPCL_IS_TCP(connp) && IPCL_IS_BOUND(connp) && !syn_present) {
- uint_t flags = (unsigned int)tcph->th_flags[0] & 0xFF;
- if ((flags & TH_RST) || (flags & TH_URG)) {
- CONN_DEC_REF(connp);
- freemsg(first_mp);
- return;
- }
- if (flags & TH_ACK) {
- ip_xmit_reset_serialize(first_mp, hdr_len, zoneid,
- ipst->ips_netstack->netstack_tcp, connp);
- CONN_DEC_REF(connp);
+ mp = ipsec_check_global_policy(mp, NULL, NULL, ip6h, ira, ns);
+ if (mp == NULL)
return;
- }
+ }
- CONN_DEC_REF(connp);
- freemsg(first_mp);
+ /* We never send errors for protocols that we do implement */
+ if (ira->ira_protocol == IPPROTO_ICMPV6) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ip_fanout_send_icmp_v6", mp, ill);
+ freemsg(mp);
return;
}
- if (CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss) || secure) {
- first_mp = ipsec_check_inbound_policy(first_mp, connp,
- NULL, ip6h, mctl_present);
- if (first_mp == NULL) {
- CONN_DEC_REF(connp);
- return;
- }
- if (IPCL_IS_TCP(connp) && IPCL_IS_BOUND(connp)) {
- ASSERT(syn_present);
- if (mctl_present) {
- ASSERT(first_mp != mp);
- first_mp->b_datap->db_struioflag |=
- STRUIO_POLICY;
- } else {
- ASSERT(first_mp == mp);
- mp->b_datap->db_struioflag &=
- ~STRUIO_EAGER;
- mp->b_datap->db_struioflag |=
- STRUIO_POLICY;
- }
- } else {
- /*
- * Discard first_mp early since we're dealing with a
- * fully-connected conn_t and tcp doesn't do policy in
- * this case. Also, if someone is bound to IPPROTO_TCP
- * over raw IP, they don't expect to see a M_CTL.
- */
- if (mctl_present) {
- freeb(first_mp);
- mctl_present = B_FALSE;
- }
- first_mp = mp;
- }
- }
+ switch (icmp_type) {
+ case ICMP6_DST_UNREACH:
+ ASSERT(icmp_code == ICMP6_DST_UNREACH_NOPORT);
- /* Initiate IPPF processing */
- if (IP6_IN_IPP(flags, ipst)) {
- uint_t ifindex;
+ BUMP_MIB(ill->ill_ip_mib, udpIfStatsNoPorts);
+ ip_drop_input("ipIfStatsNoPorts", mp, ill);
- mutex_enter(&ill->ill_lock);
- ifindex = ill->ill_phyint->phyint_ifindex;
- mutex_exit(&ill->ill_lock);
- ip_process(IPP_LOCAL_IN, &mp, ifindex);
- if (mp == NULL) {
- CONN_DEC_REF(connp);
- if (mctl_present) {
- freeb(first_mp);
- }
- return;
- } else if (mctl_present) {
- /*
- * ip_add_info_v6 might return a new mp.
- */
- ASSERT(first_mp != mp);
- first_mp->b_cont = mp;
- } else {
- first_mp = mp;
- }
- }
+ icmp_unreachable_v6(mp, icmp_code, B_FALSE, ira);
+ break;
+ case ICMP6_PARAM_PROB:
+ ASSERT(icmp_code == ICMP6_PARAMPROB_NEXTHEADER);
- /*
- * For link-local always add ifindex so that TCP can bind to that
- * interface. Avoid it for ICMP error fanout.
- */
- if (!syn_present && ((connp->conn_ip_recvpktinfo ||
- IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src)) &&
- (flags & IP_FF_IPINFO))) {
- /* Add header */
- mp = ip_add_info_v6(mp, inill, &ip6h->ip6_dst);
- if (mp == NULL) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- CONN_DEC_REF(connp);
- if (mctl_present)
- freeb(first_mp);
- return;
- } else if (mctl_present) {
- ASSERT(first_mp != mp);
- first_mp->b_cont = mp;
- } else {
- first_mp = mp;
- }
- }
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInUnknownProtos);
+ ip_drop_input("ipIfStatsInUnknownProtos", mp, ill);
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
- if (IPCL_IS_TCP(connp)) {
- SQUEUE_ENTER_ONE(connp->conn_sqp, first_mp, connp->conn_recv,
- connp, ip_squeue_flag, SQTAG_IP6_TCP_INPUT);
- } else {
- /* SOCK_RAW, IPPROTO_TCP case */
- (connp->conn_recv)(connp, first_mp, NULL);
- CONN_DEC_REF(connp);
+ /* Let the system determine the offset for this one */
+ icmp_param_problem_nexthdr_v6(mp, B_FALSE, ira);
+ break;
+ default:
+#ifdef DEBUG
+ panic("ip_fanout_send_icmp_v6: wrong type");
+ /*NOTREACHED*/
+#else
+ freemsg(mp);
+ break;
+#endif
}
}
/*
+ * Fanout for UDP packets that are multicast or ICMP errors.
+ * (Unicast fanout is handled in ip_input_v6.)
+ *
+ * If SO_REUSEADDR is set all multicast packets
+ * will be delivered to all conns bound to the same port.
+ *
* Fanout for UDP packets.
* The caller puts <fport, lport> in the ports parameter.
* ire_type must be IRE_BROADCAST for multicast and broadcast packets.
*
* If SO_REUSEADDR is set all multicast and broadcast packets
- * will be delivered to all streams bound to the same port.
+ * will be delivered to all conns bound to the same port.
*
* Zones notes:
- * Multicast packets will be distributed to streams in all zones.
+ * Earlier in ip_input on a system with multiple shared-IP zones we
+ * duplicate the multicast and broadcast packets and send them up
+ * with each explicit zoneid that exists on that ill.
+ * This means that here we can match the zoneid with SO_ALLZONES being special.
*/
-static void
-ip_fanout_udp_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, uint32_t ports,
- ill_t *ill, ill_t *inill, uint_t flags, boolean_t mctl_present,
- zoneid_t zoneid)
+void
+ip_fanout_udp_multi_v6(mblk_t *mp, ip6_t *ip6h, uint16_t lport, uint16_t fport,
+ ip_recv_attr_t *ira)
{
- uint32_t dstport, srcport;
- in6_addr_t dst;
- mblk_t *first_mp;
- boolean_t secure;
+ in6_addr_t laddr;
conn_t *connp;
connf_t *connfp;
- conn_t *first_conn;
- conn_t *next_conn;
- mblk_t *mp1, *first_mp1;
- in6_addr_t src;
- boolean_t shared_addr;
- ip_stack_t *ipst = inill->ill_ipst;
- ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec;
-
- first_mp = mp;
- if (mctl_present) {
- mp = first_mp->b_cont;
- secure = ipsec_in_is_secure(first_mp);
- ASSERT(mp != NULL);
- } else {
- secure = B_FALSE;
- }
+ in6_addr_t faddr;
+ ill_t *ill = ira->ira_ill;
+ ip_stack_t *ipst = ill->ill_ipst;
- /* Extract ports in net byte order */
- dstport = htons(ntohl(ports) & 0xFFFF);
- srcport = htons(ntohl(ports) >> 16);
- dst = ip6h->ip6_dst;
- src = ip6h->ip6_src;
+ ASSERT(ira->ira_flags & (IRAF_MULTIBROADCAST|IRAF_ICMP_ERROR));
- shared_addr = (zoneid == ALL_ZONES);
- if (shared_addr) {
- /*
- * No need to handle exclusive-stack zones since ALL_ZONES
- * only applies to the shared stack.
- */
- zoneid = tsol_mlp_findzone(IPPROTO_UDP, dstport);
- /*
- * If no shared MLP is found, tsol_mlp_findzone returns
- * ALL_ZONES. In that case, we assume it's SLP, and
- * search for the zone based on the packet label.
- * That will also return ALL_ZONES on failure, but
- * we never allow conn_zoneid to be set to ALL_ZONES.
- */
- if (zoneid == ALL_ZONES)
- zoneid = tsol_packet_to_zoneid(mp);
- }
+ laddr = ip6h->ip6_dst;
+ faddr = ip6h->ip6_src;
/* Attempt to find a client stream based on destination port. */
- connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(dstport, ipst)];
+ connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
mutex_enter(&connfp->connf_lock);
connp = connfp->connf_head;
- if (!IN6_IS_ADDR_MULTICAST(&dst)) {
- /*
- * Not multicast. Send to the one (first) client we find.
- */
- while (connp != NULL) {
- if (IPCL_UDP_MATCH_V6(connp, dstport, dst, srcport,
- src) && IPCL_ZONE_MATCH(connp, zoneid) &&
- conn_wantpacket_v6(connp, ill, ip6h,
- flags, zoneid)) {
- break;
- }
- connp = connp->conn_next;
- }
- if (connp == NULL || connp->conn_upq == NULL)
- goto notfound;
-
- if (is_system_labeled() &&
- !tsol_receive_local(mp, &dst, IPV6_VERSION, shared_addr,
- connp))
- goto notfound;
-
- /* Found a client */
- CONN_INC_REF(connp);
- mutex_exit(&connfp->connf_lock);
-
- if ((IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) ||
- (!IPCL_IS_NONSTR(connp) && CONN_UDP_FLOWCTLD(connp))) {
- freemsg(first_mp);
- CONN_DEC_REF(connp);
- return;
- }
- if (CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss) || secure) {
- first_mp = ipsec_check_inbound_policy(first_mp,
- connp, NULL, ip6h, mctl_present);
- if (first_mp == NULL) {
- CONN_DEC_REF(connp);
- return;
- }
- }
- /* Initiate IPPF processing */
- if (IP6_IN_IPP(flags, ipst)) {
- uint_t ifindex;
-
- mutex_enter(&ill->ill_lock);
- ifindex = ill->ill_phyint->phyint_ifindex;
- mutex_exit(&ill->ill_lock);
- ip_process(IPP_LOCAL_IN, &mp, ifindex);
- if (mp == NULL) {
- CONN_DEC_REF(connp);
- if (mctl_present)
- freeb(first_mp);
- return;
- }
- }
- /*
- * For link-local always add ifindex so that
- * transport can set sin6_scope_id. Avoid it for
- * ICMP error fanout.
- */
- if ((connp->conn_ip_recvpktinfo ||
- IN6_IS_ADDR_LINKLOCAL(&src)) &&
- (flags & IP_FF_IPINFO)) {
- /* Add header */
- mp = ip_add_info_v6(mp, inill, &dst);
- if (mp == NULL) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- CONN_DEC_REF(connp);
- if (mctl_present)
- freeb(first_mp);
- return;
- } else if (mctl_present) {
- first_mp->b_cont = mp;
- } else {
- first_mp = mp;
- }
- }
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
-
- /* Send it upstream */
- (connp->conn_recv)(connp, mp, NULL);
-
- IP6_STAT(ipst, ip6_udp_fannorm);
- CONN_DEC_REF(connp);
- if (mctl_present)
- freeb(first_mp);
- return;
- }
-
while (connp != NULL) {
- if ((IPCL_UDP_MATCH_V6(connp, dstport, dst, srcport, src)) &&
- conn_wantpacket_v6(connp, ill, ip6h, flags, zoneid) &&
- (!is_system_labeled() ||
- tsol_receive_local(mp, &dst, IPV6_VERSION, shared_addr,
- connp)))
+ if ((IPCL_UDP_MATCH_V6(connp, lport, laddr, fport, faddr)) &&
+ conn_wantpacket_v6(connp, ira, ip6h) &&
+ (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
+ tsol_receive_local(mp, &laddr, IPV6_VERSION, ira, connp)))
break;
connp = connp->conn_next;
}
- if (connp == NULL || connp->conn_upq == NULL)
+ if (connp == NULL)
goto notfound;
- first_conn = connp;
-
CONN_INC_REF(connp);
- connp = connp->conn_next;
- for (;;) {
- while (connp != NULL) {
- if (IPCL_UDP_MATCH_V6(connp, dstport, dst, srcport,
- src) && conn_wantpacket_v6(connp, ill, ip6h,
- flags, zoneid) &&
- (!is_system_labeled() ||
- tsol_receive_local(mp, &dst, IPV6_VERSION,
- shared_addr, connp)))
- break;
- connp = connp->conn_next;
- }
- /*
- * Just copy the data part alone. The mctl part is
- * needed just for verifying policy and it is never
- * sent up.
- */
- if (connp == NULL ||
- (((first_mp1 = dupmsg(first_mp)) == NULL) &&
- ((first_mp1 = ip_copymsg(first_mp)) == NULL))) {
- /*
- * No more interested clients or memory
- * allocation failed
- */
- connp = first_conn;
- break;
- }
- mp1 = mctl_present ? first_mp1->b_cont : first_mp1;
- CONN_INC_REF(connp);
- mutex_exit(&connfp->connf_lock);
- /*
- * For link-local always add ifindex so that transport
- * can set sin6_scope_id. Avoid it for ICMP error
- * fanout.
- */
- if ((connp->conn_ip_recvpktinfo ||
- IN6_IS_ADDR_LINKLOCAL(&src)) &&
- (flags & IP_FF_IPINFO)) {
- /* Add header */
- mp1 = ip_add_info_v6(mp1, inill, &dst);
- }
- /* mp1 could have changed */
- if (mctl_present)
- first_mp1->b_cont = mp1;
- else
- first_mp1 = mp1;
- if (mp1 == NULL) {
- if (mctl_present)
- freeb(first_mp1);
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- goto next_one;
- }
- if ((IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) ||
- (!IPCL_IS_NONSTR(connp) && CONN_UDP_FLOWCTLD(connp))) {
- BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows);
- freemsg(first_mp1);
- goto next_one;
- }
- if (CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss) || secure) {
- first_mp1 = ipsec_check_inbound_policy
- (first_mp1, connp, NULL, ip6h,
- mctl_present);
- }
- if (first_mp1 != NULL) {
- if (mctl_present)
- freeb(first_mp1);
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
+ if (connp->conn_reuseaddr) {
+ conn_t *first_connp = connp;
+ conn_t *next_connp;
+ mblk_t *mp1;
- /* Send it upstream */
- (connp->conn_recv)(connp, mp1, NULL);
- }
-next_one:
- mutex_enter(&connfp->connf_lock);
- /* Follow the next pointer before releasing the conn. */
- next_conn = connp->conn_next;
- IP6_STAT(ipst, ip6_udp_fanmb);
- CONN_DEC_REF(connp);
- connp = next_conn;
- }
+ connp = connp->conn_next;
+ for (;;) {
+ while (connp != NULL) {
+ if (IPCL_UDP_MATCH_V6(connp, lport, laddr,
+ fport, faddr) &&
+ conn_wantpacket_v6(connp, ira, ip6h) &&
+ (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
+ tsol_receive_local(mp, &laddr, IPV6_VERSION,
+ ira, connp)))
+ break;
+ connp = connp->conn_next;
+ }
+ if (connp == NULL) {
+ /* No more interested clients */
+ connp = first_connp;
+ break;
+ }
+ if (((mp1 = dupmsg(mp)) == NULL) &&
+ ((mp1 = copymsg(mp)) == NULL)) {
+ /* Memory allocation failed */
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards", mp, ill);
+ connp = first_connp;
+ break;
+ }
- /* Last one. Send it upstream. */
- mutex_exit(&connfp->connf_lock);
+ CONN_INC_REF(connp);
+ mutex_exit(&connfp->connf_lock);
- /* Initiate IPPF processing */
- if (IP6_IN_IPP(flags, ipst)) {
- uint_t ifindex;
+ IP6_STAT(ipst, ip6_udp_fanmb);
+ ip_fanout_udp_conn(connp, mp1, NULL,
+ (ip6_t *)mp1->b_rptr, ira);
- mutex_enter(&ill->ill_lock);
- ifindex = ill->ill_phyint->phyint_ifindex;
- mutex_exit(&ill->ill_lock);
- ip_process(IPP_LOCAL_IN, &mp, ifindex);
- if (mp == NULL) {
+ mutex_enter(&connfp->connf_lock);
+ /* Follow the next pointer before releasing the conn. */
+ next_connp = connp->conn_next;
+ IP6_STAT(ipst, ip6_udp_fanmb);
CONN_DEC_REF(connp);
- if (mctl_present) {
- freeb(first_mp);
- }
- return;
+ connp = next_connp;
}
}
- /*
- * For link-local always add ifindex so that transport can set
- * sin6_scope_id. Avoid it for ICMP error fanout.
- */
- if ((connp->conn_ip_recvpktinfo ||
- IN6_IS_ADDR_LINKLOCAL(&src)) && (flags & IP_FF_IPINFO)) {
- /* Add header */
- mp = ip_add_info_v6(mp, inill, &dst);
- if (mp == NULL) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- CONN_DEC_REF(connp);
- if (mctl_present)
- freeb(first_mp);
- return;
- } else if (mctl_present) {
- first_mp->b_cont = mp;
- } else {
- first_mp = mp;
- }
- }
- if ((IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) ||
- (!IPCL_IS_NONSTR(connp) && CONN_UDP_FLOWCTLD(connp))) {
- BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows);
- freemsg(mp);
- } else {
- if (CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss) || secure) {
- first_mp = ipsec_check_inbound_policy(first_mp,
- connp, NULL, ip6h, mctl_present);
- if (first_mp == NULL) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- CONN_DEC_REF(connp);
- return;
- }
- }
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
+ /* Last one. Send it upstream. */
+ mutex_exit(&connfp->connf_lock);
- /* Send it upstream */
- (connp->conn_recv)(connp, mp, NULL);
- }
IP6_STAT(ipst, ip6_udp_fanmb);
+ ip_fanout_udp_conn(connp, mp, NULL, ip6h, ira);
CONN_DEC_REF(connp);
- if (mctl_present)
- freeb(first_mp);
return;
notfound:
@@ -3892,28 +2564,26 @@ notfound:
* unclaimed datagrams?
*/
if (ipst->ips_ipcl_proto_fanout_v6[IPPROTO_UDP].connf_head != NULL) {
- ip_fanout_proto_v6(q, first_mp, ip6h, ill, inill, IPPROTO_UDP,
- 0, flags | IP_FF_RAWIP | IP_FF_IPINFO, mctl_present,
- zoneid);
+ ASSERT(ira->ira_protocol == IPPROTO_UDP);
+ ip_fanout_proto_v6(mp, ip6h, ira);
} else {
- if (ip_fanout_send_icmp_v6(q, first_mp, flags,
- ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOPORT, 0,
- mctl_present, zoneid, ipst)) {
- BUMP_MIB(ill->ill_ip_mib, udpIfStatsNoPorts);
- }
+ ip_fanout_send_icmp_v6(mp, ICMP6_DST_UNREACH,
+ ICMP6_DST_UNREACH_NOPORT, ira);
}
}
/*
* int ip_find_hdr_v6()
*
- * This routine is used by the upper layer protocols and the IP tunnel
- * module to:
+ * This routine is used by the upper layer protocols, iptun, and IPsec:
* - Set extension header pointers to appropriate locations
* - Determine IPv6 header length and return it
* - Return a pointer to the last nexthdr value
*
* The caller must initialize ipp_fields.
+ * The upper layer protocols normally set label_separate which makes the
+ * routine put the TX label in ipp_label_v6. If this is not set then
+ * the hop-by-hop options including the label are placed in ipp_hopopts.
*
* NOTE: If multiple extension headers of the same type are present,
* ip_find_hdr_v6() will set the respective extension header pointers
@@ -3923,7 +2593,8 @@ notfound:
* malformed part.
*/
int
-ip_find_hdr_v6(mblk_t *mp, ip6_t *ip6h, ip6_pkt_t *ipp, uint8_t *nexthdrp)
+ip_find_hdr_v6(mblk_t *mp, ip6_t *ip6h, boolean_t label_separate, ip_pkt_t *ipp,
+ uint8_t *nexthdrp)
{
uint_t length, ehdrlen;
uint8_t nexthdr;
@@ -3933,6 +2604,11 @@ ip_find_hdr_v6(mblk_t *mp, ip6_t *ip6h, ip6_pkt_t *ipp, uint8_t *nexthdrp)
ip6_hbh_t *tmphopopts;
ip6_frag_t *tmpfraghdr;
+ ipp->ipp_fields |= IPPF_HOPLIMIT | IPPF_TCLASS | IPPF_ADDR;
+ ipp->ipp_hoplimit = ip6h->ip6_hops;
+ ipp->ipp_tclass = IPV6_FLOW_TCLASS(ip6h->ip6_flow);
+ ipp->ipp_addr = ip6h->ip6_dst;
+
length = IPV6_HDR_LEN;
whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */
endptr = mp->b_wptr;
@@ -3944,19 +2620,48 @@ ip_find_hdr_v6(mblk_t *mp, ip6_t *ip6h, ip6_pkt_t *ipp, uint8_t *nexthdrp)
goto done;
switch (nexthdr) {
- case IPPROTO_HOPOPTS:
+ case IPPROTO_HOPOPTS: {
+ /* We check for any CIPSO */
+ uchar_t *secopt;
+ boolean_t hbh_needed;
+ uchar_t *after_secopt;
+
tmphopopts = (ip6_hbh_t *)whereptr;
ehdrlen = 8 * (tmphopopts->ip6h_len + 1);
if ((uchar_t *)tmphopopts + ehdrlen > endptr)
goto done;
nexthdr = tmphopopts->ip6h_nxt;
+
+ if (!label_separate) {
+ secopt = NULL;
+ after_secopt = whereptr;
+ } else {
+ /*
+ * We have dropped packets with bad options in
+ * ip6_input. No need to check return value
+ * here.
+ */
+ (void) tsol_find_secopt_v6(whereptr, ehdrlen,
+ &secopt, &after_secopt, &hbh_needed);
+ }
+ if (secopt != NULL && after_secopt - whereptr > 0) {
+ ipp->ipp_fields |= IPPF_LABEL_V6;
+ ipp->ipp_label_v6 = secopt;
+ ipp->ipp_label_len_v6 = after_secopt - whereptr;
+ } else {
+ ipp->ipp_label_len_v6 = 0;
+ after_secopt = whereptr;
+ hbh_needed = B_TRUE;
+ }
/* return only 1st hbh */
- if (!(ipp->ipp_fields & IPPF_HOPOPTS)) {
+ if (hbh_needed && !(ipp->ipp_fields & IPPF_HOPOPTS)) {
ipp->ipp_fields |= IPPF_HOPOPTS;
- ipp->ipp_hopopts = tmphopopts;
- ipp->ipp_hopoptslen = ehdrlen;
+ ipp->ipp_hopopts = (ip6_hbh_t *)after_secopt;
+ ipp->ipp_hopoptslen = ehdrlen -
+ ipp->ipp_label_len_v6;
}
break;
+ }
case IPPROTO_DSTOPTS:
tmpdstopts = (ip6_dest_t *)whereptr;
ehdrlen = 8 * (tmpdstopts->ip6d_len + 1);
@@ -3993,10 +2698,10 @@ ip_find_hdr_v6(mblk_t *mp, ip6_t *ip6h, ip6_pkt_t *ipp, uint8_t *nexthdrp)
*/
if (ipp->ipp_fields & IPPF_DSTOPTS) {
ipp->ipp_fields &= ~IPPF_DSTOPTS;
- ipp->ipp_fields |= IPPF_RTDSTOPTS;
- ipp->ipp_rtdstopts = ipp->ipp_dstopts;
+ ipp->ipp_fields |= IPPF_RTHDRDSTOPTS;
+ ipp->ipp_rthdrdstopts = ipp->ipp_dstopts;
ipp->ipp_dstopts = NULL;
- ipp->ipp_rtdstoptslen = ipp->ipp_dstoptslen;
+ ipp->ipp_rthdrdstoptslen = ipp->ipp_dstoptslen;
ipp->ipp_dstoptslen = 0;
}
break;
@@ -4025,25 +2730,6 @@ done:
return (length);
}
-int
-ip_hdr_complete_v6(ip6_t *ip6h, zoneid_t zoneid, ip_stack_t *ipst)
-{
- ire_t *ire;
-
- if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src)) {
- ire = ire_lookup_local_v6(zoneid, ipst);
- if (ire == NULL) {
- ip1dbg(("ip_hdr_complete_v6: no source IRE\n"));
- return (1);
- }
- ip6h->ip6_src = ire->ire_addr_v6;
- ire_refrele(ire);
- }
- ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
- ip6h->ip6_hops = ipst->ips_ipv6_def_hops;
- return (0);
-}
-
/*
* Try to determine where and what are the IPv6 header length and
* pointer to nexthdr value for the upper layer protocol (or an
@@ -4066,7 +2752,7 @@ ip_hdr_length_nexthdr_v6(mblk_t *mp, ip6_t *ip6h, uint16_t *hdr_length_ptr,
ip6_rthdr_t *rthdr;
ip6_frag_t *fraghdr;
- ASSERT((IPH_HDR_VERSION(ip6h) & ~IP_FORWARD_PROG_BIT) == IPV6_VERSION);
+ ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION);
length = IPV6_HDR_LEN;
whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */
endptr = mp->b_wptr;
@@ -4151,1905 +2837,6 @@ ip_hdr_length_v6(mblk_t *mp, ip6_t *ip6h)
}
/*
- * IPv6 -
- * ip_newroute_v6 is called by ip_rput_data_v6 or ip_wput_v6 whenever we need
- * to send out a packet to a destination address for which we do not have
- * specific routing information.
- *
- * Handle non-multicast packets. If ill is non-NULL the match is done
- * for that ill.
- *
- * When a specific ill is specified (using IPV6_PKTINFO,
- * IPV6_MULTICAST_IF, or IPV6_BOUND_IF) we will only match
- * on routing entries (ftable and ctable) that have a matching
- * ire->ire_ipif->ipif_ill. Thus this can only be used
- * for destinations that are on-link for the specific ill
- * and that can appear on multiple links. Thus it is useful
- * for multicast destinations, link-local destinations, and
- * at some point perhaps for site-local destinations (if the
- * node sits at a site boundary).
- * We create the cache entries in the regular ctable since
- * it can not "confuse" things for other destinations.
- *
- * NOTE : These are the scopes of some of the variables that point at IRE,
- * which needs to be followed while making any future modifications
- * to avoid memory leaks.
- *
- * - ire and sire are the entries looked up initially by
- * ire_ftable_lookup_v6.
- * - ipif_ire is used to hold the interface ire associated with
- * the new cache ire. But it's scope is limited, so we always REFRELE
- * it before branching out to error paths.
- * - save_ire is initialized before ire_create, so that ire returned
- * by ire_create will not over-write the ire. We REFRELE save_ire
- * before breaking out of the switch.
- *
- * Thus on failures, we have to REFRELE only ire and sire, if they
- * are not NULL.
- */
-/* ARGSUSED */
-void
-ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp,
- const in6_addr_t *v6srcp, ill_t *ill, zoneid_t zoneid, ip_stack_t *ipst)
-{
- in6_addr_t v6gw;
- in6_addr_t dst;
- ire_t *ire = NULL;
- ipif_t *src_ipif = NULL;
- ill_t *dst_ill = NULL;
- ire_t *sire = NULL;
- ire_t *save_ire;
- ip6_t *ip6h;
- int err = 0;
- mblk_t *first_mp;
- ipsec_out_t *io;
- ushort_t ire_marks = 0;
- int match_flags;
- ire_t *first_sire = NULL;
- mblk_t *copy_mp = NULL;
- mblk_t *xmit_mp = NULL;
- in6_addr_t save_dst;
- uint32_t multirt_flags =
- MULTIRT_CACHEGW | MULTIRT_USESTAMP | MULTIRT_SETSTAMP;
- boolean_t multirt_is_resolvable;
- boolean_t multirt_resolve_next;
- boolean_t need_rele = B_FALSE;
- boolean_t ip6_asp_table_held = B_FALSE;
- tsol_ire_gw_secattr_t *attrp = NULL;
- tsol_gcgrp_t *gcgrp = NULL;
- tsol_gcgrp_addr_t ga;
-
- ASSERT(!IN6_IS_ADDR_MULTICAST(v6dstp));
-
- first_mp = mp;
- if (mp->b_datap->db_type == M_CTL) {
- mp = mp->b_cont;
- io = (ipsec_out_t *)first_mp->b_rptr;
- ASSERT(io->ipsec_out_type == IPSEC_OUT);
- } else {
- io = NULL;
- }
-
- ip6h = (ip6_t *)mp->b_rptr;
-
- if (IN6_IS_ADDR_LOOPBACK(v6dstp)) {
- ip1dbg(("ip_newroute_v6: dst with loopback addr\n"));
- goto icmp_err_ret;
- } else if (IN6_IS_ADDR_LOOPBACK(v6srcp)) {
- ip1dbg(("ip_newroute_v6: src with loopback addr\n"));
- goto icmp_err_ret;
- }
-
- /*
- * If this IRE is created for forwarding or it is not for
- * TCP traffic, mark it as temporary.
- *
- * Is it sufficient just to check the next header??
- */
- if (mp->b_prev != NULL || !IP_FLOW_CONTROLLED_ULP(ip6h->ip6_nxt))
- ire_marks |= IRE_MARK_TEMPORARY;
-
- /*
- * Get what we can from ire_ftable_lookup_v6 which will follow an IRE
- * chain until it gets the most specific information available.
- * For example, we know that there is no IRE_CACHE for this dest,
- * but there may be an IRE_OFFSUBNET which specifies a gateway.
- * ire_ftable_lookup_v6 will look up the gateway, etc.
- */
-
- if (ill == NULL) {
- match_flags = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT |
- MATCH_IRE_PARENT | MATCH_IRE_RJ_BHOLE | MATCH_IRE_SECATTR;
- ire = ire_ftable_lookup_v6(v6dstp, 0, 0, 0,
- NULL, &sire, zoneid, 0, msg_getlabel(mp),
- match_flags, ipst);
- } else {
- match_flags = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT |
- MATCH_IRE_RJ_BHOLE | MATCH_IRE_ILL;
- match_flags |= MATCH_IRE_PARENT | MATCH_IRE_SECATTR;
-
- /*
- * Because nce_xmit() calls ip_output_v6() and NCEs are always
- * tied to an underlying interface, IS_UNDER_IPMP() may be
- * true even when building IREs that will be used for data
- * traffic. As such, use the packet's source address to
- * determine whether the traffic is test traffic, and set
- * MATCH_IRE_MARK_TESTHIDDEN if so.
- */
- if (IS_UNDER_IPMP(ill) && !IN6_IS_ADDR_UNSPECIFIED(v6srcp)) {
- if (ipif_lookup_testaddr_v6(ill, v6srcp, NULL))
- match_flags |= MATCH_IRE_MARK_TESTHIDDEN;
- }
-
- ire = ire_ftable_lookup_v6(v6dstp, NULL, NULL, 0, ill->ill_ipif,
- &sire, zoneid, 0, msg_getlabel(mp), match_flags, ipst);
- }
-
- ip3dbg(("ip_newroute_v6: ire_ftable_lookup_v6() "
- "returned ire %p, sire %p\n", (void *)ire, (void *)sire));
-
- /*
- * We enter a loop that will be run only once in most cases.
- * The loop is re-entered in the case where the destination
- * can be reached through multiple RTF_MULTIRT-flagged routes.
- * The intention is to compute multiple routes to a single
- * destination in a single ip_newroute_v6 call.
- * The information is contained in sire->ire_flags.
- */
- do {
- multirt_resolve_next = B_FALSE;
-
- if (dst_ill != NULL) {
- ill_refrele(dst_ill);
- dst_ill = NULL;
- }
- if (src_ipif != NULL) {
- ipif_refrele(src_ipif);
- src_ipif = NULL;
- }
- if ((sire != NULL) && sire->ire_flags & RTF_MULTIRT) {
- ip3dbg(("ip_newroute_v6: starting new resolution "
- "with first_mp %p, tag %d\n",
- (void *)first_mp, MULTIRT_DEBUG_TAGGED(first_mp)));
-
- /*
- * We check if there are trailing unresolved routes for
- * the destination contained in sire.
- */
- multirt_is_resolvable = ire_multirt_lookup_v6(&ire,
- &sire, multirt_flags, msg_getlabel(mp), ipst);
-
- ip3dbg(("ip_newroute_v6: multirt_is_resolvable %d, "
- "ire %p, sire %p\n",
- multirt_is_resolvable, (void *)ire, (void *)sire));
-
- if (!multirt_is_resolvable) {
- /*
- * No more multirt routes to resolve; give up
- * (all routes resolved or no more resolvable
- * routes).
- */
- if (ire != NULL) {
- ire_refrele(ire);
- ire = NULL;
- }
- } else {
- ASSERT(sire != NULL);
- ASSERT(ire != NULL);
- /*
- * We simply use first_sire as a flag that
- * indicates if a resolvable multirt route has
- * already been found during the preceding
- * loops. If it is not the case, we may have
- * to send an ICMP error to report that the
- * destination is unreachable. We do not
- * IRE_REFHOLD first_sire.
- */
- if (first_sire == NULL) {
- first_sire = sire;
- }
- }
- }
- if ((ire == NULL) || (ire == sire)) {
- /*
- * either ire == NULL (the destination cannot be
- * resolved) or ire == sire (the gateway cannot be
- * resolved). At this point, there are no more routes
- * to resolve for the destination, thus we exit.
- */
- if (ip_debug > 3) {
- /* ip2dbg */
- pr_addr_dbg("ip_newroute_v6: "
- "can't resolve %s\n", AF_INET6, v6dstp);
- }
- ip3dbg(("ip_newroute_v6: "
- "ire %p, sire %p, first_sire %p\n",
- (void *)ire, (void *)sire, (void *)first_sire));
-
- if (sire != NULL) {
- ire_refrele(sire);
- sire = NULL;
- }
-
- if (first_sire != NULL) {
- /*
- * At least one multirt route has been found
- * in the same ip_newroute() call; there is no
- * need to report an ICMP error.
- * first_sire was not IRE_REFHOLDed.
- */
- MULTIRT_DEBUG_UNTAG(first_mp);
- freemsg(first_mp);
- return;
- }
- ip_rts_change_v6(RTM_MISS, v6dstp, 0, 0, 0, 0, 0, 0,
- RTA_DST, ipst);
- goto icmp_err_ret;
- }
-
- ASSERT(ire->ire_ipversion == IPV6_VERSION);
-
- /*
- * Verify that the returned IRE does not have either the
- * RTF_REJECT or RTF_BLACKHOLE flags set and that the IRE is
- * either an IRE_CACHE, IRE_IF_NORESOLVER or IRE_IF_RESOLVER.
- */
- if ((ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) ||
- (ire->ire_type & (IRE_CACHE | IRE_INTERFACE)) == 0)
- goto icmp_err_ret;
-
- /*
- * Increment the ire_ob_pkt_count field for ire if it is an
- * INTERFACE (IF_RESOLVER or IF_NORESOLVER) IRE type, and
- * increment the same for the parent IRE, sire, if it is some
- * sort of prefix IRE (which includes DEFAULT, PREFIX, and HOST)
- */
- if ((ire->ire_type & IRE_INTERFACE) != 0) {
- UPDATE_OB_PKT_COUNT(ire);
- ire->ire_last_used_time = lbolt;
- }
-
- if (sire != NULL) {
- mutex_enter(&sire->ire_lock);
- v6gw = sire->ire_gateway_addr_v6;
- mutex_exit(&sire->ire_lock);
- ASSERT((sire->ire_type & (IRE_CACHETABLE |
- IRE_INTERFACE)) == 0);
- UPDATE_OB_PKT_COUNT(sire);
- sire->ire_last_used_time = lbolt;
- } else {
- v6gw = ipv6_all_zeros;
- }
-
- /*
- * We have a route to reach the destination. Find the
- * appropriate ill, then get a source address that matches the
- * right scope via ipif_select_source_v6().
- *
- * If we are here trying to create an IRE_CACHE for an offlink
- * destination and have an IRE_CACHE entry for VNI, then use
- * ire_stq instead since VNI's queue is a black hole.
- *
- * Note: While we pick a dst_ill we are really only interested
- * in the ill for load spreading. The source ipif is
- * determined by source address selection below.
- */
- if ((ire->ire_type == IRE_CACHE) &&
- IS_VNI(ire->ire_ipif->ipif_ill)) {
- dst_ill = ire->ire_stq->q_ptr;
- ill_refhold(dst_ill);
- } else {
- ill_t *ill = ire->ire_ipif->ipif_ill;
-
- if (IS_IPMP(ill)) {
- dst_ill =
- ipmp_illgrp_hold_next_ill(ill->ill_grp);
- } else {
- dst_ill = ill;
- ill_refhold(dst_ill);
- }
- }
-
- if (dst_ill == NULL) {
- if (ip_debug > 2) {
- pr_addr_dbg("ip_newroute_v6 : no dst "
- "ill for dst %s\n", AF_INET6, v6dstp);
- }
- goto icmp_err_ret;
- }
-
- if (ill != NULL && dst_ill != ill &&
- !IS_IN_SAME_ILLGRP(dst_ill, ill)) {
- /*
- * We should have found a route matching "ill"
- * as we called ire_ftable_lookup_v6 with
- * MATCH_IRE_ILL. Rather than asserting when
- * there is a mismatch, we just drop the packet.
- */
- ip0dbg(("ip_newroute_v6: BOUND_IF failed: "
- "dst_ill %s ill %s\n", dst_ill->ill_name,
- ill->ill_name));
- goto icmp_err_ret;
- }
-
- /*
- * Pick a source address which matches the scope of the
- * destination address.
- * For RTF_SETSRC routes, the source address is imposed by the
- * parent ire (sire).
- */
- ASSERT(src_ipif == NULL);
-
- /*
- * Because nce_xmit() calls ip_output_v6() and NCEs are always
- * tied to the underlying interface, IS_UNDER_IPMP() may be
- * true even when building IREs that will be used for data
- * traffic. As such, see if the packet's source address is a
- * test address, and if so use that test address's ipif for
- * the IRE so that the logic that sets IRE_MARK_TESTHIDDEN in
- * ire_add_v6() can work properly.
- */
- if (ill != NULL && IS_UNDER_IPMP(ill))
- (void) ipif_lookup_testaddr_v6(ill, v6srcp, &src_ipif);
-
- if (src_ipif == NULL && ire->ire_type == IRE_IF_RESOLVER &&
- !IN6_IS_ADDR_UNSPECIFIED(&v6gw) &&
- ip6_asp_can_lookup(ipst)) {
- /*
- * The ire cache entry we're adding is for the
- * gateway itself. The source address in this case
- * is relative to the gateway's address.
- */
- ip6_asp_table_held = B_TRUE;
- src_ipif = ipif_select_source_v6(dst_ill, &v6gw,
- B_TRUE, IPV6_PREFER_SRC_DEFAULT, zoneid);
- if (src_ipif != NULL)
- ire_marks |= IRE_MARK_USESRC_CHECK;
- } else if (src_ipif == NULL) {
- if ((sire != NULL) && (sire->ire_flags & RTF_SETSRC)) {
- /*
- * Check that the ipif matching the requested
- * source address still exists.
- */
- src_ipif = ipif_lookup_addr_v6(
- &sire->ire_src_addr_v6, NULL, zoneid,
- NULL, NULL, NULL, NULL, ipst);
- }
- if (src_ipif == NULL && ip6_asp_can_lookup(ipst)) {
- ip6_asp_table_held = B_TRUE;
- src_ipif = ipif_select_source_v6(dst_ill,
- v6dstp, B_FALSE,
- IPV6_PREFER_SRC_DEFAULT, zoneid);
- if (src_ipif != NULL)
- ire_marks |= IRE_MARK_USESRC_CHECK;
- }
- }
-
- if (src_ipif == NULL) {
- if (ip_debug > 2) {
- /* ip1dbg */
- pr_addr_dbg("ip_newroute_v6: no src for "
- "dst %s\n", AF_INET6, v6dstp);
- printf("ip_newroute_v6: interface name %s\n",
- dst_ill->ill_name);
- }
- goto icmp_err_ret;
- }
-
- if (ip_debug > 3) {
- /* ip2dbg */
- pr_addr_dbg("ip_newroute_v6: first hop %s\n",
- AF_INET6, &v6gw);
- }
- ip2dbg(("\tire type %s (%d)\n",
- ip_nv_lookup(ire_nv_tbl, ire->ire_type), ire->ire_type));
-
- /*
- * At this point in ip_newroute_v6(), ire is either the
- * IRE_CACHE of the next-hop gateway for an off-subnet
- * destination or an IRE_INTERFACE type that should be used
- * to resolve an on-subnet destination or an on-subnet
- * next-hop gateway.
- *
- * In the IRE_CACHE case, we have the following :
- *
- * 1) src_ipif - used for getting a source address.
- *
- * 2) dst_ill - from which we derive ire_stq/ire_rfq. This
- * means packets using this IRE_CACHE will go out on dst_ill.
- *
- * 3) The IRE sire will point to the prefix that is the longest
- * matching route for the destination. These prefix types
- * include IRE_DEFAULT, IRE_PREFIX, IRE_HOST.
- *
- * The newly created IRE_CACHE entry for the off-subnet
- * destination is tied to both the prefix route and the
- * interface route used to resolve the next-hop gateway
- * via the ire_phandle and ire_ihandle fields, respectively.
- *
- * In the IRE_INTERFACE case, we have the following :
- *
- * 1) src_ipif - used for getting a source address.
- *
- * 2) dst_ill - from which we derive ire_stq/ire_rfq. This
- * means packets using the IRE_CACHE that we will build
- * here will go out on dst_ill.
- *
- * 3) sire may or may not be NULL. But, the IRE_CACHE that is
- * to be created will only be tied to the IRE_INTERFACE that
- * was derived from the ire_ihandle field.
- *
- * If sire is non-NULL, it means the destination is off-link
- * and we will first create the IRE_CACHE for the gateway.
- * Next time through ip_newroute_v6, we will create the
- * IRE_CACHE for the final destination as described above.
- */
- save_ire = ire;
- switch (ire->ire_type) {
- case IRE_CACHE: {
- ire_t *ipif_ire;
-
- ASSERT(sire != NULL);
- if (IN6_IS_ADDR_UNSPECIFIED(&v6gw)) {
- mutex_enter(&ire->ire_lock);
- v6gw = ire->ire_gateway_addr_v6;
- mutex_exit(&ire->ire_lock);
- }
- /*
- * We need 3 ire's to create a new cache ire for an
- * off-link destination from the cache ire of the
- * gateway.
- *
- * 1. The prefix ire 'sire'
- * 2. The cache ire of the gateway 'ire'
- * 3. The interface ire 'ipif_ire'
- *
- * We have (1) and (2). We lookup (3) below.
- *
- * If there is no interface route to the gateway,
- * it is a race condition, where we found the cache
- * but the inteface route has been deleted.
- */
- ipif_ire = ire_ihandle_lookup_offlink_v6(ire, sire);
- if (ipif_ire == NULL) {
- ip1dbg(("ip_newroute_v6:"
- "ire_ihandle_lookup_offlink_v6 failed\n"));
- goto icmp_err_ret;
- }
-
- /*
- * Note: the new ire inherits RTF_SETSRC
- * and RTF_MULTIRT to propagate these flags from prefix
- * to cache.
- */
-
- /*
- * Check cached gateway IRE for any security
- * attributes; if found, associate the gateway
- * credentials group to the destination IRE.
- */
- if ((attrp = save_ire->ire_gw_secattr) != NULL) {
- mutex_enter(&attrp->igsa_lock);
- if ((gcgrp = attrp->igsa_gcgrp) != NULL)
- GCGRP_REFHOLD(gcgrp);
- mutex_exit(&attrp->igsa_lock);
- }
-
- ire = ire_create_v6(
- v6dstp, /* dest address */
- &ipv6_all_ones, /* mask */
- &src_ipif->ipif_v6src_addr, /* source address */
- &v6gw, /* gateway address */
- &save_ire->ire_max_frag,
- NULL, /* src nce */
- dst_ill->ill_rq, /* recv-from queue */
- dst_ill->ill_wq, /* send-to queue */
- IRE_CACHE,
- src_ipif,
- &sire->ire_mask_v6, /* Parent mask */
- sire->ire_phandle, /* Parent handle */
- ipif_ire->ire_ihandle, /* Interface handle */
- sire->ire_flags & /* flags if any */
- (RTF_SETSRC | RTF_MULTIRT),
- &(sire->ire_uinfo),
- NULL,
- gcgrp,
- ipst);
-
- if (ire == NULL) {
- if (gcgrp != NULL) {
- GCGRP_REFRELE(gcgrp);
- gcgrp = NULL;
- }
- ire_refrele(save_ire);
- ire_refrele(ipif_ire);
- break;
- }
-
- /* reference now held by IRE */
- gcgrp = NULL;
-
- ire->ire_marks |= ire_marks;
-
- /*
- * Prevent sire and ipif_ire from getting deleted. The
- * newly created ire is tied to both of them via the
- * phandle and ihandle respectively.
- */
- IRB_REFHOLD(sire->ire_bucket);
- /* Has it been removed already ? */
- if (sire->ire_marks & IRE_MARK_CONDEMNED) {
- IRB_REFRELE(sire->ire_bucket);
- ire_refrele(ipif_ire);
- ire_refrele(save_ire);
- break;
- }
-
- IRB_REFHOLD(ipif_ire->ire_bucket);
- /* Has it been removed already ? */
- if (ipif_ire->ire_marks & IRE_MARK_CONDEMNED) {
- IRB_REFRELE(ipif_ire->ire_bucket);
- IRB_REFRELE(sire->ire_bucket);
- ire_refrele(ipif_ire);
- ire_refrele(save_ire);
- break;
- }
-
- xmit_mp = first_mp;
- if (ire->ire_flags & RTF_MULTIRT) {
- copy_mp = copymsg(first_mp);
- if (copy_mp != NULL) {
- xmit_mp = copy_mp;
- MULTIRT_DEBUG_TAG(first_mp);
- }
- }
- ire_add_then_send(q, ire, xmit_mp);
- if (ip6_asp_table_held) {
- ip6_asp_table_refrele(ipst);
- ip6_asp_table_held = B_FALSE;
- }
- ire_refrele(save_ire);
-
- /* Assert that sire is not deleted yet. */
- ASSERT(sire->ire_ptpn != NULL);
- IRB_REFRELE(sire->ire_bucket);
-
- /* Assert that ipif_ire is not deleted yet. */
- ASSERT(ipif_ire->ire_ptpn != NULL);
- IRB_REFRELE(ipif_ire->ire_bucket);
- ire_refrele(ipif_ire);
-
- if (copy_mp != NULL) {
- /*
- * Search for the next unresolved
- * multirt route.
- */
- copy_mp = NULL;
- ipif_ire = NULL;
- ire = NULL;
- /* re-enter the loop */
- multirt_resolve_next = B_TRUE;
- continue;
- }
- ire_refrele(sire);
- ill_refrele(dst_ill);
- ipif_refrele(src_ipif);
- return;
- }
- case IRE_IF_NORESOLVER:
- /*
- * We have what we need to build an IRE_CACHE.
- *
- * handle the Gated case, where we create
- * a NORESOLVER route for loopback.
- */
- if (dst_ill->ill_net_type != IRE_IF_NORESOLVER)
- break;
- /*
- * TSol note: We are creating the ire cache for the
- * destination 'dst'. If 'dst' is offlink, going
- * through the first hop 'gw', the security attributes
- * of 'dst' must be set to point to the gateway
- * credentials of gateway 'gw'. If 'dst' is onlink, it
- * is possible that 'dst' is a potential gateway that is
- * referenced by some route that has some security
- * attributes. Thus in the former case, we need to do a
- * gcgrp_lookup of 'gw' while in the latter case we
- * need to do gcgrp_lookup of 'dst' itself.
- */
- ga.ga_af = AF_INET6;
- if (!IN6_IS_ADDR_UNSPECIFIED(&v6gw))
- ga.ga_addr = v6gw;
- else
- ga.ga_addr = *v6dstp;
- gcgrp = gcgrp_lookup(&ga, B_FALSE);
-
- /*
- * Note: the new ire inherits sire flags RTF_SETSRC
- * and RTF_MULTIRT to propagate those rules from prefix
- * to cache.
- */
- ire = ire_create_v6(
- v6dstp, /* dest address */
- &ipv6_all_ones, /* mask */
- &src_ipif->ipif_v6src_addr, /* source address */
- &v6gw, /* gateway address */
- &save_ire->ire_max_frag,
- NULL, /* no src nce */
- dst_ill->ill_rq, /* recv-from queue */
- dst_ill->ill_wq, /* send-to queue */
- IRE_CACHE,
- src_ipif,
- &save_ire->ire_mask_v6, /* Parent mask */
- (sire != NULL) ? /* Parent handle */
- sire->ire_phandle : 0,
- save_ire->ire_ihandle, /* Interface handle */
- (sire != NULL) ? /* flags if any */
- sire->ire_flags &
- (RTF_SETSRC | RTF_MULTIRT) : 0,
- &(save_ire->ire_uinfo),
- NULL,
- gcgrp,
- ipst);
-
- if (ire == NULL) {
- if (gcgrp != NULL) {
- GCGRP_REFRELE(gcgrp);
- gcgrp = NULL;
- }
- ire_refrele(save_ire);
- break;
- }
-
- /* reference now held by IRE */
- gcgrp = NULL;
-
- ire->ire_marks |= ire_marks;
-
- if (!IN6_IS_ADDR_UNSPECIFIED(&v6gw))
- dst = v6gw;
- else
- dst = *v6dstp;
- err = ndp_noresolver(dst_ill, &dst);
- if (err != 0) {
- ire_refrele(save_ire);
- break;
- }
-
- /* Prevent save_ire from getting deleted */
- IRB_REFHOLD(save_ire->ire_bucket);
- /* Has it been removed already ? */
- if (save_ire->ire_marks & IRE_MARK_CONDEMNED) {
- IRB_REFRELE(save_ire->ire_bucket);
- ire_refrele(save_ire);
- break;
- }
-
- xmit_mp = first_mp;
- /*
- * In case of MULTIRT, a copy of the current packet
- * to send is made to further re-enter the
- * loop and attempt another route resolution
- */
- if ((sire != NULL) && sire->ire_flags & RTF_MULTIRT) {
- copy_mp = copymsg(first_mp);
- if (copy_mp != NULL) {
- xmit_mp = copy_mp;
- MULTIRT_DEBUG_TAG(first_mp);
- }
- }
- ire_add_then_send(q, ire, xmit_mp);
- if (ip6_asp_table_held) {
- ip6_asp_table_refrele(ipst);
- ip6_asp_table_held = B_FALSE;
- }
-
- /* Assert that it is not deleted yet. */
- ASSERT(save_ire->ire_ptpn != NULL);
- IRB_REFRELE(save_ire->ire_bucket);
- ire_refrele(save_ire);
-
- if (copy_mp != NULL) {
- /*
- * If we found a (no)resolver, we ignore any
- * trailing top priority IRE_CACHE in
- * further loops. This ensures that we do not
- * omit any (no)resolver despite the priority
- * in this call.
- * IRE_CACHE, if any, will be processed
- * by another thread entering ip_newroute(),
- * (on resolver response, for example).
- * We use this to force multiple parallel
- * resolution as soon as a packet needs to be
- * sent. The result is, after one packet
- * emission all reachable routes are generally
- * resolved.
- * Otherwise, complete resolution of MULTIRT
- * routes would require several emissions as
- * side effect.
- */
- multirt_flags &= ~MULTIRT_CACHEGW;
-
- /*
- * Search for the next unresolved multirt
- * route.
- */
- copy_mp = NULL;
- save_ire = NULL;
- ire = NULL;
- /* re-enter the loop */
- multirt_resolve_next = B_TRUE;
- continue;
- }
-
- /* Don't need sire anymore */
- if (sire != NULL)
- ire_refrele(sire);
- ill_refrele(dst_ill);
- ipif_refrele(src_ipif);
- return;
-
- case IRE_IF_RESOLVER:
- /*
- * We can't build an IRE_CACHE yet, but at least we
- * found a resolver that can help.
- */
- dst = *v6dstp;
-
- /*
- * To be at this point in the code with a non-zero gw
- * means that dst is reachable through a gateway that
- * we have never resolved. By changing dst to the gw
- * addr we resolve the gateway first. When
- * ire_add_then_send() tries to put the IP dg to dst,
- * it will reenter ip_newroute() at which time we will
- * find the IRE_CACHE for the gw and create another
- * IRE_CACHE above (for dst itself).
- */
- if (!IN6_IS_ADDR_UNSPECIFIED(&v6gw)) {
- save_dst = dst;
- dst = v6gw;
- v6gw = ipv6_all_zeros;
- }
- if (dst_ill->ill_flags & ILLF_XRESOLV) {
- /*
- * Ask the external resolver to do its thing.
- * Make an mblk chain in the following form:
- * ARQ_REQ_MBLK-->IRE_MBLK-->packet
- */
- mblk_t *ire_mp;
- mblk_t *areq_mp;
- areq_t *areq;
- in6_addr_t *addrp;
-
- ip1dbg(("ip_newroute_v6:ILLF_XRESOLV\n"));
- if (ip6_asp_table_held) {
- ip6_asp_table_refrele(ipst);
- ip6_asp_table_held = B_FALSE;
- }
- ire = ire_create_mp_v6(
- &dst, /* dest address */
- &ipv6_all_ones, /* mask */
- &src_ipif->ipif_v6src_addr,
- /* source address */
- &v6gw, /* gateway address */
- NULL, /* no src nce */
- dst_ill->ill_rq, /* recv-from queue */
- dst_ill->ill_wq, /* send-to queue */
- IRE_CACHE,
- src_ipif,
- &save_ire->ire_mask_v6, /* Parent mask */
- 0,
- save_ire->ire_ihandle,
- /* Interface handle */
- 0, /* flags if any */
- &(save_ire->ire_uinfo),
- NULL,
- NULL,
- ipst);
-
- ire_refrele(save_ire);
- if (ire == NULL) {
- ip1dbg(("ip_newroute_v6:"
- "ire is NULL\n"));
- break;
- }
-
- if ((sire != NULL) &&
- (sire->ire_flags & RTF_MULTIRT)) {
- /*
- * processing a copy of the packet to
- * send for further resolution loops
- */
- copy_mp = copymsg(first_mp);
- if (copy_mp != NULL)
- MULTIRT_DEBUG_TAG(copy_mp);
- }
- ire->ire_marks |= ire_marks;
- ire_mp = ire->ire_mp;
- /*
- * Now create or find an nce for this interface.
- * The hw addr will need to to be set from
- * the reply to the AR_ENTRY_QUERY that
- * we're about to send. This will be done in
- * ire_add_v6().
- */
- err = ndp_resolver(dst_ill, &dst, mp, zoneid);
- switch (err) {
- case 0:
- /*
- * New cache entry created.
- * Break, then ask the external
- * resolver.
- */
- break;
- case EINPROGRESS:
- /*
- * Resolution in progress;
- * packet has been queued by
- * ndp_resolver().
- */
- ire_delete(ire);
- ire = NULL;
- /*
- * Check if another multirt
- * route must be resolved.
- */
- if (copy_mp != NULL) {
- /*
- * If we found a resolver, we
- * ignore any trailing top
- * priority IRE_CACHE in
- * further loops. The reason is
- * the same as for noresolver.
- */
- multirt_flags &=
- ~MULTIRT_CACHEGW;
- /*
- * Search for the next
- * unresolved multirt route.
- */
- first_mp = copy_mp;
- copy_mp = NULL;
- mp = first_mp;
- if (mp->b_datap->db_type ==
- M_CTL) {
- mp = mp->b_cont;
- }
- ASSERT(sire != NULL);
- dst = save_dst;
- /*
- * re-enter the loop
- */
- multirt_resolve_next =
- B_TRUE;
- continue;
- }
-
- if (sire != NULL)
- ire_refrele(sire);
- ill_refrele(dst_ill);
- ipif_refrele(src_ipif);
- return;
- default:
- /*
- * Transient error; packet will be
- * freed.
- */
- ire_delete(ire);
- ire = NULL;
- break;
- }
- if (err != 0)
- break;
- /*
- * Now set up the AR_ENTRY_QUERY and send it.
- */
- areq_mp = ill_arp_alloc(dst_ill,
- (uchar_t *)&ipv6_areq_template,
- (caddr_t)&dst);
- if (areq_mp == NULL) {
- ip1dbg(("ip_newroute_v6:"
- "areq_mp is NULL\n"));
- freemsg(ire_mp);
- break;
- }
- areq = (areq_t *)areq_mp->b_rptr;
- addrp = (in6_addr_t *)((char *)areq +
- areq->areq_target_addr_offset);
- *addrp = dst;
- addrp = (in6_addr_t *)((char *)areq +
- areq->areq_sender_addr_offset);
- *addrp = src_ipif->ipif_v6src_addr;
- /*
- * link the chain, then send up to the resolver.
- */
- linkb(areq_mp, ire_mp);
- linkb(areq_mp, mp);
- ip1dbg(("ip_newroute_v6:"
- "putnext to resolver\n"));
- putnext(dst_ill->ill_rq, areq_mp);
- /*
- * Check if another multirt route
- * must be resolved.
- */
- ire = NULL;
- if (copy_mp != NULL) {
- /*
- * If we find a resolver, we ignore any
- * trailing top priority IRE_CACHE in
- * further loops. The reason is the
- * same as for noresolver.
- */
- multirt_flags &= ~MULTIRT_CACHEGW;
- /*
- * Search for the next unresolved
- * multirt route.
- */
- first_mp = copy_mp;
- copy_mp = NULL;
- mp = first_mp;
- if (mp->b_datap->db_type == M_CTL) {
- mp = mp->b_cont;
- }
- ASSERT(sire != NULL);
- dst = save_dst;
- /*
- * re-enter the loop
- */
- multirt_resolve_next = B_TRUE;
- continue;
- }
-
- if (sire != NULL)
- ire_refrele(sire);
- ill_refrele(dst_ill);
- ipif_refrele(src_ipif);
- return;
- }
- /*
- * Non-external resolver case.
- *
- * TSol note: Please see the note above the
- * IRE_IF_NORESOLVER case.
- */
- ga.ga_af = AF_INET6;
- ga.ga_addr = dst;
- gcgrp = gcgrp_lookup(&ga, B_FALSE);
-
- ire = ire_create_v6(
- &dst, /* dest address */
- &ipv6_all_ones, /* mask */
- &src_ipif->ipif_v6src_addr, /* source address */
- &v6gw, /* gateway address */
- &save_ire->ire_max_frag,
- NULL, /* no src nce */
- dst_ill->ill_rq, /* recv-from queue */
- dst_ill->ill_wq, /* send-to queue */
- IRE_CACHE,
- src_ipif,
- &save_ire->ire_mask_v6, /* Parent mask */
- 0,
- save_ire->ire_ihandle, /* Interface handle */
- 0, /* flags if any */
- &(save_ire->ire_uinfo),
- NULL,
- gcgrp,
- ipst);
-
- if (ire == NULL) {
- if (gcgrp != NULL) {
- GCGRP_REFRELE(gcgrp);
- gcgrp = NULL;
- }
- ire_refrele(save_ire);
- break;
- }
-
- /* reference now held by IRE */
- gcgrp = NULL;
-
- if ((sire != NULL) &&
- (sire->ire_flags & RTF_MULTIRT)) {
- copy_mp = copymsg(first_mp);
- if (copy_mp != NULL)
- MULTIRT_DEBUG_TAG(copy_mp);
- }
-
- ire->ire_marks |= ire_marks;
- err = ndp_resolver(dst_ill, &dst, first_mp, zoneid);
- switch (err) {
- case 0:
- /* Prevent save_ire from getting deleted */
- IRB_REFHOLD(save_ire->ire_bucket);
- /* Has it been removed already ? */
- if (save_ire->ire_marks & IRE_MARK_CONDEMNED) {
- IRB_REFRELE(save_ire->ire_bucket);
- ire_refrele(save_ire);
- break;
- }
-
- /*
- * We have a resolved cache entry,
- * add in the IRE.
- */
- ire_add_then_send(q, ire, first_mp);
- if (ip6_asp_table_held) {
- ip6_asp_table_refrele(ipst);
- ip6_asp_table_held = B_FALSE;
- }
-
- /* Assert that it is not deleted yet. */
- ASSERT(save_ire->ire_ptpn != NULL);
- IRB_REFRELE(save_ire->ire_bucket);
- ire_refrele(save_ire);
- /*
- * Check if another multirt route
- * must be resolved.
- */
- ire = NULL;
- if (copy_mp != NULL) {
- /*
- * If we find a resolver, we ignore any
- * trailing top priority IRE_CACHE in
- * further loops. The reason is the
- * same as for noresolver.
- */
- multirt_flags &= ~MULTIRT_CACHEGW;
- /*
- * Search for the next unresolved
- * multirt route.
- */
- first_mp = copy_mp;
- copy_mp = NULL;
- mp = first_mp;
- if (mp->b_datap->db_type == M_CTL) {
- mp = mp->b_cont;
- }
- ASSERT(sire != NULL);
- dst = save_dst;
- /*
- * re-enter the loop
- */
- multirt_resolve_next = B_TRUE;
- continue;
- }
-
- if (sire != NULL)
- ire_refrele(sire);
- ill_refrele(dst_ill);
- ipif_refrele(src_ipif);
- return;
-
- case EINPROGRESS:
- /*
- * mp was consumed - presumably queued.
- * No need for ire, presumably resolution is
- * in progress, and ire will be added when the
- * address is resolved.
- */
- if (ip6_asp_table_held) {
- ip6_asp_table_refrele(ipst);
- ip6_asp_table_held = B_FALSE;
- }
- ASSERT(ire->ire_nce == NULL);
- ire_delete(ire);
- ire_refrele(save_ire);
- /*
- * Check if another multirt route
- * must be resolved.
- */
- ire = NULL;
- if (copy_mp != NULL) {
- /*
- * If we find a resolver, we ignore any
- * trailing top priority IRE_CACHE in
- * further loops. The reason is the
- * same as for noresolver.
- */
- multirt_flags &= ~MULTIRT_CACHEGW;
- /*
- * Search for the next unresolved
- * multirt route.
- */
- first_mp = copy_mp;
- copy_mp = NULL;
- mp = first_mp;
- if (mp->b_datap->db_type == M_CTL) {
- mp = mp->b_cont;
- }
- ASSERT(sire != NULL);
- dst = save_dst;
- /*
- * re-enter the loop
- */
- multirt_resolve_next = B_TRUE;
- continue;
- }
- if (sire != NULL)
- ire_refrele(sire);
- ill_refrele(dst_ill);
- ipif_refrele(src_ipif);
- return;
- default:
- /* Some transient error */
- ASSERT(ire->ire_nce == NULL);
- ire_refrele(save_ire);
- break;
- }
- break;
- default:
- break;
- }
- if (ip6_asp_table_held) {
- ip6_asp_table_refrele(ipst);
- ip6_asp_table_held = B_FALSE;
- }
- } while (multirt_resolve_next);
-
-err_ret:
- ip1dbg(("ip_newroute_v6: dropped\n"));
- if (src_ipif != NULL)
- ipif_refrele(src_ipif);
- if (dst_ill != NULL) {
- need_rele = B_TRUE;
- ill = dst_ill;
- }
- if (ill != NULL) {
- if (mp->b_prev != NULL) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- } else {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
- }
-
- if (need_rele)
- ill_refrele(ill);
- } else {
- if (mp->b_prev != NULL) {
- BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsInDiscards);
- } else {
- BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
- }
- }
- /* Did this packet originate externally? */
- if (mp->b_prev) {
- mp->b_next = NULL;
- mp->b_prev = NULL;
- }
- if (copy_mp != NULL) {
- MULTIRT_DEBUG_UNTAG(copy_mp);
- freemsg(copy_mp);
- }
- MULTIRT_DEBUG_UNTAG(first_mp);
- freemsg(first_mp);
- if (ire != NULL)
- ire_refrele(ire);
- if (sire != NULL)
- ire_refrele(sire);
- return;
-
-icmp_err_ret:
- if (ip6_asp_table_held)
- ip6_asp_table_refrele(ipst);
- if (src_ipif != NULL)
- ipif_refrele(src_ipif);
- if (dst_ill != NULL) {
- need_rele = B_TRUE;
- ill = dst_ill;
- }
- ip1dbg(("ip_newroute_v6: no route\n"));
- if (sire != NULL)
- ire_refrele(sire);
- /*
- * We need to set sire to NULL to avoid double freeing if we
- * ever goto err_ret from below.
- */
- sire = NULL;
- ip6h = (ip6_t *)mp->b_rptr;
- /* Skip ip6i_t header if present */
- if (ip6h->ip6_nxt == IPPROTO_RAW) {
- /* Make sure the IPv6 header is present */
- if ((mp->b_wptr - (uchar_t *)ip6h) <
- sizeof (ip6i_t) + IPV6_HDR_LEN) {
- if (!pullupmsg(mp, sizeof (ip6i_t) + IPV6_HDR_LEN)) {
- ip1dbg(("ip_newroute_v6: pullupmsg failed\n"));
- goto err_ret;
- }
- }
- mp->b_rptr += sizeof (ip6i_t);
- ip6h = (ip6_t *)mp->b_rptr;
- }
- /* Did this packet originate externally? */
- if (mp->b_prev) {
- if (ill != NULL) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInNoRoutes);
- } else {
- BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsInNoRoutes);
- }
- mp->b_next = NULL;
- mp->b_prev = NULL;
- q = WR(q);
- } else {
- if (ill != NULL) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutNoRoutes);
- } else {
- BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutNoRoutes);
- }
- if (ip_hdr_complete_v6(ip6h, zoneid, ipst)) {
- /* Failed */
- if (copy_mp != NULL) {
- MULTIRT_DEBUG_UNTAG(copy_mp);
- freemsg(copy_mp);
- }
- MULTIRT_DEBUG_UNTAG(first_mp);
- freemsg(first_mp);
- if (ire != NULL)
- ire_refrele(ire);
- if (need_rele)
- ill_refrele(ill);
- return;
- }
- }
-
- if (need_rele)
- ill_refrele(ill);
-
- /*
- * At this point we will have ire only if RTF_BLACKHOLE
- * or RTF_REJECT flags are set on the IRE. It will not
- * generate ICMP6_DST_UNREACH_NOROUTE if RTF_BLACKHOLE is set.
- */
- if (ire != NULL) {
- if (ire->ire_flags & RTF_BLACKHOLE) {
- ire_refrele(ire);
- if (copy_mp != NULL) {
- MULTIRT_DEBUG_UNTAG(copy_mp);
- freemsg(copy_mp);
- }
- MULTIRT_DEBUG_UNTAG(first_mp);
- freemsg(first_mp);
- return;
- }
- ire_refrele(ire);
- }
- if (ip_debug > 3) {
- /* ip2dbg */
- pr_addr_dbg("ip_newroute_v6: no route to %s\n",
- AF_INET6, v6dstp);
- }
- icmp_unreachable_v6(WR(q), first_mp, ICMP6_DST_UNREACH_NOROUTE,
- B_FALSE, B_FALSE, zoneid, ipst);
-}
-
-/*
- * ip_newroute_ipif_v6 is called by ip_wput_v6 and ip_wput_ipsec_out_v6 whenever
- * we need to send out a packet to a destination address for which we do not
- * have specific routing information. It is only used for multicast packets.
- *
- * If unspec_src we allow creating an IRE with source address zero.
- * ire_send_v6() will delete it after the packet is sent.
- */
-void
-ip_newroute_ipif_v6(queue_t *q, mblk_t *mp, ipif_t *ipif,
- const in6_addr_t *v6dstp, const in6_addr_t *v6srcp, int unspec_src,
- zoneid_t zoneid)
-{
- ire_t *ire = NULL;
- ipif_t *src_ipif = NULL;
- int err = 0;
- ill_t *dst_ill = NULL;
- ire_t *save_ire;
- ipsec_out_t *io;
- ill_t *ill;
- mblk_t *first_mp;
- ire_t *fire = NULL;
- mblk_t *copy_mp = NULL;
- const in6_addr_t *ire_v6srcp;
- boolean_t probe = B_FALSE;
- boolean_t multirt_resolve_next;
- boolean_t ipif_held = B_FALSE;
- boolean_t ill_held = B_FALSE;
- boolean_t ip6_asp_table_held = B_FALSE;
- ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
-
- /*
- * This loop is run only once in most cases.
- * We loop to resolve further routes only when the destination
- * can be reached through multiple RTF_MULTIRT-flagged ires.
- */
- do {
- multirt_resolve_next = B_FALSE;
- if (dst_ill != NULL) {
- ill_refrele(dst_ill);
- dst_ill = NULL;
- }
-
- if (src_ipif != NULL) {
- ipif_refrele(src_ipif);
- src_ipif = NULL;
- }
- ASSERT(ipif != NULL);
- ill = ipif->ipif_ill;
-
- ASSERT(!IN6_IS_ADDR_V4MAPPED(v6dstp));
- if (ip_debug > 2) {
- /* ip1dbg */
- pr_addr_dbg("ip_newroute_ipif_v6: v6dst %s\n",
- AF_INET6, v6dstp);
- printf("ip_newroute_ipif_v6: if %s, v6 %d\n",
- ill->ill_name, ipif->ipif_isv6);
- }
-
- first_mp = mp;
- if (mp->b_datap->db_type == M_CTL) {
- mp = mp->b_cont;
- io = (ipsec_out_t *)first_mp->b_rptr;
- ASSERT(io->ipsec_out_type == IPSEC_OUT);
- } else {
- io = NULL;
- }
-
- /*
- * If the interface is a pt-pt interface we look for an
- * IRE_IF_RESOLVER or IRE_IF_NORESOLVER that matches both the
- * local_address and the pt-pt destination address.
- * Otherwise we just match the local address.
- */
- if (!(ill->ill_flags & ILLF_MULTICAST)) {
- goto err_ret;
- }
-
- /*
- * We check if an IRE_OFFSUBNET for the addr that goes through
- * ipif exists. We need it to determine if the RTF_SETSRC and/or
- * RTF_MULTIRT flags must be honored.
- */
- fire = ipif_lookup_multi_ire_v6(ipif, v6dstp);
- ip2dbg(("ip_newroute_ipif_v6: "
- "ipif_lookup_multi_ire_v6("
- "ipif %p, dst %08x) = fire %p\n",
- (void *)ipif, ntohl(V4_PART_OF_V6((*v6dstp))),
- (void *)fire));
-
- ASSERT(src_ipif == NULL);
-
- /*
- * Because nce_xmit() calls ip_output_v6() and NCEs are always
- * tied to the underlying interface, IS_UNDER_IPMP() may be
- * true even when building IREs that will be used for data
- * traffic. As such, see if the packet's source address is a
- * test address, and if so use that test address's ipif for
- * the IRE so that the logic that sets IRE_MARK_TESTHIDDEN in
- * ire_add_v6() can work properly.
- */
- if (IS_UNDER_IPMP(ill))
- probe = ipif_lookup_testaddr_v6(ill, v6srcp, &src_ipif);
-
- /*
- * Determine the outbound (destination) ill for this route.
- * If IPMP is not in use, that's the same as our ill. If IPMP
- * is in-use and we're on the IPMP interface, or we're on an
- * underlying ill but sending data traffic, use a suitable
- * destination ill from the group. The latter case covers a
- * subtle edge condition with multicast: when we bring up an
- * IPv6 data address, we will create an NCE on an underlying
- * interface, and send solitications to ff02::1, which would
- * take us through here, and cause us to create an IRE for
- * ff02::1. To meet our defined semantics for multicast (and
- * ensure there aren't unexpected echoes), that IRE needs to
- * use the IPMP group's nominated multicast interface.
- *
- * Note: the source ipif is determined by source address
- * selection later.
- */
- if (IS_IPMP(ill) || (IS_UNDER_IPMP(ill) && !probe)) {
- ill_t *ipmp_ill;
- ipmp_illgrp_t *illg;
-
- if (IS_UNDER_IPMP(ill)) {
- ipmp_ill = ipmp_ill_hold_ipmp_ill(ill);
- } else {
- ipmp_ill = ill;
- ill_refhold(ipmp_ill); /* for symmetry */
- }
-
- if (ipmp_ill == NULL)
- goto err_ret;
-
- illg = ipmp_ill->ill_grp;
- if (IN6_IS_ADDR_MULTICAST(v6dstp))
- dst_ill = ipmp_illgrp_hold_cast_ill(illg);
- else
- dst_ill = ipmp_illgrp_hold_next_ill(illg);
-
- ill_refrele(ipmp_ill);
- } else {
- dst_ill = ill;
- ill_refhold(dst_ill); /* for symmetry */
- }
-
- if (dst_ill == NULL) {
- if (ip_debug > 2) {
- pr_addr_dbg("ip_newroute_ipif_v6: "
- "no dst ill for dst %s\n",
- AF_INET6, v6dstp);
- }
- goto err_ret;
- }
-
- /*
- * Pick a source address which matches the scope of the
- * destination address.
- * For RTF_SETSRC routes, the source address is imposed by the
- * parent ire (fire).
- */
-
- if (src_ipif == NULL && fire != NULL &&
- (fire->ire_flags & RTF_SETSRC)) {
- /*
- * Check that the ipif matching the requested source
- * address still exists.
- */
- src_ipif = ipif_lookup_addr_v6(&fire->ire_src_addr_v6,
- NULL, zoneid, NULL, NULL, NULL, NULL, ipst);
- }
-
- if (src_ipif == NULL && ip6_asp_can_lookup(ipst)) {
- ip6_asp_table_held = B_TRUE;
- src_ipif = ipif_select_source_v6(dst_ill, v6dstp,
- B_FALSE, IPV6_PREFER_SRC_DEFAULT, zoneid);
- }
-
- if (src_ipif == NULL) {
- if (!unspec_src) {
- if (ip_debug > 2) {
- /* ip1dbg */
- pr_addr_dbg("ip_newroute_ipif_v6: "
- "no src for dst %s\n",
- AF_INET6, v6dstp);
- printf(" through interface %s\n",
- dst_ill->ill_name);
- }
- goto err_ret;
- }
- ire_v6srcp = &ipv6_all_zeros;
- src_ipif = ipif;
- ipif_refhold(src_ipif);
- } else {
- ire_v6srcp = &src_ipif->ipif_v6src_addr;
- }
-
- ire = ipif_to_ire_v6(ipif);
- if (ire == NULL) {
- if (ip_debug > 2) {
- /* ip1dbg */
- pr_addr_dbg("ip_newroute_ipif_v6: v6src %s\n",
- AF_INET6, &ipif->ipif_v6lcl_addr);
- printf("ip_newroute_ipif_v6: "
- "if %s\n", dst_ill->ill_name);
- }
- goto err_ret;
- }
- if (ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE))
- goto err_ret;
-
- ASSERT(ire->ire_ipversion == IPV6_VERSION);
-
- ip1dbg(("ip_newroute_ipif_v6: interface type %s (%d),",
- ip_nv_lookup(ire_nv_tbl, ire->ire_type), ire->ire_type));
- if (ip_debug > 2) {
- /* ip1dbg */
- pr_addr_dbg(" address %s\n",
- AF_INET6, &ire->ire_src_addr_v6);
- }
- save_ire = ire;
- ip2dbg(("ip_newroute_ipif: ire %p, ipif %p\n",
- (void *)ire, (void *)ipif));
-
- if ((fire != NULL) && (fire->ire_flags & RTF_MULTIRT)) {
- /*
- * an IRE_OFFSUBET was looked up
- * on that interface.
- * this ire has RTF_MULTIRT flag,
- * so the resolution loop
- * will be re-entered to resolve
- * additional routes on other
- * interfaces. For that purpose,
- * a copy of the packet is
- * made at this point.
- */
- fire->ire_last_used_time = lbolt;
- copy_mp = copymsg(first_mp);
- if (copy_mp) {
- MULTIRT_DEBUG_TAG(copy_mp);
- }
- }
-
- switch (ire->ire_type) {
- case IRE_IF_NORESOLVER: {
- /*
- * We have what we need to build an IRE_CACHE.
- *
- * handle the Gated case, where we create
- * a NORESOLVER route for loopback.
- */
- if (dst_ill->ill_net_type != IRE_IF_NORESOLVER)
- break;
- /*
- * The newly created ire will inherit the flags of the
- * parent ire, if any.
- */
- ire = ire_create_v6(
- v6dstp, /* dest address */
- &ipv6_all_ones, /* mask */
- ire_v6srcp, /* source address */
- NULL, /* gateway address */
- &save_ire->ire_max_frag,
- NULL, /* no src nce */
- dst_ill->ill_rq, /* recv-from queue */
- dst_ill->ill_wq, /* send-to queue */
- IRE_CACHE,
- src_ipif,
- NULL,
- (fire != NULL) ? /* Parent handle */
- fire->ire_phandle : 0,
- save_ire->ire_ihandle, /* Interface handle */
- (fire != NULL) ?
- (fire->ire_flags & (RTF_SETSRC | RTF_MULTIRT)) :
- 0,
- &ire_uinfo_null,
- NULL,
- NULL,
- ipst);
-
- if (ire == NULL) {
- ire_refrele(save_ire);
- break;
- }
-
- err = ndp_noresolver(dst_ill, v6dstp);
- if (err != 0) {
- ire_refrele(save_ire);
- break;
- }
-
- /* Prevent save_ire from getting deleted */
- IRB_REFHOLD(save_ire->ire_bucket);
- /* Has it been removed already ? */
- if (save_ire->ire_marks & IRE_MARK_CONDEMNED) {
- IRB_REFRELE(save_ire->ire_bucket);
- ire_refrele(save_ire);
- break;
- }
-
- ire_add_then_send(q, ire, first_mp);
- if (ip6_asp_table_held) {
- ip6_asp_table_refrele(ipst);
- ip6_asp_table_held = B_FALSE;
- }
-
- /* Assert that it is not deleted yet. */
- ASSERT(save_ire->ire_ptpn != NULL);
- IRB_REFRELE(save_ire->ire_bucket);
- ire_refrele(save_ire);
- if (fire != NULL) {
- ire_refrele(fire);
- fire = NULL;
- }
-
- /*
- * The resolution loop is re-entered if we
- * actually are in a multirouting case.
- */
- if (copy_mp != NULL) {
- boolean_t need_resolve =
- ire_multirt_need_resolve_v6(v6dstp,
- msg_getlabel(copy_mp), ipst);
- if (!need_resolve) {
- MULTIRT_DEBUG_UNTAG(copy_mp);
- freemsg(copy_mp);
- copy_mp = NULL;
- } else {
- /*
- * ipif_lookup_group_v6() calls
- * ire_lookup_multi_v6() that uses
- * ire_ftable_lookup_v6() to find
- * an IRE_INTERFACE for the group.
- * In the multirt case,
- * ire_lookup_multi_v6() then invokes
- * ire_multirt_lookup_v6() to find
- * the next resolvable ire.
- * As a result, we obtain a new
- * interface, derived from the
- * next ire.
- */
- if (ipif_held) {
- ipif_refrele(ipif);
- ipif_held = B_FALSE;
- }
- ipif = ipif_lookup_group_v6(v6dstp,
- zoneid, ipst);
- ip2dbg(("ip_newroute_ipif: "
- "multirt dst %08x, ipif %p\n",
- ntohl(V4_PART_OF_V6((*v6dstp))),
- (void *)ipif));
- if (ipif != NULL) {
- ipif_held = B_TRUE;
- mp = copy_mp;
- copy_mp = NULL;
- multirt_resolve_next =
- B_TRUE;
- continue;
- } else {
- freemsg(copy_mp);
- }
- }
- }
- ill_refrele(dst_ill);
- if (ipif_held) {
- ipif_refrele(ipif);
- ipif_held = B_FALSE;
- }
- if (src_ipif != NULL)
- ipif_refrele(src_ipif);
- return;
- }
- case IRE_IF_RESOLVER: {
-
- ASSERT(dst_ill->ill_isv6);
-
- /*
- * We obtain a partial IRE_CACHE which we will pass
- * along with the resolver query. When the response
- * comes back it will be there ready for us to add.
- */
- /*
- * the newly created ire will inherit the flags of the
- * parent ire, if any.
- */
- ire = ire_create_v6(
- v6dstp, /* dest address */
- &ipv6_all_ones, /* mask */
- ire_v6srcp, /* source address */
- NULL, /* gateway address */
- &save_ire->ire_max_frag,
- NULL, /* src nce */
- dst_ill->ill_rq, /* recv-from queue */
- dst_ill->ill_wq, /* send-to queue */
- IRE_CACHE,
- src_ipif,
- NULL,
- (fire != NULL) ? /* Parent handle */
- fire->ire_phandle : 0,
- save_ire->ire_ihandle, /* Interface handle */
- (fire != NULL) ?
- (fire->ire_flags & (RTF_SETSRC | RTF_MULTIRT)) :
- 0,
- &ire_uinfo_null,
- NULL,
- NULL,
- ipst);
-
- if (ire == NULL) {
- ire_refrele(save_ire);
- break;
- }
-
- /* Resolve and add ire to the ctable */
- err = ndp_resolver(dst_ill, v6dstp, first_mp, zoneid);
- switch (err) {
- case 0:
- /* Prevent save_ire from getting deleted */
- IRB_REFHOLD(save_ire->ire_bucket);
- /* Has it been removed already ? */
- if (save_ire->ire_marks & IRE_MARK_CONDEMNED) {
- IRB_REFRELE(save_ire->ire_bucket);
- ire_refrele(save_ire);
- break;
- }
- /*
- * We have a resolved cache entry,
- * add in the IRE.
- */
- ire_add_then_send(q, ire, first_mp);
- if (ip6_asp_table_held) {
- ip6_asp_table_refrele(ipst);
- ip6_asp_table_held = B_FALSE;
- }
-
- /* Assert that it is not deleted yet. */
- ASSERT(save_ire->ire_ptpn != NULL);
- IRB_REFRELE(save_ire->ire_bucket);
- ire_refrele(save_ire);
- if (fire != NULL) {
- ire_refrele(fire);
- fire = NULL;
- }
-
- /*
- * The resolution loop is re-entered if we
- * actually are in a multirouting case.
- */
- if (copy_mp != NULL) {
- boolean_t need_resolve =
- ire_multirt_need_resolve_v6(v6dstp,
- msg_getlabel(copy_mp), ipst);
- if (!need_resolve) {
- MULTIRT_DEBUG_UNTAG(copy_mp);
- freemsg(copy_mp);
- copy_mp = NULL;
- } else {
- /*
- * ipif_lookup_group_v6() calls
- * ire_lookup_multi_v6() that
- * uses ire_ftable_lookup_v6()
- * to find an IRE_INTERFACE for
- * the group. In the multirt
- * case, ire_lookup_multi_v6()
- * then invokes
- * ire_multirt_lookup_v6() to
- * find the next resolvable ire.
- * As a result, we obtain a new
- * interface, derived from the
- * next ire.
- */
- if (ipif_held) {
- ipif_refrele(ipif);
- ipif_held = B_FALSE;
- }
- ipif = ipif_lookup_group_v6(
- v6dstp, zoneid, ipst);
- ip2dbg(("ip_newroute_ipif: "
- "multirt dst %08x, "
- "ipif %p\n",
- ntohl(V4_PART_OF_V6(
- (*v6dstp))),
- (void *)ipif));
- if (ipif != NULL) {
- ipif_held = B_TRUE;
- mp = copy_mp;
- copy_mp = NULL;
- multirt_resolve_next =
- B_TRUE;
- continue;
- } else {
- freemsg(copy_mp);
- }
- }
- }
- ill_refrele(dst_ill);
- if (ipif_held) {
- ipif_refrele(ipif);
- ipif_held = B_FALSE;
- }
- if (src_ipif != NULL)
- ipif_refrele(src_ipif);
- return;
-
- case EINPROGRESS:
- /*
- * mp was consumed - presumably queued.
- * No need for ire, presumably resolution is
- * in progress, and ire will be added when the
- * address is resolved.
- */
- if (ip6_asp_table_held) {
- ip6_asp_table_refrele(ipst);
- ip6_asp_table_held = B_FALSE;
- }
- ire_delete(ire);
- ire_refrele(save_ire);
- if (fire != NULL) {
- ire_refrele(fire);
- fire = NULL;
- }
-
- /*
- * The resolution loop is re-entered if we
- * actually are in a multirouting case.
- */
- if (copy_mp != NULL) {
- boolean_t need_resolve =
- ire_multirt_need_resolve_v6(v6dstp,
- msg_getlabel(copy_mp), ipst);
- if (!need_resolve) {
- MULTIRT_DEBUG_UNTAG(copy_mp);
- freemsg(copy_mp);
- copy_mp = NULL;
- } else {
- /*
- * ipif_lookup_group_v6() calls
- * ire_lookup_multi_v6() that
- * uses ire_ftable_lookup_v6()
- * to find an IRE_INTERFACE for
- * the group. In the multirt
- * case, ire_lookup_multi_v6()
- * then invokes
- * ire_multirt_lookup_v6() to
- * find the next resolvable ire.
- * As a result, we obtain a new
- * interface, derived from the
- * next ire.
- */
- if (ipif_held) {
- ipif_refrele(ipif);
- ipif_held = B_FALSE;
- }
- ipif = ipif_lookup_group_v6(
- v6dstp, zoneid, ipst);
- ip2dbg(("ip_newroute_ipif: "
- "multirt dst %08x, "
- "ipif %p\n",
- ntohl(V4_PART_OF_V6(
- (*v6dstp))),
- (void *)ipif));
- if (ipif != NULL) {
- ipif_held = B_TRUE;
- mp = copy_mp;
- copy_mp = NULL;
- multirt_resolve_next =
- B_TRUE;
- continue;
- } else {
- freemsg(copy_mp);
- }
- }
- }
- ill_refrele(dst_ill);
- if (ipif_held) {
- ipif_refrele(ipif);
- ipif_held = B_FALSE;
- }
- if (src_ipif != NULL)
- ipif_refrele(src_ipif);
- return;
- default:
- /* Some transient error */
- ire_refrele(save_ire);
- break;
- }
- break;
- }
- default:
- break;
- }
- if (ip6_asp_table_held) {
- ip6_asp_table_refrele(ipst);
- ip6_asp_table_held = B_FALSE;
- }
- } while (multirt_resolve_next);
-
-err_ret:
- if (ip6_asp_table_held)
- ip6_asp_table_refrele(ipst);
- if (ire != NULL)
- ire_refrele(ire);
- if (fire != NULL)
- ire_refrele(fire);
- if (ipif != NULL && ipif_held)
- ipif_refrele(ipif);
- if (src_ipif != NULL)
- ipif_refrele(src_ipif);
-
- /* Multicast - no point in trying to generate ICMP error */
- if (dst_ill != NULL) {
- ill = dst_ill;
- ill_held = B_TRUE;
- }
- if (mp->b_prev || mp->b_next) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- } else {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
- }
- ip1dbg(("ip_newroute_ipif_v6: dropped\n"));
- mp->b_next = NULL;
- mp->b_prev = NULL;
- freemsg(first_mp);
- if (ill_held)
- ill_refrele(ill);
-}
-
-/*
* Parse and process any hop-by-hop or destination options.
*
* Assumes that q is an ill read queue so that ICMP errors for link-local
@@ -6067,23 +2854,16 @@ err_ret:
* Current code checks for each opt_type (other than pads) if it is in
* the expected nexthdr (hbh or dest)
*/
-static int
-ip_process_options_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h,
- uint8_t *optptr, uint_t optlen, uint8_t hdr_type, ip_stack_t *ipst)
+int
+ip_process_options_v6(mblk_t *mp, ip6_t *ip6h,
+ uint8_t *optptr, uint_t optlen, uint8_t hdr_type, ip_recv_attr_t *ira)
{
uint8_t opt_type;
uint_t optused;
int ret = 0;
- mblk_t *first_mp;
const char *errtype;
- zoneid_t zoneid;
- ill_t *ill = q->q_ptr;
- ipif_t *ipif;
-
- first_mp = mp;
- if (mp->b_datap->db_type == M_CTL) {
- mp = mp->b_cont;
- }
+ ill_t *ill = ira->ira_ill;
+ ip_stack_t *ipst = ill->ill_ipst;
while (optlen != 0) {
opt_type = *optptr;
@@ -6178,13 +2958,9 @@ ip_process_options_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h,
* around (i.e. before AH processing).
* If we've done AH... stop now.
*/
- if (first_mp != mp) {
- ipsec_in_t *ii;
-
- ii = (ipsec_in_t *)first_mp->b_rptr;
- if (ii->ipsec_in_ah_sa != NULL)
- break;
- }
+ if ((ira->ira_flags & IRAF_IPSEC_SECURE) &&
+ ira->ira_ipsec_ah_sa != NULL)
+ break;
oh = (struct ip6_opt_home_address *)optptr;
/* Check total length and alignment */
@@ -6217,8 +2993,6 @@ ip_process_options_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h,
/* FALLTHROUGH */
opt_error:
/* Determine which zone should send error */
- zoneid = ipif_lookup_addr_zoneid_v6(
- &ip6h->ip6_dst, ill, ipst);
switch (IP6OPT_TYPE(opt_type)) {
case IP6OPT_TYPE_SKIP:
optused = 2 + optptr[1];
@@ -6232,48 +3006,33 @@ ip_process_options_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h,
ip1dbg(("ip_process_options_v6: %s "
"opt 0x%x; packet dropped\n",
errtype, opt_type));
- freemsg(first_mp);
+ BUMP_MIB(ill->ill_ip_mib,
+ ipIfStatsInHdrErrors);
+ ip_drop_input("ipIfStatsInHdrErrors",
+ mp, ill);
+ freemsg(mp);
return (-1);
case IP6OPT_TYPE_ICMP:
- if (zoneid == ALL_ZONES) {
- freemsg(first_mp);
- return (-1);
- }
- icmp_param_problem_v6(WR(q), first_mp,
+ BUMP_MIB(ill->ill_ip_mib,
+ ipIfStatsInHdrErrors);
+ ip_drop_input("ipIfStatsInHdrErrors",
+ mp, ill);
+ icmp_param_problem_v6(mp,
ICMP6_PARAMPROB_OPTION,
(uint32_t)(optptr -
(uint8_t *)ip6h),
- B_FALSE, B_FALSE, zoneid, ipst);
+ B_FALSE, ira);
return (-1);
case IP6OPT_TYPE_FORCEICMP:
- /*
- * If we don't have a zone and the dst
- * addr is multicast, then pick a zone
- * based on the inbound interface.
- */
- if (zoneid == ALL_ZONES &&
- IN6_IS_ADDR_MULTICAST(
- &ip6h->ip6_dst)) {
- ipif = ipif_select_source_v6(
- ill, &ip6h->ip6_src,
- B_TRUE,
- IPV6_PREFER_SRC_DEFAULT,
- ALL_ZONES);
- if (ipif != NULL) {
- zoneid =
- ipif->ipif_zoneid;
- ipif_refrele(ipif);
- }
- }
- if (zoneid == ALL_ZONES) {
- freemsg(first_mp);
- return (-1);
- }
- icmp_param_problem_v6(WR(q), first_mp,
+ BUMP_MIB(ill->ill_ip_mib,
+ ipIfStatsInHdrErrors);
+ ip_drop_input("ipIfStatsInHdrErrors",
+ mp, ill);
+ icmp_param_problem_v6(mp,
ICMP6_PARAMPROB_OPTION,
(uint32_t)(optptr -
(uint8_t *)ip6h),
- B_FALSE, B_TRUE, zoneid, ipst);
+ B_TRUE, ira);
return (-1);
default:
ASSERT(0);
@@ -6287,14 +3046,10 @@ ip_process_options_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h,
bad_opt:
/* Determine which zone should send error */
- zoneid = ipif_lookup_addr_zoneid_v6(&ip6h->ip6_dst, ill, ipst);
- if (zoneid == ALL_ZONES) {
- freemsg(first_mp);
- } else {
- icmp_param_problem_v6(WR(q), first_mp, ICMP6_PARAMPROB_OPTION,
- (uint32_t)(optptr - (uint8_t *)ip6h),
- B_FALSE, B_FALSE, zoneid, ipst);
- }
+ ip_drop_input("ICMP_PARAM_PROBLEM", mp, ill);
+ icmp_param_problem_v6(mp, ICMP6_PARAMPROB_OPTION,
+ (uint32_t)(optptr - (uint8_t *)ip6h),
+ B_FALSE, ira);
return (-1);
}
@@ -6302,10 +3057,11 @@ bad_opt:
* Process a routing header that is not yet empty.
* Because of RFC 5095, we now reject all route headers.
*/
-static void
-ip_process_rthdr(queue_t *q, mblk_t *mp, ip6_t *ip6h, ip6_rthdr_t *rth,
- ill_t *ill, mblk_t *hada_mp)
+void
+ip_process_rthdr(mblk_t *mp, ip6_t *ip6h, ip6_rthdr_t *rth,
+ ip_recv_attr_t *ira)
{
+ ill_t *ill = ira->ira_ill;
ip_stack_t *ipst = ill->ill_ipst;
ASSERT(rth->ip6r_segleft != 0);
@@ -6314,19 +3070,15 @@ ip_process_rthdr(queue_t *q, mblk_t *mp, ip6_t *ip6h, ip6_rthdr_t *rth,
/* XXX Check for source routed out same interface? */
BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors);
- freemsg(hada_mp);
+ ip_drop_input("ipIfStatsInAddrErrors", mp, ill);
freemsg(mp);
return;
}
- if (hada_mp != NULL) {
- freemsg(hada_mp);
- freemsg(mp);
- return;
- }
- /* Sent by forwarding path, and router is global zone */
- icmp_param_problem_v6(WR(q), mp, ICMP6_PARAMPROB_HEADER,
- (uint32_t)((uchar_t *)&rth->ip6r_type - (uchar_t *)ip6h), B_FALSE,
- B_FALSE, GLOBAL_ZONEID, ipst);
+
+ ip_drop_input("ICMP_PARAM_PROBLEM", mp, ill);
+ icmp_param_problem_v6(mp, ICMP6_PARAMPROB_HEADER,
+ (uint32_t)((uchar_t *)&rth->ip6r_type - (uchar_t *)ip6h),
+ B_FALSE, ira);
}
/*
@@ -6335,21 +3087,10 @@ ip_process_rthdr(queue_t *q, mblk_t *mp, ip6_t *ip6h, ip6_rthdr_t *rth,
void
ip_rput_v6(queue_t *q, mblk_t *mp)
{
- mblk_t *first_mp;
- mblk_t *hada_mp = NULL;
- ip6_t *ip6h;
- boolean_t ll_multicast = B_FALSE;
- boolean_t mctl_present = B_FALSE;
ill_t *ill;
- struct iocblk *iocp;
- uint_t flags = 0;
- mblk_t *dl_mp;
- ip_stack_t *ipst;
- int check;
ill = (ill_t *)q->q_ptr;
- ipst = ill->ill_ipst;
- if (ill->ill_state_flags & ILL_CONDEMNED) {
+ if (ill->ill_state_flags & (ILL_CONDEMNED | ILL_LL_SUBNET_PENDING)) {
union DL_primitives *dl;
dl = (union DL_primitives *)mp->b_rptr;
@@ -6367,241 +3108,14 @@ ip_rput_v6(queue_t *q, mblk_t *mp)
return;
}
}
+ if (DB_TYPE(mp) == M_DATA) {
+ struct mac_header_info_s mhi;
- dl_mp = NULL;
- switch (mp->b_datap->db_type) {
- case M_DATA: {
- int hlen;
- uchar_t *ucp;
- struct ether_header *eh;
- dl_unitdata_ind_t *dui;
-
- /*
- * This is a work-around for CR 6451644, a bug in Nemo. It
- * should be removed when that problem is fixed.
- */
- if (ill->ill_mactype == DL_ETHER &&
- (hlen = MBLKHEAD(mp)) >= sizeof (struct ether_header) &&
- (ucp = mp->b_rptr)[-1] == (ETHERTYPE_IPV6 & 0xFF) &&
- ucp[-2] == (ETHERTYPE_IPV6 >> 8)) {
- if (hlen >= sizeof (struct ether_vlan_header) &&
- ucp[-5] == 0 && ucp[-6] == 0x81)
- ucp -= sizeof (struct ether_vlan_header);
- else
- ucp -= sizeof (struct ether_header);
- /*
- * If it's a group address, then fabricate a
- * DL_UNITDATA_IND message.
- */
- if ((ll_multicast = (ucp[0] & 1)) != 0 &&
- (dl_mp = allocb(DL_UNITDATA_IND_SIZE + 16,
- BPRI_HI)) != NULL) {
- eh = (struct ether_header *)ucp;
- dui = (dl_unitdata_ind_t *)dl_mp->b_rptr;
- DB_TYPE(dl_mp) = M_PROTO;
- dl_mp->b_wptr = (uchar_t *)(dui + 1) + 16;
- dui->dl_primitive = DL_UNITDATA_IND;
- dui->dl_dest_addr_length = 8;
- dui->dl_dest_addr_offset = DL_UNITDATA_IND_SIZE;
- dui->dl_src_addr_length = 8;
- dui->dl_src_addr_offset = DL_UNITDATA_IND_SIZE +
- 8;
- dui->dl_group_address = 1;
- ucp = (uchar_t *)(dui + 1);
- if (ill->ill_sap_length > 0)
- ucp += ill->ill_sap_length;
- bcopy(&eh->ether_dhost, ucp, 6);
- bcopy(&eh->ether_shost, ucp + 8, 6);
- ucp = (uchar_t *)(dui + 1);
- if (ill->ill_sap_length < 0)
- ucp += 8 + ill->ill_sap_length;
- bcopy(&eh->ether_type, ucp, 2);
- bcopy(&eh->ether_type, ucp + 8, 2);
- }
- }
- break;
- }
-
- case M_PROTO:
- case M_PCPROTO:
- if (((dl_unitdata_ind_t *)mp->b_rptr)->dl_primitive !=
- DL_UNITDATA_IND) {
- /* Go handle anything other than data elsewhere. */
- ip_rput_dlpi(q, mp);
- return;
- }
- ll_multicast = ip_get_dlpi_mbcast(ill, mp);
-
- /* Save the DLPI header. */
- dl_mp = mp;
- mp = mp->b_cont;
- dl_mp->b_cont = NULL;
- break;
- case M_BREAK:
- panic("ip_rput_v6: got an M_BREAK");
- /*NOTREACHED*/
- case M_IOCACK:
- iocp = (struct iocblk *)mp->b_rptr;
- switch (iocp->ioc_cmd) {
- case DL_IOC_HDR_INFO:
- ill = (ill_t *)q->q_ptr;
- ill_fastpath_ack(ill, mp);
- return;
- default:
- putnext(q, mp);
- return;
- }
- /* FALLTHRU */
- case M_ERROR:
- case M_HANGUP:
- mutex_enter(&ill->ill_lock);
- if (ill->ill_state_flags & ILL_CONDEMNED) {
- mutex_exit(&ill->ill_lock);
- freemsg(mp);
- return;
- }
- ill_refhold_locked(ill);
- mutex_exit(&ill->ill_lock);
- qwriter_ip(ill, q, mp, ip_rput_other, CUR_OP, B_FALSE);
- return;
- case M_CTL:
- if ((MBLKL(mp) > sizeof (int)) &&
- ((da_ipsec_t *)mp->b_rptr)->da_type == IPHADA_M_CTL) {
- ASSERT(MBLKL(mp) >= sizeof (da_ipsec_t));
- mctl_present = B_TRUE;
- break;
- }
- putnext(q, mp);
- return;
- case M_IOCNAK:
- iocp = (struct iocblk *)mp->b_rptr;
- switch (iocp->ioc_cmd) {
- case DL_IOC_HDR_INFO:
- ip_rput_other(NULL, q, mp, NULL);
- return;
- default:
- break;
- }
- /* FALLTHRU */
- default:
- putnext(q, mp);
- return;
- }
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives);
- UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets,
- (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp));
- /*
- * if db_ref > 1 then copymsg and free original. Packet may be
- * changed and do not want other entity who has a reference to this
- * message to trip over the changes. This is a blind change because
- * trying to catch all places that might change packet is too
- * difficult (since it may be a module above this one).
- */
- if (mp->b_datap->db_ref > 1) {
- mblk_t *mp1;
-
- mp1 = copymsg(mp);
- freemsg(mp);
- if (mp1 == NULL) {
- first_mp = NULL;
- goto discard;
- }
- mp = mp1;
- }
- first_mp = mp;
- if (mctl_present) {
- hada_mp = first_mp;
- mp = first_mp->b_cont;
- }
-
- if ((check = ip_check_v6_mblk(mp, ill)) == IP6_MBLK_HDR_ERR) {
- freemsg(mp);
- return;
- }
-
- ip6h = (ip6_t *)mp->b_rptr;
-
- /*
- * ip:::receive must see ipv6 packets with a full header,
- * and so is placed after the IP6_MBLK_HDR_ERR check.
- */
- DTRACE_IP7(receive, mblk_t *, first_mp, conn_t *, NULL, void_ip_t *,
- ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *, ip6h,
- int, 0);
-
- if (check != IP6_MBLK_OK) {
- freemsg(mp);
- return;
- }
-
- DTRACE_PROBE4(ip6__physical__in__start,
- ill_t *, ill, ill_t *, NULL,
- ip6_t *, ip6h, mblk_t *, first_mp);
-
- FW_HOOKS6(ipst->ips_ip6_physical_in_event,
- ipst->ips_ipv6firewall_physical_in,
- ill, NULL, ip6h, first_mp, mp, ll_multicast, ipst);
-
- DTRACE_PROBE1(ip6__physical__in__end, mblk_t *, first_mp);
-
- if (first_mp == NULL)
- return;
-
- /*
- * Attach any necessary label information to this packet.
- */
- if (is_system_labeled() && !tsol_get_pkt_label(mp, IPV6_VERSION)) {
- if (ip6opt_ls != 0)
- ip0dbg(("tsol_get_pkt_label v6 failed\n"));
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
- goto discard;
- }
-
- /* IP observability hook. */
- if (ipst->ips_ip6_observe.he_interested) {
- zoneid_t dzone;
-
- dzone = ip_get_zoneid_v6(&ip6h->ip6_dst, mp, ill, ipst,
- ALL_ZONES);
- ipobs_hook(mp, IPOBS_HOOK_INBOUND, ALL_ZONES, dzone,
- ill, ipst);
- }
-
- if ((ip6h->ip6_vcf & IPV6_VERS_AND_FLOW_MASK) ==
- IPV6_DEFAULT_VERS_AND_FLOW) {
- /*
- * It may be a bit too expensive to do this mapped address
- * check here, but in the interest of robustness, it seems
- * like the correct place.
- * TODO: Avoid this check for e.g. connected TCP sockets
- */
- if (IN6_IS_ADDR_V4MAPPED(&ip6h->ip6_src)) {
- ip1dbg(("ip_rput_v6: pkt with mapped src addr\n"));
- goto discard;
- }
-
- if (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src)) {
- ip1dbg(("ip_rput_v6: pkt with loopback src"));
- goto discard;
- } else if (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst)) {
- ip1dbg(("ip_rput_v6: pkt with loopback dst"));
- goto discard;
- }
-
- flags |= (ll_multicast ? IP6_IN_LLMCAST : 0);
- ip_rput_data_v6(q, ill, mp, ip6h, flags, hada_mp, dl_mp);
+ ip_mdata_to_mhi(ill, mp, &mhi);
+ ip_input_v6(ill, NULL, mp, &mhi);
} else {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInWrongIPVersion);
- goto discard;
+ ip_rput_notdata(ill, mp);
}
- freemsg(dl_mp);
- return;
-
-discard:
- if (dl_mp != NULL)
- freeb(dl_mp);
- freemsg(first_mp);
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
}
/*
@@ -6703,1507 +3217,72 @@ ipsec_needs_processing_v6(mblk_t *mp, uint8_t *nexthdr)
}
/*
- * Path for AH if options are present. If this is the first time we are
- * sending a datagram to AH, allocate a IPSEC_IN message and prepend it.
- * Otherwise, just fanout. Return value answers the boolean question:
- * "Did I consume the mblk you sent me?"
+ * Path for AH if options are present.
+ * Returns NULL if the mblk was consumed.
*
* Sometimes AH needs to be done before other IPv6 headers for security
* reasons. This function (and its ipsec_needs_processing_v6() above)
* indicates if that is so, and fans out to the appropriate IPsec protocol
* for the datagram passed in.
*/
-static boolean_t
-ipsec_early_ah_v6(queue_t *q, mblk_t *first_mp, boolean_t mctl_present,
- ill_t *ill, ill_t *inill, mblk_t *hada_mp, zoneid_t zoneid)
+mblk_t *
+ipsec_early_ah_v6(mblk_t *mp, ip_recv_attr_t *ira)
{
- mblk_t *mp;
uint8_t nexthdr;
- ipsec_in_t *ii = NULL;
ah_t *ah;
- ipsec_status_t ipsec_rc;
+ ill_t *ill = ira->ira_ill;
ip_stack_t *ipst = ill->ill_ipst;
- netstack_t *ns = ipst->ips_netstack;
- ipsec_stack_t *ipss = ns->netstack_ipsec;
-
- ASSERT((hada_mp == NULL) || (!mctl_present));
+ ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec;
- switch (ipsec_needs_processing_v6(
- (mctl_present ? first_mp->b_cont : first_mp), &nexthdr)) {
+ switch (ipsec_needs_processing_v6(mp, &nexthdr)) {
case IPSEC_MEMORY_ERROR:
BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- freemsg(hada_mp);
- freemsg(first_mp);
- return (B_TRUE);
+ ip_drop_input("ipIfStatsInDiscards", mp, ill);
+ freemsg(mp);
+ return (NULL);
case IPSEC_HDR_DONT_PROCESS:
- return (B_FALSE);
+ return (mp);
}
/* Default means send it to AH! */
ASSERT(nexthdr == IPPROTO_AH);
- if (!mctl_present) {
- mp = first_mp;
- first_mp = ipsec_in_alloc(B_FALSE, ipst->ips_netstack);
- if (first_mp == NULL) {
- ip1dbg(("ipsec_early_ah_v6: IPSEC_IN "
- "allocation failure.\n"));
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- freemsg(hada_mp);
- freemsg(mp);
- return (B_TRUE);
- }
- /*
- * Store the ill_index so that when we come back
- * from IPSEC we ride on the same queue.
- */
- ii = (ipsec_in_t *)first_mp->b_rptr;
- ii->ipsec_in_ill_index = ill->ill_phyint->phyint_ifindex;
- ii->ipsec_in_rill_index = inill->ill_phyint->phyint_ifindex;
- first_mp->b_cont = mp;
- }
- /*
- * Cache hardware acceleration info.
- */
- if (hada_mp != NULL) {
- ASSERT(ii != NULL);
- IPSECHW_DEBUG(IPSECHW_PKT, ("ipsec_early_ah_v6: "
- "caching data attr.\n"));
- ii->ipsec_in_accelerated = B_TRUE;
- ii->ipsec_in_da = hada_mp;
- }
if (!ipsec_loaded(ipss)) {
- ip_proto_not_sup(q, first_mp, IP_FF_SEND_ICMP, zoneid, ipst);
- return (B_TRUE);
- }
-
- ah = ipsec_inbound_ah_sa(first_mp, ns);
- if (ah == NULL)
- return (B_TRUE);
- ASSERT(ii->ipsec_in_ah_sa != NULL);
- ASSERT(ii->ipsec_in_ah_sa->ipsa_input_func != NULL);
- ipsec_rc = ii->ipsec_in_ah_sa->ipsa_input_func(first_mp, ah);
-
- switch (ipsec_rc) {
- case IPSEC_STATUS_SUCCESS:
- /* we're done with IPsec processing, send it up */
- ip_fanout_proto_again(first_mp, ill, inill, NULL);
- break;
- case IPSEC_STATUS_FAILED:
- BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsInDiscards);
- break;
- case IPSEC_STATUS_PENDING:
- /* no action needed */
- break;
- }
- return (B_TRUE);
-}
-
-static boolean_t
-ip_iptun_input_v6(mblk_t *ipsec_mp, mblk_t *data_mp,
- size_t hdr_len, uint8_t nexthdr, zoneid_t zoneid, ill_t *ill,
- ip_stack_t *ipst)
-{
- conn_t *connp;
-
- ASSERT(ipsec_mp == NULL || ipsec_mp->b_cont == data_mp);
-
- connp = ipcl_classify_v6(data_mp, nexthdr, hdr_len, zoneid, ipst);
- if (connp != NULL) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
- connp->conn_recv(connp, ipsec_mp != NULL ? ipsec_mp : data_mp,
- NULL);
- CONN_DEC_REF(connp);
- return (B_TRUE);
- }
- return (B_FALSE);
-}
-
-/*
- * Validate the IPv6 mblk for alignment.
- */
-int
-ip_check_v6_mblk(mblk_t *mp, ill_t *ill)
-{
- int pkt_len, ip6_len;
- ip6_t *ip6h = (ip6_t *)mp->b_rptr;
-
- /* check for alignment and full IPv6 header */
- if (!OK_32PTR((uchar_t *)ip6h) ||
- (mp->b_wptr - (uchar_t *)ip6h) < IPV6_HDR_LEN) {
- if (!pullupmsg(mp, IPV6_HDR_LEN)) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- ip1dbg(("ip_rput_v6: pullupmsg failed\n"));
- return (IP6_MBLK_HDR_ERR);
- }
- ip6h = (ip6_t *)mp->b_rptr;
- }
-
- ASSERT(OK_32PTR((uchar_t *)ip6h) &&
- (mp->b_wptr - (uchar_t *)ip6h) >= IPV6_HDR_LEN);
-
- if (mp->b_cont == NULL)
- pkt_len = mp->b_wptr - mp->b_rptr;
- else
- pkt_len = msgdsize(mp);
- ip6_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
-
- /*
- * Check for bogus (too short packet) and packet which
- * was padded by the link layer.
- */
- if (ip6_len != pkt_len) {
- ssize_t diff;
-
- if (ip6_len > pkt_len) {
- ip1dbg(("ip_rput_data_v6: packet too short %d %d\n",
- ip6_len, pkt_len));
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
- return (IP6_MBLK_LEN_ERR);
- }
- diff = (ssize_t)(pkt_len - ip6_len);
-
- if (!adjmsg(mp, -diff)) {
- ip1dbg(("ip_rput_data_v6: adjmsg failed\n"));
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- return (IP6_MBLK_LEN_ERR);
- }
-
- /*
- * adjmsg may have freed an mblk from the chain, hence
- * invalidate any hw checksum here. This will force IP to
- * calculate the checksum in sw, but only for this packet.
- */
- DB_CKSUMFLAGS(mp) = 0;
- }
- return (IP6_MBLK_OK);
-}
-
-/*
- * ip_rput_data_v6 -- received IPv6 packets in M_DATA messages show up here.
- * ip_rput_v6 has already verified alignment, the min length, the version,
- * and db_ref = 1.
- *
- * The ill passed in (the arg named inill) is the ill that the packet
- * actually arrived on. We need to remember this when saving the
- * input interface index into potential IPV6_PKTINFO data in
- * ip_add_info_v6().
- *
- * This routine doesn't free dl_mp; that's the caller's responsibility on
- * return. (Note that the callers are complex enough that there's no tail
- * recursion here anyway.)
- */
-void
-ip_rput_data_v6(queue_t *q, ill_t *inill, mblk_t *mp, ip6_t *ip6h,
- uint_t flags, mblk_t *hada_mp, mblk_t *dl_mp)
-{
- ire_t *ire = NULL;
- ill_t *ill = inill;
- ill_t *outill;
- uint8_t *whereptr;
- uint8_t nexthdr;
- uint16_t remlen;
- uint_t prev_nexthdr_offset;
- uint_t used;
- size_t old_pkt_len;
- size_t pkt_len;
- uint16_t ip6_len;
- uint_t hdr_len;
- boolean_t mctl_present;
- mblk_t *first_mp;
- mblk_t *first_mp1;
- boolean_t no_forward;
- ip6_hbh_t *hbhhdr;
- boolean_t ll_multicast = (flags & IP6_IN_LLMCAST);
- conn_t *connp;
- uint32_t ports;
- zoneid_t zoneid = GLOBAL_ZONEID;
- uint16_t hck_flags, reass_hck_flags;
- uint32_t reass_sum;
- boolean_t cksum_err;
- mblk_t *mp1;
- ip_stack_t *ipst = inill->ill_ipst;
- ilb_stack_t *ilbs = ipst->ips_netstack->netstack_ilb;
- in6_addr_t lb_dst;
- int lb_ret = ILB_PASSED;
-
- EXTRACT_PKT_MP(mp, first_mp, mctl_present);
-
- if (hada_mp != NULL) {
- /*
- * It's an IPsec accelerated packet.
- * Keep a pointer to the data attributes around until
- * we allocate the ipsecinfo structure.
- */
- IPSECHW_DEBUG(IPSECHW_PKT,
- ("ip_rput_data_v6: inbound HW accelerated IPsec pkt\n"));
- hada_mp->b_cont = NULL;
- /*
- * Since it is accelerated, it came directly from
- * the ill.
- */
- ASSERT(mctl_present == B_FALSE);
- ASSERT(mp->b_datap->db_type != M_CTL);
- }
-
- ip6h = (ip6_t *)mp->b_rptr;
- ip6_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
- old_pkt_len = pkt_len = ip6_len;
-
- if (ILL_HCKSUM_CAPABLE(ill) && !mctl_present && dohwcksum)
- hck_flags = DB_CKSUMFLAGS(mp);
- else
- hck_flags = 0;
-
- /* Clear checksum flags in case we need to forward */
- DB_CKSUMFLAGS(mp) = 0;
- reass_sum = reass_hck_flags = 0;
-
- nexthdr = ip6h->ip6_nxt;
-
- prev_nexthdr_offset = (uint_t)((uchar_t *)&ip6h->ip6_nxt -
- (uchar_t *)ip6h);
- whereptr = (uint8_t *)&ip6h[1];
- remlen = pkt_len - IPV6_HDR_LEN; /* Track how much is left */
-
- /* Process hop by hop header options */
- if (nexthdr == IPPROTO_HOPOPTS) {
- uint_t ehdrlen;
- uint8_t *optptr;
-
- if (remlen < MIN_EHDR_LEN)
- goto pkt_too_short;
- if (mp->b_cont != NULL &&
- whereptr + MIN_EHDR_LEN > mp->b_wptr) {
- if (!pullupmsg(mp, IPV6_HDR_LEN + MIN_EHDR_LEN)) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- freemsg(hada_mp);
- freemsg(first_mp);
- return;
- }
- ip6h = (ip6_t *)mp->b_rptr;
- whereptr = (uint8_t *)ip6h + pkt_len - remlen;
- }
- hbhhdr = (ip6_hbh_t *)whereptr;
- nexthdr = hbhhdr->ip6h_nxt;
- prev_nexthdr_offset = (uint_t)(whereptr - (uint8_t *)ip6h);
- ehdrlen = 8 * (hbhhdr->ip6h_len + 1);
-
- if (remlen < ehdrlen)
- goto pkt_too_short;
- if (mp->b_cont != NULL &&
- whereptr + ehdrlen > mp->b_wptr) {
- if (!pullupmsg(mp, IPV6_HDR_LEN + ehdrlen)) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- freemsg(hada_mp);
- freemsg(first_mp);
- return;
- }
- ip6h = (ip6_t *)mp->b_rptr;
- whereptr = (uint8_t *)ip6h + pkt_len - remlen;
- hbhhdr = (ip6_hbh_t *)whereptr;
- }
-
- optptr = whereptr + 2;
- whereptr += ehdrlen;
- remlen -= ehdrlen;
- switch (ip_process_options_v6(q, first_mp, ip6h, optptr,
- ehdrlen - 2, IPPROTO_HOPOPTS, ipst)) {
- case -1:
- /*
- * Packet has been consumed and any
- * needed ICMP messages sent.
- */
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
- freemsg(hada_mp);
- return;
- case 0:
- /* no action needed */
- break;
- case 1:
- /* Known router alert */
- goto ipv6forus;
- }
- }
-
- /*
- * On incoming v6 multicast packets we will bypass the ire table,
- * and assume that the read queue corresponds to the targetted
- * interface.
- *
- * The effect of this is the same as the IPv4 original code, but is
- * much cleaner I think. See ip_rput for how that was done.
- */
- if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInMcastPkts);
- UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInMcastOctets, pkt_len);
-
- /*
- * So that we don't end up with dups, only one ill in an IPMP
- * group is nominated to receive multicast data traffic.
- * However, link-locals on any underlying interfaces will have
- * joined their solicited-node multicast addresses and we must
- * accept those packets. (We don't attempt to precisely
- * filter out duplicate solicited-node multicast packets since
- * e.g. an IPMP interface and underlying interface may have
- * the same solicited-node multicast address.) Note that we
- * won't generally have duplicates because we only issue a
- * DL_ENABMULTI_REQ on one interface in a group; the exception
- * is when PHYI_MULTI_BCAST is set.
- */
- if (IS_UNDER_IPMP(ill) && !ill->ill_nom_cast &&
- !IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) {
- goto drop_pkt;
- }
-
- /*
- * XXX TODO Give to mrouted to for multicast forwarding.
- */
- if (ilm_lookup_ill_v6(ill, &ip6h->ip6_dst, B_FALSE,
- ALL_ZONES) == NULL) {
- if (ip_debug > 3) {
- /* ip2dbg */
- pr_addr_dbg("ip_rput_data_v6: got mcast packet"
- " which is not for us: %s\n", AF_INET6,
- &ip6h->ip6_dst);
- }
-drop_pkt: BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- freemsg(hada_mp);
- freemsg(first_mp);
- return;
- }
- if (ip_debug > 3) {
- /* ip2dbg */
- pr_addr_dbg("ip_rput_data_v6: multicast for us: %s\n",
- AF_INET6, &ip6h->ip6_dst);
- }
- zoneid = GLOBAL_ZONEID;
- goto ipv6forus;
- }
-
- /*
- * Find an ire that matches destination. For link-local addresses
- * we have to match the ill.
- * TBD for site local addresses.
- */
- if (IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_dst)) {
- ire = ire_ctable_lookup_v6(&ip6h->ip6_dst, NULL,
- IRE_CACHE|IRE_LOCAL, ill->ill_ipif, ALL_ZONES, NULL,
- MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst);
- } else {
- if (ilb_has_rules(ilbs) && ILB_SUPP_L4(nexthdr)) {
- /* For convenience, we just pull up the mblk. */
- if (mp->b_cont != NULL) {
- if (pullupmsg(mp, -1) == 0) {
- BUMP_MIB(ill->ill_ip_mib,
- ipIfStatsInDiscards);
- freemsg(hada_mp);
- freemsg(first_mp);
- return;
- }
- hdr_len = pkt_len - remlen;
- ip6h = (ip6_t *)mp->b_rptr;
- whereptr = (uint8_t *)ip6h + hdr_len;
- }
- lb_ret = ilb_check_v6(ilbs, ill, mp, ip6h, nexthdr,
- whereptr, &lb_dst);
- if (lb_ret == ILB_DROPPED) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- freemsg(hada_mp);
- freemsg(first_mp);
- return;
- }
- }
-
- ire = ire_cache_lookup_v6((lb_ret == ILB_BALANCED) ? &lb_dst :
- &ip6h->ip6_dst, ALL_ZONES, msg_getlabel(mp), ipst);
-
- if (ire != NULL && ire->ire_stq != NULL &&
- ire->ire_zoneid != GLOBAL_ZONEID &&
- ire->ire_zoneid != ALL_ZONES) {
- /*
- * Should only use IREs that are visible from the
- * global zone for forwarding.
- */
- ire_refrele(ire);
- ire = ire_cache_lookup_v6(&ip6h->ip6_dst,
- GLOBAL_ZONEID, msg_getlabel(mp), ipst);
- }
- }
-
- if (ire == NULL) {
- /*
- * No matching IRE found. Mark this packet as having
- * originated externally.
- */
- if (!(ill->ill_flags & ILLF_ROUTER) || ll_multicast) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
- if (!(ill->ill_flags & ILLF_ROUTER)) {
- BUMP_MIB(ill->ill_ip_mib,
- ipIfStatsInAddrErrors);
- }
- freemsg(hada_mp);
- freemsg(first_mp);
- return;
- }
- if (ip6h->ip6_hops <= 1) {
- if (hada_mp != NULL)
- goto hada_drop;
- /* Sent by forwarding path, and router is global zone */
- icmp_time_exceeded_v6(WR(q), first_mp,
- ICMP6_TIME_EXCEED_TRANSIT, ll_multicast, B_FALSE,
- GLOBAL_ZONEID, ipst);
- return;
- }
- /*
- * Per RFC 3513 section 2.5.2, we must not forward packets with
- * an unspecified source address.
- */
- if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src)) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
- freemsg(hada_mp);
- freemsg(first_mp);
- return;
- }
- mp->b_prev = (mblk_t *)(uintptr_t)
- ill->ill_phyint->phyint_ifindex;
- ip_newroute_v6(q, mp, (lb_ret == ILB_BALANCED) ? &lb_dst :
- &ip6h->ip6_dst, &ip6h->ip6_src,
- IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_dst) ? ill : NULL,
- GLOBAL_ZONEID, ipst);
- return;
+ ip_proto_not_sup(mp, ira);
+ return (NULL);
}
- /* we have a matching IRE */
- if (ire->ire_stq != NULL) {
- /*
- * To be quicker, we may wish not to chase pointers
- * (ire->ire_ipif->ipif_ill...) and instead store the
- * forwarding policy in the ire. An unfortunate side-
- * effect of this would be requiring an ire flush whenever
- * the ILLF_ROUTER flag changes. For now, chase pointers
- * once and store in the boolean no_forward.
- *
- * This appears twice to keep it out of the non-forwarding,
- * yes-it's-for-us-on-the-right-interface case.
- */
- no_forward = ((ill->ill_flags &
- ire->ire_ipif->ipif_ill->ill_flags & ILLF_ROUTER) == 0);
- ASSERT(first_mp == mp);
- /*
- * This ire has a send-to queue - forward the packet.
- */
- if (no_forward || ll_multicast || (hada_mp != NULL)) {
- freemsg(hada_mp);
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
- if (no_forward) {
- BUMP_MIB(ill->ill_ip_mib,
- ipIfStatsInAddrErrors);
- }
- freemsg(mp);
- ire_refrele(ire);
- return;
- }
- /*
- * ipIfStatsHCInForwDatagrams should only be increment if there
- * will be an attempt to forward the packet, which is why we
- * increment after the above condition has been checked.
- */
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams);
- if (ip6h->ip6_hops <= 1) {
- ip1dbg(("ip_rput_data_v6: hop limit expired.\n"));
- /* Sent by forwarding path, and router is global zone */
- icmp_time_exceeded_v6(WR(q), mp,
- ICMP6_TIME_EXCEED_TRANSIT, ll_multicast, B_FALSE,
- GLOBAL_ZONEID, ipst);
- ire_refrele(ire);
- return;
- }
- /*
- * Per RFC 3513 section 2.5.2, we must not forward packets with
- * an unspecified source address.
- */
- if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src)) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
- freemsg(mp);
- ire_refrele(ire);
- return;
- }
-
- if (is_system_labeled()) {
- mblk_t *mp1;
-
- if ((mp1 = tsol_ip_forward(ire, mp)) == NULL) {
- BUMP_MIB(ill->ill_ip_mib,
- ipIfStatsForwProhibits);
- freemsg(mp);
- ire_refrele(ire);
- return;
- }
- /* Size may have changed */
- mp = mp1;
- ip6h = (ip6_t *)mp->b_rptr;
- pkt_len = msgdsize(mp);
- }
-
- if (pkt_len > ire->ire_max_frag) {
- int max_frag = ire->ire_max_frag;
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTooBigErrors);
- /*
- * Handle labeled packet resizing.
- */
- if (is_system_labeled()) {
- max_frag = tsol_pmtu_adjust(mp, max_frag,
- pkt_len - old_pkt_len, AF_INET6);
- }
-
- /* Sent by forwarding path, and router is global zone */
- icmp_pkt2big_v6(WR(q), mp, max_frag,
- ll_multicast, B_TRUE, GLOBAL_ZONEID, ipst);
- ire_refrele(ire);
- return;
- }
+ mp = ipsec_inbound_ah_sa(mp, ira, &ah);
+ if (mp == NULL)
+ return (NULL);
+ ASSERT(ah != NULL);
+ ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
+ ASSERT(ira->ira_ipsec_ah_sa != NULL);
+ ASSERT(ira->ira_ipsec_ah_sa->ipsa_input_func != NULL);
+ mp = ira->ira_ipsec_ah_sa->ipsa_input_func(mp, ah, ira);
+ if (mp == NULL) {
/*
- * Check to see if we're forwarding the packet to a
- * different link from which it came. If so, check the
- * source and destination addresses since routers must not
- * forward any packets with link-local source or
- * destination addresses to other links. Otherwise (if
- * we're forwarding onto the same link), conditionally send
- * a redirect message.
+ * Either it failed or is pending. In the former case
+ * ipIfStatsInDiscards was increased.
*/
- if (ire->ire_rfq != q &&
- !IS_IN_SAME_ILLGRP(ill, (ill_t *)ire->ire_rfq->q_ptr)) {
- if (IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_dst) ||
- IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src)) {
- BUMP_MIB(ill->ill_ip_mib,
- ipIfStatsInAddrErrors);
- freemsg(mp);
- ire_refrele(ire);
- return;
- }
- /* TBD add site-local check at site boundary? */
- } else if (ipst->ips_ipv6_send_redirects) {
- in6_addr_t *v6targ;
- in6_addr_t gw_addr_v6;
- ire_t *src_ire_v6 = NULL;
-
- /*
- * Don't send a redirect when forwarding a source
- * routed packet.
- */
- if (ip_source_routed_v6(ip6h, mp, ipst))
- goto forward;
-
- mutex_enter(&ire->ire_lock);
- gw_addr_v6 = ire->ire_gateway_addr_v6;
- mutex_exit(&ire->ire_lock);
- if (!IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6)) {
- v6targ = &gw_addr_v6;
- /*
- * We won't send redirects to a router
- * that doesn't have a link local
- * address, but will forward.
- */
- if (!IN6_IS_ADDR_LINKLOCAL(v6targ)) {
- BUMP_MIB(ill->ill_ip_mib,
- ipIfStatsInAddrErrors);
- goto forward;
- }
- } else {
- v6targ = &ip6h->ip6_dst;
- }
-
- src_ire_v6 = ire_ftable_lookup_v6(&ip6h->ip6_src,
- NULL, NULL, IRE_INTERFACE, ire->ire_ipif, NULL,
- GLOBAL_ZONEID, 0, NULL,
- MATCH_IRE_IPIF | MATCH_IRE_TYPE,
- ipst);
-
- if (src_ire_v6 != NULL) {
- /*
- * The source is directly connected.
- */
- mp1 = copymsg(mp);
- if (mp1 != NULL) {
- icmp_send_redirect_v6(WR(q),
- mp1, v6targ, &ip6h->ip6_dst,
- ill, B_FALSE);
- }
- ire_refrele(src_ire_v6);
- }
- }
-
-forward:
- /* Hoplimit verified above */
- ip6h->ip6_hops--;
-
- outill = ire->ire_ipif->ipif_ill;
-
- DTRACE_PROBE4(ip6__forwarding__start,
- ill_t *, inill, ill_t *, outill,
- ip6_t *, ip6h, mblk_t *, mp);
-
- FW_HOOKS6(ipst->ips_ip6_forwarding_event,
- ipst->ips_ipv6firewall_forwarding,
- inill, outill, ip6h, mp, mp, 0, ipst);
-
- DTRACE_PROBE1(ip6__forwarding__end, mblk_t *, mp);
-
- if (mp != NULL) {
- UPDATE_IB_PKT_COUNT(ire);
- ire->ire_last_used_time = lbolt;
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutForwDatagrams);
- ip_xmit_v6(mp, ire, 0, NULL, B_FALSE, NULL);
- }
- IRE_REFRELE(ire);
- return;
- }
-
- /*
- * Need to put on correct queue for reassembly to find it.
- * No need to use put() since reassembly has its own locks.
- * Note: multicast packets and packets destined to addresses
- * assigned to loopback (ire_rfq is NULL) will be reassembled on
- * the arriving ill. Unlike the IPv4 case, enabling strict
- * destination multihoming will prevent accepting packets
- * addressed to an IRE_LOCAL on lo0.
- */
- if (ire->ire_rfq != q) {
- if ((ire = ip_check_multihome(&ip6h->ip6_dst, ire, ill))
- == NULL) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
- freemsg(hada_mp);
- freemsg(first_mp);
- return;
- }
- if (ire->ire_rfq != NULL) {
- q = ire->ire_rfq;
- ill = (ill_t *)q->q_ptr;
- ASSERT(ill != NULL);
- }
- }
-
- zoneid = ire->ire_zoneid;
- UPDATE_IB_PKT_COUNT(ire);
- ire->ire_last_used_time = lbolt;
- /* Don't use the ire after this point, we'll NULL it out to be sure. */
- ire_refrele(ire);
- ire = NULL;
-ipv6forus:
- /*
- * Looks like this packet is for us one way or another.
- * This is where we'll process destination headers etc.
- */
- for (; ; ) {
- switch (nexthdr) {
- case IPPROTO_TCP: {
- uint16_t *up;
- uint32_t sum;
- int offset;
-
- hdr_len = pkt_len - remlen;
-
- if (hada_mp != NULL) {
- ip0dbg(("tcp hada drop\n"));
- goto hada_drop;
- }
-
-
- /* TCP needs all of the TCP header */
- if (remlen < TCP_MIN_HEADER_LENGTH)
- goto pkt_too_short;
- if (mp->b_cont != NULL &&
- whereptr + TCP_MIN_HEADER_LENGTH > mp->b_wptr) {
- if (!pullupmsg(mp,
- hdr_len + TCP_MIN_HEADER_LENGTH)) {
- BUMP_MIB(ill->ill_ip_mib,
- ipIfStatsInDiscards);
- freemsg(first_mp);
- return;
- }
- hck_flags = 0;
- ip6h = (ip6_t *)mp->b_rptr;
- whereptr = (uint8_t *)ip6h + hdr_len;
- }
- /*
- * Extract the offset field from the TCP header.
- */
- offset = ((uchar_t *)ip6h)[hdr_len + 12] >> 4;
- if (offset != 5) {
- if (offset < 5) {
- ip1dbg(("ip_rput_data_v6: short "
- "TCP data offset"));
- BUMP_MIB(ill->ill_ip_mib,
- ipIfStatsInDiscards);
- freemsg(first_mp);
- return;
- }
- /*
- * There must be TCP options.
- * Make sure we can grab them.
- */
- offset <<= 2;
- if (remlen < offset)
- goto pkt_too_short;
- if (mp->b_cont != NULL &&
- whereptr + offset > mp->b_wptr) {
- if (!pullupmsg(mp,
- hdr_len + offset)) {
- BUMP_MIB(ill->ill_ip_mib,
- ipIfStatsInDiscards);
- freemsg(first_mp);
- return;
- }
- hck_flags = 0;
- ip6h = (ip6_t *)mp->b_rptr;
- whereptr = (uint8_t *)ip6h + hdr_len;
- }
- }
-
- up = (uint16_t *)&ip6h->ip6_src;
- /*
- * TCP checksum calculation. First sum up the
- * pseudo-header fields:
- * - Source IPv6 address
- * - Destination IPv6 address
- * - TCP payload length
- * - TCP protocol ID
- */
- sum = htons(IPPROTO_TCP + remlen) +
- up[0] + up[1] + up[2] + up[3] +
- up[4] + up[5] + up[6] + up[7] +
- up[8] + up[9] + up[10] + up[11] +
- up[12] + up[13] + up[14] + up[15];
-
- /* Fold initial sum */
- sum = (sum & 0xffff) + (sum >> 16);
-
- mp1 = mp->b_cont;
-
- if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0)
- IP6_STAT(ipst, ip6_in_sw_cksum);
-
- IP_CKSUM_RECV(hck_flags, sum, (uchar_t *)
- ((uchar_t *)mp->b_rptr + DB_CKSUMSTART(mp)),
- (int32_t)(whereptr - (uchar_t *)mp->b_rptr),
- mp, mp1, cksum_err);
-
- if (cksum_err) {
- BUMP_MIB(ill->ill_ip_mib, tcpIfStatsInErrs);
-
- if (hck_flags & HCK_FULLCKSUM) {
- IP6_STAT(ipst,
- ip6_tcp_in_full_hw_cksum_err);
- } else if (hck_flags & HCK_PARTIALCKSUM) {
- IP6_STAT(ipst,
- ip6_tcp_in_part_hw_cksum_err);
- } else {
- IP6_STAT(ipst, ip6_tcp_in_sw_cksum_err);
- }
- freemsg(first_mp);
- return;
- }
-tcp_fanout:
- ip_fanout_tcp_v6(q, first_mp, ip6h, ill, inill,
- (flags|IP_FF_SEND_ICMP|IP_FF_SYN_ADDIRE|
- IP_FF_IPINFO), hdr_len, mctl_present, zoneid);
- return;
- }
- case IPPROTO_SCTP:
- {
- sctp_hdr_t *sctph;
- uint32_t calcsum, pktsum;
- uint_t hdr_len = pkt_len - remlen;
- sctp_stack_t *sctps;
-
- sctps = inill->ill_ipst->ips_netstack->netstack_sctp;
-
- /* SCTP needs all of the SCTP header */
- if (remlen < sizeof (*sctph)) {
- goto pkt_too_short;
- }
- if (whereptr + sizeof (*sctph) > mp->b_wptr) {
- ASSERT(mp->b_cont != NULL);
- if (!pullupmsg(mp, hdr_len + sizeof (*sctph))) {
- BUMP_MIB(ill->ill_ip_mib,
- ipIfStatsInDiscards);
- freemsg(mp);
- return;
- }
- ip6h = (ip6_t *)mp->b_rptr;
- whereptr = (uint8_t *)ip6h + hdr_len;
- }
-
- sctph = (sctp_hdr_t *)(mp->b_rptr + hdr_len);
- /* checksum */
- pktsum = sctph->sh_chksum;
- sctph->sh_chksum = 0;
- calcsum = sctp_cksum(mp, hdr_len);
- if (calcsum != pktsum) {
- BUMP_MIB(&sctps->sctps_mib, sctpChecksumError);
- freemsg(mp);
- return;
- }
- sctph->sh_chksum = pktsum;
- ports = *(uint32_t *)(mp->b_rptr + hdr_len);
- if ((connp = sctp_fanout(&ip6h->ip6_src, &ip6h->ip6_dst,
- ports, zoneid, mp, sctps)) == NULL) {
- ip_fanout_sctp_raw(first_mp, ill,
- (ipha_t *)ip6h, B_FALSE, ports,
- mctl_present,
- (flags|IP_FF_SEND_ICMP|IP_FF_IPINFO),
- B_TRUE, zoneid);
- return;
- }
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
- sctp_input(connp, (ipha_t *)ip6h, mp, first_mp, ill,
- B_FALSE, mctl_present);
- return;
- }
- case IPPROTO_UDP: {
- uint16_t *up;
- uint32_t sum;
-
- hdr_len = pkt_len - remlen;
-
- if (hada_mp != NULL) {
- ip0dbg(("udp hada drop\n"));
- goto hada_drop;
- }
-
- /* Verify that at least the ports are present */
- if (remlen < UDPH_SIZE)
- goto pkt_too_short;
- if (mp->b_cont != NULL &&
- whereptr + UDPH_SIZE > mp->b_wptr) {
- if (!pullupmsg(mp, hdr_len + UDPH_SIZE)) {
- BUMP_MIB(ill->ill_ip_mib,
- ipIfStatsInDiscards);
- freemsg(first_mp);
- return;
- }
- hck_flags = 0;
- ip6h = (ip6_t *)mp->b_rptr;
- whereptr = (uint8_t *)ip6h + hdr_len;
- }
-
- /*
- * Before going through the regular checksum
- * calculation, make sure the received checksum
- * is non-zero. RFC 2460 says, a 0x0000 checksum
- * in a UDP packet (within IPv6 packet) is invalid
- * and should be replaced by 0xffff. This makes
- * sense as regular checksum calculation will
- * pass for both the cases i.e. 0x0000 and 0xffff.
- * Removing one of the case makes error detection
- * stronger.
- */
-
- if (((udpha_t *)whereptr)->uha_checksum == 0) {
- /* 0x0000 checksum is invalid */
- ip1dbg(("ip_rput_data_v6: Invalid UDP "
- "checksum value 0x0000\n"));
- BUMP_MIB(ill->ill_ip_mib,
- udpIfStatsInCksumErrs);
- freemsg(first_mp);
- return;
- }
-
- up = (uint16_t *)&ip6h->ip6_src;
-
- /*
- * UDP checksum calculation. First sum up the
- * pseudo-header fields:
- * - Source IPv6 address
- * - Destination IPv6 address
- * - UDP payload length
- * - UDP protocol ID
- */
-
- sum = htons(IPPROTO_UDP + remlen) +
- up[0] + up[1] + up[2] + up[3] +
- up[4] + up[5] + up[6] + up[7] +
- up[8] + up[9] + up[10] + up[11] +
- up[12] + up[13] + up[14] + up[15];
-
- /* Fold initial sum */
- sum = (sum & 0xffff) + (sum >> 16);
-
- if (reass_hck_flags != 0) {
- hck_flags = reass_hck_flags;
-
- IP_CKSUM_RECV_REASS(hck_flags,
- (int32_t)(whereptr - (uchar_t *)mp->b_rptr),
- sum, reass_sum, cksum_err);
- } else {
- mp1 = mp->b_cont;
-
- IP_CKSUM_RECV(hck_flags, sum, (uchar_t *)
- ((uchar_t *)mp->b_rptr + DB_CKSUMSTART(mp)),
- (int32_t)(whereptr - (uchar_t *)mp->b_rptr),
- mp, mp1, cksum_err);
- }
-
- if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0)
- IP6_STAT(ipst, ip6_in_sw_cksum);
-
- if (cksum_err) {
- BUMP_MIB(ill->ill_ip_mib,
- udpIfStatsInCksumErrs);
-
- if (hck_flags & HCK_FULLCKSUM)
- IP6_STAT(ipst,
- ip6_udp_in_full_hw_cksum_err);
- else if (hck_flags & HCK_PARTIALCKSUM)
- IP6_STAT(ipst,
- ip6_udp_in_part_hw_cksum_err);
- else
- IP6_STAT(ipst, ip6_udp_in_sw_cksum_err);
-
- freemsg(first_mp);
- return;
- }
- goto udp_fanout;
- }
- case IPPROTO_ICMPV6: {
- uint16_t *up;
- uint32_t sum;
- uint_t hdr_len = pkt_len - remlen;
-
- if (hada_mp != NULL) {
- ip0dbg(("icmp hada drop\n"));
- goto hada_drop;
- }
-
- up = (uint16_t *)&ip6h->ip6_src;
- sum = htons(IPPROTO_ICMPV6 + remlen) +
- up[0] + up[1] + up[2] + up[3] +
- up[4] + up[5] + up[6] + up[7] +
- up[8] + up[9] + up[10] + up[11] +
- up[12] + up[13] + up[14] + up[15];
- sum = (sum & 0xffff) + (sum >> 16);
- sum = IP_CSUM(mp, hdr_len, sum);
- if (sum != 0) {
- /* IPv6 ICMP checksum failed */
- ip1dbg(("ip_rput_data_v6: ICMPv6 checksum "
- "failed %x\n",
- sum));
- BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInMsgs);
- BUMP_MIB(ill->ill_icmp6_mib,
- ipv6IfIcmpInErrors);
- freemsg(first_mp);
- return;
- }
-
- icmp_fanout:
- /* Check variable for testing applications */
- if (ipst->ips_ipv6_drop_inbound_icmpv6) {
- freemsg(first_mp);
- return;
- }
- /*
- * Assume that there is always at least one conn for
- * ICMPv6 (in.ndpd) i.e. don't optimize the case
- * where there is no conn.
- */
- if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
- ilm_t *ilm;
- ilm_walker_t ilw;
-
- ASSERT(!IS_LOOPBACK(ill));
- /*
- * In the multicast case, applications may have
- * joined the group from different zones, so we
- * need to deliver the packet to each of them.
- * Loop through the multicast memberships
- * structures (ilm) on the receive ill and send
- * a copy of the packet up each matching one.
- */
- ilm = ilm_walker_start(&ilw, inill);
- for (; ilm != NULL;
- ilm = ilm_walker_step(&ilw, ilm)) {
- if (!IN6_ARE_ADDR_EQUAL(
- &ilm->ilm_v6addr, &ip6h->ip6_dst))
- continue;
- if (!ipif_lookup_zoneid(
- ilw.ilw_walk_ill, ilm->ilm_zoneid,
- IPIF_UP, NULL))
- continue;
-
- first_mp1 = ip_copymsg(first_mp);
- if (first_mp1 == NULL)
- continue;
- icmp_inbound_v6(q, first_mp1,
- ilw.ilw_walk_ill, inill,
- hdr_len, mctl_present, 0,
- ilm->ilm_zoneid, dl_mp);
- }
- ilm_walker_finish(&ilw);
- } else {
- first_mp1 = ip_copymsg(first_mp);
- if (first_mp1 != NULL)
- icmp_inbound_v6(q, first_mp1, ill,
- inill, hdr_len, mctl_present, 0,
- zoneid, dl_mp);
- }
- goto proto_fanout;
- }
- case IPPROTO_ENCAP:
- case IPPROTO_IPV6:
- if (ip_iptun_input_v6(mctl_present ? first_mp : NULL,
- mp, pkt_len - remlen, nexthdr, zoneid, ill, ipst)) {
- return;
- }
- /*
- * If there was no IP tunnel data-link bound to
- * receive this packet, then we fall through to
- * allow potential raw sockets bound to either of
- * these protocols to pick it up.
- */
- /* FALLTHRU */
-proto_fanout:
- default: {
- /*
- * Handle protocols with which IPv6 is less intimate.
- */
- uint_t proto_flags = IP_FF_RAWIP|IP_FF_IPINFO;
-
- if (hada_mp != NULL) {
- ip0dbg(("default hada drop\n"));
- goto hada_drop;
- }
-
- /*
- * Enable sending ICMP for "Unknown" nexthdr
- * case. i.e. where we did not FALLTHRU from
- * IPPROTO_ICMPV6 processing case above.
- * If we did FALLTHRU, then the packet has already been
- * processed for IPPF, don't process it again in
- * ip_fanout_proto_v6; set IP6_NO_IPPOLICY in the
- * flags
- */
- if (nexthdr != IPPROTO_ICMPV6)
- proto_flags |= IP_FF_SEND_ICMP;
- else
- proto_flags |= IP6_NO_IPPOLICY;
-
- ip_fanout_proto_v6(q, first_mp, ip6h, ill, inill,
- nexthdr, prev_nexthdr_offset, (flags|proto_flags),
- mctl_present, zoneid);
- return;
- }
-
- case IPPROTO_DSTOPTS: {
- uint_t ehdrlen;
- uint8_t *optptr;
- ip6_dest_t *desthdr;
-
- /* If packet is too short, look no further */
- if (remlen < MIN_EHDR_LEN)
- goto pkt_too_short;
-
- /* Check if AH is present. */
- if (ipsec_early_ah_v6(q, first_mp, mctl_present, ill,
- inill, hada_mp, zoneid)) {
- return;
- }
-
- /*
- * Reinitialize pointers, as ipsec_early_ah_v6() does
- * complete pullups. We don't have to do more pullups
- * as a result.
- */
- whereptr = (uint8_t *)((uintptr_t)mp->b_rptr +
- (uintptr_t)(whereptr - ((uint8_t *)ip6h)));
- ip6h = (ip6_t *)mp->b_rptr;
-
- desthdr = (ip6_dest_t *)whereptr;
- nexthdr = desthdr->ip6d_nxt;
- prev_nexthdr_offset = (uint_t)(whereptr -
- (uint8_t *)ip6h);
- ehdrlen = 8 * (desthdr->ip6d_len + 1);
- if (remlen < ehdrlen)
- goto pkt_too_short;
- optptr = whereptr + 2;
- /*
- * Note: XXX This code does not seem to make
- * distinction between Destination Options Header
- * being before/after Routing Header which can
- * happen if we are at the end of source route.
- * This may become significant in future.
- * (No real significant Destination Options are
- * defined/implemented yet ).
- */
- switch (ip_process_options_v6(q, first_mp, ip6h, optptr,
- ehdrlen - 2, IPPROTO_DSTOPTS, ipst)) {
- case -1:
- /*
- * Packet has been consumed and any needed
- * ICMP errors sent.
- */
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
- freemsg(hada_mp);
- return;
- case 0:
- /* No action needed continue */
- break;
- case 1:
- /*
- * Unnexpected return value
- * (Router alert is a Hop-by-Hop option)
- */
-#ifdef DEBUG
- panic("ip_rput_data_v6: router "
- "alert hbh opt indication in dest opt");
- /*NOTREACHED*/
-#else
- freemsg(hada_mp);
- freemsg(first_mp);
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- return;
-#endif
- }
- used = ehdrlen;
- break;
- }
- case IPPROTO_FRAGMENT: {
- ip6_frag_t *fraghdr;
- size_t no_frag_hdr_len;
-
- if (hada_mp != NULL) {
- ip0dbg(("frag hada drop\n"));
- goto hada_drop;
- }
-
- ASSERT(first_mp == mp);
- if (remlen < sizeof (ip6_frag_t))
- goto pkt_too_short;
-
- if (mp->b_cont != NULL &&
- whereptr + sizeof (ip6_frag_t) > mp->b_wptr) {
- if (!pullupmsg(mp,
- pkt_len - remlen + sizeof (ip6_frag_t))) {
- BUMP_MIB(ill->ill_ip_mib,
- ipIfStatsInDiscards);
- freemsg(mp);
- return;
- }
- hck_flags = 0;
- ip6h = (ip6_t *)mp->b_rptr;
- whereptr = (uint8_t *)ip6h + pkt_len - remlen;
- }
-
- fraghdr = (ip6_frag_t *)whereptr;
- used = (uint_t)sizeof (ip6_frag_t);
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmReqds);
-
- /*
- * Invoke the CGTP (multirouting) filtering module to
- * process the incoming packet. Packets identified as
- * duplicates must be discarded. Filtering is active
- * only if the the ip_cgtp_filter ndd variable is
- * non-zero.
- */
- if (ipst->ips_ip_cgtp_filter &&
- ipst->ips_ip_cgtp_filter_ops != NULL) {
- int cgtp_flt_pkt;
- netstackid_t stackid;
-
- stackid = ipst->ips_netstack->netstack_stackid;
-
- cgtp_flt_pkt =
- ipst->ips_ip_cgtp_filter_ops->cfo_filter_v6(
- stackid, inill->ill_phyint->phyint_ifindex,
- ip6h, fraghdr);
- if (cgtp_flt_pkt == CGTP_IP_PKT_DUPLICATE) {
- freemsg(mp);
- return;
- }
- }
-
- /* Restore the flags */
- DB_CKSUMFLAGS(mp) = hck_flags;
-
- mp = ip_rput_frag_v6(ill, inill, mp, ip6h, fraghdr,
- remlen - used, &prev_nexthdr_offset,
- &reass_sum, &reass_hck_flags);
- if (mp == NULL) {
- /* Reassembly is still pending */
- return;
- }
- /* The first mblk are the headers before the frag hdr */
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmOKs);
-
- first_mp = mp; /* mp has most likely changed! */
- no_frag_hdr_len = mp->b_wptr - mp->b_rptr;
- ip6h = (ip6_t *)mp->b_rptr;
- nexthdr = ((char *)ip6h)[prev_nexthdr_offset];
- whereptr = mp->b_rptr + no_frag_hdr_len;
- remlen = ntohs(ip6h->ip6_plen) +
- (uint16_t)(IPV6_HDR_LEN - no_frag_hdr_len);
- pkt_len = msgdsize(mp);
- used = 0;
- break;
- }
- case IPPROTO_HOPOPTS: {
- if (hada_mp != NULL) {
- ip0dbg(("hop hada drop\n"));
- goto hada_drop;
- }
- /*
- * Illegal header sequence.
- * (Hop-by-hop headers are processed above
- * and required to immediately follow IPv6 header)
- */
- icmp_param_problem_v6(WR(q), first_mp,
- ICMP6_PARAMPROB_NEXTHEADER,
- prev_nexthdr_offset,
- B_FALSE, B_FALSE, zoneid, ipst);
- return;
- }
- case IPPROTO_ROUTING: {
- uint_t ehdrlen;
- ip6_rthdr_t *rthdr;
-
- /* If packet is too short, look no further */
- if (remlen < MIN_EHDR_LEN)
- goto pkt_too_short;
-
- /* Check if AH is present. */
- if (ipsec_early_ah_v6(q, first_mp, mctl_present, ill,
- inill, hada_mp, zoneid)) {
- return;
- }
-
- /*
- * Reinitialize pointers, as ipsec_early_ah_v6() does
- * complete pullups. We don't have to do more pullups
- * as a result.
- */
- whereptr = (uint8_t *)((uintptr_t)mp->b_rptr +
- (uintptr_t)(whereptr - ((uint8_t *)ip6h)));
- ip6h = (ip6_t *)mp->b_rptr;
-
- rthdr = (ip6_rthdr_t *)whereptr;
- nexthdr = rthdr->ip6r_nxt;
- prev_nexthdr_offset = (uint_t)(whereptr -
- (uint8_t *)ip6h);
- ehdrlen = 8 * (rthdr->ip6r_len + 1);
- if (remlen < ehdrlen)
- goto pkt_too_short;
- if (rthdr->ip6r_segleft != 0) {
- /* Not end of source route */
- if (ll_multicast) {
- BUMP_MIB(ill->ill_ip_mib,
- ipIfStatsForwProhibits);
- freemsg(hada_mp);
- freemsg(mp);
- return;
- }
- ip_process_rthdr(q, mp, ip6h, rthdr, ill,
- hada_mp);
- return;
- }
- used = ehdrlen;
- break;
- }
- case IPPROTO_AH:
- case IPPROTO_ESP: {
- /*
- * Fast path for AH/ESP. If this is the first time
- * we are sending a datagram to AH/ESP, allocate
- * a IPSEC_IN message and prepend it. Otherwise,
- * just fanout.
- */
-
- ipsec_in_t *ii;
- int ipsec_rc;
- ipsec_stack_t *ipss;
-
- ipss = ipst->ips_netstack->netstack_ipsec;
- if (!mctl_present) {
- ASSERT(first_mp == mp);
- first_mp = ipsec_in_alloc(B_FALSE,
- ipst->ips_netstack);
- if (first_mp == NULL) {
- ip1dbg(("ip_rput_data_v6: IPSEC_IN "
- "allocation failure.\n"));
- BUMP_MIB(ill->ill_ip_mib,
- ipIfStatsInDiscards);
- freemsg(mp);
- return;
- }
- /*
- * Store the ill_index so that when we come back
- * from IPSEC we ride on the same queue.
- */
- ii = (ipsec_in_t *)first_mp->b_rptr;
- ii->ipsec_in_ill_index =
- ill->ill_phyint->phyint_ifindex;
- ii->ipsec_in_rill_index =
- inill->ill_phyint->phyint_ifindex;
- first_mp->b_cont = mp;
- /*
- * Cache hardware acceleration info.
- */
- if (hada_mp != NULL) {
- IPSECHW_DEBUG(IPSECHW_PKT,
- ("ip_rput_data_v6: "
- "caching data attr.\n"));
- ii->ipsec_in_accelerated = B_TRUE;
- ii->ipsec_in_da = hada_mp;
- hada_mp = NULL;
- }
- } else {
- ii = (ipsec_in_t *)first_mp->b_rptr;
- }
-
- if (!ipsec_loaded(ipss)) {
- ip_proto_not_sup(q, first_mp, IP_FF_SEND_ICMP,
- zoneid, ipst);
- return;
- }
-
- /* select inbound SA and have IPsec process the pkt */
- if (nexthdr == IPPROTO_ESP) {
- esph_t *esph = ipsec_inbound_esp_sa(first_mp,
- ipst->ips_netstack);
- if (esph == NULL)
- return;
- ASSERT(ii->ipsec_in_esp_sa != NULL);
- ASSERT(ii->ipsec_in_esp_sa->ipsa_input_func !=
- NULL);
- ipsec_rc = ii->ipsec_in_esp_sa->ipsa_input_func(
- first_mp, esph);
- } else {
- ah_t *ah = ipsec_inbound_ah_sa(first_mp,
- ipst->ips_netstack);
- if (ah == NULL)
- return;
- ASSERT(ii->ipsec_in_ah_sa != NULL);
- ASSERT(ii->ipsec_in_ah_sa->ipsa_input_func !=
- NULL);
- ipsec_rc = ii->ipsec_in_ah_sa->ipsa_input_func(
- first_mp, ah);
- }
-
- switch (ipsec_rc) {
- case IPSEC_STATUS_SUCCESS:
- break;
- case IPSEC_STATUS_FAILED:
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- /* FALLTHRU */
- case IPSEC_STATUS_PENDING:
- return;
- }
- /* we're done with IPsec processing, send it up */
- ip_fanout_proto_again(first_mp, ill, inill, NULL);
- return;
- }
- case IPPROTO_NONE:
- /* All processing is done. Count as "delivered". */
- freemsg(hada_mp);
- freemsg(first_mp);
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
- return;
- }
- whereptr += used;
- ASSERT(remlen >= used);
- remlen -= used;
- }
- /* NOTREACHED */
-
-pkt_too_short:
- ip1dbg(("ip_rput_data_v6: packet too short %d %lu %d\n",
- ip6_len, pkt_len, remlen));
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
- freemsg(hada_mp);
- freemsg(first_mp);
- return;
-udp_fanout:
- if (mctl_present || IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
- connp = NULL;
- } else {
- connp = ipcl_classify_v6(mp, IPPROTO_UDP, hdr_len, zoneid,
- ipst);
- if ((connp != NULL) && (connp->conn_upq == NULL)) {
- CONN_DEC_REF(connp);
- connp = NULL;
- }
- }
-
- if (connp == NULL) {
- uint32_t ports;
-
- ports = *(uint32_t *)(mp->b_rptr + hdr_len +
- UDP_PORTS_OFFSET);
- IP6_STAT(ipst, ip6_udp_slow_path);
- ip_fanout_udp_v6(q, first_mp, ip6h, ports, ill, inill,
- (flags|IP_FF_SEND_ICMP|IP_FF_IPINFO), mctl_present,
- zoneid);
- return;
- }
-
- if ((IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) ||
- (!IPCL_IS_NONSTR(connp) && CONN_UDP_FLOWCTLD(connp))) {
- freemsg(first_mp);
- BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows);
- CONN_DEC_REF(connp);
- return;
- }
-
- /* Initiate IPPF processing */
- if (IP6_IN_IPP(flags, ipst)) {
- ip_process(IPP_LOCAL_IN, &mp, ill->ill_phyint->phyint_ifindex);
- if (mp == NULL) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- CONN_DEC_REF(connp);
- return;
- }
- }
-
- if (connp->conn_ip_recvpktinfo ||
- IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src)) {
- mp = ip_add_info_v6(mp, inill, &ip6h->ip6_dst);
- if (mp == NULL) {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- CONN_DEC_REF(connp);
- return;
- }
+ return (NULL);
}
- IP6_STAT(ipst, ip6_udp_fast_path);
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
-
- /* Send it upstream */
- (connp->conn_recv)(connp, mp, NULL);
-
- CONN_DEC_REF(connp);
- freemsg(hada_mp);
- return;
-
-hada_drop:
- ip1dbg(("ip_rput_data_v6: malformed accelerated packet\n"));
- /* IPsec kstats: bump counter here */
- freemsg(hada_mp);
- freemsg(first_mp);
+ /* we're done with IPsec processing, send it up */
+ ip_input_post_ipsec(mp, ira);
+ return (NULL);
}
/*
* Reassemble fragment.
* When it returns a completed message the first mblk will only contain
- * the headers prior to the fragment header.
- *
- * prev_nexthdr_offset is an offset indication of where the nexthdr field is
- * of the preceding header. This is needed to patch the previous header's
- * nexthdr field when reassembly completes.
+ * the headers prior to the fragment header, with the nexthdr value updated
+ * to be the header after the fragment header.
*/
-static mblk_t *
-ip_rput_frag_v6(ill_t *ill, ill_t *inill, mblk_t *mp, ip6_t *ip6h,
- ip6_frag_t *fraghdr, uint_t remlen, uint_t *prev_nexthdr_offset,
- uint32_t *cksum_val, uint16_t *cksum_flags)
+mblk_t *
+ip_input_fragment_v6(mblk_t *mp, ip6_t *ip6h,
+ ip6_frag_t *fraghdr, uint_t remlen, ip_recv_attr_t *ira)
{
uint32_t ident = ntohl(fraghdr->ip6f_ident);
uint16_t offset;
@@ -8225,12 +3304,12 @@ ip_rput_frag_v6(ill_t *ill, ill_t *inill, mblk_t *mp, ip6_t *ip6h,
boolean_t pruned = B_FALSE;
uint32_t sum_val;
uint16_t sum_flags;
+ ill_t *ill = ira->ira_ill;
ip_stack_t *ipst = ill->ill_ipst;
-
- if (cksum_val != NULL)
- *cksum_val = 0;
- if (cksum_flags != NULL)
- *cksum_flags = 0;
+ uint_t prev_nexthdr_offset;
+ uint8_t prev_nexthdr;
+ uint8_t *ptr;
+ uint32_t packet_size;
/*
* We utilize hardware computed checksum info only for UDP since
@@ -8238,8 +3317,9 @@ ip_rput_frag_v6(ill_t *ill, ill_t *inill, mblk_t *mp, ip6_t *ip6h,
* addition, checksum offload support for IP fragments carrying
* UDP payload is commonly implemented across network adapters.
*/
- ASSERT(inill != NULL);
- if (nexthdr == IPPROTO_UDP && dohwcksum && ILL_HCKSUM_CAPABLE(inill) &&
+ ASSERT(ira->ira_rill != NULL);
+ if (nexthdr == IPPROTO_UDP && dohwcksum &&
+ ILL_HCKSUM_CAPABLE(ira->ira_rill) &&
(DB_CKSUMFLAGS(mp) & (HCK_FULLCKSUM | HCK_PARTIALCKSUM))) {
mblk_t *mp1 = mp->b_cont;
int32_t len;
@@ -8253,8 +3333,8 @@ ip_rput_frag_v6(ill_t *ill, ill_t *inill, mblk_t *mp, ip6_t *ip6h,
if ((sum_flags & HCK_PARTIALCKSUM) &&
(mp1 == NULL || mp1->b_cont == NULL) &&
- offset >= (uint16_t)DB_CKSUMSTART(mp) &&
- ((len = offset - (uint16_t)DB_CKSUMSTART(mp)) & 1) == 0) {
+ offset >= DB_CKSUMSTART(mp) &&
+ ((len = offset - DB_CKSUMSTART(mp)) & 1) == 0) {
uint32_t adj;
/*
* Partial checksum has been calculated by hardware
@@ -8281,6 +3361,59 @@ ip_rput_frag_v6(ill_t *ill, ill_t *inill, mblk_t *mp, ip6_t *ip6h,
DB_CKSUMFLAGS(mp) = 0;
/*
+ * Determine the offset (from the begining of the IP header)
+ * of the nexthdr value which has IPPROTO_FRAGMENT. We use
+ * this when removing the fragment header from the packet.
+ * This packet consists of the IPv6 header, a potential
+ * hop-by-hop options header, a potential pre-routing-header
+ * destination options header, and a potential routing header.
+ */
+ prev_nexthdr_offset = (uint8_t *)&ip6h->ip6_nxt - (uint8_t *)ip6h;
+ prev_nexthdr = ip6h->ip6_nxt;
+ ptr = (uint8_t *)&ip6h[1];
+
+ if (prev_nexthdr == IPPROTO_HOPOPTS) {
+ ip6_hbh_t *hbh_hdr;
+ uint_t hdr_len;
+
+ hbh_hdr = (ip6_hbh_t *)ptr;
+ hdr_len = 8 * (hbh_hdr->ip6h_len + 1);
+ prev_nexthdr = hbh_hdr->ip6h_nxt;
+ prev_nexthdr_offset = (uint8_t *)&hbh_hdr->ip6h_nxt
+ - (uint8_t *)ip6h;
+ ptr += hdr_len;
+ }
+ if (prev_nexthdr == IPPROTO_DSTOPTS) {
+ ip6_dest_t *dest_hdr;
+ uint_t hdr_len;
+
+ dest_hdr = (ip6_dest_t *)ptr;
+ hdr_len = 8 * (dest_hdr->ip6d_len + 1);
+ prev_nexthdr = dest_hdr->ip6d_nxt;
+ prev_nexthdr_offset = (uint8_t *)&dest_hdr->ip6d_nxt
+ - (uint8_t *)ip6h;
+ ptr += hdr_len;
+ }
+ if (prev_nexthdr == IPPROTO_ROUTING) {
+ ip6_rthdr_t *rthdr;
+ uint_t hdr_len;
+
+ rthdr = (ip6_rthdr_t *)ptr;
+ prev_nexthdr = rthdr->ip6r_nxt;
+ prev_nexthdr_offset = (uint8_t *)&rthdr->ip6r_nxt
+ - (uint8_t *)ip6h;
+ hdr_len = 8 * (rthdr->ip6r_len + 1);
+ ptr += hdr_len;
+ }
+ if (prev_nexthdr != IPPROTO_FRAGMENT) {
+ /* Can't handle other headers before the fragment header */
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
+ ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
+ freemsg(mp);
+ return (NULL);
+ }
+
+ /*
* Note: Fragment offset in header is in 8-octet units.
* Clearing least significant 3 bits not only extracts
* it but also gets it in units of octets.
@@ -8293,17 +3426,10 @@ ip_rput_frag_v6(ill_t *ill, ill_t *inill, mblk_t *mp, ip6_t *ip6h,
* of eight?
*/
if (more_frags && (ntohs(ip6h->ip6_plen) & 7)) {
- zoneid_t zoneid;
-
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
- zoneid = ipif_lookup_addr_zoneid_v6(&ip6h->ip6_dst, ill, ipst);
- if (zoneid == ALL_ZONES) {
- freemsg(mp);
- return (NULL);
- }
- icmp_param_problem_v6(ill->ill_wq, mp, ICMP6_PARAMPROB_HEADER,
+ ip_drop_input("ICMP_PARAM_PROBLEM", mp, ill);
+ icmp_param_problem_v6(mp, ICMP6_PARAMPROB_HEADER,
(uint32_t)((char *)&ip6h->ip6_plen -
- (char *)ip6h), B_FALSE, B_FALSE, zoneid, ipst);
+ (char *)ip6h), B_FALSE, ira);
return (NULL);
}
@@ -8319,17 +3445,11 @@ ip_rput_frag_v6(ill_t *ill, ill_t *inill, mblk_t *mp, ip6_t *ip6h,
* greater than IP_MAXPACKET - the max payload size?
*/
if (end > IP_MAXPACKET) {
- zoneid_t zoneid;
-
BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
- zoneid = ipif_lookup_addr_zoneid_v6(&ip6h->ip6_dst, ill, ipst);
- if (zoneid == ALL_ZONES) {
- freemsg(mp);
- return (NULL);
- }
- icmp_param_problem_v6(ill->ill_wq, mp, ICMP6_PARAMPROB_HEADER,
+ ip_drop_input("Reassembled packet too large", mp, ill);
+ icmp_param_problem_v6(mp, ICMP6_PARAMPROB_HEADER,
(uint32_t)((char *)&fraghdr->ip6f_offlg -
- (char *)ip6h), B_FALSE, B_FALSE, zoneid, ipst);
+ (char *)ip6h), B_FALSE, ira);
return (NULL);
}
@@ -8368,11 +3488,17 @@ ip_rput_frag_v6(ill_t *ill, ill_t *inill, mblk_t *mp, ip6_t *ip6h,
* there is anything on the reassembly queue, the timer will
* be running.
*/
- msg_len = MBLKSIZE(mp);
+ /* Handle vnic loopback of fragments */
+ if (mp->b_datap->db_ref > 2)
+ msg_len = 0;
+ else
+ msg_len = MBLKSIZE(mp);
+
tail_mp = mp;
while (tail_mp->b_cont != NULL) {
tail_mp = tail_mp->b_cont;
- msg_len += MBLKSIZE(tail_mp);
+ if (tail_mp->b_datap->db_ref <= 2)
+ msg_len += MBLKSIZE(tail_mp);
}
/*
* If the reassembly list for this ILL will get too big
@@ -8381,6 +3507,9 @@ ip_rput_frag_v6(ill_t *ill, ill_t *inill, mblk_t *mp, ip6_t *ip6h,
if ((msg_len + sizeof (*ipf) + ill->ill_frag_count) >=
ipst->ips_ip_reass_queue_bytes) {
+ DTRACE_PROBE3(ip_reass_queue_bytes, uint_t, msg_len,
+ uint_t, ill->ill_frag_count,
+ uint_t, ipst->ips_ip_reass_queue_bytes);
ill_frag_prune(ill,
(ipst->ips_ip_reass_queue_bytes < msg_len) ? 0 :
(ipst->ips_ip_reass_queue_bytes - msg_len));
@@ -8443,6 +3572,7 @@ ip_rput_frag_v6(ill_t *ill, ill_t *inill, mblk_t *mp, ip6_t *ip6h,
mp1 = allocb(sizeof (*ipf), BPRI_MED);
if (!mp1) {
BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards", mp, ill);
freemsg(mp);
partial_reass_done:
mutex_exit(&ipfb->ipfb_lock);
@@ -8512,7 +3642,7 @@ ip_rput_frag_v6(ill_t *ill, ill_t *inill, mblk_t *mp, ip6_t *ip6h,
*/
ipf->ipf_end = end;
ipf->ipf_nf_hdr_len = hdr_length;
- ipf->ipf_prev_nexthdr_offset = *prev_nexthdr_offset;
+ ipf->ipf_prev_nexthdr_offset = prev_nexthdr_offset;
} else {
/* Hard case, hole at the beginning. */
ipf->ipf_tail_mp = NULL;
@@ -8603,7 +3733,7 @@ ip_rput_frag_v6(ill_t *ill, ill_t *inill, mblk_t *mp, ip6_t *ip6h,
if (ipf->ipf_prev_nexthdr_offset == 0) {
ipf->ipf_nf_hdr_len = hdr_length;
ipf->ipf_prev_nexthdr_offset =
- *prev_nexthdr_offset;
+ prev_nexthdr_offset;
}
}
/* Save current byte count */
@@ -8654,7 +3784,7 @@ ip_rput_frag_v6(ill_t *ill, ill_t *inill, mblk_t *mp, ip6_t *ip6h,
* header
*/
nexthdr = ipf->ipf_protocol;
- *prev_nexthdr_offset = ipf->ipf_prev_nexthdr_offset;
+ prev_nexthdr_offset = ipf->ipf_prev_nexthdr_offset;
ipfp = ipf->ipf_ptphn;
/* We need to supply these to caller */
@@ -8685,7 +3815,8 @@ ip_rput_frag_v6(ill_t *ill, ill_t *inill, mblk_t *mp, ip6_t *ip6h,
reass_done:
if (hdr_length < sizeof (ip6_frag_t)) {
BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
- ip1dbg(("ip_rput_frag_v6: bad packet\n"));
+ ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
+ ip1dbg(("ip_input_fragment_v6: bad packet\n"));
freemsg(mp);
return (NULL);
}
@@ -8708,8 +3839,9 @@ reass_done:
mblk_t *nmp;
if (!(nmp = dupb(mp))) {
+ ip1dbg(("ip_input_fragment_v6: dupb failed\n"));
BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- ip1dbg(("ip_rput_frag_v6: dupb failed\n"));
+ ip_drop_input("ipIfStatsInDiscards", mp, ill);
freemsg(mp);
return (NULL);
}
@@ -8720,19 +3852,24 @@ reass_done:
mp->b_wptr = mp->b_rptr + hdr_length - sizeof (ip6_frag_t);
ip6h = (ip6_t *)mp->b_rptr;
- ((char *)ip6h)[*prev_nexthdr_offset] = nexthdr;
+ ((char *)ip6h)[prev_nexthdr_offset] = nexthdr;
/* Restore original IP length in header. */
- ip6h->ip6_plen = htons((uint16_t)(msgdsize(mp) - IPV6_HDR_LEN));
+ packet_size = msgdsize(mp);
+ ip6h->ip6_plen = htons((uint16_t)(packet_size - IPV6_HDR_LEN));
/* Record the ECN info. */
ip6h->ip6_vcf &= htonl(0xFFCFFFFF);
ip6h->ip6_vcf |= htonl(ecn_info << 20);
- /* Reassembly is successful; return checksum information if needed */
- if (cksum_val != NULL)
- *cksum_val = sum_val;
- if (cksum_flags != NULL)
- *cksum_flags = sum_flags;
+ /* Update the receive attributes */
+ ira->ira_pktlen = packet_size;
+ ira->ira_ip_hdr_length = hdr_length - sizeof (ip6_frag_t);
+ ira->ira_protocol = nexthdr;
+
+ /* Reassembly is successful; set checksum information in packet */
+ DB_CKSUM16(mp) = (uint16_t)sum_val;
+ DB_CKSUMFLAGS(mp) = sum_flags;
+ DB_CKSUMSTART(mp) = ira->ira_ip_hdr_length;
return (mp);
}
@@ -8742,7 +3879,7 @@ reass_done:
* header.
*/
static in6_addr_t
-pluck_out_dst(mblk_t *mp, uint8_t *whereptr, in6_addr_t oldrv)
+pluck_out_dst(const mblk_t *mp, uint8_t *whereptr, in6_addr_t oldrv)
{
ip6_rthdr0_t *rt0;
int segleft, numaddr;
@@ -8758,7 +3895,7 @@ pluck_out_dst(mblk_t *mp, uint8_t *whereptr, in6_addr_t oldrv)
numaddr = rt0->ip6r0_len / 2;
if ((rt0->ip6r0_len & 0x1) ||
- whereptr + (rt0->ip6r0_len + 1) * 8 > mp->b_wptr ||
+ (mp != NULL && whereptr + (rt0->ip6r0_len + 1) * 8 > mp->b_wptr) ||
(segleft > rt0->ip6r0_len / 2)) {
/*
* Corrupt packet. Either the routing header length is odd
@@ -8784,11 +3921,13 @@ pluck_out_dst(mblk_t *mp, uint8_t *whereptr, in6_addr_t oldrv)
* Walk through the options to see if there is a routing header.
* If present get the destination which is the last address of
* the option.
+ * mp needs to be provided in cases when the extension headers might span
+ * b_cont; mp is never modified by this function.
*/
in6_addr_t
-ip_get_dst_v6(ip6_t *ip6h, mblk_t *mp, boolean_t *is_fragment)
+ip_get_dst_v6(ip6_t *ip6h, const mblk_t *mp, boolean_t *is_fragment)
{
- mblk_t *current_mp = mp;
+ const mblk_t *current_mp = mp;
uint8_t nexthdr;
uint8_t *whereptr;
int ehdrlen;
@@ -8798,7 +3937,8 @@ ip_get_dst_v6(ip6_t *ip6h, mblk_t *mp, boolean_t *is_fragment)
ehdrlen = sizeof (ip6_t);
/* We assume at least the IPv6 base header is within one mblk. */
- ASSERT(mp->b_rptr <= whereptr && mp->b_wptr >= whereptr + ehdrlen);
+ ASSERT(mp == NULL ||
+ (mp->b_rptr <= whereptr && mp->b_wptr >= whereptr + ehdrlen));
rv = ip6h->ip6_dst;
nexthdr = ip6h->ip6_nxt;
@@ -8819,7 +3959,8 @@ ip_get_dst_v6(ip6_t *ip6h, mblk_t *mp, boolean_t *is_fragment)
* All IPv6 extension headers have the next-header in byte
* 0, and the (length - 8) in 8-byte-words.
*/
- while (whereptr + ehdrlen >= current_mp->b_wptr) {
+ while (current_mp != NULL &&
+ whereptr + ehdrlen >= current_mp->b_wptr) {
ehdrlen -= (current_mp->b_wptr - whereptr);
current_mp = current_mp->b_cont;
if (current_mp == NULL) {
@@ -8833,7 +3974,7 @@ ip_get_dst_v6(ip6_t *ip6h, mblk_t *mp, boolean_t *is_fragment)
whereptr += ehdrlen;
nexthdr = *whereptr;
- ASSERT(whereptr + 1 < current_mp->b_wptr);
+ ASSERT(current_mp == NULL || whereptr + 1 < current_mp->b_wptr);
ehdrlen = (*(whereptr + 1) + 1) * 8;
}
@@ -8845,7 +3986,7 @@ done:
/*
* ip_source_routed_v6:
- * This function is called by redirect code in ip_rput_data_v6 to
+ * This function is called by redirect code (called from ip_input_v6) to
* know whether this packet is source routed through this node i.e
* whether this node (router) is part of the journey. This
* function is called under two cases :
@@ -8922,22 +4063,14 @@ ip_source_routed_v6(ip6_t *ip6h, mblk_t *mp, ip_stack_t *ipst)
*/
if (rthdr->ip6r0_segleft > 0 ||
rthdr->ip6r0_segleft == 0) {
- ire_t *ire = NULL;
-
numaddr = rthdr->ip6r0_len / 2;
addrptr = (in6_addr_t *)((char *)rthdr +
sizeof (*rthdr));
addrptr += (numaddr - (rthdr->ip6r0_segleft + 1));
if (addrptr != NULL) {
- ire = ire_ctable_lookup_v6(addrptr, NULL,
- IRE_LOCAL, NULL, ALL_ZONES, NULL,
- MATCH_IRE_TYPE,
- ipst);
- if (ire != NULL) {
- ire_refrele(ire);
+ if (ip_type_v6(addrptr, ipst) == IRE_LOCAL)
return (B_TRUE);
- }
- ip1dbg(("ip_source_routed_v6: No ire found\n"));
+ ip1dbg(("ip_source_routed_v6: Not local\n"));
}
}
/* FALLTHRU */
@@ -8948,2387 +4081,19 @@ ip_source_routed_v6(ip6_t *ip6h, mblk_t *mp, ip_stack_t *ipst)
}
/*
- * ip_wput_v6 -- Packets sent down from transport modules show up here.
- * Assumes that the following set of headers appear in the first
- * mblk:
- * ip6i_t (if present) CAN also appear as a separate mblk.
- * ip6_t
- * Any extension headers
- * TCP/UDP/SCTP header (if present)
- * The routine can handle an ICMPv6 header that is not in the first mblk.
- *
- * The order to determine the outgoing interface is as follows:
- * 1. If an ip6i_t with IP6I_IFINDEX set then use that ill.
- * 2. If q is an ill queue and (link local or multicast destination) then
- * use that ill.
- * 3. If IPV6_BOUND_IF has been set use that ill.
- * 4. For multicast: if IPV6_MULTICAST_IF has been set use it. Otherwise
- * look for the best IRE match for the unspecified group to determine
- * the ill.
- * 5. For unicast: Just do an IRE lookup for the best match.
- *
- * arg2 is always a queue_t *.
- * When that queue is an ill_t (i.e. q_next != NULL), then arg must be
- * the zoneid.
- * When that queue is not an ill_t, then arg must be a conn_t pointer.
- */
-void
-ip_output_v6(void *arg, mblk_t *mp, void *arg2, int caller)
-{
- conn_t *connp = NULL;
- queue_t *q = (queue_t *)arg2;
- ire_t *ire = NULL;
- ire_t *sctp_ire = NULL;
- ip6_t *ip6h;
- in6_addr_t *v6dstp;
- ill_t *ill = NULL;
- ipif_t *ipif;
- ip6i_t *ip6i;
- int cksum_request; /* -1 => normal. */
- /* 1 => Skip TCP/UDP/SCTP checksum */
- /* Otherwise contains insert offset for checksum */
- int unspec_src;
- boolean_t do_outrequests; /* Increment OutRequests? */
- mib2_ipIfStatsEntry_t *mibptr;
- int match_flags = MATCH_IRE_ILL;
- mblk_t *first_mp;
- boolean_t mctl_present;
- ipsec_out_t *io;
- boolean_t multirt_need_resolve = B_FALSE;
- mblk_t *copy_mp = NULL;
- int err = 0;
- int ip6i_flags = 0;
- zoneid_t zoneid;
- ill_t *saved_ill = NULL;
- boolean_t conn_lock_held;
- boolean_t need_decref = B_FALSE;
- ip_stack_t *ipst;
-
- if (q->q_next != NULL) {
- ill = (ill_t *)q->q_ptr;
- ipst = ill->ill_ipst;
- } else {
- connp = (conn_t *)arg;
- ASSERT(connp != NULL);
- ipst = connp->conn_netstack->netstack_ip;
- }
-
- /*
- * Highest bit in version field is Reachability Confirmation bit
- * used by NUD in ip_xmit_v6().
- */
-#ifdef _BIG_ENDIAN
-#define IPVER(ip6h) ((((uint32_t *)ip6h)[0] >> 28) & 0x7)
-#else
-#define IPVER(ip6h) ((((uint32_t *)ip6h)[0] >> 4) & 0x7)
-#endif
-
- /*
- * M_CTL comes from 5 places
- *
- * 1) TCP sends down IPSEC_OUT(M_CTL) for detached connections
- * both V4 and V6 datagrams.
- *
- * 2) AH/ESP sends down M_CTL after doing their job with both
- * V4 and V6 datagrams.
- *
- * 3) NDP callbacks when nce is resolved and IPSEC_OUT has been
- * attached.
- *
- * 4) Notifications from an external resolver (for XRESOLV ifs)
- *
- * 5) AH/ESP send down IPSEC_CTL(M_CTL) to be relayed to hardware for
- * IPsec hardware acceleration support.
- *
- * We need to handle (1)'s IPv6 case and (3) here. For the
- * IPv4 case in (1), and (2), IPSEC processing has already
- * started. The code in ip_wput() already knows how to handle
- * continuing IPSEC processing (for IPv4 and IPv6). All other
- * M_CTLs (including case (4)) are passed on to ip_wput_nondata()
- * for handling.
- */
- first_mp = mp;
- mctl_present = B_FALSE;
- io = NULL;
-
- /* Multidata transmit? */
- if (DB_TYPE(mp) == M_MULTIDATA) {
- /*
- * We should never get here, since all Multidata messages
- * originating from tcp should have been directed over to
- * tcp_multisend() in the first place.
- */
- BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
- freemsg(mp);
- return;
- } else if (DB_TYPE(mp) == M_CTL) {
- uint32_t mctltype = 0;
- uint32_t mlen = MBLKL(first_mp);
-
- mp = mp->b_cont;
- mctl_present = B_TRUE;
- io = (ipsec_out_t *)first_mp->b_rptr;
-
- /*
- * Validate this M_CTL message. The only three types of
- * M_CTL messages we expect to see in this code path are
- * ipsec_out_t or ipsec_in_t structures (allocated as
- * ipsec_info_t unions), or ipsec_ctl_t structures.
- * The ipsec_out_type and ipsec_in_type overlap in the two
- * data structures, and they are either set to IPSEC_OUT
- * or IPSEC_IN depending on which data structure it is.
- * ipsec_ctl_t is an IPSEC_CTL.
- *
- * All other M_CTL messages are sent to ip_wput_nondata()
- * for handling.
- */
- if (mlen >= sizeof (io->ipsec_out_type))
- mctltype = io->ipsec_out_type;
-
- if ((mlen == sizeof (ipsec_ctl_t)) &&
- (mctltype == IPSEC_CTL)) {
- ip_output(arg, first_mp, arg2, caller);
- return;
- }
-
- if ((mlen < sizeof (ipsec_info_t)) ||
- (mctltype != IPSEC_OUT && mctltype != IPSEC_IN) ||
- mp == NULL) {
- ip_wput_nondata(NULL, q, first_mp, NULL);
- return;
- }
- /* NDP callbacks have q_next non-NULL. That's case #3. */
- if (q->q_next == NULL) {
- ip6h = (ip6_t *)mp->b_rptr;
- /*
- * For a freshly-generated TCP dgram that needs IPV6
- * processing, don't call ip_wput immediately. We can
- * tell this by the ipsec_out_proc_begin. In-progress
- * IPSEC_OUT messages have proc_begin set to TRUE,
- * and we want to send all IPSEC_IN messages to
- * ip_wput() for IPsec processing or finishing.
- */
- if (mctltype == IPSEC_IN ||
- IPVER(ip6h) != IPV6_VERSION ||
- io->ipsec_out_proc_begin) {
- mibptr = &ipst->ips_ip6_mib;
- goto notv6;
- }
- }
- } else if (DB_TYPE(mp) != M_DATA) {
- ip_wput_nondata(NULL, q, mp, NULL);
- return;
- }
-
- ip6h = (ip6_t *)mp->b_rptr;
-
- if (IPVER(ip6h) != IPV6_VERSION) {
- mibptr = &ipst->ips_ip6_mib;
- goto notv6;
- }
-
- if (is_system_labeled() && DB_TYPE(mp) == M_DATA &&
- (connp == NULL || !connp->conn_ulp_labeled)) {
- cred_t *cr;
- pid_t pid;
-
- if (connp != NULL) {
- ASSERT(CONN_CRED(connp) != NULL);
- cr = BEST_CRED(mp, connp, &pid);
- err = tsol_check_label_v6(cr, &mp,
- connp->conn_mac_mode, ipst, pid);
- } else if ((cr = msg_getcred(mp, &pid)) != NULL) {
- err = tsol_check_label_v6(cr, &mp, CONN_MAC_DEFAULT,
- ipst, pid);
- }
- if (mctl_present)
- first_mp->b_cont = mp;
- else
- first_mp = mp;
- if (err != 0) {
- DTRACE_PROBE3(
- tsol_ip_log_drop_checklabel_ip6, char *,
- "conn(1), failed to check/update mp(2)",
- conn_t, connp, mblk_t, mp);
- freemsg(first_mp);
- return;
- }
- ip6h = (ip6_t *)mp->b_rptr;
- }
- if (q->q_next != NULL) {
- /*
- * We don't know if this ill will be used for IPv6
- * until the ILLF_IPV6 flag is set via SIOCSLIFNAME.
- * ipif_set_values() sets the ill_isv6 flag to true if
- * ILLF_IPV6 is set. If the ill_isv6 flag isn't true,
- * just drop the packet.
- */
- if (!ill->ill_isv6) {
- ip1dbg(("ip_wput_v6: Received an IPv6 packet before "
- "ILLF_IPV6 was set\n"));
- freemsg(first_mp);
- return;
- }
- /* For uniformity do a refhold */
- mutex_enter(&ill->ill_lock);
- if (!ILL_CAN_LOOKUP(ill)) {
- mutex_exit(&ill->ill_lock);
- freemsg(first_mp);
- return;
- }
- ill_refhold_locked(ill);
- mutex_exit(&ill->ill_lock);
- mibptr = ill->ill_ip_mib;
-
- ASSERT(mibptr != NULL);
- unspec_src = 0;
- BUMP_MIB(mibptr, ipIfStatsHCOutRequests);
- do_outrequests = B_FALSE;
- zoneid = (zoneid_t)(uintptr_t)arg;
- } else {
- ASSERT(connp != NULL);
- zoneid = connp->conn_zoneid;
-
- /* is queue flow controlled? */
- if ((q->q_first || connp->conn_draining) &&
- (caller == IP_WPUT)) {
- /*
- * 1) TCP sends down M_CTL for detached connections.
- * 2) AH/ESP sends down M_CTL.
- *
- * We don't flow control either of the above. Only
- * UDP and others are flow controlled for which we
- * can't have a M_CTL.
- */
- ASSERT(first_mp == mp);
- (void) putq(q, mp);
- return;
- }
- mibptr = &ipst->ips_ip6_mib;
- unspec_src = connp->conn_unspec_src;
- do_outrequests = B_TRUE;
- if (mp->b_flag & MSGHASREF) {
- mp->b_flag &= ~MSGHASREF;
- ASSERT(connp->conn_ulp == IPPROTO_SCTP);
- SCTP_EXTRACT_IPINFO(mp, sctp_ire);
- need_decref = B_TRUE;
- }
-
- /*
- * If there is a policy, try to attach an ipsec_out in
- * the front. At the end, first_mp either points to a
- * M_DATA message or IPSEC_OUT message linked to a
- * M_DATA message. We have to do it now as we might
- * lose the "conn" if we go through ip_newroute.
- */
- if (!mctl_present &&
- (connp->conn_out_enforce_policy ||
- connp->conn_latch != NULL)) {
- ASSERT(first_mp == mp);
- /* XXX Any better way to get the protocol fast ? */
- if (((mp = ipsec_attach_ipsec_out(&mp, connp, NULL,
- connp->conn_ulp, ipst->ips_netstack)) == NULL)) {
- BUMP_MIB(mibptr, ipIfStatsOutDiscards);
- if (need_decref)
- CONN_DEC_REF(connp);
- return;
- } else {
- ASSERT(mp->b_datap->db_type == M_CTL);
- first_mp = mp;
- mp = mp->b_cont;
- mctl_present = B_TRUE;
- io = (ipsec_out_t *)first_mp->b_rptr;
- }
- }
- }
-
- /* check for alignment and full IPv6 header */
- if (!OK_32PTR((uchar_t *)ip6h) ||
- (mp->b_wptr - (uchar_t *)ip6h) < IPV6_HDR_LEN) {
- ip0dbg(("ip_wput_v6: bad alignment or length\n"));
- if (do_outrequests)
- BUMP_MIB(mibptr, ipIfStatsHCOutRequests);
- BUMP_MIB(mibptr, ipIfStatsOutDiscards);
- freemsg(first_mp);
- if (ill != NULL)
- ill_refrele(ill);
- if (need_decref)
- CONN_DEC_REF(connp);
- return;
- }
- v6dstp = &ip6h->ip6_dst;
- cksum_request = -1;
- ip6i = NULL;
-
- /*
- * Once neighbor discovery has completed, ndp_process() will provide
- * locally generated packets for which processing can be reattempted.
- * In these cases, connp is NULL and the original zone is part of a
- * prepended ipsec_out_t.
- */
- if (io != NULL) {
- /*
- * When coming from icmp_input_v6, the zoneid might not match
- * for the loopback case, because inside icmp_input_v6 the
- * queue_t is a conn queue from the sending side.
- */
- zoneid = io->ipsec_out_zoneid;
- ASSERT(zoneid != ALL_ZONES);
- }
-
- if (ip6h->ip6_nxt == IPPROTO_RAW) {
- /*
- * This is an ip6i_t header followed by an ip6_hdr.
- * Check which fields are set.
- *
- * When the packet comes from a transport we should have
- * all needed headers in the first mblk. However, when
- * going through ip_newroute*_v6 the ip6i might be in
- * a separate mblk when we return here. In that case
- * we pullup everything to ensure that extension and transport
- * headers "stay" in the first mblk.
- */
- ip6i = (ip6i_t *)ip6h;
- ip6i_flags = ip6i->ip6i_flags;
-
- ASSERT((mp->b_wptr - (uchar_t *)ip6i) == sizeof (ip6i_t) ||
- ((mp->b_wptr - (uchar_t *)ip6i) >=
- sizeof (ip6i_t) + IPV6_HDR_LEN));
-
- if ((mp->b_wptr - (uchar_t *)ip6i) == sizeof (ip6i_t)) {
- if (!pullupmsg(mp, -1)) {
- ip1dbg(("ip_wput_v6: pullupmsg failed\n"));
- if (do_outrequests) {
- BUMP_MIB(mibptr,
- ipIfStatsHCOutRequests);
- }
- BUMP_MIB(mibptr, ipIfStatsOutDiscards);
- freemsg(first_mp);
- if (ill != NULL)
- ill_refrele(ill);
- if (need_decref)
- CONN_DEC_REF(connp);
- return;
- }
- ip6h = (ip6_t *)mp->b_rptr;
- v6dstp = &ip6h->ip6_dst;
- ip6i = (ip6i_t *)ip6h;
- }
- ip6h = (ip6_t *)&ip6i[1];
-
- /*
- * Advance rptr past the ip6i_t to get ready for
- * transmitting the packet. However, if the packet gets
- * passed to ip_newroute*_v6 then rptr is moved back so
- * that the ip6i_t header can be inspected when the
- * packet comes back here after passing through
- * ire_add_then_send.
- */
- mp->b_rptr = (uchar_t *)ip6h;
-
- if (ip6i->ip6i_flags & IP6I_IFINDEX) {
- ASSERT(ip6i->ip6i_ifindex != 0);
- if (ill != NULL)
- ill_refrele(ill);
- ill = ill_lookup_on_ifindex(ip6i->ip6i_ifindex, 1,
- NULL, NULL, NULL, NULL, ipst);
- if (ill == NULL) {
- if (do_outrequests) {
- BUMP_MIB(mibptr,
- ipIfStatsHCOutRequests);
- }
- BUMP_MIB(mibptr, ipIfStatsOutDiscards);
- ip1dbg(("ip_wput_v6: bad ifindex %d\n",
- ip6i->ip6i_ifindex));
- if (need_decref)
- CONN_DEC_REF(connp);
- freemsg(first_mp);
- return;
- }
- mibptr = ill->ill_ip_mib;
- /*
- * Preserve the index so that when we return from
- * IPSEC processing, we know where to send the packet.
- */
- if (mctl_present) {
- ASSERT(io != NULL);
- io->ipsec_out_ill_index = ip6i->ip6i_ifindex;
- }
- }
- if (ip6i->ip6i_flags & IP6I_VERIFY_SRC) {
- cred_t *cr = msg_getcred(mp, NULL);
-
- /* rpcmod doesn't send down db_credp for UDP packets */
- if (cr == NULL) {
- if (connp != NULL)
- cr = connp->conn_cred;
- else
- cr = ill->ill_credp;
- }
-
- ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src));
- if (secpolicy_net_rawaccess(cr) != 0) {
- /*
- * Use IPCL_ZONEID to honor SO_ALLZONES.
- */
- ire = ire_route_lookup_v6(&ip6h->ip6_src,
- 0, 0, (IRE_LOCAL|IRE_LOOPBACK), NULL,
- NULL, connp != NULL ?
- IPCL_ZONEID(connp) : zoneid, NULL,
- MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, ipst);
- if (ire == NULL) {
- if (do_outrequests)
- BUMP_MIB(mibptr,
- ipIfStatsHCOutRequests);
- BUMP_MIB(mibptr, ipIfStatsOutDiscards);
- ip1dbg(("ip_wput_v6: bad source "
- "addr\n"));
- freemsg(first_mp);
- if (ill != NULL)
- ill_refrele(ill);
- if (need_decref)
- CONN_DEC_REF(connp);
- return;
- }
- ire_refrele(ire);
- }
- /* No need to verify again when using ip_newroute */
- ip6i->ip6i_flags &= ~IP6I_VERIFY_SRC;
- }
- if (!(ip6i->ip6i_flags & IP6I_NEXTHOP)) {
- /*
- * Make sure they match since ip_newroute*_v6 etc might
- * (unknown to them) inspect ip6i_nexthop when
- * they think they access ip6_dst.
- */
- ip6i->ip6i_nexthop = ip6h->ip6_dst;
- }
- if (ip6i->ip6i_flags & IP6I_NO_ULP_CKSUM)
- cksum_request = 1;
- if (ip6i->ip6i_flags & IP6I_RAW_CHECKSUM)
- cksum_request = ip6i->ip6i_checksum_off;
- if (ip6i->ip6i_flags & IP6I_UNSPEC_SRC)
- unspec_src = 1;
-
- if (do_outrequests && ill != NULL) {
- BUMP_MIB(mibptr, ipIfStatsHCOutRequests);
- do_outrequests = B_FALSE;
- }
- /*
- * Store ip6i_t info that we need after we come back
- * from IPSEC processing.
- */
- if (mctl_present) {
- ASSERT(io != NULL);
- io->ipsec_out_unspec_src = unspec_src;
- }
- }
- if (connp != NULL && connp->conn_dontroute)
- ip6h->ip6_hops = 1;
-
- if (IN6_IS_ADDR_MULTICAST(v6dstp))
- goto ipv6multicast;
-
- /* 1. If an ip6i_t with IP6I_IFINDEX set then use that ill. */
- if (ip6i != NULL && (ip6i->ip6i_flags & IP6I_IFINDEX)) {
- ASSERT(ill != NULL);
- goto send_from_ill;
- }
-
- /*
- * 2. If q is an ill queue and there's a link-local destination
- * then use that ill.
- */
- if (ill != NULL && IN6_IS_ADDR_LINKLOCAL(v6dstp))
- goto send_from_ill;
-
- /* 3. If IPV6_BOUND_IF has been set use that ill. */
- if (connp != NULL && connp->conn_outgoing_ill != NULL) {
- ill_t *conn_outgoing_ill;
-
- conn_outgoing_ill = conn_get_held_ill(connp,
- &connp->conn_outgoing_ill, &err);
- if (err == ILL_LOOKUP_FAILED) {
- if (ill != NULL)
- ill_refrele(ill);
- if (need_decref)
- CONN_DEC_REF(connp);
- freemsg(first_mp);
- return;
- }
- if (ill != NULL)
- ill_refrele(ill);
- ill = conn_outgoing_ill;
- mibptr = ill->ill_ip_mib;
- goto send_from_ill;
- }
-
- /*
- * 4. For unicast: Just do an IRE lookup for the best match.
- * If we get here for a link-local address it is rather random
- * what interface we pick on a multihomed host.
- * *If* there is an IRE_CACHE (and the link-local address
- * isn't duplicated on multi links) this will find the IRE_CACHE.
- * Otherwise it will use one of the matching IRE_INTERFACE routes
- * for the link-local prefix. Hence, applications
- * *should* be encouraged to specify an outgoing interface when sending
- * to a link local address.
- */
- if (connp == NULL || (IP_FLOW_CONTROLLED_ULP(connp->conn_ulp) &&
- !connp->conn_fully_bound)) {
- /*
- * We cache IRE_CACHEs to avoid lookups. We don't do
- * this for the tcp global queue and listen end point
- * as it does not really have a real destination to
- * talk to.
- */
- ire = ire_cache_lookup_v6(v6dstp, zoneid, msg_getlabel(mp),
- ipst);
- } else {
- /*
- * IRE_MARK_CONDEMNED is marked in ire_delete. We don't
- * grab a lock here to check for CONDEMNED as it is okay
- * to send a packet or two with the IRE_CACHE that is going
- * away.
- */
- mutex_enter(&connp->conn_lock);
- ire = sctp_ire != NULL ? sctp_ire : connp->conn_ire_cache;
- if (ire != NULL &&
- IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, v6dstp) &&
- !(ire->ire_marks & IRE_MARK_CONDEMNED)) {
-
- IRE_REFHOLD(ire);
- mutex_exit(&connp->conn_lock);
-
- } else {
- boolean_t cached = B_FALSE;
-
- connp->conn_ire_cache = NULL;
- mutex_exit(&connp->conn_lock);
- /* Release the old ire */
- if (ire != NULL && sctp_ire == NULL)
- IRE_REFRELE_NOTR(ire);
-
- ire = ire_cache_lookup_v6(v6dstp, zoneid,
- msg_getlabel(mp), ipst);
- if (ire != NULL) {
- IRE_REFHOLD_NOTR(ire);
-
- mutex_enter(&connp->conn_lock);
- if (CONN_CACHE_IRE(connp) &&
- (connp->conn_ire_cache == NULL)) {
- rw_enter(&ire->ire_bucket->irb_lock,
- RW_READER);
- if (!(ire->ire_marks &
- IRE_MARK_CONDEMNED)) {
- connp->conn_ire_cache = ire;
- cached = B_TRUE;
- }
- rw_exit(&ire->ire_bucket->irb_lock);
- }
- mutex_exit(&connp->conn_lock);
-
- /*
- * We can continue to use the ire but since it
- * was not cached, we should drop the extra
- * reference.
- */
- if (!cached)
- IRE_REFRELE_NOTR(ire);
- }
- }
- }
-
- if (ire != NULL) {
- if (do_outrequests) {
- /* Handle IRE_LOCAL's that might appear here */
- if (ire->ire_type == IRE_CACHE) {
- mibptr = ((ill_t *)ire->ire_stq->q_ptr)->
- ill_ip_mib;
- } else {
- mibptr = ire->ire_ipif->ipif_ill->ill_ip_mib;
- }
- BUMP_MIB(mibptr, ipIfStatsHCOutRequests);
- }
-
- /*
- * Check if the ire has the RTF_MULTIRT flag, inherited
- * from an IRE_OFFSUBNET ire entry in ip_newroute().
- */
- if (ire->ire_flags & RTF_MULTIRT) {
- /*
- * Force hop limit of multirouted packets if required.
- * The hop limit of such packets is bounded by the
- * ip_multirt_ttl ndd variable.
- * NDP packets must have a hop limit of 255; don't
- * change the hop limit in that case.
- */
- if ((ipst->ips_ip_multirt_ttl > 0) &&
- (ip6h->ip6_hops > ipst->ips_ip_multirt_ttl) &&
- (ip6h->ip6_hops != IPV6_MAX_HOPS)) {
- if (ip_debug > 3) {
- ip2dbg(("ip_wput_v6: forcing multirt "
- "hop limit to %d (was %d) ",
- ipst->ips_ip_multirt_ttl,
- ip6h->ip6_hops));
- pr_addr_dbg("v6dst %s\n", AF_INET6,
- &ire->ire_addr_v6);
- }
- ip6h->ip6_hops = ipst->ips_ip_multirt_ttl;
- }
-
- /*
- * We look at this point if there are pending
- * unresolved routes. ire_multirt_need_resolve_v6()
- * checks in O(n) that all IRE_OFFSUBNET ire
- * entries for the packet's destination and
- * flagged RTF_MULTIRT are currently resolved.
- * If some remain unresolved, we do a copy
- * of the current message. It will be used
- * to initiate additional route resolutions.
- */
- multirt_need_resolve =
- ire_multirt_need_resolve_v6(&ire->ire_addr_v6,
- msg_getlabel(first_mp), ipst);
- ip2dbg(("ip_wput_v6: ire %p, "
- "multirt_need_resolve %d, first_mp %p\n",
- (void *)ire, multirt_need_resolve,
- (void *)first_mp));
- if (multirt_need_resolve) {
- copy_mp = copymsg(first_mp);
- if (copy_mp != NULL) {
- MULTIRT_DEBUG_TAG(copy_mp);
- }
- }
- }
- ip_wput_ire_v6(q, first_mp, ire, unspec_src, cksum_request,
- connp, caller, ip6i_flags, zoneid);
- if (need_decref) {
- CONN_DEC_REF(connp);
- connp = NULL;
- }
- IRE_REFRELE(ire);
-
- /*
- * Try to resolve another multiroute if
- * ire_multirt_need_resolve_v6() deemed it necessary.
- * copy_mp will be consumed (sent or freed) by
- * ip_newroute_v6().
- */
- if (copy_mp != NULL) {
- if (mctl_present) {
- ip6h = (ip6_t *)copy_mp->b_cont->b_rptr;
- } else {
- ip6h = (ip6_t *)copy_mp->b_rptr;
- }
- ip_newroute_v6(q, copy_mp, &ip6h->ip6_dst,
- &ip6h->ip6_src, NULL, zoneid, ipst);
- }
- if (ill != NULL)
- ill_refrele(ill);
- return;
- }
-
- /*
- * No full IRE for this destination. Send it to
- * ip_newroute_v6 to see if anything else matches.
- * Mark this packet as having originated on this
- * machine.
- * Update rptr if there was an ip6i_t header.
- */
- mp->b_prev = NULL;
- mp->b_next = NULL;
- if (ip6i != NULL)
- mp->b_rptr -= sizeof (ip6i_t);
-
- if (unspec_src) {
- if (ip6i == NULL) {
- /*
- * Add ip6i_t header to carry unspec_src
- * until the packet comes back in ip_wput_v6.
- */
- mp = ip_add_info_v6(mp, NULL, v6dstp);
- if (mp == NULL) {
- if (do_outrequests)
- BUMP_MIB(mibptr,
- ipIfStatsHCOutRequests);
- BUMP_MIB(mibptr, ipIfStatsOutDiscards);
- if (mctl_present)
- freeb(first_mp);
- if (ill != NULL)
- ill_refrele(ill);
- if (need_decref)
- CONN_DEC_REF(connp);
- return;
- }
- ip6i = (ip6i_t *)mp->b_rptr;
-
- if (mctl_present) {
- ASSERT(first_mp != mp);
- first_mp->b_cont = mp;
- } else {
- first_mp = mp;
- }
-
- if ((mp->b_wptr - (uchar_t *)ip6i) ==
- sizeof (ip6i_t)) {
- /*
- * ndp_resolver called from ip_newroute_v6
- * expects pulled up message.
- */
- if (!pullupmsg(mp, -1)) {
- ip1dbg(("ip_wput_v6: pullupmsg"
- " failed\n"));
- if (do_outrequests) {
- BUMP_MIB(mibptr,
- ipIfStatsHCOutRequests);
- }
- BUMP_MIB(mibptr, ipIfStatsOutDiscards);
- freemsg(first_mp);
- if (ill != NULL)
- ill_refrele(ill);
- if (need_decref)
- CONN_DEC_REF(connp);
- return;
- }
- ip6i = (ip6i_t *)mp->b_rptr;
- }
- ip6h = (ip6_t *)&ip6i[1];
- v6dstp = &ip6h->ip6_dst;
- }
- ip6i->ip6i_flags |= IP6I_UNSPEC_SRC;
- if (mctl_present) {
- ASSERT(io != NULL);
- io->ipsec_out_unspec_src = unspec_src;
- }
- }
- if (do_outrequests)
- BUMP_MIB(mibptr, ipIfStatsHCOutRequests);
- if (need_decref)
- CONN_DEC_REF(connp);
- ip_newroute_v6(q, first_mp, v6dstp, &ip6h->ip6_src, NULL, zoneid, ipst);
- if (ill != NULL)
- ill_refrele(ill);
- return;
-
-
- /*
- * Handle multicast packets with or without an conn.
- * Assumes that the transports set ip6_hops taking
- * IPV6_MULTICAST_HOPS (and the other ways to set the hoplimit)
- * into account.
- */
-ipv6multicast:
- ip2dbg(("ip_wput_v6: multicast\n"));
-
- /*
- * Hold the conn_lock till we refhold the ill of interest that is
- * pointed to from the conn. Since we cannot do an ill/ipif_refrele
- * while holding any locks, postpone the refrele until after the
- * conn_lock is dropped.
- */
- if (connp != NULL) {
- mutex_enter(&connp->conn_lock);
- conn_lock_held = B_TRUE;
- } else {
- conn_lock_held = B_FALSE;
- }
- if (ip6i != NULL && (ip6i->ip6i_flags & IP6I_IFINDEX)) {
- /* 1. If an ip6i_t with IP6I_IFINDEX set then use that ill. */
- ASSERT(ill != NULL);
- } else if (ill != NULL) {
- /*
- * 2. If q is an ill queue and (link local or multicast
- * destination) then use that ill.
- * We don't need the ipif initialization here.
- * This useless assert below is just to prevent lint from
- * reporting a null body if statement.
- */
- ASSERT(ill != NULL);
- } else if (connp != NULL) {
- /*
- * 3. If IPV6_BOUND_IF has been set use that ill.
- *
- * 4. For multicast: if IPV6_MULTICAST_IF has been set use it.
- * Otherwise look for the best IRE match for the unspecified
- * group to determine the ill.
- *
- * conn_multicast_ill is used for only IPv6 packets.
- * conn_multicast_ipif is used for only IPv4 packets.
- * Thus a PF_INET6 socket send both IPv4 and IPv6
- * multicast packets using different IP*_MULTICAST_IF
- * interfaces.
- */
- if (connp->conn_outgoing_ill != NULL) {
- err = ill_check_and_refhold(connp->conn_outgoing_ill);
- if (err == ILL_LOOKUP_FAILED) {
- ip1dbg(("ip_output_v6: multicast"
- " conn_outgoing_ill no ipif\n"));
-multicast_discard:
- ASSERT(saved_ill == NULL);
- if (conn_lock_held)
- mutex_exit(&connp->conn_lock);
- if (ill != NULL)
- ill_refrele(ill);
- freemsg(first_mp);
- if (do_outrequests)
- BUMP_MIB(mibptr, ipIfStatsOutDiscards);
- if (need_decref)
- CONN_DEC_REF(connp);
- return;
- }
- ill = connp->conn_outgoing_ill;
- } else if (connp->conn_multicast_ill != NULL) {
- err = ill_check_and_refhold(connp->conn_multicast_ill);
- if (err == ILL_LOOKUP_FAILED) {
- ip1dbg(("ip_output_v6: multicast"
- " conn_multicast_ill no ipif\n"));
- goto multicast_discard;
- }
- ill = connp->conn_multicast_ill;
- } else {
- mutex_exit(&connp->conn_lock);
- conn_lock_held = B_FALSE;
- ipif = ipif_lookup_group_v6(v6dstp, zoneid, ipst);
- if (ipif == NULL) {
- ip1dbg(("ip_output_v6: multicast no ipif\n"));
- goto multicast_discard;
- }
- /*
- * We have a ref to this ipif, so we can safely
- * access ipif_ill.
- */
- ill = ipif->ipif_ill;
- mutex_enter(&ill->ill_lock);
- if (!ILL_CAN_LOOKUP(ill)) {
- mutex_exit(&ill->ill_lock);
- ipif_refrele(ipif);
- ill = NULL;
- ip1dbg(("ip_output_v6: multicast no ipif\n"));
- goto multicast_discard;
- }
- ill_refhold_locked(ill);
- mutex_exit(&ill->ill_lock);
- ipif_refrele(ipif);
- /*
- * Save binding until IPV6_MULTICAST_IF
- * changes it
- */
- mutex_enter(&connp->conn_lock);
- connp->conn_multicast_ill = ill;
- mutex_exit(&connp->conn_lock);
- }
- }
- if (conn_lock_held)
- mutex_exit(&connp->conn_lock);
-
- if (saved_ill != NULL)
- ill_refrele(saved_ill);
-
- ASSERT(ill != NULL);
- /*
- * For multicast loopback interfaces replace the multicast address
- * with a unicast address for the ire lookup.
- */
- if (IS_LOOPBACK(ill))
- v6dstp = &ill->ill_ipif->ipif_v6lcl_addr;
-
- mibptr = ill->ill_ip_mib;
- if (do_outrequests) {
- BUMP_MIB(mibptr, ipIfStatsHCOutRequests);
- do_outrequests = B_FALSE;
- }
- BUMP_MIB(mibptr, ipIfStatsHCOutMcastPkts);
- UPDATE_MIB(mibptr, ipIfStatsHCOutMcastOctets,
- ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN);
-
- /*
- * As we may lose the conn by the time we reach ip_wput_ire_v6
- * we copy conn_multicast_loop and conn_dontroute on to an
- * ipsec_out. In case if this datagram goes out secure,
- * we need the ill_index also. Copy that also into the
- * ipsec_out.
- */
- if (mctl_present) {
- io = (ipsec_out_t *)first_mp->b_rptr;
- ASSERT(first_mp->b_datap->db_type == M_CTL);
- ASSERT(io->ipsec_out_type == IPSEC_OUT);
- } else {
- ASSERT(mp == first_mp);
- if ((first_mp = ipsec_alloc_ipsec_out(ipst->ips_netstack)) ==
- NULL) {
- BUMP_MIB(mibptr, ipIfStatsOutDiscards);
- freemsg(mp);
- if (ill != NULL)
- ill_refrele(ill);
- if (need_decref)
- CONN_DEC_REF(connp);
- return;
- }
- io = (ipsec_out_t *)first_mp->b_rptr;
- /* This is not a secure packet */
- io->ipsec_out_secure = B_FALSE;
- io->ipsec_out_use_global_policy = B_TRUE;
- io->ipsec_out_zoneid =
- (zoneid != ALL_ZONES ? zoneid : GLOBAL_ZONEID);
- first_mp->b_cont = mp;
- mctl_present = B_TRUE;
- }
- io->ipsec_out_ill_index = ill->ill_phyint->phyint_ifindex;
- io->ipsec_out_unspec_src = unspec_src;
- if (connp != NULL)
- io->ipsec_out_dontroute = connp->conn_dontroute;
-
-send_from_ill:
- ASSERT(ill != NULL);
- ASSERT(mibptr == ill->ill_ip_mib);
-
- if (do_outrequests) {
- BUMP_MIB(mibptr, ipIfStatsHCOutRequests);
- do_outrequests = B_FALSE;
- }
-
- /*
- * Because nce_xmit() calls ip_output_v6() and NCEs are always tied to
- * an underlying interface, IS_UNDER_IPMP() may be true even when
- * building IREs that will be used for data traffic. As such, use the
- * packet's source address to determine whether the traffic is test
- * traffic, and set MATCH_IRE_MARK_TESTHIDDEN if so.
- *
- * Separately, we also need to mark probe packets so that ND can
- * process them specially; see the comments in nce_queue_mp_common().
- */
- if (IS_UNDER_IPMP(ill) && !IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) &&
- ipif_lookup_testaddr_v6(ill, &ip6h->ip6_src, NULL)) {
- if (ip6i == NULL) {
- if ((mp = ip_add_info_v6(mp, NULL, v6dstp)) == NULL) {
- if (mctl_present)
- freeb(first_mp);
- goto discard;
- }
-
- if (mctl_present)
- first_mp->b_cont = mp;
- else
- first_mp = mp;
-
- /* ndp_resolver() expects a pulled-up message */
- if (MBLKL(mp) == sizeof (ip6i_t) &&
- pullupmsg(mp, -1) == 0) {
- ip1dbg(("ip_output_v6: pullupmsg failed\n"));
-discard: BUMP_MIB(mibptr, ipIfStatsOutDiscards);
- ill_refrele(ill);
- if (need_decref)
- CONN_DEC_REF(connp);
- return;
- }
- ip6i = (ip6i_t *)mp->b_rptr;
- ip6h = (ip6_t *)&ip6i[1];
- v6dstp = &ip6h->ip6_dst;
- mp->b_rptr = (uchar_t *)ip6h; /* rewound below */
- }
- ip6i->ip6i_flags |= IP6I_IPMP_PROBE;
- match_flags |= MATCH_IRE_MARK_TESTHIDDEN;
- }
-
- if (io != NULL)
- io->ipsec_out_ill_index = ill->ill_phyint->phyint_ifindex;
-
- /*
- * When a specific ill is specified (using IPV6_PKTINFO,
- * IPV6_MULTICAST_IF, or IPV6_BOUND_IF) we will only match
- * on routing entries (ftable and ctable) that have a matching
- * ire->ire_ipif->ipif_ill. Thus this can only be used
- * for destinations that are on-link for the specific ill
- * and that can appear on multiple links. Thus it is useful
- * for multicast destinations, link-local destinations, and
- * at some point perhaps for site-local destinations (if the
- * node sits at a site boundary).
- * We create the cache entries in the regular ctable since
- * it can not "confuse" things for other destinations.
- * table.
- *
- * NOTE : conn_ire_cache is not used for caching ire_ctable_lookups.
- * It is used only when ire_cache_lookup is used above.
- */
- ire = ire_ctable_lookup_v6(v6dstp, 0, 0, ill->ill_ipif,
- zoneid, msg_getlabel(mp), match_flags, ipst);
- if (ire != NULL) {
- /*
- * Check if the ire has the RTF_MULTIRT flag, inherited
- * from an IRE_OFFSUBNET ire entry in ip_newroute().
- */
- if (ire->ire_flags & RTF_MULTIRT) {
- /*
- * Force hop limit of multirouted packets if required.
- * The hop limit of such packets is bounded by the
- * ip_multirt_ttl ndd variable.
- * NDP packets must have a hop limit of 255; don't
- * change the hop limit in that case.
- */
- if ((ipst->ips_ip_multirt_ttl > 0) &&
- (ip6h->ip6_hops > ipst->ips_ip_multirt_ttl) &&
- (ip6h->ip6_hops != IPV6_MAX_HOPS)) {
- if (ip_debug > 3) {
- ip2dbg(("ip_wput_v6: forcing multirt "
- "hop limit to %d (was %d) ",
- ipst->ips_ip_multirt_ttl,
- ip6h->ip6_hops));
- pr_addr_dbg("v6dst %s\n", AF_INET6,
- &ire->ire_addr_v6);
- }
- ip6h->ip6_hops = ipst->ips_ip_multirt_ttl;
- }
-
- /*
- * We look at this point if there are pending
- * unresolved routes. ire_multirt_need_resolve_v6()
- * checks in O(n) that all IRE_OFFSUBNET ire
- * entries for the packet's destination and
- * flagged RTF_MULTIRT are currently resolved.
- * If some remain unresolved, we make a copy
- * of the current message. It will be used
- * to initiate additional route resolutions.
- */
- multirt_need_resolve =
- ire_multirt_need_resolve_v6(&ire->ire_addr_v6,
- msg_getlabel(first_mp), ipst);
- ip2dbg(("ip_wput_v6[send_from_ill]: ire %p, "
- "multirt_need_resolve %d, first_mp %p\n",
- (void *)ire, multirt_need_resolve,
- (void *)first_mp));
- if (multirt_need_resolve) {
- copy_mp = copymsg(first_mp);
- if (copy_mp != NULL) {
- MULTIRT_DEBUG_TAG(copy_mp);
- }
- }
- }
-
- ip1dbg(("ip_wput_v6: send on %s, ire = %p, ill index = %d\n",
- ill->ill_name, (void *)ire,
- ill->ill_phyint->phyint_ifindex));
- ip_wput_ire_v6(q, first_mp, ire, unspec_src, cksum_request,
- connp, caller, ip6i_flags, zoneid);
- ire_refrele(ire);
- if (need_decref) {
- CONN_DEC_REF(connp);
- connp = NULL;
- }
-
- /*
- * Try to resolve another multiroute if
- * ire_multirt_need_resolve_v6() deemed it necessary.
- * copy_mp will be consumed (sent or freed) by
- * ip_newroute_[ipif_]v6().
- */
- if (copy_mp != NULL) {
- if (mctl_present) {
- ip6h = (ip6_t *)copy_mp->b_cont->b_rptr;
- } else {
- ip6h = (ip6_t *)copy_mp->b_rptr;
- }
- if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
- ipif = ipif_lookup_group_v6(&ip6h->ip6_dst,
- zoneid, ipst);
- if (ipif == NULL) {
- ip1dbg(("ip_wput_v6: No ipif for "
- "multicast\n"));
- MULTIRT_DEBUG_UNTAG(copy_mp);
- freemsg(copy_mp);
- return;
- }
- ip_newroute_ipif_v6(q, copy_mp, ipif,
- &ip6h->ip6_dst, &ip6h->ip6_src, unspec_src,
- zoneid);
- ipif_refrele(ipif);
- } else {
- ip_newroute_v6(q, copy_mp, &ip6h->ip6_dst,
- &ip6h->ip6_src, ill, zoneid, ipst);
- }
- }
- ill_refrele(ill);
- return;
- }
- if (need_decref) {
- CONN_DEC_REF(connp);
- connp = NULL;
- }
-
- /* Update rptr if there was an ip6i_t header. */
- if (ip6i != NULL)
- mp->b_rptr -= sizeof (ip6i_t);
- if (unspec_src) {
- if (ip6i == NULL) {
- /*
- * Add ip6i_t header to carry unspec_src
- * until the packet comes back in ip_wput_v6.
- */
- if (mctl_present) {
- first_mp->b_cont =
- ip_add_info_v6(mp, NULL, v6dstp);
- mp = first_mp->b_cont;
- if (mp == NULL)
- freeb(first_mp);
- } else {
- first_mp = mp = ip_add_info_v6(mp, NULL,
- v6dstp);
- }
- if (mp == NULL) {
- BUMP_MIB(mibptr, ipIfStatsOutDiscards);
- ill_refrele(ill);
- return;
- }
- ip6i = (ip6i_t *)mp->b_rptr;
- if ((mp->b_wptr - (uchar_t *)ip6i) ==
- sizeof (ip6i_t)) {
- /*
- * ndp_resolver called from ip_newroute_v6
- * expects a pulled up message.
- */
- if (!pullupmsg(mp, -1)) {
- ip1dbg(("ip_wput_v6: pullupmsg"
- " failed\n"));
- BUMP_MIB(mibptr, ipIfStatsOutDiscards);
- freemsg(first_mp);
- return;
- }
- ip6i = (ip6i_t *)mp->b_rptr;
- }
- ip6h = (ip6_t *)&ip6i[1];
- v6dstp = &ip6h->ip6_dst;
- }
- ip6i->ip6i_flags |= IP6I_UNSPEC_SRC;
- if (mctl_present) {
- ASSERT(io != NULL);
- io->ipsec_out_unspec_src = unspec_src;
- }
- }
- if (IN6_IS_ADDR_MULTICAST(v6dstp)) {
- ip_newroute_ipif_v6(q, first_mp, ill->ill_ipif, v6dstp,
- &ip6h->ip6_src, unspec_src, zoneid);
- } else {
- ip_newroute_v6(q, first_mp, v6dstp, &ip6h->ip6_src, ill,
- zoneid, ipst);
- }
- ill_refrele(ill);
- return;
-
-notv6:
- /* FIXME?: assume the caller calls the right version of ip_output? */
- if (q->q_next == NULL) {
- connp = Q_TO_CONN(q);
-
- /*
- * We can change conn_send for all types of conn, even
- * though only TCP uses it right now.
- * FIXME: sctp could use conn_send but doesn't currently.
- */
- ip_setpktversion(connp, B_FALSE, B_TRUE, ipst);
- }
- BUMP_MIB(mibptr, ipIfStatsOutWrongIPVersion);
- (void) ip_output(arg, first_mp, arg2, caller);
- if (ill != NULL)
- ill_refrele(ill);
-}
-
-/*
- * If this is a conn_t queue, then we pass in the conn. This includes the
- * zoneid.
- * Otherwise, this is a message for an ill_t queue,
- * in which case we use the global zoneid since those are all part of
- * the global zone.
- */
-void
-ip_wput_v6(queue_t *q, mblk_t *mp)
-{
- if (CONN_Q(q))
- ip_output_v6(Q_TO_CONN(q), mp, q, IP_WPUT);
- else
- ip_output_v6(GLOBAL_ZONEID, mp, q, IP_WPUT);
-}
-
-/*
- * NULL send-to queue - packet is to be delivered locally.
- */
-void
-ip_wput_local_v6(queue_t *q, ill_t *ill, ip6_t *ip6h, mblk_t *first_mp,
- ire_t *ire, int fanout_flags, zoneid_t zoneid)
-{
- uint32_t ports;
- mblk_t *mp = first_mp, *first_mp1;
- boolean_t mctl_present;
- uint8_t nexthdr;
- uint16_t hdr_length;
- ipsec_out_t *io;
- mib2_ipIfStatsEntry_t *mibptr;
- ilm_t *ilm;
- uint_t nexthdr_offset;
- ip_stack_t *ipst = ill->ill_ipst;
-
- if (DB_TYPE(mp) == M_CTL) {
- io = (ipsec_out_t *)mp->b_rptr;
- if (!io->ipsec_out_secure) {
- mp = mp->b_cont;
- freeb(first_mp);
- first_mp = mp;
- mctl_present = B_FALSE;
- } else {
- mctl_present = B_TRUE;
- mp = first_mp->b_cont;
- ipsec_out_to_in(first_mp);
- }
- } else {
- mctl_present = B_FALSE;
- }
-
- /*
- * Remove reachability confirmation bit from version field
- * before passing the packet on to any firewall hooks or
- * looping back the packet.
- */
- if (ip6h->ip6_vcf & IP_FORWARD_PROG)
- ip6h->ip6_vcf &= ~IP_FORWARD_PROG;
-
- DTRACE_PROBE4(ip6__loopback__in__start,
- ill_t *, ill, ill_t *, NULL,
- ip6_t *, ip6h, mblk_t *, first_mp);
-
- FW_HOOKS6(ipst->ips_ip6_loopback_in_event,
- ipst->ips_ipv6firewall_loopback_in,
- ill, NULL, ip6h, first_mp, mp, 0, ipst);
-
- DTRACE_PROBE1(ip6__loopback__in__end, mblk_t *, first_mp);
-
- if (first_mp == NULL)
- return;
-
- if (ipst->ips_ip6_observe.he_interested) {
- zoneid_t szone, dzone, lookup_zoneid = ALL_ZONES;
- zoneid_t stackzoneid = netstackid_to_zoneid(
- ipst->ips_netstack->netstack_stackid);
-
- szone = (stackzoneid == GLOBAL_ZONEID) ? zoneid : stackzoneid;
- /*
- * ::1 is special, as we cannot lookup its zoneid by
- * address. For this case, restrict the lookup to the
- * source zone.
- */
- if (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst))
- lookup_zoneid = zoneid;
- dzone = ip_get_zoneid_v6(&ip6h->ip6_dst, mp, ill, ipst,
- lookup_zoneid);
- ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill, ipst);
- }
-
- DTRACE_IP7(receive, mblk_t *, first_mp, conn_t *, NULL, void_ip_t *,
- ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *, ip6h,
- int, 1);
-
- nexthdr = ip6h->ip6_nxt;
- mibptr = ill->ill_ip_mib;
-
- /* Fastpath */
- switch (nexthdr) {
- case IPPROTO_TCP:
- case IPPROTO_UDP:
- case IPPROTO_ICMPV6:
- case IPPROTO_SCTP:
- hdr_length = IPV6_HDR_LEN;
- nexthdr_offset = (uint_t)((uchar_t *)&ip6h->ip6_nxt -
- (uchar_t *)ip6h);
- break;
- default: {
- uint8_t *nexthdrp;
-
- if (!ip_hdr_length_nexthdr_v6(mp, ip6h,
- &hdr_length, &nexthdrp)) {
- /* Malformed packet */
- BUMP_MIB(mibptr, ipIfStatsOutDiscards);
- freemsg(first_mp);
- return;
- }
- nexthdr = *nexthdrp;
- nexthdr_offset = nexthdrp - (uint8_t *)ip6h;
- break;
- }
- }
-
- UPDATE_OB_PKT_COUNT(ire);
- ire->ire_last_used_time = lbolt;
-
- switch (nexthdr) {
- case IPPROTO_TCP:
- if (DB_TYPE(mp) == M_DATA) {
- /*
- * M_DATA mblk, so init mblk (chain) for
- * no struio().
- */
- mblk_t *mp1 = mp;
-
- do {
- mp1->b_datap->db_struioflag = 0;
- } while ((mp1 = mp1->b_cont) != NULL);
- }
- ports = *(uint32_t *)(mp->b_rptr + hdr_length +
- TCP_PORTS_OFFSET);
- ip_fanout_tcp_v6(q, first_mp, ip6h, ill, ill,
- fanout_flags|IP_FF_SEND_ICMP|IP_FF_SYN_ADDIRE|
- IP_FF_IPINFO|IP6_NO_IPPOLICY|IP_FF_LOOPBACK,
- hdr_length, mctl_present, ire->ire_zoneid);
- return;
-
- case IPPROTO_UDP:
- ports = *(uint32_t *)(mp->b_rptr + hdr_length +
- UDP_PORTS_OFFSET);
- ip_fanout_udp_v6(q, first_mp, ip6h, ports, ill, ill,
- fanout_flags|IP_FF_SEND_ICMP|IP_FF_IPINFO|
- IP6_NO_IPPOLICY, mctl_present, ire->ire_zoneid);
- return;
-
- case IPPROTO_SCTP:
- {
- ports = *(uint32_t *)(mp->b_rptr + hdr_length);
- ip_fanout_sctp(first_mp, ill, (ipha_t *)ip6h, ports,
- fanout_flags|IP_FF_SEND_ICMP|IP_FF_IPINFO,
- mctl_present, IP6_NO_IPPOLICY, ire->ire_zoneid);
- return;
- }
- case IPPROTO_ICMPV6: {
- icmp6_t *icmp6;
-
- /* check for full IPv6+ICMPv6 header */
- if ((mp->b_wptr - mp->b_rptr) <
- (hdr_length + ICMP6_MINLEN)) {
- if (!pullupmsg(mp, hdr_length + ICMP6_MINLEN)) {
- ip1dbg(("ip_wput_v6: ICMP hdr pullupmsg"
- " failed\n"));
- BUMP_MIB(mibptr, ipIfStatsOutDiscards);
- freemsg(first_mp);
- return;
- }
- ip6h = (ip6_t *)mp->b_rptr;
- }
- icmp6 = (icmp6_t *)((uchar_t *)ip6h + hdr_length);
-
- /* Update output mib stats */
- icmp_update_out_mib_v6(ill, icmp6);
-
- /* Check variable for testing applications */
- if (ipst->ips_ipv6_drop_inbound_icmpv6) {
- freemsg(first_mp);
- return;
- }
- /*
- * Assume that there is always at least one conn for
- * ICMPv6 (in.ndpd) i.e. don't optimize the case
- * where there is no conn.
- */
- if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
- !IS_LOOPBACK(ill)) {
- ilm_walker_t ilw;
-
- /*
- * In the multicast case, applications may have
- * joined the group from different zones, so we
- * need to deliver the packet to each of them.
- * Loop through the multicast memberships
- * structures (ilm) on the receive ill and send
- * a copy of the packet up each matching one.
- * However, we don't do this for multicasts sent
- * on the loopback interface (PHYI_LOOPBACK flag
- * set) as they must stay in the sender's zone.
- */
- ilm = ilm_walker_start(&ilw, ill);
- for (; ilm != NULL;
- ilm = ilm_walker_step(&ilw, ilm)) {
- if (!IN6_ARE_ADDR_EQUAL(
- &ilm->ilm_v6addr, &ip6h->ip6_dst))
- continue;
- if ((fanout_flags &
- IP_FF_NO_MCAST_LOOP) &&
- ilm->ilm_zoneid == ire->ire_zoneid)
- continue;
- if (!ipif_lookup_zoneid(
- ilw.ilw_walk_ill, ilm->ilm_zoneid,
- IPIF_UP, NULL))
- continue;
-
- first_mp1 = ip_copymsg(first_mp);
- if (first_mp1 == NULL)
- continue;
- icmp_inbound_v6(q, first_mp1,
- ilw.ilw_walk_ill, ill, hdr_length,
- mctl_present, IP6_NO_IPPOLICY,
- ilm->ilm_zoneid, NULL);
- }
- ilm_walker_finish(&ilw);
- } else {
- first_mp1 = ip_copymsg(first_mp);
- if (first_mp1 != NULL)
- icmp_inbound_v6(q, first_mp1, ill, ill,
- hdr_length, mctl_present,
- IP6_NO_IPPOLICY, ire->ire_zoneid,
- NULL);
- }
- }
- /* FALLTHRU */
- default: {
- /*
- * Handle protocols with which IPv6 is less intimate.
- */
- fanout_flags |= IP_FF_RAWIP|IP_FF_IPINFO;
-
- /*
- * Enable sending ICMP for "Unknown" nexthdr
- * case. i.e. where we did not FALLTHRU from
- * IPPROTO_ICMPV6 processing case above.
- */
- if (nexthdr != IPPROTO_ICMPV6)
- fanout_flags |= IP_FF_SEND_ICMP;
- /*
- * Note: There can be more than one stream bound
- * to a particular protocol. When this is the case,
- * each one gets a copy of any incoming packets.
- */
- ip_fanout_proto_v6(q, first_mp, ip6h, ill, ill, nexthdr,
- nexthdr_offset, fanout_flags|IP6_NO_IPPOLICY,
- mctl_present, ire->ire_zoneid);
- return;
- }
- }
-}
-
-/*
- * Send packet using IRE.
- * Checksumming is controlled by cksum_request:
- * -1 => normal i.e. TCP/UDP/SCTP/ICMPv6 are checksummed and nothing else.
- * 1 => Skip TCP/UDP/SCTP checksum
- * Otherwise => checksum_request contains insert offset for checksum
- *
- * Assumes that the following set of headers appear in the first
- * mblk:
- * ip6_t
- * Any extension headers
- * TCP/UDP/SCTP header (if present)
- * The routine can handle an ICMPv6 header that is not in the first mblk.
- *
- * NOTE : This function does not ire_refrele the ire passed in as the
- * argument unlike ip_wput_ire where the REFRELE is done.
- * Refer to ip_wput_ire for more on this.
- */
-static void
-ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src,
- int cksum_request, conn_t *connp, int caller, int flags, zoneid_t zoneid)
-{
- ip6_t *ip6h;
- uint8_t nexthdr;
- uint16_t hdr_length;
- uint_t reachable = 0x0;
- ill_t *ill;
- mib2_ipIfStatsEntry_t *mibptr;
- mblk_t *first_mp;
- boolean_t mctl_present;
- ipsec_out_t *io;
- boolean_t conn_dontroute; /* conn value for multicast */
- boolean_t conn_multicast_loop; /* conn value for multicast */
- boolean_t multicast_forward; /* Should we forward ? */
- int max_frag;
- ip_stack_t *ipst = ire->ire_ipst;
- ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec;
-
- ill = ire_to_ill(ire);
- first_mp = mp;
- multicast_forward = B_FALSE;
-
- if (mp->b_datap->db_type != M_CTL) {
- ip6h = (ip6_t *)first_mp->b_rptr;
- } else {
- io = (ipsec_out_t *)first_mp->b_rptr;
- ASSERT(io->ipsec_out_type == IPSEC_OUT);
- /*
- * Grab the zone id now because the M_CTL can be discarded by
- * ip_wput_ire_parse_ipsec_out() below.
- */
- ASSERT(zoneid == io->ipsec_out_zoneid);
- ASSERT(zoneid != ALL_ZONES);
- ip6h = (ip6_t *)first_mp->b_cont->b_rptr;
- /*
- * For the multicast case, ipsec_out carries conn_dontroute and
- * conn_multicast_loop as conn may not be available here. We
- * need this for multicast loopback and forwarding which is done
- * later in the code.
- */
- if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
- conn_dontroute = io->ipsec_out_dontroute;
- conn_multicast_loop = io->ipsec_out_multicast_loop;
- /*
- * If conn_dontroute is not set or conn_multicast_loop
- * is set, we need to do forwarding/loopback. For
- * datagrams from ip_wput_multicast, conn_dontroute is
- * set to B_TRUE and conn_multicast_loop is set to
- * B_FALSE so that we neither do forwarding nor
- * loopback.
- */
- if (!conn_dontroute || conn_multicast_loop)
- multicast_forward = B_TRUE;
- }
- }
-
- /*
- * If the sender didn't supply the hop limit and there is a default
- * unicast hop limit associated with the output interface, we use
- * that if the packet is unicast. Interface specific unicast hop
- * limits as set via the SIOCSLIFLNKINFO ioctl.
- */
- if (ill->ill_max_hops != 0 && !(flags & IP6I_HOPLIMIT) &&
- !(IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst))) {
- ip6h->ip6_hops = ill->ill_max_hops;
- }
-
- if (ire->ire_type == IRE_LOCAL && ire->ire_zoneid != zoneid &&
- ire->ire_zoneid != ALL_ZONES) {
- /*
- * When a zone sends a packet to another zone, we try to deliver
- * the packet under the same conditions as if the destination
- * was a real node on the network. To do so, we look for a
- * matching route in the forwarding table.
- * RTF_REJECT and RTF_BLACKHOLE are handled just like
- * ip_newroute_v6() does.
- * Note that IRE_LOCAL are special, since they are used
- * when the zoneid doesn't match in some cases. This means that
- * we need to handle ipha_src differently since ire_src_addr
- * belongs to the receiving zone instead of the sending zone.
- * When ip_restrict_interzone_loopback is set, then
- * ire_cache_lookup_v6() ensures that IRE_LOCAL are only used
- * for loopback between zones when the logical "Ethernet" would
- * have looped them back.
- */
- ire_t *src_ire;
-
- src_ire = ire_ftable_lookup_v6(&ip6h->ip6_dst, 0, 0, 0,
- NULL, NULL, zoneid, 0, NULL, (MATCH_IRE_RECURSIVE |
- MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE), ipst);
- if (src_ire != NULL &&
- !(src_ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) &&
- (!ipst->ips_ip_restrict_interzone_loopback ||
- ire_local_same_lan(ire, src_ire))) {
- if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) &&
- !unspec_src) {
- ip6h->ip6_src = src_ire->ire_src_addr_v6;
- }
- ire_refrele(src_ire);
- } else {
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutNoRoutes);
- if (src_ire != NULL) {
- if (src_ire->ire_flags & RTF_BLACKHOLE) {
- ire_refrele(src_ire);
- freemsg(first_mp);
- return;
- }
- ire_refrele(src_ire);
- }
- if (ip_hdr_complete_v6(ip6h, zoneid, ipst)) {
- /* Failed */
- freemsg(first_mp);
- return;
- }
- icmp_unreachable_v6(q, first_mp,
- ICMP6_DST_UNREACH_NOROUTE, B_FALSE, B_FALSE,
- zoneid, ipst);
- return;
- }
- }
-
- if (mp->b_datap->db_type == M_CTL ||
- ipss->ipsec_outbound_v6_policy_present) {
- mp = ip_wput_ire_parse_ipsec_out(first_mp, NULL, ip6h, ire,
- connp, unspec_src, zoneid);
- if (mp == NULL) {
- return;
- }
- }
-
- first_mp = mp;
- if (mp->b_datap->db_type == M_CTL) {
- io = (ipsec_out_t *)mp->b_rptr;
- ASSERT(io->ipsec_out_type == IPSEC_OUT);
- mp = mp->b_cont;
- mctl_present = B_TRUE;
- } else {
- mctl_present = B_FALSE;
- }
-
- ip6h = (ip6_t *)mp->b_rptr;
- nexthdr = ip6h->ip6_nxt;
- mibptr = ill->ill_ip_mib;
-
- if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) && !unspec_src) {
- ipif_t *ipif;
-
- /*
- * Select the source address using ipif_select_source_v6.
- */
- ipif = ipif_select_source_v6(ill, &ip6h->ip6_dst, B_FALSE,
- IPV6_PREFER_SRC_DEFAULT, zoneid);
- if (ipif == NULL) {
- if (ip_debug > 2) {
- /* ip1dbg */
- pr_addr_dbg("ip_wput_ire_v6: no src for "
- "dst %s\n", AF_INET6, &ip6h->ip6_dst);
- printf("through interface %s\n", ill->ill_name);
- }
- freemsg(first_mp);
- return;
- }
- ip6h->ip6_src = ipif->ipif_v6src_addr;
- ipif_refrele(ipif);
- }
- if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
- if ((connp != NULL && connp->conn_multicast_loop) ||
- !IS_LOOPBACK(ill)) {
- if (ilm_lookup_ill_v6(ill, &ip6h->ip6_dst, B_FALSE,
- ALL_ZONES) != NULL) {
- mblk_t *nmp;
- int fanout_flags = 0;
-
- if (connp != NULL &&
- !connp->conn_multicast_loop) {
- fanout_flags |= IP_FF_NO_MCAST_LOOP;
- }
- ip1dbg(("ip_wput_ire_v6: "
- "Loopback multicast\n"));
- nmp = ip_copymsg(first_mp);
- if (nmp != NULL) {
- ip6_t *nip6h;
- mblk_t *mp_ip6h;
-
- if (mctl_present) {
- nip6h = (ip6_t *)
- nmp->b_cont->b_rptr;
- mp_ip6h = nmp->b_cont;
- } else {
- nip6h = (ip6_t *)nmp->b_rptr;
- mp_ip6h = nmp;
- }
-
- DTRACE_PROBE4(
- ip6__loopback__out__start,
- ill_t *, NULL,
- ill_t *, ill,
- ip6_t *, nip6h,
- mblk_t *, nmp);
-
- FW_HOOKS6(
- ipst->ips_ip6_loopback_out_event,
- ipst->ips_ipv6firewall_loopback_out,
- NULL, ill, nip6h, nmp, mp_ip6h,
- 0, ipst);
-
- DTRACE_PROBE1(
- ip6__loopback__out__end,
- mblk_t *, nmp);
-
- /*
- * DTrace this as ip:::send. A blocked
- * packet will fire the send probe, but
- * not the receive probe.
- */
- DTRACE_IP7(send, mblk_t *, nmp,
- conn_t *, NULL, void_ip_t *, nip6h,
- __dtrace_ipsr_ill_t *, ill,
- ipha_t *, NULL, ip6_t *, nip6h,
- int, 1);
-
- if (nmp != NULL) {
- /*
- * Deliver locally and to
- * every local zone, except
- * the sending zone when
- * IPV6_MULTICAST_LOOP is
- * disabled.
- */
- ip_wput_local_v6(RD(q), ill,
- nip6h, nmp, ire,
- fanout_flags, zoneid);
- }
- } else {
- BUMP_MIB(mibptr, ipIfStatsOutDiscards);
- ip1dbg(("ip_wput_ire_v6: "
- "copymsg failed\n"));
- }
- }
- }
- if (ip6h->ip6_hops == 0 ||
- IN6_IS_ADDR_MC_NODELOCAL(&ip6h->ip6_dst) ||
- IS_LOOPBACK(ill)) {
- /*
- * Local multicast or just loopback on loopback
- * interface.
- */
- BUMP_MIB(mibptr, ipIfStatsHCOutMcastPkts);
- UPDATE_MIB(mibptr, ipIfStatsHCOutMcastOctets,
- ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN);
- ip1dbg(("ip_wput_ire_v6: local multicast only\n"));
- freemsg(first_mp);
- return;
- }
- }
-
- if (ire->ire_stq != NULL) {
- uint32_t sum;
- uint_t ill_index = ((ill_t *)ire->ire_stq->q_ptr)->
- ill_phyint->phyint_ifindex;
- queue_t *dev_q = ire->ire_stq->q_next;
-
- /*
- * non-NULL send-to queue - packet is to be sent
- * out an interface.
- */
-
- /* Driver is flow-controlling? */
- if (!IP_FLOW_CONTROLLED_ULP(nexthdr) &&
- DEV_Q_FLOW_BLOCKED(dev_q)) {
- /*
- * Queue packet if we have an conn to give back
- * pressure. We can't queue packets intended for
- * hardware acceleration since we've tossed that
- * state already. If the packet is being fed back
- * from ire_send_v6, we don't know the position in
- * the queue to enqueue the packet and we discard
- * the packet.
- */
- if (ipst->ips_ip_output_queue && connp != NULL &&
- !mctl_present && caller != IRE_SEND) {
- if (caller == IP_WSRV) {
- idl_tx_list_t *idl_txl;
-
- idl_txl = &ipst->ips_idl_tx_list[0];
- connp->conn_did_putbq = 1;
- (void) putbq(connp->conn_wq, mp);
- conn_drain_insert(connp, idl_txl);
- /*
- * caller == IP_WSRV implies we are
- * the service thread, and the
- * queue is already noenabled.
- * The check for canput and
- * the putbq is not atomic.
- * So we need to check again.
- */
- if (canput(dev_q))
- connp->conn_did_putbq = 0;
- } else {
- (void) putq(connp->conn_wq, mp);
- }
- return;
- }
- BUMP_MIB(mibptr, ipIfStatsOutDiscards);
- freemsg(first_mp);
- return;
- }
-
- /*
- * Look for reachability confirmations from the transport.
- */
- if (ip6h->ip6_vcf & IP_FORWARD_PROG) {
- reachable |= IPV6_REACHABILITY_CONFIRMATION;
- ip6h->ip6_vcf &= ~IP_FORWARD_PROG;
- if (mctl_present)
- io->ipsec_out_reachable = B_TRUE;
- }
- /* Fastpath */
- switch (nexthdr) {
- case IPPROTO_TCP:
- case IPPROTO_UDP:
- case IPPROTO_ICMPV6:
- case IPPROTO_SCTP:
- hdr_length = IPV6_HDR_LEN;
- break;
- default: {
- uint8_t *nexthdrp;
-
- if (!ip_hdr_length_nexthdr_v6(mp, ip6h,
- &hdr_length, &nexthdrp)) {
- /* Malformed packet */
- BUMP_MIB(mibptr, ipIfStatsOutDiscards);
- freemsg(first_mp);
- return;
- }
- nexthdr = *nexthdrp;
- break;
- }
- }
-
- if (cksum_request != -1 && nexthdr != IPPROTO_ICMPV6) {
- uint16_t *up;
- uint16_t *insp;
-
- /*
- * The packet header is processed once for all, even
- * in the multirouting case. We disable hardware
- * checksum if the packet is multirouted, as it will be
- * replicated via several interfaces, and not all of
- * them may have this capability.
- */
- if (cksum_request == 1 &&
- !(ire->ire_flags & RTF_MULTIRT)) {
- /* Skip the transport checksum */
- goto cksum_done;
- }
- /*
- * Do user-configured raw checksum.
- * Compute checksum and insert at offset "cksum_request"
- */
-
- /* check for enough headers for checksum */
- cksum_request += hdr_length; /* offset from rptr */
- if ((mp->b_wptr - mp->b_rptr) <
- (cksum_request + sizeof (int16_t))) {
- if (!pullupmsg(mp,
- cksum_request + sizeof (int16_t))) {
- ip1dbg(("ip_wput_v6: ICMP hdr pullupmsg"
- " failed\n"));
- BUMP_MIB(mibptr, ipIfStatsOutDiscards);
- freemsg(first_mp);
- return;
- }
- ip6h = (ip6_t *)mp->b_rptr;
- }
- insp = (uint16_t *)((uchar_t *)ip6h + cksum_request);
- ASSERT(((uintptr_t)insp & 0x1) == 0);
- up = (uint16_t *)&ip6h->ip6_src;
- /*
- * icmp has placed length and routing
- * header adjustment in *insp.
- */
- sum = htons(nexthdr) +
- up[0] + up[1] + up[2] + up[3] +
- up[4] + up[5] + up[6] + up[7] +
- up[8] + up[9] + up[10] + up[11] +
- up[12] + up[13] + up[14] + up[15];
- sum = (sum & 0xffff) + (sum >> 16);
- *insp = IP_CSUM(mp, hdr_length, sum);
- } else if (nexthdr == IPPROTO_TCP) {
- uint16_t *up;
-
- /*
- * Check for full IPv6 header + enough TCP header
- * to get at the checksum field.
- */
- if ((mp->b_wptr - mp->b_rptr) <
- (hdr_length + TCP_CHECKSUM_OFFSET +
- TCP_CHECKSUM_SIZE)) {
- if (!pullupmsg(mp, hdr_length +
- TCP_CHECKSUM_OFFSET + TCP_CHECKSUM_SIZE)) {
- ip1dbg(("ip_wput_v6: TCP hdr pullupmsg"
- " failed\n"));
- BUMP_MIB(mibptr, ipIfStatsOutDiscards);
- freemsg(first_mp);
- return;
- }
- ip6h = (ip6_t *)mp->b_rptr;
- }
-
- up = (uint16_t *)&ip6h->ip6_src;
- /*
- * Note: The TCP module has stored the length value
- * into the tcp checksum field, so we don't
- * need to explicitly sum it in here.
- */
- sum = up[0] + up[1] + up[2] + up[3] +
- up[4] + up[5] + up[6] + up[7] +
- up[8] + up[9] + up[10] + up[11] +
- up[12] + up[13] + up[14] + up[15];
-
- /* Fold the initial sum */
- sum = (sum & 0xffff) + (sum >> 16);
-
- up = (uint16_t *)(((uchar_t *)ip6h) +
- hdr_length + TCP_CHECKSUM_OFFSET);
-
- IP_CKSUM_XMIT(ill, ire, mp, ip6h, up, IPPROTO_TCP,
- hdr_length, ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN,
- ire->ire_max_frag, mctl_present, sum);
-
- /* Software checksum? */
- if (DB_CKSUMFLAGS(mp) == 0) {
- IP6_STAT(ipst, ip6_out_sw_cksum);
- IP6_STAT_UPDATE(ipst,
- ip6_tcp_out_sw_cksum_bytes,
- (ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN) -
- hdr_length);
- }
- } else if (nexthdr == IPPROTO_UDP) {
- uint16_t *up;
-
- /*
- * check for full IPv6 header + enough UDP header
- * to get at the UDP checksum field
- */
- if ((mp->b_wptr - mp->b_rptr) < (hdr_length +
- UDP_CHECKSUM_OFFSET + UDP_CHECKSUM_SIZE)) {
- if (!pullupmsg(mp, hdr_length +
- UDP_CHECKSUM_OFFSET + UDP_CHECKSUM_SIZE)) {
- ip1dbg(("ip_wput_v6: UDP hdr pullupmsg"
- " failed\n"));
- BUMP_MIB(mibptr, ipIfStatsOutDiscards);
- freemsg(first_mp);
- return;
- }
- ip6h = (ip6_t *)mp->b_rptr;
- }
- up = (uint16_t *)&ip6h->ip6_src;
- /*
- * Note: The UDP module has stored the length value
- * into the udp checksum field, so we don't
- * need to explicitly sum it in here.
- */
- sum = up[0] + up[1] + up[2] + up[3] +
- up[4] + up[5] + up[6] + up[7] +
- up[8] + up[9] + up[10] + up[11] +
- up[12] + up[13] + up[14] + up[15];
-
- /* Fold the initial sum */
- sum = (sum & 0xffff) + (sum >> 16);
-
- up = (uint16_t *)(((uchar_t *)ip6h) +
- hdr_length + UDP_CHECKSUM_OFFSET);
-
- IP_CKSUM_XMIT(ill, ire, mp, ip6h, up, IPPROTO_UDP,
- hdr_length, ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN,
- ire->ire_max_frag, mctl_present, sum);
-
- /* Software checksum? */
- if (DB_CKSUMFLAGS(mp) == 0) {
- IP6_STAT(ipst, ip6_out_sw_cksum);
- IP6_STAT_UPDATE(ipst,
- ip6_udp_out_sw_cksum_bytes,
- (ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN) -
- hdr_length);
- }
- } else if (nexthdr == IPPROTO_ICMPV6) {
- uint16_t *up;
- icmp6_t *icmp6;
-
- /* check for full IPv6+ICMPv6 header */
- if ((mp->b_wptr - mp->b_rptr) <
- (hdr_length + ICMP6_MINLEN)) {
- if (!pullupmsg(mp, hdr_length + ICMP6_MINLEN)) {
- ip1dbg(("ip_wput_v6: ICMP hdr pullupmsg"
- " failed\n"));
- BUMP_MIB(mibptr, ipIfStatsOutDiscards);
- freemsg(first_mp);
- return;
- }
- ip6h = (ip6_t *)mp->b_rptr;
- }
- icmp6 = (icmp6_t *)((uchar_t *)ip6h + hdr_length);
- up = (uint16_t *)&ip6h->ip6_src;
- /*
- * icmp has placed length and routing
- * header adjustment in icmp6_cksum.
- */
- sum = htons(IPPROTO_ICMPV6) +
- up[0] + up[1] + up[2] + up[3] +
- up[4] + up[5] + up[6] + up[7] +
- up[8] + up[9] + up[10] + up[11] +
- up[12] + up[13] + up[14] + up[15];
- sum = (sum & 0xffff) + (sum >> 16);
- icmp6->icmp6_cksum = IP_CSUM(mp, hdr_length, sum);
-
- /* Update output mib stats */
- icmp_update_out_mib_v6(ill, icmp6);
- } else if (nexthdr == IPPROTO_SCTP) {
- sctp_hdr_t *sctph;
-
- if (MBLKL(mp) < (hdr_length + sizeof (*sctph))) {
- if (!pullupmsg(mp, hdr_length +
- sizeof (*sctph))) {
- ip1dbg(("ip_wput_v6: SCTP hdr pullupmsg"
- " failed\n"));
- BUMP_MIB(ill->ill_ip_mib,
- ipIfStatsOutDiscards);
- freemsg(mp);
- return;
- }
- ip6h = (ip6_t *)mp->b_rptr;
- }
- sctph = (sctp_hdr_t *)(mp->b_rptr + hdr_length);
- sctph->sh_chksum = 0;
- sctph->sh_chksum = sctp_cksum(mp, hdr_length);
- }
-
- cksum_done:
- /*
- * We force the insertion of a fragment header using the
- * IPH_FRAG_HDR flag in two cases:
- * - after reception of an ICMPv6 "packet too big" message
- * with a MTU < 1280 (cf. RFC 2460 section 5)
- * - for multirouted IPv6 packets, so that the receiver can
- * discard duplicates according to their fragment identifier
- *
- * Two flags modifed from the API can modify this behavior.
- * The first is IPV6_USE_MIN_MTU. With this API the user
- * can specify how to manage PMTUD for unicast and multicast.
- *
- * IPV6_DONTFRAG disallows fragmentation.
- */
- max_frag = ire->ire_max_frag;
- switch (IP6I_USE_MIN_MTU_API(flags)) {
- case IPV6_USE_MIN_MTU_DEFAULT:
- case IPV6_USE_MIN_MTU_UNICAST:
- if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
- max_frag = IPV6_MIN_MTU;
- }
- break;
-
- case IPV6_USE_MIN_MTU_NEVER:
- max_frag = IPV6_MIN_MTU;
- break;
- }
- if (ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN > max_frag ||
- (ire->ire_frag_flag & IPH_FRAG_HDR)) {
- if (connp != NULL && (flags & IP6I_DONTFRAG)) {
- icmp_pkt2big_v6(ire->ire_stq, first_mp,
- max_frag, B_FALSE, B_TRUE, zoneid, ipst);
- return;
- }
-
- if (ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN !=
- (mp->b_cont ? msgdsize(mp) :
- mp->b_wptr - (uchar_t *)ip6h)) {
- ip0dbg(("Packet length mismatch: %d, %ld\n",
- ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN,
- msgdsize(mp)));
- freemsg(first_mp);
- return;
- }
- /* Do IPSEC processing first */
- if (mctl_present) {
- ipsec_out_process(q, first_mp, ire, ill_index);
- return;
- }
- ASSERT(mp->b_prev == NULL);
- ip2dbg(("Fragmenting Size = %d, mtu = %d\n",
- ntohs(ip6h->ip6_plen) +
- IPV6_HDR_LEN, max_frag));
- ASSERT(mp == first_mp);
- /* Initiate IPPF processing */
- if (IPP_ENABLED(IPP_LOCAL_OUT, ipst)) {
- ip_process(IPP_LOCAL_OUT, &mp, ill_index);
- if (mp == NULL) {
- return;
- }
- }
- ip_wput_frag_v6(mp, ire, reachable, connp,
- caller, max_frag);
- return;
- }
- /* Do IPSEC processing first */
- if (mctl_present) {
- int extra_len = ipsec_out_extra_length(first_mp);
-
- if (ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN + extra_len >
- max_frag && connp != NULL &&
- (flags & IP6I_DONTFRAG)) {
- /*
- * IPsec headers will push the packet over the
- * MTU limit. Issue an ICMPv6 Packet Too Big
- * message for this packet if the upper-layer
- * that issued this packet will be able to
- * react to the icmp_pkt2big_v6() that we'll
- * generate.
- */
- icmp_pkt2big_v6(ire->ire_stq, first_mp,
- max_frag, B_FALSE, B_TRUE, zoneid, ipst);
- return;
- }
- ipsec_out_process(q, first_mp, ire, ill_index);
- return;
- }
- /*
- * XXX multicast: add ip_mforward_v6() here.
- * Check conn_dontroute
- */
-#ifdef lint
- /*
- * XXX The only purpose of this statement is to avoid lint
- * errors. See the above "XXX multicast". When that gets
- * fixed, remove this whole #ifdef lint section.
- */
- ip3dbg(("multicast forward is %s.\n",
- (multicast_forward ? "TRUE" : "FALSE")));
-#endif
-
- UPDATE_OB_PKT_COUNT(ire);
- ire->ire_last_used_time = lbolt;
- ASSERT(mp == first_mp);
- ip_xmit_v6(mp, ire, reachable, connp, caller, NULL);
- } else {
- /*
- * DTrace this as ip:::send. A blocked packet will fire the
- * send probe, but not the receive probe.
- */
- DTRACE_IP7(send, mblk_t *, first_mp, conn_t *, NULL,
- void_ip_t *, ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *,
- NULL, ip6_t *, ip6h, int, 1);
- DTRACE_PROBE4(ip6__loopback__out__start,
- ill_t *, NULL, ill_t *, ill,
- ip6_t *, ip6h, mblk_t *, first_mp);
- FW_HOOKS6(ipst->ips_ip6_loopback_out_event,
- ipst->ips_ipv6firewall_loopback_out,
- NULL, ill, ip6h, first_mp, mp, 0, ipst);
- DTRACE_PROBE1(ip6__loopback__out__end, mblk_t *, first_mp);
- if (first_mp != NULL) {
- ip_wput_local_v6(RD(q), ill, ip6h, first_mp, ire, 0,
- zoneid);
- }
- }
-}
-
-/*
- * Outbound IPv6 fragmentation routine using MDT.
- */
-static void
-ip_wput_frag_mdt_v6(mblk_t *mp, ire_t *ire, size_t max_chunk,
- size_t unfragmentable_len, uint8_t nexthdr, uint_t prev_nexthdr_offset)
-{
- ip6_t *ip6h = (ip6_t *)mp->b_rptr;
- uint_t pkts, wroff, hdr_chunk_len, pbuf_idx;
- mblk_t *hdr_mp, *md_mp = NULL;
- int i1;
- multidata_t *mmd;
- unsigned char *hdr_ptr, *pld_ptr;
- ip_pdescinfo_t pdi;
- uint32_t ident;
- size_t len;
- uint16_t offset;
- queue_t *stq = ire->ire_stq;
- ill_t *ill = (ill_t *)stq->q_ptr;
- ip_stack_t *ipst = ill->ill_ipst;
-
- ASSERT(DB_TYPE(mp) == M_DATA);
- ASSERT(MBLKL(mp) > unfragmentable_len);
-
- /*
- * Move read ptr past unfragmentable portion, we don't want this part
- * of the data in our fragments.
- */
- mp->b_rptr += unfragmentable_len;
-
- /* Calculate how many packets we will send out */
- i1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgsize(mp);
- pkts = (i1 + max_chunk - 1) / max_chunk;
- ASSERT(pkts > 1);
-
- /* Allocate a message block which will hold all the IP Headers. */
- wroff = ipst->ips_ip_wroff_extra;
- hdr_chunk_len = wroff + unfragmentable_len + sizeof (ip6_frag_t);
-
- i1 = pkts * hdr_chunk_len;
- /*
- * Create the header buffer, Multidata and destination address
- * and SAP attribute that should be associated with it.
- */
- if ((hdr_mp = allocb(i1, BPRI_HI)) == NULL ||
- ((hdr_mp->b_wptr += i1),
- (mmd = mmd_alloc(hdr_mp, &md_mp, KM_NOSLEEP)) == NULL) ||
- !ip_md_addr_attr(mmd, NULL, ire->ire_nce->nce_res_mp)) {
- freemsg(mp);
- if (md_mp == NULL) {
- freemsg(hdr_mp);
- } else {
-free_mmd: IP6_STAT(ipst, ip6_frag_mdt_discarded);
- freemsg(md_mp);
- }
- IP6_STAT(ipst, ip6_frag_mdt_allocfail);
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
- return;
- }
- IP6_STAT(ipst, ip6_frag_mdt_allocd);
-
- /*
- * Add a payload buffer to the Multidata; this operation must not
- * fail, or otherwise our logic in this routine is broken. There
- * is no memory allocation done by the routine, so any returned
- * failure simply tells us that we've done something wrong.
- *
- * A failure tells us that either we're adding the same payload
- * buffer more than once, or we're trying to add more buffers than
- * allowed. None of the above cases should happen, and we panic
- * because either there's horrible heap corruption, and/or
- * programming mistake.
- */
- if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0) {
- goto pbuf_panic;
- }
-
- hdr_ptr = hdr_mp->b_rptr;
- pld_ptr = mp->b_rptr;
-
- pdi.flags = PDESC_HBUF_REF | PDESC_PBUF_REF;
-
- ident = htonl(atomic_add_32_nv(&ire->ire_ident, 1));
-
- /*
- * len is the total length of the fragmentable data in this
- * datagram. For each fragment sent, we will decrement len
- * by the amount of fragmentable data sent in that fragment
- * until len reaches zero.
- */
- len = ntohs(ip6h->ip6_plen) - (unfragmentable_len - IPV6_HDR_LEN);
-
- offset = 0;
- prev_nexthdr_offset += wroff;
-
- while (len != 0) {
- size_t mlen;
- ip6_t *fip6h;
- ip6_frag_t *fraghdr;
- int error;
-
- ASSERT((hdr_ptr + hdr_chunk_len) <= hdr_mp->b_wptr);
- mlen = MIN(len, max_chunk);
- len -= mlen;
-
- fip6h = (ip6_t *)(hdr_ptr + wroff);
- ASSERT(OK_32PTR(fip6h));
- bcopy(ip6h, fip6h, unfragmentable_len);
- hdr_ptr[prev_nexthdr_offset] = IPPROTO_FRAGMENT;
-
- fip6h->ip6_plen = htons((uint16_t)(mlen +
- unfragmentable_len - IPV6_HDR_LEN + sizeof (ip6_frag_t)));
-
- fraghdr = (ip6_frag_t *)((unsigned char *)fip6h +
- unfragmentable_len);
- fraghdr->ip6f_nxt = nexthdr;
- fraghdr->ip6f_reserved = 0;
- fraghdr->ip6f_offlg = htons(offset) |
- ((len != 0) ? IP6F_MORE_FRAG : 0);
- fraghdr->ip6f_ident = ident;
-
- /*
- * Record offset and size of header and data of the next packet
- * in the multidata message.
- */
- PDESC_HDR_ADD(&pdi, hdr_ptr, wroff,
- unfragmentable_len + sizeof (ip6_frag_t), 0);
- PDESC_PLD_INIT(&pdi);
- i1 = MIN(mp->b_wptr - pld_ptr, mlen);
- ASSERT(i1 > 0);
- PDESC_PLD_SPAN_ADD(&pdi, pbuf_idx, pld_ptr, i1);
- if (i1 == mlen) {
- pld_ptr += mlen;
- } else {
- i1 = mlen - i1;
- mp = mp->b_cont;
- ASSERT(mp != NULL);
- ASSERT(MBLKL(mp) >= i1);
- /*
- * Attach the next payload message block to the
- * multidata message.
- */
- if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0)
- goto pbuf_panic;
- PDESC_PLD_SPAN_ADD(&pdi, pbuf_idx, mp->b_rptr, i1);
- pld_ptr = mp->b_rptr + i1;
- }
-
- if ((mmd_addpdesc(mmd, (pdescinfo_t *)&pdi, &error,
- KM_NOSLEEP)) == NULL) {
- /*
- * Any failure other than ENOMEM indicates that we
- * have passed in invalid pdesc info or parameters
- * to mmd_addpdesc, which must not happen.
- *
- * EINVAL is a result of failure on boundary checks
- * against the pdesc info contents. It should not
- * happen, and we panic because either there's
- * horrible heap corruption, and/or programming
- * mistake.
- */
- if (error != ENOMEM) {
- cmn_err(CE_PANIC, "ip_wput_frag_mdt_v6: "
- "pdesc logic error detected for "
- "mmd %p pinfo %p (%d)\n",
- (void *)mmd, (void *)&pdi, error);
- /* NOTREACHED */
- }
- IP6_STAT(ipst, ip6_frag_mdt_addpdescfail);
- /* Free unattached payload message blocks as well */
- md_mp->b_cont = mp->b_cont;
- goto free_mmd;
- }
-
- /* Advance fragment offset. */
- offset += mlen;
-
- /* Advance to location for next header in the buffer. */
- hdr_ptr += hdr_chunk_len;
-
- /* Did we reach the next payload message block? */
- if (pld_ptr == mp->b_wptr && mp->b_cont != NULL) {
- mp = mp->b_cont;
- /*
- * Attach the next message block with payload
- * data to the multidata message.
- */
- if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0)
- goto pbuf_panic;
- pld_ptr = mp->b_rptr;
- }
- }
-
- ASSERT(hdr_mp->b_wptr == hdr_ptr);
- ASSERT(mp->b_wptr == pld_ptr);
-
- /* Update IP statistics */
- UPDATE_MIB(ill->ill_ip_mib, ipIfStatsOutFragCreates, pkts);
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragOKs);
- UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutTransmits, pkts);
- /*
- * The ipv6 header len is accounted for in unfragmentable_len so
- * when calculating the fragmentation overhead just add the frag
- * header len.
- */
- UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets,
- (ntohs(ip6h->ip6_plen) - (unfragmentable_len - IPV6_HDR_LEN)) +
- pkts * (unfragmentable_len + sizeof (ip6_frag_t)));
- IP6_STAT_UPDATE(ipst, ip6_frag_mdt_pkt_out, pkts);
-
- ire->ire_ob_pkt_count += pkts;
- if (ire->ire_ipif != NULL)
- atomic_add_32(&ire->ire_ipif->ipif_ob_pkt_count, pkts);
-
- ire->ire_last_used_time = lbolt;
- /* Send it down */
- putnext(stq, md_mp);
- return;
-
-pbuf_panic:
- cmn_err(CE_PANIC, "ip_wput_frag_mdt_v6: payload buffer logic "
- "error for mmd %p pbuf %p (%d)", (void *)mmd, (void *)mp,
- pbuf_idx);
- /* NOTREACHED */
-}
-
-/*
* IPv6 fragmentation. Essentially the same as IPv4 fragmentation.
* We have not optimized this in terms of number of mblks
* allocated. For instance, for each fragment sent we always allocate a
* mblk to hold the IPv6 header and fragment header.
*
- * Assumes that all the extension headers are contained in the first mblk.
- *
- * The fragment header is inserted after an hop-by-hop options header
- * and after [an optional destinations header followed by] a routing header.
- *
- * NOTE : This function does not ire_refrele the ire passed in as
- * the argument.
+ * Assumes that all the extension headers are contained in the first mblk
+ * and that the fragment header has has already been added by calling
+ * ip_fraghdr_add_v6.
*/
-void
-ip_wput_frag_v6(mblk_t *mp, ire_t *ire, uint_t reachable, conn_t *connp,
- int caller, int max_frag)
+int
+ip_fragment_v6(mblk_t *mp, nce_t *nce, iaflags_t ixaflags, uint_t pkt_len,
+ uint32_t max_frag, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid,
+ pfirepostfrag_t postfragfn, uintptr_t *ixa_cookie)
{
ip6_t *ip6h = (ip6_t *)mp->b_rptr;
ip6_t *fip6h;
@@ -11337,27 +4102,31 @@ ip_wput_frag_v6(mblk_t *mp, ire_t *ire, uint_t reachable, conn_t *connp,
mblk_t *dmp;
ip6_frag_t *fraghdr;
size_t unfragmentable_len;
- size_t len;
size_t mlen;
size_t max_chunk;
- uint32_t ident;
uint16_t off_flags;
uint16_t offset = 0;
- ill_t *ill;
+ ill_t *ill = nce->nce_ill;
uint8_t nexthdr;
- uint_t prev_nexthdr_offset;
uint8_t *ptr;
- ip_stack_t *ipst = ire->ire_ipst;
-
- ASSERT(ire->ire_type == IRE_CACHE);
- ill = (ill_t *)ire->ire_stq->q_ptr;
+ ip_stack_t *ipst = ill->ill_ipst;
+ uint_t priority = mp->b_band;
+ int error = 0;
- if (max_frag <= 0) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragReqds);
+ if (max_frag == 0) {
BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
+ ip_drop_output("FragFails: zero max_frag", mp, ill);
freemsg(mp);
- return;
+ return (EINVAL);
}
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragReqds);
+
+ /*
+ * Caller should have added fraghdr_t to pkt_len, and also
+ * updated ip6_plen.
+ */
+ ASSERT(ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN == pkt_len);
+ ASSERT(msgdsize(mp) == pkt_len);
/*
* Determine the length of the unfragmentable portion of this
@@ -11366,7 +4135,6 @@ ip_wput_frag_v6(mblk_t *mp, ire_t *ire, uint_t reachable, conn_t *connp,
* destination options header, and a potential routing header.
*/
nexthdr = ip6h->ip6_nxt;
- prev_nexthdr_offset = (uint8_t *)&ip6h->ip6_nxt - (uint8_t *)ip6h;
ptr = (uint8_t *)&ip6h[1];
if (nexthdr == IPPROTO_HOPOPTS) {
@@ -11376,8 +4144,6 @@ ip_wput_frag_v6(mblk_t *mp, ire_t *ire, uint_t reachable, conn_t *connp,
hbh_hdr = (ip6_hbh_t *)ptr;
hdr_len = 8 * (hbh_hdr->ip6h_len + 1);
nexthdr = hbh_hdr->ip6h_nxt;
- prev_nexthdr_offset = (uint8_t *)&hbh_hdr->ip6h_nxt
- - (uint8_t *)ip6h;
ptr += hdr_len;
}
if (nexthdr == IPPROTO_DSTOPTS) {
@@ -11388,8 +4154,6 @@ ip_wput_frag_v6(mblk_t *mp, ire_t *ire, uint_t reachable, conn_t *connp,
if (dest_hdr->ip6d_nxt == IPPROTO_ROUTING) {
hdr_len = 8 * (dest_hdr->ip6d_len + 1);
nexthdr = dest_hdr->ip6d_nxt;
- prev_nexthdr_offset = (uint8_t *)&dest_hdr->ip6d_nxt
- - (uint8_t *)ip6h;
ptr += hdr_len;
}
}
@@ -11399,82 +4163,73 @@ ip_wput_frag_v6(mblk_t *mp, ire_t *ire, uint_t reachable, conn_t *connp,
rthdr = (ip6_rthdr_t *)ptr;
nexthdr = rthdr->ip6r_nxt;
- prev_nexthdr_offset = (uint8_t *)&rthdr->ip6r_nxt
- - (uint8_t *)ip6h;
hdr_len = 8 * (rthdr->ip6r_len + 1);
ptr += hdr_len;
}
+ if (nexthdr != IPPROTO_FRAGMENT) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
+ ip_drop_output("FragFails: bad nexthdr", mp, ill);
+ freemsg(mp);
+ return (EINVAL);
+ }
unfragmentable_len = (uint_t)(ptr - (uint8_t *)ip6h);
+ unfragmentable_len += sizeof (ip6_frag_t);
- max_chunk = (min(max_frag, ire->ire_max_frag) - unfragmentable_len -
- sizeof (ip6_frag_t)) & ~7;
-
- /* Check if we can use MDT to send out the frags. */
- ASSERT(!IRE_IS_LOCAL(ire));
- if (ipst->ips_ip_multidata_outbound && reachable == 0 &&
- !(ire->ire_flags & RTF_MULTIRT) && ILL_MDT_CAPABLE(ill) &&
- IP_CAN_FRAG_MDT(mp, unfragmentable_len, max_chunk)) {
- ip_wput_frag_mdt_v6(mp, ire, max_chunk, unfragmentable_len,
- nexthdr, prev_nexthdr_offset);
- return;
- }
+ max_chunk = (max_frag - unfragmentable_len) & ~7;
/*
* Allocate an mblk with enough room for the link-layer
- * header, the unfragmentable part of the datagram, and the
- * fragment header. This (or a copy) will be used as the
+ * header and the unfragmentable part of the datagram, which includes
+ * the fragment header. This (or a copy) will be used as the
* first mblk for each fragment we send.
*/
- hmp = allocb_tmpl(unfragmentable_len + sizeof (ip6_frag_t) +
- ipst->ips_ip_wroff_extra, mp);
+ hmp = allocb_tmpl(unfragmentable_len + ipst->ips_ip_wroff_extra, mp);
if (hmp == NULL) {
BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
+ ip_drop_output("FragFails: no hmp", mp, ill);
freemsg(mp);
- return;
+ return (ENOBUFS);
}
hmp->b_rptr += ipst->ips_ip_wroff_extra;
- hmp->b_wptr = hmp->b_rptr + unfragmentable_len + sizeof (ip6_frag_t);
+ hmp->b_wptr = hmp->b_rptr + unfragmentable_len;
fip6h = (ip6_t *)hmp->b_rptr;
- fraghdr = (ip6_frag_t *)(hmp->b_rptr + unfragmentable_len);
-
bcopy(ip6h, fip6h, unfragmentable_len);
- hmp->b_rptr[prev_nexthdr_offset] = IPPROTO_FRAGMENT;
-
- ident = atomic_add_32_nv(&ire->ire_ident, 1);
-
- fraghdr->ip6f_nxt = nexthdr;
- fraghdr->ip6f_reserved = 0;
- fraghdr->ip6f_offlg = 0;
- fraghdr->ip6f_ident = htonl(ident);
/*
- * len is the total length of the fragmentable data in this
- * datagram. For each fragment sent, we will decrement len
+ * pkt_len is set to the total length of the fragmentable data in this
+ * datagram. For each fragment sent, we will decrement pkt_len
* by the amount of fragmentable data sent in that fragment
* until len reaches zero.
*/
- len = ntohs(ip6h->ip6_plen) - (unfragmentable_len - IPV6_HDR_LEN);
+ pkt_len -= unfragmentable_len;
/*
* Move read ptr past unfragmentable portion, we don't want this part
* of the data in our fragments.
*/
mp->b_rptr += unfragmentable_len;
+ if (mp->b_rptr == mp->b_wptr) {
+ mblk_t *mp1 = mp->b_cont;
+ freeb(mp);
+ mp = mp1;
+ }
- while (len != 0) {
- mlen = MIN(len, max_chunk);
- len -= mlen;
- if (len != 0) {
+ while (pkt_len != 0) {
+ mlen = MIN(pkt_len, max_chunk);
+ pkt_len -= mlen;
+ if (pkt_len != 0) {
/* Not last */
hmp0 = copyb(hmp);
if (hmp0 == NULL) {
- freeb(hmp);
- freemsg(mp);
BUMP_MIB(ill->ill_ip_mib,
ipIfStatsOutFragFails);
- ip1dbg(("ip_wput_frag_v6: copyb failed\n"));
- return;
+ ip_drop_output("FragFails: copyb failed",
+ mp, ill);
+ freeb(hmp);
+ freemsg(mp);
+ ip1dbg(("ip_fragment_v6: copyb failed\n"));
+ return (ENOBUFS);
}
off_flags = IP6F_MORE_FRAG;
} else {
@@ -11484,10 +4239,11 @@ ip_wput_frag_v6(mblk_t *mp, ire_t *ire, uint_t reachable, conn_t *connp,
off_flags = 0;
}
fip6h = (ip6_t *)(hmp0->b_rptr);
- fraghdr = (ip6_frag_t *)(hmp0->b_rptr + unfragmentable_len);
+ fraghdr = (ip6_frag_t *)(hmp0->b_rptr + unfragmentable_len -
+ sizeof (ip6_frag_t));
fip6h->ip6_plen = htons((uint16_t)(mlen +
- unfragmentable_len - IPV6_HDR_LEN + sizeof (ip6_frag_t)));
+ unfragmentable_len - IPV6_HDR_LEN));
/*
* Note: Optimization alert.
* In IPv6 (and IPv4) protocol header, Fragment Offset
@@ -11504,654 +4260,197 @@ ip_wput_frag_v6(mblk_t *mp, ire_t *ire, uint_t reachable, conn_t *connp,
if (!(dmp = ip_carve_mp(&mp, mlen))) {
/* mp has already been freed by ip_carve_mp() */
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
+ ip_drop_output("FragFails: could not carve mp",
+ hmp0, ill);
if (hmp != NULL)
freeb(hmp);
freeb(hmp0);
ip1dbg(("ip_carve_mp: failed\n"));
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
- return;
+ return (ENOBUFS);
}
hmp0->b_cont = dmp;
/* Get the priority marking, if any */
- hmp0->b_band = dmp->b_band;
- UPDATE_OB_PKT_COUNT(ire);
- ire->ire_last_used_time = lbolt;
- ip_xmit_v6(hmp0, ire, reachable | IP6_NO_IPPOLICY, connp,
- caller, NULL);
- reachable = 0; /* No need to redo state machine in loop */
+ hmp0->b_band = priority;
+
BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragCreates);
+
+ error = postfragfn(hmp0, nce, ixaflags,
+ mlen + unfragmentable_len, xmit_hint, szone, nolzid,
+ ixa_cookie);
+ if (error != 0 && error != EWOULDBLOCK && hmp != NULL) {
+ /* No point in sending the other fragments */
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
+ ip_drop_output("FragFails: postfragfn failed",
+ hmp, ill);
+ freeb(hmp);
+ freemsg(mp);
+ return (error);
+ }
+ /* No need to redo state machine in loop */
+ ixaflags &= ~IXAF_REACH_CONF;
+
offset += mlen;
}
BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragOKs);
+ return (error);
}
/*
- * Determine if the ill and multicast aspects of that packets
- * "matches" the conn.
+ * Add a fragment header to an IPv6 packet.
+ * Assumes that all the extension headers are contained in the first mblk.
+ *
+ * The fragment header is inserted after an hop-by-hop options header
+ * and after [an optional destinations header followed by] a routing header.
*/
-boolean_t
-conn_wantpacket_v6(conn_t *connp, ill_t *ill, ip6_t *ip6h, int fanout_flags,
- zoneid_t zoneid)
+mblk_t *
+ip_fraghdr_add_v6(mblk_t *mp, uint32_t ident, ip_xmit_attr_t *ixa)
{
- ill_t *bound_ill;
- boolean_t wantpacket;
- in6_addr_t *v6dst_ptr = &ip6h->ip6_dst;
- in6_addr_t *v6src_ptr = &ip6h->ip6_src;
+ ip6_t *ip6h = (ip6_t *)mp->b_rptr;
+ ip6_t *fip6h;
+ mblk_t *hmp;
+ ip6_frag_t *fraghdr;
+ size_t unfragmentable_len;
+ uint8_t nexthdr;
+ uint_t prev_nexthdr_offset;
+ uint8_t *ptr;
+ uint_t priority = mp->b_band;
+ ip_stack_t *ipst = ixa->ixa_ipst;
/*
- * conn_incoming_ill is set by IPV6_BOUND_IF which limits
- * unicast and multicast reception to conn_incoming_ill.
- * conn_wantpacket_v6 is called both for unicast and
- * multicast.
+ * Determine the length of the unfragmentable portion of this
+ * datagram. This consists of the IPv6 header, a potential
+ * hop-by-hop options header, a potential pre-routing-header
+ * destination options header, and a potential routing header.
*/
- bound_ill = connp->conn_incoming_ill;
- if (bound_ill != NULL) {
- if (IS_IPMP(bound_ill)) {
- if (bound_ill->ill_grp != ill->ill_grp)
- return (B_FALSE);
- } else {
- if (bound_ill != ill)
- return (B_FALSE);
- }
- }
+ nexthdr = ip6h->ip6_nxt;
+ prev_nexthdr_offset = (uint8_t *)&ip6h->ip6_nxt - (uint8_t *)ip6h;
+ ptr = (uint8_t *)&ip6h[1];
- if (connp->conn_multi_router)
- return (B_TRUE);
+ if (nexthdr == IPPROTO_HOPOPTS) {
+ ip6_hbh_t *hbh_hdr;
+ uint_t hdr_len;
- if (!IN6_IS_ADDR_MULTICAST(v6dst_ptr) &&
- !IN6_IS_ADDR_V4MAPPED_CLASSD(v6dst_ptr)) {
- /*
- * Unicast case: we match the conn only if it's in the specified
- * zone.
- */
- return (IPCL_ZONE_MATCH(connp, zoneid));
+ hbh_hdr = (ip6_hbh_t *)ptr;
+ hdr_len = 8 * (hbh_hdr->ip6h_len + 1);
+ nexthdr = hbh_hdr->ip6h_nxt;
+ prev_nexthdr_offset = (uint8_t *)&hbh_hdr->ip6h_nxt
+ - (uint8_t *)ip6h;
+ ptr += hdr_len;
}
+ if (nexthdr == IPPROTO_DSTOPTS) {
+ ip6_dest_t *dest_hdr;
+ uint_t hdr_len;
- if ((fanout_flags & IP_FF_NO_MCAST_LOOP) &&
- (connp->conn_zoneid == zoneid || zoneid == ALL_ZONES)) {
- /*
- * Loopback case: the sending endpoint has IP_MULTICAST_LOOP
- * disabled, therefore we don't dispatch the multicast packet to
- * the sending zone.
- */
- return (B_FALSE);
+ dest_hdr = (ip6_dest_t *)ptr;
+ if (dest_hdr->ip6d_nxt == IPPROTO_ROUTING) {
+ hdr_len = 8 * (dest_hdr->ip6d_len + 1);
+ nexthdr = dest_hdr->ip6d_nxt;
+ prev_nexthdr_offset = (uint8_t *)&dest_hdr->ip6d_nxt
+ - (uint8_t *)ip6h;
+ ptr += hdr_len;
+ }
}
+ if (nexthdr == IPPROTO_ROUTING) {
+ ip6_rthdr_t *rthdr;
+ uint_t hdr_len;
- if (IS_LOOPBACK(ill) && connp->conn_zoneid != zoneid &&
- zoneid != ALL_ZONES) {
- /*
- * Multicast packet on the loopback interface: we only match
- * conns who joined the group in the specified zone.
- */
- return (B_FALSE);
+ rthdr = (ip6_rthdr_t *)ptr;
+ nexthdr = rthdr->ip6r_nxt;
+ prev_nexthdr_offset = (uint8_t *)&rthdr->ip6r_nxt
+ - (uint8_t *)ip6h;
+ hdr_len = 8 * (rthdr->ip6r_len + 1);
+ ptr += hdr_len;
}
+ unfragmentable_len = (uint_t)(ptr - (uint8_t *)ip6h);
- mutex_enter(&connp->conn_lock);
- wantpacket =
- ilg_lookup_ill_withsrc_v6(connp, v6dst_ptr, v6src_ptr, ill) != NULL;
- mutex_exit(&connp->conn_lock);
-
- return (wantpacket);
-}
-
-
-/*
- * Transmit a packet and update any NUD state based on the flags
- * XXX need to "recover" any ip6i_t when doing putq!
- *
- * NOTE : This function does not ire_refrele the ire passed in as the
- * argument.
- */
-void
-ip_xmit_v6(mblk_t *mp, ire_t *ire, uint_t flags, conn_t *connp,
- int caller, ipsec_out_t *io)
-{
- mblk_t *mp1;
- nce_t *nce = ire->ire_nce;
- ill_t *ill;
- ill_t *out_ill;
- uint64_t delta;
- ip6_t *ip6h;
- queue_t *stq = ire->ire_stq;
- ire_t *ire1 = NULL;
- ire_t *save_ire = ire;
- boolean_t multirt_send = B_FALSE;
- mblk_t *next_mp = NULL;
- ip_stack_t *ipst = ire->ire_ipst;
- boolean_t fp_prepend = B_FALSE;
- uint32_t hlen;
+ /*
+ * Allocate an mblk with enough room for the link-layer
+ * header, the unfragmentable part of the datagram, and the
+ * fragment header.
+ */
+ hmp = allocb_tmpl(unfragmentable_len + sizeof (ip6_frag_t) +
+ ipst->ips_ip_wroff_extra, mp);
+ if (hmp == NULL) {
+ ill_t *ill = ixa->ixa_nce->nce_ill;
- ip6h = (ip6_t *)mp->b_rptr;
- ASSERT(!IN6_IS_ADDR_V4MAPPED(&ire->ire_addr_v6));
- ASSERT(ire->ire_ipversion == IPV6_VERSION);
- ASSERT(nce != NULL);
- ASSERT(mp->b_datap->db_type == M_DATA);
- ASSERT(stq != NULL);
-
- ill = ire_to_ill(ire);
- if (!ill) {
- ip0dbg(("ip_xmit_v6: ire_to_ill failed\n"));
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
+ ip_drop_output("ipIfStatsOutDiscards: allocb failure", mp, ill);
freemsg(mp);
- return;
+ return (NULL);
}
+ hmp->b_rptr += ipst->ips_ip_wroff_extra;
+ hmp->b_wptr = hmp->b_rptr + unfragmentable_len + sizeof (ip6_frag_t);
- /* Flow-control check has been done in ip_wput_ire_v6 */
- if (IP_FLOW_CONTROLLED_ULP(ip6h->ip6_nxt) || caller == IP_WPUT ||
- caller == IP_WSRV || canput(stq->q_next)) {
- uint32_t ill_index;
-
- /*
- * In most cases, the emission loop below is entered only
- * once. Only in the case where the ire holds the
- * RTF_MULTIRT flag, do we loop to process all RTF_MULTIRT
- * flagged ires in the bucket, and send the packet
- * through all crossed RTF_MULTIRT routes.
- */
- if (ire->ire_flags & RTF_MULTIRT) {
- /*
- * Multirouting case. The bucket where ire is stored
- * probably holds other RTF_MULTIRT flagged ires
- * to the destination. In this call to ip_xmit_v6,
- * we attempt to send the packet through all
- * those ires. Thus, we first ensure that ire is the
- * first RTF_MULTIRT ire in the bucket,
- * before walking the ire list.
- */
- ire_t *first_ire;
- irb_t *irb = ire->ire_bucket;
- ASSERT(irb != NULL);
- multirt_send = B_TRUE;
-
- /* Make sure we do not omit any multiroute ire. */
- IRB_REFHOLD(irb);
- for (first_ire = irb->irb_ire;
- first_ire != NULL;
- first_ire = first_ire->ire_next) {
- if ((first_ire->ire_flags & RTF_MULTIRT) &&
- (IN6_ARE_ADDR_EQUAL(&first_ire->ire_addr_v6,
- &ire->ire_addr_v6)) &&
- !(first_ire->ire_marks &
- (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN)))
- break;
- }
-
- if ((first_ire != NULL) && (first_ire != ire)) {
- IRE_REFHOLD(first_ire);
- /* ire will be released by the caller */
- ire = first_ire;
- nce = ire->ire_nce;
- stq = ire->ire_stq;
- ill = ire_to_ill(ire);
- }
- IRB_REFRELE(irb);
- } else if (connp != NULL && IPCL_IS_TCP(connp) &&
- connp->conn_mdt_ok && !connp->conn_tcp->tcp_mdt &&
- ILL_MDT_USABLE(ill)) {
- /*
- * This tcp connection was marked as MDT-capable, but
- * it has been turned off due changes in the interface.
- * Now that the interface support is back, turn it on
- * by notifying tcp. We don't directly modify tcp_mdt,
- * since we leave all the details to the tcp code that
- * knows better.
- */
- mblk_t *mdimp = ip_mdinfo_alloc(ill->ill_mdt_capab);
-
- if (mdimp == NULL) {
- ip0dbg(("ip_xmit_v6: can't re-enable MDT for "
- "connp %p (ENOMEM)\n", (void *)connp));
- } else {
- CONN_INC_REF(connp);
- SQUEUE_ENTER_ONE(connp->conn_sqp, mdimp,
- tcp_input, connp, SQ_FILL,
- SQTAG_TCP_INPUT_MCTL);
- }
- }
-
- do {
- mblk_t *mp_ip6h;
-
- if (multirt_send) {
- irb_t *irb;
- /*
- * We are in a multiple send case, need to get
- * the next ire and make a duplicate of the
- * packet. ire1 holds here the next ire to
- * process in the bucket. If multirouting is
- * expected, any non-RTF_MULTIRT ire that has
- * the right destination address is ignored.
- */
- irb = ire->ire_bucket;
- ASSERT(irb != NULL);
-
- IRB_REFHOLD(irb);
- for (ire1 = ire->ire_next;
- ire1 != NULL;
- ire1 = ire1->ire_next) {
- if (!(ire1->ire_flags & RTF_MULTIRT))
- continue;
- if (!IN6_ARE_ADDR_EQUAL(
- &ire1->ire_addr_v6,
- &ire->ire_addr_v6))
- continue;
- if (ire1->ire_marks &
- IRE_MARK_CONDEMNED)
- continue;
-
- /* Got one */
- if (ire1 != save_ire) {
- IRE_REFHOLD(ire1);
- }
- break;
- }
- IRB_REFRELE(irb);
-
- if (ire1 != NULL) {
- next_mp = copyb(mp);
- if ((next_mp == NULL) ||
- ((mp->b_cont != NULL) &&
- ((next_mp->b_cont =
- dupmsg(mp->b_cont)) == NULL))) {
- freemsg(next_mp);
- next_mp = NULL;
- ire_refrele(ire1);
- ire1 = NULL;
- }
- }
-
- /* Last multiroute ire; don't loop anymore. */
- if (ire1 == NULL) {
- multirt_send = B_FALSE;
- }
- }
-
- ill_index =
- ((ill_t *)stq->q_ptr)->ill_phyint->phyint_ifindex;
-
- /* Initiate IPPF processing */
- if (IP6_OUT_IPP(flags, ipst)) {
- ip_process(IPP_LOCAL_OUT, &mp, ill_index);
- if (mp == NULL) {
- BUMP_MIB(ill->ill_ip_mib,
- ipIfStatsOutDiscards);
- if (next_mp != NULL)
- freemsg(next_mp);
- if (ire != save_ire) {
- ire_refrele(ire);
- }
- return;
- }
- ip6h = (ip6_t *)mp->b_rptr;
- }
- mp_ip6h = mp;
-
- /*
- * Check for fastpath, we need to hold nce_lock to
- * prevent fastpath update from chaining nce_fp_mp.
- */
-
- ASSERT(nce->nce_ipversion != IPV4_VERSION);
- mutex_enter(&nce->nce_lock);
- if ((mp1 = nce->nce_fp_mp) != NULL) {
- uchar_t *rptr;
-
- hlen = MBLKL(mp1);
- rptr = mp->b_rptr - hlen;
- /*
- * make sure there is room for the fastpath
- * datalink header
- */
- if (rptr < mp->b_datap->db_base) {
- mp1 = copyb(mp1);
- mutex_exit(&nce->nce_lock);
- if (mp1 == NULL) {
- BUMP_MIB(ill->ill_ip_mib,
- ipIfStatsOutDiscards);
- freemsg(mp);
- if (next_mp != NULL)
- freemsg(next_mp);
- if (ire != save_ire) {
- ire_refrele(ire);
- }
- return;
- }
- mp1->b_cont = mp;
-
- /* Get the priority marking, if any */
- mp1->b_band = mp->b_band;
- mp = mp1;
- } else {
- mp->b_rptr = rptr;
- /*
- * fastpath - pre-pend datalink
- * header
- */
- bcopy(mp1->b_rptr, rptr, hlen);
- mutex_exit(&nce->nce_lock);
- fp_prepend = B_TRUE;
- }
- } else {
- /*
- * Get the DL_UNITDATA_REQ.
- */
- mp1 = nce->nce_res_mp;
- if (mp1 == NULL) {
- mutex_exit(&nce->nce_lock);
- ip1dbg(("ip_xmit_v6: No resolution "
- "block ire = %p\n", (void *)ire));
- freemsg(mp);
- if (next_mp != NULL)
- freemsg(next_mp);
- if (ire != save_ire) {
- ire_refrele(ire);
- }
- return;
- }
- /*
- * Prepend the DL_UNITDATA_REQ.
- */
- mp1 = copyb(mp1);
- mutex_exit(&nce->nce_lock);
- if (mp1 == NULL) {
- BUMP_MIB(ill->ill_ip_mib,
- ipIfStatsOutDiscards);
- freemsg(mp);
- if (next_mp != NULL)
- freemsg(next_mp);
- if (ire != save_ire) {
- ire_refrele(ire);
- }
- return;
- }
- mp1->b_cont = mp;
-
- /* Get the priority marking, if any */
- mp1->b_band = mp->b_band;
- mp = mp1;
- }
-
- out_ill = (ill_t *)stq->q_ptr;
-
- DTRACE_PROBE4(ip6__physical__out__start,
- ill_t *, NULL, ill_t *, out_ill,
- ip6_t *, ip6h, mblk_t *, mp);
+ fip6h = (ip6_t *)hmp->b_rptr;
+ fraghdr = (ip6_frag_t *)(hmp->b_rptr + unfragmentable_len);
- FW_HOOKS6(ipst->ips_ip6_physical_out_event,
- ipst->ips_ipv6firewall_physical_out,
- NULL, out_ill, ip6h, mp, mp_ip6h, 0, ipst);
+ bcopy(ip6h, fip6h, unfragmentable_len);
+ fip6h->ip6_plen = htons(ntohs(fip6h->ip6_plen) + sizeof (ip6_frag_t));
+ hmp->b_rptr[prev_nexthdr_offset] = IPPROTO_FRAGMENT;
- DTRACE_PROBE1(ip6__physical__out__end, mblk_t *, mp);
+ fraghdr->ip6f_nxt = nexthdr;
+ fraghdr->ip6f_reserved = 0;
+ fraghdr->ip6f_offlg = 0;
+ fraghdr->ip6f_ident = htonl(ident);
- if (mp == NULL) {
- if (multirt_send) {
- ASSERT(ire1 != NULL);
- if (ire != save_ire) {
- ire_refrele(ire);
- }
- /*
- * Proceed with the next RTF_MULTIRT
- * ire, also set up the send-to queue
- * accordingly.
- */
- ire = ire1;
- ire1 = NULL;
- stq = ire->ire_stq;
- nce = ire->ire_nce;
- ill = ire_to_ill(ire);
- mp = next_mp;
- next_mp = NULL;
- continue;
- } else {
- ASSERT(next_mp == NULL);
- ASSERT(ire1 == NULL);
- break;
- }
- }
+ /* Get the priority marking, if any */
+ hmp->b_band = priority;
- if (ipst->ips_ip6_observe.he_interested) {
- zoneid_t szone;
+ /*
+ * Move read ptr past unfragmentable portion, we don't want this part
+ * of the data in our fragments.
+ */
+ mp->b_rptr += unfragmentable_len;
+ hmp->b_cont = mp;
+ return (hmp);
+}
- /*
- * Both of these functions expect b_rptr to
- * be where the IPv6 header starts, so advance
- * past the link layer header.
- */
- if (fp_prepend)
- mp_ip6h->b_rptr += hlen;
- szone = ip_get_zoneid_v6(&ip6h->ip6_src,
- mp_ip6h, out_ill, ipst, ALL_ZONES);
- ipobs_hook(mp_ip6h, IPOBS_HOOK_OUTBOUND, szone,
- ALL_ZONES, out_ill, ipst);
- if (fp_prepend)
- mp_ip6h->b_rptr -= hlen;
- }
+/*
+ * Determine if the ill and multicast aspects of that packets
+ * "matches" the conn.
+ */
+boolean_t
+conn_wantpacket_v6(conn_t *connp, ip_recv_attr_t *ira, ip6_t *ip6h)
+{
+ ill_t *ill = ira->ira_rill;
+ zoneid_t zoneid = ira->ira_zoneid;
+ uint_t in_ifindex;
+ in6_addr_t *v6dst_ptr = &ip6h->ip6_dst;
+ in6_addr_t *v6src_ptr = &ip6h->ip6_src;
- /*
- * Update ire and MIB counters; for save_ire, this has
- * been done by the caller.
- */
- if (ire != save_ire) {
- UPDATE_OB_PKT_COUNT(ire);
- ire->ire_last_used_time = lbolt;
+ /*
+ * conn_incoming_ifindex is set by IPV6_BOUND_IF and as link-local
+ * scopeid. This is used to limit
+ * unicast and multicast reception to conn_incoming_ifindex.
+ * conn_wantpacket_v6 is called both for unicast and
+ * multicast packets.
+ */
+ in_ifindex = connp->conn_incoming_ifindex;
- if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) {
- BUMP_MIB(ill->ill_ip_mib,
- ipIfStatsHCOutMcastPkts);
- UPDATE_MIB(ill->ill_ip_mib,
- ipIfStatsHCOutMcastOctets,
- ntohs(ip6h->ip6_plen) +
- IPV6_HDR_LEN);
- }
- }
+ /* mpathd can bind to the under IPMP interface, which we allow */
+ if (in_ifindex != 0 && in_ifindex != ill->ill_phyint->phyint_ifindex) {
+ if (!IS_UNDER_IPMP(ill))
+ return (B_FALSE);
- /*
- * Send it down. XXX Do we want to flow control AH/ESP
- * packets that carry TCP payloads? We don't flow
- * control TCP packets, but we should also not
- * flow-control TCP packets that have been protected.
- * We don't have an easy way to find out if an AH/ESP
- * packet was originally TCP or not currently.
- */
- if (io == NULL) {
- BUMP_MIB(ill->ill_ip_mib,
- ipIfStatsHCOutTransmits);
- UPDATE_MIB(ill->ill_ip_mib,
- ipIfStatsHCOutOctets,
- ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN);
- DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL,
- void_ip_t *, ip6h, __dtrace_ipsr_ill_t *,
- out_ill, ipha_t *, NULL, ip6_t *, ip6h,
- int, 0);
-
- putnext(stq, mp);
- } else {
- /*
- * Safety Pup says: make sure this is
- * going to the right interface!
- */
- if (io->ipsec_out_capab_ill_index !=
- ill_index) {
- /* IPsec kstats: bump lose counter */
- freemsg(mp1);
- } else {
- BUMP_MIB(ill->ill_ip_mib,
- ipIfStatsHCOutTransmits);
- UPDATE_MIB(ill->ill_ip_mib,
- ipIfStatsHCOutOctets,
- ntohs(ip6h->ip6_plen) +
- IPV6_HDR_LEN);
- DTRACE_IP7(send, mblk_t *, mp,
- conn_t *, NULL, void_ip_t *, ip6h,
- __dtrace_ipsr_ill_t *, out_ill,
- ipha_t *, NULL, ip6_t *, ip6h, int,
- 0);
- ipsec_hw_putnext(stq, mp);
- }
- }
+ if (in_ifindex != ipmp_ill_get_ipmp_ifindex(ill))
+ return (B_FALSE);
+ }
- if (nce->nce_flags & (NCE_F_NONUD|NCE_F_PERMANENT)) {
- if (ire != save_ire) {
- ire_refrele(ire);
- }
- if (multirt_send) {
- ASSERT(ire1 != NULL);
- /*
- * Proceed with the next RTF_MULTIRT
- * ire, also set up the send-to queue
- * accordingly.
- */
- ire = ire1;
- ire1 = NULL;
- stq = ire->ire_stq;
- nce = ire->ire_nce;
- ill = ire_to_ill(ire);
- mp = next_mp;
- next_mp = NULL;
- continue;
- }
- ASSERT(next_mp == NULL);
- ASSERT(ire1 == NULL);
- return;
- }
+ if (!IPCL_ZONE_MATCH(connp, zoneid))
+ return (B_FALSE);
- ASSERT(nce->nce_state != ND_INCOMPLETE);
+ if (!(ira->ira_flags & IRAF_MULTICAST))
+ return (B_TRUE);
- /*
- * Check for upper layer advice
- */
- if (flags & IPV6_REACHABILITY_CONFIRMATION) {
- /*
- * It should be o.k. to check the state without
- * a lock here, at most we lose an advice.
- */
- nce->nce_last = TICK_TO_MSEC(lbolt64);
- if (nce->nce_state != ND_REACHABLE) {
-
- mutex_enter(&nce->nce_lock);
- nce->nce_state = ND_REACHABLE;
- nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
- mutex_exit(&nce->nce_lock);
- (void) untimeout(nce->nce_timeout_id);
- if (ip_debug > 2) {
- /* ip1dbg */
- pr_addr_dbg("ip_xmit_v6: state"
- " for %s changed to"
- " REACHABLE\n", AF_INET6,
- &ire->ire_addr_v6);
- }
- }
- if (ire != save_ire) {
- ire_refrele(ire);
- }
- if (multirt_send) {
- ASSERT(ire1 != NULL);
- /*
- * Proceed with the next RTF_MULTIRT
- * ire, also set up the send-to queue
- * accordingly.
- */
- ire = ire1;
- ire1 = NULL;
- stq = ire->ire_stq;
- nce = ire->ire_nce;
- ill = ire_to_ill(ire);
- mp = next_mp;
- next_mp = NULL;
- continue;
- }
- ASSERT(next_mp == NULL);
- ASSERT(ire1 == NULL);
- return;
- }
+ if (connp->conn_multi_router)
+ return (B_TRUE);
- delta = TICK_TO_MSEC(lbolt64) - nce->nce_last;
- ip1dbg(("ip_xmit_v6: delta = %" PRId64
- " ill_reachable_time = %d \n", delta,
- ill->ill_reachable_time));
- if (delta > (uint64_t)ill->ill_reachable_time) {
- nce = ire->ire_nce;
- mutex_enter(&nce->nce_lock);
- switch (nce->nce_state) {
- case ND_REACHABLE:
- case ND_STALE:
- /*
- * ND_REACHABLE is identical to
- * ND_STALE in this specific case. If
- * reachable time has expired for this
- * neighbor (delta is greater than
- * reachable time), conceptually, the
- * neighbor cache is no longer in
- * REACHABLE state, but already in
- * STALE state. So the correct
- * transition here is to ND_DELAY.
- */
- nce->nce_state = ND_DELAY;
- mutex_exit(&nce->nce_lock);
- NDP_RESTART_TIMER(nce,
- ipst->ips_delay_first_probe_time);
- if (ip_debug > 3) {
- /* ip2dbg */
- pr_addr_dbg("ip_xmit_v6: state"
- " for %s changed to"
- " DELAY\n", AF_INET6,
- &ire->ire_addr_v6);
- }
- break;
- case ND_DELAY:
- case ND_PROBE:
- mutex_exit(&nce->nce_lock);
- /* Timers have already started */
- break;
- case ND_UNREACHABLE:
- /*
- * ndp timer has detected that this nce
- * is unreachable and initiated deleting
- * this nce and all its associated IREs.
- * This is a race where we found the
- * ire before it was deleted and have
- * just sent out a packet using this
- * unreachable nce.
- */
- mutex_exit(&nce->nce_lock);
- break;
- default:
- ASSERT(0);
- }
- }
+ if (ira->ira_protocol == IPPROTO_RSVP)
+ return (B_TRUE);
- if (multirt_send) {
- ASSERT(ire1 != NULL);
- /*
- * Proceed with the next RTF_MULTIRT ire,
- * Also set up the send-to queue accordingly.
- */
- if (ire != save_ire) {
- ire_refrele(ire);
- }
- ire = ire1;
- ire1 = NULL;
- stq = ire->ire_stq;
- nce = ire->ire_nce;
- ill = ire_to_ill(ire);
- mp = next_mp;
- next_mp = NULL;
- }
- } while (multirt_send);
- /*
- * In the multirouting case, release the last ire used for
- * emission. save_ire will be released by the caller.
- */
- if (ire != save_ire) {
- ire_refrele(ire);
- }
- } else {
- /*
- * Can't apply backpressure, just discard the packet.
- */
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
- freemsg(mp);
- return;
- }
+ return (conn_hasmembers_ill_withsrc_v6(connp, v6dst_ptr, v6src_ptr,
+ ira->ira_ill));
}
/*
@@ -12189,37 +4488,52 @@ pr_addr_dbg(char *fmt1, int af, const void *addr)
/*
- * Return the length in bytes of the IPv6 headers (base header, ip6i_t
- * if needed and extension headers) that will be needed based on the
- * ip6_pkt_t structure passed by the caller.
+ * Return the length in bytes of the IPv6 headers (base header
+ * extension headers) that will be needed based on the
+ * ip_pkt_t structure passed by the caller.
*
* The returned length does not include the length of the upper level
* protocol (ULP) header.
*/
int
-ip_total_hdrs_len_v6(ip6_pkt_t *ipp)
+ip_total_hdrs_len_v6(const ip_pkt_t *ipp)
{
int len;
len = IPV6_HDR_LEN;
- if (ipp->ipp_fields & IPPF_HAS_IP6I)
- len += sizeof (ip6i_t);
- if (ipp->ipp_fields & IPPF_HOPOPTS) {
+
+ /*
+ * If there's a security label here, then we ignore any hop-by-hop
+ * options the user may try to set.
+ */
+ if (ipp->ipp_fields & IPPF_LABEL_V6) {
+ uint_t hopoptslen;
+ /*
+ * Note that ipp_label_len_v6 is just the option - not
+ * the hopopts extension header. It also needs to be padded
+ * to a multiple of 8 bytes.
+ */
+ ASSERT(ipp->ipp_label_len_v6 != 0);
+ hopoptslen = ipp->ipp_label_len_v6 + sizeof (ip6_hbh_t);
+ hopoptslen = (hopoptslen + 7)/8 * 8;
+ len += hopoptslen;
+ } else if (ipp->ipp_fields & IPPF_HOPOPTS) {
ASSERT(ipp->ipp_hopoptslen != 0);
len += ipp->ipp_hopoptslen;
}
- if (ipp->ipp_fields & IPPF_RTHDR) {
- ASSERT(ipp->ipp_rthdrlen != 0);
- len += ipp->ipp_rthdrlen;
- }
+
/*
* En-route destination options
* Only do them if there's a routing header as well
*/
- if ((ipp->ipp_fields & (IPPF_RTDSTOPTS|IPPF_RTHDR)) ==
- (IPPF_RTDSTOPTS|IPPF_RTHDR)) {
- ASSERT(ipp->ipp_rtdstoptslen != 0);
- len += ipp->ipp_rtdstoptslen;
+ if ((ipp->ipp_fields & (IPPF_RTHDRDSTOPTS|IPPF_RTHDR)) ==
+ (IPPF_RTHDRDSTOPTS|IPPF_RTHDR)) {
+ ASSERT(ipp->ipp_rthdrdstoptslen != 0);
+ len += ipp->ipp_rthdrdstoptslen;
+ }
+ if (ipp->ipp_fields & IPPF_RTHDR) {
+ ASSERT(ipp->ipp_rthdrlen != 0);
+ len += ipp->ipp_rthdrlen;
}
if (ipp->ipp_fields & IPPF_DSTOPTS) {
ASSERT(ipp->ipp_dstoptslen != 0);
@@ -12230,80 +4544,40 @@ ip_total_hdrs_len_v6(ip6_pkt_t *ipp)
/*
* All-purpose routine to build a header chain of an IPv6 header
- * followed by any required extension headers and a proto header,
- * preceeded (where necessary) by an ip6i_t private header.
+ * followed by any required extension headers and a proto header.
*
- * The fields of the IPv6 header that are derived from the ip6_pkt_t
- * will be filled in appropriately.
- * Thus the caller must fill in the rest of the IPv6 header, such as
- * traffic class/flowid, source address (if not set here), hoplimit (if not
- * set here) and destination address.
+ * The caller has to set the source and destination address as well as
+ * ip6_plen. The caller has to massage any routing header and compensate
+ * for the ULP pseudo-header checksum due to the source route.
*
- * The extension headers and ip6i_t header will all be fully filled in.
+ * The extension headers will all be fully filled in.
*/
void
-ip_build_hdrs_v6(uchar_t *ext_hdrs, uint_t ext_hdrs_len,
- ip6_pkt_t *ipp, uint8_t protocol)
+ip_build_hdrs_v6(uchar_t *buf, uint_t buf_len, const ip_pkt_t *ipp,
+ uint8_t protocol, uint32_t flowinfo)
{
uint8_t *nxthdr_ptr;
uint8_t *cp;
- ip6i_t *ip6i;
- ip6_t *ip6h = (ip6_t *)ext_hdrs;
+ ip6_t *ip6h = (ip6_t *)buf;
- /*
- * If sending private ip6i_t header down (checksum info, nexthop,
- * or ifindex), adjust ip header pointer and set ip6i_t header pointer,
- * then fill it in. (The checksum info will be filled in by icmp).
- */
- if (ipp->ipp_fields & IPPF_HAS_IP6I) {
- ip6i = (ip6i_t *)ip6h;
- ip6h = (ip6_t *)&ip6i[1];
-
- ip6i->ip6i_flags = 0;
- ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
- if (ipp->ipp_fields & IPPF_IFINDEX ||
- ipp->ipp_fields & IPPF_SCOPE_ID) {
- ASSERT(ipp->ipp_ifindex != 0);
- ip6i->ip6i_flags |= IP6I_IFINDEX;
- ip6i->ip6i_ifindex = ipp->ipp_ifindex;
- }
- if (ipp->ipp_fields & IPPF_ADDR) {
- /*
- * Enable per-packet source address verification if
- * IPV6_PKTINFO specified the source address.
- * ip6_src is set in the transport's _wput function.
- */
- ASSERT(!IN6_IS_ADDR_UNSPECIFIED(
- &ipp->ipp_addr));
- ip6i->ip6i_flags |= IP6I_VERIFY_SRC;
- }
- if (ipp->ipp_fields & IPPF_UNICAST_HOPS) {
- ip6h->ip6_hops = ipp->ipp_unicast_hops;
- /*
- * We need to set this flag so that IP doesn't
- * rewrite the IPv6 header's hoplimit with the
- * current default value.
- */
- ip6i->ip6i_flags |= IP6I_HOPLIMIT;
- }
- if (ipp->ipp_fields & IPPF_NEXTHOP) {
- ASSERT(!IN6_IS_ADDR_UNSPECIFIED(
- &ipp->ipp_nexthop));
- ip6i->ip6i_flags |= IP6I_NEXTHOP;
- ip6i->ip6i_nexthop = ipp->ipp_nexthop;
- }
- /*
- * tell IP this is an ip6i_t private header
- */
- ip6i->ip6i_nxt = IPPROTO_RAW;
- }
/* Initialize IPv6 header */
- ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
+ ip6h->ip6_vcf =
+ (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) |
+ (flowinfo & ~IPV6_VERS_AND_FLOW_MASK);
+
if (ipp->ipp_fields & IPPF_TCLASS) {
- ip6h->ip6_vcf = (ip6h->ip6_vcf & ~IPV6_FLOWINFO_TCLASS) |
- (ipp->ipp_tclass << 20);
+ /* Overrides the class part of flowinfo */
+ ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf,
+ ipp->ipp_tclass);
}
- if (ipp->ipp_fields & IPPF_ADDR)
+
+ if (ipp->ipp_fields & IPPF_HOPLIMIT)
+ ip6h->ip6_hops = ipp->ipp_hoplimit;
+ else
+ ip6h->ip6_hops = ipp->ipp_unicast_hops;
+
+ if ((ipp->ipp_fields & IPPF_ADDR) &&
+ !IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
ip6h->ip6_src = ipp->ipp_addr;
nxthdr_ptr = (uint8_t *)&ip6h->ip6_nxt;
@@ -12313,7 +4587,47 @@ ip_build_hdrs_v6(uchar_t *ext_hdrs, uint_t ext_hdrs_len,
* any extension headers in the right order:
* Hop-by-hop, destination, routing, and final destination opts.
*/
- if (ipp->ipp_fields & IPPF_HOPOPTS) {
+ /*
+ * If there's a security label here, then we ignore any hop-by-hop
+ * options the user may try to set.
+ */
+ if (ipp->ipp_fields & IPPF_LABEL_V6) {
+ /*
+ * Hop-by-hop options with the label.
+ * Note that ipp_label_v6 is just the option - not
+ * the hopopts extension header. It also needs to be padded
+ * to a multiple of 8 bytes.
+ */
+ ip6_hbh_t *hbh = (ip6_hbh_t *)cp;
+ uint_t hopoptslen;
+ uint_t padlen;
+
+ padlen = ipp->ipp_label_len_v6 + sizeof (ip6_hbh_t);
+ hopoptslen = (padlen + 7)/8 * 8;
+ padlen = hopoptslen - padlen;
+
+ *nxthdr_ptr = IPPROTO_HOPOPTS;
+ nxthdr_ptr = &hbh->ip6h_nxt;
+ hbh->ip6h_len = hopoptslen/8 - 1;
+ cp += sizeof (ip6_hbh_t);
+ bcopy(ipp->ipp_label_v6, cp, ipp->ipp_label_len_v6);
+ cp += ipp->ipp_label_len_v6;
+
+ ASSERT(padlen <= 7);
+ switch (padlen) {
+ case 0:
+ break;
+ case 1:
+ cp[0] = IP6OPT_PAD1;
+ break;
+ default:
+ cp[0] = IP6OPT_PADN;
+ cp[1] = padlen - 2;
+ bzero(&cp[2], padlen - 2);
+ break;
+ }
+ cp += padlen;
+ } else if (ipp->ipp_fields & IPPF_HOPOPTS) {
/* Hop-by-hop options */
ip6_hbh_t *hbh = (ip6_hbh_t *)cp;
@@ -12327,15 +4641,15 @@ ip_build_hdrs_v6(uchar_t *ext_hdrs, uint_t ext_hdrs_len,
* En-route destination options
* Only do them if there's a routing header as well
*/
- if ((ipp->ipp_fields & (IPPF_RTDSTOPTS|IPPF_RTHDR)) ==
- (IPPF_RTDSTOPTS|IPPF_RTHDR)) {
+ if ((ipp->ipp_fields & (IPPF_RTHDRDSTOPTS|IPPF_RTHDR)) ==
+ (IPPF_RTHDRDSTOPTS|IPPF_RTHDR)) {
ip6_dest_t *dst = (ip6_dest_t *)cp;
*nxthdr_ptr = IPPROTO_DSTOPTS;
nxthdr_ptr = &dst->ip6d_nxt;
- bcopy(ipp->ipp_rtdstopts, cp, ipp->ipp_rtdstoptslen);
- cp += ipp->ipp_rtdstoptslen;
+ bcopy(ipp->ipp_rthdrdstopts, cp, ipp->ipp_rthdrdstoptslen);
+ cp += ipp->ipp_rthdrdstoptslen;
}
/*
* Routing header next
@@ -12365,7 +4679,7 @@ ip_build_hdrs_v6(uchar_t *ext_hdrs, uint_t ext_hdrs_len,
* Now set the last header pointer to the proto passed in
*/
*nxthdr_ptr = protocol;
- ASSERT((int)(cp - ext_hdrs) == ext_hdrs_len);
+ ASSERT((int)(cp - buf) == buf_len);
}
/*
@@ -12509,108 +4823,28 @@ ip_massage_options_v6(ip6_t *ip6h, ip6_rthdr_t *rth, netstack_t *ns)
return (cksm);
}
-/*
- * Propagate a multicast group membership operation (join/leave) (*fn) on
- * all interfaces crossed by the related multirt routes.
- * The call is considered successful if the operation succeeds
- * on at least one interface.
- * The function is called if the destination address in the packet to send
- * is multirouted.
- */
-int
-ip_multirt_apply_membership_v6(int (*fn)(conn_t *, boolean_t,
- const in6_addr_t *, int, mcast_record_t, const in6_addr_t *, mblk_t *),
- ire_t *ire, conn_t *connp, boolean_t checkonly, const in6_addr_t *v6grp,
- mcast_record_t fmode, const in6_addr_t *v6src, mblk_t *first_mp)
-{
- ire_t *ire_gw;
- irb_t *irb;
- int index, error = 0;
- opt_restart_t *or;
- ip_stack_t *ipst = ire->ire_ipst;
-
- irb = ire->ire_bucket;
- ASSERT(irb != NULL);
-
- ASSERT(DB_TYPE(first_mp) == M_CTL);
- or = (opt_restart_t *)first_mp->b_rptr;
-
- IRB_REFHOLD(irb);
- for (; ire != NULL; ire = ire->ire_next) {
- if ((ire->ire_flags & RTF_MULTIRT) == 0)
- continue;
- if (!IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, v6grp))
- continue;
-
- ire_gw = ire_ftable_lookup_v6(&ire->ire_gateway_addr_v6, 0, 0,
- IRE_INTERFACE, NULL, NULL, ALL_ZONES, 0, NULL,
- MATCH_IRE_RECURSIVE | MATCH_IRE_TYPE, ipst);
- /* No resolver exists for the gateway; skip this ire. */
- if (ire_gw == NULL)
- continue;
- index = ire_gw->ire_ipif->ipif_ill->ill_phyint->phyint_ifindex;
- /*
- * A resolver exists: we can get the interface on which we have
- * to apply the operation.
- */
- error = fn(connp, checkonly, v6grp, index, fmode, v6src,
- first_mp);
- if (error == 0)
- or->or_private = CGTP_MCAST_SUCCESS;
-
- if (ip_debug > 0) {
- ulong_t off;
- char *ksym;
-
- ksym = kobj_getsymname((uintptr_t)fn, &off);
- ip2dbg(("ip_multirt_apply_membership_v6: "
- "called %s, multirt group 0x%08x via itf 0x%08x, "
- "error %d [success %u]\n",
- ksym ? ksym : "?",
- ntohl(V4_PART_OF_V6((*v6grp))),
- ntohl(V4_PART_OF_V6(ire_gw->ire_src_addr_v6)),
- error, or->or_private));
- }
-
- ire_refrele(ire_gw);
- if (error == EINPROGRESS) {
- IRB_REFRELE(irb);
- return (error);
- }
- }
- IRB_REFRELE(irb);
- /*
- * Consider the call as successful if we succeeded on at least
- * one interface. Otherwise, return the last encountered error.
- */
- return (or->or_private == CGTP_MCAST_SUCCESS ? 0 : error);
-}
-
void
*ip6_kstat_init(netstackid_t stackid, ip6_stat_t *ip6_statisticsp)
{
kstat_t *ksp;
ip6_stat_t template = {
- { "ip6_udp_fast_path", KSTAT_DATA_UINT64 },
- { "ip6_udp_slow_path", KSTAT_DATA_UINT64 },
{ "ip6_udp_fannorm", KSTAT_DATA_UINT64 },
{ "ip6_udp_fanmb", KSTAT_DATA_UINT64 },
+ { "ip6_recv_pullup", KSTAT_DATA_UINT64 },
+ { "ip6_db_ref", KSTAT_DATA_UINT64 },
+ { "ip6_notaligned", KSTAT_DATA_UINT64 },
+ { "ip6_multimblk", KSTAT_DATA_UINT64 },
+ { "ipsec_proto_ahesp", KSTAT_DATA_UINT64 },
{ "ip6_out_sw_cksum", KSTAT_DATA_UINT64 },
+ { "ip6_out_sw_cksum_bytes", KSTAT_DATA_UINT64 },
{ "ip6_in_sw_cksum", KSTAT_DATA_UINT64 },
{ "ip6_tcp_in_full_hw_cksum_err", KSTAT_DATA_UINT64 },
{ "ip6_tcp_in_part_hw_cksum_err", KSTAT_DATA_UINT64 },
{ "ip6_tcp_in_sw_cksum_err", KSTAT_DATA_UINT64 },
- { "ip6_tcp_out_sw_cksum_bytes", KSTAT_DATA_UINT64 },
{ "ip6_udp_in_full_hw_cksum_err", KSTAT_DATA_UINT64 },
{ "ip6_udp_in_part_hw_cksum_err", KSTAT_DATA_UINT64 },
{ "ip6_udp_in_sw_cksum_err", KSTAT_DATA_UINT64 },
- { "ip6_udp_out_sw_cksum_bytes", KSTAT_DATA_UINT64 },
- { "ip6_frag_mdt_pkt_out", KSTAT_DATA_UINT64 },
- { "ip6_frag_mdt_discarded", KSTAT_DATA_UINT64 },
- { "ip6_frag_mdt_allocfail", KSTAT_DATA_UINT64 },
- { "ip6_frag_mdt_addpdescfail", KSTAT_DATA_UINT64 },
- { "ip6_frag_mdt_allocd", KSTAT_DATA_UINT64 },
};
ksp = kstat_create_netstack("ip", 0, "ip6stat", "net",
KSTAT_TYPE_NAMED, sizeof (template) / sizeof (kstat_named_t),
@@ -12641,7 +4875,7 @@ ip6_kstat_fini(netstackid_t stackid, kstat_t *ksp)
* IPV6_SRC_PREFERENCES socket option.
*/
int
-ip6_set_src_preferences(conn_t *connp, uint32_t prefs)
+ip6_set_src_preferences(ip_xmit_attr_t *ixa, uint32_t prefs)
{
/*
* We only support preferences that are covered by
@@ -12675,47 +4909,15 @@ ip6_set_src_preferences(conn_t *connp, uint32_t prefs)
return (EINVAL);
}
- connp->conn_src_preferences = prefs;
+ ixa->ixa_src_preferences = prefs;
return (0);
}
size_t
-ip6_get_src_preferences(conn_t *connp, uint32_t *val)
+ip6_get_src_preferences(ip_xmit_attr_t *ixa, uint32_t *val)
{
- *val = connp->conn_src_preferences;
- return (sizeof (connp->conn_src_preferences));
-}
-
-int
-ip6_set_pktinfo(cred_t *cr, conn_t *connp, struct in6_pktinfo *pkti)
-{
- ire_t *ire;
- ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
-
- /*
- * Verify the source address and ifindex. Privileged users can use
- * any source address. For ancillary data the source address is
- * checked in ip_wput_v6.
- */
- if (pkti->ipi6_ifindex != 0) {
- rw_enter(&ipst->ips_ill_g_lock, RW_READER);
- if (!phyint_exists(pkti->ipi6_ifindex, ipst)) {
- rw_exit(&ipst->ips_ill_g_lock);
- return (ENXIO);
- }
- rw_exit(&ipst->ips_ill_g_lock);
- }
- if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr) &&
- secpolicy_net_rawaccess(cr) != 0) {
- ire = ire_route_lookup_v6(&pkti->ipi6_addr, 0, 0,
- (IRE_LOCAL|IRE_LOOPBACK), NULL, NULL,
- connp->conn_zoneid, NULL, MATCH_IRE_TYPE, ipst);
- if (ire != NULL)
- ire_refrele(ire);
- else
- return (ENXIO);
- }
- return (0);
+ *val = ixa->ixa_src_preferences;
+ return (sizeof (ixa->ixa_src_preferences));
}
/*
@@ -12743,7 +4945,7 @@ ipsec_ah_get_hdr_size_v6(mblk_t *mp, boolean_t till_ah)
whereptr = (uint8_t *)&ip6h[1];
for (;;) {
/* Assume IP has already stripped it */
- ASSERT(nexthdr != IPPROTO_FRAGMENT && nexthdr != IPPROTO_RAW);
+ ASSERT(nexthdr != IPPROTO_FRAGMENT);
switch (nexthdr) {
case IPPROTO_HOPOPTS:
hbhhdr = (ip6_hbh_t *)whereptr;
@@ -12815,11 +5017,12 @@ ipsec_ah_get_hdr_size_v6(mblk_t *mp, boolean_t till_ah)
* inside the IPSQ (ill_g_lock is not held), `ill' may be removed from the
* group during or after this lookup.
*/
-static boolean_t
+boolean_t
ipif_lookup_testaddr_v6(ill_t *ill, const in6_addr_t *v6srcp, ipif_t **ipifp)
{
ipif_t *ipif;
+
ipif = ipif_lookup_addr_exact_v6(v6srcp, ill, ill->ill_ipst);
if (ipif != NULL) {
if (ipifp != NULL)
diff --git a/usr/src/uts/common/inet/ip/ip6_asp.c b/usr/src/uts/common/inet/ip/ip6_asp.c
index d54e821359..5c499e6526 100644
--- a/usr/src/uts/common/inet/ip/ip6_asp.c
+++ b/usr/src/uts/common/inet/ip/ip6_asp.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/ksynch.h>
@@ -41,6 +39,7 @@
#include <inet/ip6.h>
#include <inet/ip6_asp.h>
#include <inet/ip_ire.h>
+#include <inet/ip_if.h>
#include <inet/ipclassifier.h>
#define IN6ADDR_MASK128_INIT \
@@ -415,18 +414,13 @@ ip6_asp_replace(mblk_t *mp, ip6_asp_t *new_table, size_t new_size,
ipst->ips_ip6_asp_table = tmp_table;
ipst->ips_ip6_asp_table_count = count;
- /*
- * The user has changed the address selection policy table. IPv6
- * source address selection for existing IRE_CACHE and
- * RTF_DYNAMIC entries used the old table, so we need to
- * clear the cache.
- */
- ire_walk_v6(ire_delete_cache_v6, NULL, ALL_ZONES, ipst);
-
unlock_end:
ipst->ips_ip6_asp_uip = B_FALSE;
mutex_exit(&ipst->ips_ip6_asp_lock);
+ /* Let conn_ixa caching know that source address selection changed */
+ ip_update_source_selection(ipst);
+
replace_end:
/* Reply to the ioctl */
q = (queue_t *)mp->b_prev;
diff --git a/usr/src/uts/common/inet/ip/ip6_if.c b/usr/src/uts/common/inet/ip/ip6_if.c
index a986a755ac..364a44b9d4 100644
--- a/usr/src/uts/common/inet/ip/ip6_if.c
+++ b/usr/src/uts/common/inet/ip/ip6_if.c
@@ -76,12 +76,13 @@ static in6_addr_t ipv6_ll_template =
static ipif_t *
ipif_lookup_interface_v6(const in6_addr_t *if_addr, const in6_addr_t *dst,
- queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst);
+ ip_stack_t *ipst);
+
+static int ipif_add_ires_v6(ipif_t *, boolean_t);
/*
- * These two functions, ipif_lookup_group_v6() and ill_lookup_group_v6(),
- * are called when an application does not specify an interface to be
- * used for multicast traffic. It calls ire_lookup_multi_v6() to look
+ * This function is called when an application does not specify an interface
+ * to be used for multicast traffic. It calls ire_lookup_multi_v6() to look
* for an interface route for the specified multicast group. Doing
* this allows the administrator to add prefix routes for multicast to
* indicate which interface to be used for multicast traffic in the above
@@ -89,47 +90,21 @@ ipif_lookup_interface_v6(const in6_addr_t *if_addr, const in6_addr_t *dst,
* multicast group (a /128 route) or anything in between. If there is no
* such multicast route, we just find any multicast capable interface and
* return it.
+ *
+ * We support MULTIRT and RTF_SETSRC on the multicast routes added to the
+ * unicast table. This is used by CGTP.
*/
-ipif_t *
-ipif_lookup_group_v6(const in6_addr_t *group, zoneid_t zoneid, ip_stack_t *ipst)
-{
- ire_t *ire;
- ipif_t *ipif;
-
- ire = ire_lookup_multi_v6(group, zoneid, ipst);
- if (ire != NULL) {
- ipif = ire->ire_ipif;
- ipif_refhold(ipif);
- ire_refrele(ire);
- return (ipif);
- }
-
- return (ipif_lookup_multicast(ipst, zoneid, B_TRUE));
-}
-
ill_t *
-ill_lookup_group_v6(const in6_addr_t *group, zoneid_t zoneid, ip_stack_t *ipst)
+ill_lookup_group_v6(const in6_addr_t *group, zoneid_t zoneid, ip_stack_t *ipst,
+ boolean_t *multirtp, in6_addr_t *setsrcp)
{
- ire_t *ire;
ill_t *ill;
- ipif_t *ipif;
- ire = ire_lookup_multi_v6(group, zoneid, ipst);
- if (ire != NULL) {
- ill = ire->ire_ipif->ipif_ill;
- ill_refhold(ill);
- ire_refrele(ire);
+ ill = ire_lookup_multi_ill_v6(group, zoneid, ipst, multirtp, setsrcp);
+ if (ill != NULL)
return (ill);
- }
-
- ipif = ipif_lookup_multicast(ipst, zoneid, B_TRUE);
- if (ipif == NULL)
- return (NULL);
- ill = ipif->ipif_ill;
- ill_refhold(ill);
- ipif_refrele(ipif);
- return (ill);
+ return (ill_lookup_multicast(ipst, zoneid, B_TRUE));
}
/*
@@ -138,16 +113,12 @@ ill_lookup_group_v6(const in6_addr_t *group, zoneid_t zoneid, ip_stack_t *ipst)
*/
static ipif_t *
ipif_lookup_interface_v6(const in6_addr_t *if_addr, const in6_addr_t *dst,
- queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst)
+ ip_stack_t *ipst)
{
ipif_t *ipif;
ill_t *ill;
- ipsq_t *ipsq;
ill_walk_context_t ctx;
- if (error != NULL)
- *error = 0;
-
/*
* First match all the point-to-point interfaces
* before looking at non-point-to-point interfaces.
@@ -157,7 +128,6 @@ ipif_lookup_interface_v6(const in6_addr_t *if_addr, const in6_addr_t *dst,
rw_enter(&ipst->ips_ill_g_lock, RW_READER);
ill = ILL_START_WALK_V6(&ctx, ipst);
for (; ill != NULL; ill = ill_next(&ctx, ill)) {
- GRAB_CONN_LOCK(q);
mutex_enter(&ill->ill_lock);
for (ipif = ill->ill_ipif; ipif != NULL;
ipif = ipif->ipif_next) {
@@ -167,36 +137,19 @@ ipif_lookup_interface_v6(const in6_addr_t *if_addr, const in6_addr_t *dst,
if_addr)) &&
(IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6pp_dst_addr,
dst))) {
- if (IPIF_CAN_LOOKUP(ipif)) {
+ if (!IPIF_IS_CONDEMNED(ipif)) {
ipif_refhold_locked(ipif);
mutex_exit(&ill->ill_lock);
- RELEASE_CONN_LOCK(q);
rw_exit(&ipst->ips_ill_g_lock);
return (ipif);
- } else if (IPIF_CAN_WAIT(ipif, q)) {
- ipsq = ill->ill_phyint->phyint_ipsq;
- mutex_enter(&ipsq->ipsq_lock);
- mutex_enter(&ipsq->ipsq_xop->ipx_lock);
- mutex_exit(&ill->ill_lock);
- rw_exit(&ipst->ips_ill_g_lock);
- ipsq_enq(ipsq, q, mp, func, NEW_OP,
- ill);
- mutex_exit(&ipsq->ipsq_xop->ipx_lock);
- mutex_exit(&ipsq->ipsq_lock);
- RELEASE_CONN_LOCK(q);
- if (error != NULL)
- *error = EINPROGRESS;
- return (NULL);
}
}
}
mutex_exit(&ill->ill_lock);
- RELEASE_CONN_LOCK(q);
}
rw_exit(&ipst->ips_ill_g_lock);
/* lookup the ipif based on interface address */
- ipif = ipif_lookup_addr_v6(if_addr, NULL, ALL_ZONES, q, mp, func,
- error, ipst);
+ ipif = ipif_lookup_addr_v6(if_addr, NULL, ALL_ZONES, ipst);
ASSERT(ipif == NULL || ipif->ipif_isv6);
return (ipif);
}
@@ -206,17 +159,14 @@ ipif_lookup_interface_v6(const in6_addr_t *if_addr, const in6_addr_t *dst,
*/
static ipif_t *
ipif_lookup_addr_common_v6(const in6_addr_t *addr, ill_t *match_ill,
- boolean_t match_illgrp, zoneid_t zoneid, queue_t *q, mblk_t *mp,
- ipsq_func_t func, int *error, ip_stack_t *ipst)
+ uint32_t match_flags, zoneid_t zoneid, ip_stack_t *ipst)
{
ipif_t *ipif;
ill_t *ill;
boolean_t ptp = B_FALSE;
- ipsq_t *ipsq;
ill_walk_context_t ctx;
-
- if (error != NULL)
- *error = 0;
+ boolean_t match_illgrp = (match_flags & IPIF_MATCH_ILLGRP);
+ boolean_t no_duplicate = (match_flags & IPIF_MATCH_NONDUP);
rw_enter(&ipst->ips_ill_g_lock, RW_READER);
/*
@@ -230,7 +180,6 @@ repeat:
(!match_illgrp || !IS_IN_SAME_ILLGRP(ill, match_ill))) {
continue;
}
- GRAB_CONN_LOCK(q);
mutex_enter(&ill->ill_lock);
for (ipif = ill->ill_ipif; ipif != NULL;
ipif = ipif->ipif_next) {
@@ -238,6 +187,12 @@ repeat:
ipif->ipif_zoneid != zoneid &&
ipif->ipif_zoneid != ALL_ZONES)
continue;
+
+ if (no_duplicate &&
+ !(ipif->ipif_flags & IPIF_UP)) {
+ continue;
+ }
+
/* Allow the ipif to be down */
if ((!ptp && (IN6_ARE_ADDR_EQUAL(
&ipif->ipif_v6lcl_addr, addr) &&
@@ -245,82 +200,26 @@ repeat:
(ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) &&
IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6pp_dst_addr,
addr))) {
- if (IPIF_CAN_LOOKUP(ipif)) {
+ if (!IPIF_IS_CONDEMNED(ipif)) {
ipif_refhold_locked(ipif);
mutex_exit(&ill->ill_lock);
- RELEASE_CONN_LOCK(q);
rw_exit(&ipst->ips_ill_g_lock);
return (ipif);
- } else if (IPIF_CAN_WAIT(ipif, q)) {
- ipsq = ill->ill_phyint->phyint_ipsq;
- mutex_enter(&ipsq->ipsq_lock);
- mutex_enter(&ipsq->ipsq_xop->ipx_lock);
- mutex_exit(&ill->ill_lock);
- rw_exit(&ipst->ips_ill_g_lock);
- ipsq_enq(ipsq, q, mp, func, NEW_OP,
- ill);
- mutex_exit(&ipsq->ipsq_xop->ipx_lock);
- mutex_exit(&ipsq->ipsq_lock);
- RELEASE_CONN_LOCK(q);
- if (error != NULL)
- *error = EINPROGRESS;
- return (NULL);
}
}
}
mutex_exit(&ill->ill_lock);
- RELEASE_CONN_LOCK(q);
}
/* If we already did the ptp case, then we are done */
if (ptp) {
rw_exit(&ipst->ips_ill_g_lock);
- if (error != NULL)
- *error = ENXIO;
return (NULL);
}
ptp = B_TRUE;
goto repeat;
}
-boolean_t
-ip_addr_exists_v6(const in6_addr_t *addr, zoneid_t zoneid,
- ip_stack_t *ipst)
-{
- ipif_t *ipif;
- ill_t *ill;
- ill_walk_context_t ctx;
-
- rw_enter(&ipst->ips_ill_g_lock, RW_READER);
-
- ill = ILL_START_WALK_V6(&ctx, ipst);
- for (; ill != NULL; ill = ill_next(&ctx, ill)) {
- mutex_enter(&ill->ill_lock);
- for (ipif = ill->ill_ipif; ipif != NULL;
- ipif = ipif->ipif_next) {
- if (zoneid != ALL_ZONES &&
- ipif->ipif_zoneid != zoneid &&
- ipif->ipif_zoneid != ALL_ZONES)
- continue;
- /* Allow the ipif to be down */
- if (((IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
- addr) &&
- (ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) ||
- ((ipif->ipif_flags & IPIF_POINTOPOINT) &&
- IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6pp_dst_addr,
- addr))) {
- mutex_exit(&ill->ill_lock);
- rw_exit(&ipst->ips_ill_g_lock);
- return (B_TRUE);
- }
- }
- mutex_exit(&ill->ill_lock);
- }
-
- rw_exit(&ipst->ips_ill_g_lock);
- return (B_FALSE);
-}
-
/*
* Lookup an ipif with the specified address. For point-to-point links we
* look for matches on either the destination address or the local address,
@@ -330,10 +229,24 @@ ip_addr_exists_v6(const in6_addr_t *addr, zoneid_t zoneid,
*/
ipif_t *
ipif_lookup_addr_v6(const in6_addr_t *addr, ill_t *match_ill, zoneid_t zoneid,
- queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst)
+ ip_stack_t *ipst)
{
- return (ipif_lookup_addr_common_v6(addr, match_ill, B_TRUE, zoneid, q,
- mp, func, error, ipst));
+ return (ipif_lookup_addr_common_v6(addr, match_ill, IPIF_MATCH_ILLGRP,
+ zoneid, ipst));
+}
+
+/*
+ * Lookup an ipif with the specified address. Similar to ipif_lookup_addr,
+ * except that we will only return an address if it is not marked as
+ * IPIF_DUPLICATE
+ */
+ipif_t *
+ipif_lookup_addr_nondup_v6(const in6_addr_t *addr, ill_t *match_ill,
+ zoneid_t zoneid, ip_stack_t *ipst)
+{
+ return (ipif_lookup_addr_common_v6(addr, match_ill,
+ (IPIF_MATCH_ILLGRP | IPIF_MATCH_NONDUP), zoneid,
+ ipst));
}
/*
@@ -346,8 +259,8 @@ ipif_lookup_addr_exact_v6(const in6_addr_t *addr, ill_t *match_ill,
ip_stack_t *ipst)
{
ASSERT(match_ill != NULL);
- return (ipif_lookup_addr_common_v6(addr, match_ill, B_FALSE, ALL_ZONES,
- NULL, NULL, NULL, NULL, ipst));
+ return (ipif_lookup_addr_common_v6(addr, match_ill, 0, ALL_ZONES,
+ ipst));
}
/*
@@ -473,23 +386,22 @@ ip_remote_addr_ok_v6(const in6_addr_t *addr, const in6_addr_t *subnet_mask)
/*
* ip_rt_add_v6 is called to add an IPv6 route to the forwarding table.
- * ipif_arg is passed in to associate it with the correct interface
+ * ill is passed in to associate it with the correct interface
* (for link-local destinations and gateways).
+ * If ire_arg is set, then we return the held IRE in that location.
*/
/* ARGSUSED1 */
int
ip_rt_add_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask,
const in6_addr_t *gw_addr, const in6_addr_t *src_addr, int flags,
- ipif_t *ipif_arg, ire_t **ire_arg, queue_t *q, mblk_t *mp, ipsq_func_t func,
- struct rtsa_s *sp, ip_stack_t *ipst)
+ ill_t *ill, ire_t **ire_arg, struct rtsa_s *sp, ip_stack_t *ipst,
+ zoneid_t zoneid)
{
- ire_t *ire;
+ ire_t *ire, *nire;
ire_t *gw_ire = NULL;
ipif_t *ipif;
- boolean_t ipif_refheld = B_FALSE;
uint_t type;
int match_flags = MATCH_IRE_TYPE;
- int error;
tsol_gc_t *gc = NULL;
tsol_gcgrp_t *gcgrp = NULL;
boolean_t gcgrp_xtraref = B_FALSE;
@@ -514,14 +426,19 @@ ip_rt_add_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask,
/*
* Get the ipif, if any, corresponding to the gw_addr
+ * If -ifp was specified we restrict ourselves to the ill, otherwise
+ * we match on the gatway and destination to handle unnumbered pt-pt
+ * interfaces.
*/
- ipif = ipif_lookup_interface_v6(gw_addr, dst_addr, q, mp, func,
- &error, ipst);
- if (ipif != NULL)
- ipif_refheld = B_TRUE;
- else if (error == EINPROGRESS) {
- ip1dbg(("ip_rt_add_v6: null and EINPROGRESS"));
- return (error);
+ if (ill != NULL)
+ ipif = ipif_lookup_addr_v6(gw_addr, ill, ALL_ZONES, ipst);
+ else
+ ipif = ipif_lookup_interface_v6(gw_addr, dst_addr, ipst);
+ if (ipif != NULL) {
+ if (IS_VNI(ipif->ipif_ill)) {
+ ipif_refrele(ipif);
+ return (EINVAL);
+ }
}
/*
@@ -535,57 +452,74 @@ ip_rt_add_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask,
if (IN6_ARE_ADDR_EQUAL(gw_addr, &ipv6_loopback) &&
IN6_ARE_ADDR_EQUAL(dst_addr, &ipv6_loopback) &&
IN6_ARE_ADDR_EQUAL(mask, &ipv6_all_ones)) {
- ire = ire_ctable_lookup_v6(dst_addr, 0, IRE_LOOPBACK,
- ipif, ALL_ZONES, NULL, match_flags, ipst);
+ ire = ire_ftable_lookup_v6(dst_addr, 0, 0, IRE_LOOPBACK,
+ NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, ipst,
+ NULL);
if (ire != NULL) {
ire_refrele(ire);
- if (ipif_refheld)
- ipif_refrele(ipif);
+ ipif_refrele(ipif);
return (EEXIST);
}
- ip1dbg(("ipif_up_done: 0x%p creating IRE 0x%x"
+ ip1dbg(("ip_rt_add_v6: 0x%p creating IRE 0x%x"
"for 0x%x\n", (void *)ipif,
ipif->ipif_ire_type,
ntohl(ipif->ipif_lcl_addr)));
ire = ire_create_v6(
dst_addr,
mask,
- &ipif->ipif_v6src_addr,
- NULL,
- &ipif->ipif_mtu,
- NULL,
- NULL,
- NULL,
- ipif->ipif_net_type,
- ipif,
- NULL,
- 0,
- 0,
- flags,
- &ire_uinfo_null,
NULL,
+ ipif->ipif_ire_type, /* LOOPBACK */
+ ipif->ipif_ill,
+ zoneid,
+ (ipif->ipif_flags & IPIF_PRIVATE) ? RTF_PRIVATE : 0,
NULL,
ipst);
+
if (ire == NULL) {
- if (ipif_refheld)
- ipif_refrele(ipif);
+ ipif_refrele(ipif);
+ return (ENOMEM);
+ }
+ /* src address assigned by the caller? */
+ if ((flags & RTF_SETSRC) &&
+ !IN6_IS_ADDR_UNSPECIFIED(src_addr))
+ ire->ire_setsrc_addr_v6 = *src_addr;
+
+ nire = ire_add(ire);
+ if (nire == NULL) {
+ /*
+ * In the result of failure, ire_add() will have
+ * already deleted the ire in question, so there
+ * is no need to do that here.
+ */
+ ipif_refrele(ipif);
return (ENOMEM);
}
- error = ire_add(&ire, q, mp, func, B_FALSE);
- if (error == 0)
- goto save_ire;
/*
- * In the result of failure, ire_add() will have already
- * deleted the ire in question, so there is no need to
- * do that here.
+ * Check if it was a duplicate entry. This handles
+ * the case of two racing route adds for the same route
*/
- if (ipif_refheld)
+ if (nire != ire) {
+ ASSERT(nire->ire_identical_ref > 1);
+ ire_delete(nire);
+ ire_refrele(nire);
ipif_refrele(ipif);
- return (error);
+ return (EEXIST);
+ }
+ ire = nire;
+ goto save_ire;
}
}
/*
+ * The routes for multicast with CGTP are quite special in that
+ * the gateway is the local interface address, yet RTF_GATEWAY
+ * is set. We turn off RTF_GATEWAY to provide compatibility with
+ * this undocumented and unusual use of multicast routes.
+ */
+ if ((flags & RTF_MULTIRT) && ipif != NULL)
+ flags &= ~RTF_GATEWAY;
+
+ /*
* Traditionally, interface routes are ones where RTF_GATEWAY isn't set
* and the gateway address provided is one of the system's interface
* addresses. By using the routing socket interface and supplying an
@@ -619,8 +553,8 @@ ip_rt_add_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask,
* logical interfaces
*
* 192.0.2.32 255.255.255.224 192.0.2.33 U if0
- * 192.0.2.32 255.255.255.224 192.0.2.34 U if0:1
- * 192.0.2.32 255.255.255.224 192.0.2.35 U if0:2
+ * 192.0.2.32 255.255.255.224 192.0.2.34 U if0
+ * 192.0.2.32 255.255.255.224 192.0.2.35 U if0
*
* the ipif's corresponding to each of these interface routes can be
* uniquely identified by the "gateway" (actually interface address).
@@ -635,90 +569,68 @@ ip_rt_add_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask,
/* RTF_GATEWAY not set */
if (!(flags & RTF_GATEWAY)) {
- queue_t *stq;
-
if (sp != NULL) {
ip2dbg(("ip_rt_add_v6: gateway security attributes "
"cannot be set with interface route\n"));
- if (ipif_refheld)
+ if (ipif != NULL)
ipif_refrele(ipif);
return (EINVAL);
}
/*
- * As the interface index specified with the RTA_IFP sockaddr is
- * the same for all ipif's off of an ill, the matching logic
- * below uses MATCH_IRE_ILL if such an index was specified.
- * This means that routes sharing the same prefix when added
- * using a RTA_IFP sockaddr must have distinct interface
- * indices (namely, they must be on distinct ill's).
- *
- * On the other hand, since the gateway address will usually be
- * different for each ipif on the system, the matching logic
- * uses MATCH_IRE_IPIF in the case of a traditional interface
- * route. This means that interface routes for the same prefix
- * can be created if they belong to distinct ipif's and if a
- * RTA_IFP sockaddr is not present.
+ * Whether or not ill (RTA_IFP) is set, we require that
+ * the gateway is one of our local addresses.
*/
- if (ipif_arg != NULL) {
- if (ipif_refheld) {
- ipif_refrele(ipif);
- ipif_refheld = B_FALSE;
- }
- ipif = ipif_arg;
- match_flags |= MATCH_IRE_ILL;
- } else {
- /*
- * Check the ipif corresponding to the gw_addr
- */
- if (ipif == NULL)
- return (ENETUNREACH);
- match_flags |= MATCH_IRE_IPIF;
+ if (ipif == NULL)
+ return (ENETUNREACH);
+
+ /*
+ * We use MATCH_IRE_ILL here. If the caller specified an
+ * interface (from the RTA_IFP sockaddr) we use it, otherwise
+ * we use the ill derived from the gateway address.
+ * We can always match the gateway address since we record it
+ * in ire_gateway_addr.
+ * We don't allow RTA_IFP to specify a different ill than the
+ * one matching the ipif to make sure we can delete the route.
+ */
+ match_flags |= MATCH_IRE_GW | MATCH_IRE_ILL;
+ if (ill == NULL) {
+ ill = ipif->ipif_ill;
+ } else if (ill != ipif->ipif_ill) {
+ ipif_refrele(ipif);
+ return (EINVAL);
}
- ASSERT(ipif != NULL);
/*
* We check for an existing entry at this point.
*/
match_flags |= MATCH_IRE_MASK;
- ire = ire_ftable_lookup_v6(dst_addr, mask, 0, IRE_INTERFACE,
- ipif, NULL, ALL_ZONES, 0, NULL, match_flags, ipst);
+ ire = ire_ftable_lookup_v6(dst_addr, mask, gw_addr,
+ IRE_INTERFACE, ill, ALL_ZONES, NULL, match_flags, 0, ipst,
+ NULL);
if (ire != NULL) {
ire_refrele(ire);
- if (ipif_refheld)
- ipif_refrele(ipif);
+ ipif_refrele(ipif);
return (EEXIST);
}
- stq = (ipif->ipif_net_type == IRE_IF_RESOLVER)
- ? ipif->ipif_rq : ipif->ipif_wq;
-
/*
* Create a copy of the IRE_LOOPBACK, IRE_IF_NORESOLVER or
- * IRE_IF_RESOLVER with the modified address and netmask.
+ * IRE_IF_RESOLVER with the modified address, netmask, and
+ * gateway.
*/
ire = ire_create_v6(
dst_addr,
mask,
- &ipif->ipif_v6src_addr,
- NULL,
- &ipif->ipif_mtu,
- NULL,
- NULL,
- stq,
- ipif->ipif_net_type,
- ipif,
- NULL,
- 0,
- 0,
+ gw_addr,
+ ill->ill_net_type,
+ ill,
+ zoneid,
flags,
- &ire_uinfo_null,
- NULL,
NULL,
ipst);
if (ire == NULL) {
- if (ipif_refheld)
- ipif_refrele(ipif);
+ ipif_refrele(ipif);
return (ENOMEM);
}
@@ -731,32 +643,44 @@ ip_rt_add_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask,
* RTF_BLACKHOLE flag as these interface routes, by
* definition, can only be that.
*
- * If the IRE type (as defined by ipif->ipif_net_type) is
+ * If the IRE type (as defined by ill->ill_net_type) is
* IRE_LOOPBACK, then we map the request into a
* IRE_IF_NORESOLVER.
*
* Needless to say, the real IRE_LOOPBACK is NOT created by this
* routine, but rather using ire_create_v6() directly.
*/
- if (ipif->ipif_net_type == IRE_LOOPBACK) {
+ if (ill->ill_net_type == IRE_LOOPBACK) {
ire->ire_type = IRE_IF_NORESOLVER;
ire->ire_flags |= RTF_BLACKHOLE;
}
- error = ire_add(&ire, q, mp, func, B_FALSE);
- if (error == 0)
- goto save_ire;
+ /* src address assigned by the caller? */
+ if ((flags & RTF_SETSRC) && !IN6_IS_ADDR_UNSPECIFIED(src_addr))
+ ire->ire_setsrc_addr_v6 = *src_addr;
+
+ nire = ire_add(ire);
+ if (nire == NULL) {
+ /*
+ * In the result of failure, ire_add() will have
+ * already deleted the ire in question, so there
+ * is no need to do that here.
+ */
+ ipif_refrele(ipif);
+ return (ENOMEM);
+ }
/*
- * In the result of failure, ire_add() will have already
- * deleted the ire in question, so there is no need to
- * do that here.
+ * Check if it was a duplicate entry. This handles
+ * the case of two racing route adds for the same route
*/
- if (ipif_refheld)
+ if (nire != ire) {
+ ASSERT(nire->ire_identical_ref > 1);
+ ire_delete(nire);
+ ire_refrele(nire);
ipif_refrele(ipif);
- return (error);
- }
- if (ipif_refheld) {
- ipif_refrele(ipif);
- ipif_refheld = B_FALSE;
+ return (EEXIST);
+ }
+ ire = nire;
+ goto save_ire;
}
/*
@@ -764,14 +688,23 @@ ip_rt_add_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask,
* If we don't have an IRE_IF_NORESOLVER or IRE_IF_RESOLVER for the
* gateway, it is currently unreachable and we fail the request
* accordingly.
+ * If RTA_IFP was specified we look on that particular ill.
*/
- ipif = ipif_arg;
- if (ipif_arg != NULL)
+ if (ill != NULL)
match_flags |= MATCH_IRE_ILL;
- gw_ire = ire_ftable_lookup_v6(gw_addr, 0, 0, IRE_INTERFACE, ipif_arg,
- NULL, ALL_ZONES, 0, NULL, match_flags, ipst);
- if (gw_ire == NULL)
+
+ /* Check whether the gateway is reachable. */
+ type = IRE_INTERFACE;
+ if (flags & RTF_INDIRECT)
+ type |= IRE_OFFLINK;
+
+ gw_ire = ire_ftable_lookup_v6(gw_addr, 0, 0, type, ill,
+ ALL_ZONES, NULL, match_flags, 0, ipst, NULL);
+ if (gw_ire == NULL) {
+ if (ipif != NULL)
+ ipif_refrele(ipif);
return (ENETUNREACH);
+ }
/*
* We create one of three types of IREs as a result of this request
@@ -789,10 +722,12 @@ ip_rt_add_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask,
type = IRE_PREFIX;
/* check for a duplicate entry */
- ire = ire_ftable_lookup_v6(dst_addr, mask, gw_addr, type, ipif_arg,
- NULL, ALL_ZONES, 0, NULL,
- match_flags | MATCH_IRE_MASK | MATCH_IRE_GW, ipst);
+ ire = ire_ftable_lookup_v6(dst_addr, mask, gw_addr, type, ill,
+ ALL_ZONES, NULL,
+ match_flags | MATCH_IRE_MASK | MATCH_IRE_GW, 0, ipst, NULL);
if (ire != NULL) {
+ if (ipif != NULL)
+ ipif_refrele(ipif);
ire_refrele(gw_ire);
ire_refrele(ire);
return (EEXIST);
@@ -809,6 +744,8 @@ ip_rt_add_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask,
/* we hold reference to it upon success */
gcgrp = gcgrp_lookup(&ga, B_TRUE);
if (gcgrp == NULL) {
+ if (ipif != NULL)
+ ipif_refrele(ipif);
ire_refrele(gw_ire);
return (ENOMEM);
}
@@ -824,6 +761,8 @@ ip_rt_add_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask,
if (gc == NULL) {
/* release reference held by gcgrp_lookup */
GCGRP_REFRELE(gcgrp);
+ if (ipif != NULL)
+ ipif_refrele(ipif);
ire_refrele(gw_ire);
return (ENOMEM);
}
@@ -833,23 +772,12 @@ ip_rt_add_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask,
ire = ire_create_v6(
dst_addr, /* dest address */
mask, /* mask */
- /* src address assigned by the caller? */
- (((flags & RTF_SETSRC) && !IN6_IS_ADDR_UNSPECIFIED(src_addr)) ?
- src_addr : NULL),
gw_addr, /* gateway address */
- &gw_ire->ire_max_frag,
- NULL, /* no src nce */
- NULL, /* no recv-from queue */
- NULL, /* no send-to queue */
(ushort_t)type, /* IRE type */
- ipif_arg,
- NULL,
- 0,
- 0,
+ ill,
+ zoneid,
flags,
- &gw_ire->ire_uinfo, /* Inherit ULP info from gw */
gc, /* security attribute */
- NULL,
ipst);
/*
@@ -862,26 +790,48 @@ ip_rt_add_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask,
if (ire == NULL) {
if (gc != NULL)
GC_REFRELE(gc);
+ if (ipif != NULL)
+ ipif_refrele(ipif);
ire_refrele(gw_ire);
return (ENOMEM);
}
+ /* src address assigned by the caller? */
+ if ((flags & RTF_SETSRC) && !IN6_IS_ADDR_UNSPECIFIED(src_addr))
+ ire->ire_setsrc_addr_v6 = *src_addr;
+
/*
* POLICY: should we allow an RTF_HOST with address INADDR_ANY?
* SUN/OS socket stuff does but do we really want to allow ::0 ?
*/
/* Add the new IRE. */
- error = ire_add(&ire, q, mp, func, B_FALSE);
+ nire = ire_add(ire);
+ if (nire == NULL) {
+ /*
+ * In the result of failure, ire_add() will have
+ * already deleted the ire in question, so there
+ * is no need to do that here.
+ */
+ if (ipif != NULL)
+ ipif_refrele(ipif);
+ ire_refrele(gw_ire);
+ return (ENOMEM);
+ }
/*
- * In the result of failure, ire_add() will have already
- * deleted the ire in question, so there is no need to
- * do that here.
+ * Check if it was a duplicate entry. This handles
+ * the case of two racing route adds for the same route
*/
- if (error != 0) {
+ if (nire != ire) {
+ ASSERT(nire->ire_identical_ref > 1);
+ ire_delete(nire);
+ ire_refrele(nire);
+ if (ipif != NULL)
+ ipif_refrele(ipif);
ire_refrele(gw_ire);
- return (error);
+ return (EEXIST);
}
+ ire = nire;
if (flags & RTF_MULTIRT) {
/*
@@ -896,70 +846,51 @@ ip_rt_add_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask,
if (ipst->ips_ip_cgtp_filter_ops != NULL &&
!IN6_IS_ADDR_MULTICAST(&(ire->ire_addr_v6))) {
int res;
-
- res = ipst->ips_ip_cgtp_filter_ops->cfo_add_dest_v6(
- ipst->ips_netstack->netstack_stackid,
- &ire->ire_addr_v6,
- &ire->ire_gateway_addr_v6,
- &ire->ire_src_addr_v6,
- &gw_ire->ire_src_addr_v6);
+ ipif_t *src_ipif;
+
+ /* Find the source address corresponding to gw_ire */
+ src_ipif = ipif_lookup_addr_v6(
+ &gw_ire->ire_gateway_addr_v6, NULL, zoneid, ipst);
+ if (src_ipif != NULL) {
+ res = ipst->ips_ip_cgtp_filter_ops->
+ cfo_add_dest_v6(
+ ipst->ips_netstack->netstack_stackid,
+ &ire->ire_addr_v6,
+ &ire->ire_gateway_addr_v6,
+ &ire->ire_setsrc_addr_v6,
+ &src_ipif->ipif_v6lcl_addr);
+ ipif_refrele(src_ipif);
+ } else {
+ res = EADDRNOTAVAIL;
+ }
if (res != 0) {
+ if (ipif != NULL)
+ ipif_refrele(ipif);
ire_refrele(gw_ire);
ire_delete(ire);
+ ire_refrele(ire); /* Held in ire_add */
return (res);
}
}
}
- /*
- * Now that the prefix IRE entry has been created, delete any
- * existing gateway IRE cache entries as well as any IRE caches
- * using the gateway, and force them to be created through
- * ip_newroute_v6.
- */
- if (gc != NULL) {
- ASSERT(gcgrp != NULL);
- ire_clookup_delete_cache_gw_v6(gw_addr, ALL_ZONES, ipst);
- }
-
save_ire:
if (gw_ire != NULL) {
ire_refrele(gw_ire);
+ gw_ire = NULL;
}
- if (ipif != NULL) {
- mblk_t *save_mp;
-
+ if (ire->ire_ill != NULL) {
/*
* Save enough information so that we can recreate the IRE if
- * the interface goes down and then up. The metrics associated
+ * the ILL goes down and then up. The metrics associated
* with the route will be saved as well when rts_setmetrics() is
* called after the IRE has been created. In the case where
* memory cannot be allocated, none of this information will be
* saved.
*/
- save_mp = allocb(sizeof (ifrt_t), BPRI_MED);
- if (save_mp != NULL) {
- ifrt_t *ifrt;
-
- save_mp->b_wptr += sizeof (ifrt_t);
- ifrt = (ifrt_t *)save_mp->b_rptr;
- bzero(ifrt, sizeof (ifrt_t));
- ifrt->ifrt_type = ire->ire_type;
- ifrt->ifrt_v6addr = ire->ire_addr_v6;
- mutex_enter(&ire->ire_lock);
- ifrt->ifrt_v6gateway_addr = ire->ire_gateway_addr_v6;
- ifrt->ifrt_v6src_addr = ire->ire_src_addr_v6;
- mutex_exit(&ire->ire_lock);
- ifrt->ifrt_v6mask = ire->ire_mask_v6;
- ifrt->ifrt_flags = ire->ire_flags;
- ifrt->ifrt_max_frag = ire->ire_max_frag;
- mutex_enter(&ipif->ipif_saved_ire_lock);
- save_mp->b_cont = ipif->ipif_saved_ire_mp;
- ipif->ipif_saved_ire_mp = save_mp;
- ipif->ipif_saved_ire_cnt++;
- mutex_exit(&ipif->ipif_saved_ire_lock);
- }
+ ill_save_ire(ire->ire_ill, ire);
}
+
if (ire_arg != NULL) {
/*
* Store the ire that was successfully added into where ire_arg
@@ -971,28 +902,27 @@ save_ire:
} else {
ire_refrele(ire); /* Held in ire_add */
}
- if (ipif_refheld)
+ if (ipif != NULL)
ipif_refrele(ipif);
return (0);
}
/*
* ip_rt_delete_v6 is called to delete an IPv6 route.
- * ipif_arg is passed in to associate it with the correct interface
+ * ill is passed in to associate it with the correct interface.
* (for link-local destinations and gateways).
*/
/* ARGSUSED4 */
int
ip_rt_delete_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask,
- const in6_addr_t *gw_addr, uint_t rtm_addrs, int flags, ipif_t *ipif_arg,
- queue_t *q, mblk_t *mp, ipsq_func_t func, ip_stack_t *ipst)
+ const in6_addr_t *gw_addr, uint_t rtm_addrs, int flags, ill_t *ill,
+ ip_stack_t *ipst, zoneid_t zoneid)
{
ire_t *ire = NULL;
ipif_t *ipif;
uint_t type;
uint_t match_flags = MATCH_IRE_TYPE;
int err = 0;
- boolean_t ipif_refheld = B_FALSE;
/*
* If this is the case of RTF_HOST being set, then we set the netmask
@@ -1012,49 +942,49 @@ ip_rt_delete_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask,
*
* This makes it possible to delete an original
* IRE_IF_NORESOLVER/IRE_IF_RESOLVER - consistent with SunOS 4.1.
+ * However, we have RTF_KERNEL set on the ones created by ipif_up
+ * and those can not be deleted here.
*
- * As the interface index specified with the RTA_IFP sockaddr is the
- * same for all ipif's off of an ill, the matching logic below uses
- * MATCH_IRE_ILL if such an index was specified. This means a route
- * sharing the same prefix and interface index as the the route
- * intended to be deleted might be deleted instead if a RTA_IFP sockaddr
- * is specified in the request.
- *
- * On the other hand, since the gateway address will usually be
- * different for each ipif on the system, the matching logic
- * uses MATCH_IRE_IPIF in the case of a traditional interface
- * route. This means that interface routes for the same prefix can be
- * uniquely identified if they belong to distinct ipif's and if a
- * RTA_IFP sockaddr is not present.
+ * We use MATCH_IRE_ILL if we know the interface. If the caller
+ * specified an interface (from the RTA_IFP sockaddr) we use it,
+ * otherwise we use the ill derived from the gateway address.
+ * We can always match the gateway address since we record it
+ * in ire_gateway_addr.
*
* For more detail on specifying routes by gateway address and by
* interface index, see the comments in ip_rt_add_v6().
*/
- ipif = ipif_lookup_interface_v6(gw_addr, dst_addr, q, mp, func, &err,
- ipst);
+ ipif = ipif_lookup_interface_v6(gw_addr, dst_addr, ipst);
if (ipif != NULL) {
- ipif_refheld = B_TRUE;
- if (ipif_arg != NULL) {
- ipif_refrele(ipif);
- ipif_refheld = B_FALSE;
- ipif = ipif_arg;
- match_flags |= MATCH_IRE_ILL;
- } else {
- match_flags |= MATCH_IRE_IPIF;
+ ill_t *ill_match;
+
+ if (ill != NULL)
+ ill_match = ill;
+ else
+ ill_match = ipif->ipif_ill;
+
+ match_flags |= MATCH_IRE_ILL;
+ if (ipif->ipif_ire_type == IRE_LOOPBACK) {
+ ire = ire_ftable_lookup_v6(dst_addr, 0, 0, IRE_LOOPBACK,
+ ill_match, ALL_ZONES, NULL, match_flags, 0, ipst,
+ NULL);
+ }
+ if (ire == NULL) {
+ match_flags |= MATCH_IRE_GW;
+ ire = ire_ftable_lookup_v6(dst_addr, mask, gw_addr,
+ IRE_INTERFACE, ill_match, ALL_ZONES, NULL,
+ match_flags, 0, ipst, NULL);
+ }
+ /* Avoid deleting routes created by kernel from an ipif */
+ if (ire != NULL && (ire->ire_flags & RTF_KERNEL)) {
+ ire_refrele(ire);
+ ire = NULL;
}
- if (ipif->ipif_ire_type == IRE_LOOPBACK)
- ire = ire_ctable_lookup_v6(dst_addr, 0, IRE_LOOPBACK,
- ipif, ALL_ZONES, NULL, match_flags, ipst);
- if (ire == NULL)
- ire = ire_ftable_lookup_v6(dst_addr, mask, 0,
- IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, NULL,
- match_flags, ipst);
- } else if (err == EINPROGRESS) {
- return (err);
- } else {
- err = 0;
+ /* Restore in case we didn't find a match */
+ match_flags &= ~(MATCH_IRE_GW|MATCH_IRE_ILL);
}
+
if (ire == NULL) {
/*
* At this point, the gateway address is not one of our own
@@ -1062,15 +992,11 @@ ip_rt_delete_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask,
* set the IRE type to lookup based on whether
* this is a host route, a default route or just a prefix.
*
- * If an ipif_arg was passed in, then the lookup is based on an
+ * If an ill was passed in, then the lookup is based on an
* interface index so MATCH_IRE_ILL is added to match_flags.
- * In any case, MATCH_IRE_IPIF is cleared and MATCH_IRE_GW is
- * set as the route being looked up is not a traditional
- * interface route.
*/
- match_flags &= ~MATCH_IRE_IPIF;
match_flags |= MATCH_IRE_GW;
- if (ipif_arg != NULL)
+ if (ill != NULL)
match_flags |= MATCH_IRE_ILL;
if (IN6_ARE_ADDR_EQUAL(mask, &ipv6_all_ones))
type = IRE_HOST;
@@ -1079,12 +1005,12 @@ ip_rt_delete_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask,
else
type = IRE_PREFIX;
ire = ire_ftable_lookup_v6(dst_addr, mask, gw_addr, type,
- ipif_arg, NULL, ALL_ZONES, 0, NULL, match_flags, ipst);
+ ill, ALL_ZONES, NULL, match_flags, 0, ipst, NULL);
}
- if (ipif_refheld) {
+ if (ipif != NULL) {
ipif_refrele(ipif);
- ipif_refheld = B_FALSE;
+ ipif = NULL;
}
if (ire == NULL)
return (ESRCH);
@@ -1103,42 +1029,9 @@ ip_rt_delete_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask,
}
}
- ipif = ire->ire_ipif;
- if (ipif != NULL) {
- mblk_t **mpp;
- mblk_t *mp;
- ifrt_t *ifrt;
- in6_addr_t gw_addr_v6;
-
- /* Remove from ipif_saved_ire_mp list if it is there */
- mutex_enter(&ire->ire_lock);
- gw_addr_v6 = ire->ire_gateway_addr_v6;
- mutex_exit(&ire->ire_lock);
- mutex_enter(&ipif->ipif_saved_ire_lock);
- for (mpp = &ipif->ipif_saved_ire_mp; *mpp != NULL;
- mpp = &(*mpp)->b_cont) {
- /*
- * On a given ipif, the triple of address, gateway and
- * mask is unique for each saved IRE (in the case of
- * ordinary interface routes, the gateway address is
- * all-zeroes).
- */
- mp = *mpp;
- ifrt = (ifrt_t *)mp->b_rptr;
- if (IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6addr,
- &ire->ire_addr_v6) &&
- IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6gateway_addr,
- &gw_addr_v6) &&
- IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6mask,
- &ire->ire_mask_v6)) {
- *mpp = mp->b_cont;
- ipif->ipif_saved_ire_cnt--;
- freeb(mp);
- break;
- }
- }
- mutex_exit(&ipif->ipif_saved_ire_lock);
- }
+ ill = ire->ire_ill;
+ if (ill != NULL)
+ ill_remove_saved_ire(ill, ire);
ire_delete(ire);
ire_refrele(ire);
return (err);
@@ -1197,7 +1090,6 @@ ipif_set6to4addr(ipif_t *ipif)
(void) ip_plen_to_mask_v6(16, &ipif->ipif_v6net_mask);
bcopy(ill->ill_phys_addr, &v4phys, sizeof (struct in_addr));
IN6_V4ADDR_TO_6TO4(&v4phys, &ipif->ipif_v6lcl_addr);
- ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr;
V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
ipif->ipif_v6subnet);
}
@@ -1260,11 +1152,6 @@ ipif_setlinklocal(ipif_t *ipif)
ipif->ipif_v6subnet);
}
- if (ipif->ipif_flags & IPIF_NOLOCAL) {
- ipif->ipif_v6src_addr = ipv6_all_zeros;
- } else {
- ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr;
- }
}
/*
@@ -1280,123 +1167,15 @@ ipif_setdestlinklocal(ipif_t *ipif)
ASSERT(IAM_WRITER_ILL(ill));
if (IN6_IS_ADDR_UNSPECIFIED(&ill->ill_dest_token))
return;
+ /* Skip if we've already set the pp_dst_addr */
+ if (!IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6pp_dst_addr))
+ return;
+
ipif_get_linklocal(&ipif->ipif_v6pp_dst_addr, &ill->ill_dest_token);
ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr;
}
/*
- * This function sets up the multicast mappings in NDP.
- * Unlike ARP, there are no mapping_mps here. We delete the
- * mapping nces and add a new one.
- *
- * Returns non-zero on error and 0 on success.
- */
-int
-ipif_ndp_setup_multicast(ipif_t *ipif, nce_t **ret_nce)
-{
- ill_t *ill = ipif->ipif_ill;
- in6_addr_t v6_mcast_addr = {(uint32_t)V6_MCAST, 0, 0, 0};
- in6_addr_t v6_mcast_mask = {(uint32_t)V6_MCAST, 0, 0, 0};
- in6_addr_t v6_extract_mask;
- uchar_t *phys_addr, *bphys_addr, *alloc_phys;
- nce_t *mnce = NULL;
- int err = 0;
- phyint_t *phyi = ill->ill_phyint;
- uint32_t hw_extract_start;
- dl_unitdata_req_t *dlur;
- ip_stack_t *ipst = ill->ill_ipst;
-
- if (ret_nce != NULL)
- *ret_nce = NULL;
-
- if (ipif->ipif_flags & IPIF_POINTOPOINT)
- return (0);
-
- /*
- * IPMP meta-interfaces don't have any inherent multicast mappings,
- * and instead use the ones on the underlying interfaces.
- */
- if (IS_IPMP(ill))
- return (0);
-
- /*
- * Delete the mapping nce. Normally these should not exist
- * as a previous ipif_down -> ipif_ndp_down should have deleted
- * all the nces. But they can exist if ip_rput_dlpi_writer
- * calls this when PHYI_MULTI_BCAST is set. Mappings are always
- * tied to the underlying ill, so don't match across the illgrp.
- */
- mnce = ndp_lookup_v6(ill, B_FALSE, &v6_mcast_addr, B_FALSE);
- if (mnce != NULL) {
- ndp_delete(mnce);
- NCE_REFRELE(mnce);
- mnce = NULL;
- }
-
- /*
- * Get media specific v6 mapping information. Note that
- * nd_lla_len can be 0 for tunnels.
- */
- alloc_phys = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
- if ((alloc_phys == NULL) && (ill->ill_nd_lla_len != 0))
- return (ENOMEM);
- /*
- * Determine the broadcast address.
- */
- dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr;
- if (ill->ill_sap_length < 0)
- bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset;
- else
- bphys_addr = (uchar_t *)dlur +
- dlur->dl_dest_addr_offset + ill->ill_sap_length;
-
- /*
- * Check PHYI_MULTI_BCAST and possible length of physical
- * address to determine if we use the mapping or the
- * broadcast address.
- */
- if ((phyi->phyint_flags & PHYI_MULTI_BCAST) ||
- (!MEDIA_V6MINFO(ill->ill_media, ill->ill_nd_lla_len,
- bphys_addr, alloc_phys, &hw_extract_start,
- &v6_extract_mask))) {
- if (ill->ill_phys_addr_length > IP_MAX_HW_LEN) {
- kmem_free(alloc_phys, ill->ill_nd_lla_len);
- return (E2BIG);
- }
- /* Use the link-layer broadcast address for MULTI_BCAST */
- phys_addr = bphys_addr;
- bzero(&v6_extract_mask, sizeof (v6_extract_mask));
- hw_extract_start = ill->ill_nd_lla_len;
- } else {
- phys_addr = alloc_phys;
- }
- if ((ipif->ipif_flags & IPIF_BROADCAST) ||
- (ill->ill_flags & ILLF_MULTICAST) ||
- (phyi->phyint_flags & PHYI_MULTI_BCAST)) {
- mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
- err = ndp_add_v6(ill,
- phys_addr,
- &v6_mcast_addr, /* v6 address */
- &v6_mcast_mask, /* v6 mask */
- &v6_extract_mask,
- hw_extract_start,
- NCE_F_MAPPING | NCE_F_PERMANENT | NCE_F_NONUD,
- ND_REACHABLE,
- &mnce);
- mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
- if (err == 0) {
- if (ret_nce != NULL) {
- *ret_nce = mnce;
- } else {
- NCE_REFRELE(mnce);
- }
- }
- }
- kmem_free(alloc_phys, ill->ill_nd_lla_len);
- return (err);
-}
-
-/*
* Get the resolver set up for a new ipif. (Always called as writer.)
*/
int
@@ -1405,50 +1184,28 @@ ipif_ndp_up(ipif_t *ipif, boolean_t initial)
ill_t *ill = ipif->ipif_ill;
int err = 0;
nce_t *nce = NULL;
- nce_t *mnce = NULL;
boolean_t added_ipif = B_FALSE;
- ASSERT(IAM_WRITER_ILL(ill));
+ DTRACE_PROBE3(ipif__downup, char *, "ipif_ndp_up",
+ ill_t *, ill, ipif_t *, ipif);
ip1dbg(("ipif_ndp_up(%s:%u)\n", ill->ill_name, ipif->ipif_id));
- /*
- * ND not supported on XRESOLV interfaces. If ND support (multicast)
- * added later, take out this check.
- */
- if ((ill->ill_flags & ILLF_XRESOLV) ||
- IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) ||
+ if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) ||
(!(ill->ill_net_type & IRE_INTERFACE))) {
ipif->ipif_addr_ready = 1;
return (0);
}
- /*
- * Need to setup multicast mapping only when the first
- * interface is coming UP.
- */
- if (ill->ill_ipif_up_count == 0 &&
- (ill->ill_flags & ILLF_MULTICAST)) {
- /*
- * We set the multicast before setting up the mapping for
- * local address because ipif_ndp_setup_multicast does
- * ndp_walk to delete nces which will delete the mapping
- * for local address also if we added the mapping for
- * local address first.
- */
- err = ipif_ndp_setup_multicast(ipif, &mnce);
- if (err != 0)
- return (err);
- }
-
if ((ipif->ipif_flags & (IPIF_UNNUMBERED|IPIF_NOLOCAL)) == 0) {
uint16_t flags;
uint16_t state;
- uchar_t *hw_addr = NULL;
+ uchar_t *hw_addr;
ill_t *bound_ill;
ipmp_illgrp_t *illg = ill->ill_grp;
+ uint_t hw_addr_len;
- /* Permanent entries don't need NUD */
- flags = NCE_F_PERMANENT | NCE_F_NONUD;
+ flags = NCE_F_MYADDR | NCE_F_NONUD | NCE_F_PUBLISH |
+ NCE_F_AUTHORITY;
if (ill->ill_flags & ILLF_ROUTER)
flags |= NCE_F_ISROUTER;
@@ -1483,10 +1240,16 @@ ipif_ndp_up(ipif_t *ipif, boolean_t initial)
added_ipif = B_TRUE;
}
hw_addr = bound_ill->ill_nd_lla;
+ hw_addr_len = bound_ill->ill_phys_addr_length;
} else {
bound_ill = ill;
- if (ill->ill_net_type == IRE_IF_RESOLVER)
+ if (ill->ill_net_type == IRE_IF_RESOLVER) {
hw_addr = ill->ill_nd_lla;
+ hw_addr_len = ill->ill_phys_addr_length;
+ } else {
+ hw_addr = NULL;
+ hw_addr_len = 0;
+ }
}
/*
@@ -1496,28 +1259,16 @@ ipif_ndp_up(ipif_t *ipif, boolean_t initial)
* unsolicited advertisements to inform others.
*/
if (initial || !ipif->ipif_addr_ready) {
+ /* Causes Duplicate Address Detection to run */
state = ND_PROBE;
} else {
state = ND_REACHABLE;
flags |= NCE_F_UNSOL_ADV;
}
+
retry:
- /*
- * Create an nce for the local address. We pass a match_illgrp
- * of B_TRUE because the local address must be unique across
- * the illgrp, and the existence of an nce with nce_ill set
- * to any ill in the group is indicative of a duplicate address
- */
- err = ndp_lookup_then_add_v6(bound_ill,
- B_TRUE,
- hw_addr,
- &ipif->ipif_v6lcl_addr,
- &ipv6_all_ones,
- &ipv6_all_zeros,
- 0,
- flags,
- state,
- &nce);
+ err = nce_lookup_then_add_v6(ill, hw_addr, hw_addr_len,
+ &ipif->ipif_v6lcl_addr, flags, state, &nce);
switch (err) {
case 0:
ip1dbg(("ipif_ndp_up: NCE created for %s\n",
@@ -1535,14 +1286,21 @@ retry:
case EEXIST:
ip1dbg(("ipif_ndp_up: NCE already exists for %s\n",
ill->ill_name));
- if (!(nce->nce_flags & NCE_F_PERMANENT)) {
- ndp_delete(nce);
- NCE_REFRELE(nce);
+ if (!NCE_MYADDR(nce->nce_common)) {
+ /*
+ * A leftover nce from before this address
+ * existed
+ */
+ ncec_delete(nce->nce_common);
+ nce_refrele(nce);
nce = NULL;
goto retry;
}
if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) {
- NCE_REFRELE(nce);
+ nce_refrele(nce);
+ nce = NULL;
+ ip1dbg(("ipif_ndp_up: NCE already exists "
+ "for %s\n", ill->ill_name));
goto fail;
}
/*
@@ -1557,6 +1315,7 @@ retry:
ipif->ipif_addr_ready = 1;
ipif->ipif_added_nce = 1;
nce->nce_ipif_cnt++;
+ err = 0;
break;
default:
ip1dbg(("ipif_ndp_up: NCE creation failed for %s\n",
@@ -1568,15 +1327,9 @@ retry:
ipif->ipif_addr_ready = 1;
}
if (nce != NULL)
- NCE_REFRELE(nce);
- if (mnce != NULL)
- NCE_REFRELE(mnce);
+ nce_refrele(nce);
return (0);
fail:
- if (mnce != NULL) {
- ndp_delete(mnce);
- NCE_REFRELE(mnce);
- }
if (added_ipif)
ipmp_illgrp_del_ipif(ill->ill_grp, ipif);
@@ -1587,181 +1340,7 @@ fail:
void
ipif_ndp_down(ipif_t *ipif)
{
- nce_t *nce;
- ill_t *ill = ipif->ipif_ill;
-
- ASSERT(IAM_WRITER_ILL(ill));
-
- if (ipif->ipif_isv6) {
- if (ipif->ipif_added_nce) {
- /*
- * For IPMP, `ill' can be the IPMP ill but the NCE will
- * always be tied to an underlying IP interface, so we
- * match across the illgrp. This is safe since we
- * ensure uniqueness across the group in ipif_ndp_up().
- */
- nce = ndp_lookup_v6(ill, B_TRUE, &ipif->ipif_v6lcl_addr,
- B_FALSE);
- if (nce != NULL) {
- if (--nce->nce_ipif_cnt == 0)
- ndp_delete(nce); /* last ipif for nce */
- NCE_REFRELE(nce);
- }
- ipif->ipif_added_nce = 0;
- }
-
- /*
- * Make IPMP aware of the deleted data address.
- */
- if (IS_IPMP(ill))
- ipmp_illgrp_del_ipif(ill->ill_grp, ipif);
- }
-
- /*
- * Remove mapping and all other nces dependent on this ill
- * when the last ipif is going away.
- */
- if (ill->ill_ipif_up_count == 0)
- ndp_walk(ill, (pfi_t)ndp_delete_per_ill, ill, ill->ill_ipst);
-}
-
-/*
- * Used when an interface comes up to recreate any extra routes on this
- * interface.
- */
-static ire_t **
-ipif_recover_ire_v6(ipif_t *ipif)
-{
- mblk_t *mp;
- ire_t **ipif_saved_irep;
- ire_t **irep;
- ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
-
- ip1dbg(("ipif_recover_ire_v6(%s:%u)", ipif->ipif_ill->ill_name,
- ipif->ipif_id));
-
- ASSERT(ipif->ipif_isv6);
-
- mutex_enter(&ipif->ipif_saved_ire_lock);
- ipif_saved_irep = (ire_t **)kmem_zalloc(sizeof (ire_t *) *
- ipif->ipif_saved_ire_cnt, KM_NOSLEEP);
- if (ipif_saved_irep == NULL) {
- mutex_exit(&ipif->ipif_saved_ire_lock);
- return (NULL);
- }
-
- irep = ipif_saved_irep;
-
- for (mp = ipif->ipif_saved_ire_mp; mp != NULL; mp = mp->b_cont) {
- ire_t *ire;
- queue_t *rfq;
- queue_t *stq;
- ifrt_t *ifrt;
- in6_addr_t *src_addr;
- in6_addr_t *gateway_addr;
- char buf[INET6_ADDRSTRLEN];
- ushort_t type;
-
- /*
- * When the ire was initially created and then added in
- * ip_rt_add_v6(), it was created either using
- * ipif->ipif_net_type in the case of a traditional interface
- * route, or as one of the IRE_OFFSUBNET types (with the
- * exception of IRE_HOST type redirect ire which is created by
- * icmp_redirect_v6() and which we don't need to save or
- * recover). In the case where ipif->ipif_net_type was
- * IRE_LOOPBACK, ip_rt_add_v6() will update the ire_type to
- * IRE_IF_NORESOLVER before calling ire_add_v6() to satisfy
- * software like GateD and Sun Cluster which creates routes
- * using the the loopback interface's address as a gateway.
- *
- * As ifrt->ifrt_type reflects the already updated ire_type,
- * ire_create_v6() will be called in the same way here as in
- * ip_rt_add_v6(), namely using ipif->ipif_net_type when the
- * route looks like a traditional interface route (where
- * ifrt->ifrt_type & IRE_INTERFACE is true) and otherwise
- * using the saved ifrt->ifrt_type. This means that in
- * the case where ipif->ipif_net_type is IRE_LOOPBACK,
- * the ire created by ire_create_v6() will be an IRE_LOOPBACK,
- * it will then be turned into an IRE_IF_NORESOLVER and then
- * added by ire_add_v6().
- */
- ifrt = (ifrt_t *)mp->b_rptr;
- if (ifrt->ifrt_type & IRE_INTERFACE) {
- rfq = NULL;
- stq = (ipif->ipif_net_type == IRE_IF_RESOLVER)
- ? ipif->ipif_rq : ipif->ipif_wq;
- src_addr = (ifrt->ifrt_flags & RTF_SETSRC)
- ? &ifrt->ifrt_v6src_addr
- : &ipif->ipif_v6src_addr;
- gateway_addr = NULL;
- type = ipif->ipif_net_type;
- } else {
- rfq = NULL;
- stq = NULL;
- src_addr = (ifrt->ifrt_flags & RTF_SETSRC)
- ? &ifrt->ifrt_v6src_addr : NULL;
- gateway_addr = &ifrt->ifrt_v6gateway_addr;
- type = ifrt->ifrt_type;
- }
-
- /*
- * Create a copy of the IRE with the saved address and netmask.
- */
- ip1dbg(("ipif_recover_ire_v6: creating IRE %s (%d) for %s/%d\n",
- ip_nv_lookup(ire_nv_tbl, ifrt->ifrt_type), ifrt->ifrt_type,
- inet_ntop(AF_INET6, &ifrt->ifrt_v6addr, buf, sizeof (buf)),
- ip_mask_to_plen_v6(&ifrt->ifrt_v6mask)));
- ire = ire_create_v6(
- &ifrt->ifrt_v6addr,
- &ifrt->ifrt_v6mask,
- src_addr,
- gateway_addr,
- &ifrt->ifrt_max_frag,
- NULL,
- rfq,
- stq,
- type,
- ipif,
- NULL,
- 0,
- 0,
- ifrt->ifrt_flags,
- &ifrt->ifrt_iulp_info,
- NULL,
- NULL,
- ipst);
- if (ire == NULL) {
- mutex_exit(&ipif->ipif_saved_ire_lock);
- kmem_free(ipif_saved_irep,
- ipif->ipif_saved_ire_cnt * sizeof (ire_t *));
- return (NULL);
- }
-
- /*
- * Some software (for example, GateD and Sun Cluster) attempts
- * to create (what amount to) IRE_PREFIX routes with the
- * loopback address as the gateway. This is primarily done to
- * set up prefixes with the RTF_REJECT flag set (for example,
- * when generating aggregate routes.)
- *
- * If the IRE type (as defined by ipif->ipif_net_type) is
- * IRE_LOOPBACK, then we map the request into a
- * IRE_IF_NORESOLVER.
- */
- if (ipif->ipif_net_type == IRE_LOOPBACK)
- ire->ire_type = IRE_IF_NORESOLVER;
- /*
- * ire held by ire_add, will be refreled' in ipif_up_done
- * towards the end
- */
- (void) ire_add(&ire, NULL, NULL, NULL, B_FALSE);
- *irep = ire;
- irep++;
- ip1dbg(("ipif_recover_ire_v6: added ire %p\n", (void *)ire));
- }
- mutex_exit(&ipif->ipif_saved_ire_lock);
- return (ipif_saved_irep);
+ ipif_nce_down(ipif);
}
/*
@@ -1826,8 +1405,7 @@ ip_common_prefix_v6(const in6_addr_t *a1, const in6_addr_t *a2)
#define IPIF_VALID_IPV6_SOURCE(ipif) \
(((ipif)->ipif_flags & IPIF_UP) && \
- !((ipif)->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) && \
- (ipif)->ipif_addr_ready)
+ !((ipif)->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)))
/* source address candidate */
typedef struct candidate {
@@ -2195,13 +1773,6 @@ rule_addr_type(cand_t *bc, cand_t *cc, const dstinfo_t *dstinfo,
static rule_res_t
rule_prefix(cand_t *bc, cand_t *cc, const dstinfo_t *dstinfo, ip_stack_t *ipst)
{
- /*
- * For IPMP, we always want to choose a random source address from
- * among any equally usable addresses, so always report a tie.
- */
- if (IS_IPMP(dstinfo->dst_ill))
- return (CAND_TIE);
-
if (!bc->cand_common_pref_set) {
bc->cand_common_pref = ip_common_prefix_v6(&bc->cand_srcaddr,
dstinfo->dst_addr);
@@ -2252,14 +1823,15 @@ rule_must_be_last(cand_t *bc, cand_t *cc, const dstinfo_t *dstinfo,
*
* src_prefs is the caller's set of source address preferences. If source
* address selection is being called to determine the source address of a
- * connected socket (from ip_bind_connected_v6()), then the preferences are
- * taken from conn_src_preferences. These preferences can be set on a
+ * connected socket (from ip_set_destination_v6()), then the preferences are
+ * taken from conn_ixa->ixa_src_preferences. These preferences can be set on a
* per-socket basis using the IPV6_SRC_PREFERENCES socket option. The only
* preference currently implemented is for rfc3041 temporary addresses.
*/
ipif_t *
ipif_select_source_v6(ill_t *dstill, const in6_addr_t *dst,
- boolean_t restrict_ill, uint32_t src_prefs, zoneid_t zoneid)
+ boolean_t restrict_ill, uint32_t src_prefs, zoneid_t zoneid,
+ boolean_t allow_usesrc, boolean_t *notreadyp)
{
dstinfo_t dstinfo;
char dstr[INET6_ADDRSTRLEN];
@@ -2306,10 +1878,10 @@ ipif_select_source_v6(ill_t *dstill, const in6_addr_t *dst,
* usesrc ifindex. This has higher precedence since it is
* finer grained (i.e per interface) v/s being system wide.
*/
- if (dstill->ill_usesrc_ifindex != 0) {
+ if (dstill->ill_usesrc_ifindex != 0 && allow_usesrc) {
if ((usesrc_ill =
ill_lookup_on_ifindex(dstill->ill_usesrc_ifindex, B_TRUE,
- NULL, NULL, NULL, NULL, ipst)) != NULL) {
+ ipst)) != NULL) {
dstinfo.dst_ill = usesrc_ill;
} else {
return (NULL);
@@ -2412,6 +1984,12 @@ ipif_select_source_v6(ill_t *dstill, const in6_addr_t *dst,
if (!IPIF_VALID_IPV6_SOURCE(ipif))
continue;
+ if (!ipif->ipif_addr_ready) {
+ if (notreadyp != NULL)
+ *notreadyp = B_TRUE;
+ continue;
+ }
+
if (zoneid != ALL_ZONES &&
ipif->ipif_zoneid != zoneid &&
ipif->ipif_zoneid != ALL_ZONES)
@@ -2505,7 +2083,7 @@ ipif_select_source_v6(ill_t *dstill, const in6_addr_t *dst,
if (IS_IPMP(ill) && ipif != NULL) {
mutex_enter(&ipif->ipif_ill->ill_lock);
next_ipif = ipif->ipif_next;
- if (next_ipif != NULL && IPIF_CAN_LOOKUP(next_ipif))
+ if (next_ipif != NULL && !IPIF_IS_CONDEMNED(next_ipif))
ill->ill_src_ipif = next_ipif;
else
ill->ill_src_ipif = NULL;
@@ -2541,7 +2119,7 @@ ipif_select_source_v6(ill_t *dstill, const in6_addr_t *dst,
}
mutex_enter(&ipif->ipif_ill->ill_lock);
- if (IPIF_CAN_LOOKUP(ipif)) {
+ if (!IPIF_IS_CONDEMNED(ipif)) {
ipif_refhold_locked(ipif);
mutex_exit(&ipif->ipif_ill->ill_lock);
rw_exit(&ipst->ips_ill_g_lock);
@@ -2556,187 +2134,72 @@ ipif_select_source_v6(ill_t *dstill, const in6_addr_t *dst,
}
/*
- * If old_ipif is not NULL, see if ipif was derived from old
- * ipif and if so, recreate the interface route by re-doing
- * source address selection. This happens when ipif_down ->
- * ipif_update_other_ipifs calls us.
+ * Pick a source address based on the destination ill and an optional setsrc
+ * address.
+ * The result is stored in srcp. If generation is set, then put the source
+ * generation number there before we look for the source address (to avoid
+ * missing changes in the set of source addresses.
+ * If flagsp is set, then us it to pass back ipif_flags.
+ *
+ * If the caller wants to cache the returned source address and detect when
+ * that might be stale, the caller should pass in a generation argument,
+ * which the caller can later compare against ips_src_generation
+ *
+ * The precedence order for selecting an IPv6 source address is:
+ * - RTF_SETSRC on the first ire in the recursive lookup always wins.
+ * - If usrsrc is set, swap the ill to be the usesrc one.
+ * - If IPMP is used on the ill, select a random address from the most
+ * preferred ones below:
+ * That is followed by the long list of IPv6 source address selection rules
+ * starting with rule_isdst(), rule_scope(), etc.
*
- * If old_ipif is NULL, just redo the source address selection
- * if needed. This happens when ipif_up_done_v6 calls us.
+ * We have lower preference for ALL_ZONES IP addresses,
+ * as they pose problems with unlabeled destinations.
+ *
+ * Note that when multiple IP addresses match e.g., with rule_scope() we pick
+ * the first one if IPMP is not in use. With IPMP we randomize.
*/
-void
-ipif_recreate_interface_routes_v6(ipif_t *old_ipif, ipif_t *ipif)
+int
+ip_select_source_v6(ill_t *ill, const in6_addr_t *setsrc, const in6_addr_t *dst,
+ zoneid_t zoneid, ip_stack_t *ipst, uint_t restrict_ill, uint32_t src_prefs,
+ in6_addr_t *srcp, uint32_t *generation, uint64_t *flagsp)
{
- ire_t *ire;
- ire_t *ipif_ire;
- queue_t *stq;
- ill_t *ill;
- ipif_t *nipif = NULL;
- boolean_t nipif_refheld = B_FALSE;
- boolean_t ip6_asp_table_held = B_FALSE;
- ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
-
- ill = ipif->ipif_ill;
-
- if (!(ipif->ipif_flags &
- (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED))) {
- /*
- * Can't possibly have borrowed the source
- * from old_ipif.
- */
- return;
- }
+ ipif_t *ipif;
+ boolean_t notready = B_FALSE; /* Set if !ipif_addr_ready found */
- /*
- * Is there any work to be done? No work if the address
- * is INADDR_ANY, loopback or NOLOCAL or ANYCAST (
- * ipif_select_source_v6() does not borrow addresses from
- * NOLOCAL and ANYCAST interfaces).
- */
- if ((old_ipif != NULL) &&
- ((IN6_IS_ADDR_UNSPECIFIED(&old_ipif->ipif_v6lcl_addr)) ||
- (old_ipif->ipif_ill->ill_wq == NULL) ||
- (old_ipif->ipif_flags &
- (IPIF_NOLOCAL|IPIF_ANYCAST)))) {
- return;
- }
+ if (flagsp != NULL)
+ *flagsp = 0;
/*
- * Perform the same checks as when creating the
- * IRE_INTERFACE in ipif_up_done_v6.
+ * Need to grab the generation number before we check to
+ * avoid a race with a change to the set of local addresses.
+ * No lock needed since the thread which updates the set of local
+ * addresses use ipif/ill locks and exit those (hence a store memory
+ * barrier) before doing the atomic increase of ips_src_generation.
*/
- if (!(ipif->ipif_flags & IPIF_UP))
- return;
-
- if ((ipif->ipif_flags & IPIF_NOXMIT))
- return;
-
- if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet) &&
- IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6net_mask))
- return;
-
- /*
- * We know that ipif uses some other source for its
- * IRE_INTERFACE. Is it using the source of this
- * old_ipif?
- */
- ipif_ire = ipif_to_ire_v6(ipif);
- if (ipif_ire == NULL)
- return;
-
- if (old_ipif != NULL &&
- !IN6_ARE_ADDR_EQUAL(&old_ipif->ipif_v6lcl_addr,
- &ipif_ire->ire_src_addr_v6)) {
- ire_refrele(ipif_ire);
- return;
- }
-
- if (ip_debug > 2) {
- /* ip1dbg */
- pr_addr_dbg("ipif_recreate_interface_routes_v6: deleting IRE"
- " for src %s\n", AF_INET6, &ipif_ire->ire_src_addr_v6);
- }
-
- stq = ipif_ire->ire_stq;
-
- /*
- * Can't use our source address. Select a different source address
- * for the IRE_INTERFACE. We restrict interface route source
- * address selection to ipif's assigned to the same link as the
- * interface.
- */
- if (ip6_asp_can_lookup(ipst)) {
- ip6_asp_table_held = B_TRUE;
- nipif = ipif_select_source_v6(ill, &ipif->ipif_v6subnet,
- B_TRUE, IPV6_PREFER_SRC_DEFAULT, ipif->ipif_zoneid);
- }
- if (nipif == NULL) {
- /* Last resort - all ipif's have IPIF_NOLOCAL */
- nipif = ipif;
- } else {
- nipif_refheld = B_TRUE;
+ if (generation != NULL) {
+ *generation = ipst->ips_src_generation;
}
- ire = ire_create_v6(
- &ipif->ipif_v6subnet, /* dest pref */
- &ipif->ipif_v6net_mask, /* mask */
- &nipif->ipif_v6src_addr, /* src addr */
- NULL, /* no gateway */
- &ipif->ipif_mtu, /* max frag */
- NULL, /* no src nce */
- NULL, /* no recv from queue */
- stq, /* send-to queue */
- ill->ill_net_type, /* IF_[NO]RESOLVER */
- ipif,
- NULL,
- 0,
- 0,
- 0,
- &ire_uinfo_null,
- NULL,
- NULL,
- ipst);
-
- if (ire != NULL) {
- ire_t *ret_ire;
- int error;
-
- /*
- * We don't need ipif_ire anymore. We need to delete
- * before we add so that ire_add does not detect
- * duplicates.
- */
- ire_delete(ipif_ire);
- ret_ire = ire;
- error = ire_add(&ret_ire, NULL, NULL, NULL, B_FALSE);
- ASSERT(error == 0);
- ASSERT(ret_ire == ire);
- if (ret_ire != NULL) {
- /* Held in ire_add */
- ire_refrele(ret_ire);
- }
+ /* Was RTF_SETSRC set on the first IRE in the recursive lookup? */
+ if (setsrc != NULL && !IN6_IS_ADDR_UNSPECIFIED(setsrc)) {
+ *srcp = *setsrc;
+ return (0);
}
- /*
- * Either we are falling through from above or could not
- * allocate a replacement.
- */
- ire_refrele(ipif_ire);
- if (ip6_asp_table_held)
- ip6_asp_table_refrele(ipst);
- if (nipif_refheld)
- ipif_refrele(nipif);
-}
-
-/*
- * This old_ipif is going away.
- *
- * Determine if any other ipif's are using our address as
- * ipif_v6lcl_addr (due to those being IPIF_NOLOCAL, IPIF_ANYCAST, or
- * IPIF_DEPRECATED).
- * Find the IRE_INTERFACE for such ipif's and recreate them
- * to use an different source address following the rules in
- * ipif_up_done_v6.
- */
-void
-ipif_update_other_ipifs_v6(ipif_t *old_ipif)
-{
- ipif_t *ipif;
- ill_t *ill;
- char buf[INET6_ADDRSTRLEN];
-
- ASSERT(IAM_WRITER_IPIF(old_ipif));
-
- ill = old_ipif->ipif_ill;
-
- ip1dbg(("ipif_update_other_ipifs_v6(%s, %s)\n",
- ill->ill_name,
- inet_ntop(AF_INET6, &old_ipif->ipif_v6lcl_addr,
- buf, sizeof (buf))));
- for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
- if (ipif != old_ipif)
- ipif_recreate_interface_routes_v6(old_ipif, ipif);
+ ipif = ipif_select_source_v6(ill, dst, restrict_ill, src_prefs, zoneid,
+ B_TRUE, &notready);
+ if (ipif == NULL) {
+ if (notready)
+ return (ENETDOWN);
+ else
+ return (EADDRNOTAVAIL);
}
+ *srcp = ipif->ipif_v6lcl_addr;
+ if (flagsp != NULL)
+ *flagsp = ipif->ipif_flags;
+ ipif_refrele(ipif);
+ return (0);
}
/*
@@ -2744,11 +2207,10 @@ ipif_update_other_ipifs_v6(ipif_t *old_ipif)
* the physical device.
* q and mp represents an ioctl which will be queued waiting for
* completion of the DLPI message exchange.
- * MUST be called on an ill queue. Can not set conn_pending_ill for that
- * reason thus the DL_PHYS_ADDR_ACK code does not assume ill_pending_q.
+ * MUST be called on an ill queue.
*
- * Returns EINPROGRESS when mp has been consumed by queueing it on
- * ill_pending_mp and the ioctl will complete in ip_rput.
+ * Returns EINPROGRESS when mp has been consumed by queueing it.
+ * The ioctl will complete in ip_rput.
*/
int
ill_dl_phys(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q)
@@ -2888,6 +2350,7 @@ bad:
return (ENOMEM);
}
+/* Add room for tcp+ip headers */
uint_t ip_loopback_mtu_v6plus = IP_LOOPBACK_MTU + IPV6_HDR_LEN + 20;
/*
@@ -2899,28 +2362,14 @@ uint_t ip_loopback_mtu_v6plus = IP_LOOPBACK_MTU + IPV6_HDR_LEN + 20;
int
ipif_up_done_v6(ipif_t *ipif)
{
- ire_t *ire_array[20];
- ire_t **irep = ire_array;
- ire_t **irep1;
ill_t *ill = ipif->ipif_ill;
- queue_t *stq;
- in6_addr_t v6addr;
- in6_addr_t route_mask;
- ipif_t *src_ipif = NULL;
- ipif_t *tmp_ipif;
- boolean_t flush_ire_cache = B_TRUE;
int err;
- char buf[INET6_ADDRSTRLEN];
- ire_t **ipif_saved_irep = NULL;
- int ipif_saved_ire_cnt;
- int cnt;
- boolean_t src_ipif_held = B_FALSE;
boolean_t loopback = B_FALSE;
- boolean_t ip6_asp_table_held = B_FALSE;
- ip_stack_t *ipst = ill->ill_ipst;
ip1dbg(("ipif_up_done_v6(%s:%u)\n",
ipif->ipif_ill->ill_name, ipif->ipif_id));
+ DTRACE_PROBE3(ipif__downup, char *, "ipif_up_done_v6",
+ ill_t *, ill, ipif_t *, ipif);
/* Check if this is a loopback interface */
if (ipif->ipif_ill->ill_wq == NULL)
@@ -2929,46 +2378,10 @@ ipif_up_done_v6(ipif_t *ipif)
ASSERT(ipif->ipif_isv6);
ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock));
- /*
- * If all other interfaces for this ill are down or DEPRECATED,
- * or otherwise unsuitable for source address selection, remove
- * any IRE_CACHE entries for this ill to make sure source
- * address selection gets to take this new ipif into account.
- * No need to hold ill_lock while traversing the ipif list since
- * we are writer
- */
- for (tmp_ipif = ill->ill_ipif; tmp_ipif;
- tmp_ipif = tmp_ipif->ipif_next) {
- if (((tmp_ipif->ipif_flags &
- (IPIF_NOXMIT|IPIF_ANYCAST|IPIF_NOLOCAL|IPIF_DEPRECATED)) ||
- !(tmp_ipif->ipif_flags & IPIF_UP)) ||
- (tmp_ipif == ipif))
- continue;
- /* first useable pre-existing interface */
- flush_ire_cache = B_FALSE;
- break;
- }
- if (flush_ire_cache)
- ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE,
- IRE_CACHE, ill_ipif_cache_delete, ill, ill);
+ if (IS_LOOPBACK(ill) || ill->ill_net_type == IRE_IF_NORESOLVER) {
+ nce_t *loop_nce = NULL;
+ uint16_t flags = (NCE_F_MYADDR | NCE_F_NONUD | NCE_F_AUTHORITY);
- /*
- * Figure out which way the send-to queue should go. Only
- * IRE_IF_RESOLVER or IRE_IF_NORESOLVER should show up here.
- */
- switch (ill->ill_net_type) {
- case IRE_IF_RESOLVER:
- stq = ill->ill_rq;
- break;
- case IRE_IF_NORESOLVER:
- case IRE_LOOPBACK:
- stq = ill->ill_wq;
- break;
- default:
- return (EINVAL);
- }
-
- if (IS_LOOPBACK(ill)) {
/*
* lo0:1 and subsequent ipifs were marked IRE_LOCAL in
* ipif_lookup_on_name(), but in the case of zones we can have
@@ -2979,29 +2392,99 @@ ipif_up_done_v6(ipif_t *ipif)
ipif->ipif_ire_type = IRE_LOOPBACK;
else
ipif->ipif_ire_type = IRE_LOCAL;
+ if (ill->ill_net_type != IRE_LOOPBACK)
+ flags |= NCE_F_PUBLISH;
+ err = nce_lookup_then_add_v6(ill, NULL,
+ ill->ill_phys_addr_length,
+ &ipif->ipif_v6lcl_addr, flags, ND_REACHABLE, &loop_nce);
+
+ /* A shared-IP zone sees EEXIST for lo0:N */
+ if (err == 0 || err == EEXIST) {
+ ipif->ipif_added_nce = 1;
+ loop_nce->nce_ipif_cnt++;
+ nce_refrele(loop_nce);
+ err = 0;
+ } else {
+ ASSERT(loop_nce == NULL);
+ return (err);
+ }
}
- if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST) ||
- ((ipif->ipif_flags & IPIF_DEPRECATED) &&
- !(ipif->ipif_flags & IPIF_NOFAILOVER))) {
+ err = ipif_add_ires_v6(ipif, loopback);
+ if (err != 0) {
/*
- * Can't use our source address. Select a different
- * source address for the IRE_INTERFACE and IRE_LOCAL
+ * See comments about return value from
+ * ipif_addr_availability_check() in ipif_add_ires_v6().
*/
- if (ip6_asp_can_lookup(ipst)) {
- ip6_asp_table_held = B_TRUE;
- src_ipif = ipif_select_source_v6(ipif->ipif_ill,
- &ipif->ipif_v6subnet, B_FALSE,
- IPV6_PREFER_SRC_DEFAULT, ipif->ipif_zoneid);
+ if (err != EADDRINUSE) {
+ ipif_ndp_down(ipif);
+ } else {
+ /*
+ * Make IPMP aware of the deleted ipif so that
+ * the needed ipmp cleanup (e.g., of ipif_bound_ill)
+ * can be completed. Note that we do not want to
+ * destroy the nce that was created on the ipmp_ill
+ * for the active copy of the duplicate address in
+ * use.
+ */
+ if (IS_IPMP(ill))
+ ipmp_illgrp_del_ipif(ill->ill_grp, ipif);
+ err = EADDRNOTAVAIL;
}
- if (src_ipif == NULL)
- src_ipif = ipif; /* Last resort */
- else
- src_ipif_held = B_TRUE;
- } else {
- src_ipif = ipif;
+ return (err);
+ }
+
+ if (ill->ill_ipif_up_count == 1 && !loopback) {
+ /* Recover any additional IREs entries for this ill */
+ (void) ill_recover_saved_ire(ill);
}
+ if (ill->ill_need_recover_multicast) {
+ /*
+ * Need to recover all multicast memberships in the driver.
+ * This had to be deferred until we had attached.
+ */
+ ill_recover_multicast(ill);
+ }
+
+ if (ill->ill_ipif_up_count == 1) {
+ /*
+ * Since the interface is now up, it may now be active.
+ */
+ if (IS_UNDER_IPMP(ill))
+ ipmp_ill_refresh_active(ill);
+ }
+
+ /* Join the allhosts multicast address and the solicited node MC */
+ ipif_multicast_up(ipif);
+
+ /* Perhaps ilgs should use this ill */
+ update_conn_ill(NULL, ill->ill_ipst);
+
+ if (ipif->ipif_addr_ready)
+ ipif_up_notify(ipif);
+
+ return (0);
+}
+
+/*
+ * Add the IREs associated with the ipif.
+ * Those MUST be explicitly removed in ipif_delete_ires_v6.
+ */
+static int
+ipif_add_ires_v6(ipif_t *ipif, boolean_t loopback)
+{
+ ill_t *ill = ipif->ipif_ill;
+ ip_stack_t *ipst = ill->ill_ipst;
+ ire_t *ire_array[20];
+ ire_t **irep = ire_array;
+ ire_t **irep1;
+ in6_addr_t v6addr;
+ in6_addr_t route_mask;
+ int err;
+ char buf[INET6_ADDRSTRLEN];
+ ire_t *ire_local = NULL; /* LOCAL or LOOPBACK */
+
if (!IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) &&
!(ipif->ipif_flags & IPIF_NOLOCAL)) {
@@ -3024,45 +2507,38 @@ ipif_up_done_v6(ipif_t *ipif)
err = ip_srcid_insert(&ipif->ipif_v6lcl_addr,
ipif->ipif_zoneid, ipst);
if (err != 0) {
- ip0dbg(("ipif_up_done_v6: srcid_insert %d\n", err));
- if (src_ipif_held)
- ipif_refrele(src_ipif);
- if (ip6_asp_table_held)
- ip6_asp_table_refrele(ipst);
+ ip0dbg(("ipif_add_ires_v6: srcid_insert %d\n", err));
return (err);
}
/*
* If the interface address is set, create the LOCAL
* or LOOPBACK IRE.
*/
- ip1dbg(("ipif_up_done_v6: creating IRE %d for %s\n",
+ ip1dbg(("ipif_add_ires_v6: creating IRE %d for %s\n",
ipif->ipif_ire_type,
inet_ntop(AF_INET6, &ipif->ipif_v6lcl_addr,
buf, sizeof (buf))));
- *irep++ = ire_create_v6(
+ ire_local = ire_create_v6(
&ipif->ipif_v6lcl_addr, /* dest address */
&ipv6_all_ones, /* mask */
- &src_ipif->ipif_v6src_addr, /* source address */
NULL, /* no gateway */
- &ip_loopback_mtu_v6plus, /* max frag size */
- NULL,
- ipif->ipif_rq, /* recv-from queue */
- NULL, /* no send-to queue */
ipif->ipif_ire_type, /* LOCAL or LOOPBACK */
- ipif, /* interface */
- NULL,
- 0,
- 0,
- (ipif->ipif_flags & IPIF_PRIVATE) ? RTF_PRIVATE : 0,
- &ire_uinfo_null,
- NULL,
+ ipif->ipif_ill, /* interface */
+ ipif->ipif_zoneid,
+ ((ipif->ipif_flags & IPIF_PRIVATE) ?
+ RTF_PRIVATE : 0) | RTF_KERNEL,
NULL,
ipst);
+ if (ire_local == NULL) {
+ ip1dbg(("ipif_up_done_v6: NULL ire_local\n"));
+ err = ENOMEM;
+ goto bad;
+ }
}
/* Set up the IRE_IF_RESOLVER or IRE_IF_NORESOLVER, as appropriate. */
- if (stq != NULL && !(ipif->ipif_flags & IPIF_NOXMIT) &&
+ if (!loopback && !(ipif->ipif_flags & IPIF_NOXMIT) &&
!(IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet) &&
IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6net_mask))) {
/* ipif_v6subnet is ipif_v6pp_dst_addr for pt-pt */
@@ -3074,27 +2550,19 @@ ipif_up_done_v6(ipif_t *ipif)
route_mask = ipif->ipif_v6net_mask;
}
- ip1dbg(("ipif_up_done_v6: creating if IRE %d for %s\n",
+ ip1dbg(("ipif_add_ires_v6: creating if IRE %d for %s\n",
ill->ill_net_type,
inet_ntop(AF_INET6, &v6addr, buf, sizeof (buf))));
*irep++ = ire_create_v6(
&v6addr, /* dest pref */
&route_mask, /* mask */
- &src_ipif->ipif_v6src_addr, /* src addr */
- NULL, /* no gateway */
- &ipif->ipif_mtu, /* max frag */
- NULL, /* no src nce */
- NULL, /* no recv from queue */
- stq, /* send-to queue */
+ &ipif->ipif_v6lcl_addr, /* gateway */
ill->ill_net_type, /* IF_[NO]RESOLVER */
- ipif,
- NULL,
- 0,
- 0,
- (ipif->ipif_flags & IPIF_PRIVATE) ? RTF_PRIVATE : 0,
- &ire_uinfo_null,
- NULL,
+ ipif->ipif_ill,
+ ipif->ipif_zoneid,
+ ((ipif->ipif_flags & IPIF_PRIVATE) ?
+ RTF_PRIVATE : 0) | RTF_KERNEL,
NULL,
ipst);
}
@@ -3103,15 +2571,13 @@ ipif_up_done_v6(ipif_t *ipif)
for (irep1 = irep; irep1 > ire_array; ) {
irep1--;
if (*irep1 == NULL) {
- ip1dbg(("ipif_up_done_v6: NULL ire found in"
+ ip1dbg(("ipif_add_ires_v6: NULL ire found in"
" ire_array\n"));
err = ENOMEM;
goto bad;
}
}
- ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock));
-
/*
* Need to atomically check for IP address availability under
* ip_addr_avail_lock. ill_g_lock is held as reader to ensure no new
@@ -3132,20 +2598,12 @@ ipif_up_done_v6(ipif_t *ipif)
* the other ipif. So we don't want to delete it (otherwise the
* other ipif would be unable to send packets).
* ip_addr_availability_check() identifies this case for us and
- * returns EADDRINUSE; we need to turn it into EADDRNOTAVAIL
+ * returns EADDRINUSE; Caller must turn it into EADDRNOTAVAIL
* which is the expected error code.
*
- * Note that, for the non-XRESOLV case, ipif_ndp_down() will
- * only delete the nce in the case when the nce_ipif_cnt drops
- * to 0.
+ * Note that ipif_ndp_down() will only delete the nce in the
+ * case when the nce_ipif_cnt drops to 0.
*/
- if (err == EADDRINUSE) {
- if (ipif->ipif_ill->ill_flags & ILLF_XRESOLV) {
- freemsg(ipif->ipif_arp_del_mp);
- ipif->ipif_arp_del_mp = NULL;
- }
- err = EADDRNOTAVAIL;
- }
ill->ill_ipif_up_count--;
ipif->ipif_flags &= ~IPIF_UP;
goto bad;
@@ -3153,91 +2611,42 @@ ipif_up_done_v6(ipif_t *ipif)
/*
* Add in all newly created IREs.
- *
- * NOTE : We refrele the ire though we may branch to "bad"
- * later on where we do ire_delete. This is okay
- * because nobody can delete it as we are running
- * exclusively.
*/
+ if (ire_local != NULL) {
+ ire_local = ire_add(ire_local);
+#ifdef DEBUG
+ if (ire_local != NULL) {
+ ire_refhold_notr(ire_local);
+ ire_refrele(ire_local);
+ }
+#endif
+ }
+ rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
+ if (ire_local != NULL)
+ ipif->ipif_ire_local = ire_local;
+ rw_exit(&ipst->ips_ill_g_lock);
+ ire_local = NULL;
+
for (irep1 = irep; irep1 > ire_array; ) {
irep1--;
/* Shouldn't be adding any bcast ire's */
ASSERT((*irep1)->ire_type != IRE_BROADCAST);
ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock));
- /*
- * refheld by ire_add. refele towards the end of the func
- */
- (void) ire_add(irep1, NULL, NULL, NULL, B_FALSE);
- }
- if (ip6_asp_table_held) {
- ip6_asp_table_refrele(ipst);
- ip6_asp_table_held = B_FALSE;
- }
-
- /* Recover any additional IRE_IF_[NO]RESOLVER entries for this ipif */
- ipif_saved_ire_cnt = ipif->ipif_saved_ire_cnt;
- ipif_saved_irep = ipif_recover_ire_v6(ipif);
-
- if (ill->ill_need_recover_multicast) {
- /*
- * Need to recover all multicast memberships in the driver.
- * This had to be deferred until we had attached.
- */
- ill_recover_multicast(ill);
- }
-
- if (ill->ill_ipif_up_count == 1) {
- /*
- * Since the interface is now up, it may now be active.
- */
- if (IS_UNDER_IPMP(ill))
- ipmp_ill_refresh_active(ill);
- }
-
- /* Join the allhosts multicast address and the solicited node MC */
- ipif_multicast_up(ipif);
-
- /*
- * See if anybody else would benefit from our new ipif.
- */
- if (!loopback &&
- !(ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED))) {
- ill_update_source_selection(ill);
- }
-
- for (irep1 = irep; irep1 > ire_array; ) {
- irep1--;
+ /* refheld by ire_add */
+ *irep1 = ire_add(*irep1);
if (*irep1 != NULL) {
- /* was held in ire_add */
- ire_refrele(*irep1);
- }
- }
-
- cnt = ipif_saved_ire_cnt;
- for (irep1 = ipif_saved_irep; cnt > 0; irep1++, cnt--) {
- if (*irep1 != NULL) {
- /* was held in ire_add */
ire_refrele(*irep1);
+ *irep1 = NULL;
}
}
if (ipif->ipif_addr_ready)
ipif_up_notify(ipif);
-
- if (ipif_saved_irep != NULL) {
- kmem_free(ipif_saved_irep,
- ipif_saved_ire_cnt * sizeof (ire_t *));
- }
-
- if (src_ipif_held)
- ipif_refrele(src_ipif);
-
return (0);
bad:
- if (ip6_asp_table_held)
- ip6_asp_table_refrele(ipst);
-
+ if (ire_local != NULL)
+ ire_delete(ire_local);
while (irep > ire_array) {
irep--;
if (*irep != NULL)
@@ -3245,21 +2654,85 @@ bad:
}
(void) ip_srcid_remove(&ipif->ipif_v6lcl_addr, ipif->ipif_zoneid, ipst);
- if (ipif_saved_irep != NULL) {
- kmem_free(ipif_saved_irep,
- ipif_saved_ire_cnt * sizeof (ire_t *));
+ return (err);
+}
+
+/* Remove all the IREs created by ipif_add_ires_v6 */
+void
+ipif_delete_ires_v6(ipif_t *ipif)
+{
+ ill_t *ill = ipif->ipif_ill;
+ ip_stack_t *ipst = ill->ill_ipst;
+ in6_addr_t v6addr;
+ in6_addr_t route_mask;
+ ire_t *ire;
+ int match_args;
+ boolean_t loopback;
+
+ /* Check if this is a loopback interface */
+ loopback = (ipif->ipif_ill->ill_wq == NULL);
+
+ match_args = MATCH_IRE_TYPE | MATCH_IRE_ILL | MATCH_IRE_MASK |
+ MATCH_IRE_ZONEONLY;
+
+ rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
+ if ((ire = ipif->ipif_ire_local) != NULL) {
+ ipif->ipif_ire_local = NULL;
+ rw_exit(&ipst->ips_ill_g_lock);
+ /*
+ * Move count to ipif so we don't loose the count due to
+ * a down/up dance.
+ */
+ atomic_add_32(&ipif->ipif_ib_pkt_count, ire->ire_ib_pkt_count);
+
+ ire_delete(ire);
+ ire_refrele_notr(ire);
+ } else {
+ rw_exit(&ipst->ips_ill_g_lock);
}
- if (src_ipif_held)
- ipif_refrele(src_ipif);
- ipif_ndp_down(ipif);
- ipif_resolver_down(ipif);
+ match_args |= MATCH_IRE_GW;
- return (err);
+ /*
+ * Delete the IRE_IF_RESOLVER or IRE_IF_NORESOLVER, as appropriate.
+ * Note that atun interfaces have an all-zero ipif_v6subnet.
+ * Thus we allow a zero subnet as long as the mask is non-zero.
+ */
+ if (IS_UNDER_IPMP(ill))
+ match_args |= MATCH_IRE_TESTHIDDEN;
+
+ if (!loopback && !(ipif->ipif_flags & IPIF_NOXMIT) &&
+ !(IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet) &&
+ IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6net_mask))) {
+ /* ipif_v6subnet is ipif_v6pp_dst_addr for pt-pt */
+ v6addr = ipif->ipif_v6subnet;
+
+ if (ipif->ipif_flags & IPIF_POINTOPOINT) {
+ route_mask = ipv6_all_ones;
+ } else {
+ route_mask = ipif->ipif_v6net_mask;
+ }
+
+ ire = ire_ftable_lookup_v6(
+ &v6addr, /* dest pref */
+ &route_mask, /* mask */
+ &ipif->ipif_v6lcl_addr, /* gateway */
+ ill->ill_net_type, /* IF_[NO]RESOLVER */
+ ipif->ipif_ill,
+ ipif->ipif_zoneid,
+ NULL,
+ match_args,
+ 0,
+ ipst,
+ NULL);
+ ASSERT(ire != NULL);
+ ire_delete(ire);
+ ire_refrele(ire);
+ }
}
/*
- * Delete an ND entry and the corresponding IRE_CACHE entry if it exists.
+ * Delete an ND entry if it exists.
*/
/* ARGSUSED */
int
@@ -3267,11 +2740,10 @@ ip_siocdelndp_v6(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
{
sin6_t *sin6;
- nce_t *nce;
struct lifreq *lifr;
lif_nd_req_t *lnr;
ill_t *ill = ipif->ipif_ill;
- ire_t *ire;
+ nce_t *nce;
lifr = (struct lifreq *)mp->b_cont->b_cont->b_rptr;
lnr = &lifr->lifr_nd;
@@ -3289,29 +2761,27 @@ ip_siocdelndp_v6(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
/*
* Since ND mappings must be consistent across an IPMP group, prohibit
- * deleting ND mappings on underlying interfaces. Also, since ND
- * mappings for IPMP data addresses are owned by IP itself, prohibit
- * deleting them.
+ * deleting ND mappings on underlying interfaces.
+ * Don't allow deletion of mappings for local addresses.
*/
if (IS_UNDER_IPMP(ill))
return (EPERM);
- if (IS_IPMP(ill)) {
- ire = ire_ctable_lookup_v6(&sin6->sin6_addr, NULL, IRE_LOCAL,
- ipif, ALL_ZONES, NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL,
- ill->ill_ipst);
- if (ire != NULL) {
- ire_refrele(ire);
- return (EPERM);
- }
- }
-
- /* See comment in ndp_query() regarding IS_IPMP(ill) usage */
- nce = ndp_lookup_v6(ill, IS_IPMP(ill), &sin6->sin6_addr, B_FALSE);
+ nce = nce_lookup_v6(ill, &sin6->sin6_addr);
if (nce == NULL)
return (ESRCH);
- ndp_delete(nce);
- NCE_REFRELE(nce);
+
+ if (NCE_MYADDR(nce->nce_common)) {
+ nce_refrele(nce);
+ return (EPERM);
+ }
+
+ /*
+ * delete the nce_common which will also delete the nces on any
+ * under_ill in the case of ipmp.
+ */
+ ncec_delete(nce->nce_common);
+ nce_refrele(nce);
return (0);
}
@@ -3383,9 +2853,9 @@ ip_siocsetndp_v6(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
return (EPERM);
if (IS_IPMP(ill)) {
- ire = ire_ctable_lookup_v6(&sin6->sin6_addr, NULL, IRE_LOCAL,
- ipif, ALL_ZONES, NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL,
- ill->ill_ipst);
+ ire = ire_ftable_lookup_v6(&sin6->sin6_addr, NULL, NULL,
+ IRE_LOCAL, ill, ALL_ZONES, NULL,
+ MATCH_IRE_TYPE | MATCH_IRE_ILL, 0, ill->ill_ipst, NULL);
if (ire != NULL) {
ire_refrele(ire);
return (EPERM);
diff --git a/usr/src/uts/common/inet/ip/ip6_input.c b/usr/src/uts/common/inet/ip/ip6_input.c
new file mode 100644
index 0000000000..cee5344bf6
--- /dev/null
+++ b/usr/src/uts/common/inet/ip/ip6_input.c
@@ -0,0 +1,2749 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/* Copyright (c) 1990 Mentat Inc. */
+
+#include <sys/types.h>
+#include <sys/stream.h>
+#include <sys/dlpi.h>
+#include <sys/stropts.h>
+#include <sys/sysmacros.h>
+#include <sys/strsubr.h>
+#include <sys/strlog.h>
+#include <sys/strsun.h>
+#include <sys/zone.h>
+#define _SUN_TPI_VERSION 2
+#include <sys/tihdr.h>
+#include <sys/xti_inet.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/cmn_err.h>
+#include <sys/debug.h>
+#include <sys/kobj.h>
+#include <sys/modctl.h>
+#include <sys/atomic.h>
+#include <sys/policy.h>
+#include <sys/priv.h>
+
+#include <sys/systm.h>
+#include <sys/param.h>
+#include <sys/kmem.h>
+#include <sys/sdt.h>
+#include <sys/socket.h>
+#include <sys/vtrace.h>
+#include <sys/isa_defs.h>
+#include <sys/mac.h>
+#include <net/if.h>
+#include <net/if_arp.h>
+#include <net/route.h>
+#include <sys/sockio.h>
+#include <netinet/in.h>
+#include <net/if_dl.h>
+
+#include <inet/common.h>
+#include <inet/mi.h>
+#include <inet/mib2.h>
+#include <inet/nd.h>
+#include <inet/arp.h>
+#include <inet/snmpcom.h>
+#include <inet/kstatcom.h>
+
+#include <netinet/igmp_var.h>
+#include <netinet/ip6.h>
+#include <netinet/icmp6.h>
+#include <netinet/sctp.h>
+
+#include <inet/ip.h>
+#include <inet/ip_impl.h>
+#include <inet/ip6.h>
+#include <inet/ip6_asp.h>
+#include <inet/optcom.h>
+#include <inet/tcp.h>
+#include <inet/tcp_impl.h>
+#include <inet/ip_multi.h>
+#include <inet/ip_if.h>
+#include <inet/ip_ire.h>
+#include <inet/ip_ftable.h>
+#include <inet/ip_rts.h>
+#include <inet/ip_ndp.h>
+#include <inet/ip_listutils.h>
+#include <netinet/igmp.h>
+#include <netinet/ip_mroute.h>
+#include <inet/ipp_common.h>
+
+#include <net/pfkeyv2.h>
+#include <inet/sadb.h>
+#include <inet/ipsec_impl.h>
+#include <inet/ipdrop.h>
+#include <inet/ip_netinfo.h>
+#include <inet/ilb_ip.h>
+#include <sys/squeue_impl.h>
+#include <sys/squeue.h>
+
+#include <sys/ethernet.h>
+#include <net/if_types.h>
+#include <sys/cpuvar.h>
+
+#include <ipp/ipp.h>
+#include <ipp/ipp_impl.h>
+#include <ipp/ipgpc/ipgpc.h>
+
+#include <sys/pattr.h>
+#include <inet/ipclassifier.h>
+#include <inet/sctp_ip.h>
+#include <inet/sctp/sctp_impl.h>
+#include <inet/udp_impl.h>
+#include <sys/sunddi.h>
+
+#include <sys/tsol/label.h>
+#include <sys/tsol/tnet.h>
+
+#include <rpc/pmap_prot.h>
+
+#ifdef DEBUG
+extern boolean_t skip_sctp_cksum;
+#endif
+
+static void ip_input_local_v6(ire_t *, mblk_t *, ip6_t *, ip_recv_attr_t *);
+
+static void ip_input_multicast_v6(ire_t *, mblk_t *, ip6_t *,
+ ip_recv_attr_t *);
+
+#pragma inline(ip_input_common_v6, ip_input_local_v6, ip_forward_xmit_v6)
+
+/*
+ * Direct read side procedure capable of dealing with chains. GLDv3 based
+ * drivers call this function directly with mblk chains while STREAMS
+ * read side procedure ip_rput() calls this for single packet with ip_ring
+ * set to NULL to process one packet at a time.
+ *
+ * The ill will always be valid if this function is called directly from
+ * the driver.
+ *
+ * If ip_input_v6() is called from GLDv3:
+ *
+ * - This must be a non-VLAN IP stream.
+ * - 'mp' is either an untagged or a special priority-tagged packet.
+ * - Any VLAN tag that was in the MAC header has been stripped.
+ *
+ * If the IP header in packet is not 32-bit aligned, every message in the
+ * chain will be aligned before further operations. This is required on SPARC
+ * platform.
+ */
+void
+ip_input_v6(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain,
+ struct mac_header_info_s *mhip)
+{
+ (void) ip_input_common_v6(ill, ip_ring, mp_chain, mhip, NULL, NULL,
+ NULL);
+}
+
+/*
+ * ip_accept_tcp_v6() - This function is called by the squeue when it retrieves
+ * a chain of packets in the poll mode. The packets have gone through the
+ * data link processing but not IP processing. For performance and latency
+ * reasons, the squeue wants to process the chain in line instead of feeding
+ * it back via ip_input path.
+ *
+ * We set up the ip_recv_attr_t with IRAF_TARGET_SQP so that ip_fanout_v6
+ * will pass back any TCP packets matching the target sqp to
+ * ip_input_common_v6 using ira_target_sqp_mp. Other packets are handled by
+ * ip_input_v6 and ip_fanout_v6 as normal.
+ * The TCP packets that match the target squeue are returned to the caller
+ * as a b_next chain after each packet has been prepended with an mblk
+ * from ip_recv_attr_to_mblk.
+ */
+mblk_t *
+ip_accept_tcp_v6(ill_t *ill, ill_rx_ring_t *ip_ring, squeue_t *target_sqp,
+ mblk_t *mp_chain, mblk_t **last, uint_t *cnt)
+{
+ return (ip_input_common_v6(ill, ip_ring, mp_chain, NULL, target_sqp,
+ last, cnt));
+}
+
+/*
+ * Used by ip_input_v6 and ip_accept_tcp_v6
+ * The last three arguments are only used by ip_accept_tcp_v6, and mhip is
+ * only used by ip_input_v6.
+ */
+mblk_t *
+ip_input_common_v6(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain,
+ struct mac_header_info_s *mhip, squeue_t *target_sqp,
+ mblk_t **last, uint_t *cnt)
+{
+ mblk_t *mp;
+ ip6_t *ip6h;
+ ip_recv_attr_t iras; /* Receive attributes */
+ rtc_t rtc;
+ iaflags_t chain_flags = 0; /* Fixed for chain */
+ mblk_t *ahead = NULL; /* Accepted head */
+ mblk_t *atail = NULL; /* Accepted tail */
+ uint_t acnt = 0; /* Accepted count */
+
+ ASSERT(mp_chain != NULL);
+ ASSERT(ill != NULL);
+
+ /* These ones do not change as we loop over packets */
+ iras.ira_ill = iras.ira_rill = ill;
+ iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
+ iras.ira_rifindex = iras.ira_ruifindex;
+ iras.ira_sqp = NULL;
+ iras.ira_ring = ip_ring;
+ /* For ECMP and outbound transmit ring selection */
+ iras.ira_xmit_hint = ILL_RING_TO_XMIT_HINT(ip_ring);
+
+ iras.ira_target_sqp = target_sqp;
+ iras.ira_target_sqp_mp = NULL;
+ if (target_sqp != NULL)
+ chain_flags |= IRAF_TARGET_SQP;
+
+ /*
+ * We try to have a mhip pointer when possible, but
+ * it might be NULL in some cases. In those cases we
+ * have to assume unicast.
+ */
+ iras.ira_mhip = mhip;
+ iras.ira_flags = 0;
+ if (mhip != NULL) {
+ switch (mhip->mhi_dsttype) {
+ case MAC_ADDRTYPE_MULTICAST :
+ chain_flags |= IRAF_L2DST_MULTICAST;
+ break;
+ case MAC_ADDRTYPE_BROADCAST :
+ chain_flags |= IRAF_L2DST_BROADCAST;
+ break;
+ }
+ }
+
+ /*
+ * Initialize the one-element route cache.
+ *
+ * We do ire caching from one iteration to
+ * another. In the event the packet chain contains
+ * all packets from the same dst, this caching saves
+ * an ire_route_recursive for each of the succeeding
+ * packets in a packet chain.
+ */
+ rtc.rtc_ire = NULL;
+ rtc.rtc_ip6addr = ipv6_all_zeros;
+
+ /* Loop over b_next */
+ for (mp = mp_chain; mp != NULL; mp = mp_chain) {
+ mp_chain = mp->b_next;
+ mp->b_next = NULL;
+
+ /*
+ * if db_ref > 1 then copymsg and free original. Packet
+ * may be changed and we do not want the other entity
+ * who has a reference to this message to trip over the
+ * changes. This is a blind change because trying to
+ * catch all places that might change the packet is too
+ * difficult.
+ *
+ * This corresponds to the fast path case, where we have
+ * a chain of M_DATA mblks. We check the db_ref count
+ * of only the 1st data block in the mblk chain. There
+ * doesn't seem to be a reason why a device driver would
+ * send up data with varying db_ref counts in the mblk
+ * chain. In any case the Fast path is a private
+ * interface, and our drivers don't do such a thing.
+ * Given the above assumption, there is no need to walk
+ * down the entire mblk chain (which could have a
+ * potential performance problem)
+ *
+ * The "(DB_REF(mp) > 1)" check was moved from ip_rput()
+ * to here because of exclusive ip stacks and vnics.
+ * Packets transmitted from exclusive stack over vnic
+ * can have db_ref > 1 and when it gets looped back to
+ * another vnic in a different zone, you have ip_input()
+ * getting dblks with db_ref > 1. So if someone
+ * complains of TCP performance under this scenario,
+ * take a serious look here on the impact of copymsg().
+ */
+ if (DB_REF(mp) > 1) {
+ if ((mp = ip_fix_dbref(mp, &iras)) == NULL)
+ continue;
+ }
+
+ /*
+ * IP header ptr not aligned?
+ * OR IP header not complete in first mblk
+ */
+ ip6h = (ip6_t *)mp->b_rptr;
+ if (!OK_32PTR(ip6h) || MBLKL(mp) < IPV6_HDR_LEN) {
+ mp = ip_check_and_align_header(mp, IPV6_HDR_LEN, &iras);
+ if (mp == NULL)
+ continue;
+ ip6h = (ip6_t *)mp->b_rptr;
+ }
+
+ /* Protect against a mix of Ethertypes and IP versions */
+ if (IPH_HDR_VERSION(ip6h) != IPV6_VERSION) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
+ ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
+ freemsg(mp);
+ /* mhip might point into 1st packet in the chain. */
+ iras.ira_mhip = NULL;
+ continue;
+ }
+
+ /*
+ * Check for Martian addrs; we have to explicitly
+	 * test for zero dst since this is also used as
+ * an indication that the rtc is not used.
+ */
+ if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_dst)) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors);
+ ip_drop_input("ipIfStatsInAddrErrors", mp, ill);
+ freemsg(mp);
+ /* mhip might point into 1st packet in the chain. */
+ iras.ira_mhip = NULL;
+ continue;
+ }
+ /*
+ * Keep L2SRC from a previous packet in chain since mhip
+ * might point into an earlier packet in the chain.
+ */
+ chain_flags |= (iras.ira_flags & IRAF_L2SRC_SET);
+
+ iras.ira_flags = IRAF_VERIFY_ULP_CKSUM | chain_flags;
+ iras.ira_free_flags = 0;
+ iras.ira_cred = NULL;
+ iras.ira_cpid = NOPID;
+ iras.ira_tsl = NULL;
+ iras.ira_zoneid = ALL_ZONES; /* Default for forwarding */
+
+ /*
+ * We must count all incoming packets, even if they end
+ * up being dropped later on. Defer counting bytes until
+ * we have the whole IP header in first mblk.
+ */
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives);
+
+ iras.ira_pktlen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
+ UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets,
+ iras.ira_pktlen);
+
+ /*
+ * Call one of:
+ * ill_input_full_v6
+ * ill_input_short_v6
+ * The former is used in the case of TX. See ill_set_inputfn().
+ */
+ (*ill->ill_inputfn)(mp, ip6h, &ip6h->ip6_dst, &iras, &rtc);
+
+ /* Any references to clean up? No hold on ira_ill */
+ if (iras.ira_flags & (IRAF_IPSEC_SECURE|IRAF_SYSTEM_LABELED))
+ ira_cleanup(&iras, B_FALSE);
+
+ if (iras.ira_target_sqp_mp != NULL) {
+ /* Better be called from ip_accept_tcp */
+ ASSERT(target_sqp != NULL);
+
+ /* Found one packet to accept */
+ mp = iras.ira_target_sqp_mp;
+ iras.ira_target_sqp_mp = NULL;
+ ASSERT(ip_recv_attr_is_mblk(mp));
+
+ if (atail != NULL)
+ atail->b_next = mp;
+ else
+ ahead = mp;
+ atail = mp;
+ acnt++;
+ mp = NULL;
+ }
+ /* mhip might point into 1st packet in the chain. */
+ iras.ira_mhip = NULL;
+ }
+ /* Any remaining references to the route cache? */
+ if (rtc.rtc_ire != NULL) {
+ ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&rtc.rtc_ip6addr));
+ ire_refrele(rtc.rtc_ire);
+ }
+
+ if (ahead != NULL) {
+ /* Better be called from ip_accept_tcp */
+ ASSERT(target_sqp != NULL);
+ *last = atail;
+ *cnt = acnt;
+ return (ahead);
+ }
+
+ return (NULL);
+}
+
+/*
+ * This input function is used when
+ * - is_system_labeled()
+ *
+ * Note that for IPv6 CGTP filtering is handled only when receiving fragment
+ * headers, and RSVP uses router alert options, thus we don't need anything
+ * extra for them.
+ */
+void
+ill_input_full_v6(mblk_t *mp, void *iph_arg, void *nexthop_arg,
+ ip_recv_attr_t *ira, rtc_t *rtc)
+{
+ ip6_t *ip6h = (ip6_t *)iph_arg;
+ in6_addr_t *nexthop = (in6_addr_t *)nexthop_arg;
+ ill_t *ill = ira->ira_ill;
+
+ ASSERT(ira->ira_tsl == NULL);
+
+ /*
+ * Attach any necessary label information to
+ * this packet
+ */
+ if (is_system_labeled()) {
+ ira->ira_flags |= IRAF_SYSTEM_LABELED;
+
+ /*
+ * This updates ira_cred, ira_tsl and ira_free_flags based
+ * on the label.
+ */
+ if (!tsol_get_pkt_label(mp, IPV6_VERSION, ira)) {
+ if (ip6opt_ls != 0)
+ ip0dbg(("tsol_get_pkt_label v6 failed\n"));
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards", mp, ill);
+ freemsg(mp);
+ return;
+ }
+ /* Note that ira_tsl can be NULL here. */
+
+ /* tsol_get_pkt_label sometimes does pullupmsg */
+ ip6h = (ip6_t *)mp->b_rptr;
+ }
+ ill_input_short_v6(mp, ip6h, nexthop, ira, rtc);
+}
+
+/*
+ * Check for IPv6 addresses that should not appear on the wire
+ * as either source or destination.
+ * If we ever implement Stateless IPv6 Translators (SIIT) we'd have
+ * to revisit the IPv4-mapped part.
+ */
+static boolean_t
+ip6_bad_address(in6_addr_t *addr, boolean_t is_src)
+{
+ if (IN6_IS_ADDR_V4MAPPED(addr)) {
+ ip1dbg(("ip_input_v6: pkt with IPv4-mapped addr"));
+ return (B_TRUE);
+ }
+ if (IN6_IS_ADDR_LOOPBACK(addr)) {
+ ip1dbg(("ip_input_v6: pkt with loopback addr"));
+ return (B_TRUE);
+ }
+ if (!is_src && IN6_IS_ADDR_UNSPECIFIED(addr)) {
+ /*
+ * having :: in the src is ok: it's used for DAD.
+ */
+ ip1dbg(("ip_input_v6: pkt with unspecified addr"));
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+/*
+ * Routing lookup for IPv6 link-locals.
+ * First we look on the inbound interface, then we check for IPMP and
+ * look on the upper interface.
+ * We update ira_ruifindex if we find the IRE on the upper interface.
+ */
+static ire_t *
+ire_linklocal(const in6_addr_t *nexthop, ill_t *ill, ip_recv_attr_t *ira,
+ boolean_t allocate, ip_stack_t *ipst)
+{
+ int match_flags = MATCH_IRE_SECATTR | MATCH_IRE_ILL;
+ ire_t *ire;
+
+ ASSERT(IN6_IS_ADDR_LINKLOCAL(nexthop));
+ ire = ire_route_recursive_v6(nexthop, 0, ill, ALL_ZONES, ira->ira_tsl,
+ match_flags, allocate, ira->ira_xmit_hint, ipst, NULL, NULL, NULL);
+ if (!(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
+ !IS_UNDER_IPMP(ill))
+ return (ire);
+
+ /*
+	 * When we are using IPMP we need to look for an IRE on both the
+ * under and upper interfaces since there are different
+ * link-local addresses for the under and upper.
+ */
+ ill = ipmp_ill_hold_ipmp_ill(ill);
+ if (ill == NULL)
+ return (ire);
+
+ ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex;
+
+ ire_refrele(ire);
+ ire = ire_route_recursive_v6(nexthop, 0, ill, ALL_ZONES, ira->ira_tsl,
+ match_flags, allocate, ira->ira_xmit_hint, ipst, NULL, NULL, NULL);
+ ill_refrele(ill);
+ return (ire);
+}
+
+/*
+ * This is the tail-end of the full receive side packet handling.
+ * It can be used directly when the configuration is simple.
+ */
+void
+ill_input_short_v6(mblk_t *mp, void *iph_arg, void *nexthop_arg,
+ ip_recv_attr_t *ira, rtc_t *rtc)
+{
+ ire_t *ire;
+ ill_t *ill = ira->ira_ill;
+ ip_stack_t *ipst = ill->ill_ipst;
+ uint_t pkt_len;
+ ssize_t len;
+ ip6_t *ip6h = (ip6_t *)iph_arg;
+ in6_addr_t nexthop = *(in6_addr_t *)nexthop_arg;
+ ilb_stack_t *ilbs = ipst->ips_netstack->netstack_ilb;
+#define rptr ((uchar_t *)ip6h)
+
+ ASSERT(DB_TYPE(mp) == M_DATA);
+
+ /*
+ * Check for source/dest being a bad address: loopback, any, or
+ * v4mapped. All of them start with a 64 bits of zero.
+ */
+ if (ip6h->ip6_src.s6_addr32[0] == 0 &&
+ ip6h->ip6_src.s6_addr32[1] == 0) {
+ if (ip6_bad_address(&ip6h->ip6_src, B_TRUE)) {
+ ip1dbg(("ip_input_v6: pkt with bad src addr\n"));
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors);
+ ip_drop_input("ipIfStatsInAddrErrors", mp, ill);
+ freemsg(mp);
+ return;
+ }
+ }
+ if (ip6h->ip6_dst.s6_addr32[0] == 0 &&
+ ip6h->ip6_dst.s6_addr32[1] == 0) {
+ if (ip6_bad_address(&ip6h->ip6_dst, B_FALSE)) {
+ ip1dbg(("ip_input_v6: pkt with bad dst addr\n"));
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors);
+ ip_drop_input("ipIfStatsInAddrErrors", mp, ill);
+ freemsg(mp);
+ return;
+ }
+ }
+
+ len = mp->b_wptr - rptr;
+ pkt_len = ira->ira_pktlen;
+
+ /* multiple mblk or too short */
+ len -= pkt_len;
+ if (len != 0) {
+ mp = ip_check_length(mp, rptr, len, pkt_len, IPV6_HDR_LEN, ira);
+ if (mp == NULL)
+ return;
+ ip6h = (ip6_t *)mp->b_rptr;
+ }
+
+ DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
+ ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *, ip6h,
+ int, 0);
+ /*
+ * The event for packets being received from a 'physical'
+ * interface is placed after validation of the source and/or
+ * destination address as being local so that packets can be
+ * redirected to loopback addresses using ipnat.
+ */
+ DTRACE_PROBE4(ip6__physical__in__start,
+ ill_t *, ill, ill_t *, NULL,
+ ip6_t *, ip6h, mblk_t *, mp);
+
+ if (HOOKS6_INTERESTED_PHYSICAL_IN(ipst)) {
+ int ll_multicast = 0;
+ int error;
+ in6_addr_t orig_dst = ip6h->ip6_dst;
+
+ if (ira->ira_flags & IRAF_L2DST_MULTICAST)
+ ll_multicast = HPE_MULTICAST;
+ else if (ira->ira_flags & IRAF_L2DST_BROADCAST)
+ ll_multicast = HPE_BROADCAST;
+
+ FW_HOOKS6(ipst->ips_ip6_physical_in_event,
+ ipst->ips_ipv6firewall_physical_in,
+ ill, NULL, ip6h, mp, mp, ll_multicast, ipst, error);
+
+ DTRACE_PROBE1(ip6__physical__in__end, mblk_t *, mp);
+
+ if (mp == NULL)
+ return;
+
+ /* The length could have changed */
+ ip6h = (ip6_t *)mp->b_rptr;
+ ira->ira_pktlen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
+ pkt_len = ira->ira_pktlen;
+
+ /*
+ * In case the destination changed we override any previous
+ * change to nexthop.
+ */
+ if (!IN6_ARE_ADDR_EQUAL(&orig_dst, &ip6h->ip6_dst))
+ nexthop = ip6h->ip6_dst;
+
+ if (IN6_IS_ADDR_UNSPECIFIED(&nexthop)) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors);
+ ip_drop_input("ipIfStatsInAddrErrors", mp, ill);
+ freemsg(mp);
+ return;
+ }
+
+ }
+
+ if (ipst->ips_ip6_observe.he_interested) {
+ zoneid_t dzone;
+
+ /*
+ * On the inbound path the src zone will be unknown as
+ * this packet has come from the wire.
+ */
+ dzone = ip_get_zoneid_v6(&nexthop, mp, ill, ira, ALL_ZONES);
+ ipobs_hook(mp, IPOBS_HOOK_INBOUND, ALL_ZONES, dzone, ill, ipst);
+ }
+
+ if ((ip6h->ip6_vcf & IPV6_VERS_AND_FLOW_MASK) !=
+ IPV6_DEFAULT_VERS_AND_FLOW) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInWrongIPVersion);
+ ip_drop_input("ipIfStatsInWrongIPVersion", mp, ill);
+ freemsg(mp);
+ return;
+ }
+
+ /*
+ * For IPv6 we update ira_ip_hdr_length and ira_protocol as
+ * we parse the headers, starting with the hop-by-hop options header.
+ */
+ ira->ira_ip_hdr_length = IPV6_HDR_LEN;
+ if ((ira->ira_protocol = ip6h->ip6_nxt) == IPPROTO_HOPOPTS) {
+ ip6_hbh_t *hbhhdr;
+ uint_t ehdrlen;
+ uint8_t *optptr;
+
+ if (pkt_len < IPV6_HDR_LEN + MIN_EHDR_LEN) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
+ ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
+ freemsg(mp);
+ return;
+ }
+ if (mp->b_cont != NULL &&
+ rptr + IPV6_HDR_LEN + MIN_EHDR_LEN > mp->b_wptr) {
+ ip6h = ip_pullup(mp, IPV6_HDR_LEN + MIN_EHDR_LEN, ira);
+ if (ip6h == NULL) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards", mp, ill);
+ freemsg(mp);
+ return;
+ }
+ }
+ hbhhdr = (ip6_hbh_t *)&ip6h[1];
+ ehdrlen = 8 * (hbhhdr->ip6h_len + 1);
+
+ if (pkt_len < IPV6_HDR_LEN + ehdrlen) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
+ ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
+ freemsg(mp);
+ return;
+ }
+ if (mp->b_cont != NULL &&
+ rptr + IPV6_HDR_LEN + ehdrlen > mp->b_wptr) {
+ ip6h = ip_pullup(mp, IPV6_HDR_LEN + ehdrlen, ira);
+ if (ip6h == NULL) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards", mp, ill);
+ freemsg(mp);
+ return;
+ }
+ hbhhdr = (ip6_hbh_t *)&ip6h[1];
+ }
+
+ /*
+ * Update ira_ip_hdr_length to skip the hop-by-hop header
+ * once we get to ip_fanout_v6
+ */
+ ira->ira_ip_hdr_length += ehdrlen;
+ ira->ira_protocol = hbhhdr->ip6h_nxt;
+
+ optptr = (uint8_t *)&hbhhdr[1];
+ switch (ip_process_options_v6(mp, ip6h, optptr,
+ ehdrlen - 2, IPPROTO_HOPOPTS, ira)) {
+ case -1:
+ /*
+ * Packet has been consumed and any
+ * needed ICMP messages sent.
+ */
+ return;
+ case 0:
+ /* no action needed */
+ break;
+ case 1:
+ /*
+			 * Known router alert. Make us handle it as local
+ * by setting the nexthop to be the all-host multicast
+ * address, and skip multicast membership filter by
+ * marking as a router alert.
+ */
+ ira->ira_flags |= IRAF_ROUTER_ALERT;
+ nexthop = ipv6_all_hosts_mcast;
+ break;
+ }
+ }
+
+ /*
+	 * Here we check to see if the machine is set up as
+ * L3 loadbalancer and if the incoming packet is for a VIP
+ *
+ * Check the following:
+ * - there is at least a rule
+ * - protocol of the packet is supported
+ *
+ * We don't load balance IPv6 link-locals.
+ */
+ if (ilb_has_rules(ilbs) && ILB_SUPP_L4(ira->ira_protocol) &&
+ !IN6_IS_ADDR_LINKLOCAL(&nexthop)) {
+ in6_addr_t lb_dst;
+ int lb_ret;
+
+ /* For convenience, we just pull up the mblk. */
+ if (mp->b_cont != NULL) {
+ if (pullupmsg(mp, -1) == 0) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards - pullupmsg",
+ mp, ill);
+ freemsg(mp);
+ return;
+ }
+ ip6h = (ip6_t *)mp->b_rptr;
+ }
+ lb_ret = ilb_check_v6(ilbs, ill, mp, ip6h, ira->ira_protocol,
+ (uint8_t *)ip6h + ira->ira_ip_hdr_length, &lb_dst);
+ if (lb_ret == ILB_DROPPED) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ILB_DROPPED", mp, ill);
+ freemsg(mp);
+ return;
+ }
+ if (lb_ret == ILB_BALANCED) {
+ /* Set the dst to that of the chosen server */
+ nexthop = lb_dst;
+ DB_CKSUMFLAGS(mp) = 0;
+ }
+ }
+
+ /* Can not use route cache with TX since the labels can differ */
+ if (ira->ira_flags & IRAF_SYSTEM_LABELED) {
+ if (IN6_IS_ADDR_MULTICAST(&nexthop)) {
+ ire = ire_multicast(ill);
+ } else if (IN6_IS_ADDR_LINKLOCAL(&nexthop)) {
+ ire = ire_linklocal(&nexthop, ill, ira,
+ (ill->ill_flags & ILLF_ROUTER), ipst);
+ } else {
+ /* Match destination and label */
+ ire = ire_route_recursive_v6(&nexthop, 0, NULL,
+ ALL_ZONES, ira->ira_tsl, MATCH_IRE_SECATTR,
+ (ill->ill_flags & ILLF_ROUTER), ira->ira_xmit_hint,
+ ipst, NULL, NULL, NULL);
+ }
+ /* Update the route cache so we do the ire_refrele */
+ ASSERT(ire != NULL);
+ if (rtc->rtc_ire != NULL)
+ ire_refrele(rtc->rtc_ire);
+ rtc->rtc_ire = ire;
+ rtc->rtc_ip6addr = nexthop;
+ } else if (IN6_ARE_ADDR_EQUAL(&nexthop, &rtc->rtc_ip6addr)) {
+ /* Use the route cache */
+ ASSERT(rtc->rtc_ire != NULL);
+ ire = rtc->rtc_ire;
+ } else {
+ /* Update the route cache */
+ if (IN6_IS_ADDR_MULTICAST(&nexthop)) {
+ ire = ire_multicast(ill);
+ } else if (IN6_IS_ADDR_LINKLOCAL(&nexthop)) {
+ ire = ire_linklocal(&nexthop, ill, ira,
+ (ill->ill_flags & ILLF_ROUTER), ipst);
+ } else {
+ ire = ire_route_recursive_dstonly_v6(&nexthop,
+ (ill->ill_flags & ILLF_ROUTER), ira->ira_xmit_hint,
+ ipst);
+ }
+ ASSERT(ire != NULL);
+ if (rtc->rtc_ire != NULL)
+ ire_refrele(rtc->rtc_ire);
+ rtc->rtc_ire = ire;
+ rtc->rtc_ip6addr = nexthop;
+ }
+
+ ire->ire_ib_pkt_count++;
+
+ /*
+ * Based on ire_type and ire_flags call one of:
+ * ire_recv_local_v6 - for IRE_LOCAL
+ * ire_recv_loopback_v6 - for IRE_LOOPBACK
+ * ire_recv_multirt_v6 - if RTF_MULTIRT
+	 *	ire_recv_noroute_v6 - if RTF_REJECT or RTF_BLACKHOLE
+ * ire_recv_multicast_v6 - for IRE_MULTICAST
+ * ire_recv_noaccept_v6 - for ire_noaccept ones
+ * ire_recv_forward_v6 - for the rest.
+ */
+
+ (*ire->ire_recvfn)(ire, mp, ip6h, ira);
+}
+#undef rptr
+
+/*
+ * ire_recvfn for IREs that need forwarding
+ */
+void
+ire_recv_forward_v6(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira)
+{
+ ip6_t *ip6h = (ip6_t *)iph_arg;
+ ill_t *ill = ira->ira_ill;
+ ip_stack_t *ipst = ill->ill_ipst;
+ iaflags_t iraflags = ira->ira_flags;
+ ill_t *dst_ill;
+ nce_t *nce;
+ uint32_t added_tx_len;
+ uint32_t mtu, iremtu;
+
+ if (iraflags & (IRAF_L2DST_MULTICAST|IRAF_L2DST_BROADCAST)) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
+ ip_drop_input("l2 multicast not forwarded", mp, ill);
+ freemsg(mp);
+ return;
+ }
+
+ if (!(ill->ill_flags & ILLF_ROUTER)) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
+ ip_drop_input("ipIfStatsForwProhibits", mp, ill);
+ freemsg(mp);
+ return;
+ }
+
+ /*
+ * Either ire_nce_capable or ire_dep_parent would be set for the IRE
+ * when it is found by ire_route_recursive, but that some other thread
+ * could have changed the routes with the effect of clearing
+ * ire_dep_parent. In that case we'd end up dropping the packet, or
+ * finding a new nce below.
+ * Get, allocate, or update the nce.
+ * We get a refhold on ire_nce_cache as a result of this to avoid races
+ * where ire_nce_cache is deleted.
+ *
+ * This ensures that we don't forward if the interface is down since
+ * ipif_down removes all the nces.
+ */
+ mutex_enter(&ire->ire_lock);
+ nce = ire->ire_nce_cache;
+ if (nce == NULL) {
+ /* Not yet set up - try to set one up */
+ mutex_exit(&ire->ire_lock);
+ (void) ire_revalidate_nce(ire);
+ mutex_enter(&ire->ire_lock);
+ nce = ire->ire_nce_cache;
+ if (nce == NULL) {
+ mutex_exit(&ire->ire_lock);
+ /* The ire_dep_parent chain went bad, or no memory */
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("No ire_dep_parent", mp, ill);
+ freemsg(mp);
+ return;
+ }
+ }
+ nce_refhold(nce);
+ mutex_exit(&ire->ire_lock);
+
+ if (nce->nce_is_condemned) {
+ nce_t *nce1;
+
+ nce1 = ire_handle_condemned_nce(nce, ire, NULL, ip6h, B_FALSE);
+ nce_refrele(nce);
+ if (nce1 == NULL) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("No nce", mp, ill);
+ freemsg(mp);
+ return;
+ }
+ nce = nce1;
+ }
+ dst_ill = nce->nce_ill;
+
+ /*
+ * Unless we are forwarding, drop the packet.
+ * Unlike IPv4 we don't allow source routed packets out the same
+ * interface when we are not a router.
+ * Note that ill_forward_set() will set the ILLF_ROUTER on
+ * all the group members when it gets an ipmp-ill or under-ill.
+ */
+ if (!(dst_ill->ill_flags & ILLF_ROUTER)) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
+ ip_drop_input("ipIfStatsForwProhibits", mp, ill);
+ freemsg(mp);
+ nce_refrele(nce);
+ return;
+ }
+
+ if (ire->ire_zoneid != GLOBAL_ZONEID && ire->ire_zoneid != ALL_ZONES) {
+ ire->ire_ib_pkt_count--;
+ /*
+ * Should only use IREs that are visible from the
+ * global zone for forwarding.
+ * For IPv6 any source route would have already been
+ * advanced in ip_fanout_v6
+ */
+ ire = ire_route_recursive_v6(&ip6h->ip6_dst, 0, NULL,
+ GLOBAL_ZONEID, ira->ira_tsl, MATCH_IRE_SECATTR,
+ (ill->ill_flags & ILLF_ROUTER), ira->ira_xmit_hint, ipst,
+ NULL, NULL, NULL);
+ ire->ire_ib_pkt_count++;
+ (*ire->ire_recvfn)(ire, mp, ip6h, ira);
+ ire_refrele(ire);
+ nce_refrele(nce);
+ return;
+ }
+ /*
+ * ipIfStatsHCInForwDatagrams should only be increment if there
+ * will be an attempt to forward the packet, which is why we
+ * increment after the above condition has been checked.
+ */
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams);
+
+ /* Initiate Read side IPPF processing */
+ if (IPP_ENABLED(IPP_FWD_IN, ipst)) {
+ /* ip_process translates an IS_UNDER_IPMP */
+ mp = ip_process(IPP_FWD_IN, mp, ill, ill);
+ if (mp == NULL) {
+ /* ip_drop_packet and MIB done */
+ ip2dbg(("ire_recv_forward_v6: pkt dropped/deferred "
+ "during IPPF processing\n"));
+ nce_refrele(nce);
+ return;
+ }
+ }
+
+ DTRACE_PROBE4(ip6__forwarding__start,
+ ill_t *, ill, ill_t *, dst_ill, ip6_t *, ip6h, mblk_t *, mp);
+
+ if (HOOKS6_INTERESTED_FORWARDING(ipst)) {
+ int error;
+
+ FW_HOOKS(ipst->ips_ip6_forwarding_event,
+ ipst->ips_ipv6firewall_forwarding,
+ ill, dst_ill, ip6h, mp, mp, 0, ipst, error);
+
+ DTRACE_PROBE1(ip6__forwarding__end, mblk_t *, mp);
+
+ if (mp == NULL) {
+ nce_refrele(nce);
+ return;
+ }
+ /*
+ * Even if the destination was changed by the filter we use the
+ * forwarding decision that was made based on the address
+ * in ip_input.
+ */
+
+ /* Might have changed */
+ ip6h = (ip6_t *)mp->b_rptr;
+ ira->ira_pktlen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
+ }
+
+ /* Packet is being forwarded. Turning off hwcksum flag. */
+ DB_CKSUMFLAGS(mp) = 0;
+
+ /*
+ * Per RFC 3513 section 2.5.2, we must not forward packets with
+ * an unspecified source address.
+ * The loopback address check for both src and dst has already
+ * been checked in ip_input_v6
+ * In the future one can envision adding RPF checks using number 3.
+ */
+ switch (ipst->ips_src_check) {
+ case 0:
+ break;
+ case 1:
+ case 2:
+ if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) ||
+ IN6_IS_ADDR_MULTICAST(&ip6h->ip6_src)) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors);
+ ip_drop_input("ipIfStatsInAddrErrors", mp, ill);
+ nce_refrele(nce);
+ freemsg(mp);
+ return;
+ }
+ break;
+ }
+
+ /*
+ * Check to see if we're forwarding the packet to a
+ * different link from which it came. If so, check the
+ * source and destination addresses since routers must not
+ * forward any packets with link-local source or
+ * destination addresses to other links. Otherwise (if
+ * we're forwarding onto the same link), conditionally send
+ * a redirect message.
+ */
+ if (!IS_ON_SAME_LAN(dst_ill, ill)) {
+ if (IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_dst) ||
+ IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src)) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors);
+ ip_drop_input("ipIfStatsInAddrErrors", mp, ill);
+ freemsg(mp);
+ nce_refrele(nce);
+ return;
+ }
+ /* TBD add site-local check at site boundary? */
+ } else if (ipst->ips_ipv6_send_redirects) {
+ ip_send_potential_redirect_v6(mp, ip6h, ire, ira);
+ }
+
+ added_tx_len = 0;
+ if (iraflags & IRAF_SYSTEM_LABELED) {
+ mblk_t *mp1;
+ uint32_t old_pkt_len = ira->ira_pktlen;
+
+ /*
+ * Check if it can be forwarded and add/remove
+ * CIPSO options as needed.
+ */
+ if ((mp1 = tsol_ip_forward(ire, mp, ira)) == NULL) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
+ ip_drop_input("tsol_ip_forward", mp, ill);
+ freemsg(mp);
+ nce_refrele(nce);
+ return;
+ }
+ /*
+ * Size may have changed. Remember amount added in case
+ * ip_fragment needs to send an ICMP too big.
+ */
+ mp = mp1;
+ ip6h = (ip6_t *)mp->b_rptr;
+ ira->ira_pktlen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
+ ira->ira_ip_hdr_length = IPV6_HDR_LEN;
+ if (ira->ira_pktlen > old_pkt_len)
+ added_tx_len = ira->ira_pktlen - old_pkt_len;
+ }
+
+ mtu = dst_ill->ill_mtu;
+ if ((iremtu = ire->ire_metrics.iulp_mtu) != 0 && iremtu < mtu)
+ mtu = iremtu;
+ ip_forward_xmit_v6(nce, mp, ip6h, ira, mtu, added_tx_len);
+ nce_refrele(nce);
+ return;
+
+}
+
+/*
+ * Used for sending out unicast and multicast packets that are
+ * forwarded.
+ *
+ * nce		- neighbor cache entry for the next hop; nce->nce_ill is the
+ *		  outgoing interface (dst_ill).
+ * mp/ip6h	- the packet being forwarded; ip6h points into mp.
+ * ira		- receive-side attributes for the inbound half.
+ * mtu		- path MTU to enforce (caller computed min of ill MTU and
+ *		  any IRE metric MTU).
+ * added_tx_len - bytes added by tsol_ip_forward (if any), so an ICMPv6
+ *		  "packet too big" can report an MTU leaving room for the
+ *		  CIPSO option on future packets.
+ *
+ * Consumes mp on all paths.  The caller retains its reference on nce.
+ */
+void
+ip_forward_xmit_v6(nce_t *nce, mblk_t *mp, ip6_t *ip6h, ip_recv_attr_t *ira,
+    uint32_t mtu, uint32_t added_tx_len)
+{
+	ill_t		*dst_ill = nce->nce_ill;
+	uint32_t	pkt_len;
+	iaflags_t	iraflags = ira->ira_flags;
+	ip_stack_t	*ipst = dst_ill->ill_ipst;
+
+	/*
+	 * Decrement the hop limit; if it would reach zero, drop and send
+	 * an ICMPv6 time-exceeded (hop limit) back to the source.
+	 */
+	if (ip6h->ip6_hops-- <= 1) {
+		BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards);
+		ip_drop_input("ICMP6_TIME_EXCEED_TRANSIT", mp, ira->ira_ill);
+		icmp_time_exceeded_v6(mp, ICMP6_TIME_EXCEED_TRANSIT, B_FALSE,
+		    ira);
+		return;
+	}
+
+	/* Initiate Write side IPPF processing before any fragmentation */
+	if (IPP_ENABLED(IPP_FWD_OUT, ipst)) {
+		/* ip_process translates an IS_UNDER_IPMP */
+		mp = ip_process(IPP_FWD_OUT, mp, dst_ill, dst_ill);
+		if (mp == NULL) {
+			/* ip_drop_packet and MIB done */
+			ip2dbg(("ire_recv_forward_v6: pkt dropped/deferred" \
+			    " during IPPF processing\n"));
+			return;
+		}
+	}
+
+	pkt_len = ira->ira_pktlen;
+
+	BUMP_MIB(dst_ill->ill_ip_mib, ipIfStatsHCOutForwDatagrams);
+
+	/*
+	 * No fragmentation when forwarding: if the packet exceeds the
+	 * path MTU we drop it and report the MTU via ICMPv6 packet-too-big.
+	 */
+	if (pkt_len > mtu) {
+		BUMP_MIB(dst_ill->ill_ip_mib, ipIfStatsOutFragFails);
+		ip_drop_output("ipIfStatsOutFragFails", mp, dst_ill);
+		if (iraflags & IRAF_SYSTEM_LABELED) {
+			/*
+			 * Remove any CIPSO option added by
+			 * tsol_ip_forward, and make sure we report
+			 * a path MTU so that there
+			 * is room to add such a CIPSO option for future
+			 * packets.
+			 */
+			mtu = tsol_pmtu_adjust(mp, mtu, added_tx_len, AF_INET6);
+		}
+		icmp_pkt2big_v6(mp, mtu, B_TRUE, ira);
+		return;
+	}
+
+	ASSERT(pkt_len ==
+	    ntohs(((ip6_t *)mp->b_rptr)->ip6_plen) + IPV6_HDR_LEN);
+
+	if (iraflags & IRAF_LOOPBACK_COPY) {
+		/*
+		 * IXAF_NO_LOOP_ZONEID is not set hence 6th arg
+		 * is don't care
+		 */
+		(void) ip_postfrag_loopcheck(mp, nce,
+		    (IXAF_LOOPBACK_COPY | IXAF_NO_DEV_FLOW_CTL),
+		    pkt_len, ira->ira_xmit_hint, GLOBAL_ZONEID, 0, NULL);
+	} else {
+		(void) ip_xmit(mp, nce, IXAF_NO_DEV_FLOW_CTL,
+		    pkt_len, ira->ira_xmit_hint, GLOBAL_ZONEID, 0, NULL);
+	}
+}
+
+/*
+ * ire_recvfn for RTF_REJECT and RTF_BLACKHOLE routes, including IRE_NOROUTE,
+ * which is what ire_route_recursive returns when there is no matching ire.
+ * Send ICMP unreachable unless blackhole.
+ *
+ * Consumes mp on all paths.
+ */
+void
+ire_recv_noroute_v6(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira)
+{
+	ip6_t		*ip6h = (ip6_t *)iph_arg;
+	ill_t		*ill = ira->ira_ill;
+	ip_stack_t	*ipst = ill->ill_ipst;
+
+	/* Would we have forwarded this packet if we had a route? */
+	if (ira->ira_flags & (IRAF_L2DST_MULTICAST|IRAF_L2DST_BROADCAST)) {
+		BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
+		ip_drop_input("l2 multicast not forwarded", mp, ill);
+		freemsg(mp);
+		return;
+	}
+
+	/* Not acting as a router on this interface: prohibited, not noroute */
+	if (!(ill->ill_flags & ILLF_ROUTER)) {
+		BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
+		ip_drop_input("ipIfStatsForwProhibits", mp, ill);
+		freemsg(mp);
+		return;
+	}
+	/*
+	 * If we had a route this could have been forwarded. Count as such.
+	 *
+	 * ipIfStatsHCInForwDatagrams should only be increment if there
+	 * will be an attempt to forward the packet, which is why we
+	 * increment after the above condition has been checked.
+	 */
+	BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams);
+
+	BUMP_MIB(ill->ill_ip_mib, ipIfStatsInNoRoutes);
+
+	/* Let routing sockets know about the miss */
+	ip_rts_change_v6(RTM_MISS, &ip6h->ip6_dst, 0, 0, 0, 0, 0, 0, RTA_DST,
+	    ipst);
+
+	/* Blackhole routes drop silently; reject/noroute report back */
+	if (ire->ire_flags & RTF_BLACKHOLE) {
+		ip_drop_input("ipIfStatsInNoRoutes RTF_BLACKHOLE", mp, ill);
+		freemsg(mp);
+	} else {
+		ip_drop_input("ipIfStatsInNoRoutes RTF_REJECT", mp, ill);
+
+		icmp_unreachable_v6(mp, ICMP6_DST_UNREACH_NOROUTE, B_FALSE,
+		    ira);
+	}
+}
+
+/*
+ * ire_recvfn for IRE_LOCALs marked with ire_noaccept. Such IREs are used for
+ * VRRP when in noaccept mode.
+ * We silently drop packets except for Neighbor Solicitations and
+ * Neighbor Advertisements.
+ *
+ * Consumes mp (either frees it or hands it to ire_recv_local_v6).
+ */
+void
+ire_recv_noaccept_v6(ire_t *ire, mblk_t *mp, void *iph_arg,
+    ip_recv_attr_t *ira)
+{
+	ip6_t		*ip6h = (ip6_t *)iph_arg;
+	ill_t		*ill = ira->ira_ill;
+	icmp6_t		*icmp6;
+	int		ip_hdr_length;
+
+	/* Anything that is not ICMPv6 is silently discarded */
+	if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
+		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+		ip_drop_input("ipIfStatsInDiscards - noaccept", mp, ill);
+		freemsg(mp);
+		return;
+	}
+	ip_hdr_length = ira->ira_ip_hdr_length;
+	/*
+	 * Make sure the ICMPv6 header is in the first mblk so we can look
+	 * at icmp6_type.  If the whole packet is too short it is truncated;
+	 * otherwise pull the header up into the first mblk.
+	 */
+	if ((mp->b_wptr - mp->b_rptr) < (ip_hdr_length + ICMP6_MINLEN)) {
+		if (ira->ira_pktlen < (ip_hdr_length + ICMP6_MINLEN)) {
+			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
+			ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
+			freemsg(mp);
+			return;
+		}
+		ip6h = ip_pullup(mp, ip_hdr_length + ICMP6_MINLEN, ira);
+		if (ip6h == NULL) {
+			BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
+			freemsg(mp);
+			return;
+		}
+	}
+	icmp6 = (icmp6_t *)(&mp->b_rptr[ip_hdr_length]);
+
+	/* Only NS and NA are accepted in noaccept mode */
+	if (icmp6->icmp6_type != ND_NEIGHBOR_SOLICIT &&
+	    icmp6->icmp6_type != ND_NEIGHBOR_ADVERT) {
+		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+		ip_drop_input("ipIfStatsInDiscards - noaccept", mp, ill);
+		freemsg(mp);
+		return;
+	}
+	ire_recv_local_v6(ire, mp, ip6h, ira);
+}
+
+/*
+ * ire_recvfn for IRE_MULTICAST.
+ *
+ * Bumps multicast MIB counters, tags the packet IRAF_MULTICAST, enforces
+ * the IPMP nominated-cast-ill rule (switching ira_ill to the upper ill
+ * when needed), checks group membership unless the packet carried a
+ * router alert, and finally hands the packet to ip_input_local_v6 with
+ * ira_zoneid set to ALL_ZONES.  Consumes mp on all paths.
+ */
+void
+ire_recv_multicast_v6(ire_t *ire, mblk_t *mp, void *iph_arg,
+    ip_recv_attr_t *ira)
+{
+	ip6_t		*ip6h = (ip6_t *)iph_arg;
+	ill_t		*ill = ira->ira_ill;
+
+	ASSERT(ire->ire_ill == ira->ira_ill);
+
+	BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInMcastPkts);
+	UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInMcastOctets, ira->ira_pktlen);
+
+	/* Tag for higher-level protocols */
+	ira->ira_flags |= IRAF_MULTICAST;
+
+	/*
+	 * So that we don't end up with dups, only one ill in an IPMP group is
+	 * nominated to receive multicast traffic.
+	 * If we have no cast_ill we are liberal and accept everything.
+	 */
+	if (IS_UNDER_IPMP(ill)) {
+		ip_stack_t *ipst = ill->ill_ipst;
+
+		/* For an under ill_grp can change under lock */
+		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
+		if (!ill->ill_nom_cast && ill->ill_grp != NULL &&
+		    ill->ill_grp->ig_cast_ill != NULL) {
+			rw_exit(&ipst->ips_ill_g_lock);
+			ip_drop_input("not on cast ill", mp, ill);
+			freemsg(mp);
+			return;
+		}
+		rw_exit(&ipst->ips_ill_g_lock);
+		/*
+		 * We switch to the upper ill so that mrouter and hasmembers
+		 * can operate on upper here and in ip_input_multicast.
+		 */
+		ill = ipmp_ill_hold_ipmp_ill(ill);
+		if (ill != NULL) {
+			ASSERT(ill != ira->ira_ill);
+			ASSERT(ire->ire_ill == ira->ira_ill);
+			ira->ira_ill = ill;
+			ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex;
+		} else {
+			/* No upper ill; continue on the under ill */
+			ill = ira->ira_ill;
+		}
+	}
+
+#ifdef notdef
+	/*
+	 * Compiled out: IPv6 multicast-forwarding hook (kept for reference).
+	 *
+	 * Check if we are a multicast router - send ip_mforward a copy of
+	 * the packet.
+	 * Due to mroute_decap tunnels we consider forwarding packets even if
+	 * mrouted has not joined the allmulti group on this interface.
+	 */
+	if (ipst->ips_ip_g_mrouter) {
+		int retval;
+
+		/*
+		 * Clear the indication that this may have hardware
+		 * checksum as we are not using it for forwarding.
+		 */
+		DB_CKSUMFLAGS(mp) = 0;
+
+		/*
+		 * ip_mforward helps us make these distinctions: If received
+		 * on tunnel and not IGMP, then drop.
+		 * If IGMP packet, then don't check membership
+		 * If received on a phyint and IGMP or PIM, then
+		 * don't check membership
+		 */
+		retval = ip_mforward_v6(mp, ira);
+		/* ip_mforward updates mib variables if needed */
+
+		switch (retval) {
+		case 0:
+			/*
+			 * pkt is okay and arrived on phyint.
+			 */
+			break;
+		case -1:
+			/* pkt is mal-formed, toss it */
+			freemsg(mp);
+			goto done;
+		case 1:
+			/*
+			 * pkt is okay and arrived on a tunnel
+			 *
+			 * If we are running a multicast router
+			 * we need to see all mld packets, which
+			 * are marked with router alerts.
+			 */
+			if (ira->ira_flags & IRAF_ROUTER_ALERT)
+				goto forus;
+			ip_drop_input("Multicast on tunnel ignored", mp, ill);
+			freemsg(mp);
+			goto done;
+		}
+	}
+#endif /* notdef */
+
+	/*
+	 * If this was a router alert we skip the group membership check.
+	 */
+	if (ira->ira_flags & IRAF_ROUTER_ALERT)
+		goto forus;
+
+	/*
+	 * Check if we have members on this ill. This is not necessary for
+	 * correctness because even if the NIC/GLD had a leaky filter, we
+	 * filter before passing to each conn_t.
+	 */
+	if (!ill_hasmembers_v6(ill, &ip6h->ip6_dst)) {
+		/*
+		 * Nobody interested
+		 *
+		 * This might just be caused by the fact that
+		 * multiple IP Multicast addresses map to the same
+		 * link layer multicast - no need to increment counter!
+		 */
+		ip_drop_input("Multicast with no members", mp, ill);
+		freemsg(mp);
+		goto done;
+	}
+forus:
+	ip2dbg(("ire_recv_multicast_v6: multicast for us\n"));
+
+	/*
+	 * After reassembly and IPsec we will need to duplicate the
+	 * multicast packet for all matching zones on the ill.
+	 */
+	ira->ira_zoneid = ALL_ZONES;
+
+	/* Reassemble on the ill on which the packet arrived */
+	ip_input_local_v6(ire, mp, ip6h, ira);
+done:
+	/* Undo any switch to the IPMP upper ill: release it and restore ira */
+	if (ill != ire->ire_ill) {
+		ill_refrele(ill);
+		ira->ira_ill = ire->ire_ill;
+		ira->ira_ruifindex = ira->ira_ill->ill_phyint->phyint_ifindex;
+	}
+}
+
+/*
+ * ire_recvfn for IRE_OFFLINK with RTF_MULTIRT.
+ * Drop packets since we don't forward out multirt routes.
+ */
+/* ARGSUSED */
+void
+ire_recv_multirt_v6(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira)
+{
+	ill_t	*in_ill = ira->ira_ill;
+
+	/* Count the packet as unroutable and discard it. */
+	BUMP_MIB(in_ill->ill_ip_mib, ipIfStatsInNoRoutes);
+	ip_drop_input("Not forwarding out MULTIRT", mp, in_ill);
+	freemsg(mp);
+}
+
+/*
+ * ire_recvfn for IRE_LOOPBACK. This is only used when a FW_HOOK
+ * has rewritten the packet to have a loopback destination address (We
+ * filter out packet with a loopback destination from arriving over the wire).
+ * We don't know what zone to use, thus we always use the GLOBAL_ZONEID.
+ */
+void
+ire_recv_loopback_v6(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira)
+{
+	ip6_t	*ip6h = (ip6_t *)iph_arg;
+	ill_t	*orig_ill = ira->ira_ill;
+	ill_t	*lo_ill = ire->ire_ill;
+
+	ira->ira_zoneid = GLOBAL_ZONEID;
+
+	/* Already on the ill hosting the address; deliver directly. */
+	if (lo_ill == orig_ill) {
+		ip_input_local_v6(ire, mp, ip6h, ira);
+		return;
+	}
+
+	/*
+	 * Temporarily point ira_ill at the ill on which the IP address
+	 * is hosted for local delivery, then restore it.  No ill hold is
+	 * needed since we hold the ire.
+	 */
+	ASSERT(ira->ira_ill == ira->ira_rill);
+	ira->ira_ill = lo_ill;
+
+	ip_input_local_v6(ire, mp, ip6h, ira);
+
+	ASSERT(ira->ira_ill == lo_ill);
+	ira->ira_ill = orig_ill;
+}
+
+/*
+ * ire_recvfn for IRE_LOCAL.
+ *
+ * Records address use (for DAD), selects the IRE's zoneid, and handles
+ * delivery when the packet arrived on an ill other than the one hosting
+ * the destination address (multihoming / IPMP).  Consumes mp.
+ */
+void
+ire_recv_local_v6(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira)
+{
+	ip6_t		*ip6h = (ip6_t *)iph_arg;
+	ill_t		*ill = ira->ira_ill;
+	ill_t		*ire_ill = ire->ire_ill;
+
+	/* Make a note for DAD that this address is in use */
+	ire->ire_last_used_time = lbolt;
+
+	/* Only target the IRE_LOCAL with the right zoneid. */
+	ira->ira_zoneid = ire->ire_zoneid;
+
+	/*
+	 * If the packet arrived on the wrong ill, we check that
+	 * this is ok.
+	 * If it is, then we ensure that we do the reassembly on
+	 * the ill on which the address is hosted. We keep ira_rill as
+	 * the one on which the packet arrived, so that IP_PKTINFO and
+	 * friends can report this.
+	 */
+	if (ire_ill != ill) {
+		ire_t *new_ire;
+
+		new_ire = ip_check_multihome(&ip6h->ip6_dst, ire, ill);
+		if (new_ire == NULL) {
+			/* Drop packet */
+			BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
+			ip_drop_input("ipIfStatsInForwProhibits", mp, ill);
+			freemsg(mp);
+			return;
+		}
+		/*
+		 * Update ira_ill to be the ILL on which the IP address
+		 * is hosted. No need to hold the ill since we have a
+		 * hold on the ire. Note that we do the switch even if
+		 * new_ire == ire (for IPMP, ire would be the one corresponding
+		 * to the IPMP ill).
+		 */
+		ASSERT(ira->ira_ill == ira->ira_rill);
+		ira->ira_ill = new_ire->ire_ill;
+
+		/* ira_ruifindex tracks the upper for ira_rill */
+		if (IS_UNDER_IPMP(ill))
+			ira->ira_ruifindex = ill_get_upper_ifindex(ill);
+
+		ip_input_local_v6(new_ire, mp, ip6h, ira);
+
+		/* Restore */
+		ASSERT(ira->ira_ill == new_ire->ire_ill);
+		ira->ira_ill = ill;
+		ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex;
+
+		/* ip_check_multihome may have returned a new held ire */
+		if (new_ire != ire)
+			ire_refrele(new_ire);
+		return;
+	}
+
+	ip_input_local_v6(ire, mp, ip6h, ira);
+}
+
+/*
+ * Common function for packets arriving for the host. Handles
+ * checksum verification, reassembly checks, etc.
+ */
+static void
+ip_input_local_v6(ire_t *ire, mblk_t *mp, ip6_t *ip6h, ip_recv_attr_t *ira)
+{
+	/*
+	 * Multicast needs per-zone duplication (shared-IP zones) before
+	 * the fanout; everything else goes straight to ip_fanout_v6.
+	 */
+	if (ira->ira_flags & IRAF_MULTICAST)
+		ip_input_multicast_v6(ire, mp, ip6h, ira);
+	else
+		ip_fanout_v6(mp, ip6h, ira);
+}
+
+/*
+ * Handle multiple zones which want to receive the same multicast packets
+ * on this ill by delivering a packet to each of them.
+ *
+ * Note that for packets delivered to transports we could instead do this
+ * as part of the fanout code, but since we need to handle icmp_inbound
+ * it is simpler to have multicast work the same as IPv4 broadcast.
+ *
+ * The ip_fanout matching for multicast matches based on ilm independent of
+ * zoneid since the zoneid restriction is applied when joining a multicast
+ * group.
+ *
+ * Consumes mp; a copy is made (copymsg) for every zone other than the
+ * global zone that has members.
+ */
+/* ARGSUSED */
+static void
+ip_input_multicast_v6(ire_t *ire, mblk_t *mp, ip6_t *ip6h, ip_recv_attr_t *ira)
+{
+	ill_t		*ill = ira->ira_ill;
+	iaflags_t	iraflags = ira->ira_flags;
+	ip_stack_t	*ipst = ill->ill_ipst;
+	netstack_t	*ns = ipst->ips_netstack;
+	zoneid_t	zoneid;
+	mblk_t		*mp1;
+	ip6_t		*ip6h1;
+
+	/* ire_recv_multicast has switched to the upper ill for IPMP */
+	ASSERT(!IS_UNDER_IPMP(ill));
+
+	/*
+	 * If we don't have more than one shared-IP zone, or if
+	 * there are no members in anything but the global zone,
+	 * then just set the zoneid and proceed.
+	 */
+	if (ns->netstack_numzones == 1 ||
+	    !ill_hasmembers_otherzones_v6(ill, &ip6h->ip6_dst,
+	    GLOBAL_ZONEID)) {
+		ira->ira_zoneid = GLOBAL_ZONEID;
+
+		/* If sender didn't want this zone to receive it, drop */
+		if ((iraflags & IRAF_NO_LOOP_ZONEID_SET) &&
+		    ira->ira_no_loop_zoneid == ira->ira_zoneid) {
+			ip_drop_input("Multicast but wrong zoneid", mp, ill);
+			freemsg(mp);
+			return;
+		}
+		ip_fanout_v6(mp, ip6h, ira);
+		return;
+	}
+
+	/*
+	 * Here we loop over all zoneids that have members in the group
+	 * and deliver a packet to ip_fanout for each zoneid.
+	 *
+	 * First find any members in the lowest numeric zoneid by looking for
+	 * first zoneid larger than -1 (ALL_ZONES).
+	 * We terminate the loop when we receive -1 (ALL_ZONES).
+	 */
+	zoneid = ill_hasmembers_nextzone_v6(ill, &ip6h->ip6_dst, ALL_ZONES);
+	for (; zoneid != ALL_ZONES;
+	    zoneid = ill_hasmembers_nextzone_v6(ill, &ip6h->ip6_dst, zoneid)) {
+		/*
+		 * Avoid an extra copymsg/freemsg by skipping global zone here
+		 * and doing that at the end.
+		 */
+		if (zoneid == GLOBAL_ZONEID)
+			continue;
+
+		ira->ira_zoneid = zoneid;
+
+		/* If sender didn't want this zone to receive it, skip */
+		if ((iraflags & IRAF_NO_LOOP_ZONEID_SET) &&
+		    ira->ira_no_loop_zoneid == ira->ira_zoneid)
+			continue;
+
+		/* Deliver a copy; the original is kept for the global zone */
+		mp1 = copymsg(mp);
+		if (mp1 == NULL) {
+			/* Failed to deliver to one zone */
+			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+			ip_drop_input("ipIfStatsInDiscards", mp, ill);
+			continue;
+		}
+		ip6h1 = (ip6_t *)mp1->b_rptr;
+		ip_fanout_v6(mp1, ip6h1, ira);
+	}
+
+	/* Do the main ire */
+	ira->ira_zoneid = GLOBAL_ZONEID;
+	/* If sender didn't want this zone to receive it, drop */
+	if ((iraflags & IRAF_NO_LOOP_ZONEID_SET) &&
+	    ira->ira_no_loop_zoneid == ira->ira_zoneid) {
+		ip_drop_input("Multicast but wrong zoneid", mp, ill);
+		freemsg(mp);
+	} else {
+		ip_fanout_v6(mp, ip6h, ira);
+	}
+}
+
+
+/*
+ * Determine the zoneid and IRAF_TX_MAC_EXEMPTABLE if trusted extensions
+ * is in use. Updates ira_zoneid and ira_flags as a result.
+ *
+ * Only called when IRAF_SYSTEM_LABELED is set (see the ASSERT).  The
+ * caller guarantees that for TCP/UDP/SCTP the first 4 bytes of the ULP
+ * header (the ports) are present in the first mblk.
+ */
+static void
+ip_fanout_tx_v6(mblk_t *mp, ip6_t *ip6h, uint8_t protocol, uint_t ip_hdr_length,
+    ip_recv_attr_t *ira)
+{
+	uint16_t	*up;
+	uint16_t	lport;
+	zoneid_t	zoneid;
+
+	ASSERT(ira->ira_flags & IRAF_SYSTEM_LABELED);
+
+	/*
+	 * If the packet is unlabeled we might allow read-down
+	 * for MAC_EXEMPT. Below we clear this if it is a multi-level
+	 * port (MLP).
+	 * Note that ira_tsl can be NULL here.
+	 */
+	if (ira->ira_tsl != NULL && ira->ira_tsl->tsl_flags & TSLF_UNLABELED)
+		ira->ira_flags |= IRAF_TX_MAC_EXEMPTABLE;
+
+	/* A specific zoneid was already determined; nothing more to do */
+	if (ira->ira_zoneid != ALL_ZONES)
+		return;
+
+	ira->ira_flags |= IRAF_TX_SHARED_ADDR;
+
+	/* up points at the start of the ULP header (source port first) */
+	up = (uint16_t *)((uchar_t *)ip6h + ip_hdr_length);
+	switch (protocol) {
+	case IPPROTO_TCP:
+	case IPPROTO_SCTP:
+	case IPPROTO_UDP:
+		/* Caller ensures this */
+		ASSERT(((uchar_t *)ip6h) + ip_hdr_length +4 <= mp->b_wptr);
+
+		/*
+		 * Only these transports support MLP.
+		 * We know their destination port number is at
+		 * the same offset in the header.
+		 */
+		lport = up[1];
+
+		/*
+		 * No need to handle exclusive-stack zones
+		 * since ALL_ZONES only applies to the shared IP instance.
+		 */
+		zoneid = tsol_mlp_findzone(protocol, lport);
+		/*
+		 * If no shared MLP is found, tsol_mlp_findzone returns
+		 * ALL_ZONES. In that case, we assume it's SLP, and
+		 * search for the zone based on the packet label.
+		 *
+		 * If there is such a zone, we prefer to find a
+		 * connection in it. Otherwise, we look for a
+		 * MAC-exempt connection in any zone whose label
+		 * dominates the default label on the packet.
+		 */
+		if (zoneid == ALL_ZONES)
+			zoneid = tsol_attr_to_zoneid(ira);
+		else
+			ira->ira_flags &= ~IRAF_TX_MAC_EXEMPTABLE;
+		break;
+	default:
+		/* Handle shared address for other protocols */
+		zoneid = tsol_attr_to_zoneid(ira);
+		break;
+	}
+	ira->ira_zoneid = zoneid;
+}
+
+/*
+ * Increment checksum failure statistics
+ */
+static void
+ip_input_cksum_err_v6(uint8_t protocol, uint16_t hck_flags, ill_t *ill)
+{
+	ip_stack_t	*ipst = ill->ill_ipst;
+	boolean_t	full = (hck_flags & HCK_FULLCKSUM) != 0;
+	boolean_t	partial = (hck_flags & HCK_PARTIALCKSUM) != 0;
+
+	switch (protocol) {
+	case IPPROTO_ICMPV6:
+		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInMsgs);
+		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
+		break;
+	case IPPROTO_TCP:
+		BUMP_MIB(ill->ill_ip_mib, tcpIfStatsInErrs);
+		/* Attribute the failure to the checksum method used */
+		if (full)
+			IP6_STAT(ipst, ip6_tcp_in_full_hw_cksum_err);
+		else if (partial)
+			IP6_STAT(ipst, ip6_tcp_in_part_hw_cksum_err);
+		else
+			IP6_STAT(ipst, ip6_tcp_in_sw_cksum_err);
+		break;
+	case IPPROTO_UDP:
+		BUMP_MIB(ill->ill_ip_mib, udpIfStatsInCksumErrs);
+		if (full)
+			IP6_STAT(ipst, ip6_udp_in_full_hw_cksum_err);
+		else if (partial)
+			IP6_STAT(ipst, ip6_udp_in_part_hw_cksum_err);
+		else
+			IP6_STAT(ipst, ip6_udp_in_sw_cksum_err);
+		break;
+	default:
+		ASSERT(0);
+		break;
+	}
+}
+
+/* Calculate the IPv6 pseudo-header checksum for TCP, UDP, and ICMPV6 */
+uint32_t
+ip_input_cksum_pseudo_v6(ip6_t *ip6h, ip_recv_attr_t *ira)
+{
+	const uint16_t	*w = (const uint16_t *)ip6h;
+	uint32_t	sum;
+	uint_t		hdrlen = ira->ira_ip_hdr_length;
+
+	/*
+	 * Partial one's-complement sum over the source and destination
+	 * addresses: 16-bit words 4..19 of the IPv6 header.
+	 */
+	sum = w[4] + w[5] + w[6] + w[7] +
+	    w[8] + w[9] + w[10] + w[11] +
+	    w[12] + w[13] + w[14] + w[15] +
+	    w[16] + w[17] + w[18] + w[19];
+
+	/* Add the protocol-specific length and protocol contribution */
+	switch (ira->ira_protocol) {
+	case IPPROTO_TCP:
+		sum += htons(ira->ira_pktlen - hdrlen) + IP_TCP_CSUM_COMP;
+		break;
+	case IPPROTO_UDP: {
+		udpha_t *udpha;
+
+		/* UDP carries its own length field (network byte order) */
+		udpha = (udpha_t *)((uchar_t *)ip6h + hdrlen);
+		sum += udpha->uha_length + IP_UDP_CSUM_COMP;
+		break;
+	}
+	case IPPROTO_ICMPV6:
+		sum += htons(ira->ira_pktlen - hdrlen) + IP_ICMPV6_CSUM_COMP;
+		break;
+	default:
+		/* No pseudo-header checksum defined for other protocols */
+		sum = 0;
+		break;
+	}
+	return (sum);
+}
+
+
+/*
+ * Software verification of the ULP checksums.
+ * Returns B_TRUE if ok.
+ * Increments statistics if failed.
+ */
+static boolean_t
+ip_input_sw_cksum_v6(mblk_t *mp, ip6_t *ip6h, ip_recv_attr_t *ira)
+{
+	uint32_t	sum;
+
+	IP6_STAT(ira->ira_ill->ill_ipst, ip6_in_sw_cksum);
+
+	ASSERT(ira->ira_protocol == IPPROTO_TCP ||
+	    ira->ira_protocol == IPPROTO_UDP ||
+	    ira->ira_protocol == IPPROTO_ICMPV6);
+
+	/* Fold the ULP data into the pseudo-header sum; 0 means valid */
+	sum = IP_CSUM(mp, ira->ira_ip_hdr_length,
+	    ip_input_cksum_pseudo_v6(ip6h, ira));
+	if (sum != 0) {
+		ip_input_cksum_err_v6(ira->ira_protocol, 0, ira->ira_ill);
+		return (B_FALSE);
+	}
+	return (B_TRUE);
+}
+
+/*
+ * Verify the ULP checksums.
+ * Returns B_TRUE if ok, or if the ULP doesn't have a well-defined checksum
+ * algorithm.
+ * Increments statistics if failed.
+ *
+ * Uses hardware-computed checksums (full or partial) when the receiving
+ * ill advertises the capability; otherwise falls back to software
+ * verification via ip_input_sw_cksum_v6.
+ */
+static boolean_t
+ip_input_cksum_v6(iaflags_t iraflags, mblk_t *mp, ip6_t *ip6h,
+    ip_recv_attr_t *ira)
+{
+	ill_t		*ill = ira->ira_rill;
+	uint16_t	hck_flags;
+	uint32_t	cksum;
+	mblk_t		*mp1;
+	uint_t		len;
+	uint8_t		protocol = ira->ira_protocol;
+	uint16_t	ip_hdr_length = ira->ira_ip_hdr_length;
+
+
+	switch (protocol) {
+	case IPPROTO_TCP:
+	case IPPROTO_ICMPV6:
+		/* Verified below, in hardware or software */
+		break;
+
+	case IPPROTO_UDP: {
+		udpha_t	*udpha;
+
+		udpha = (udpha_t *)((uchar_t *)ip6h + ip_hdr_length);
+		/*
+		 * Before going through the regular checksum
+		 * calculation, make sure the received checksum
+		 * is non-zero. RFC 2460 says, a 0x0000 checksum
+		 * in a UDP packet (within IPv6 packet) is invalid
+		 * and should be replaced by 0xffff. This makes
+		 * sense as regular checksum calculation will
+		 * pass for both the cases i.e. 0x0000 and 0xffff.
+		 * Removing one of the case makes error detection
+		 * stronger.
+		 */
+		if (udpha->uha_checksum == 0) {
+			/* 0x0000 checksum is invalid */
+			BUMP_MIB(ill->ill_ip_mib, udpIfStatsInCksumErrs);
+			return (B_FALSE);
+		}
+		break;
+	}
+	case IPPROTO_SCTP: {
+		sctp_hdr_t	*sctph;
+		uint32_t	pktsum;
+
+		sctph = (sctp_hdr_t *)((uchar_t *)ip6h + ip_hdr_length);
+#ifdef	DEBUG
+		if (skip_sctp_cksum)
+			return (B_TRUE);
+#endif
+		/* SCTP CRC is computed over the header with sh_chksum zeroed */
+		pktsum = sctph->sh_chksum;
+		sctph->sh_chksum = 0;
+		cksum = sctp_cksum(mp, ip_hdr_length);
+		sctph->sh_chksum = pktsum;
+		if (cksum == pktsum)
+			return (B_TRUE);
+
+		/*
+		 * Defer until later whether a bad checksum is ok
+		 * in order to allow RAW sockets to use Adler checksum
+		 * with SCTP.
+		 */
+		ira->ira_flags |= IRAF_SCTP_CSUM_ERR;
+		return (B_TRUE);
+	}
+
+	default:
+		/* No ULP checksum to verify. */
+		return (B_TRUE);
+	}
+
+	/*
+	 * Revert to software checksum calculation if the interface
+	 * isn't capable of checksum offload.
+	 * We clear DB_CKSUMFLAGS when going through IPsec in ip_fanout.
+	 * Note: IRAF_NO_HW_CKSUM is not currently used.
+	 */
+	ASSERT(!IS_IPMP(ill));
+	if ((iraflags & IRAF_NO_HW_CKSUM) || !ILL_HCKSUM_CAPABLE(ill) ||
+	    !dohwcksum) {
+		return (ip_input_sw_cksum_v6(mp, ip6h, ira));
+	}
+
+	/*
+	 * We apply this for all ULP protocols. Does the HW know to
+	 * not set the flags for SCTP and other protocols.
+	 */
+
+	hck_flags = DB_CKSUMFLAGS(mp);
+
+	if (hck_flags & HCK_FULLCKSUM) {
+		/*
+		 * Full checksum has been computed by the hardware
+		 * and has been attached. If the driver wants us to
+		 * verify the correctness of the attached value, in
+		 * order to protect against faulty hardware, compare
+		 * it against -0 (0xFFFF) to see if it's valid.
+		 */
+		if (hck_flags & HCK_FULLCKSUM_OK)
+			return (B_TRUE);
+
+		cksum = DB_CKSUM16(mp);
+		if (cksum == 0xFFFF)
+			return (B_TRUE);
+		ip_input_cksum_err_v6(protocol, hck_flags, ira->ira_ill);
+		return (B_FALSE);
+	}
+
+	/*
+	 * Partial hardware checksum is only usable when there are at most
+	 * two mblks, the ULP data starts at or after DB_CKSUMSTART, and
+	 * any prepended extraneous data is an even number of bytes.
+	 */
+	mp1 = mp->b_cont;
+	if ((hck_flags & HCK_PARTIALCKSUM) &&
+	    (mp1 == NULL || mp1->b_cont == NULL) &&
+	    ip_hdr_length >= DB_CKSUMSTART(mp) &&
+	    ((len = ip_hdr_length - DB_CKSUMSTART(mp)) & 1) == 0) {
+		uint32_t	adj;
+		uchar_t		*cksum_start;
+
+		cksum = ip_input_cksum_pseudo_v6(ip6h, ira);
+
+		cksum_start = ((uchar_t *)ip6h + DB_CKSUMSTART(mp));
+
+		/*
+		 * Partial checksum has been calculated by hardware
+		 * and attached to the packet; in addition, any
+		 * prepended extraneous data is even byte aligned,
+		 * and there are at most two mblks associated with
+		 * the packet. If any such data exists, we adjust
+		 * the checksum; also take care any postpended data.
+		 */
+		IP_ADJCKSUM_PARTIAL(cksum_start, mp, mp1, len, adj);
+		/*
+		 * One's complement subtract extraneous checksum
+		 */
+		cksum += DB_CKSUM16(mp);
+		if (adj >= cksum)
+			cksum = ~(adj - cksum) & 0xFFFF;
+		else
+			cksum -= adj;
+		/* Fold the carries twice; all-ones means the sum checks out */
+		cksum = (cksum & 0xFFFF) + ((int)cksum >> 16);
+		cksum = (cksum & 0xFFFF) + ((int)cksum >> 16);
+		if (!(~cksum & 0xFFFF))
+			return (B_TRUE);
+
+		ip_input_cksum_err_v6(protocol, hck_flags, ira->ira_ill);
+		return (B_FALSE);
+	}
+	/* Partial checksum not usable for this packet; verify in software */
+	return (ip_input_sw_cksum_v6(mp, ip6h, ira));
+}
+
+
+/*
+ * Handle fanout of received packets.
+ * Unicast packets that are looped back (from ire_send_local_v6) and packets
+ * from the wire are differentiated by checking IRAF_VERIFY_ULP_CKSUM.
+ *
+ * IPQoS Notes
+ * Before sending it to the client, invoke IPPF processing. Policy processing
+ * takes place only if the callout_position, IPP_LOCAL_IN, is enabled.
+ */
+void
+ip_fanout_v6(mblk_t *mp, ip6_t *ip6h, ip_recv_attr_t *ira)
+{
+ ill_t *ill = ira->ira_ill;
+ iaflags_t iraflags = ira->ira_flags;
+ ip_stack_t *ipst = ill->ill_ipst;
+ uint8_t protocol;
+ conn_t *connp;
+#define rptr ((uchar_t *)ip6h)
+ uint_t ip_hdr_length;
+ uint_t min_ulp_header_length;
+ int offset;
+ ssize_t len;
+ netstack_t *ns = ipst->ips_netstack;
+ ipsec_stack_t *ipss = ns->netstack_ipsec;
+ ill_t *rill = ira->ira_rill;
+
+ ASSERT(ira->ira_pktlen == ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN);
+
+ /*
+ * We repeat this as we parse over destination options header and
+ * fragment headers (earlier we've handled any hop-by-hop options
+ * header.)
+ * We update ira_protocol and ira_ip_hdr_length as we skip past
+ * the intermediate headers; they already point past any
+ * hop-by-hop header.
+ */
+repeat:
+ protocol = ira->ira_protocol;
+ ip_hdr_length = ira->ira_ip_hdr_length;
+
+ /*
+ * Time for IPP once we've done reassembly and IPsec.
+ * We skip this for loopback packets since we don't do IPQoS
+ * on loopback.
+ */
+ if (IPP_ENABLED(IPP_LOCAL_IN, ipst) &&
+ !(iraflags & IRAF_LOOPBACK) &&
+ (protocol != IPPROTO_ESP || protocol != IPPROTO_AH ||
+ protocol != IPPROTO_DSTOPTS || protocol != IPPROTO_ROUTING ||
+ protocol != IPPROTO_FRAGMENT)) {
+ /*
+ * Use the interface on which the packet arrived - not where
+ * the IP address is hosted.
+ */
+ /* ip_process translates an IS_UNDER_IPMP */
+ mp = ip_process(IPP_LOCAL_IN, mp, rill, ill);
+ if (mp == NULL) {
+ /* ip_drop_packet and MIB done */
+ return;
+ }
+ }
+
+ /* Determine the minimum required size of the upper-layer header */
+ /* Need to do this for at least the set of ULPs that TX handles. */
+ switch (protocol) {
+ case IPPROTO_TCP:
+ min_ulp_header_length = TCP_MIN_HEADER_LENGTH;
+ break;
+ case IPPROTO_SCTP:
+ min_ulp_header_length = SCTP_COMMON_HDR_LENGTH;
+ break;
+ case IPPROTO_UDP:
+ min_ulp_header_length = UDPH_SIZE;
+ break;
+ case IPPROTO_ICMP:
+ case IPPROTO_ICMPV6:
+ min_ulp_header_length = ICMPH_SIZE;
+ break;
+ case IPPROTO_FRAGMENT:
+ case IPPROTO_DSTOPTS:
+ case IPPROTO_ROUTING:
+ min_ulp_header_length = MIN_EHDR_LEN;
+ break;
+ default:
+ min_ulp_header_length = 0;
+ break;
+ }
+ /* Make sure we have the min ULP header length */
+ len = mp->b_wptr - rptr;
+ if (len < ip_hdr_length + min_ulp_header_length) {
+ if (ira->ira_pktlen < ip_hdr_length + min_ulp_header_length)
+ goto pkt_too_short;
+
+ IP6_STAT(ipst, ip6_recv_pullup);
+ ip6h = ip_pullup(mp, ip_hdr_length + min_ulp_header_length,
+ ira);
+ if (ip6h == NULL)
+ goto discard;
+ len = mp->b_wptr - rptr;
+ }
+
+ /*
+ * If trusted extensions then determine the zoneid and TX specific
+ * ira_flags.
+ */
+ if (iraflags & IRAF_SYSTEM_LABELED) {
+ /* This can update ira->ira_flags and ira->ira_zoneid */
+ ip_fanout_tx_v6(mp, ip6h, protocol, ip_hdr_length, ira);
+ iraflags = ira->ira_flags;
+ }
+
+
+ /* Verify ULP checksum. Handles TCP, UDP, and SCTP */
+ if (iraflags & IRAF_VERIFY_ULP_CKSUM) {
+ if (!ip_input_cksum_v6(iraflags, mp, ip6h, ira)) {
+ /* Bad checksum. Stats are already incremented */
+ ip_drop_input("Bad ULP checksum", mp, ill);
+ freemsg(mp);
+ return;
+ }
+ /* IRAF_SCTP_CSUM_ERR could have been set */
+ iraflags = ira->ira_flags;
+ }
+ switch (protocol) {
+ case IPPROTO_TCP:
+ /* For TCP, discard multicast packets. */
+ if (iraflags & IRAF_MULTIBROADCAST)
+ goto discard;
+
+ /* First mblk contains IP+TCP headers per above check */
+ ASSERT(len >= ip_hdr_length + TCP_MIN_HEADER_LENGTH);
+
+ /* TCP options present? */
+ offset = ((uchar_t *)ip6h)[ip_hdr_length + 12] >> 4;
+ if (offset != 5) {
+ if (offset < 5)
+ goto discard;
+
+ /*
+ * There must be TCP options.
+ * Make sure we can grab them.
+ */
+ offset <<= 2;
+ offset += ip_hdr_length;
+ if (len < offset) {
+ if (ira->ira_pktlen < offset)
+ goto pkt_too_short;
+
+ IP6_STAT(ipst, ip6_recv_pullup);
+ ip6h = ip_pullup(mp, offset, ira);
+ if (ip6h == NULL)
+ goto discard;
+ len = mp->b_wptr - rptr;
+ }
+ }
+
+ /*
+ * Pass up a squeue hint to tcp.
+ * If ira_sqp is already set (this is loopback) we leave it
+ * alone.
+ */
+ if (ira->ira_sqp == NULL) {
+ ira->ira_sqp = ip_squeue_get(ira->ira_ring);
+ }
+
+ /* Look for AF_INET or AF_INET6 that matches */
+ connp = ipcl_classify_v6(mp, IPPROTO_TCP, ip_hdr_length,
+ ira, ipst);
+ if (connp == NULL) {
+ /* Send the TH_RST */
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
+ tcp_xmit_listeners_reset(mp, ira, ipst, NULL);
+ return;
+ }
+ if (connp->conn_incoming_ifindex != 0 &&
+ connp->conn_incoming_ifindex != ira->ira_ruifindex) {
+ CONN_DEC_REF(connp);
+
+ /* Send the TH_RST */
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
+ tcp_xmit_listeners_reset(mp, ira, ipst, NULL);
+ return;
+ }
+ if (CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss) ||
+ (iraflags & IRAF_IPSEC_SECURE)) {
+ mp = ipsec_check_inbound_policy(mp, connp,
+ NULL, ip6h, ira);
+ if (mp == NULL) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ /* Note that mp is NULL */
+ ip_drop_input("ipIfStatsInDiscards", mp, ill);
+ CONN_DEC_REF(connp);
+ return;
+ }
+ }
+ /* Found a client; up it goes */
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
+ ira->ira_ill = ira->ira_rill = NULL;
+ if (!IPCL_IS_TCP(connp)) {
+ /* Not TCP; must be SOCK_RAW, IPPROTO_TCP */
+ (connp->conn_recv)(connp, mp, NULL, ira);
+ CONN_DEC_REF(connp);
+ ira->ira_ill = ill;
+ ira->ira_rill = rill;
+ return;
+ }
+
+ /*
+ * We do different processing whether called from
+ * ip_accept_tcp and we match the target, don't match
+ * the target, and when we are called by ip_input.
+ */
+ if (iraflags & IRAF_TARGET_SQP) {
+ if (ira->ira_target_sqp == connp->conn_sqp) {
+ mblk_t *attrmp;
+
+ attrmp = ip_recv_attr_to_mblk(ira);
+ if (attrmp == NULL) {
+ BUMP_MIB(ill->ill_ip_mib,
+ ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards",
+ mp, ill);
+ freemsg(mp);
+ CONN_DEC_REF(connp);
+ } else {
+ SET_SQUEUE(attrmp, connp->conn_recv,
+ connp);
+ attrmp->b_cont = mp;
+ ASSERT(ira->ira_target_sqp_mp == NULL);
+ ira->ira_target_sqp_mp = attrmp;
+ /*
+ * Conn ref release when drained from
+ * the squeue.
+ */
+ }
+ } else {
+ SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
+ connp->conn_recv, connp, ira, SQ_FILL,
+ SQTAG_IP6_TCP_INPUT);
+ }
+ } else {
+ SQUEUE_ENTER_ONE(connp->conn_sqp, mp, connp->conn_recv,
+ connp, ira, ip_squeue_flag, SQTAG_IP6_TCP_INPUT);
+ }
+ ira->ira_ill = ill;
+ ira->ira_rill = rill;
+ return;
+
+ case IPPROTO_SCTP: {
+ sctp_hdr_t *sctph;
+ uint32_t ports; /* Source and destination ports */
+ sctp_stack_t *sctps = ipst->ips_netstack->netstack_sctp;
+
+ /* For SCTP, discard multicast packets. */
+ if (iraflags & IRAF_MULTIBROADCAST)
+ goto discard;
+
+ /*
+ * Since there is no SCTP h/w cksum support yet, just
+ * clear the flag.
+ */
+ DB_CKSUMFLAGS(mp) = 0;
+
+ /* Length ensured above */
+ ASSERT(MBLKL(mp) >= ip_hdr_length + SCTP_COMMON_HDR_LENGTH);
+ sctph = (sctp_hdr_t *)(rptr + ip_hdr_length);
+
+ /* get the ports */
+ ports = *(uint32_t *)&sctph->sh_sport;
+
+ if (iraflags & IRAF_SCTP_CSUM_ERR) {
+ /*
+ * No potential sctp checksum errors go to the Sun
+ * sctp stack however they might be Adler-32 summed
+ * packets a userland stack bound to a raw IP socket
+ * could reasonably use. Note though that Adler-32 is
+ * a long deprecated algorithm and customer sctp
+ * networks should eventually migrate to CRC-32 at
+ * which time this facility should be removed.
+ */
+ ip_fanout_sctp_raw(mp, NULL, ip6h, ports, ira);
+ return;
+ }
+ connp = sctp_fanout(&ip6h->ip6_src, &ip6h->ip6_dst, ports,
+ ira, mp, sctps);
+ if (connp == NULL) {
+ /* Check for raw socket or OOTB handling */
+ ip_fanout_sctp_raw(mp, NULL, ip6h, ports, ira);
+ return;
+ }
+ if (connp->conn_incoming_ifindex != 0 &&
+ connp->conn_incoming_ifindex != ira->ira_ruifindex) {
+ CONN_DEC_REF(connp);
+
+ /* Check for raw socket or OOTB handling */
+ ip_fanout_sctp_raw(mp, NULL, ip6h, ports, ira);
+ return;
+ }
+
+ /* Found a client; up it goes */
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
+ sctp_input(connp, NULL, ip6h, mp, ira);
+ /* sctp_input does a rele of the sctp_t */
+ return;
+ }
+
+ case IPPROTO_UDP:
+ /* First mblk contains IP+UDP headers as checked above */
+ ASSERT(MBLKL(mp) >= ip_hdr_length + UDPH_SIZE);
+
+ if (iraflags & IRAF_MULTIBROADCAST) {
+ uint16_t *up; /* Pointer to ports in ULP header */
+
+ up = (uint16_t *)((uchar_t *)ip6h + ip_hdr_length);
+
+ ip_fanout_udp_multi_v6(mp, ip6h, up[1], up[0], ira);
+ return;
+ }
+
+ /* Look for AF_INET or AF_INET6 that matches */
+ connp = ipcl_classify_v6(mp, IPPROTO_UDP, ip_hdr_length,
+ ira, ipst);
+ if (connp == NULL) {
+ no_udp_match:
+ if (ipst->ips_ipcl_proto_fanout_v6[IPPROTO_UDP].
+ connf_head != NULL) {
+ ASSERT(ira->ira_protocol == IPPROTO_UDP);
+ ip_fanout_proto_v6(mp, ip6h, ira);
+ } else {
+ ip_fanout_send_icmp_v6(mp, ICMP6_DST_UNREACH,
+ ICMP6_DST_UNREACH_NOPORT, ira);
+ }
+ return;
+
+ }
+ if (connp->conn_incoming_ifindex != 0 &&
+ connp->conn_incoming_ifindex != ira->ira_ruifindex) {
+ CONN_DEC_REF(connp);
+ goto no_udp_match;
+ }
+ if (IPCL_IS_NONSTR(connp) ? connp->conn_flow_cntrld :
+ !canputnext(connp->conn_rq)) {
+ CONN_DEC_REF(connp);
+ BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows);
+ ip_drop_input("udpIfStatsInOverflows", mp, ill);
+ freemsg(mp);
+ return;
+ }
+ if (CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss) ||
+ (iraflags & IRAF_IPSEC_SECURE)) {
+ mp = ipsec_check_inbound_policy(mp, connp,
+ NULL, ip6h, ira);
+ if (mp == NULL) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ /* Note that mp is NULL */
+ ip_drop_input("ipIfStatsInDiscards", mp, ill);
+ CONN_DEC_REF(connp);
+ return;
+ }
+ }
+
+ /* Found a client; up it goes */
+ IP6_STAT(ipst, ip6_udp_fannorm);
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
+ ira->ira_ill = ira->ira_rill = NULL;
+ (connp->conn_recv)(connp, mp, NULL, ira);
+ CONN_DEC_REF(connp);
+ ira->ira_ill = ill;
+ ira->ira_rill = rill;
+ return;
+ default:
+ break;
+ }
+
+ /*
+ * Clear hardware checksumming flag as it is currently only
+ * used by TCP and UDP.
+ */
+ DB_CKSUMFLAGS(mp) = 0;
+
+ switch (protocol) {
+ case IPPROTO_ICMPV6:
+ BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInMsgs);
+
+ /* Check variable for testing applications */
+ if (ipst->ips_ipv6_drop_inbound_icmpv6) {
+ ip_drop_input("ipv6_drop_inbound_icmpv6", mp, ill);
+ freemsg(mp);
+ return;
+ }
+ /*
+	 * We need to accommodate icmp messages coming in clear
+ * until we get everything secure from the wire. If
+ * icmp_accept_clear_messages is zero we check with
+ * the global policy and act accordingly. If it is
+ * non-zero, we accept the message without any checks.
+ * But *this does not mean* that this will be delivered
+ * to RAW socket clients. By accepting we might send
+ * replies back, change our MTU value etc.,
+ * but delivery to the ULP/clients depends on their
+ * policy dispositions.
+ */
+ if (ipst->ips_icmp_accept_clear_messages == 0) {
+ mp = ipsec_check_global_policy(mp, NULL,
+ NULL, ip6h, ira, ns);
+ if (mp == NULL)
+ return;
+ }
+
+ /*
+ * On a labeled system, we have to check whether the zone
+ * itself is permitted to receive raw traffic.
+ */
+ if (ira->ira_flags & IRAF_SYSTEM_LABELED) {
+ if (!tsol_can_accept_raw(mp, ira, B_FALSE)) {
+ BUMP_MIB(ill->ill_icmp6_mib,
+ ipv6IfIcmpInErrors);
+ ip_drop_input("tsol_can_accept_raw", mp, ill);
+ freemsg(mp);
+ return;
+ }
+ }
+
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
+ mp = icmp_inbound_v6(mp, ira);
+ if (mp == NULL) {
+ /* No need to pass to RAW sockets */
+ return;
+ }
+ break;
+
+ case IPPROTO_DSTOPTS: {
+ ip6_dest_t *desthdr;
+ uint_t ehdrlen;
+ uint8_t *optptr;
+
+ /* We already check for MIN_EHDR_LEN above */
+
+ /* Check if AH is present and needs to be processed. */
+ mp = ipsec_early_ah_v6(mp, ira);
+ if (mp == NULL)
+ return;
+
+ /*
+ * Reinitialize pointers, as ipsec_early_ah_v6() does
+ * complete pullups. We don't have to do more pullups
+ * as a result.
+ */
+ ip6h = (ip6_t *)mp->b_rptr;
+
+ if (ira->ira_pktlen - ip_hdr_length < MIN_EHDR_LEN)
+ goto pkt_too_short;
+
+ if (mp->b_cont != NULL &&
+ rptr + ip_hdr_length + MIN_EHDR_LEN > mp->b_wptr) {
+ ip6h = ip_pullup(mp, ip_hdr_length + MIN_EHDR_LEN, ira);
+ if (ip6h == NULL)
+ goto discard;
+ }
+ desthdr = (ip6_dest_t *)(rptr + ip_hdr_length);
+ ehdrlen = 8 * (desthdr->ip6d_len + 1);
+ if (ira->ira_pktlen - ip_hdr_length < ehdrlen)
+ goto pkt_too_short;
+ if (mp->b_cont != NULL &&
+ rptr + IPV6_HDR_LEN + ehdrlen > mp->b_wptr) {
+ ip6h = ip_pullup(mp, IPV6_HDR_LEN + ehdrlen, ira);
+ if (ip6h == NULL)
+ goto discard;
+
+ desthdr = (ip6_dest_t *)(rptr + ip_hdr_length);
+ }
+ optptr = (uint8_t *)&desthdr[1];
+
+ /*
+ * Update ira_ip_hdr_length to skip the destination header
+ * when we repeat.
+ */
+ ira->ira_ip_hdr_length += ehdrlen;
+
+ ira->ira_protocol = desthdr->ip6d_nxt;
+
+ /*
+ * Note: XXX This code does not seem to make
+ * distinction between Destination Options Header
+ * being before/after Routing Header which can
+ * happen if we are at the end of source route.
+ * This may become significant in future.
+ * (No real significant Destination Options are
+ * defined/implemented yet ).
+ */
+ switch (ip_process_options_v6(mp, ip6h, optptr,
+ ehdrlen - 2, IPPROTO_DSTOPTS, ira)) {
+ case -1:
+ /*
+ * Packet has been consumed and any needed
+ * ICMP errors sent.
+ */
+ return;
+ case 0:
+ /* No action needed continue */
+ break;
+ case 1:
+ /*
+			 * Unexpected return value
+ * (Router alert is a Hop-by-Hop option)
+ */
+#ifdef DEBUG
+ panic("ip_fanout_v6: router "
+ "alert hbh opt indication in dest opt");
+ /*NOTREACHED*/
+#else
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards", mp, ill);
+ freemsg(mp);
+ return;
+#endif
+ }
+ goto repeat;
+ }
+ case IPPROTO_FRAGMENT: {
+ ip6_frag_t *fraghdr;
+
+ if (ira->ira_pktlen - ip_hdr_length < sizeof (ip6_frag_t))
+ goto pkt_too_short;
+
+ if (mp->b_cont != NULL &&
+ rptr + ip_hdr_length + sizeof (ip6_frag_t) > mp->b_wptr) {
+ ip6h = ip_pullup(mp,
+ ip_hdr_length + sizeof (ip6_frag_t), ira);
+ if (ip6h == NULL)
+ goto discard;
+ }
+
+ fraghdr = (ip6_frag_t *)(rptr + ip_hdr_length);
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmReqds);
+
+ /*
+ * Invoke the CGTP (multirouting) filtering module to
+ * process the incoming packet. Packets identified as
+ * duplicates must be discarded. Filtering is active
+ * only if the ip_cgtp_filter ndd variable is
+ * non-zero.
+ */
+ if (ipst->ips_ip_cgtp_filter &&
+ ipst->ips_ip_cgtp_filter_ops != NULL) {
+ int cgtp_flt_pkt;
+ netstackid_t stackid;
+
+ stackid = ipst->ips_netstack->netstack_stackid;
+
+ /*
+ * CGTP and IPMP are mutually exclusive so
+ * phyint_ifindex is fine here.
+ */
+ cgtp_flt_pkt =
+ ipst->ips_ip_cgtp_filter_ops->cfo_filter_v6(
+ stackid, ill->ill_phyint->phyint_ifindex,
+ ip6h, fraghdr);
+ if (cgtp_flt_pkt == CGTP_IP_PKT_DUPLICATE) {
+ ip_drop_input("CGTP_IP_PKT_DUPLICATE", mp, ill);
+ freemsg(mp);
+ return;
+ }
+ }
+
+ /*
+ * Update ip_hdr_length to skip the frag header
+ * ip_input_fragment_v6 will determine the extension header
+ * prior to the fragment header and update its nexthdr value,
+ * and also set ira_protocol to the nexthdr that follows the
+ * completed fragment.
+ */
+ ip_hdr_length += sizeof (ip6_frag_t);
+
+ /*
+		 * Make sure we have ira_l2src before we lose the original
+ * mblk
+ */
+ if (!(ira->ira_flags & IRAF_L2SRC_SET))
+ ip_setl2src(mp, ira, ira->ira_rill);
+
+ mp = ip_input_fragment_v6(mp, ip6h, fraghdr,
+ ira->ira_pktlen - ip_hdr_length, ira);
+ if (mp == NULL) {
+ /* Reassembly is still pending */
+ return;
+ }
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmOKs);
+
+ /*
+ * The mblk chain has the frag header removed and
+ * ira_protocol, ira_pktlen, ira_ip_hdr_length as well as the
+		 * IP header has been updated to reflect the result.
+ */
+ ip6h = (ip6_t *)mp->b_rptr;
+ ip_hdr_length = ira->ira_ip_hdr_length;
+ goto repeat;
+ }
+ case IPPROTO_HOPOPTS:
+ /*
+ * Illegal header sequence.
+ * (Hop-by-hop headers are processed above
+ * and required to immediately follow IPv6 header)
+ */
+ ip_drop_input("ICMP_PARAM_PROBLEM", mp, ill);
+ icmp_param_problem_nexthdr_v6(mp, B_FALSE, ira);
+ return;
+
+ case IPPROTO_ROUTING: {
+ uint_t ehdrlen;
+ ip6_rthdr_t *rthdr;
+
+ /* Check if AH is present and needs to be processed. */
+ mp = ipsec_early_ah_v6(mp, ira);
+ if (mp == NULL)
+ return;
+
+ /*
+ * Reinitialize pointers, as ipsec_early_ah_v6() does
+ * complete pullups. We don't have to do more pullups
+ * as a result.
+ */
+ ip6h = (ip6_t *)mp->b_rptr;
+
+ if (ira->ira_pktlen - ip_hdr_length < MIN_EHDR_LEN)
+ goto pkt_too_short;
+
+ if (mp->b_cont != NULL &&
+ rptr + ip_hdr_length + MIN_EHDR_LEN > mp->b_wptr) {
+ ip6h = ip_pullup(mp, ip_hdr_length + MIN_EHDR_LEN, ira);
+ if (ip6h == NULL)
+ goto discard;
+ }
+ rthdr = (ip6_rthdr_t *)(rptr + ip_hdr_length);
+ protocol = ira->ira_protocol = rthdr->ip6r_nxt;
+ ehdrlen = 8 * (rthdr->ip6r_len + 1);
+ if (ira->ira_pktlen - ip_hdr_length < ehdrlen)
+ goto pkt_too_short;
+ if (mp->b_cont != NULL &&
+ rptr + IPV6_HDR_LEN + ehdrlen > mp->b_wptr) {
+ ip6h = ip_pullup(mp, IPV6_HDR_LEN + ehdrlen, ira);
+ if (ip6h == NULL)
+ goto discard;
+ rthdr = (ip6_rthdr_t *)(rptr + ip_hdr_length);
+ }
+ if (rthdr->ip6r_segleft != 0) {
+ /* Not end of source route */
+ if (ira->ira_flags &
+ (IRAF_L2DST_MULTICAST|IRAF_L2DST_BROADCAST)) {
+ BUMP_MIB(ill->ill_ip_mib,
+ ipIfStatsForwProhibits);
+ ip_drop_input("ipIfStatsInForwProhibits",
+ mp, ill);
+ freemsg(mp);
+ return;
+ }
+ ip_process_rthdr(mp, ip6h, rthdr, ira);
+ return;
+ }
+ ira->ira_ip_hdr_length += ehdrlen;
+ goto repeat;
+ }
+
+ case IPPROTO_AH:
+ case IPPROTO_ESP: {
+ /*
+ * Fast path for AH/ESP.
+ */
+ netstack_t *ns = ipst->ips_netstack;
+ ipsec_stack_t *ipss = ns->netstack_ipsec;
+
+ IP_STAT(ipst, ipsec_proto_ahesp);
+
+ if (!ipsec_loaded(ipss)) {
+ ip_proto_not_sup(mp, ira);
+ return;
+ }
+
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
+ /* select inbound SA and have IPsec process the pkt */
+ if (protocol == IPPROTO_ESP) {
+ esph_t *esph;
+
+ mp = ipsec_inbound_esp_sa(mp, ira, &esph);
+ if (mp == NULL)
+ return;
+
+ ASSERT(esph != NULL);
+ ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
+ ASSERT(ira->ira_ipsec_esp_sa != NULL);
+ ASSERT(ira->ira_ipsec_esp_sa->ipsa_input_func != NULL);
+
+ mp = ira->ira_ipsec_esp_sa->ipsa_input_func(mp, esph,
+ ira);
+ } else {
+ ah_t *ah;
+
+ mp = ipsec_inbound_ah_sa(mp, ira, &ah);
+ if (mp == NULL)
+ return;
+
+ ASSERT(ah != NULL);
+ ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
+ ASSERT(ira->ira_ipsec_ah_sa != NULL);
+ ASSERT(ira->ira_ipsec_ah_sa->ipsa_input_func != NULL);
+ mp = ira->ira_ipsec_ah_sa->ipsa_input_func(mp, ah,
+ ira);
+ }
+
+ if (mp == NULL) {
+ /*
+ * Either it failed or is pending. In the former case
+ * ipIfStatsInDiscards was increased.
+ */
+ return;
+ }
+ /* we're done with IPsec processing, send it up */
+ ip_input_post_ipsec(mp, ira);
+ return;
+ }
+ case IPPROTO_NONE:
+ /* All processing is done. Count as "delivered". */
+ freemsg(mp);
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
+ return;
+
+ case IPPROTO_ENCAP:
+ case IPPROTO_IPV6:
+ /* iptun will verify trusted label */
+ connp = ipcl_classify_v6(mp, protocol, ip_hdr_length,
+ ira, ipst);
+ if (connp != NULL) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
+ ira->ira_ill = ira->ira_rill = NULL;
+ connp->conn_recv(connp, mp, NULL, ira);
+ CONN_DEC_REF(connp);
+ ira->ira_ill = ill;
+ ira->ira_rill = rill;
+ return;
+ }
+ /* FALLTHRU */
+ default:
+ /*
+ * On a labeled system, we have to check whether the zone
+ * itself is permitted to receive raw traffic.
+ */
+ if (ira->ira_flags & IRAF_SYSTEM_LABELED) {
+ if (!tsol_can_accept_raw(mp, ira, B_FALSE)) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards", mp, ill);
+ freemsg(mp);
+ return;
+ }
+ }
+ break;
+ }
+
+ /*
+ * The above input functions may have returned the pulled up message.
+ * So ip6h need to be reinitialized.
+ */
+ ip6h = (ip6_t *)mp->b_rptr;
+ ira->ira_protocol = protocol;
+ if (ipst->ips_ipcl_proto_fanout_v6[protocol].connf_head == NULL) {
+		/* No user-level listener for these packets */
+ ip_proto_not_sup(mp, ira);
+ return;
+ }
+
+ /*
+ * Handle fanout to raw sockets. There
+ * can be more than one stream bound to a particular
+ * protocol. When this is the case, each one gets a copy
+ * of any incoming packets.
+ */
+ ASSERT(ira->ira_protocol == protocol);
+ ip_fanout_proto_v6(mp, ip6h, ira);
+ return;
+
+pkt_too_short:
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
+ ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
+ freemsg(mp);
+ return;
+
+discard:
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards", mp, ill);
+ freemsg(mp);
+#undef rptr
+}
diff --git a/usr/src/uts/common/inet/ip/ip6_ire.c b/usr/src/uts/common/inet/ip/ip6_ire.c
index c13a66fcc2..7697ca20c7 100644
--- a/usr/src/uts/common/inet/ip/ip6_ire.c
+++ b/usr/src/uts/common/inet/ip/ip6_ire.c
@@ -60,122 +60,122 @@
#include <sys/tsol/label.h>
#include <sys/tsol/tnet.h>
+#define IS_DEFAULT_ROUTE_V6(ire) \
+ (((ire)->ire_type & IRE_DEFAULT) || \
+ (((ire)->ire_type & IRE_INTERFACE) && \
+ (IN6_IS_ADDR_UNSPECIFIED(&(ire)->ire_addr_v6))))
+
static ire_t ire_null;
-static ire_t *ire_ihandle_lookup_onlink_v6(ire_t *cire);
-static boolean_t ire_match_args_v6(ire_t *ire, const in6_addr_t *addr,
- const in6_addr_t *mask, const in6_addr_t *gateway, int type,
- const ipif_t *ipif, zoneid_t zoneid, uint32_t ihandle,
- const ts_label_t *tsl, int match_flags);
-static ire_t *ire_init_v6(ire_t *, const in6_addr_t *, const in6_addr_t *,
- const in6_addr_t *, const in6_addr_t *, uint_t *, queue_t *, queue_t *,
- ushort_t, ipif_t *, const in6_addr_t *, uint32_t, uint32_t, uint_t,
- const iulp_t *, tsol_gc_t *, tsol_gcgrp_t *, ip_stack_t *);
-static ire_t *ip6_ctable_lookup_impl(ire_ctable_args_t *);
+static ire_t *
+ire_ftable_lookup_impl_v6(const in6_addr_t *addr, const in6_addr_t *mask,
+ const in6_addr_t *gateway, int type, const ill_t *ill,
+ zoneid_t zoneid, const ts_label_t *tsl, int flags,
+ ip_stack_t *ipst);
/*
* Initialize the ire that is specific to IPv6 part and call
* ire_init_common to finish it.
+ * Returns zero or errno.
*/
-static ire_t *
+int
ire_init_v6(ire_t *ire, const in6_addr_t *v6addr, const in6_addr_t *v6mask,
- const in6_addr_t *v6src_addr, const in6_addr_t *v6gateway,
- uint_t *max_fragp, queue_t *rfq, queue_t *stq, ushort_t type,
- ipif_t *ipif, const in6_addr_t *v6cmask, uint32_t phandle,
- uint32_t ihandle, uint_t flags, const iulp_t *ulp_info, tsol_gc_t *gc,
- tsol_gcgrp_t *gcgrp, ip_stack_t *ipst)
+ const in6_addr_t *v6gateway, ushort_t type, ill_t *ill,
+ zoneid_t zoneid, uint_t flags, tsol_gc_t *gc, ip_stack_t *ipst)
{
+ int error;
/*
- * Reject IRE security attribute creation/initialization
+	 * Reject IRE security attribute creation/initialization
* if system is not running in Trusted mode.
*/
- if ((gc != NULL || gcgrp != NULL) && !is_system_labeled())
- return (NULL);
-
+ if (gc != NULL && !is_system_labeled())
+ return (EINVAL);
BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_alloced);
- ire->ire_addr_v6 = *v6addr;
-
- if (v6src_addr != NULL)
- ire->ire_src_addr_v6 = *v6src_addr;
- if (v6mask != NULL) {
- ire->ire_mask_v6 = *v6mask;
- ire->ire_masklen = ip_mask_to_plen_v6(&ire->ire_mask_v6);
- }
+ if (v6addr != NULL)
+ ire->ire_addr_v6 = *v6addr;
if (v6gateway != NULL)
ire->ire_gateway_addr_v6 = *v6gateway;
- if (type == IRE_CACHE && v6cmask != NULL)
- ire->ire_cmask_v6 = *v6cmask;
-
- /*
- * Multirouted packets need to have a fragment header added so that
- * the receiver is able to discard duplicates according to their
- * fragment identifier.
- */
- if (type == IRE_CACHE && (flags & RTF_MULTIRT)) {
- ire->ire_frag_flag = IPH_FRAG_HDR;
+ /* Make sure we don't have stray values in some fields */
+ switch (type) {
+ case IRE_LOOPBACK:
+ ire->ire_gateway_addr_v6 = ire->ire_addr_v6;
+ /* FALLTHRU */
+ case IRE_HOST:
+ case IRE_LOCAL:
+ case IRE_IF_CLONE:
+ ire->ire_mask_v6 = ipv6_all_ones;
+ ire->ire_masklen = IPV6_ABITS;
+ break;
+ case IRE_PREFIX:
+ case IRE_DEFAULT:
+ case IRE_IF_RESOLVER:
+ case IRE_IF_NORESOLVER:
+ if (v6mask != NULL) {
+ ire->ire_mask_v6 = *v6mask;
+ ire->ire_masklen =
+ ip_mask_to_plen_v6(&ire->ire_mask_v6);
+ }
+ break;
+ case IRE_MULTICAST:
+ case IRE_NOROUTE:
+ ASSERT(v6mask == NULL);
+ break;
+ default:
+ ASSERT(0);
+ return (EINVAL);
}
- /* ire_init_common will free the mblks upon encountering any failure */
- if (!ire_init_common(ire, max_fragp, NULL, rfq, stq, type, ipif,
- phandle, ihandle, flags, IPV6_VERSION, ulp_info, gc, gcgrp, ipst))
- return (NULL);
-
- return (ire);
-}
-
-/*
- * Similar to ire_create_v6 except that it is called only when
- * we want to allocate ire as an mblk e.g. we have a external
- * resolver. Do we need this in IPv6 ?
- *
- * IPv6 initializes the ire_nce in ire_add_v6, which expects to
- * find the ire_nce to be null when it is called. So, although
- * we have a src_nce parameter (in the interest of matching up with
- * the argument list of the v4 version), we ignore the src_nce
- * argument here.
- */
-/* ARGSUSED */
-ire_t *
-ire_create_mp_v6(const in6_addr_t *v6addr, const in6_addr_t *v6mask,
- const in6_addr_t *v6src_addr, const in6_addr_t *v6gateway,
- nce_t *src_nce, queue_t *rfq, queue_t *stq, ushort_t type,
- ipif_t *ipif, const in6_addr_t *v6cmask,
- uint32_t phandle, uint32_t ihandle, uint_t flags, const iulp_t *ulp_info,
- tsol_gc_t *gc, tsol_gcgrp_t *gcgrp, ip_stack_t *ipst)
-{
- ire_t *ire;
- ire_t *ret_ire;
- mblk_t *mp;
+ error = ire_init_common(ire, type, ill, zoneid, flags, IPV6_VERSION,
+ gc, ipst);
+ if (error != NULL)
+ return (error);
- ASSERT(!IN6_IS_ADDR_V4MAPPED(v6addr));
+ /* Determine which function pointers to use */
+ ire->ire_postfragfn = ip_xmit; /* Common case */
- /* Allocate the new IRE. */
- mp = allocb(sizeof (ire_t), BPRI_MED);
- if (mp == NULL) {
- ip1dbg(("ire_create_mp_v6: alloc failed\n"));
- return (NULL);
+ switch (ire->ire_type) {
+ case IRE_LOCAL:
+ ire->ire_sendfn = ire_send_local_v6;
+ ire->ire_recvfn = ire_recv_local_v6;
+#ifdef SO_VRRP
+ ASSERT(ire->ire_ill != NULL);
+ if (ire->ire_ill->ill_flags & ILLF_NOACCEPT) {
+ ire->ire_noaccept = B_TRUE;
+ ire->ire_recvfn = ire_recv_noaccept_v6;
+ }
+#endif
+ break;
+ case IRE_LOOPBACK:
+ ire->ire_sendfn = ire_send_local_v6;
+ ire->ire_recvfn = ire_recv_loopback_v6;
+ break;
+ case IRE_MULTICAST:
+ ire->ire_postfragfn = ip_postfrag_loopcheck;
+ ire->ire_sendfn = ire_send_multicast_v6;
+ ire->ire_recvfn = ire_recv_multicast_v6;
+ break;
+ default:
+ /*
+ * For IRE_IF_ALL and IRE_OFFLINK we forward received
+ * packets by default.
+ */
+ ire->ire_sendfn = ire_send_wire_v6;
+ ire->ire_recvfn = ire_recv_forward_v6;
+ break;
}
-
- ire = (ire_t *)mp->b_rptr;
- mp->b_wptr = (uchar_t *)&ire[1];
-
- /* Start clean. */
- *ire = ire_null;
- ire->ire_mp = mp;
- mp->b_datap->db_type = IRE_DB_TYPE;
-
- ret_ire = ire_init_v6(ire, v6addr, v6mask, v6src_addr, v6gateway,
- NULL, rfq, stq, type, ipif, v6cmask, phandle,
- ihandle, flags, ulp_info, gc, gcgrp, ipst);
-
- if (ret_ire == NULL) {
- freeb(ire->ire_mp);
- return (NULL);
+ if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
+ ire->ire_sendfn = ire_send_noroute_v6;
+ ire->ire_recvfn = ire_recv_noroute_v6;
+ } else if (ire->ire_flags & RTF_MULTIRT) {
+ ire->ire_postfragfn = ip_postfrag_multirt_v6;
+ ire->ire_sendfn = ire_send_multirt_v6;
+ ire->ire_recvfn = ire_recv_multirt_v6;
}
- return (ire);
+ ire->ire_nce_capable = ire_determine_nce_capable(ire);
+ return (0);
}
/*
@@ -183,153 +183,76 @@ ire_create_mp_v6(const in6_addr_t *v6addr, const in6_addr_t *v6mask,
*
* NOTE : This is called as writer sometimes though not required
* by this function.
- *
- * See comments above ire_create_mp_v6() for the rationale behind the
- * unused src_nce argument.
*/
/* ARGSUSED */
ire_t *
ire_create_v6(const in6_addr_t *v6addr, const in6_addr_t *v6mask,
- const in6_addr_t *v6src_addr, const in6_addr_t *v6gateway,
- uint_t *max_fragp, nce_t *src_nce, queue_t *rfq, queue_t *stq,
- ushort_t type, ipif_t *ipif, const in6_addr_t *v6cmask,
- uint32_t phandle, uint32_t ihandle, uint_t flags, const iulp_t *ulp_info,
- tsol_gc_t *gc, tsol_gcgrp_t *gcgrp, ip_stack_t *ipst)
+ const in6_addr_t *v6gateway, ushort_t type, ill_t *ill, zoneid_t zoneid,
+ uint_t flags, tsol_gc_t *gc, ip_stack_t *ipst)
{
ire_t *ire;
- ire_t *ret_ire;
+ int error;
ASSERT(!IN6_IS_ADDR_V4MAPPED(v6addr));
ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP);
if (ire == NULL) {
- ip1dbg(("ire_create_v6: alloc failed\n"));
+ DTRACE_PROBE(kmem__cache__alloc);
return (NULL);
}
*ire = ire_null;
- ret_ire = ire_init_v6(ire, v6addr, v6mask, v6src_addr, v6gateway,
- max_fragp, rfq, stq, type, ipif, v6cmask, phandle,
- ihandle, flags, ulp_info, gc, gcgrp, ipst);
+ error = ire_init_v6(ire, v6addr, v6mask, v6gateway,
+ type, ill, zoneid, flags, gc, ipst);
- if (ret_ire == NULL) {
+ if (error != 0) {
+ DTRACE_PROBE2(ire__init__v6, ire_t *, ire, int, error);
kmem_cache_free(ire_cache, ire);
return (NULL);
}
- ASSERT(ret_ire == ire);
return (ire);
}
/*
- * Find an IRE_INTERFACE for the multicast group.
+ * Find the ill matching a multicast group.
* Allows different routes for multicast addresses
* in the unicast routing table (akin to FF::0/8 but could be more specific)
* which point at different interfaces. This is used when IPV6_MULTICAST_IF
* isn't specified (when sending) and when IPV6_JOIN_GROUP doesn't
* specify the interface to join on.
*
- * Supports link-local addresses by following the ipif/ill when recursing.
+ * Supports link-local addresses by using ire_route_recursive which follows
+ * the ill when recursing.
+ *
+ * To handle CGTP, since we don't have a separate IRE_MULTICAST for each group
+ * and the MULTIRT property can be different for different groups, we
+ * extract RTF_MULTIRT from the special unicast route added for a group
+ * with CGTP and pass that back in the multirtp argument.
+ * This is used in ip_set_destination etc to set ixa_postfragfn for multicast.
+ * We have a setsrcp argument for the same reason.
*/
-ire_t *
-ire_lookup_multi_v6(const in6_addr_t *group, zoneid_t zoneid, ip_stack_t *ipst)
+ill_t *
+ire_lookup_multi_ill_v6(const in6_addr_t *group, zoneid_t zoneid,
+ ip_stack_t *ipst, boolean_t *multirtp, in6_addr_t *setsrcp)
{
ire_t *ire;
- ipif_t *ipif = NULL;
- int match_flags = MATCH_IRE_TYPE;
- in6_addr_t gw_addr_v6;
-
- ire = ire_ftable_lookup_v6(group, 0, 0, 0, NULL, NULL,
- zoneid, 0, NULL, MATCH_IRE_DEFAULT, ipst);
+ ill_t *ill;
- /* We search a resolvable ire in case of multirouting. */
- if ((ire != NULL) && (ire->ire_flags & RTF_MULTIRT)) {
- ire_t *cire = NULL;
- /*
- * If the route is not resolvable, the looked up ire
- * may be changed here. In that case, ire_multirt_lookup_v6()
- * IRE_REFRELE the original ire and change it.
- */
- (void) ire_multirt_lookup_v6(&cire, &ire, MULTIRT_CACHEGW,
- NULL, ipst);
- if (cire != NULL)
- ire_refrele(cire);
- }
- if (ire == NULL)
- return (NULL);
- /*
- * Make sure we follow ire_ipif.
- *
- * We need to determine the interface route through
- * which the gateway will be reached.
- */
- if (ire->ire_ipif != NULL) {
- ipif = ire->ire_ipif;
- match_flags |= MATCH_IRE_ILL;
- }
+ ire = ire_route_recursive_v6(group, 0, NULL, zoneid, NULL,
+ MATCH_IRE_DSTONLY, B_FALSE, 0, ipst, setsrcp, NULL, NULL);
+ ASSERT(ire != NULL);
- switch (ire->ire_type) {
- case IRE_DEFAULT:
- case IRE_PREFIX:
- case IRE_HOST:
- mutex_enter(&ire->ire_lock);
- gw_addr_v6 = ire->ire_gateway_addr_v6;
- mutex_exit(&ire->ire_lock);
- ire_refrele(ire);
- ire = ire_ftable_lookup_v6(&gw_addr_v6, 0, 0,
- IRE_INTERFACE, ipif, NULL, zoneid, 0,
- NULL, match_flags, ipst);
- return (ire);
- case IRE_IF_NORESOLVER:
- case IRE_IF_RESOLVER:
- return (ire);
- default:
+ if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
ire_refrele(ire);
return (NULL);
}
-}
-/*
- * Return any local address. We use this to target ourselves
- * when the src address was specified as 'default'.
- * Preference for IRE_LOCAL entries.
- */
-ire_t *
-ire_lookup_local_v6(zoneid_t zoneid, ip_stack_t *ipst)
-{
- ire_t *ire;
- irb_t *irb;
- ire_t *maybe = NULL;
- int i;
+ if (multirtp != NULL)
+ *multirtp = (ire->ire_flags & RTF_MULTIRT) != 0;
- for (i = 0; i < ipst->ips_ip6_cache_table_size; i++) {
- irb = &ipst->ips_ip_cache_table_v6[i];
- if (irb->irb_ire == NULL)
- continue;
- rw_enter(&irb->irb_lock, RW_READER);
- for (ire = irb->irb_ire; ire; ire = ire->ire_next) {
- if ((ire->ire_marks & IRE_MARK_CONDEMNED) ||
- ire->ire_zoneid != zoneid &&
- ire->ire_zoneid != ALL_ZONES)
- continue;
- switch (ire->ire_type) {
- case IRE_LOOPBACK:
- if (maybe == NULL) {
- IRE_REFHOLD(ire);
- maybe = ire;
- }
- break;
- case IRE_LOCAL:
- if (maybe != NULL) {
- ire_refrele(maybe);
- }
- IRE_REFHOLD(ire);
- rw_exit(&irb->irb_lock);
- return (ire);
- }
- }
- rw_exit(&irb->irb_lock);
- }
- return (maybe);
+ ill = ire_nexthop_ill(ire);
+ ire_refrele(ire);
+ return (ill);
}
/*
@@ -369,6 +292,8 @@ ip_plen_to_mask_v6(uint_t plen, in6_addr_t *bitmask)
if (plen < 0 || plen > IPV6_ABITS)
return (NULL);
*bitmask = ipv6_all_zeros;
+ if (plen == 0)
+ return (bitmask);
ptr = (uint32_t *)bitmask;
while (plen > 32) {
@@ -380,196 +305,78 @@ ip_plen_to_mask_v6(uint_t plen, in6_addr_t *bitmask)
}
/*
- * Add a fully initialized IRE to an appropriate
- * table based on ire_type.
- *
- * The forward table contains IRE_PREFIX/IRE_HOST/IRE_HOST and
- * IRE_IF_RESOLVER/IRE_IF_NORESOLVER and IRE_DEFAULT.
- *
- * The cache table contains IRE_BROADCAST/IRE_LOCAL/IRE_LOOPBACK
- * and IRE_CACHE.
- *
- * NOTE : This function is called as writer though not required
- * by this function.
+ * Add a fully initialized IPv6 IRE to the forwarding table.
+ * This returns NULL on failure, or a held IRE on success.
+ * Normally the returned IRE is the same as the argument. But a different
+ * IRE will be returned if the added IRE is deemed identical to an existing
+ * one. In that case ire_identical_ref will be increased.
+ * The caller always needs to do an ire_refrele() on the returned IRE.
*/
-int
-ire_add_v6(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func)
+ire_t *
+ire_add_v6(ire_t *ire)
{
ire_t *ire1;
int mask_table_index;
irb_t *irb_ptr;
ire_t **irep;
- int flags;
- ire_t *pire = NULL;
- ill_t *stq_ill;
- boolean_t ndp_g_lock_held = B_FALSE;
- ire_t *ire = *ire_p;
+ int match_flags;
int error;
ip_stack_t *ipst = ire->ire_ipst;
- uint_t marks = 0;
ASSERT(ire->ire_ipversion == IPV6_VERSION);
- ASSERT(ire->ire_mp == NULL); /* Calls should go through ire_add */
- ASSERT(ire->ire_nce == NULL);
-
- /*
- * IREs with source addresses hosted on interfaces that are under IPMP
- * should be hidden so that applications don't accidentally end up
- * sending packets with test addresses as their source addresses, or
- * sending out interfaces that are e.g. IFF_INACTIVE. Hide them here.
- * (We let IREs with unspecified source addresses slip through since
- * ire_send_v6() will delete them automatically.)
- */
- if (ire->ire_ipif != NULL && IS_UNDER_IPMP(ire->ire_ipif->ipif_ill) &&
- !IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6)) {
- DTRACE_PROBE1(ipmp__mark__testhidden, ire_t *, ire);
- marks |= IRE_MARK_TESTHIDDEN;
- }
-
- /* Find the appropriate list head. */
- switch (ire->ire_type) {
- case IRE_HOST:
- ire->ire_mask_v6 = ipv6_all_ones;
- ire->ire_masklen = IPV6_ABITS;
- ire->ire_marks |= marks;
- if ((ire->ire_flags & RTF_SETSRC) == 0)
- ire->ire_src_addr_v6 = ipv6_all_zeros;
- break;
- case IRE_CACHE:
- ire->ire_mask_v6 = ipv6_all_ones;
- ire->ire_masklen = IPV6_ABITS;
- ire->ire_marks |= marks;
- break;
- case IRE_LOCAL:
- case IRE_LOOPBACK:
- ire->ire_mask_v6 = ipv6_all_ones;
- ire->ire_masklen = IPV6_ABITS;
- break;
- case IRE_PREFIX:
- case IRE_DEFAULT:
- ire->ire_marks |= marks;
- if ((ire->ire_flags & RTF_SETSRC) == 0)
- ire->ire_src_addr_v6 = ipv6_all_zeros;
- break;
- case IRE_IF_RESOLVER:
- case IRE_IF_NORESOLVER:
- ire->ire_marks |= marks;
- break;
- default:
- printf("ire_add_v6: ire %p has unrecognized IRE type (%d)\n",
- (void *)ire, ire->ire_type);
- ire_delete(ire);
- *ire_p = NULL;
- return (EINVAL);
- }
/* Make sure the address is properly masked. */
V6_MASK_COPY(ire->ire_addr_v6, ire->ire_mask_v6, ire->ire_addr_v6);
- if ((ire->ire_type & IRE_CACHETABLE) == 0) {
- /* IRE goes into Forward Table */
- mask_table_index = ip_mask_to_plen_v6(&ire->ire_mask_v6);
- if ((ipst->ips_ip_forwarding_table_v6[mask_table_index]) ==
- NULL) {
- irb_t *ptr;
- int i;
-
- ptr = (irb_t *)mi_zalloc((
- ipst->ips_ip6_ftable_hash_size * sizeof (irb_t)));
- if (ptr == NULL) {
- ire_delete(ire);
- *ire_p = NULL;
- return (ENOMEM);
- }
- for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) {
- rw_init(&ptr[i].irb_lock, NULL,
- RW_DEFAULT, NULL);
- }
- mutex_enter(&ipst->ips_ire_ft_init_lock);
- if (ipst->ips_ip_forwarding_table_v6[
- mask_table_index] == NULL) {
- ipst->ips_ip_forwarding_table_v6[
- mask_table_index] = ptr;
- mutex_exit(&ipst->ips_ire_ft_init_lock);
- } else {
- /*
- * Some other thread won the race in
- * initializing the forwarding table at the
- * same index.
- */
- mutex_exit(&ipst->ips_ire_ft_init_lock);
- for (i = 0; i < ipst->ips_ip6_ftable_hash_size;
- i++) {
- rw_destroy(&ptr[i].irb_lock);
- }
- mi_free(ptr);
- }
- }
- irb_ptr = &(ipst->ips_ip_forwarding_table_v6[mask_table_index][
- IRE_ADDR_MASK_HASH_V6(ire->ire_addr_v6, ire->ire_mask_v6,
- ipst->ips_ip6_ftable_hash_size)]);
- } else {
- irb_ptr = &(ipst->ips_ip_cache_table_v6[IRE_ADDR_HASH_V6(
- ire->ire_addr_v6, ipst->ips_ip6_cache_table_size)]);
- }
- /*
- * For xresolv interfaces (v6 interfaces with an external
- * address resolver), ip_newroute_v6/ip_newroute_ipif_v6
- * are unable to prevent the deletion of the interface route
- * while adding an IRE_CACHE for an on-link destination
- * in the IRE_IF_RESOLVER case, since the ire has to go to
- * the external resolver and return. We can't do a REFHOLD on the
- * associated interface ire for fear of the message being freed
- * if the external resolver can't resolve the address.
- * Here we look up the interface ire in the forwarding table
- * and make sure that the interface route has not been deleted.
- */
- if (ire->ire_type == IRE_CACHE &&
- IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6) &&
- (((ill_t *)ire->ire_stq->q_ptr)->ill_net_type == IRE_IF_RESOLVER) &&
- (((ill_t *)ire->ire_stq->q_ptr)->ill_flags & ILLF_XRESOLV)) {
+ mask_table_index = ip_mask_to_plen_v6(&ire->ire_mask_v6);
+ if ((ipst->ips_ip_forwarding_table_v6[mask_table_index]) == NULL) {
+ irb_t *ptr;
+ int i;
- pire = ire_ihandle_lookup_onlink_v6(ire);
- if (pire == NULL) {
+ ptr = (irb_t *)mi_zalloc((ipst->ips_ip6_ftable_hash_size *
+ sizeof (irb_t)));
+ if (ptr == NULL) {
ire_delete(ire);
- *ire_p = NULL;
- return (EINVAL);
+ return (NULL);
}
- /* Prevent pire from getting deleted */
- IRB_REFHOLD(pire->ire_bucket);
- /* Has it been removed already? */
- if (pire->ire_marks & IRE_MARK_CONDEMNED) {
- IRB_REFRELE(pire->ire_bucket);
- ire_refrele(pire);
- ire_delete(ire);
- *ire_p = NULL;
- return (EINVAL);
+ for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) {
+ rw_init(&ptr[i].irb_lock, NULL, RW_DEFAULT, NULL);
+ }
+ mutex_enter(&ipst->ips_ire_ft_init_lock);
+ if (ipst->ips_ip_forwarding_table_v6[mask_table_index] ==
+ NULL) {
+ ipst->ips_ip_forwarding_table_v6[mask_table_index] =
+ ptr;
+ mutex_exit(&ipst->ips_ire_ft_init_lock);
+ } else {
+ /*
+ * Some other thread won the race in
+ * initializing the forwarding table at the
+ * same index.
+ */
+ mutex_exit(&ipst->ips_ire_ft_init_lock);
+ for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) {
+ rw_destroy(&ptr[i].irb_lock);
+ }
+ mi_free(ptr);
}
}
+ irb_ptr = &(ipst->ips_ip_forwarding_table_v6[mask_table_index][
+ IRE_ADDR_MASK_HASH_V6(ire->ire_addr_v6, ire->ire_mask_v6,
+ ipst->ips_ip6_ftable_hash_size)]);
- flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW);
+ match_flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW);
+ if (ire->ire_ill != NULL)
+ match_flags |= MATCH_IRE_ILL;
/*
- * For IRE_CACHES, MATCH_IRE_IPIF is not enough to check
- * for duplicates because :
- *
- * 1) ire_ipif->ipif_ill and ire_stq->q_ptr could be
- * pointing at different ills. A real duplicate is
- * a match on both ire_ipif and ire_stq.
- *
- * 2) We could have multiple packets trying to create
- * an IRE_CACHE for the same ill.
- *
- * Rather than looking at the packet, we depend on the above for
- * MATCH_IRE_ILL here.
- *
- * Unlike IPv4, MATCH_IRE_IPIF is needed here as we could have
- * multiple IRE_CACHES for an ill for the same destination
- * with various scoped addresses i.e represented by ipifs.
- *
- * MATCH_IRE_ILL is done implicitly below for IRE_CACHES.
+ * Start the atomic add of the ire. Grab the bucket lock and the
+ * ill lock. Check for condemned.
*/
- if (ire->ire_ipif != NULL)
- flags |= MATCH_IRE_IPIF;
+ error = ire_atomic_start(irb_ptr, ire);
+ if (error != 0) {
+ ire_delete(ire);
+ return (NULL);
+ }
/*
* If we are creating a hidden IRE, make sure we search for
@@ -577,103 +384,36 @@ ire_add_v6(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func)
* Otherwise, we might find an IRE on some other interface
* that's not marked hidden.
*/
- if (ire->ire_marks & IRE_MARK_TESTHIDDEN)
- flags |= MATCH_IRE_MARK_TESTHIDDEN;
-
- /*
- * Start the atomic add of the ire. Grab the ill locks,
- * ill_g_usesrc_lock and the bucket lock. Check for condemned.
- * To avoid lock order problems, get the ndp6.ndp_g_lock now itself.
- */
- if (ire->ire_type == IRE_CACHE) {
- mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
- ndp_g_lock_held = B_TRUE;
- }
-
- /*
- * If ipif or ill is changing ire_atomic_start() may queue the
- * request and return EINPROGRESS.
- */
-
- error = ire_atomic_start(irb_ptr, ire, q, mp, func);
- if (error != 0) {
- if (ndp_g_lock_held)
- mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
- /*
- * We don't know whether it is a valid ipif or not.
- * So, set it to NULL. This assumes that the ire has not added
- * a reference to the ipif.
- */
- ire->ire_ipif = NULL;
- ire_delete(ire);
- if (pire != NULL) {
- IRB_REFRELE(pire->ire_bucket);
- ire_refrele(pire);
- }
- *ire_p = NULL;
- return (error);
- }
- /*
- * To avoid creating ires having stale values for the ire_max_frag
- * we get the latest value atomically here. For more details
- * see the block comment in ip_sioctl_mtu and in DL_NOTE_SDU_CHANGE
- * in ip_rput_dlpi_writer
- */
- if (ire->ire_max_fragp == NULL) {
- if (IN6_IS_ADDR_MULTICAST(&ire->ire_addr_v6))
- ire->ire_max_frag = ire->ire_ipif->ipif_mtu;
- else
- ire->ire_max_frag = pire->ire_max_frag;
- } else {
- uint_t max_frag;
-
- max_frag = *ire->ire_max_fragp;
- ire->ire_max_fragp = NULL;
- ire->ire_max_frag = max_frag;
- }
+ if (ire->ire_testhidden)
+ match_flags |= MATCH_IRE_TESTHIDDEN;
/*
* Atomically check for duplicate and insert in the table.
*/
for (ire1 = irb_ptr->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
- if (ire1->ire_marks & IRE_MARK_CONDEMNED)
+ if (IRE_IS_CONDEMNED(ire1))
continue;
-
- if (ire->ire_type == IRE_CACHE) {
- /*
- * We do MATCH_IRE_ILL implicitly here for IRE_CACHES.
- * As ire_ipif and ire_stq could point to two
- * different ills, we can't pass just ire_ipif to
- * ire_match_args and get a match on both ills.
- * This is just needed for duplicate checks here and
- * so we don't add an extra argument to
- * ire_match_args for this. Do it locally.
- *
- * NOTE : Currently there is no part of the code
- * that asks for both MATH_IRE_IPIF and MATCH_IRE_ILL
- * match for IRE_CACHEs. Thus we don't want to
- * extend the arguments to ire_match_args_v6.
- */
- if (ire1->ire_stq != ire->ire_stq)
- continue;
- /*
- * Multiroute IRE_CACHEs for a given destination can
- * have the same ire_ipif, typically if their source
- * address is forced using RTF_SETSRC, and the same
- * send-to queue. We differentiate them using the parent
- * handle.
- */
- if ((ire1->ire_flags & RTF_MULTIRT) &&
- (ire->ire_flags & RTF_MULTIRT) &&
- (ire1->ire_phandle != ire->ire_phandle))
- continue;
- }
+ /*
+ * Here we need an exact match on zoneid, i.e.,
+ * ire_match_args doesn't fit.
+ */
if (ire1->ire_zoneid != ire->ire_zoneid)
continue;
+
+ if (ire1->ire_type != ire->ire_type)
+ continue;
+
+ /*
+ * Note: We do not allow multiple routes that differ only
+ * in the gateway security attributes; such routes are
+ * considered duplicates.
+ * To change that we explicitly have to treat them as
+ * different here.
+ */
if (ire_match_args_v6(ire1, &ire->ire_addr_v6,
&ire->ire_mask_v6, &ire->ire_gateway_addr_v6,
- ire->ire_type, ire->ire_ipif, ire->ire_zoneid, 0, NULL,
- flags)) {
+ ire->ire_type, ire->ire_ill, ire->ire_zoneid, NULL,
+ match_flags)) {
/*
* Return the old ire after doing a REFHOLD.
* As most of the callers continue to use the IRE
@@ -683,141 +423,25 @@ ire_add_v6(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func)
*/
ip1dbg(("found dup ire existing %p new %p",
(void *)ire1, (void *)ire));
- IRE_REFHOLD(ire1);
- if (ndp_g_lock_held)
- mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
+ ire_refhold(ire1);
+ atomic_add_32(&ire1->ire_identical_ref, 1);
ire_atomic_end(irb_ptr, ire);
ire_delete(ire);
- if (pire != NULL) {
- /*
- * Assert that it is
- * not yet removed from the list.
- */
- ASSERT(pire->ire_ptpn != NULL);
- IRB_REFRELE(pire->ire_bucket);
- ire_refrele(pire);
- }
- *ire_p = ire1;
- return (0);
+ return (ire1);
}
}
- if (ire->ire_type == IRE_CACHE) {
- const in6_addr_t *addr_v6;
- ill_t *ill = ire_to_ill(ire);
- char buf[INET6_ADDRSTRLEN];
- nce_t *nce;
- /*
- * All IRE_CACHE types must have a nce. If this is
- * not the case the entry will not be added. We need
- * to make sure that if somebody deletes the nce
- * after we looked up, they will find this ire and
- * delete the ire. To delete this ire one needs the
- * bucket lock which we are still holding here. So,
- * even if the nce gets deleted after we looked up,
- * this ire will get deleted.
- *
- * NOTE : Don't need the ire_lock for accessing
- * ire_gateway_addr_v6 as it is appearing first
- * time on the list and rts_setgwr_v6 could not
- * be changing this.
- */
- addr_v6 = &ire->ire_gateway_addr_v6;
- if (IN6_IS_ADDR_UNSPECIFIED(addr_v6))
- addr_v6 = &ire->ire_addr_v6;
-
- /* nce fastpath is per-ill; don't match across illgrp */
- nce = ndp_lookup_v6(ill, B_FALSE, addr_v6, B_TRUE);
- if (nce == NULL)
- goto failed;
-
- /* Pair of refhold, refrele just to get the tracing right */
- NCE_REFHOLD_TO_REFHOLD_NOTR(nce);
- /*
- * Atomically make sure that new IREs don't point
- * to an NCE that is logically deleted (CONDEMNED).
- * ndp_delete() first marks the NCE CONDEMNED.
- * This ensures that the nce_refcnt won't increase
- * due to new nce_lookups or due to addition of new IREs
- * pointing to this NCE. Then ndp_delete() cleans up
- * existing references. If we don't do it atomically here,
- * ndp_delete() -> nce_ire_delete() will not be able to
- * clean up the IRE list completely, and the nce_refcnt
- * won't go down to zero.
- */
- mutex_enter(&nce->nce_lock);
- if (ill->ill_flags & ILLF_XRESOLV) {
- /*
- * If we used an external resolver, we may not
- * have gone through neighbor discovery to get here.
- * Must update the nce_state before the next check.
- */
- if (nce->nce_state == ND_INCOMPLETE)
- nce->nce_state = ND_REACHABLE;
- }
- if (nce->nce_state == ND_INCOMPLETE ||
- (nce->nce_flags & NCE_F_CONDEMNED) ||
- (nce->nce_state == ND_UNREACHABLE)) {
-failed:
- if (ndp_g_lock_held)
- mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
- if (nce != NULL)
- mutex_exit(&nce->nce_lock);
- ire_atomic_end(irb_ptr, ire);
- ip1dbg(("ire_add_v6: No nce for dst %s \n",
- inet_ntop(AF_INET6, &ire->ire_addr_v6,
- buf, sizeof (buf))));
- ire_delete(ire);
- if (pire != NULL) {
- /*
- * Assert that it is
- * not yet removed from the list.
- */
- ASSERT(pire->ire_ptpn != NULL);
- IRB_REFRELE(pire->ire_bucket);
- ire_refrele(pire);
- }
- if (nce != NULL)
- NCE_REFRELE_NOTR(nce);
- *ire_p = NULL;
- return (EINVAL);
- } else {
- ire->ire_nce = nce;
- }
- mutex_exit(&nce->nce_lock);
- }
/*
- * Find the first entry that matches ire_addr - provides
- * tail insertion. *irep will be null if no match.
+ * Normally we do head insertion since most things do not care about
+ * the order of the IREs in the bucket.
+ * However, due to shared-IP zones (and restrict_interzone_loopback)
+ * we can have an IRE_LOCAL as well as IRE_IF_CLONE for the same
+ * address. For that reason we do tail insertion for IRE_IF_CLONE.
*/
irep = (ire_t **)irb_ptr;
- while ((ire1 = *irep) != NULL &&
- !IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, &ire1->ire_addr_v6))
- irep = &ire1->ire_next;
- ASSERT(!(ire->ire_type & IRE_BROADCAST));
-
- if (*irep != NULL) {
- /*
- * Find the last ire which matches ire_addr_v6.
- * Needed to do tail insertion among entries with the same
- * ire_addr_v6.
- */
- while (IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6,
- &ire1->ire_addr_v6)) {
+ if (ire->ire_type & IRE_IF_CLONE) {
+ while ((ire1 = *irep) != NULL)
irep = &ire1->ire_next;
- ire1 = *irep;
- if (ire1 == NULL)
- break;
- }
- }
-
- if (ire->ire_type == IRE_DEFAULT) {
- /*
- * We keep a count of default gateways which is used when
- * assigning them as routes.
- */
- ipst->ips_ipv6_ire_default_count++;
- ASSERT(ipst->ips_ipv6_ire_default_count != 0); /* Wraparound */
}
/* Insert at *irep */
ire1 = *irep;
@@ -852,62 +476,22 @@ failed:
* in the list for the first time and no one else can bump
* up the reference count on this yet.
*/
- IRE_REFHOLD_LOCKED(ire);
+ ire_refhold_locked(ire);
BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_inserted);
irb_ptr->irb_ire_cnt++;
- if (ire->ire_marks & IRE_MARK_TEMPORARY)
- irb_ptr->irb_tmp_ire_cnt++;
- if (ire->ire_ipif != NULL) {
- DTRACE_PROBE3(ipif__incr__cnt, (ipif_t *), ire->ire_ipif,
+ if (ire->ire_ill != NULL) {
+ DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ire->ire_ill,
(char *), "ire", (void *), ire);
- ire->ire_ipif->ipif_ire_cnt++;
- if (ire->ire_stq != NULL) {
- stq_ill = (ill_t *)ire->ire_stq->q_ptr;
- DTRACE_PROBE3(ill__incr__cnt, (ill_t *), stq_ill,
- (char *), "ire", (void *), ire);
- stq_ill->ill_ire_cnt++;
- }
- } else {
- ASSERT(ire->ire_stq == NULL);
+ ire->ire_ill->ill_ire_cnt++;
+ ASSERT(ire->ire_ill->ill_ire_cnt != 0); /* Wraparound */
}
-
- if (ndp_g_lock_held)
- mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
ire_atomic_end(irb_ptr, ire);
- if (pire != NULL) {
- /* Assert that it is not removed from the list yet */
- ASSERT(pire->ire_ptpn != NULL);
- IRB_REFRELE(pire->ire_bucket);
- ire_refrele(pire);
- }
-
- if (ire->ire_type != IRE_CACHE) {
- /*
- * For ire's with with host mask see if there is an entry
- * in the cache. If there is one flush the whole cache as
- * there might be multiple entries due to RTF_MULTIRT (CGTP).
- * If no entry is found than there is no need to flush the
- * cache.
- */
-
- if (ip_mask_to_plen_v6(&ire->ire_mask_v6) == IPV6_ABITS) {
- ire_t *lire;
- lire = ire_ctable_lookup_v6(&ire->ire_addr_v6, NULL,
- IRE_CACHE, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE,
- ipst);
- if (lire != NULL) {
- ire_refrele(lire);
- ire_flush_cache_v6(ire, IRE_FLUSH_ADD);
- }
- } else {
- ire_flush_cache_v6(ire, IRE_FLUSH_ADD);
- }
- }
+ /* Make any caching of the IREs be notified or updated */
+ ire_flush_cache_v6(ire, IRE_FLUSH_ADD);
- *ire_p = ire;
- return (0);
+ return (ire);
}
/*
@@ -931,7 +515,7 @@ ire_delete_host_redirects_v6(const in6_addr_t *gateway, ip_stack_t *ipst)
return;
for (i = 0; (i < ipst->ips_ip6_ftable_hash_size); i++) {
irb = &irb_ptr[i];
- IRB_REFHOLD(irb);
+ irb_refhold(irb);
for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
if (!(ire->ire_flags & RTF_DYNAMIC))
continue;
@@ -941,50 +525,11 @@ ire_delete_host_redirects_v6(const in6_addr_t *gateway, ip_stack_t *ipst)
if (IN6_ARE_ADDR_EQUAL(&gw_addr_v6, gateway))
ire_delete(ire);
}
- IRB_REFRELE(irb);
+ irb_refrele(irb);
}
}
/*
- * Delete all the cache entries with this 'addr'. This is the IPv6 counterpart
- * of ip_ire_clookup_and_delete. The difference being this function does not
- * return any value. IPv6 processing of a gratuitous ARP, as it stands, is
- * different than IPv4 in that, regardless of the presence of a cache entry
- * for this address, an ire_walk_v6 is done. Another difference is that unlike
- * in the case of IPv4 this does not take an ipif_t argument, since it is only
- * called by ip_arp_news and the match is always only on the address.
- */
-void
-ip_ire_clookup_and_delete_v6(const in6_addr_t *addr, ip_stack_t *ipst)
-{
- irb_t *irb;
- ire_t *cire;
- boolean_t found = B_FALSE;
-
- irb = &ipst->ips_ip_cache_table_v6[IRE_ADDR_HASH_V6(*addr,
- ipst->ips_ip6_cache_table_size)];
- IRB_REFHOLD(irb);
- for (cire = irb->irb_ire; cire != NULL; cire = cire->ire_next) {
- if (cire->ire_marks & IRE_MARK_CONDEMNED)
- continue;
- if (IN6_ARE_ADDR_EQUAL(&cire->ire_addr_v6, addr)) {
-
- /* This signifies start of a match */
- if (!found)
- found = B_TRUE;
- if (cire->ire_type == IRE_CACHE) {
- if (cire->ire_nce != NULL)
- ndp_delete(cire->ire_nce);
- ire_delete_v6(cire);
- }
- /* End of the match */
- } else if (found)
- break;
- }
- IRB_REFRELE(irb);
-}
-
-/*
* Delete the specified IRE.
* All calls should use ire_delete().
* Sometimes called as writer though not required by this function.
@@ -998,11 +543,20 @@ ire_delete_v6(ire_t *ire)
in6_addr_t gw_addr_v6;
ip_stack_t *ipst = ire->ire_ipst;
+ /*
+ * Make sure ire_generation increases from ire_flush_cache happen
+ * after any lookup/reader has read ire_generation.
+ * Since the rw_enter makes us wait until any lookup/reader has
+ * completed we can exit the lock immediately.
+ */
+ rw_enter(&ipst->ips_ip6_ire_head_lock, RW_WRITER);
+ rw_exit(&ipst->ips_ip6_ire_head_lock);
+
ASSERT(ire->ire_refcnt >= 1);
ASSERT(ire->ire_ipversion == IPV6_VERSION);
- if (ire->ire_type != IRE_CACHE)
- ire_flush_cache_v6(ire, IRE_FLUSH_DELETE);
+ ire_flush_cache_v6(ire, IRE_FLUSH_DELETE);
+
if (ire->ire_type == IRE_DEFAULT) {
/*
* when a default gateway is going away
@@ -1014,368 +568,284 @@ ire_delete_v6(ire_t *ire)
mutex_exit(&ire->ire_lock);
ire_delete_host_redirects_v6(&gw_addr_v6, ipst);
}
-}
-
-/*
- * ire_walk routine to delete all IRE_CACHE and IRE_HOST type redirect
- * entries.
- */
-/*ARGSUSED1*/
-void
-ire_delete_cache_v6(ire_t *ire, char *arg)
-{
- char addrstr1[INET6_ADDRSTRLEN];
- char addrstr2[INET6_ADDRSTRLEN];
-
- if ((ire->ire_type & IRE_CACHE) ||
- (ire->ire_flags & RTF_DYNAMIC)) {
- ip1dbg(("ire_delete_cache_v6: deleted %s type %d through %s\n",
- inet_ntop(AF_INET6, &ire->ire_addr_v6,
- addrstr1, sizeof (addrstr1)),
- ire->ire_type,
- inet_ntop(AF_INET6, &ire->ire_gateway_addr_v6,
- addrstr2, sizeof (addrstr2))));
- ire_delete(ire);
- }
-
-}
-/*
- * ire_walk routine to delete all IRE_CACHE/IRE_HOST type redirect entries
- * that have a given gateway address.
- */
-void
-ire_delete_cache_gw_v6(ire_t *ire, char *addr)
-{
- in6_addr_t *gw_addr = (in6_addr_t *)addr;
- char buf1[INET6_ADDRSTRLEN];
- char buf2[INET6_ADDRSTRLEN];
- in6_addr_t ire_gw_addr_v6;
-
- if (!(ire->ire_type & IRE_CACHE) &&
- !(ire->ire_flags & RTF_DYNAMIC))
- return;
-
- mutex_enter(&ire->ire_lock);
- ire_gw_addr_v6 = ire->ire_gateway_addr_v6;
- mutex_exit(&ire->ire_lock);
+ /*
+ * If we are deleting an IRE_INTERFACE then we make sure we also
+ * delete any IRE_IF_CLONE that has been created from it.
+ * Those are always in ire_dep_children.
+ */
+ if ((ire->ire_type & IRE_INTERFACE) && ire->ire_dep_children != 0)
+ ire_dep_delete_if_clone(ire);
- if (IN6_ARE_ADDR_EQUAL(&ire_gw_addr_v6, gw_addr)) {
- ip1dbg(("ire_delete_cache_gw_v6: deleted %s type %d to %s\n",
- inet_ntop(AF_INET6, &ire->ire_src_addr_v6,
- buf1, sizeof (buf1)),
- ire->ire_type,
- inet_ntop(AF_INET6, &ire_gw_addr_v6,
- buf2, sizeof (buf2))));
- ire_delete(ire);
+ /* Remove from parent dependencies and child */
+ rw_enter(&ipst->ips_ire_dep_lock, RW_WRITER);
+ if (ire->ire_dep_parent != NULL) {
+ ire_dep_remove(ire);
}
+ while (ire->ire_dep_children != NULL)
+ ire_dep_remove(ire->ire_dep_children);
+ rw_exit(&ipst->ips_ire_dep_lock);
}
/*
- * Remove all IRE_CACHE entries that match
- * the ire specified. (Sometimes called
- * as writer though not required by this function.)
- *
- * The flag argument indicates if the
- * flush request is due to addition
- * of new route (IRE_FLUSH_ADD) or deletion of old
- * route (IRE_FLUSH_DELETE).
+ * When an IRE is added or deleted this routine is called to make sure
+ * any caching of IRE information is notified or updated.
*
- * This routine takes only the IREs from the forwarding
- * table and flushes the corresponding entries from
- * the cache table.
- *
- * When flushing due to the deletion of an old route, it
- * just checks the cache handles (ire_phandle and ire_ihandle) and
- * deletes the ones that match.
- *
- * When flushing due to the creation of a new route, it checks
- * if a cache entry's address matches the one in the IRE and
- * that the cache entry's parent has a less specific mask than the
- * one in IRE. The destination of such a cache entry could be the
- * gateway for other cache entries, so we need to flush those as
- * well by looking for gateway addresses matching the IRE's address.
+ * The flag argument indicates if the flush request is due to addition
+ * of new route (IRE_FLUSH_ADD), deletion of old route (IRE_FLUSH_DELETE),
+ * or a change to ire_gateway_addr (IRE_FLUSH_GWCHANGE).
*/
void
ire_flush_cache_v6(ire_t *ire, int flag)
{
- int i;
- ire_t *cire;
- irb_t *irb;
- ip_stack_t *ipst = ire->ire_ipst;
+ ip_stack_t *ipst = ire->ire_ipst;
- if (ire->ire_type & IRE_CACHE)
+ /*
+ * IRE_IF_CLONE ire's don't provide any new information
+ * than the parent from which they are cloned, so don't
+ * perturb the generation numbers.
+ */
+ if (ire->ire_type & IRE_IF_CLONE)
return;
/*
- * If a default is just created, there is no point
- * in going through the cache, as there will not be any
- * cached ires.
+ * Ensure that an ire_add during a lookup serializes the updates of
+ * the generation numbers under ire_head_lock so that the lookup gets
+ * either the old ire and old generation number, or a new ire and new
+ * generation number.
+ */
+ rw_enter(&ipst->ips_ip6_ire_head_lock, RW_WRITER);
+
+ /*
+ * If a route was just added, we need to notify everybody that
+ * has cached an IRE_NOROUTE since there might now be a better
+ * route for them.
*/
- if (ire->ire_type == IRE_DEFAULT && flag == IRE_FLUSH_ADD)
- return;
if (flag == IRE_FLUSH_ADD) {
+ ire_increment_generation(ipst->ips_ire_reject_v6);
+ ire_increment_generation(ipst->ips_ire_blackhole_v6);
+ }
+
+ /* Adding a default can't otherwise provide a better route */
+ if (ire->ire_type == IRE_DEFAULT && flag == IRE_FLUSH_ADD) {
+ rw_exit(&ipst->ips_ip6_ire_head_lock);
+ return;
+ }
+
+ switch (flag) {
+ case IRE_FLUSH_DELETE:
+ case IRE_FLUSH_GWCHANGE:
/*
- * This selective flush is
- * due to the addition of
- * new IRE.
+ * Update ire_generation for all ire_dep_children chains
+ * starting with this IRE
*/
- for (i = 0; i < ipst->ips_ip6_cache_table_size; i++) {
- irb = &ipst->ips_ip_cache_table_v6[i];
- if ((cire = irb->irb_ire) == NULL)
- continue;
- IRB_REFHOLD(irb);
- for (cire = irb->irb_ire; cire != NULL;
- cire = cire->ire_next) {
- if (cire->ire_type != IRE_CACHE)
- continue;
- /*
- * If 'cire' belongs to the same subnet
- * as the new ire being added, and 'cire'
- * is derived from a prefix that is less
- * specific than the new ire being added,
- * we need to flush 'cire'; for instance,
- * when a new interface comes up.
- */
- if ((V6_MASK_EQ_2(cire->ire_addr_v6,
- ire->ire_mask_v6, ire->ire_addr_v6) &&
- (ip_mask_to_plen_v6(&cire->ire_cmask_v6) <=
- ire->ire_masklen))) {
- ire_delete(cire);
- continue;
- }
- /*
- * This is the case when the ire_gateway_addr
- * of 'cire' belongs to the same subnet as
- * the new ire being added.
- * Flushing such ires is sometimes required to
- * avoid misrouting: say we have a machine with
- * two interfaces (I1 and I2), a default router
- * R on the I1 subnet, and a host route to an
- * off-link destination D with a gateway G on
- * the I2 subnet.
- * Under normal operation, we will have an
- * on-link cache entry for G and an off-link
- * cache entry for D with G as ire_gateway_addr,
- * traffic to D will reach its destination
- * through gateway G.
- * If the administrator does 'ifconfig I2 down',
- * the cache entries for D and G will be
- * flushed. However, G will now be resolved as
- * an off-link destination using R (the default
- * router) as gateway. Then D will also be
- * resolved as an off-link destination using G
- * as gateway - this behavior is due to
- * compatibility reasons, see comment in
- * ire_ihandle_lookup_offlink(). Traffic to D
- * will go to the router R and probably won't
- * reach the destination.
- * The administrator then does 'ifconfig I2 up'.
- * Since G is on the I2 subnet, this routine
- * will flush its cache entry. It must also
- * flush the cache entry for D, otherwise
- * traffic will stay misrouted until the IRE
- * times out.
- */
- if (V6_MASK_EQ_2(cire->ire_gateway_addr_v6,
- ire->ire_mask_v6, ire->ire_addr_v6)) {
- ire_delete(cire);
- continue;
- }
- }
- IRB_REFRELE(irb);
- }
- } else {
+ ire_dep_incr_generation(ire);
+ break;
+ case IRE_FLUSH_ADD: {
+ in6_addr_t addr;
+ in6_addr_t mask;
+ ip_stack_t *ipst = ire->ire_ipst;
+ uint_t masklen;
+
/*
- * delete the cache entries based on
- * handle in the IRE as this IRE is
- * being deleted/changed.
+ * Find an IRE which is a shorter match than the ire to be added
+ * For any such IRE (which we repeat) we update the
+ * ire_generation the same way as in the delete case.
*/
- for (i = 0; i < ipst->ips_ip6_cache_table_size; i++) {
- irb = &ipst->ips_ip_cache_table_v6[i];
- if ((cire = irb->irb_ire) == NULL)
- continue;
- IRB_REFHOLD(irb);
- for (cire = irb->irb_ire; cire != NULL;
- cire = cire->ire_next) {
- if (cire->ire_type != IRE_CACHE)
- continue;
- if ((cire->ire_phandle == 0 ||
- cire->ire_phandle != ire->ire_phandle) &&
- (cire->ire_ihandle == 0 ||
- cire->ire_ihandle != ire->ire_ihandle))
- continue;
- ire_delete(cire);
- }
- IRB_REFRELE(irb);
+ addr = ire->ire_addr_v6;
+ mask = ire->ire_mask_v6;
+ masklen = ip_mask_to_plen_v6(&mask);
+
+ ire = ire_ftable_lookup_impl_v6(&addr, &mask, NULL, 0, NULL,
+ ALL_ZONES, NULL, MATCH_IRE_SHORTERMASK, ipst);
+ while (ire != NULL) {
+ /* We need to handle all in the same bucket */
+ irb_increment_generation(ire->ire_bucket);
+
+ mask = ire->ire_mask_v6;
+ ASSERT(masklen > ip_mask_to_plen_v6(&mask));
+ masklen = ip_mask_to_plen_v6(&mask);
+ ire_refrele(ire);
+ ire = ire_ftable_lookup_impl_v6(&addr, &mask, NULL, 0,
+ NULL, ALL_ZONES, NULL, MATCH_IRE_SHORTERMASK, ipst);
+ }
}
+ break;
}
+ rw_exit(&ipst->ips_ip6_ire_head_lock);
}
/*
* Matches the arguments passed with the values in the ire.
*
- * Note: for match types that match using "ipif" passed in, ipif
+ * Note: for match types that match using "ill" passed in, ill
* must be checked for non-NULL before calling this routine.
*/
-static boolean_t
+boolean_t
ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, const in6_addr_t *mask,
- const in6_addr_t *gateway, int type, const ipif_t *ipif, zoneid_t zoneid,
- uint32_t ihandle, const ts_label_t *tsl, int match_flags)
+ const in6_addr_t *gateway, int type, const ill_t *ill, zoneid_t zoneid,
+ const ts_label_t *tsl, int match_flags)
{
in6_addr_t masked_addr;
in6_addr_t gw_addr_v6;
ill_t *ire_ill = NULL, *dst_ill;
- ill_t *ipif_ill = NULL;
- ipif_t *src_ipif;
+ ip_stack_t *ipst = ire->ire_ipst;
ASSERT(ire->ire_ipversion == IPV6_VERSION);
ASSERT(addr != NULL);
ASSERT(mask != NULL);
ASSERT((!(match_flags & MATCH_IRE_GW)) || gateway != NULL);
ASSERT((!(match_flags & MATCH_IRE_ILL)) ||
- (ipif != NULL && ipif->ipif_isv6));
+ (ill != NULL && ill->ill_isv6));
/*
- * If MATCH_IRE_MARK_TESTHIDDEN is set, then only return the IRE if it
- * is in fact hidden, to ensure the caller gets the right one. One
- * exception: if the caller passed MATCH_IRE_IHANDLE, then they
- * already know the identity of the given IRE_INTERFACE entry and
- * there's no point trying to hide it from them.
+ * If MATCH_IRE_TESTHIDDEN is set, then only return the IRE if it
+ * is in fact hidden, to ensure the caller gets the right one.
*/
- if (ire->ire_marks & IRE_MARK_TESTHIDDEN) {
- if (match_flags & MATCH_IRE_IHANDLE)
- match_flags |= MATCH_IRE_MARK_TESTHIDDEN;
-
- if (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN))
+ if (ire->ire_testhidden) {
+ if (!(match_flags & MATCH_IRE_TESTHIDDEN))
return (B_FALSE);
}
if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid &&
ire->ire_zoneid != ALL_ZONES) {
/*
- * If MATCH_IRE_ZONEONLY has been set and the supplied zoneid is
- * valid and does not match that of ire_zoneid, a failure to
+ * If MATCH_IRE_ZONEONLY has been set and the supplied zoneid
+ * does not match that of ire_zoneid, a failure to
* match is reported at this point. Otherwise, since some IREs
* that are available in the global zone can be used in local
* zones, additional checks need to be performed:
*
- * IRE_CACHE and IRE_LOOPBACK entries should
- * never be matched in this situation.
+ * IRE_LOOPBACK
+ * entries should never be matched in this situation.
+ * Each zone has its own IRE_LOOPBACK.
*
- * IRE entries that have an interface associated with them
- * should in general not match unless they are an IRE_LOCAL
- * or in the case when MATCH_IRE_DEFAULT has been set in
- * the caller. In the case of the former, checking of the
- * other fields supplied should take place.
+ * IRE_LOCAL
+ * We allow them for any zoneid. ire_route_recursive
+ * does additional checks when
+ * ip_restrict_interzone_loopback is set.
*
- * In the case where MATCH_IRE_DEFAULT has been set,
- * all of the ipif's associated with the IRE's ill are
- * checked to see if there is a matching zoneid. If any
- * one ipif has a matching zoneid, this IRE is a
- * potential candidate so checking of the other fields
- * takes place.
+ * If ill_usesrc_ifindex is set
+ * Then we check if the zone has a valid source address
+ * on the usesrc ill.
*
- * In the case where the IRE_INTERFACE has a usable source
- * address (indicated by ill_usesrc_ifindex) in the
- * correct zone then it's permitted to return this IRE
+ * If ire_ill is set, then check that the zone has an ipif
+ * on that ill.
+ *
+ * Outside of this function (in ire_round_robin) we check
+ * that any IRE_OFFLINK has a gateway that reachable from the
+ * zone when we have multiple choices (ECMP).
*/
if (match_flags & MATCH_IRE_ZONEONLY)
return (B_FALSE);
- if (ire->ire_type & (IRE_CACHE | IRE_LOOPBACK))
+ if (ire->ire_type & IRE_LOOPBACK)
return (B_FALSE);
+
+ if (ire->ire_type & IRE_LOCAL)
+ goto matchit;
+
/*
- * Note, IRE_INTERFACE can have the stq as NULL. For
- * example, if the default multicast route is tied to
- * the loopback address.
+ * The normal case of IRE_ONLINK has a matching zoneid.
+ * Here we handle the case when shared-IP zones have been
+ * configured with IP addresses on vniN. In that case it
+ * is ok for traffic from a zone to use IRE_ONLINK routes
+ * if the ill has a usesrc pointing at vniN
+ * Applies to IRE_INTERFACE.
*/
- if ((ire->ire_type & IRE_INTERFACE) &&
- (ire->ire_stq != NULL)) {
- dst_ill = (ill_t *)ire->ire_stq->q_ptr;
+ dst_ill = ire->ire_ill;
+ if (ire->ire_type & IRE_ONLINK) {
+ uint_t ifindex;
+
+ /*
+ * Note there is no IRE_INTERFACE on vniN thus
+ * can't do an IRE lookup for a matching route.
+ */
+ ifindex = dst_ill->ill_usesrc_ifindex;
+ if (ifindex == 0)
+ return (B_FALSE);
+
/*
* If there is a usable source address in the
- * zone, then it's ok to return an
- * IRE_INTERFACE
+ * zone, then it's ok to return this IRE_INTERFACE
*/
- if ((dst_ill->ill_usesrc_ifindex != 0) &&
- (src_ipif = ipif_select_source_v6(dst_ill, addr,
- B_FALSE, IPV6_PREFER_SRC_DEFAULT, zoneid))
- != NULL) {
- ip3dbg(("ire_match_args: src_ipif %p"
- " dst_ill %p", (void *)src_ipif,
- (void *)dst_ill));
- ipif_refrele(src_ipif);
- } else {
- ip3dbg(("ire_match_args: src_ipif NULL"
+ if (!ipif_zone_avail(ifindex, dst_ill->ill_isv6,
+ zoneid, ipst)) {
+ ip3dbg(("ire_match_args: no usrsrc for zone"
" dst_ill %p\n", (void *)dst_ill));
return (B_FALSE);
}
}
- if (ire->ire_ipif != NULL && ire->ire_type != IRE_LOCAL &&
- !(ire->ire_type & IRE_INTERFACE)) {
+ /*
+ * For example, with
+ * route add 11.0.0.0 gw1 -ifp bge0
+ * route add 11.0.0.0 gw2 -ifp bge1
+ * this code would differentiate based on
+ * where the sending zone has addresses.
+ * Only if the zone has an address on bge0 can it use the first
+ * route. It isn't clear if this behavior is documented
+ * anywhere.
+ */
+ if (dst_ill != NULL && (ire->ire_type & IRE_OFFLINK)) {
ipif_t *tipif;
- if ((match_flags & MATCH_IRE_DEFAULT) == 0)
- return (B_FALSE);
- mutex_enter(&ire->ire_ipif->ipif_ill->ill_lock);
- for (tipif = ire->ire_ipif->ipif_ill->ill_ipif;
+ mutex_enter(&dst_ill->ill_lock);
+ for (tipif = dst_ill->ill_ipif;
tipif != NULL; tipif = tipif->ipif_next) {
- if (IPIF_CAN_LOOKUP(tipif) &&
+ if (!IPIF_IS_CONDEMNED(tipif) &&
(tipif->ipif_flags & IPIF_UP) &&
(tipif->ipif_zoneid == zoneid ||
tipif->ipif_zoneid == ALL_ZONES))
break;
}
- mutex_exit(&ire->ire_ipif->ipif_ill->ill_lock);
+ mutex_exit(&dst_ill->ill_lock);
if (tipif == NULL)
return (B_FALSE);
}
}
+matchit:
if (match_flags & MATCH_IRE_GW) {
mutex_enter(&ire->ire_lock);
gw_addr_v6 = ire->ire_gateway_addr_v6;
mutex_exit(&ire->ire_lock);
}
-
- /*
- * For IRE_CACHE entries, MATCH_IRE_ILL means that somebody wants to
- * send out ire_stq (ire_ipif for IRE_CACHE entries is just the means
- * of getting a source address -- i.e., ire_src_addr_v6 ==
- * ire->ire_ipif->ipif_v6src_addr). ire_to_ill() handles this.
- *
- * NOTE: For IPMP, MATCH_IRE_ILL usually matches any ill in the group.
- * However, if MATCH_IRE_MARK_TESTHIDDEN is set (i.e., the IRE is for
- * IPMP test traffic), then the ill must match exactly.
- */
if (match_flags & MATCH_IRE_ILL) {
- ire_ill = ire_to_ill(ire);
- ipif_ill = ipif->ipif_ill;
- }
+ ire_ill = ire->ire_ill;
+ /*
+ * If asked to match an ill, we *must* match
+ * on the ire_ill for ipmp test addresses, or
+ * any of the ill in the group for data addresses.
+ * If we don't, we may as well fail.
+ * However, we need an exception for IRE_LOCALs to ensure
+ * we loopback packets even sent to test addresses on different
+ * interfaces in the group.
+ */
+ if ((match_flags & MATCH_IRE_TESTHIDDEN) &&
+ !(ire->ire_type & IRE_LOCAL)) {
+ if (ire->ire_ill != ill)
+ return (B_FALSE);
+ } else {
+ match_flags &= ~MATCH_IRE_TESTHIDDEN;
+ /*
+ * We know that ill is not NULL, but ire_ill could be
+ * NULL
+ */
+ if (ire_ill == NULL || !IS_ON_SAME_LAN(ill, ire_ill))
+ return (B_FALSE);
+ }
+ }
/* No ire_addr_v6 bits set past the mask */
ASSERT(V6_MASK_EQ(ire->ire_addr_v6, ire->ire_mask_v6,
ire->ire_addr_v6));
V6_MASK_COPY(*addr, *mask, masked_addr);
-
if (V6_MASK_EQ(*addr, *mask, ire->ire_addr_v6) &&
((!(match_flags & MATCH_IRE_GW)) ||
IN6_ARE_ADDR_EQUAL(&gw_addr_v6, gateway)) &&
- ((!(match_flags & MATCH_IRE_TYPE)) ||
- (ire->ire_type & type)) &&
- ((!(match_flags & MATCH_IRE_SRC)) ||
- IN6_ARE_ADDR_EQUAL(&ire->ire_src_addr_v6,
- &ipif->ipif_v6src_addr)) &&
- ((!(match_flags & MATCH_IRE_IPIF)) ||
- (ire->ire_ipif == ipif)) &&
- ((!(match_flags & MATCH_IRE_MARK_TESTHIDDEN)) ||
- (ire->ire_marks & IRE_MARK_TESTHIDDEN)) &&
- ((!(match_flags & MATCH_IRE_ILL)) ||
- (ire_ill == ipif_ill ||
- (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN) &&
- ire_ill != NULL && IS_IN_SAME_ILLGRP(ipif_ill, ire_ill)))) &&
- ((!(match_flags & MATCH_IRE_IHANDLE)) ||
- (ire->ire_ihandle == ihandle)) &&
+ ((!(match_flags & MATCH_IRE_TYPE)) || (ire->ire_type & type)) &&
+ ((!(match_flags & MATCH_IRE_TESTHIDDEN)) || ire->ire_testhidden) &&
+ ((!(match_flags & MATCH_IRE_MASK)) ||
+ (IN6_ARE_ADDR_EQUAL(&ire->ire_mask_v6, mask))) &&
((!(match_flags & MATCH_IRE_SECATTR)) ||
(!is_system_labeled()) ||
(tsol_ire_match_gwattr(ire, tsl) == 0))) {
@@ -1386,41 +856,38 @@ ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, const in6_addr_t *mask,
}
/*
- * Lookup for a route in all the tables
+ * Check if the zoneid (not ALL_ZONES) has an IRE_INTERFACE for the specified
+ * gateway address. If ill is non-NULL we also match on it.
+ * The caller must hold a read lock on RADIX_NODE_HEAD if lock_held is set.
*/
-ire_t *
-ire_route_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask,
- const in6_addr_t *gateway, int type, const ipif_t *ipif, ire_t **pire,
- zoneid_t zoneid, const ts_label_t *tsl, int flags, ip_stack_t *ipst)
+boolean_t
+ire_gateway_ok_zone_v6(const in6_addr_t *gateway, zoneid_t zoneid, ill_t *ill,
+ const ts_label_t *tsl, ip_stack_t *ipst, boolean_t lock_held)
{
- ire_t *ire = NULL;
+ ire_t *ire;
+ uint_t match_flags;
- /*
- * ire_match_args_v6() will dereference ipif MATCH_IRE_SRC or
- * MATCH_IRE_ILL is set.
- */
- if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && (ipif == NULL))
- return (NULL);
+ if (lock_held)
+ ASSERT(RW_READ_HELD(&ipst->ips_ip6_ire_head_lock));
+ else
+ rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER);
- /*
- * might be asking for a cache lookup,
- * This is not best way to lookup cache,
- * user should call ire_cache_lookup directly.
- *
- * If MATCH_IRE_TYPE was set, first lookup in the cache table and then
- * in the forwarding table, if the applicable type flags were set.
- */
- if ((flags & MATCH_IRE_TYPE) == 0 || (type & IRE_CACHETABLE) != 0) {
- ire = ire_ctable_lookup_v6(addr, gateway, type, ipif, zoneid,
- tsl, flags, ipst);
- if (ire != NULL)
- return (ire);
- }
- if ((flags & MATCH_IRE_TYPE) == 0 || (type & IRE_FORWARDTABLE) != 0) {
- ire = ire_ftable_lookup_v6(addr, mask, gateway, type, ipif,
- pire, zoneid, 0, tsl, flags, ipst);
+ match_flags = MATCH_IRE_TYPE | MATCH_IRE_SECATTR;
+ if (ill != NULL)
+ match_flags |= MATCH_IRE_ILL;
+
+ ire = ire_ftable_lookup_impl_v6(gateway, &ipv6_all_zeros,
+ &ipv6_all_zeros, IRE_INTERFACE, ill, zoneid, tsl, match_flags,
+ ipst);
+
+ if (!lock_held)
+ rw_exit(&ipst->ips_ip6_ire_head_lock);
+ if (ire != NULL) {
+ ire_refrele(ire);
+ return (B_TRUE);
+ } else {
+ return (B_FALSE);
}
- return (ire);
}
/*
@@ -1429,63 +896,121 @@ ire_route_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask,
* required parameters and indicating the
* match required in flag field.
*
- * Looking for default route can be done in three ways
- * 1) pass mask as ipv6_all_zeros and set MATCH_IRE_MASK in flags field
- * along with other matches.
- * 2) pass type as IRE_DEFAULT and set MATCH_IRE_TYPE in flags
- * field along with other matches.
- * 3) if the destination and mask are passed as zeros.
- *
- * A request to return a default route if no route
- * is found, can be specified by setting MATCH_IRE_DEFAULT
- * in flags.
- *
- * It does not support recursion more than one level. It
- * will do recursive lookup only when the lookup maps to
- * a prefix or default route and MATCH_IRE_RECURSIVE flag is passed.
- *
- * If the routing table is setup to allow more than one level
- * of recursion, the cleaning up cache table will not work resulting
- * in invalid routing.
- *
* Supports link-local addresses by following the ipif/ill when recursing.
- *
- * NOTE : When this function returns NULL, pire has already been released.
- * pire is valid only when this function successfully returns an
- * ire.
*/
ire_t *
ire_ftable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask,
- const in6_addr_t *gateway, int type, const ipif_t *ipif, ire_t **pire,
- zoneid_t zoneid, uint32_t ihandle, const ts_label_t *tsl, int flags,
- ip_stack_t *ipst)
+ const in6_addr_t *gateway, int type, const ill_t *ill,
+ zoneid_t zoneid, const ts_label_t *tsl, int flags,
+ uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp)
{
- irb_t *irb_ptr;
- ire_t *rire;
ire_t *ire = NULL;
- ire_t *saved_ire;
- nce_t *nce;
- int i;
- in6_addr_t gw_addr_v6;
ASSERT(addr != NULL);
ASSERT((!(flags & MATCH_IRE_MASK)) || mask != NULL);
ASSERT((!(flags & MATCH_IRE_GW)) || gateway != NULL);
- ASSERT(ipif == NULL || ipif->ipif_isv6);
+ ASSERT(ill == NULL || ill->ill_isv6);
+
+ ASSERT(!IN6_IS_ADDR_V4MAPPED(addr));
/*
- * When we return NULL from this function, we should make
- * sure that *pire is NULL so that the callers will not
- * wrongly REFRELE the pire.
+ * ire_match_args_v6() will dereference ill if MATCH_IRE_ILL
+ * is set.
*/
- if (pire != NULL)
- *pire = NULL;
+ if ((flags & (MATCH_IRE_ILL)) && (ill == NULL))
+ return (NULL);
+
+ rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER);
+ ire = ire_ftable_lookup_impl_v6(addr, mask, gateway, type, ill, zoneid,
+ tsl, flags, ipst);
+ if (ire == NULL) {
+ rw_exit(&ipst->ips_ip6_ire_head_lock);
+ return (NULL);
+ }
+
/*
- * ire_match_args_v6() will dereference ipif MATCH_IRE_SRC or
- * MATCH_IRE_ILL is set.
+ * round-robin only if we have more than one route in the bucket.
+ * ips_ip_ecmp_behavior controls when we do ECMP
+ * 2: always
+ * 1: for IRE_DEFAULT and /0 IRE_INTERFACE
+ * 0: never
+ *
+ * Note: if we found an IRE_IF_CLONE we won't look at the bucket with
+ * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match
+ * and the IRE_INTERFACESs are likely to be shorter matches.
*/
- if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && (ipif == NULL))
- return (NULL);
+ if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) {
+ if (ipst->ips_ip_ecmp_behavior == 2 ||
+ (ipst->ips_ip_ecmp_behavior == 1 &&
+ IS_DEFAULT_ROUTE_V6(ire))) {
+ ire_t *next_ire;
+ ire_ftable_args_t margs;
+
+ (void) memset(&margs, 0, sizeof (margs));
+ margs.ift_addr_v6 = *addr;
+ if (mask != NULL)
+ margs.ift_mask_v6 = *mask;
+ if (gateway != NULL)
+ margs.ift_gateway_v6 = *gateway;
+ margs.ift_type = type;
+ margs.ift_ill = ill;
+ margs.ift_zoneid = zoneid;
+ margs.ift_tsl = tsl;
+ margs.ift_flags = flags;
+
+ next_ire = ire_round_robin(ire->ire_bucket, &margs,
+ xmit_hint, ire, ipst);
+ if (next_ire == NULL) {
+ /* keep ire if next_ire is null */
+ goto done;
+ }
+ ire_refrele(ire);
+ ire = next_ire;
+ }
+ }
+
+done:
+ /* Return generation before dropping lock */
+ if (generationp != NULL)
+ *generationp = ire->ire_generation;
+
+ rw_exit(&ipst->ips_ip6_ire_head_lock);
+
+ /*
+ * For shared-IP zones we need additional checks to what was
+ * done in ire_match_args to make sure IRE_LOCALs are handled.
+ *
+ * When ip_restrict_interzone_loopback is set, then
+ * we ensure that IRE_LOCAL are only used for loopback
+ * between zones when the logical "Ethernet" would
+ * have looped them back. That is, if in the absense of
+ * the IRE_LOCAL we would have sent to packet out the
+ * same ill.
+ */
+ if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES &&
+ ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES &&
+ ipst->ips_ip_restrict_interzone_loopback) {
+ ire = ire_alt_local(ire, zoneid, tsl, ill, generationp);
+ ASSERT(ire != NULL);
+ }
+
+ return (ire);
+}
+
+/*
+ * Look up a single ire. The caller holds either the read or write lock.
+ */
+ire_t *
+ire_ftable_lookup_impl_v6(const in6_addr_t *addr, const in6_addr_t *mask,
+ const in6_addr_t *gateway, int type, const ill_t *ill,
+ zoneid_t zoneid, const ts_label_t *tsl, int flags,
+ ip_stack_t *ipst)
+{
+ irb_t *irb_ptr;
+ ire_t *ire = NULL;
+ int i;
+
+ ASSERT(RW_LOCK_HELD(&ipst->ips_ip6_ire_head_lock));
/*
* If the mask is known, the lookup
@@ -1496,28 +1021,41 @@ ire_ftable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask,
uint_t masklen;
masklen = ip_mask_to_plen_v6(mask);
- if (ipst->ips_ip_forwarding_table_v6[masklen] == NULL)
+ if (ipst->ips_ip_forwarding_table_v6[masklen] == NULL) {
return (NULL);
+ }
irb_ptr = &(ipst->ips_ip_forwarding_table_v6[masklen][
IRE_ADDR_MASK_HASH_V6(*addr, *mask,
ipst->ips_ip6_ftable_hash_size)]);
rw_enter(&irb_ptr->irb_lock, RW_READER);
for (ire = irb_ptr->irb_ire; ire != NULL;
ire = ire->ire_next) {
- if (ire->ire_marks & IRE_MARK_CONDEMNED)
+ if (IRE_IS_CONDEMNED(ire))
continue;
if (ire_match_args_v6(ire, addr, mask, gateway, type,
- ipif, zoneid, ihandle, tsl, flags))
+ ill, zoneid, tsl, flags))
goto found_ire;
}
rw_exit(&irb_ptr->irb_lock);
} else {
+ uint_t masklen;
+
/*
* In this case we don't know the mask, we need to
* search the table assuming different mask sizes.
- * we start with 128 bit mask, we don't allow default here.
*/
- for (i = (IP6_MASK_TABLE_SIZE - 1); i > 0; i--) {
+ if (flags & MATCH_IRE_SHORTERMASK) {
+ masklen = ip_mask_to_plen_v6(mask);
+ if (masklen == 0) {
+ /* Nothing shorter than zero */
+ return (NULL);
+ }
+ masklen--;
+ } else {
+ masklen = IP6_MASK_TABLE_SIZE - 1;
+ }
+
+ for (i = masklen; i >= 0; i--) {
in6_addr_t tmpmask;
if ((ipst->ips_ip_forwarding_table_v6[i]) == NULL)
@@ -1529,1334 +1067,415 @@ ire_ftable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask,
rw_enter(&irb_ptr->irb_lock, RW_READER);
for (ire = irb_ptr->irb_ire; ire != NULL;
ire = ire->ire_next) {
- if (ire->ire_marks & IRE_MARK_CONDEMNED)
+ if (IRE_IS_CONDEMNED(ire))
continue;
if (ire_match_args_v6(ire, addr,
- &ire->ire_mask_v6, gateway, type, ipif,
- zoneid, ihandle, tsl, flags))
+ &ire->ire_mask_v6, gateway, type, ill,
+ zoneid, tsl, flags))
goto found_ire;
}
rw_exit(&irb_ptr->irb_lock);
}
}
-
- /*
- * We come here if no route has yet been found.
- *
- * Handle the case where default route is
- * requested by specifying type as one of the possible
- * types for that can have a zero mask (IRE_DEFAULT and IRE_INTERFACE).
- *
- * If MATCH_IRE_MASK is specified, then the appropriate default route
- * would have been found above if it exists so it isn't looked up here.
- * If MATCH_IRE_DEFAULT was also specified, then a default route will be
- * searched for later.
- */
- if ((flags & (MATCH_IRE_TYPE | MATCH_IRE_MASK)) == MATCH_IRE_TYPE &&
- (type & (IRE_DEFAULT | IRE_INTERFACE))) {
- if (ipst->ips_ip_forwarding_table_v6[0] != NULL) {
- /* addr & mask is zero for defaults */
- irb_ptr = &ipst->ips_ip_forwarding_table_v6[0][
- IRE_ADDR_HASH_V6(ipv6_all_zeros,
- ipst->ips_ip6_ftable_hash_size)];
- rw_enter(&irb_ptr->irb_lock, RW_READER);
- for (ire = irb_ptr->irb_ire; ire != NULL;
- ire = ire->ire_next) {
-
- if (ire->ire_marks & IRE_MARK_CONDEMNED)
- continue;
-
- if (ire_match_args_v6(ire, addr,
- &ipv6_all_zeros, gateway, type, ipif,
- zoneid, ihandle, tsl, flags))
- goto found_ire;
- }
- rw_exit(&irb_ptr->irb_lock);
- }
- }
- /*
- * We come here only if no route is found.
- * see if the default route can be used which is allowed
- * only if the default matching criteria is specified.
- * The ipv6_ire_default_count tracks the number of IRE_DEFAULT
- * entries. However, the ip_forwarding_table_v6[0] also contains
- * interface routes thus the count can be zero.
- */
- saved_ire = NULL;
- if ((flags & (MATCH_IRE_DEFAULT | MATCH_IRE_MASK)) ==
- MATCH_IRE_DEFAULT) {
- ire_t *ire_origin;
- uint_t g_index;
- uint_t index;
-
- if (ipst->ips_ip_forwarding_table_v6[0] == NULL)
- return (NULL);
- irb_ptr = &(ipst->ips_ip_forwarding_table_v6[0])[0];
-
- /*
- * Keep a tab on the bucket while looking the IRE_DEFAULT
- * entries. We need to keep track of a particular IRE
- * (ire_origin) so this ensures that it will not be unlinked
- * from the hash list during the recursive lookup below.
- */
- IRB_REFHOLD(irb_ptr);
- ire = irb_ptr->irb_ire;
- if (ire == NULL) {
- IRB_REFRELE(irb_ptr);
- return (NULL);
- }
-
- /*
- * Get the index first, since it can be changed by other
- * threads. Then get to the right default route skipping
- * default interface routes if any. As we hold a reference on
- * the IRE bucket, ipv6_ire_default_count can only increase so
- * we can't reach the end of the hash list unexpectedly.
- */
- if (ipst->ips_ipv6_ire_default_count != 0) {
- g_index = ipst->ips_ipv6_ire_default_index++;
- index = g_index % ipst->ips_ipv6_ire_default_count;
- while (index != 0) {
- if (!(ire->ire_type & IRE_INTERFACE))
- index--;
- ire = ire->ire_next;
- }
- ASSERT(ire != NULL);
- } else {
- /*
- * No default route, so we only have default interface
- * routes: don't enter the first loop.
- */
- ire = NULL;
- }
-
- /*
- * Round-robin the default routers list looking for a neighbor
- * that matches the passed in parameters and is reachable. If
- * none found, just return a route from the default router list
- * if it exists. If we can't find a default route (IRE_DEFAULT),
- * look for interface default routes.
- * We start with the ire we found above and we walk the hash
- * list until we're back where we started, see
- * ire_get_next_default_ire(). It doesn't matter if default
- * routes are added or deleted by other threads - we know this
- * ire will stay in the list because we hold a reference on the
- * ire bucket.
- * NB: if we only have interface default routes, ire is NULL so
- * we don't even enter this loop (see above).
- */
- ire_origin = ire;
- for (; ire != NULL;
- ire = ire_get_next_default_ire(ire, ire_origin)) {
-
- if (ire_match_args_v6(ire, addr,
- &ipv6_all_zeros, gateway, type, ipif,
- zoneid, ihandle, tsl, flags)) {
- int match_flags;
-
- /*
- * We have something to work with.
- * If we can find a resolved/reachable
- * entry, we will use this. Otherwise
- * we'll try to find an entry that has
- * a resolved cache entry. We will fallback
- * on this if we don't find anything else.
- */
- if (saved_ire == NULL)
- saved_ire = ire;
- mutex_enter(&ire->ire_lock);
- gw_addr_v6 = ire->ire_gateway_addr_v6;
- mutex_exit(&ire->ire_lock);
- match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR;
- rire = ire_ctable_lookup_v6(&gw_addr_v6, NULL,
- 0, ire->ire_ipif, zoneid, tsl, match_flags,
- ipst);
- if (rire != NULL) {
- nce = rire->ire_nce;
- if (nce != NULL &&
- NCE_ISREACHABLE(nce) &&
- nce->nce_flags & NCE_F_ISROUTER) {
- ire_refrele(rire);
- IRE_REFHOLD(ire);
- IRB_REFRELE(irb_ptr);
- goto found_ire_held;
- } else if (nce != NULL &&
- !(nce->nce_flags &
- NCE_F_ISROUTER)) {
- /*
- * Make sure we don't use
- * this ire
- */
- if (saved_ire == ire)
- saved_ire = NULL;
- }
- ire_refrele(rire);
- } else if (ipst->
- ips_ipv6_ire_default_count > 1 &&
- zoneid != GLOBAL_ZONEID) {
- /*
- * When we're in a local zone, we're
- * only interested in default routers
- * that are reachable through ipifs
- * within our zone.
- * The potentially expensive call to
- * ire_route_lookup_v6() is avoided when
- * we have only one default route.
- */
- int ire_match_flags = MATCH_IRE_TYPE |
- MATCH_IRE_SECATTR;
-
- if (ire->ire_ipif != NULL) {
- ire_match_flags |=
- MATCH_IRE_ILL;
- }
- rire = ire_route_lookup_v6(&gw_addr_v6,
- NULL, NULL, IRE_INTERFACE,
- ire->ire_ipif, NULL,
- zoneid, tsl, ire_match_flags, ipst);
- if (rire != NULL) {
- ire_refrele(rire);
- saved_ire = ire;
- } else if (saved_ire == ire) {
- /*
- * Make sure we don't use
- * this ire
- */
- saved_ire = NULL;
- }
- }
- }
- }
- if (saved_ire != NULL) {
- ire = saved_ire;
- IRE_REFHOLD(ire);
- IRB_REFRELE(irb_ptr);
- goto found_ire_held;
- } else {
- /*
- * Look for a interface default route matching the
- * args passed in. No round robin here. Just pick
- * the right one.
- */
- for (ire = irb_ptr->irb_ire; ire != NULL;
- ire = ire->ire_next) {
-
- if (!(ire->ire_type & IRE_INTERFACE))
- continue;
-
- if (ire->ire_marks & IRE_MARK_CONDEMNED)
- continue;
-
- if (ire_match_args_v6(ire, addr,
- &ipv6_all_zeros, gateway, type, ipif,
- zoneid, ihandle, tsl, flags)) {
- IRE_REFHOLD(ire);
- IRB_REFRELE(irb_ptr);
- goto found_ire_held;
- }
- }
- IRB_REFRELE(irb_ptr);
- }
- }
ASSERT(ire == NULL);
ip1dbg(("ire_ftable_lookup_v6: returning NULL ire"));
return (NULL);
+
found_ire:
- ASSERT((ire->ire_marks & IRE_MARK_CONDEMNED) == 0);
- IRE_REFHOLD(ire);
+ ire_refhold(ire);
rw_exit(&irb_ptr->irb_lock);
-
-found_ire_held:
- if ((flags & MATCH_IRE_RJ_BHOLE) &&
- (ire->ire_flags & (RTF_BLACKHOLE | RTF_REJECT))) {
- return (ire);
- }
- /*
- * At this point, IRE that was found must be an IRE_FORWARDTABLE
- * or IRE_CACHETABLE type. If this is a recursive lookup and an
- * IRE_INTERFACE type was found, return that. If it was some other
- * IRE_FORWARDTABLE type of IRE (one of the prefix types), then it
- * is necessary to fill in the parent IRE pointed to by pire, and
- * then lookup the gateway address of the parent. For backwards
- * compatiblity, if this lookup returns an
- * IRE other than a IRE_CACHETABLE or IRE_INTERFACE, then one more level
- * of lookup is done.
- */
- if (flags & MATCH_IRE_RECURSIVE) {
- const ipif_t *gw_ipif;
- int match_flags = MATCH_IRE_DSTONLY;
-
- if (ire->ire_type & IRE_INTERFACE)
- return (ire);
- if (pire != NULL)
- *pire = ire;
- /*
- * If we can't find an IRE_INTERFACE or the caller has not
- * asked for pire, we need to REFRELE the saved_ire.
- */
- saved_ire = ire;
-
- if (ire->ire_ipif != NULL)
- match_flags |= MATCH_IRE_ILL;
-
- mutex_enter(&ire->ire_lock);
- gw_addr_v6 = ire->ire_gateway_addr_v6;
- mutex_exit(&ire->ire_lock);
-
- ire = ire_route_lookup_v6(&gw_addr_v6, NULL, NULL, 0,
- ire->ire_ipif, NULL, zoneid, tsl, match_flags, ipst);
- if (ire == NULL) {
- /*
- * In this case we have to deal with the
- * MATCH_IRE_PARENT flag, which means the
- * parent has to be returned if ire is NULL.
- * The aim of this is to have (at least) a starting
- * ire when we want to look at all of the ires in a
- * bucket aimed at a single destination (as is the
- * case in ip_newroute_v6 for the RTF_MULTIRT
- * flagged routes).
- */
- if (flags & MATCH_IRE_PARENT) {
- if (pire != NULL) {
- /*
- * Need an extra REFHOLD, if the
- * parent ire is returned via both
- * ire and pire.
- */
- IRE_REFHOLD(saved_ire);
- }
- ire = saved_ire;
- } else {
- ire_refrele(saved_ire);
- if (pire != NULL)
- *pire = NULL;
- }
- return (ire);
- }
- if (ire->ire_type & (IRE_CACHETABLE | IRE_INTERFACE)) {
- /*
- * If the caller did not ask for pire, release
- * it now.
- */
- if (pire == NULL) {
- ire_refrele(saved_ire);
- }
- return (ire);
- }
- match_flags |= MATCH_IRE_TYPE;
- mutex_enter(&ire->ire_lock);
- gw_addr_v6 = ire->ire_gateway_addr_v6;
- mutex_exit(&ire->ire_lock);
- gw_ipif = ire->ire_ipif;
- ire_refrele(ire);
- ire = ire_route_lookup_v6(&gw_addr_v6, NULL, NULL,
- (IRE_CACHETABLE | IRE_INTERFACE), gw_ipif, NULL, zoneid,
- NULL, match_flags, ipst);
- if (ire == NULL) {
- /*
- * In this case we have to deal with the
- * MATCH_IRE_PARENT flag, which means the
- * parent has to be returned if ire is NULL.
- * The aim of this is to have (at least) a starting
- * ire when we want to look at all of the ires in a
- * bucket aimed at a single destination (as is the
- * case in ip_newroute_v6 for the RTF_MULTIRT
- * flagged routes).
- */
- if (flags & MATCH_IRE_PARENT) {
- if (pire != NULL) {
- /*
- * Need an extra REFHOLD, if the
- * parent ire is returned via both
- * ire and pire.
- */
- IRE_REFHOLD(saved_ire);
- }
- ire = saved_ire;
- } else {
- ire_refrele(saved_ire);
- if (pire != NULL)
- *pire = NULL;
- }
- return (ire);
- } else if (pire == NULL) {
- /*
- * If the caller did not ask for pire, release
- * it now.
- */
- ire_refrele(saved_ire);
- }
- return (ire);
- }
-
- ASSERT(pire == NULL || *pire == NULL);
return (ire);
}
-/*
- * Delete the IRE cache for the gateway and all IRE caches whose
- * ire_gateway_addr_v6 points to this gateway, and allow them to
- * be created on demand by ip_newroute_v6.
- */
-void
-ire_clookup_delete_cache_gw_v6(const in6_addr_t *addr, zoneid_t zoneid,
- ip_stack_t *ipst)
-{
- irb_t *irb;
- ire_t *ire;
-
- irb = &ipst->ips_ip_cache_table_v6[IRE_ADDR_HASH_V6(*addr,
- ipst->ips_ip6_cache_table_size)];
- IRB_REFHOLD(irb);
- for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
- if (ire->ire_marks & IRE_MARK_CONDEMNED)
- continue;
-
- ASSERT(IN6_ARE_ADDR_EQUAL(&ire->ire_mask_v6, &ipv6_all_ones));
- if (ire_match_args_v6(ire, addr, &ire->ire_mask_v6, 0,
- IRE_CACHE, NULL, zoneid, 0, NULL, MATCH_IRE_TYPE)) {
- ire_delete(ire);
- }
- }
- IRB_REFRELE(irb);
-
- ire_walk_v6(ire_delete_cache_gw_v6, (char *)addr, zoneid, ipst);
-}
-
-/*
- * Looks up cache table for a route.
- * specific lookup can be indicated by
- * passing the MATCH_* flags and the
- * necessary parameters.
- */
-ire_t *
-ire_ctable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *gateway,
- int type, const ipif_t *ipif, zoneid_t zoneid, const ts_label_t *tsl,
- int flags, ip_stack_t *ipst)
-{
- ire_ctable_args_t margs;
-
- margs.ict_addr = (void *)addr;
- margs.ict_gateway = (void *)gateway;
- margs.ict_type = type;
- margs.ict_ipif = ipif;
- margs.ict_zoneid = zoneid;
- margs.ict_tsl = tsl;
- margs.ict_flags = flags;
- margs.ict_ipst = ipst;
- margs.ict_wq = NULL;
-
- return (ip6_ctable_lookup_impl(&margs));
-}
/*
- * Lookup cache.
+ * This function is called by
+ * ip_input/ire_route_recursive when doing a route lookup on only the
+ * destination address.
*
- * In general the zoneid has to match (where ALL_ZONES match all of them).
- * But for IRE_LOCAL we also need to handle the case where L2 should
- * conceptually loop back the packet. This is necessary since neither
- * Ethernet drivers nor Ethernet hardware loops back packets sent to their
- * own MAC address. This loopback is needed when the normal
- * routes (ignoring IREs with different zoneids) would send out the packet on
- * the same ill as the ill with which this IRE_LOCAL is associated.
+ * The optimizations of this function over ire_ftable_lookup are:
+ * o removing unnecessary flag matching
+ * o doing longest prefix match instead of overloading it further
+ * with the unnecessary "best_prefix_match"
*
- * Earlier versions of this code always matched an IRE_LOCAL independently of
- * the zoneid. We preserve that earlier behavior when
- * ip_restrict_interzone_loopback is turned off.
+ * If no route is found we return IRE_NOROUTE.
*/
ire_t *
-ire_cache_lookup_v6(const in6_addr_t *addr, zoneid_t zoneid,
- const ts_label_t *tsl, ip_stack_t *ipst)
-{
- irb_t *irb_ptr;
- ire_t *ire;
-
- irb_ptr = &ipst->ips_ip_cache_table_v6[IRE_ADDR_HASH_V6(*addr,
- ipst->ips_ip6_cache_table_size)];
- rw_enter(&irb_ptr->irb_lock, RW_READER);
- for (ire = irb_ptr->irb_ire; ire; ire = ire->ire_next) {
- if (ire->ire_marks & (IRE_MARK_CONDEMNED|IRE_MARK_TESTHIDDEN))
- continue;
- if (IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, addr)) {
- /*
- * Finally, check if the security policy has any
- * restriction on using this route for the specified
- * message.
- */
- if (tsl != NULL &&
- ire->ire_gw_secattr != NULL &&
- tsol_ire_match_gwattr(ire, tsl) != 0) {
- continue;
- }
-
- if (zoneid == ALL_ZONES || ire->ire_zoneid == zoneid ||
- ire->ire_zoneid == ALL_ZONES) {
- IRE_REFHOLD(ire);
- rw_exit(&irb_ptr->irb_lock);
- return (ire);
- }
-
- if (ire->ire_type == IRE_LOCAL) {
- if (ipst->ips_ip_restrict_interzone_loopback &&
- !ire_local_ok_across_zones(ire, zoneid,
- (void *)addr, tsl, ipst))
- continue;
-
- IRE_REFHOLD(ire);
- rw_exit(&irb_ptr->irb_lock);
- return (ire);
- }
- }
- }
- rw_exit(&irb_ptr->irb_lock);
- return (NULL);
-}
-
-/*
- * Locate the interface ire that is tied to the cache ire 'cire' via
- * cire->ire_ihandle.
- *
- * We are trying to create the cache ire for an onlink destn. or
- * gateway in 'cire'. We are called from ire_add_v6() in the IRE_IF_RESOLVER
- * case for xresolv interfaces, after the ire has come back from
- * an external resolver.
- */
-static ire_t *
-ire_ihandle_lookup_onlink_v6(ire_t *cire)
+ire_ftable_lookup_simple_v6(const in6_addr_t *addr, uint32_t xmit_hint,
+ ip_stack_t *ipst, uint_t *generationp)
{
ire_t *ire;
- int match_flags;
- int i;
- int j;
- irb_t *irb_ptr;
- ip_stack_t *ipst = cire->ire_ipst;
-
- ASSERT(cire != NULL);
- match_flags = MATCH_IRE_TYPE | MATCH_IRE_IHANDLE | MATCH_IRE_MASK;
- /*
- * We know that the mask of the interface ire equals cire->ire_cmask.
- * (When ip_newroute_v6() created 'cire' for an on-link destn.
- * it set its cmask from the interface ire's mask)
- */
- ire = ire_ftable_lookup_v6(&cire->ire_addr_v6, &cire->ire_cmask_v6,
- NULL, IRE_INTERFACE, NULL, NULL, ALL_ZONES, cire->ire_ihandle,
- NULL, match_flags, ipst);
- if (ire != NULL)
- return (ire);
- /*
- * If we didn't find an interface ire above, we can't declare failure.
- * For backwards compatibility, we need to support prefix routes
- * pointing to next hop gateways that are not on-link.
- *
- * In the resolver/noresolver case, ip_newroute_v6() thinks
- * it is creating the cache ire for an onlink destination in 'cire'.
- * But 'cire' is not actually onlink, because ire_ftable_lookup_v6()
- * cheated it, by doing ire_route_lookup_v6() twice and returning an
- * interface ire.
- *
- * Eg. default - gw1 (line 1)
- * gw1 - gw2 (line 2)
- * gw2 - hme0 (line 3)
- *
- * In the above example, ip_newroute_v6() tried to create the cache ire
- * 'cire' for gw1, based on the interface route in line 3. The
- * ire_ftable_lookup_v6() above fails, because there is
- * no interface route to reach gw1. (it is gw2). We fall thru below.
- *
- * Do a brute force search based on the ihandle in a subset of the
- * forwarding tables, corresponding to cire->ire_cmask_v6. Otherwise
- * things become very complex, since we don't have 'pire' in this
- * case. (Also note that this method is not possible in the offlink
- * case because we don't know the mask)
- */
- i = ip_mask_to_plen_v6(&cire->ire_cmask_v6);
- if ((ipst->ips_ip_forwarding_table_v6[i]) == NULL)
- return (NULL);
- for (j = 0; j < ipst->ips_ip6_ftable_hash_size; j++) {
- irb_ptr = &ipst->ips_ip_forwarding_table_v6[i][j];
- rw_enter(&irb_ptr->irb_lock, RW_READER);
- for (ire = irb_ptr->irb_ire; ire != NULL;
- ire = ire->ire_next) {
- if (ire->ire_marks & IRE_MARK_CONDEMNED)
- continue;
- if ((ire->ire_type & IRE_INTERFACE) &&
- (ire->ire_ihandle == cire->ire_ihandle)) {
- IRE_REFHOLD(ire);
- rw_exit(&irb_ptr->irb_lock);
- return (ire);
- }
- }
- rw_exit(&irb_ptr->irb_lock);
+ ire = ire_ftable_lookup_v6(addr, NULL, NULL, 0, NULL, ALL_ZONES, NULL,
+ MATCH_IRE_DSTONLY, xmit_hint, ipst, generationp);
+ if (ire == NULL) {
+ ire = ire_reject(ipst, B_TRUE);
+ if (generationp != NULL)
+ *generationp = IRE_GENERATION_VERIFY;
}
- return (NULL);
+ /* ftable_lookup did round robin */
+ return (ire);
}
-
-/*
- * Locate the interface ire that is tied to the cache ire 'cire' via
- * cire->ire_ihandle.
- *
- * We are trying to create the cache ire for an offlink destn based
- * on the cache ire of the gateway in 'cire'. 'pire' is the prefix ire
- * as found by ip_newroute_v6(). We are called from ip_newroute_v6() in
- * the IRE_CACHE case.
- */
ire_t *
-ire_ihandle_lookup_offlink_v6(ire_t *cire, ire_t *pire)
+ip_select_route_v6(const in6_addr_t *dst, ip_xmit_attr_t *ixa,
+ uint_t *generationp, in6_addr_t *setsrcp, int *errorp, boolean_t *multirtp)
{
- ire_t *ire;
- int match_flags;
- in6_addr_t gw_addr;
- ipif_t *gw_ipif;
- ip_stack_t *ipst = cire->ire_ipst;
-
- ASSERT(cire != NULL && pire != NULL);
+ ASSERT(!(ixa->ixa_flags & IXAF_IS_IPV4));
- match_flags = MATCH_IRE_TYPE | MATCH_IRE_IHANDLE | MATCH_IRE_MASK;
- if (pire->ire_ipif != NULL)
- match_flags |= MATCH_IRE_ILL;
- /*
- * We know that the mask of the interface ire equals cire->ire_cmask.
- * (When ip_newroute_v6() created 'cire' for an on-link destn. it set
- * its cmask from the interface ire's mask)
- */
- ire = ire_ftable_lookup_v6(&cire->ire_addr_v6, &cire->ire_cmask_v6, 0,
- IRE_INTERFACE, pire->ire_ipif, NULL, ALL_ZONES, cire->ire_ihandle,
- NULL, match_flags, ipst);
- if (ire != NULL)
- return (ire);
- /*
- * If we didn't find an interface ire above, we can't declare failure.
- * For backwards compatibility, we need to support prefix routes
- * pointing to next hop gateways that are not on-link.
- *
- * Assume we are trying to ping some offlink destn, and we have the
- * routing table below.
- *
- * Eg. default - gw1 <--- pire (line 1)
- * gw1 - gw2 (line 2)
- * gw2 - hme0 (line 3)
- *
- * If we already have a cache ire for gw1 in 'cire', the
- * ire_ftable_lookup_v6 above would have failed, since there is no
- * interface ire to reach gw1. We will fallthru below.
- *
- * Here we duplicate the steps that ire_ftable_lookup_v6() did in
- * getting 'cire' from 'pire', in the MATCH_IRE_RECURSIVE case.
- * The differences are the following
- * i. We want the interface ire only, so we call
- * ire_ftable_lookup_v6() instead of ire_route_lookup_v6()
- * ii. We look for only prefix routes in the 1st call below.
- * ii. We want to match on the ihandle in the 2nd call below.
- */
- match_flags = MATCH_IRE_TYPE;
- if (pire->ire_ipif != NULL)
- match_flags |= MATCH_IRE_ILL;
-
- mutex_enter(&pire->ire_lock);
- gw_addr = pire->ire_gateway_addr_v6;
- mutex_exit(&pire->ire_lock);
- ire = ire_ftable_lookup_v6(&gw_addr, 0, 0, IRE_OFFSUBNET,
- pire->ire_ipif, NULL, ALL_ZONES, 0, NULL, match_flags, ipst);
- if (ire == NULL)
- return (NULL);
- /*
- * At this point 'ire' corresponds to the entry shown in line 2.
- * gw_addr is 'gw2' in the example above.
- */
- mutex_enter(&ire->ire_lock);
- gw_addr = ire->ire_gateway_addr_v6;
- mutex_exit(&ire->ire_lock);
- gw_ipif = ire->ire_ipif;
- ire_refrele(ire);
-
- match_flags |= MATCH_IRE_IHANDLE;
- ire = ire_ftable_lookup_v6(&gw_addr, 0, 0, IRE_INTERFACE,
- gw_ipif, NULL, ALL_ZONES, cire->ire_ihandle,
- NULL, match_flags, ipst);
- return (ire);
+ return (ip_select_route(dst, ixa, generationp, setsrcp, errorp,
+ multirtp));
}
/*
- * Return the IRE_LOOPBACK, IRE_IF_RESOLVER or IRE_IF_NORESOLVER
- * ire associated with the specified ipif.
+ * Recursively look for a route to the destination. Can also match on
+ * the zoneid, ill, and label. Used for the data paths. See also
+ * ire_route_recursive_dstonly.
*
- * This might occasionally be called when IPIF_UP is not set since
- * the IPV6_MULTICAST_IF as well as creating interface routes
- * allows specifying a down ipif (ipif_lookup* match ipifs that are down).
+ * If ill is set this means we will match it by adding MATCH_IRE_ILL.
*
- * Note that if IPIF_NOLOCAL, IPIF_NOXMIT, or IPIF_DEPRECATED is set on
- * the ipif this routine might return NULL.
- * (Sometimes called as writer though not required by this function.)
+ * If allocate is not set then we will only inspect the existing IREs; never
+ * create an IRE_IF_CLONE. This is used on the receive side when we are not
+ * forwarding.
+ *
+ * Note that this function never returns NULL. It returns an IRE_NOROUTE
+ * instead.
+ *
+ * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
+ * is an error.
+ * Allow at most one RTF_INDIRECT.
*/
ire_t *
-ipif_to_ire_v6(const ipif_t *ipif)
+ire_route_recursive_impl_v6(ire_t *ire,
+ const in6_addr_t *nexthop, uint_t ire_type, const ill_t *ill_arg,
+ zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
+ boolean_t allocate, uint32_t xmit_hint, ip_stack_t *ipst,
+ in6_addr_t *setsrcp, tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
{
- ire_t *ire;
- ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
- uint_t match_flags = MATCH_IRE_TYPE | MATCH_IRE_IPIF;
+ int i, j;
+ in6_addr_t v6nexthop = *nexthop;
+ ire_t *ires[MAX_IRE_RECURSION];
+ uint_t generation;
+ uint_t generations[MAX_IRE_RECURSION];
+ boolean_t need_refrele = B_FALSE;
+ boolean_t invalidate = B_FALSE;
+ int prefs[MAX_IRE_RECURSION];
+ ill_t *ill = NULL;
+
+ if (setsrcp != NULL)
+ ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp));
+ if (gwattrp != NULL)
+ ASSERT(*gwattrp == NULL);
+
+ if (ill_arg != NULL)
+ match_args |= MATCH_IRE_ILL;
/*
- * IRE_INTERFACE entries for ills under IPMP are IRE_MARK_TESTHIDDEN
- * so that they aren't accidentally returned. However, if the
- * caller's ipif is on an ill under IPMP, there's no need to hide 'em.
+ * We iterate up to three times to resolve a route, even though
+ * we have four slots in the array. The extra slot is for an
+ * IRE_IF_CLONE we might need to create.
*/
- if (IS_UNDER_IPMP(ipif->ipif_ill))
- match_flags |= MATCH_IRE_MARK_TESTHIDDEN;
-
- ASSERT(ipif->ipif_isv6);
- if (ipif->ipif_ire_type == IRE_LOOPBACK) {
- ire = ire_ctable_lookup_v6(&ipif->ipif_v6lcl_addr, NULL,
- IRE_LOOPBACK, ipif, ALL_ZONES, NULL, match_flags, ipst);
- } else if (ipif->ipif_flags & IPIF_POINTOPOINT) {
- /* In this case we need to lookup destination address. */
- ire = ire_ftable_lookup_v6(&ipif->ipif_v6pp_dst_addr,
- &ipv6_all_ones, NULL, IRE_INTERFACE, ipif, NULL, ALL_ZONES,
- 0, NULL, (match_flags | MATCH_IRE_MASK), ipst);
- } else {
- ire = ire_ftable_lookup_v6(&ipif->ipif_v6subnet,
- &ipif->ipif_v6net_mask, NULL, IRE_INTERFACE, ipif, NULL,
- ALL_ZONES, 0, NULL, (match_flags | MATCH_IRE_MASK), ipst);
- }
- return (ire);
-}
-
-/*
- * Return B_TRUE if a multirt route is resolvable
- * (or if no route is resolved yet), B_FALSE otherwise.
- * This only works in the global zone.
- */
-boolean_t
-ire_multirt_need_resolve_v6(const in6_addr_t *v6dstp, const ts_label_t *tsl,
- ip_stack_t *ipst)
-{
- ire_t *first_fire;
- ire_t *first_cire;
- ire_t *fire;
- ire_t *cire;
- irb_t *firb;
- irb_t *cirb;
- int unres_cnt = 0;
- boolean_t resolvable = B_FALSE;
-
- /* Retrieve the first IRE_HOST that matches the destination */
- first_fire = ire_ftable_lookup_v6(v6dstp, &ipv6_all_ones, 0, IRE_HOST,
- NULL, NULL, ALL_ZONES, 0, tsl, MATCH_IRE_MASK | MATCH_IRE_TYPE |
- MATCH_IRE_SECATTR, ipst);
-
- /* No route at all */
- if (first_fire == NULL) {
- return (B_TRUE);
- }
-
- firb = first_fire->ire_bucket;
- ASSERT(firb);
-
- /* Retrieve the first IRE_CACHE ire for that destination. */
- first_cire = ire_cache_lookup_v6(v6dstp, GLOBAL_ZONEID, tsl, ipst);
-
- /* No resolved route. */
- if (first_cire == NULL) {
- ire_refrele(first_fire);
- return (B_TRUE);
- }
-
- /* At least one route is resolved. */
-
- cirb = first_cire->ire_bucket;
- ASSERT(cirb);
-
- /* Count the number of routes to that dest that are declared. */
- IRB_REFHOLD(firb);
- for (fire = first_fire; fire != NULL; fire = fire->ire_next) {
- if (!(fire->ire_flags & RTF_MULTIRT))
- continue;
- if (!IN6_ARE_ADDR_EQUAL(&fire->ire_addr_v6, v6dstp))
- continue;
- unres_cnt++;
- }
- IRB_REFRELE(firb);
-
-
- /* Then subtract the number of routes to that dst that are resolved */
- IRB_REFHOLD(cirb);
- for (cire = first_cire; cire != NULL; cire = cire->ire_next) {
- if (!(cire->ire_flags & RTF_MULTIRT))
- continue;
- if (!IN6_ARE_ADDR_EQUAL(&cire->ire_addr_v6, v6dstp))
- continue;
- if (cire->ire_marks & (IRE_MARK_CONDEMNED|IRE_MARK_TESTHIDDEN))
- continue;
- unres_cnt--;
- }
- IRB_REFRELE(cirb);
-
- /* At least one route is unresolved; search for a resolvable route. */
- if (unres_cnt > 0)
- resolvable = ire_multirt_lookup_v6(&first_cire, &first_fire,
- MULTIRT_USESTAMP|MULTIRT_CACHEGW, tsl, ipst);
-
- if (first_fire)
- ire_refrele(first_fire);
-
- if (first_cire)
- ire_refrele(first_cire);
-
- return (resolvable);
-}
-
-
-/*
- * Return B_TRUE and update *ire_arg and *fire_arg
- * if at least one resolvable route is found.
- * Return B_FALSE otherwise (all routes are resolved or
- * the remaining unresolved routes are all unresolvable).
- * This only works in the global zone.
- */
-boolean_t
-ire_multirt_lookup_v6(ire_t **ire_arg, ire_t **fire_arg, uint32_t flags,
- const ts_label_t *tsl, ip_stack_t *ipst)
-{
- clock_t delta;
- ire_t *best_fire = NULL;
- ire_t *best_cire = NULL;
- ire_t *first_fire;
- ire_t *first_cire;
- ire_t *fire;
- ire_t *cire;
- irb_t *firb = NULL;
- irb_t *cirb = NULL;
- ire_t *gw_ire;
- boolean_t already_resolved;
- boolean_t res;
- in6_addr_t v6dst;
- in6_addr_t v6gw;
-
- ip2dbg(("ire_multirt_lookup_v6: *ire_arg %p, *fire_arg %p, "
- "flags %04x\n", (void *)*ire_arg, (void *)*fire_arg, flags));
-
- ASSERT(ire_arg);
- ASSERT(fire_arg);
-
- /* Not an IRE_HOST ire; give up. */
- if ((*fire_arg == NULL) ||
- ((*fire_arg)->ire_type != IRE_HOST)) {
- return (B_FALSE);
- }
+ i = 0;
+ while (i < MAX_IRE_RECURSION - 1) {
+ /* ire_ftable_lookup handles round-robin/ECMP */
+ if (ire == NULL) {
+ ire = ire_ftable_lookup_v6(&v6nexthop, 0, 0, ire_type,
+ (ill_arg != NULL ? ill_arg : ill), zoneid, tsl,
+ match_args, xmit_hint, ipst, &generation);
+ } else {
+ /* Caller passed it; extra hold since we will rele */
+ ire_refhold(ire);
+ if (generationp != NULL)
+ generation = *generationp;
+ else
+ generation = IRE_GENERATION_VERIFY;
+ }
- /* This is the first IRE_HOST ire for that destination. */
- first_fire = *fire_arg;
- firb = first_fire->ire_bucket;
- ASSERT(firb);
+ if (ire == NULL)
+ ire = ire_reject(ipst, B_TRUE);
- mutex_enter(&first_fire->ire_lock);
- v6dst = first_fire->ire_addr_v6;
- mutex_exit(&first_fire->ire_lock);
+ /* Need to return the ire with RTF_REJECT|BLACKHOLE */
+ if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
+ goto error;
- ip2dbg(("ire_multirt_lookup_v6: dst %08x\n",
- ntohl(V4_PART_OF_V6(v6dst))));
+ ASSERT(!(ire->ire_type & IRE_MULTICAST)); /* Not in ftable */
- /*
- * Retrieve the first IRE_CACHE ire for that destination;
- * if we don't find one, no route for that dest is
- * resolved yet.
- */
- first_cire = ire_cache_lookup_v6(&v6dst, GLOBAL_ZONEID, tsl, ipst);
- if (first_cire) {
- cirb = first_cire->ire_bucket;
- }
-
- ip2dbg(("ire_multirt_lookup_v6: first_cire %p\n", (void *)first_cire));
+ prefs[i] = ire_pref(ire);
+ if (i != 0) {
+ /*
+ * Don't allow anything unusual past the first
+ * iteration.
+ */
+ if ((ire->ire_type &
+ (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST)) ||
+ prefs[i] <= prefs[i-1]) {
+ ire_refrele(ire);
+ ire = ire_reject(ipst, B_TRUE);
+ goto error;
+ }
+ }
+ /* We have a usable IRE */
+ ires[i] = ire;
+ generations[i] = generation;
+ i++;
+
+ /* The first RTF_SETSRC address is passed back if setsrcp */
+ if ((ire->ire_flags & RTF_SETSRC) &&
+ setsrcp != NULL && IN6_IS_ADDR_UNSPECIFIED(setsrcp)) {
+ ASSERT(!IN6_IS_ADDR_UNSPECIFIED(
+ &ire->ire_setsrc_addr_v6));
+ *setsrcp = ire->ire_setsrc_addr_v6;
+ }
- /*
- * Search for a resolvable route, giving the top priority
- * to routes that can be resolved without any call to the resolver.
- */
- IRB_REFHOLD(firb);
+ /* The first ire_gw_secattr is passed back if gwattrp */
+ if (ire->ire_gw_secattr != NULL &&
+ gwattrp != NULL && *gwattrp == NULL)
+ *gwattrp = ire->ire_gw_secattr;
- if (!IN6_IS_ADDR_MULTICAST(&v6dst)) {
/*
- * For all multiroute IRE_HOST ires for that destination,
- * check if the route via the IRE_HOST's gateway is
- * resolved yet.
+ * Check if we have a short-cut pointer to an IRE for this
+ * destination, and that the cached dependency isn't stale.
+ * In that case we've rejoined an existing tree towards a
+ * parent, thus we don't need to continue the loop to
+ * discover the rest of the tree.
*/
- for (fire = first_fire; fire != NULL; fire = fire->ire_next) {
-
- if (!(fire->ire_flags & RTF_MULTIRT))
- continue;
- if (!IN6_ARE_ADDR_EQUAL(&fire->ire_addr_v6, &v6dst))
- continue;
-
- if (fire->ire_gw_secattr != NULL &&
- tsol_ire_match_gwattr(fire, tsl) != 0) {
- continue;
- }
-
- mutex_enter(&fire->ire_lock);
- v6gw = fire->ire_gateway_addr_v6;
- mutex_exit(&fire->ire_lock);
-
- ip2dbg(("ire_multirt_lookup_v6: fire %p, "
- "ire_addr %08x, ire_gateway_addr %08x\n",
- (void *)fire,
- ntohl(V4_PART_OF_V6(fire->ire_addr_v6)),
- ntohl(V4_PART_OF_V6(v6gw))));
+ mutex_enter(&ire->ire_lock);
+ if (ire->ire_dep_parent != NULL &&
+ ire->ire_dep_parent->ire_generation ==
+ ire->ire_dep_parent_generation) {
+ mutex_exit(&ire->ire_lock);
+ ire = NULL;
+ goto done;
+ }
+ mutex_exit(&ire->ire_lock);
- already_resolved = B_FALSE;
+ /*
+ * If this type should have an ire_nce_cache (even if it
+ * doesn't yet have one) then we are done. Includes
+ * IRE_INTERFACE with a full 128 bit mask.
+ */
+ if (ire->ire_nce_capable) {
+ ire = NULL;
+ goto done;
+ }
- if (first_cire) {
- ASSERT(cirb);
+ ASSERT(!(ire->ire_type & IRE_IF_CLONE));
+ /*
+ * For an IRE_INTERFACE we create an IRE_IF_CLONE for this
+ * particular destination
+ */
+ if (ire->ire_type & IRE_INTERFACE) {
+ ire_t *clone;
- IRB_REFHOLD(cirb);
- /*
- * For all IRE_CACHE ires for that
- * destination.
- */
- for (cire = first_cire;
- cire != NULL;
- cire = cire->ire_next) {
-
- if (!(cire->ire_flags & RTF_MULTIRT))
- continue;
- if (!IN6_ARE_ADDR_EQUAL(
- &cire->ire_addr_v6, &v6dst))
- continue;
- if (cire->ire_marks &
- (IRE_MARK_CONDEMNED|
- IRE_MARK_TESTHIDDEN))
- continue;
-
- if (cire->ire_gw_secattr != NULL &&
- tsol_ire_match_gwattr(cire,
- tsl) != 0) {
- continue;
- }
-
- /*
- * Check if the IRE_CACHE's gateway
- * matches the IRE_HOST's gateway.
- */
- if (IN6_ARE_ADDR_EQUAL(
- &cire->ire_gateway_addr_v6,
- &v6gw)) {
- already_resolved = B_TRUE;
- break;
- }
- }
- IRB_REFRELE(cirb);
- }
+ ASSERT(ire->ire_masklen != IPV6_ABITS);
/*
- * This route is already resolved;
- * proceed with next one.
+ * In the case of ip_input and ILLF_FORWARDING not
+ * being set, and in the case of RTM_GET,
+ * there is no point in allocating
+ * an IRE_IF_CLONE. We return the IRE_INTERFACE.
+ * Note that !allocate can result in a ire_dep_parent
+ * which is IRE_IF_* without an IRE_IF_CLONE.
+ * We recover from that when we need to send packets
+ * by ensuring that the generations become
+ * IRE_GENERATION_VERIFY in this case.
*/
- if (already_resolved) {
- ip2dbg(("ire_multirt_lookup_v6: found cire %p, "
- "already resolved\n", (void *)cire));
- continue;
+ if (!allocate) {
+ invalidate = B_TRUE;
+ ire = NULL;
+ goto done;
}
- /*
- * The route is unresolved; is it actually
- * resolvable, i.e. is there a cache or a resolver
- * for the gateway?
- */
- gw_ire = ire_route_lookup_v6(&v6gw, 0, 0, 0, NULL, NULL,
- ALL_ZONES, tsl, MATCH_IRE_RECURSIVE |
- MATCH_IRE_SECATTR, ipst);
-
- ip2dbg(("ire_multirt_lookup_v6: looked up gw_ire %p\n",
- (void *)gw_ire));
-
- /*
- * This route can be resolved without any call to the
- * resolver; if the MULTIRT_CACHEGW flag is set,
- * give the top priority to this ire and exit the
- * loop.
- * This occurs when an resolver reply is processed
- * through ip_wput_nondata()
- */
- if ((flags & MULTIRT_CACHEGW) &&
- (gw_ire != NULL) &&
- (gw_ire->ire_type & IRE_CACHETABLE)) {
+ clone = ire_create_if_clone(ire, &v6nexthop,
+ &generation);
+ if (clone == NULL) {
/*
- * Release the resolver associated to the
- * previous candidate best ire, if any.
+ * Temporary failure - no memory.
+ * Don't want caller to cache IRE_NOROUTE.
*/
- if (best_cire) {
- ire_refrele(best_cire);
- ASSERT(best_fire);
- }
-
- best_fire = fire;
- best_cire = gw_ire;
-
- ip2dbg(("ire_multirt_lookup_v6: found top prio "
- "best_fire %p, best_cire %p\n",
- (void *)best_fire, (void *)best_cire));
- break;
+ invalidate = B_TRUE;
+ ire = ire_blackhole(ipst, B_TRUE);
+ goto error;
}
-
/*
- * Compute the time elapsed since our preceding
- * attempt to resolve that route.
- * If the MULTIRT_USESTAMP flag is set, we take that
- * route into account only if this time interval
- * exceeds ip_multirt_resolution_interval;
- * this prevents us from attempting to resolve a
- * broken route upon each sending of a packet.
+ * Make clone next to last entry and the
+ * IRE_INTERFACE the last in the dependency
+ * chain since the clone depends on the
+ * IRE_INTERFACE.
*/
- delta = lbolt - fire->ire_last_used_time;
- delta = TICK_TO_MSEC(delta);
-
- res = (boolean_t)
- ((delta > ipst->
- ips_ip_multirt_resolution_interval) ||
- (!(flags & MULTIRT_USESTAMP)));
+ ASSERT(i >= 1);
+ ASSERT(i < MAX_IRE_RECURSION);
- ip2dbg(("ire_multirt_lookup_v6: fire %p, delta %lu, "
- "res %d\n",
- (void *)fire, delta, res));
-
- if (res) {
- /*
- * A resolver exists for the gateway: save
- * the current IRE_HOST ire as a candidate
- * best ire. If we later discover that a
- * top priority ire exists (i.e. no need to
- * call the resolver), then this new ire
- * will be preferred to the current one.
- */
- if (gw_ire != NULL) {
- if (best_fire == NULL) {
- ASSERT(best_cire == NULL);
-
- best_fire = fire;
- best_cire = gw_ire;
-
- ip2dbg(("ire_multirt_lookup_v6:"
- "found candidate "
- "best_fire %p, "
- "best_cire %p\n",
- (void *)best_fire,
- (void *)best_cire));
-
- /*
- * If MULTIRT_CACHEGW is not
- * set, we ignore the top
- * priority ires that can
- * be resolved without any
- * call to the resolver;
- * In that case, there is
- * actually no need
- * to continue the loop.
- */
- if (!(flags &
- MULTIRT_CACHEGW)) {
- break;
- }
- continue;
- }
- } else {
- /*
- * No resolver for the gateway: the
- * route is not resolvable.
- * If the MULTIRT_SETSTAMP flag is
- * set, we stamp the IRE_HOST ire,
- * so we will not select it again
- * during this resolution interval.
- */
- if (flags & MULTIRT_SETSTAMP)
- fire->ire_last_used_time =
- lbolt;
- }
- }
+ ires[i] = ires[i-1];
+ generations[i] = generations[i-1];
+ ires[i-1] = clone;
+ generations[i-1] = generation;
+ i++;
- if (gw_ire != NULL)
- ire_refrele(gw_ire);
+ ire = NULL;
+ goto done;
}
- } else { /* IN6_IS_ADDR_MULTICAST(&v6dst) */
- for (fire = first_fire;
- fire != NULL;
- fire = fire->ire_next) {
-
- if (!(fire->ire_flags & RTF_MULTIRT))
- continue;
- if (!IN6_ARE_ADDR_EQUAL(&fire->ire_addr_v6, &v6dst))
- continue;
-
- if (fire->ire_gw_secattr != NULL &&
- tsol_ire_match_gwattr(fire, tsl) != 0) {
- continue;
- }
-
- already_resolved = B_FALSE;
-
- mutex_enter(&fire->ire_lock);
- v6gw = fire->ire_gateway_addr_v6;
- mutex_exit(&fire->ire_lock);
-
- gw_ire = ire_ftable_lookup_v6(&v6gw, 0, 0,
- IRE_INTERFACE, NULL, NULL, ALL_ZONES, 0, tsl,
- MATCH_IRE_RECURSIVE | MATCH_IRE_TYPE |
- MATCH_IRE_SECATTR, ipst);
-
- /* No resolver for the gateway; we skip this ire. */
- if (gw_ire == NULL) {
- continue;
- }
+ /*
+ * We only match on the type and optionally ILL when
+ * recursing. The type match is used by some callers
+ * to exclude certain types (such as IRE_IF_CLONE or
+ * IRE_LOCAL|IRE_LOOPBACK).
+ */
+ match_args &= MATCH_IRE_TYPE;
+ v6nexthop = ire->ire_gateway_addr_v6;
+ if (ill == NULL && ire->ire_ill != NULL) {
+ ill = ire->ire_ill;
+ need_refrele = B_TRUE;
+ ill_refhold(ill);
+ match_args |= MATCH_IRE_ILL;
+ }
- if (first_cire) {
+ ire = NULL;
+ }
+ ASSERT(ire == NULL);
+ ire = ire_reject(ipst, B_TRUE);
- IRB_REFHOLD(cirb);
- /*
- * For all IRE_CACHE ires for that
- * destination.
- */
- for (cire = first_cire;
- cire != NULL;
- cire = cire->ire_next) {
-
- if (!(cire->ire_flags & RTF_MULTIRT))
- continue;
- if (!IN6_ARE_ADDR_EQUAL(
- &cire->ire_addr_v6, &v6dst))
- continue;
- if (cire->ire_marks &
- IRE_MARK_CONDEMNED)
- continue;
-
- if (cire->ire_gw_secattr != NULL &&
- tsol_ire_match_gwattr(cire,
- tsl) != 0) {
- continue;
- }
-
- /*
- * Cache entries are linked to the
- * parent routes using the parent handle
- * (ire_phandle). If no cache entry has
- * the same handle as fire, fire is
- * still unresolved.
- */
- ASSERT(cire->ire_phandle != 0);
- if (cire->ire_phandle ==
- fire->ire_phandle) {
- already_resolved = B_TRUE;
- break;
- }
- }
- IRB_REFRELE(cirb);
- }
+error:
+ ASSERT(ire != NULL);
+ if (need_refrele)
+ ill_refrele(ill);
- /*
- * This route is already resolved; proceed with
- * next one.
- */
- if (already_resolved) {
- ire_refrele(gw_ire);
- continue;
- }
+ /*
+ * In the case of MULTIRT we want to try a different IRE the next
+ * time. We let the next packet retry in that case.
+ */
+ if (i > 0 && (ires[0]->ire_flags & RTF_MULTIRT))
+ (void) ire_no_good(ires[0]);
- /*
- * Compute the time elapsed since our preceding
- * attempt to resolve that route.
- * If the MULTIRT_USESTAMP flag is set, we take
- * that route into account only if this time
- * interval exceeds ip_multirt_resolution_interval;
- * this prevents us from attempting to resolve a
- * broken route upon each sending of a packet.
- */
- delta = lbolt - fire->ire_last_used_time;
- delta = TICK_TO_MSEC(delta);
-
- res = (boolean_t)
- ((delta > ipst->
- ips_ip_multirt_resolution_interval) ||
- (!(flags & MULTIRT_USESTAMP)));
-
- ip3dbg(("ire_multirt_lookup_v6: fire %p, delta %lx, "
- "flags %04x, res %d\n",
- (void *)fire, delta, flags, res));
-
- if (res) {
- if (best_cire) {
- /*
- * Release the resolver associated
- * to the preceding candidate best
- * ire, if any.
- */
- ire_refrele(best_cire);
- ASSERT(best_fire);
- }
- best_fire = fire;
- best_cire = gw_ire;
- continue;
- }
+cleanup:
+ /* cleanup ires[i] */
+ ire_dep_unbuild(ires, i);
+ for (j = 0; j < i; j++)
+ ire_refrele(ires[j]);
- ire_refrele(gw_ire);
- }
- }
+ ASSERT(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE));
+ /*
+ * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the
+ * ip_select_route since the reject or lack of memory might be gone.
+ */
+ if (generationp != NULL)
+ *generationp = IRE_GENERATION_VERIFY;
+ return (ire);
- if (best_fire) {
- IRE_REFHOLD(best_fire);
+done:
+ ASSERT(ire == NULL);
+ if (need_refrele)
+ ill_refrele(ill);
+
+ /* Build dependencies */
+ if (!ire_dep_build(ires, generations, i)) {
+ /* Something in chain was condemned; tear it apart */
+ ire = ire_blackhole(ipst, B_TRUE);
+ goto cleanup;
}
- IRB_REFRELE(firb);
- /* Release the first IRE_CACHE we initially looked up, if any. */
- if (first_cire)
- ire_refrele(first_cire);
-
- /* Found a resolvable route. */
- if (best_fire) {
- ASSERT(best_cire);
-
- if (*fire_arg)
- ire_refrele(*fire_arg);
- if (*ire_arg)
- ire_refrele(*ire_arg);
+ /*
+ * Release all refholds except the one for ires[0] that we
+ * will return to the caller.
+ */
+ for (j = 1; j < i; j++)
+ ire_refrele(ires[j]);
+ if (invalidate) {
/*
- * Update the passed arguments with the
- * resolvable multirt route we found
+ * Since we needed to allocate but couldn't we need to make
+ * sure that the dependency chain is rebuilt the next time.
*/
- *fire_arg = best_fire;
- *ire_arg = best_cire;
-
- ip2dbg(("ire_multirt_lookup_v6: returning B_TRUE, "
- "*fire_arg %p, *ire_arg %p\n",
- (void *)best_fire, (void *)best_cire));
-
- return (B_TRUE);
+ ire_dep_invalidate_generations(ires[0]);
+ generation = IRE_GENERATION_VERIFY;
+ } else {
+ /*
+ * IREs can have been added or deleted while we did the
+ * recursive lookup and we can't catch those until we've built
+ * the dependencies. We verify the stored
+ * ire_dep_parent_generation to catch any such changes and
+ * return IRE_GENERATION_VERIFY (which will cause
+ * ip_select_route to be called again so we can redo the
+ * recursive lookup next time we send a packet.
+ */
+ generation = ire_dep_validate_generations(ires[0]);
+ if (generations[0] != ires[0]->ire_generation) {
+ /* Something changed at the top */
+ generation = IRE_GENERATION_VERIFY;
+ }
}
+ if (generationp != NULL)
+ *generationp = generation;
- ASSERT(best_cire == NULL);
-
- ip2dbg(("ire_multirt_lookup_v6: returning B_FALSE, *fire_arg %p, "
- "*ire_arg %p\n",
- (void *)*fire_arg, (void *)*ire_arg));
-
- /* No resolvable route. */
- return (B_FALSE);
+ return (ires[0]);
}
-
-/*
- * Find an IRE_OFFSUBNET IRE entry for the multicast address 'v6dstp'
- * that goes through 'ipif'. As a fallback, a route that goes through
- * ipif->ipif_ill can be returned.
- */
ire_t *
-ipif_lookup_multi_ire_v6(ipif_t *ipif, const in6_addr_t *v6dstp)
+ire_route_recursive_v6(const in6_addr_t *nexthop, uint_t ire_type,
+ const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
+ boolean_t allocate, uint32_t xmit_hint, ip_stack_t *ipst,
+ in6_addr_t *setsrcp, tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
{
- ire_t *ire;
- ire_t *save_ire = NULL;
- ire_t *gw_ire;
- irb_t *irb;
- in6_addr_t v6gw;
- int match_flags = MATCH_IRE_TYPE | MATCH_IRE_ILL;
- ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
-
- ire = ire_ftable_lookup_v6(v6dstp, 0, 0, 0, NULL, NULL, ALL_ZONES, 0,
- NULL, MATCH_IRE_DEFAULT, ipst);
-
- if (ire == NULL)
- return (NULL);
-
- irb = ire->ire_bucket;
- ASSERT(irb);
-
- IRB_REFHOLD(irb);
- ire_refrele(ire);
- for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
- if (!IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, v6dstp) ||
- (ipif->ipif_zoneid != ire->ire_zoneid &&
- ire->ire_zoneid != ALL_ZONES)) {
- continue;
- }
-
- switch (ire->ire_type) {
- case IRE_DEFAULT:
- case IRE_PREFIX:
- case IRE_HOST:
- mutex_enter(&ire->ire_lock);
- v6gw = ire->ire_gateway_addr_v6;
- mutex_exit(&ire->ire_lock);
- gw_ire = ire_ftable_lookup_v6(&v6gw, 0, 0,
- IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0,
- NULL, match_flags, ipst);
-
- if (gw_ire != NULL) {
- if (save_ire != NULL) {
- ire_refrele(save_ire);
- }
- IRE_REFHOLD(ire);
- if (gw_ire->ire_ipif == ipif) {
- ire_refrele(gw_ire);
-
- IRB_REFRELE(irb);
- return (ire);
- }
- ire_refrele(gw_ire);
- save_ire = ire;
- }
- break;
- case IRE_IF_NORESOLVER:
- case IRE_IF_RESOLVER:
- if (ire->ire_ipif == ipif) {
- if (save_ire != NULL) {
- ire_refrele(save_ire);
- }
- IRE_REFHOLD(ire);
-
- IRB_REFRELE(irb);
- return (ire);
- }
- break;
- }
- }
- IRB_REFRELE(irb);
-
- return (save_ire);
+ return (ire_route_recursive_impl_v6(NULL, nexthop, ire_type, ill,
+ zoneid, tsl, match_args, allocate, xmit_hint, ipst, setsrcp,
+ gwattrp, generationp));
}
/*
- * This is the implementation of the IPv6 IRE cache lookup procedure.
- * Separating the interface from the implementation allows additional
- * flexibility when specifying search criteria.
+ * Recursively look for a route to the destination.
+ * We only handle a destination match here, yet we have the same arguments
+ * as the full match to allow function pointers to select between the two.
+ *
+ * Note that this function never returns NULL. It returns an IRE_NOROUTE
+ * instead.
+ *
+ * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
+ * is an error.
+ * Allow at most one RTF_INDIRECT.
*/
-static ire_t *
-ip6_ctable_lookup_impl(ire_ctable_args_t *margs)
+ire_t *
+ire_route_recursive_dstonly_v6(const in6_addr_t *nexthop, boolean_t allocate,
+ uint32_t xmit_hint, ip_stack_t *ipst)
{
- irb_t *irb_ptr;
- ire_t *ire;
- ip_stack_t *ipst = margs->ict_ipst;
+ ire_t *ire;
+ ire_t *ire1;
+ uint_t generation;
- if ((margs->ict_flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) &&
- (margs->ict_ipif == NULL)) {
- return (NULL);
- }
+ /* ire_ftable_lookup handles round-robin/ECMP */
+ ire = ire_ftable_lookup_simple_v6(nexthop, xmit_hint, ipst,
+ &generation);
+ ASSERT(ire != NULL);
- irb_ptr = &ipst->ips_ip_cache_table_v6[IRE_ADDR_HASH_V6(
- *((in6_addr_t *)(margs->ict_addr)),
- ipst->ips_ip6_cache_table_size)];
- rw_enter(&irb_ptr->irb_lock, RW_READER);
- for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) {
- if (ire->ire_marks & IRE_MARK_CONDEMNED)
- continue;
- ASSERT(IN6_ARE_ADDR_EQUAL(&ire->ire_mask_v6, &ipv6_all_ones));
- if (ire_match_args_v6(ire, (in6_addr_t *)margs->ict_addr,
- &ire->ire_mask_v6, (in6_addr_t *)margs->ict_gateway,
- margs->ict_type, margs->ict_ipif, margs->ict_zoneid, 0,
- margs->ict_tsl, margs->ict_flags)) {
- IRE_REFHOLD(ire);
- rw_exit(&irb_ptr->irb_lock);
- return (ire);
- }
+ /*
+ * If this type should have an ire_nce_cache (even if it
+ * doesn't yet have one) then we are done. Includes
+ * IRE_INTERFACE with a full 128 bit mask.
+ */
+ if (ire->ire_nce_capable)
+ return (ire);
+
+ /*
+ * If the IRE has a current cached parent we know that the whole
+ * parent chain is current, hence we don't need to discover and
+ * build any dependencies by doing a recursive lookup.
+ */
+ mutex_enter(&ire->ire_lock);
+ if (ire->ire_dep_parent != NULL &&
+ ire->ire_dep_parent->ire_generation ==
+ ire->ire_dep_parent_generation) {
+ mutex_exit(&ire->ire_lock);
+ return (ire);
}
+ mutex_exit(&ire->ire_lock);
- rw_exit(&irb_ptr->irb_lock);
- return (NULL);
+ /*
+ * Fallback to loop in the normal code starting with the ire
+ * we found. Normally this would return the same ire.
+ */
+ ire1 = ire_route_recursive_impl_v6(ire, nexthop, 0, NULL, ALL_ZONES,
+ NULL, MATCH_IRE_DSTONLY, allocate, xmit_hint, ipst, NULL, NULL,
+ &generation);
+ ire_refrele(ire);
+ return (ire1);
}
diff --git a/usr/src/uts/common/inet/ip/ip6_output.c b/usr/src/uts/common/inet/ip/ip6_output.c
new file mode 100644
index 0000000000..3e06050781
--- /dev/null
+++ b/usr/src/uts/common/inet/ip/ip6_output.c
@@ -0,0 +1,1315 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/* Copyright (c) 1990 Mentat Inc. */
+
+#include <sys/types.h>
+#include <sys/stream.h>
+#include <sys/strsubr.h>
+#include <sys/dlpi.h>
+#include <sys/strsun.h>
+#include <sys/zone.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/cmn_err.h>
+#include <sys/debug.h>
+#include <sys/atomic.h>
+
+#include <sys/systm.h>
+#include <sys/param.h>
+#include <sys/kmem.h>
+#include <sys/sdt.h>
+#include <sys/socket.h>
+#include <sys/mac.h>
+#include <net/if.h>
+#include <net/if_arp.h>
+#include <net/route.h>
+#include <sys/sockio.h>
+#include <netinet/in.h>
+#include <net/if_dl.h>
+
+#include <inet/common.h>
+#include <inet/mi.h>
+#include <inet/mib2.h>
+#include <inet/nd.h>
+#include <inet/arp.h>
+#include <inet/snmpcom.h>
+#include <inet/kstatcom.h>
+
+#include <netinet/igmp_var.h>
+#include <netinet/ip6.h>
+#include <netinet/icmp6.h>
+#include <netinet/sctp.h>
+
+#include <inet/ip.h>
+#include <inet/ip_impl.h>
+#include <inet/ip6.h>
+#include <inet/ip6_asp.h>
+#include <inet/tcp.h>
+#include <inet/ip_multi.h>
+#include <inet/ip_if.h>
+#include <inet/ip_ire.h>
+#include <inet/ip_ftable.h>
+#include <inet/ip_rts.h>
+#include <inet/optcom.h>
+#include <inet/ip_ndp.h>
+#include <inet/ip_listutils.h>
+#include <netinet/igmp.h>
+#include <netinet/ip_mroute.h>
+#include <inet/ipp_common.h>
+
+#include <net/pfkeyv2.h>
+#include <inet/sadb.h>
+#include <inet/ipsec_impl.h>
+#include <inet/ipdrop.h>
+#include <inet/ip_netinfo.h>
+
+#include <sys/pattr.h>
+#include <inet/ipclassifier.h>
+#include <inet/sctp_ip.h>
+#include <inet/sctp/sctp_impl.h>
+#include <inet/udp_impl.h>
+#include <sys/sunddi.h>
+
+#include <sys/tsol/label.h>
+#include <sys/tsol/tnet.h>
+
+#ifdef DEBUG
+extern boolean_t skip_sctp_cksum;
+#endif
+
+int
+ip_output_simple_v6(mblk_t *mp, ip_xmit_attr_t *ixa)
+{
+ ip6_t *ip6h;
+ in6_addr_t firsthop; /* In IP header */
+ in6_addr_t dst; /* End of source route, or ip6_dst if none */
+ ire_t *ire;
+ in6_addr_t setsrc;
+ int error;
+ ill_t *ill = NULL;
+ dce_t *dce = NULL;
+ nce_t *nce;
+ iaflags_t ixaflags = ixa->ixa_flags;
+ ip_stack_t *ipst = ixa->ixa_ipst;
+ uint8_t *nexthdrp;
+ boolean_t repeat = B_FALSE;
+ boolean_t multirt = B_FALSE;
+ uint_t ifindex;
+
+ ip6h = (ip6_t *)mp->b_rptr;
+ ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION);
+
+ ASSERT(ixa->ixa_nce == NULL);
+
+ ixa->ixa_pktlen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
+ ASSERT(ixa->ixa_pktlen == msgdsize(mp));
+ if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &ixa->ixa_ip_hdr_length,
+ &nexthdrp)) {
+ /* Malformed packet */
+ BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
+ BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
+ ip_drop_output("ipIfStatsOutDiscards", mp, NULL);
+ freemsg(mp);
+ return (EINVAL);
+ }
+ ixa->ixa_protocol = *nexthdrp;
+
+ /*
+ * Assumes that source routed packets have already been massaged by
+ * the ULP (ip_massage_options_v6) and as a result ip6_dst is the next
+ * hop in the source route. The final destination is used for IPsec
+ * policy and DCE lookup.
+ */
+ firsthop = ip6h->ip6_dst;
+ dst = ip_get_dst_v6(ip6h, mp, NULL);
+
+repeat_ire:
+ error = 0;
+ setsrc = ipv6_all_zeros;
+ ire = ip_select_route_v6(&firsthop, ixa, NULL, &setsrc, &error,
+ &multirt);
+ ASSERT(ire != NULL); /* IRE_NOROUTE if none found */
+ if (error != 0) {
+ BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
+ BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
+ ip_drop_output("ipIfStatsOutDiscards", mp, NULL);
+ freemsg(mp);
+ goto done;
+ }
+
+ if (ire->ire_flags & (RTF_BLACKHOLE|RTF_REJECT)) {
+ /* ire_ill might be NULL hence need to skip some code */
+ if (ixaflags & IXAF_SET_SOURCE)
+ ip6h->ip6_src = ipv6_loopback;
+ ixa->ixa_fragsize = IP_MAXPACKET;
+ ire->ire_ob_pkt_count++;
+ BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
+ /* No dce yet; use default one */
+ error = (ire->ire_sendfn)(ire, mp, ip6h, ixa,
+ &ipst->ips_dce_default->dce_ident);
+ goto done;
+ }
+
+ /* Note that ip6_dst is only used for IRE_MULTICAST */
+ nce = ire_to_nce(ire, INADDR_ANY, &ip6h->ip6_dst);
+ if (nce == NULL) {
+ /* Allocation failure? */
+ ip_drop_output("ire_to_nce", mp, ill);
+ freemsg(mp);
+ error = ENOBUFS;
+ goto done;
+ }
+ if (nce->nce_is_condemned) {
+ nce_t *nce1;
+
+ nce1 = ire_handle_condemned_nce(nce, ire, NULL, ip6h, B_TRUE);
+ nce_refrele(nce);
+ if (nce1 == NULL) {
+ if (!repeat) {
+ /* Try finding a better IRE */
+ repeat = B_TRUE;
+ ire_refrele(ire);
+ goto repeat_ire;
+ }
+ /* Tried twice - drop packet */
+ BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
+ ip_drop_output("No nce", mp, ill);
+ freemsg(mp);
+ error = ENOBUFS;
+ goto done;
+ }
+ nce = nce1;
+ }
+ /*
+ * For multicast with multirt we have a flag passed back from
+ * ire_lookup_multi_ill_v6 since we don't have an IRE for each
+ * possible multicast address.
+ * We also need a flag for multicast since we can't check
+ * whether RTF_MULTIRT is set in ixa_ire for multicast.
+ */
+ if (multirt) {
+ ixa->ixa_postfragfn = ip_postfrag_multirt_v6;
+ ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST;
+ } else {
+ ixa->ixa_postfragfn = ire->ire_postfragfn;
+ ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST;
+ }
+ ASSERT(ixa->ixa_nce == NULL);
+ ixa->ixa_nce = nce;
+
+ /*
+ * Check for a dce_t with a path mtu.
+ */
+ ifindex = 0;
+ if (IN6_IS_ADDR_LINKSCOPE(&dst))
+ ifindex = nce->nce_common->ncec_ill->ill_phyint->phyint_ifindex;
+
+ dce = dce_lookup_v6(&dst, ifindex, ipst, NULL);
+ ASSERT(dce != NULL);
+
+ if (!(ixaflags & IXAF_PMTU_DISCOVERY)) {
+ ixa->ixa_fragsize = IPV6_MIN_MTU;
+ } else if (dce->dce_flags & DCEF_PMTU) {
+ /*
+ * To avoid a periodic timer to increase the path MTU we
+ * look at dce_last_change_time each time we send a packet.
+ */
+ if (TICK_TO_SEC(lbolt64) - dce->dce_last_change_time >
+ ipst->ips_ip_pathmtu_interval) {
+ /*
+ * Older than 20 minutes. Drop the path MTU information.
+ */
+ mutex_enter(&dce->dce_lock);
+ dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU);
+ dce->dce_last_change_time = TICK_TO_SEC(lbolt64);
+ mutex_exit(&dce->dce_lock);
+ dce_increment_generation(dce);
+ ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire);
+ } else {
+ uint_t fragsize;
+
+ fragsize = ip_get_base_mtu(nce->nce_ill, ire);
+ if (fragsize > dce->dce_pmtu)
+ fragsize = dce->dce_pmtu;
+ ixa->ixa_fragsize = fragsize;
+ }
+ } else {
+ ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire);
+ }
+
+ /*
+	 * We use ire_nexthop_ill (and not ncec_ill) to avoid the under ipmp
+ * interface for source address selection.
+ */
+ ill = ire_nexthop_ill(ire);
+
+ if (ixaflags & IXAF_SET_SOURCE) {
+ in6_addr_t src;
+
+ /*
+ * We use the final destination to get
+ * correct selection for source routed packets
+ */
+
+ /* If unreachable we have no ill but need some source */
+ if (ill == NULL) {
+ src = ipv6_loopback;
+ error = 0;
+ } else {
+ error = ip_select_source_v6(ill, &setsrc, &dst,
+ ixa->ixa_zoneid, ipst, B_FALSE,
+ ixa->ixa_src_preferences, &src, NULL, NULL);
+ }
+ if (error != 0) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests);
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
+ ip_drop_output("ipIfStatsOutDiscards - no source",
+ mp, ill);
+ freemsg(mp);
+ goto done;
+ }
+ ip6h->ip6_src = src;
+ } else if (ixaflags & IXAF_VERIFY_SOURCE) {
+ /* Check if the IP source is assigned to the host. */
+ if (!ip_verify_src(mp, ixa, NULL)) {
+ /* Don't send a packet with a source that isn't ours */
+ BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
+ BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
+ ip_drop_output("ipIfStatsOutDiscards - invalid source",
+ mp, ill);
+ freemsg(mp);
+ error = EADDRNOTAVAIL;
+ goto done;
+ }
+ }
+
+ /*
+ * Check against global IPsec policy to set the AH/ESP attributes.
+ * IPsec will set IXAF_IPSEC_* and ixa_ipsec_* as appropriate.
+ */
+ if (!(ixaflags & (IXAF_NO_IPSEC|IXAF_IPSEC_SECURE))) {
+ ASSERT(ixa->ixa_ipsec_policy == NULL);
+ mp = ip_output_attach_policy(mp, NULL, ip6h, NULL, ixa);
+ if (mp == NULL) {
+ /* MIB and ip_drop_packet already done */
+ return (EHOSTUNREACH); /* IPsec policy failure */
+ }
+ }
+
+ if (ill != NULL) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests);
+ } else {
+ BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
+ }
+
+ /*
+ * We update the statistics on the most specific IRE i.e., the first
+ * one we found.
+ * We don't have an IRE when we fragment, hence ire_ob_pkt_count
+ * can only count the use prior to fragmentation. However the MIB
+ * counters on the ill will be incremented in post fragmentation.
+ */
+ ire->ire_ob_pkt_count++;
+
+ /*
+ * Based on ire_type and ire_flags call one of:
+ * ire_send_local_v6 - for IRE_LOCAL and IRE_LOOPBACK
+ * ire_send_multirt_v6 - if RTF_MULTIRT
+ * ire_send_noroute_v6 - if RTF_REJECT or RTF_BLACKHOLE
+ * ire_send_multicast_v6 - for IRE_MULTICAST
+ * ire_send_wire_v6 - for the rest.
+ */
+ error = (ire->ire_sendfn)(ire, mp, ip6h, ixa, &dce->dce_ident);
+done:
+ ire_refrele(ire);
+ if (dce != NULL)
+ dce_refrele(dce);
+ if (ill != NULL)
+ ill_refrele(ill);
+ if (ixa->ixa_nce != NULL)
+ nce_refrele(ixa->ixa_nce);
+ ixa->ixa_nce = NULL;
+ return (error);
+}
+
+/*
+ * ire_sendfn() functions.
+ * These functions use the following xmit_attr:
+ * - ixa_fragsize - read to determine whether or not to fragment
+ * - IXAF_IPSEC_SECURE - to determine whether or not to invoke IPsec
+ * - ixa_ipsec_* are used inside IPsec
+ * - IXAF_LOOPBACK_COPY - for multicast
+ */
+
+
+/*
+ * ire_sendfn for IRE_LOCAL and IRE_LOOPBACK
+ *
+ * Loopback delivery: no fragmentation, no nce, and no application of IPsec.
+ * The transmit attributes (ixa) are mapped to receive attributes (iras) and
+ * the packet is handed straight to ip_fanout_v6 for local delivery.
+ * Returns zero unless a firewall hook consumed the packet and reported an
+ * error.
+ *
+ * The checks for restrict_interzone_loopback are done in ire_route_recursive.
+ */
+/* ARGSUSED4 */
+int
+ire_send_local_v6(ire_t *ire, mblk_t *mp, void *iph_arg,
+    ip_xmit_attr_t *ixa, uint32_t *identp)
+{
+    ip6_t *ip6h = (ip6_t *)iph_arg;
+    ip_stack_t *ipst = ixa->ixa_ipst;
+    ill_t *ill = ire->ire_ill;
+    ip_recv_attr_t iras; /* NOTE: No bzero for performance */
+    uint_t pktlen = ixa->ixa_pktlen;
+
+    /*
+     * No fragmentation, no nce, and no application of IPsec.
+     *
+     *
+     * Note different order between IP provider and FW_HOOKS than in
+     * send_wire case.
+     */
+
+    /*
+     * DTrace this as ip:::send. A packet blocked by FW_HOOKS will fire the
+     * send probe, but not the receive probe.
+     */
+    DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
+        ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *, ip6h,
+        int, 1);
+
+    DTRACE_PROBE4(ip6__loopback__out__start,
+        ill_t *, NULL, ill_t *, ill,
+        ip6_t *, ip6h, mblk_t *, mp);
+
+    if (HOOKS6_INTERESTED_LOOPBACK_OUT(ipst)) {
+        int error;
+
+        /* FW_HOOKS may consume or replace mp; error is set if consumed */
+        FW_HOOKS(ipst->ips_ip6_loopback_out_event,
+            ipst->ips_ipv6firewall_loopback_out,
+            NULL, ill, ip6h, mp, mp, 0, ipst, error);
+
+        DTRACE_PROBE1(ip6__loopback__out__end, mblk_t *, mp);
+        if (mp == NULL)
+            return (error);
+
+        /*
+         * Even if the destination was changed by the filter we use the
+         * forwarding decision that was made based on the address
+         * in ip_output/ip_set_destination.
+         */
+        /* Length could be different */
+        ip6h = (ip6_t *)mp->b_rptr;
+        pktlen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
+    }
+
+    /*
+     * If a callback is enabled then we need to know the
+     * source and destination zoneids for the packet. We already
+     * have those handy.
+     */
+    if (ipst->ips_ip6_observe.he_interested) {
+        zoneid_t szone, dzone;
+        zoneid_t stackzoneid;
+
+        stackzoneid = netstackid_to_zoneid(
+            ipst->ips_netstack->netstack_stackid);
+
+        if (stackzoneid == GLOBAL_ZONEID) {
+            /* Shared-IP zone */
+            dzone = ire->ire_zoneid;
+            szone = ixa->ixa_zoneid;
+        } else {
+            /* Exclusive-IP zone: both ends are the stack's zone */
+            szone = dzone = stackzoneid;
+        }
+        ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill, ipst);
+    }
+
+    /* Handle lo0 stats */
+    ipst->ips_loopback_packets++;
+
+    /*
+     * Update output mib stats. Note that we can't move into the icmp
+     * sender (icmp_output etc) since they don't know the ill and the
+     * stats are per ill.
+     */
+    if (ixa->ixa_protocol == IPPROTO_ICMPV6) {
+        icmp6_t *icmp6;
+
+        icmp6 = (icmp6_t *)((uchar_t *)ip6h + ixa->ixa_ip_hdr_length);
+        icmp_update_out_mib_v6(ill, icmp6);
+    }
+
+    DTRACE_PROBE4(ip6__loopback__in__start,
+        ill_t *, ill, ill_t *, NULL,
+        ip6_t *, ip6h, mblk_t *, mp);
+
+    if (HOOKS6_INTERESTED_LOOPBACK_IN(ipst)) {
+        int error;
+
+        FW_HOOKS(ipst->ips_ip6_loopback_in_event,
+            ipst->ips_ipv6firewall_loopback_in,
+            ill, NULL, ip6h, mp, mp, 0, ipst, error);
+
+        DTRACE_PROBE1(ip6__loopback__in__end, mblk_t *, mp);
+        if (mp == NULL)
+            return (error);
+
+        /*
+         * Even if the destination was changed by the filter we use the
+         * forwarding decision that was made based on the address
+         * in ip_output/ip_set_destination.
+         */
+        /* Length could be different */
+        ip6h = (ip6_t *)mp->b_rptr;
+        pktlen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
+    }
+
+    DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
+        ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *, ip6h,
+        int, 1);
+
+    /* Map ixa to ira including IPsec policies */
+    ipsec_out_to_in(ixa, ill, &iras);
+    iras.ira_pktlen = pktlen;
+
+    ire->ire_ib_pkt_count++;
+    BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives);
+    UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, pktlen);
+
+    /* Destined to ire_zoneid - use that for fanout */
+    iras.ira_zoneid = ire->ire_zoneid;
+
+    if (is_system_labeled()) {
+        iras.ira_flags |= IRAF_SYSTEM_LABELED;
+
+        /*
+         * This updates ira_cred, ira_tsl and ira_free_flags based
+         * on the label. We don't expect this to ever fail for
+         * loopback packets, so we silently drop the packet should it
+         * fail.
+         */
+        if (!tsol_get_pkt_label(mp, IPV6_VERSION, &iras)) {
+            BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+            ip_drop_input("tsol_get_pkt_label", mp, ill);
+            freemsg(mp);
+            return (0);
+        }
+        ASSERT(iras.ira_tsl != NULL);
+
+        /* tsol_get_pkt_label sometimes does pullupmsg */
+        ip6h = (ip6_t *)mp->b_rptr;
+    }
+
+    ip_fanout_v6(mp, ip6h, &iras);
+
+    /* We moved any IPsec refs from ixa to iras */
+    ira_cleanup(&iras, B_FALSE);
+    return (0);
+}
+
+/*
+ * Apply the multirt transmit policy to an outgoing IPv6 packet:
+ * clamp the hop limit (this applies even when IPV6_HOPLIMIT was
+ * requested) and arrange for a fragment header to be inserted.
+ */
+static void
+multirt_check_v6(ire_t *ire, ip6_t *ip6h, ip_xmit_attr_t *ixa)
+{
+    ip_stack_t *ipst = ixa->ixa_ipst;
+    uint_t ttl_cap = ipst->ips_ip_multirt_ttl;
+
+    if (ire->ire_type & IRE_MULTICAST) {
+        /* Multicast multirt copies must not travel beyond one hop */
+        if (ip6h->ip6_hops > 1) {
+            ip2dbg(("ire_send_multirt_v6: forcing multicast "
+                "multirt TTL to 1 (was %d)\n", ip6h->ip6_hops));
+            ip6h->ip6_hops = 1;
+        }
+        /* Keep ire_send_multicast from rewriting the hop limit */
+        ixa->ixa_flags |= IXAF_NO_TTL_CHANGE;
+    } else if (ttl_cap > 0 && ip6h->ip6_hops > ttl_cap) {
+        ip6h->ip6_hops = ttl_cap;
+        /*
+         * Keep the ttl from being increased should we pass through
+         * ire_send_multicast later.
+         */
+        ixa->ixa_flags |= IXAF_NO_TTL_CHANGE;
+    }
+
+    /* For IPv6 multirt always inserts a fragment header */
+    ixa->ixa_flags |= IXAF_IPV6_ADD_FRAGHDR;
+}
+
+/*
+ * ire_sendfn for IRE_MULTICAST
+ *
+ * Decides whether a loopback copy of the transmitted packet is needed,
+ * applies the multicast/multirt hop-limit policy, and then hands the
+ * packet to ire_send_wire_v6.
+ *
+ * Note that we do path MTU discovery by default for IPv6 multicast. But
+ * since unconnected UDP and RAW sockets don't set IXAF_PMTU_DISCOVERY
+ * only connected sockets get this by default.
+ */
+int
+ire_send_multicast_v6(ire_t *ire, mblk_t *mp, void *iph_arg,
+    ip_xmit_attr_t *ixa, uint32_t *identp)
+{
+    ip6_t *ip6h = (ip6_t *)iph_arg;
+    ip_stack_t *ipst = ixa->ixa_ipst;
+    ill_t *ill = ire->ire_ill;
+    iaflags_t ixaflags = ixa->ixa_flags;
+
+    /*
+     * The IRE_MULTICAST is the same whether or not multirt is in use.
+     * Hence we need special-case code.
+     */
+    if (ixaflags & IXAF_MULTIRT_MULTICAST) {
+        multirt_check_v6(ire, ip6h, ixa);
+        /*
+         * Reload the local snapshot: multirt_check_v6 may have set
+         * IXAF_NO_TTL_CHANGE (and IXAF_IPV6_ADD_FRAGHDR) in
+         * ixa_flags, and the hop-limit test below must see it or the
+         * multirt clamp would be overwritten with ixa_multicast_ttl.
+         */
+        ixaflags = ixa->ixa_flags;
+    }
+
+    /*
+     * Check if anything in ip_input_v6 wants a copy of the transmitted
+     * packet (after IPsec and fragmentation)
+     *
+     * 1. Multicast routers always need a copy unless SO_DONTROUTE is set
+     *    RSVP and the rsvp daemon is an example of a
+     *    protocol and user level process that
+     *    handles its own routing. Hence, it uses the
+     *    SO_DONTROUTE option to accomplish this.
+     * 2. If the sender has set IP_MULTICAST_LOOP, then we just
+     *    check whether there are any receivers for the group on the ill
+     *    (ignoring the zoneid).
+     * 3. If IP_MULTICAST_LOOP is not set, then we check if there are
+     *    any members in other shared-IP zones.
+     *    If such members exist, then we indicate that the sending zone
+     *    shouldn't get a loopback copy to preserve the IP_MULTICAST_LOOP
+     *    behavior.
+     *
+     * When we loopback we skip hardware checksum to make sure the
+     * loopback copy is checksummed.
+     *
+     * Note that ire_ill is the upper in the case of IPMP.
+     */
+    ixa->ixa_flags &= ~(IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM);
+    if (ipst->ips_ip_g_mrouter && ill->ill_mrouter_cnt > 0 &&
+        !(ixaflags & IXAF_DONTROUTE)) {
+        ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;
+    } else if (ixaflags & IXAF_MULTICAST_LOOP) {
+        /*
+         * If this zone or any other zone has members then loopback
+         * a copy.
+         */
+        if (ill_hasmembers_v6(ill, &ip6h->ip6_dst))
+            ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;
+    } else if (ipst->ips_netstack->netstack_numzones > 1) {
+        /*
+         * This zone should not have a copy. But there are some other
+         * zones which might have members.
+         */
+        if (ill_hasmembers_otherzones_v6(ill, &ip6h->ip6_dst,
+            ixa->ixa_zoneid)) {
+            ixa->ixa_flags |= IXAF_NO_LOOP_ZONEID_SET;
+            ixa->ixa_no_loop_zoneid = ixa->ixa_zoneid;
+            ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;
+        }
+    }
+
+    /*
+     * Unless IPV6_HOPLIMIT or ire_send_multirt_v6 already set a ttl,
+     * force the ttl to the IP_MULTICAST_TTL value
+     */
+    if (!(ixaflags & IXAF_NO_TTL_CHANGE)) {
+        ip6h->ip6_hops = ixa->ixa_multicast_ttl;
+    }
+
+    return (ire_send_wire_v6(ire, mp, ip6h, ixa, identp));
+}
+
+/*
+ * ire_sendfn for IREs with RTF_MULTIRT
+ *
+ * Applies the multirt hop-limit/fragment-header policy and then hands
+ * the packet to the multicast or wire send function as appropriate.
+ */
+int
+ire_send_multirt_v6(ire_t *ire, mblk_t *mp, void *iph_arg,
+    ip_xmit_attr_t *ixa, uint32_t *identp)
+{
+    ip6_t *pkt6 = (ip6_t *)iph_arg;
+
+    multirt_check_v6(ire, pkt6, ixa);
+
+    if (ire->ire_type & IRE_MULTICAST)
+        return (ire_send_multicast_v6(ire, mp, pkt6, ixa, identp));
+    return (ire_send_wire_v6(ire, mp, pkt6, ixa, identp));
+}
+
+/*
+ * ire_sendfn for IREs with RTF_REJECT/RTF_BLACKHOLE, including IRE_NOROUTE
+ *
+ * Reports the routing failure (RTM_MISS when there simply was no route),
+ * drops the packet, and for RTF_REJECT generates an ICMPv6 destination
+ * unreachable back to the sender.  Returns 0 for a silent blackhole,
+ * EHOSTUNREACH otherwise.
+ */
+/* ARGSUSED4 */
+int
+ire_send_noroute_v6(ire_t *ire, mblk_t *mp, void *iph_arg,
+    ip_xmit_attr_t *ixa, uint32_t *identp)
+{
+    ip6_t *ip6h = (ip6_t *)iph_arg;
+    ip_stack_t *ipst = ixa->ixa_ipst;
+    ill_t *ill;
+    ip_recv_attr_t iras;
+    boolean_t dummy;
+
+    BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes);
+
+    if (ire->ire_type & IRE_NOROUTE) {
+        /* A lack of a route as opposed to RTF_REJECT|BLACKHOLE */
+        ip_rts_change_v6(RTM_MISS, &ip6h->ip6_dst, 0, 0, 0, 0, 0, 0,
+            RTA_DST, ipst);
+    }
+
+    if (ire->ire_flags & RTF_BLACKHOLE) {
+        ip_drop_output("ipIfStatsOutNoRoutes RTF_BLACKHOLE", mp, NULL);
+        freemsg(mp);
+        /* No error even for local senders - silent blackhole */
+        return (0);
+    }
+    ip_drop_output("ipIfStatsOutNoRoutes RTF_REJECT", mp, NULL);
+
+    /*
+     * We need an ill_t for the ip_recv_attr_t even though this packet
+     * was never received and icmp_unreachable doesn't currently use
+     * ira_ill.
+     *
+     * Test the transmit-side IXAF_IS_IPV4 flag against ixa_flags; the
+     * previous code tested the receive-side name IRAF_IS_IPV4 here,
+     * which only worked because both names alias the shared
+     * IAF_IS_IPV4 bit.
+     */
+    ill = ill_lookup_on_name("lo0", B_FALSE,
+        !(ixa->ixa_flags & IXAF_IS_IPV4), &dummy, ipst);
+    if (ill == NULL) {
+        freemsg(mp);
+        return (EHOSTUNREACH);
+    }
+
+    bzero(&iras, sizeof (iras));
+    /* Map ixa to ira including IPsec policies */
+    ipsec_out_to_in(ixa, ill, &iras);
+
+    icmp_unreachable_v6(mp, ICMP6_DST_UNREACH_NOROUTE, B_FALSE, &iras);
+    /* We moved any IPsec refs from ixa to iras */
+    ira_cleanup(&iras, B_FALSE);
+
+    ill_refrele(ill);
+    return (EHOSTUNREACH);
+}
+
+/*
+ * Calculate a checksum ignoring any hardware capabilities
+ *
+ * Returns B_FALSE if the packet was too short for the checksum. Caller
+ * should free and do stats.
+ * (All paths in this IPv6 variant currently return B_TRUE.)
+ */
+static boolean_t
+ip_output_sw_cksum_v6(mblk_t *mp, ip6_t *ip6h, ip_xmit_attr_t *ixa)
+{
+    ip_stack_t *ipst = ixa->ixa_ipst;
+    uint_t pktlen = ixa->ixa_pktlen;
+    uint16_t *cksump;
+    uint32_t cksum;
+    uint8_t protocol = ixa->ixa_protocol;
+    uint16_t ip_hdr_length = ixa->ixa_ip_hdr_length;
+
+/* View the IPv6 header as an array of 16-bit words for the pseudo-header */
+#define iphs ((uint16_t *)ip6h)
+
+    /* Just in case it contained garbage */
+    DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS;
+
+    /*
+     * Calculate ULP checksum
+     */
+    if (protocol == IPPROTO_TCP) {
+        cksump = IPH_TCPH_CHECKSUMP(ip6h, ip_hdr_length);
+        cksum = IP_TCP_CSUM_COMP;
+    } else if (protocol == IPPROTO_UDP) {
+        cksump = IPH_UDPH_CHECKSUMP(ip6h, ip_hdr_length);
+        cksum = IP_UDP_CSUM_COMP;
+    } else if (protocol == IPPROTO_SCTP) {
+        sctp_hdr_t *sctph;
+
+        /* SCTP uses its own CRC32c checksum, no pseudo-header fold */
+        ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph)));
+        sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length);
+        /*
+         * Zero out the checksum field to ensure proper
+         * checksum calculation.
+         */
+        sctph->sh_chksum = 0;
+#ifdef DEBUG
+        if (!skip_sctp_cksum)
+#endif
+            sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length);
+        return (B_TRUE);
+    } else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
+        /*
+         * icmp has placed length and routing
+         * header adjustment in the checksum field.
+         */
+        cksump = (uint16_t *)(((uint8_t *)ip6h) + ip_hdr_length +
+            ixa->ixa_raw_cksum_offset);
+        cksum = htons(protocol);
+    } else if (protocol == IPPROTO_ICMPV6) {
+        cksump = IPH_ICMPV6_CHECKSUMP(ip6h, ip_hdr_length);
+        cksum = IP_ICMPV6_CSUM_COMP; /* Pseudo-header cksum */
+    } else {
+        /* No ULP checksum for this protocol */
+        return (B_TRUE);
+    }
+
+    /* The ULP puts the checksum field in the first mblk */
+    ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr);
+
+    /*
+     * We accumulate the pseudo header checksum in cksum.
+     * This is pretty hairy code, so watch close. One
+     * thing to keep in mind is that UDP and TCP have
+     * stored their respective datagram lengths in their
+     * checksum fields. This lines things up real nice.
+     *
+     * iphs[4..19] are the sixteen 16-bit words of the IPv6 source and
+     * destination addresses (bytes 8-39 of the IPv6 header).
+     */
+    cksum += iphs[4] + iphs[5] + iphs[6] + iphs[7] +
+        iphs[8] + iphs[9] + iphs[10] + iphs[11] +
+        iphs[12] + iphs[13] + iphs[14] + iphs[15] +
+        iphs[16] + iphs[17] + iphs[18] + iphs[19];
+    cksum = IP_CSUM(mp, ip_hdr_length, cksum);
+
+    /*
+     * For UDP/IPv6 a zero UDP checksum is not allowed.
+     * Change to 0xffff
+     */
+    if (protocol == IPPROTO_UDP && cksum == 0)
+        *cksump = ~cksum;
+    else
+        *cksump = cksum;
+
+    IP6_STAT(ipst, ip6_out_sw_cksum);
+    IP6_STAT_UPDATE(ipst, ip6_out_sw_cksum_bytes, pktlen);
+
+    /* No IP header checksum for IPv6 */
+
+    return (B_TRUE);
+#undef iphs
+}
+
+/* There are drivers that can't do partial checksum for ICMPv6 */
+int nxge_cksum_workaround = 1;
+
+/*
+ * Calculate the ULP checksum - try to use hardware.
+ * In the case of MULTIRT or multicast the
+ * IXAF_NO_HW_CKSUM is set in which case we use software.
+ *
+ * Falls back to ip_output_sw_cksum_v6 when the ill offers neither full
+ * nor (usable) partial checksum offload.
+ *
+ * Returns B_FALSE if the packet was too short for the checksum. Caller
+ * should free and do stats.
+ */
+static boolean_t
+ip_output_cksum_v6(iaflags_t ixaflags, mblk_t *mp, ip6_t *ip6h,
+    ip_xmit_attr_t *ixa, ill_t *ill)
+{
+    uint_t pktlen = ixa->ixa_pktlen;
+    uint16_t *cksump;
+    uint16_t hck_flags;
+    uint32_t cksum;
+    uint8_t protocol = ixa->ixa_protocol;
+    uint16_t ip_hdr_length = ixa->ixa_ip_hdr_length;
+
+/* View the IPv6 header as an array of 16-bit words for the pseudo-header */
+#define iphs ((uint16_t *)ip6h)
+
+    if ((ixaflags & IXAF_NO_HW_CKSUM) || !ILL_HCKSUM_CAPABLE(ill) ||
+        !dohwcksum) {
+        return (ip_output_sw_cksum_v6(mp, ip6h, ixa));
+    }
+
+    /*
+     * Calculate ULP checksum. Note that we don't use cksump and cksum
+     * if the ill has FULL support.
+     */
+    if (protocol == IPPROTO_TCP) {
+        cksump = IPH_TCPH_CHECKSUMP(ip6h, ip_hdr_length);
+        cksum = IP_TCP_CSUM_COMP; /* Pseudo-header cksum */
+    } else if (protocol == IPPROTO_UDP) {
+        cksump = IPH_UDPH_CHECKSUMP(ip6h, ip_hdr_length);
+        cksum = IP_UDP_CSUM_COMP; /* Pseudo-header cksum */
+    } else if (protocol == IPPROTO_SCTP) {
+        sctp_hdr_t *sctph;
+
+        /* SCTP always computes its CRC32c in software here */
+        ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph)));
+        sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length);
+        /*
+         * Zero out the checksum field to ensure proper
+         * checksum calculation.
+         */
+        sctph->sh_chksum = 0;
+#ifdef DEBUG
+        if (!skip_sctp_cksum)
+#endif
+            sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length);
+        goto ip_hdr_cksum;
+    } else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
+        /*
+         * icmp has placed length and routing
+         * header adjustment in the checksum field.
+         */
+        cksump = (uint16_t *)(((uint8_t *)ip6h) + ip_hdr_length +
+            ixa->ixa_raw_cksum_offset);
+        cksum = htons(protocol);
+    } else if (protocol == IPPROTO_ICMPV6) {
+        cksump = IPH_ICMPV6_CHECKSUMP(ip6h, ip_hdr_length);
+        cksum = IP_ICMPV6_CSUM_COMP; /* Pseudo-header cksum */
+    } else {
+        /* Label also reached via goto from the SCTP branch above */
+    ip_hdr_cksum:
+        /* No IP header checksum for IPv6 */
+        return (B_TRUE);
+    }
+
+    /* The ULP puts the checksum field in the first mblk */
+    ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr);
+
+    /*
+     * Underlying interface supports hardware checksum offload for
+     * the payload; leave the payload checksum for the hardware to
+     * calculate. N.B: We only need to set up checksum info on the
+     * first mblk.
+     */
+    hck_flags = ill->ill_hcksum_capab->ill_hcksum_txflags;
+
+    DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS;
+    if (hck_flags & HCKSUM_INET_FULL_V6) {
+        /*
+         * Hardware calculates pseudo-header, header and the
+         * payload checksums, so clear the checksum field in
+         * the protocol header.
+         */
+        *cksump = 0;
+        DB_CKSUMFLAGS(mp) |= HCK_FULLCKSUM;
+        return (B_TRUE);
+    }
+    /* nxge_cksum_workaround: some drivers can't partial-cksum ICMPv6 */
+    if (((hck_flags) & HCKSUM_INET_PARTIAL) &&
+        (protocol != IPPROTO_ICMPV6 || !nxge_cksum_workaround)) {
+        /*
+         * Partial checksum offload has been enabled. Fill
+         * the checksum field in the protocol header with the
+         * pseudo-header checksum value.
+         *
+         * We accumulate the pseudo header checksum in cksum.
+         * This is pretty hairy code, so watch close. One
+         * thing to keep in mind is that UDP and TCP have
+         * stored their respective datagram lengths in their
+         * checksum fields. This lines things up real nice.
+         *
+         * iphs[4..19] are the sixteen 16-bit words of the IPv6
+         * source and destination addresses.
+         */
+        cksum += iphs[4] + iphs[5] + iphs[6] + iphs[7] +
+            iphs[8] + iphs[9] + iphs[10] + iphs[11] +
+            iphs[12] + iphs[13] + iphs[14] + iphs[15] +
+            iphs[16] + iphs[17] + iphs[18] + iphs[19];
+        cksum += *(cksump);
+        /* Fold the 32-bit sum twice to get a 16-bit ones-complement sum */
+        cksum = (cksum & 0xFFFF) + (cksum >> 16);
+        *(cksump) = (cksum & 0xFFFF) + (cksum >> 16);
+
+        /*
+         * Offsets are relative to beginning of IP header.
+         */
+        DB_CKSUMSTART(mp) = ip_hdr_length;
+        DB_CKSUMSTUFF(mp) = (uint8_t *)cksump - (uint8_t *)ip6h;
+        DB_CKSUMEND(mp) = pktlen;
+        DB_CKSUMFLAGS(mp) |= HCK_PARTIALCKSUM;
+        return (B_TRUE);
+    }
+    /* Hardware capabilities include neither full nor partial IPv6 */
+    return (ip_output_sw_cksum_v6(mp, ip6h, ixa));
+#undef iphs
+}
+
+/*
+ * ire_sendfn for offlink and onlink destinations.
+ * Also called from the multicast, and multirt send functions.
+ *
+ * Assumes that the caller has a hold on the ire.
+ *
+ * Handles IPP policy processing, MAC labeling, fragment-header insertion,
+ * IPsec, fragmentation and the ULP checksum before handing the packet to
+ * ixa_postfragfn.  Returns zero on success or when the packet was consumed
+ * (dropped/delayed) with stats done, otherwise an errno.
+ *
+ * This function doesn't care if the IRE just became condemned since that
+ * can happen at any time.
+ */
+/* ARGSUSED */
+int
+ire_send_wire_v6(ire_t *ire, mblk_t *mp, void *iph_arg,
+    ip_xmit_attr_t *ixa, uint32_t *identp)
+{
+    ip_stack_t *ipst = ixa->ixa_ipst;
+    ip6_t *ip6h = (ip6_t *)iph_arg;
+    /* Local snapshot; deliberately modified below without updating ixa */
+    iaflags_t ixaflags = ixa->ixa_flags;
+    ill_t *ill;
+    uint32_t pktlen = ixa->ixa_pktlen;
+
+    ASSERT(ixa->ixa_nce != NULL);
+    ill = ixa->ixa_nce->nce_ill;
+
+    /*
+     * Update output mib stats. Note that we can't move into the icmp
+     * sender (icmp_output etc) since they don't know the ill and the
+     * stats are per ill.
+     *
+     * With IPMP we record the stats on the upper ill.
+     */
+    if (ixa->ixa_protocol == IPPROTO_ICMPV6) {
+        icmp6_t *icmp6;
+
+        icmp6 = (icmp6_t *)((uchar_t *)ip6h + ixa->ixa_ip_hdr_length);
+        icmp_update_out_mib_v6(ixa->ixa_nce->nce_common->ncec_ill,
+            icmp6);
+    }
+
+    if (ixaflags & IXAF_DONTROUTE)
+        ip6h->ip6_hops = 1;
+
+    /*
+     * This might set b_band, thus the IPsec and fragmentation
+     * code in IP ensures that b_band is updated in the first mblk.
+     */
+    if (IPP_ENABLED(IPP_LOCAL_OUT, ipst)) {
+        /* ip_process translates an IS_UNDER_IPMP */
+        mp = ip_process(IPP_LOCAL_OUT, mp, ill, ill);
+        if (mp == NULL) {
+            /* ip_drop_packet and MIB done */
+            return (0); /* Might just be delayed */
+        }
+    }
+
+    /*
+     * To handle IPsec/iptun's labeling needs we need to tag packets
+     * while we still have ixa_tsl
+     */
+    if (is_system_labeled() && ixa->ixa_tsl != NULL &&
+        (ill->ill_mactype == DL_6TO4 || ill->ill_mactype == DL_IPV4 ||
+        ill->ill_mactype == DL_IPV6)) {
+        cred_t *newcr;
+
+        newcr = copycred_from_tslabel(ixa->ixa_cred, ixa->ixa_tsl,
+            KM_NOSLEEP);
+        if (newcr == NULL) {
+            BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
+            ip_drop_output("ipIfStatsOutDiscards - newcr",
+                mp, ill);
+            freemsg(mp);
+            return (ENOBUFS);
+        }
+        mblk_setcred(mp, newcr, NOPID);
+        crfree(newcr); /* mblk_setcred did its own crhold */
+    }
+
+    /*
+     * IXAF_IPV6_ADD_FRAGHDR is set for CGTP so that we will add a
+     * fragment header without fragmenting. CGTP on the receiver will
+     * filter duplicates on the ident field.
+     */
+    if (pktlen > ixa->ixa_fragsize ||
+        (ixaflags & (IXAF_IPSEC_SECURE|IXAF_IPV6_ADD_FRAGHDR))) {
+        uint32_t ident;
+
+        if (ixaflags & IXAF_IPSEC_SECURE)
+            pktlen += ipsec_out_extra_length(ixa);
+
+        /*
+         * NOTE(review): this early return does not freemsg(mp) or
+         * bump a drop stat, unlike the other error paths here --
+         * confirm whether the caller frees on EMSGSIZE.
+         */
+        if (pktlen > IP_MAXPACKET)
+            return (EMSGSIZE);
+
+        if (ixaflags & IXAF_SET_ULP_CKSUM) {
+            /*
+             * Compute ULP checksum using software
+             */
+            if (!ip_output_sw_cksum_v6(mp, ip6h, ixa)) {
+                BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
+                ip_drop_output("ipIfStatsOutDiscards", mp, ill);
+                freemsg(mp);
+                return (EINVAL);
+            }
+            /* Avoid checksum again below if we only add fraghdr */
+            ixaflags &= ~IXAF_SET_ULP_CKSUM;
+        }
+
+        /*
+         * If we need a fragment header, pick the ident and insert
+         * the header before IPsec so we have a place to store
+         * the ident value.
+         */
+        if ((ixaflags & IXAF_IPV6_ADD_FRAGHDR) ||
+            pktlen > ixa->ixa_fragsize) {
+            /*
+             * If this packet would generate a icmp_frag_needed
+             * message, we need to handle it before we do the IPsec
+             * processing. Otherwise, we need to strip the IPsec
+             * headers before we send up the message to the ULPs
+             * which becomes messy and difficult.
+             */
+            if ((pktlen > ixa->ixa_fragsize) &&
+                (ixaflags & IXAF_DONTFRAG)) {
+                /* Generate ICMP and return error */
+                ip_recv_attr_t iras;
+
+                DTRACE_PROBE4(ip6__fragsize__fail,
+                    uint_t, pktlen, uint_t, ixa->ixa_fragsize,
+                    uint_t, ixa->ixa_pktlen,
+                    uint_t, ixa->ixa_pmtu);
+
+                bzero(&iras, sizeof (iras));
+                /* Map ixa to ira including IPsec policies */
+                ipsec_out_to_in(ixa, ill, &iras);
+
+                ip_drop_output("ICMP6_PKT_TOO_BIG", mp, ill);
+                icmp_pkt2big_v6(mp, ixa->ixa_fragsize, B_TRUE,
+                    &iras);
+                /* We moved any IPsec refs from ixa to iras */
+                ira_cleanup(&iras, B_FALSE);
+                return (EMSGSIZE);
+            }
+            DTRACE_PROBE4(ip6__fragsize__ok, uint_t, pktlen,
+                uint_t, ixa->ixa_fragsize, uint_t, ixa->ixa_pktlen,
+                uint_t, ixa->ixa_pmtu);
+            /*
+             * Assign an ident value for this packet. There could
+             * be other threads targeting the same destination, so
+             * we have to arrange for a atomic increment.
+             * Normally ixa_extra_ident is 0, but in the case of
+             * LSO it will be the number of TCP segments that the
+             * driver/hardware will additionally construct.
+             *
+             * Note that cl_inet_ipident has only been used for
+             * IPv4. We don't use it here.
+             */
+            ident = atomic_add_32_nv(identp, ixa->ixa_extra_ident +
+                1);
+#ifndef _BIG_ENDIAN
+            ident = htonl(ident);
+#endif
+            ixa->ixa_ident = ident; /* In case we do IPsec */
+        }
+        if (ixaflags & IXAF_IPSEC_SECURE) {
+            /*
+             * Pass in sufficient information so that
+             * IPsec can determine whether to fragment, and
+             * which function to call after fragmentation.
+             */
+            return (ipsec_out_process(mp, ixa));
+        }
+
+        mp = ip_fraghdr_add_v6(mp, ident, ixa);
+        if (mp == NULL) {
+            /* MIB and ip_drop_output already done */
+            return (ENOMEM);
+        }
+        ASSERT(pktlen == ixa->ixa_pktlen);
+        pktlen += sizeof (ip6_frag_t);
+
+        if (pktlen > ixa->ixa_fragsize) {
+            return (ip_fragment_v6(mp, ixa->ixa_nce, ixaflags,
+                pktlen, ixa->ixa_fragsize,
+                ixa->ixa_xmit_hint, ixa->ixa_zoneid,
+                ixa->ixa_no_loop_zoneid, ixa->ixa_postfragfn,
+                &ixa->ixa_cookie));
+        }
+    }
+    if (ixaflags & IXAF_SET_ULP_CKSUM) {
+        /* Compute ULP checksum and IP header checksum */
+        /* An IS_UNDER_IPMP ill is ok here */
+        if (!ip_output_cksum_v6(ixaflags, mp, ip6h, ixa, ill)) {
+            BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
+            ip_drop_output("ipIfStatsOutDiscards", mp, ill);
+            freemsg(mp);
+            return (EINVAL);
+        }
+    }
+    return ((ixa->ixa_postfragfn)(mp, ixa->ixa_nce, ixaflags,
+        pktlen, ixa->ixa_xmit_hint, ixa->ixa_zoneid,
+        ixa->ixa_no_loop_zoneid, &ixa->ixa_cookie));
+}
+
+/*
+ * Post fragmentation function for RTF_MULTIRT routes.
+ * Since IRE_MULTICASTs might have RTF_MULTIRT, this function
+ * checks IXAF_LOOPBACK_COPY.
+ *
+ * Walks the IRE bucket for the destination and transmits one copy of the
+ * (already fragmented) packet per RTF_MULTIRT route, plus the original on
+ * the caller's nce.
+ *
+ * If no packet is sent due to failures then we return an errno, but if at
+ * least one succeeded we return zero.
+ */
+int
+ip_postfrag_multirt_v6(mblk_t *mp, nce_t *nce, iaflags_t ixaflags,
+    uint_t pkt_len, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid,
+    uintptr_t *ixacookie)
+{
+    irb_t *irb;
+    ip6_t *ip6h = (ip6_t *)mp->b_rptr;
+    ire_t *ire;
+    ire_t *ire1;
+    mblk_t *mp1;
+    nce_t *nce1;
+    ill_t *ill = nce->nce_ill;
+    ill_t *ill1;
+    ip_stack_t *ipst = ill->ill_ipst;
+    int error = 0;
+    int num_sent = 0;
+    int err;
+    uint_t ire_type;
+    in6_addr_t nexthop;
+
+    ASSERT(!(ixaflags & IXAF_IS_IPV4));
+
+    /* Check for IXAF_LOOPBACK_COPY */
+    if (ixaflags & IXAF_LOOPBACK_COPY) {
+        /*
+         * NOTE(review): this inner mp1 shadows the function-scope
+         * mp1 declared above -- harmless but worth cleaning up.
+         */
+        mblk_t *mp1;
+
+        mp1 = copymsg(mp);
+        if (mp1 == NULL) {
+            /* Failed to deliver the loopback copy. */
+            BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
+            ip_drop_output("ipIfStatsOutDiscards", mp, ill);
+            error = ENOBUFS;
+        } else {
+            ip_postfrag_loopback(mp1, nce, ixaflags, pkt_len,
+                nolzid);
+        }
+    }
+
+    /*
+     * Loop over RTF_MULTIRT for ip6_dst in the same bucket. Send
+     * a copy to each one.
+     * Use the nce (nexthop) and ip6_dst to find the ire.
+     *
+     * MULTIRT is not designed to work with shared-IP zones thus we don't
+     * need to pass a zoneid or a label to the IRE lookup.
+     */
+    if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, &ip6h->ip6_dst)) {
+        /* Broadcast and multicast case */
+        ire = ire_ftable_lookup_v6(&ip6h->ip6_dst, 0, 0, 0, NULL,
+            ALL_ZONES, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL);
+    } else {
+        /* Unicast case */
+        ire = ire_ftable_lookup_v6(&ip6h->ip6_dst, 0, &nce->nce_addr,
+            0, NULL, ALL_ZONES, NULL, MATCH_IRE_GW, 0, ipst, NULL);
+    }
+
+    if (ire == NULL ||
+        (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
+        !(ire->ire_flags & RTF_MULTIRT)) {
+        /* Drop */
+        ip_drop_output("ip_postfrag_multirt didn't find route",
+            mp, nce->nce_ill);
+        if (ire != NULL)
+            ire_refrele(ire);
+        return (ENETUNREACH);
+    }
+
+    /* Hold the bucket so the chain is stable while we walk it */
+    irb = ire->ire_bucket;
+    irb_refhold(irb);
+    for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
+        if (IRE_IS_CONDEMNED(ire1) ||
+            !(ire1->ire_flags & RTF_MULTIRT))
+            continue;
+
+        /* Note: When IPv6 uses radix tree we don't need this check */
+        if (!IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, &ire1->ire_addr_v6))
+            continue;
+
+        /* Do the ire argument one after the loop */
+        if (ire1 == ire)
+            continue;
+
+        ill1 = ire_nexthop_ill(ire1);
+        if (ill1 == NULL) {
+            /*
+             * This ire might not have been picked by
+             * ire_route_recursive, in which case ire_dep might
+             * not have been setup yet.
+             * We kick ire_route_recursive to try to resolve
+             * starting at ire1.
+             */
+            ire_t *ire2;
+
+            ire2 = ire_route_recursive_impl_v6(ire1,
+                &ire1->ire_addr_v6, ire1->ire_type, ire1->ire_ill,
+                ire1->ire_zoneid, NULL, MATCH_IRE_DSTONLY,
+                B_TRUE, 0, ipst, NULL, NULL, NULL);
+            if (ire2 != NULL)
+                ire_refrele(ire2);
+            ill1 = ire_nexthop_ill(ire1);
+        }
+        if (ill1 == NULL) {
+            BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
+            ip_drop_output("ipIfStatsOutDiscards - no ill",
+                mp, ill);
+            error = ENETUNREACH;
+            continue;
+        }
+        /* Pick the addr and type to use for ndp_nce_init */
+        if (nce->nce_common->ncec_flags & NCE_F_MCAST) {
+            ire_type = IRE_MULTICAST;
+            nexthop = ip6h->ip6_dst;
+        } else {
+            ire_type = ire1->ire_type; /* Doesn't matter */
+            nexthop = ire1->ire_gateway_addr_v6;
+        }
+
+        /* If IPMP meta or under, then we just drop */
+        if (ill1->ill_grp != NULL) {
+            BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards);
+            ip_drop_output("ipIfStatsOutDiscards - IPMP",
+                mp, ill1);
+            ill_refrele(ill1);
+            error = ENETUNREACH;
+            continue;
+        }
+
+        nce1 = ndp_nce_init(ill1, &nexthop, ire_type);
+        if (nce1 == NULL) {
+            BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards);
+            ip_drop_output("ipIfStatsOutDiscards - no nce",
+                mp, ill1);
+            ill_refrele(ill1);
+            error = ENOBUFS;
+            continue;
+        }
+        mp1 = copymsg(mp);
+        if (mp1 == NULL) {
+            BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards);
+            ip_drop_output("ipIfStatsOutDiscards", mp, ill1);
+            nce_refrele(nce1);
+            ill_refrele(ill1);
+            error = ENOBUFS;
+            continue;
+        }
+        /* Preserve HW checksum for this copy */
+        DB_CKSUMSTART(mp1) = DB_CKSUMSTART(mp);
+        DB_CKSUMSTUFF(mp1) = DB_CKSUMSTUFF(mp);
+        DB_CKSUMEND(mp1) = DB_CKSUMEND(mp);
+        DB_CKSUMFLAGS(mp1) = DB_CKSUMFLAGS(mp);
+        DB_LSOMSS(mp1) = DB_LSOMSS(mp);
+
+        ire1->ire_ob_pkt_count++;
+        err = ip_xmit(mp1, nce1, ixaflags, pkt_len, xmit_hint, szone,
+            0, ixacookie);
+        if (err == 0)
+            num_sent++;
+        else
+            error = err;
+        nce_refrele(nce1);
+        ill_refrele(ill1);
+    }
+    irb_refrele(irb);
+    ire_refrele(ire);
+    /* Finally, the main one */
+    err = ip_xmit(mp, nce, ixaflags, pkt_len, xmit_hint, szone, 0,
+        ixacookie);
+    if (err == 0)
+        num_sent++;
+    else
+        error = err;
+    if (num_sent > 0)
+        return (0);
+    else
+        return (error);
+}
diff --git a/usr/src/uts/common/inet/ip/ip6_rts.c b/usr/src/uts/common/inet/ip/ip6_rts.c
index dcf429c8ba..38b43cdf60 100644
--- a/usr/src/uts/common/inet/ip/ip6_rts.c
+++ b/usr/src/uts/common/inet/ip/ip6_rts.c
@@ -80,8 +80,8 @@ void
rts_fill_msg_v6(int type, int rtm_addrs, const in6_addr_t *dst,
const in6_addr_t *mask, const in6_addr_t *gateway,
const in6_addr_t *src_addr, const in6_addr_t *brd_addr,
- const in6_addr_t *author, const ipif_t *ipif, mblk_t *mp,
- uint_t sacnt, const tsol_gc_t *gc)
+ const in6_addr_t *author, const in6_addr_t *ifaddr, const ill_t *ill,
+ mblk_t *mp, const tsol_gc_t *gc)
{
rt_msghdr_t *rtm;
sin6_t *sin6;
@@ -90,7 +90,6 @@ rts_fill_msg_v6(int type, int rtm_addrs, const in6_addr_t *dst,
int i;
ASSERT(mp != NULL);
- ASSERT(sacnt == 0 || gc != NULL);
/*
* First find the type of the message
* and its length.
@@ -100,7 +99,7 @@ rts_fill_msg_v6(int type, int rtm_addrs, const in6_addr_t *dst,
* Now find the size of the data
* that follows the message header.
*/
- data_size = rts_data_msg_size(rtm_addrs, AF_INET6, sacnt);
+ data_size = rts_data_msg_size(rtm_addrs, AF_INET6, gc != NULL ? 1 : 0);
rtm = (rt_msghdr_t *)mp->b_rptr;
mp->b_wptr = &mp->b_rptr[header_size];
@@ -125,13 +124,17 @@ rts_fill_msg_v6(int type, int rtm_addrs, const in6_addr_t *dst,
cp += sizeof (sin6_t);
break;
case RTA_IFA:
+ sin6->sin6_addr = *ifaddr;
+ sin6->sin6_family = AF_INET6;
+ cp += sizeof (sin6_t);
+ break;
case RTA_SRC:
sin6->sin6_addr = *src_addr;
sin6->sin6_family = AF_INET6;
cp += sizeof (sin6_t);
break;
case RTA_IFP:
- cp += ill_dls_info((struct sockaddr_dl *)cp, ipif);
+ cp += ill_dls_info((struct sockaddr_dl *)cp, ill);
break;
case RTA_AUTHOR:
sin6->sin6_addr = *author;
@@ -154,24 +157,20 @@ rts_fill_msg_v6(int type, int rtm_addrs, const in6_addr_t *dst,
rtm_ext_t *rtm_ext;
struct rtsa_s *rp_dst;
tsol_rtsecattr_t *rsap;
- int i;
ASSERT(gc->gc_grp != NULL);
ASSERT(RW_LOCK_HELD(&gc->gc_grp->gcgrp_rwlock));
- ASSERT(sacnt > 0);
rtm_ext = (rtm_ext_t *)cp;
rtm_ext->rtmex_type = RTMEX_GATEWAY_SECATTR;
- rtm_ext->rtmex_len = TSOL_RTSECATTR_SIZE(sacnt);
+ rtm_ext->rtmex_len = TSOL_RTSECATTR_SIZE(1);
rsap = (tsol_rtsecattr_t *)(rtm_ext + 1);
- rsap->rtsa_cnt = sacnt;
+ rsap->rtsa_cnt = 1;
rp_dst = rsap->rtsa_attr;
- for (i = 0; i < sacnt; i++, gc = gc->gc_next, rp_dst++) {
- ASSERT(gc->gc_db != NULL);
- bcopy(&gc->gc_db->gcdb_attr, rp_dst, sizeof (*rp_dst));
- }
+ ASSERT(gc->gc_db != NULL);
+ bcopy(&gc->gc_db->gcdb_attr, rp_dst, sizeof (*rp_dst));
cp = (uchar_t *)rp_dst;
}
@@ -208,7 +207,7 @@ ip_rts_change_v6(int type, const in6_addr_t *dst_addr,
if (mp == NULL)
return;
rts_fill_msg_v6(type, rtm_addrs, dst_addr, net_mask, gw_addr, source,
- &ipv6_all_zeros, author, NULL, mp, 0, NULL);
+ &ipv6_all_zeros, &ipv6_all_zeros, author, NULL, mp, NULL);
rtm = (rt_msghdr_t *)mp->b_rptr;
rtm->rtm_flags = flags;
rtm->rtm_errno = error;
diff --git a/usr/src/uts/common/inet/ip/ip_arp.c b/usr/src/uts/common/inet/ip/ip_arp.c
new file mode 100644
index 0000000000..489d59dbf6
--- /dev/null
+++ b/usr/src/uts/common/inet/ip/ip_arp.c
@@ -0,0 +1,2468 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <inet/ip_arp.h>
+#include <inet/ip_ndp.h>
+#include <net/if_arp.h>
+#include <netinet/if_ether.h>
+#include <sys/strsubr.h>
+#include <inet/ip6.h>
+#include <inet/ip.h>
+#include <inet/ip_ire.h>
+#include <inet/ip_if.h>
+#include <sys/dlpi.h>
+#include <sys/sunddi.h>
+#include <sys/strsun.h>
+#include <sys/sdt.h>
+#include <inet/mi.h>
+#include <inet/arp.h>
+#include <inet/ipdrop.h>
+#include <sys/sockio.h>
+#include <inet/ip_impl.h>
+#include <sys/policy.h>
+
+#define ARL_LL_ADDR_OFFSET(arl) (((arl)->arl_sap_length) < 0 ? \
+ (sizeof (dl_unitdata_req_t)) : \
+ ((sizeof (dl_unitdata_req_t)) + (ABS((arl)->arl_sap_length))))
+
+/*
+ * MAC-specific intelligence. Shouldn't be needed, but the DL_INFO_ACK
+ * doesn't quite do it for us.
+ */
+typedef struct arp_m_s {
+ t_uscalar_t arp_mac_type;
+ uint32_t arp_mac_arp_hw_type;
+ t_scalar_t arp_mac_sap_length;
+ uint32_t arp_mac_hw_addr_length;
+} arp_m_t;
+
+static int arp_close(queue_t *, int);
+static void arp_rput(queue_t *, mblk_t *);
+static void arp_wput(queue_t *, mblk_t *);
+static arp_m_t *arp_m_lookup(t_uscalar_t mac_type);
+static void arp_notify(ipaddr_t, mblk_t *, uint32_t, ip_recv_attr_t *,
+ ncec_t *);
+static int arp_output(ill_t *, uint32_t, const uchar_t *, const uchar_t *,
+ const uchar_t *, const uchar_t *, uchar_t *);
+static int arp_modclose(arl_t *);
+static void arp_mod_close_tail(arl_t *);
+static mblk_t *arl_unbind(arl_t *);
+static void arp_process_packet(ill_t *, mblk_t *);
+static void arp_excl(ipsq_t *, queue_t *, mblk_t *, void *);
+static void arp_drop_packet(const char *str, mblk_t *, ill_t *);
+static int arp_open(queue_t *, dev_t *, int, int, cred_t *);
+static int ip_sioctl_ifunitsel_arp(queue_t *, int *);
+static int ip_sioctl_slifname_arp(queue_t *, void *);
+static void arp_dlpi_send(arl_t *, mblk_t *);
+static void arl_defaults_common(arl_t *, mblk_t *);
+static int arp_modopen(queue_t *, dev_t *, int, int, cred_t *);
+static void arp_ifname_notify(arl_t *);
+static void arp_rput_dlpi_writer(ipsq_t *, queue_t *, mblk_t *, void *);
+static arl_t *ill_to_arl(ill_t *);
+
+#define DL_PRIM(mp) (((union DL_primitives *)(mp)->b_rptr)->dl_primitive)
+#define IS_DLPI_DATA(mp) \
+ ((DB_TYPE(mp) == M_PROTO) && \
+ MBLKL(mp) >= sizeof (dl_unitdata_ind_t) && \
+ (DL_PRIM(mp) == DL_UNITDATA_IND))
+
+#define AR_NOTFOUND 1 /* No matching ace found in cache */
+#define AR_MERGED 2 /* Matching ace updated (RFC 826 Merge_flag) */
+#define AR_LOOPBACK 3 /* Our own arp packet was received */
+#define AR_BOGON 4 /* Another host has our IP addr. */
+#define AR_FAILED 5 /* Duplicate Address Detection has failed */
+#define AR_CHANGED 6 /* Address has changed; tell IP (and merged) */
+
+boolean_t arp_no_defense;
+
+struct module_info arp_mod_info = {
+ IP_MOD_ID, "arpip", 1, INFPSZ, 65536, 1024
+};
+static struct qinit rinit_arp = {
+ (pfi_t)arp_rput, NULL, arp_open, arp_close, NULL, &arp_mod_info
+};
+static struct qinit winit_arp = {
+ (pfi_t)arp_wput, NULL, arp_open, arp_close, NULL,
+ &arp_mod_info
+};
+struct streamtab arpinfo = {
+ &rinit_arp, &winit_arp
+};
+#define ARH_FIXED_LEN 8
+#define AR_LL_HDR_SLACK 32
+
+/*
+ * pfhooks for ARP.
+ */
+#define ARP_HOOK_IN(_hook, _event, _ilp, _hdr, _fm, _m, ipst) \
+ \
+ if ((_hook).he_interested) { \
+ hook_pkt_event_t info; \
+ \
+ info.hpe_protocol = ipst->ips_arp_net_data; \
+ info.hpe_ifp = _ilp; \
+ info.hpe_ofp = 0; \
+ info.hpe_hdr = _hdr; \
+ info.hpe_mp = &(_fm); \
+ info.hpe_mb = _m; \
+ if (hook_run(ipst->ips_arp_net_data->netd_hooks, \
+ _event, (hook_data_t)&info) != 0) { \
+ if (_fm != NULL) { \
+ freemsg(_fm); \
+ _fm = NULL; \
+ } \
+ _hdr = NULL; \
+ _m = NULL; \
+ } else { \
+ _hdr = info.hpe_hdr; \
+ _m = info.hpe_mb; \
+ } \
+ }
+
+#define ARP_HOOK_OUT(_hook, _event, _olp, _hdr, _fm, _m, ipst) \
+ \
+ if ((_hook).he_interested) { \
+ hook_pkt_event_t info; \
+ \
+ info.hpe_protocol = ipst->ips_arp_net_data; \
+ info.hpe_ifp = 0; \
+ info.hpe_ofp = _olp; \
+ info.hpe_hdr = _hdr; \
+ info.hpe_mp = &(_fm); \
+ info.hpe_mb = _m; \
+ if (hook_run(ipst->ips_arp_net_data->netd_hooks, \
+ _event, (hook_data_t)&info) != 0) { \
+ if (_fm != NULL) { \
+ freemsg(_fm); \
+ _fm = NULL; \
+ } \
+ _hdr = NULL; \
+ _m = NULL; \
+ } else { \
+ _hdr = info.hpe_hdr; \
+ _m = info.hpe_mb; \
+ } \
+ }
+
+static arp_m_t arp_m_tbl[] = {
+ { DL_CSMACD, ARPHRD_ETHER, -2, 6}, /* 802.3 */
+ { DL_TPB, ARPHRD_IEEE802, -2, 6}, /* 802.4 */
+ { DL_TPR, ARPHRD_IEEE802, -2, 6}, /* 802.5 */
+ { DL_METRO, ARPHRD_IEEE802, -2, 6}, /* 802.6 */
+ { DL_ETHER, ARPHRD_ETHER, -2, 6}, /* Ethernet */
+ { DL_FDDI, ARPHRD_ETHER, -2, 6}, /* FDDI */
+ { DL_IB, ARPHRD_IB, -2, 20}, /* Infiniband */
+ { DL_OTHER, ARPHRD_ETHER, -2, 6} /* unknown */
+};
+
+static void
+arl_refhold_locked(arl_t *arl)
+{
+ ASSERT(MUTEX_HELD(&arl->arl_lock));
+ arl->arl_refcnt++;
+ ASSERT(arl->arl_refcnt != 0);
+}
+
+static void
+arl_refrele(arl_t *arl)
+{
+ mutex_enter(&arl->arl_lock);
+ ASSERT(arl->arl_refcnt != 0);
+ arl->arl_refcnt--;
+ if (arl->arl_refcnt > 1) {
+ mutex_exit(&arl->arl_lock);
+ return;
+ }
+
+ /* ill_close or arp_unbind_complete may be waiting */
+ cv_broadcast(&arl->arl_cv);
+ mutex_exit(&arl->arl_lock);
+}
+
+/*
+ * wake up any pending ip ioctls.
+ */
+static void
+arp_cmd_done(ill_t *ill, int err, t_uscalar_t lastprim)
+{
+ if (lastprim == DL_UNBIND_REQ && ill->ill_replumbing)
+ arp_replumb_done(ill, 0);
+ else
+ arp_bringup_done(ill, err);
+}
+
+static int
+ip_nce_resolve_all(ill_t *ill, uchar_t *src_haddr, uint32_t hlen,
+ const in_addr_t *src_paddr, ncec_t **sncec, int op)
+{
+ int retv;
+ ncec_t *ncec;
+ boolean_t ll_changed;
+ uchar_t *lladdr = NULL;
+ int new_state;
+
+ ASSERT(ill != NULL);
+
+ ncec = ncec_lookup_illgrp_v4(ill, src_paddr);
+ *sncec = ncec;
+
+ if (ncec == NULL) {
+ retv = AR_NOTFOUND;
+ goto done;
+ }
+
+ mutex_enter(&ncec->ncec_lock);
+ /*
+ * IP addr and hardware address match what we already
+ * have, then this is a broadcast packet emitted by one of our
+ * interfaces, reflected by the switch and received on another
+ * interface. We return AR_LOOPBACK.
+ */
+ lladdr = ncec->ncec_lladdr;
+ if (NCE_MYADDR(ncec) && hlen == ncec->ncec_ill->ill_phys_addr_length &&
+ bcmp(lladdr, src_haddr, hlen) == 0) {
+ mutex_exit(&ncec->ncec_lock);
+ retv = AR_LOOPBACK;
+ goto done;
+ }
+ /*
+ * If the entry is unverified, then we've just verified that
+ * someone else already owns this address, because this is a
+ * message with the same protocol address but different
+ * hardware address.
+ */
+ if (ncec->ncec_flags & NCE_F_UNVERIFIED) {
+ mutex_exit(&ncec->ncec_lock);
+ ncec_delete(ncec);
+ ncec_refrele(ncec);
+ *sncec = NULL;
+ retv = AR_FAILED;
+ goto done;
+ }
+
+ /*
+ * If the IP address matches ours and we're authoritative for
+ * this entry, then some other node is using our IP addr, so
+ * return AR_BOGON. Also reset the transmit count to zero so
+ * that, if we're currently in initial announcement mode, we
+ * switch back to the lazier defense mode. Knowing that
+ * there's at least one duplicate out there, we ought not
+ * blindly announce.
+ *
+ * NCE_F_AUTHORITY is set in one of two ways:
+ * 1. /sbin/arp told us so, via the "permanent" flag.
+ * 2. This is one of my addresses.
+ */
+ if (ncec->ncec_flags & NCE_F_AUTHORITY) {
+ ncec->ncec_unsolicit_count = 0;
+ mutex_exit(&ncec->ncec_lock);
+ retv = AR_BOGON;
+ goto done;
+ }
+
+ /*
+ * No address conflict was detected, and we are getting
+ * ready to update the ncec's hwaddr. The nce MUST NOT be on an
+ * under interface, because all dynamic nce's are created on the
+ * native interface (in the non-IPMP case) or on the IPMP
+ * meta-interface (in the IPMP case)
+ */
+ ASSERT(!IS_UNDER_IPMP(ncec->ncec_ill));
+
+ /*
+ * update ncec with src_haddr, hlen.
+ *
+ * We are trying to resolve this ncec_addr/src_paddr and we
+ * got a REQUEST/RESPONSE from the ncec_addr/src_paddr.
+ * So the new_state is at least "STALE". If, in addition,
+ * this a solicited, unicast ARP_RESPONSE, we can transition
+ * to REACHABLE.
+ */
+ new_state = ND_STALE;
+ ip1dbg(("got info for ncec %p from addr %x\n",
+ (void *)ncec, *src_paddr));
+ retv = AR_MERGED;
+ if (ncec->ncec_state == ND_INCOMPLETE ||
+ ncec->ncec_state == ND_INITIAL) {
+ ll_changed = B_TRUE;
+ } else {
+ ll_changed = nce_cmp_ll_addr(ncec, src_haddr, hlen);
+ if (!ll_changed)
+ new_state = ND_UNCHANGED;
+ else
+ retv = AR_CHANGED;
+ }
+ /*
+ * We don't have the equivalent of the IPv6 'S' flag indicating
+ * a solicited response, so we assume that if we are in
+ * INCOMPLETE, or got back an unchanged lladdr in PROBE state,
+ * and this is an ARP_RESPONSE, it must be a
+ * solicited response allowing us to transtion to REACHABLE.
+ */
+ if (op == ARP_RESPONSE) {
+ switch (ncec->ncec_state) {
+ case ND_PROBE:
+ new_state = (ll_changed ? ND_STALE : ND_REACHABLE);
+ break;
+ case ND_INCOMPLETE:
+ new_state = ND_REACHABLE;
+ break;
+ }
+ }
+ /*
+ * Call nce_update() to refresh fastpath information on any
+ * dependent nce_t entries.
+ */
+ nce_update(ncec, new_state, (ll_changed ? src_haddr : NULL));
+ mutex_exit(&ncec->ncec_lock);
+ nce_resolv_ok(ncec);
+done:
+ return (retv);
+}
+
+/* Find an entry for a particular MAC type in the arp_m_tbl. */
+static arp_m_t *
+arp_m_lookup(t_uscalar_t mac_type)
+{
+ arp_m_t *arm;
+
+ for (arm = arp_m_tbl; arm < A_END(arp_m_tbl); arm++) {
+ if (arm->arp_mac_type == mac_type)
+ return (arm);
+ }
+ return (NULL);
+}
+
+static uint32_t
+arp_hw_type(t_uscalar_t mactype)
+{
+ arp_m_t *arm;
+
+ if ((arm = arp_m_lookup(mactype)) == NULL)
+ arm = arp_m_lookup(DL_OTHER);
+ return (arm->arp_mac_arp_hw_type);
+}
+
+/*
+ * Called when an DLPI control message has been acked; send down the next
+ * queued message (if any).
+ * The DLPI messages of interest being bind, attach and unbind since
+ * these are the only ones sent by ARP via arp_dlpi_send.
+ */
+static void
+arp_dlpi_done(arl_t *arl, ill_t *ill)
+{
+ mblk_t *mp;
+ int err;
+ t_uscalar_t prim;
+
+ mutex_enter(&arl->arl_lock);
+ prim = arl->arl_dlpi_pending;
+
+ if ((mp = arl->arl_dlpi_deferred) == NULL) {
+ arl->arl_dlpi_pending = DL_PRIM_INVAL;
+ if (arl->arl_state_flags & ARL_LL_DOWN)
+ err = ENETDOWN;
+ else
+ err = 0;
+ mutex_exit(&arl->arl_lock);
+
+ mutex_enter(&ill->ill_lock);
+ ill->ill_arl_dlpi_pending = 0;
+ mutex_exit(&ill->ill_lock);
+ arp_cmd_done(ill, err, prim);
+ return;
+ }
+
+ arl->arl_dlpi_deferred = mp->b_next;
+ mp->b_next = NULL;
+
+ ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
+
+ arl->arl_dlpi_pending = DL_PRIM(mp);
+ mutex_exit(&arl->arl_lock);
+
+ mutex_enter(&ill->ill_lock);
+ ill->ill_arl_dlpi_pending = 1;
+ mutex_exit(&ill->ill_lock);
+
+ putnext(arl->arl_wq, mp);
+}
+
+/*
+ * This routine is called during module initialization when the DL_INFO_ACK
+ * comes back from the device. We set up defaults for all the device dependent
+ * doo-dads we are going to need. This will leave us ready to roll if we are
+ * attempting auto-configuration. Alternatively, these defaults can be
+ * overridden by initialization procedures possessing higher intelligence.
+ *
+ * Caller will free the mp.
+ */
+static void
+arp_ll_set_defaults(arl_t *arl, mblk_t *mp)
+{
+ arp_m_t *arm;
+ dl_info_ack_t *dlia = (dl_info_ack_t *)mp->b_rptr;
+
+ if ((arm = arp_m_lookup(dlia->dl_mac_type)) == NULL)
+ arm = arp_m_lookup(DL_OTHER);
+ ASSERT(arm != NULL);
+
+ /*
+ * We initialize based on parameters in the (currently) not too
+ * exhaustive arp_m_tbl.
+ */
+ if (dlia->dl_version == DL_VERSION_2) {
+ arl->arl_sap_length = dlia->dl_sap_length;
+ arl->arl_phys_addr_length = dlia->dl_brdcst_addr_length;
+ if (dlia->dl_provider_style == DL_STYLE2)
+ arl->arl_needs_attach = 1;
+ } else {
+ arl->arl_sap_length = arm->arp_mac_sap_length;
+ arl->arl_phys_addr_length = arm->arp_mac_hw_addr_length;
+ }
+ /*
+ * Note: the arp_hw_type in the arp header may be derived from
+ * the ill_mac_type and arp_m_lookup().
+ */
+ arl->arl_sap = ETHERTYPE_ARP;
+ arl_defaults_common(arl, mp);
+}
+
+static void
+arp_wput(queue_t *q, mblk_t *mp)
+{
+ int err = EINVAL;
+ struct iocblk *ioc;
+ mblk_t *mp1;
+
+ switch (DB_TYPE(mp)) {
+ case M_IOCTL:
+ ASSERT(q->q_next != NULL);
+ ioc = (struct iocblk *)mp->b_rptr;
+ if (ioc->ioc_cmd != SIOCSLIFNAME &&
+ ioc->ioc_cmd != IF_UNITSEL) {
+ DTRACE_PROBE4(arl__dlpi, char *, "arp_wput",
+ char *, "<some ioctl>", char *, "-",
+ arl_t *, (arl_t *)q->q_ptr);
+ putnext(q, mp);
+ return;
+ }
+ if ((mp1 = mp->b_cont) == 0)
+ err = EINVAL;
+ else if (ioc->ioc_cmd == SIOCSLIFNAME)
+ err = ip_sioctl_slifname_arp(q, mp1->b_rptr);
+ else if (ioc->ioc_cmd == IF_UNITSEL)
+ err = ip_sioctl_ifunitsel_arp(q, (int *)mp1->b_rptr);
+ if (err == 0)
+ miocack(q, mp, 0, 0);
+ else
+ miocnak(q, mp, 0, err);
+ return;
+ default:
+ DTRACE_PROBE4(arl__dlpi, char *, "arp_wput default",
+ char *, "default mblk", char *, "-",
+ arl_t *, (arl_t *)q->q_ptr);
+ putnext(q, mp);
+ return;
+ }
+}
+
+/*
+ * similar to ill_dlpi_pending(): verify that the received DLPI response
+ * matches the one that is pending for the arl.
+ */
+static boolean_t
+arl_dlpi_pending(arl_t *arl, t_uscalar_t prim)
+{
+ t_uscalar_t pending;
+
+ mutex_enter(&arl->arl_lock);
+ if (arl->arl_dlpi_pending == prim) {
+ mutex_exit(&arl->arl_lock);
+ return (B_TRUE);
+ }
+
+ if (arl->arl_state_flags & ARL_CONDEMNED) {
+ mutex_exit(&arl->arl_lock);
+ return (B_FALSE);
+ }
+ pending = arl->arl_dlpi_pending;
+ mutex_exit(&arl->arl_lock);
+
+ if (pending == DL_PRIM_INVAL) {
+ ip0dbg(("arl_dlpi_pending unsolicited ack for %s on %s",
+ dl_primstr(prim), arl->arl_name));
+ } else {
+ ip0dbg(("arl_dlpi_pending ack for %s on %s expect %s",
+ dl_primstr(prim), arl->arl_name, dl_primstr(pending)));
+ }
+ return (B_FALSE);
+}
+
+/* DLPI messages, other than DL_UNITDATA_IND are handled here. */
+static void
+arp_rput_dlpi(queue_t *q, mblk_t *mp)
+{
+ arl_t *arl = (arl_t *)q->q_ptr;
+ union DL_primitives *dlp;
+ t_uscalar_t prim;
+ t_uscalar_t reqprim = DL_PRIM_INVAL;
+ ill_t *ill;
+
+ if ((mp->b_wptr - mp->b_rptr) < sizeof (dlp->dl_primitive)) {
+ putnext(q, mp);
+ return;
+ }
+ dlp = (union DL_primitives *)mp->b_rptr;
+ prim = dlp->dl_primitive;
+
+ /*
+ * If we received an ACK but didn't send a request for it, then it
+ * can't be part of any pending operation; discard up-front.
+ */
+ switch (prim) {
+ case DL_ERROR_ACK:
+ /*
+ * ce is confused about how DLPI works, so we have to interpret
+ * an "error" on DL_NOTIFY_ACK (which we never could have sent)
+ * as really meaning an error on DL_NOTIFY_REQ.
+ *
+ * Note that supporting DL_NOTIFY_REQ is optional, so printing
+ * out an error message on the console isn't warranted except
+ * for debug.
+ */
+ if (dlp->error_ack.dl_error_primitive == DL_NOTIFY_ACK ||
+ dlp->error_ack.dl_error_primitive == DL_NOTIFY_REQ) {
+ reqprim = DL_NOTIFY_REQ;
+ } else {
+ reqprim = dlp->error_ack.dl_error_primitive;
+ }
+ break;
+ case DL_INFO_ACK:
+ reqprim = DL_INFO_REQ;
+ break;
+ case DL_OK_ACK:
+ reqprim = dlp->ok_ack.dl_correct_primitive;
+ break;
+ case DL_BIND_ACK:
+ reqprim = DL_BIND_REQ;
+ break;
+ default:
+ DTRACE_PROBE2(rput_dl_badprim, arl_t *, arl,
+ union DL_primitives *, dlp);
+ putnext(q, mp);
+ return;
+ }
+ if (reqprim == DL_PRIM_INVAL || !arl_dlpi_pending(arl, reqprim)) {
+ freemsg(mp);
+ return;
+ }
+ DTRACE_PROBE4(arl__dlpi, char *, "arp_rput_dlpi received",
+ char *, dl_primstr(prim), char *, dl_primstr(reqprim),
+ arl_t *, arl);
+
+ ASSERT(prim != DL_NOTIFY_IND);
+
+ ill = arl_to_ill(arl);
+
+ switch (reqprim) {
+ case DL_INFO_REQ:
+ /*
+ * ill has not been set up yet for this case. This is the
+ * DL_INFO_ACK for the first DL_INFO_REQ sent from
+ * arp_modopen(). There should be no other arl_dlpi_deferred
+ * messages pending. We initialize the arl here.
+ */
+ ASSERT(!arl->arl_dlpi_style_set);
+ ASSERT(arl->arl_dlpi_pending == DL_INFO_REQ);
+ ASSERT(arl->arl_dlpi_deferred == NULL);
+ arl->arl_dlpi_pending = DL_PRIM_INVAL;
+ arp_ll_set_defaults(arl, mp);
+ freemsg(mp);
+ return;
+ case DL_UNBIND_REQ:
+ mutex_enter(&arl->arl_lock);
+ arl->arl_state_flags &= ~ARL_DL_UNBIND_IN_PROGRESS;
+ /*
+ * This is not an error, so we don't set ARL_LL_DOWN
+ */
+ arl->arl_state_flags &= ~ARL_LL_UP;
+ arl->arl_state_flags |= ARL_LL_UNBOUND;
+ if (arl->arl_state_flags & ARL_CONDEMNED) {
+ /*
+ * if this is part of the unplumb the arl may
+ * vaporize any moment after we cv_signal the
+ * arl_cv so we reset arl_dlpi_pending here.
+ * All other cases (including replumb) will
+ * have the arl_dlpi_pending reset in
+ * arp_dlpi_done.
+ */
+ arl->arl_dlpi_pending = DL_PRIM_INVAL;
+ }
+ cv_signal(&arl->arl_cv);
+ mutex_exit(&arl->arl_lock);
+ break;
+ }
+ if (ill != NULL) {
+ /*
+ * ill ref obtained by arl_to_ill() will be released
+ * by qwriter_ip()
+ */
+ qwriter_ip(ill, ill->ill_wq, mp, arp_rput_dlpi_writer,
+ CUR_OP, B_TRUE);
+ return;
+ }
+ freemsg(mp);
+}
+
+/*
+ * Handling of DLPI messages that require exclusive access to the ipsq.
+ */
+/* ARGSUSED */
+static void
+arp_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
+{
+ union DL_primitives *dlp = (union DL_primitives *)mp->b_rptr;
+ ill_t *ill = (ill_t *)q->q_ptr;
+ arl_t *arl = ill_to_arl(ill);
+
+ if (arl == NULL) {
+ /*
+ * happens as a result arp_modclose triggering unbind.
+ * arp_rput_dlpi will cv_signal the arl_cv and the modclose
+ * will complete, but when it does ipsq_exit, the waiting
+ * qwriter_ip gets into the ipsq but will find the arl null.
+ * There should be no deferred messages in this case, so
+ * just complete and exit.
+ */
+ arp_cmd_done(ill, 0, DL_UNBIND_REQ);
+ freemsg(mp);
+ return;
+ }
+ switch (dlp->dl_primitive) {
+ case DL_ERROR_ACK:
+ switch (dlp->error_ack.dl_error_primitive) {
+ case DL_UNBIND_REQ:
+ mutex_enter(&arl->arl_lock);
+ arl->arl_state_flags &= ~ARL_DL_UNBIND_IN_PROGRESS;
+ arl->arl_state_flags &= ~ARL_LL_UP;
+ arl->arl_state_flags |= ARL_LL_UNBOUND;
+ arl->arl_state_flags |= ARL_LL_DOWN;
+ cv_signal(&arl->arl_cv);
+ mutex_exit(&arl->arl_lock);
+ break;
+ case DL_BIND_REQ:
+ mutex_enter(&arl->arl_lock);
+ arl->arl_state_flags &= ~ARL_LL_UP;
+ arl->arl_state_flags |= ARL_LL_DOWN;
+ arl->arl_state_flags |= ARL_LL_UNBOUND;
+ cv_signal(&arl->arl_cv);
+ mutex_exit(&arl->arl_lock);
+ break;
+ case DL_ATTACH_REQ:
+ break;
+ default:
+ /* If it's anything else, we didn't send it. */
+ arl_refrele(arl);
+ putnext(q, mp);
+ return;
+ }
+ break;
+ case DL_OK_ACK:
+ DTRACE_PROBE4(arl__dlpi, char *, "arp_rput_dlpi_writer ok",
+ char *, dl_primstr(dlp->ok_ack.dl_correct_primitive),
+ char *, dl_primstr(dlp->ok_ack.dl_correct_primitive),
+ arl_t *, arl);
+ mutex_enter(&arl->arl_lock);
+ switch (dlp->ok_ack.dl_correct_primitive) {
+ case DL_UNBIND_REQ:
+ case DL_ATTACH_REQ:
+ break;
+ default:
+ ip0dbg(("Dropping unrecognized DL_OK_ACK for %s",
+ dl_primstr(dlp->ok_ack.dl_correct_primitive)));
+ mutex_exit(&arl->arl_lock);
+ arl_refrele(arl);
+ freemsg(mp);
+ return;
+ }
+ mutex_exit(&arl->arl_lock);
+ break;
+ case DL_BIND_ACK:
+ DTRACE_PROBE2(rput_dl_bind, arl_t *, arl,
+ dl_bind_ack_t *, &dlp->bind_ack);
+
+ mutex_enter(&arl->arl_lock);
+ ASSERT(arl->arl_state_flags & ARL_LL_BIND_PENDING);
+ arl->arl_state_flags &=
+ ~(ARL_LL_BIND_PENDING|ARL_LL_DOWN|ARL_LL_UNBOUND);
+ arl->arl_state_flags |= ARL_LL_UP;
+ mutex_exit(&arl->arl_lock);
+ break;
+ case DL_UDERROR_IND:
+ DTRACE_PROBE2(rput_dl_uderror, arl_t *, arl,
+ dl_uderror_ind_t *, &dlp->uderror_ind);
+ arl_refrele(arl);
+ putnext(q, mp);
+ return;
+ default:
+ DTRACE_PROBE2(rput_dl_badprim, arl_t *, arl,
+ union DL_primitives *, dlp);
+ arl_refrele(arl);
+ putnext(q, mp);
+ return;
+ }
+ arp_dlpi_done(arl, ill);
+ arl_refrele(arl);
+ freemsg(mp);
+}
+
+void
+arp_rput(queue_t *q, mblk_t *mp)
+{
+ arl_t *arl = q->q_ptr;
+ boolean_t need_refrele = B_FALSE;
+
+ mutex_enter(&arl->arl_lock);
+ if (((arl->arl_state_flags &
+ (ARL_CONDEMNED | ARL_LL_REPLUMBING)) != 0)) {
+ /*
+ * Only allow high priority DLPI messages during unplumb or
+ * replumb, and we don't take an arl_refcnt for that case.
+ */
+ if (DB_TYPE(mp) != M_PCPROTO) {
+ mutex_exit(&arl->arl_lock);
+ freemsg(mp);
+ return;
+ }
+ } else {
+ arl_refhold_locked(arl);
+ need_refrele = B_TRUE;
+ }
+ mutex_exit(&arl->arl_lock);
+
+ switch (DB_TYPE(mp)) {
+ case M_PCPROTO:
+ case M_PROTO: {
+ ill_t *ill;
+
+ /*
+ * could be one of
+ * (i) real message from the wire, (DLPI_DATA)
+ * (ii) DLPI message
+ * Take a ref on the ill associated with this arl to
+ * prevent the ill from being unplumbed until this thread
+ * is done.
+ */
+ if (IS_DLPI_DATA(mp)) {
+ ill = arl_to_ill(arl);
+ if (ill == NULL) {
+ arp_drop_packet("No ill", mp, ill);
+ break;
+ }
+ arp_process_packet(ill, mp);
+ ill_refrele(ill);
+ break;
+ }
+ /* Miscellaneous DLPI messages get shuffled off. */
+ arp_rput_dlpi(q, mp);
+ break;
+ }
+ case M_ERROR:
+ case M_HANGUP:
+ if (mp->b_rptr < mp->b_wptr)
+ arl->arl_error = (int)(*mp->b_rptr & 0xFF);
+ if (arl->arl_error == 0)
+ arl->arl_error = ENXIO;
+ freemsg(mp);
+ break;
+ default:
+ ip1dbg(("arp_rput other db type %x\n", DB_TYPE(mp)));
+ putnext(q, mp);
+ break;
+ }
+ if (need_refrele)
+ arl_refrele(arl);
+}
+
+static void
+arp_process_packet(ill_t *ill, mblk_t *mp)
+{
+ mblk_t *mp1;
+ arh_t *arh;
+ in_addr_t src_paddr, dst_paddr;
+ uint32_t hlen, plen;
+ boolean_t is_probe;
+ int op;
+ ncec_t *dst_ncec, *src_ncec = NULL;
+ uchar_t *src_haddr, *arhp, *dst_haddr, *dp, *sp;
+ int err;
+ ip_stack_t *ipst;
+ boolean_t need_ill_refrele = B_FALSE;
+ nce_t *nce;
+ uchar_t *src_lladdr;
+ dl_unitdata_ind_t *dlui;
+ ip_recv_attr_t iras;
+
+ ASSERT(ill != NULL);
+ if (ill->ill_flags & ILLF_NOARP) {
+ arp_drop_packet("Interface does not support ARP", mp, ill);
+ return;
+ }
+ ipst = ill->ill_ipst;
+ /*
+ * What we should have at this point is a DL_UNITDATA_IND message
+ * followed by an ARP packet. We do some initial checks and then
+ * get to work.
+ */
+ dlui = (dl_unitdata_ind_t *)mp->b_rptr;
+ if (dlui->dl_group_address == 1) {
+ /*
+ * multicast or broadcast packet. Only accept on the ipmp
+ * nominated interface for multicasts ('cast_ill').
+ * If we have no cast_ill we are liberal and accept everything.
+ */
+ if (IS_UNDER_IPMP(ill)) {
+ /* For an under ill_grp can change under lock */
+ rw_enter(&ipst->ips_ill_g_lock, RW_READER);
+ if (!ill->ill_nom_cast && ill->ill_grp != NULL &&
+ ill->ill_grp->ig_cast_ill != NULL) {
+ rw_exit(&ipst->ips_ill_g_lock);
+ arp_drop_packet("Interface is not nominated "
+ "for multicast sends and receives",
+ mp, ill);
+ return;
+ }
+ rw_exit(&ipst->ips_ill_g_lock);
+ }
+ }
+ mp1 = mp->b_cont;
+ if (mp1 == NULL) {
+ arp_drop_packet("Missing ARP packet", mp, ill);
+ return;
+ }
+ if (mp1->b_cont != NULL) {
+ /* No fooling around with funny messages. */
+ if (!pullupmsg(mp1, -1)) {
+ arp_drop_packet("Funny message: pullup failed",
+ mp, ill);
+ return;
+ }
+ }
+ arh = (arh_t *)mp1->b_rptr;
+ hlen = arh->arh_hlen;
+ plen = arh->arh_plen;
+ if (MBLKL(mp1) < ARH_FIXED_LEN + 2 * hlen + 2 * plen) {
+ arp_drop_packet("mblk len too small", mp, ill);
+ return;
+ }
+ /*
+ * hlen 0 is used for RFC 1868 UnARP.
+ *
+ * Note that the rest of the code checks that hlen is what we expect
+ * for this hardware address type, so might as well discard packets
+ * here that don't match.
+ */
+ if ((hlen > 0 && hlen != ill->ill_phys_addr_length) || plen == 0) {
+ DTRACE_PROBE2(rput_bogus, ill_t *, ill, mblk_t *, mp1);
+ arp_drop_packet("Bogus hlen or plen", mp, ill);
+ return;
+ }
+ /*
+ * Historically, Solaris has been lenient about hardware type numbers.
+ * We should check here, but don't.
+ */
+ DTRACE_PROBE3(arp__physical__in__start, ill_t *, ill, arh_t *, arh,
+ mblk_t *, mp);
+ /*
+ * If ill is in an ipmp group, it will be the under ill. If we want
+ * to report the packet as coming up the IPMP interface, we should
+ * convert it to the ipmp ill.
+ */
+ ARP_HOOK_IN(ipst->ips_arp_physical_in_event, ipst->ips_arp_physical_in,
+ ill->ill_phyint->phyint_ifindex, arh, mp, mp1, ipst);
+ DTRACE_PROBE1(arp__physical__in__end, mblk_t *, mp);
+ if (mp == NULL)
+ return;
+ arhp = (uchar_t *)arh + ARH_FIXED_LEN;
+ src_haddr = arhp; /* ar$sha */
+ arhp += hlen;
+ bcopy(arhp, &src_paddr, IP_ADDR_LEN); /* ar$spa */
+ sp = arhp;
+ arhp += IP_ADDR_LEN;
+ dst_haddr = arhp; /* ar$dha */
+ arhp += hlen;
+ bcopy(arhp, &dst_paddr, IP_ADDR_LEN); /* ar$tpa */
+ dp = arhp;
+ op = BE16_TO_U16(arh->arh_operation);
+
+ DTRACE_PROBE2(ip__arp__input, (in_addr_t), src_paddr,
+ (in_addr_t), dst_paddr);
+
+ /* Determine if this is just a probe */
+ is_probe = (src_paddr == INADDR_ANY);
+
+ /*
+ * ira_ill is the only field used down the arp_notify path.
+ */
+ bzero(&iras, sizeof (iras));
+ iras.ira_ill = iras.ira_rill = ill;
+ /*
+ * RFC 826: first check if the <protocol, sender protocol address> is
+ * in the cache, if there is a sender protocol address. Note that this
+ * step also handles resolutions based on source.
+ */
+ /* Note: after here we need to freeb(mp) and freemsg(mp1) separately */
+ mp->b_cont = NULL;
+ if (is_probe) {
+ err = AR_NOTFOUND;
+ } else {
+ if (plen != 4) {
+ arp_drop_packet("bad protocol len", mp, ill);
+ return;
+ }
+ err = ip_nce_resolve_all(ill, src_haddr, hlen, &src_paddr,
+ &src_ncec, op);
+ switch (err) {
+ case AR_BOGON:
+ ASSERT(src_ncec != NULL);
+ arp_notify(src_paddr, mp1, AR_CN_BOGON,
+ &iras, src_ncec);
+ break;
+ case AR_FAILED:
+ arp_notify(src_paddr, mp1, AR_CN_FAILED, &iras,
+ src_ncec);
+ break;
+ case AR_LOOPBACK:
+ DTRACE_PROBE2(rput_loopback, ill_t *, ill, arh_t *,
+ arh);
+ freemsg(mp1);
+ break;
+ default:
+ goto update;
+ }
+ freemsg(mp);
+ if (src_ncec != NULL)
+ ncec_refrele(src_ncec);
+ return;
+ }
+update:
+ /*
+ * Now look up the destination address. By RFC 826, we ignore the
+ * packet at this step if the target isn't one of our addresses (i.e.,
+ * one we have been asked to PUBLISH). This is true even if the
+ * target is something we're trying to resolve and the packet
+ * is a response.
+ */
+ dst_ncec = ncec_lookup_illgrp_v4(ill, &dst_paddr);
+ if (dst_ncec == NULL || !NCE_PUBLISH(dst_ncec)) {
+ /*
+ * Let the client know if the source mapping has changed, even
+ * if the destination provides no useful information for the
+ * client.
+ */
+ if (err == AR_CHANGED) {
+ arp_notify(src_paddr, mp1, AR_CN_ANNOUNCE, &iras,
+ NULL);
+ freemsg(mp);
+ } else {
+ freemsg(mp);
+ arp_drop_packet("Target is not interesting", mp1, ill);
+ }
+ if (dst_ncec != NULL)
+ ncec_refrele(dst_ncec);
+ if (src_ncec != NULL)
+ ncec_refrele(src_ncec);
+ return;
+ }
+
+ if (dst_ncec->ncec_flags & NCE_F_UNVERIFIED) {
+ /*
+ * Check for a reflection. Some misbehaving bridges will
+ * reflect our own transmitted packets back to us.
+ */
+ ASSERT(NCE_PUBLISH(dst_ncec));
+ if (hlen != dst_ncec->ncec_ill->ill_phys_addr_length) {
+ ncec_refrele(dst_ncec);
+ if (src_ncec != NULL)
+ ncec_refrele(src_ncec);
+ freemsg(mp);
+ arp_drop_packet("bad arh_len", mp1, ill);
+ return;
+ }
+ if (!nce_cmp_ll_addr(dst_ncec, src_haddr, hlen)) {
+ DTRACE_PROBE3(rput_probe_reflected, ill_t *, ill,
+ arh_t *, arh, ncec_t *, dst_ncec);
+ ncec_refrele(dst_ncec);
+ if (src_ncec != NULL)
+ ncec_refrele(src_ncec);
+ freemsg(mp);
+ arp_drop_packet("Reflected probe", mp1, ill);
+ return;
+ }
+ /*
+ * Responses targeting our HW address that are not responses to
+ * our DAD probe must be ignored as they are related to requests
+ * sent before DAD was restarted.
+ */
+ if (op == ARP_RESPONSE &&
+ (nce_cmp_ll_addr(dst_ncec, dst_haddr, hlen) == 0)) {
+ ncec_refrele(dst_ncec);
+ if (src_ncec != NULL)
+ ncec_refrele(src_ncec);
+ freemsg(mp);
+ arp_drop_packet(
+ "Response to request that was sent before DAD",
+ mp1, ill);
+ return;
+ }
+ /*
+ * Responses targeted to HW addresses which are not ours but
+ * sent to our unverified proto address are also conflicts.
+ * These may be reported by a proxy rather than the interface
+ * with the conflicting address, dst_paddr is in conflict
+ * rather than src_paddr. To ensure IP can locate the correct
+ * ipif to take down, it is necessary to copy dst_paddr to
+ * the src_paddr field before sending it to IP. The same is
+ * required for probes, where src_paddr will be INADDR_ANY.
+ */
+ if (is_probe || op == ARP_RESPONSE) {
+ bcopy(dp, sp, plen);
+ arp_notify(src_paddr, mp1, AR_CN_FAILED, &iras,
+ NULL);
+ ncec_delete(dst_ncec);
+ } else if (err == AR_CHANGED) {
+ arp_notify(src_paddr, mp1, AR_CN_ANNOUNCE, &iras,
+ NULL);
+ } else {
+ DTRACE_PROBE3(rput_request_unverified,
+ ill_t *, ill, arh_t *, arh, ncec_t *, dst_ncec);
+ arp_drop_packet("Unverified request", mp1, ill);
+ }
+ freemsg(mp);
+ ncec_refrele(dst_ncec);
+ if (src_ncec != NULL)
+ ncec_refrele(src_ncec);
+ return;
+ }
+ /*
+ * If it's a request, then we reply to this, and if we think the
+ * sender's unknown, then we create an entry to avoid unnecessary ARPs.
+ * The design assumption is that someone ARPing us is likely to send us
+ * a packet soon, and that we'll want to reply to it.
+ */
+ if (op == ARP_REQUEST) {
+ const uchar_t *nce_hwaddr;
+ struct in_addr nce_paddr;
+ clock_t now;
+ ill_t *under_ill = ill;
+ boolean_t send_unicast = B_TRUE;
+
+ ASSERT(NCE_PUBLISH(dst_ncec));
+
+ if ((dst_ncec->ncec_flags & (NCE_F_BCAST|NCE_F_MCAST)) != 0) {
+ /*
+ * Ignore senders who are deliberately or accidentally
+ * confused.
+ */
+ goto bail;
+ }
+
+ if (!is_probe && err == AR_NOTFOUND) {
+ ASSERT(src_ncec == NULL);
+
+ if (IS_UNDER_IPMP(under_ill)) {
+ /*
+ * create the ncec for the sender on ipmp_ill.
+ * We pass in the ipmp_ill itself to avoid
+ * creating an nce_t on the under_ill.
+ */
+ ill = ipmp_ill_hold_ipmp_ill(under_ill);
+ if (ill == NULL)
+ ill = under_ill;
+ else
+ need_ill_refrele = B_TRUE;
+ }
+
+ err = nce_lookup_then_add_v4(ill, src_haddr, hlen,
+ &src_paddr, 0, ND_STALE, &nce);
+
+ switch (err) {
+ case 0:
+ case EEXIST:
+ ip1dbg(("added ncec %p in state %d ill %s\n",
+ (void *)src_ncec, src_ncec->ncec_state,
+ ill->ill_name));
+ src_ncec = nce->nce_common;
+ break;
+ default:
+ /*
+ * Either no memory, or the outgoing interface
+ * is in the process of down/unplumb. In the
+ * latter case, we will fail the send anyway,
+ * and in the former case, we should try to send
+ * the ARP response.
+ */
+ src_lladdr = src_haddr;
+ goto send_response;
+ }
+ ncec_refhold(src_ncec);
+ nce_refrele(nce);
+ /* set up cleanup interval on ncec */
+ }
+
+ /*
+ * This implements periodic address defense based on a modified
+ * version of the RFC 3927 requirements. Instead of sending a
+ * broadcasted reply every time, as demanded by the RFC, we
+ * send at most one broadcast reply per arp_broadcast_interval.
+ */
+ now = ddi_get_lbolt();
+ if ((now - dst_ncec->ncec_last_time_defended) >
+ MSEC_TO_TICK(ipst->ips_ipv4_dad_announce_interval)) {
+ dst_ncec->ncec_last_time_defended = now;
+ /*
+ * If this is one of the long-suffering entries,
+ * pull it out now. It no longer needs separate
+ * defense, because we're now doing that with this
+ * broadcasted reply.
+ */
+ dst_ncec->ncec_flags &= ~NCE_F_DELAYED;
+ send_unicast = B_FALSE;
+ }
+ if (src_ncec != NULL && send_unicast) {
+ src_lladdr = src_ncec->ncec_lladdr;
+ } else {
+ src_lladdr = under_ill->ill_bcast_mp->b_rptr +
+ NCE_LL_ADDR_OFFSET(under_ill);
+ }
+send_response:
+ nce_hwaddr = dst_ncec->ncec_lladdr;
+ IN6_V4MAPPED_TO_INADDR(&dst_ncec->ncec_addr, &nce_paddr);
+
+ (void) arp_output(under_ill, ARP_RESPONSE,
+ nce_hwaddr, (uchar_t *)&nce_paddr, src_haddr,
+ (uchar_t *)&src_paddr, src_lladdr);
+ }
+bail:
+ if (dst_ncec != NULL) {
+ ncec_refrele(dst_ncec);
+ }
+ if (src_ncec != NULL) {
+ ncec_refrele(src_ncec);
+ }
+ if (err == AR_CHANGED) {
+ mp->b_cont = NULL;
+ arp_notify(src_paddr, mp1, AR_CN_ANNOUNCE, &iras, NULL);
+ mp1 = NULL;
+ }
+ if (need_ill_refrele)
+ ill_refrele(ill);
+done:
+ freemsg(mp);
+ freemsg(mp1);
+}
+
+/*
+ * Basic initialization of the arl_t and the arl_common structure shared with
+ * the ill_t that is done after SLIFNAME/IF_UNITSEL.
+ *
+ * Looks up the ill by name, allocates the shared arl_ill_common_t and
+ * cross-links it into both the ill and the arl under ill_lock, then copies
+ * the interface name into the arl and notifies downstream modules of the
+ * name. Returns ENXIO if the ill cannot be found or is condemned, EEXIST
+ * if either side already has a common structure, and 0 on success.
+ */
+static int
+arl_ill_init(arl_t *arl, char *ill_name)
+{
+ ill_t *ill;
+ arl_ill_common_t *ai;
+
+ ill = ill_lookup_on_name(ill_name, B_FALSE, B_FALSE, B_FALSE,
+ arl->arl_ipst);
+
+ if (ill == NULL)
+ return (ENXIO);
+
+ /*
+ * By the time we set up the arl, we expect the ETHERTYPE_IP
+ * stream to be fully bound and attached. So we copy/verify
+ * relevant information as possible from/against the ill.
+ *
+ * The following should have been set up in arp_ll_set_defaults()
+ * after the first DL_INFO_ACK was received.
+ */
+ ASSERT(arl->arl_phys_addr_length == ill->ill_phys_addr_length);
+ ASSERT(arl->arl_sap == ETHERTYPE_ARP);
+ ASSERT(arl->arl_mactype == ill->ill_mactype);
+ ASSERT(arl->arl_sap_length == ill->ill_sap_length);
+
+ /* Allocate before taking ill_lock; KM_SLEEP may block. */
+ ai = kmem_zalloc(sizeof (*ai), KM_SLEEP);
+ mutex_enter(&ill->ill_lock);
+ /* First ensure that the ill is not CONDEMNED. */
+ if (ill->ill_state_flags & ILL_CONDEMNED) {
+ mutex_exit(&ill->ill_lock);
+ ill_refrele(ill);
+ kmem_free(ai, sizeof (*ai));
+ return (ENXIO);
+ }
+ if (ill->ill_common != NULL || arl->arl_common != NULL) {
+ mutex_exit(&ill->ill_lock);
+ ip0dbg(("%s: PPA already exists", ill->ill_name));
+ ill_refrele(ill);
+ kmem_free(ai, sizeof (*ai));
+ return (EEXIST);
+ }
+ mutex_init(&ai->ai_lock, NULL, MUTEX_DEFAULT, NULL);
+ ai->ai_arl = arl;
+ ai->ai_ill = ill;
+ ill->ill_common = ai;
+ arl->arl_common = ai;
+ mutex_exit(&ill->ill_lock);
+ (void) strlcpy(arl->arl_name, ill->ill_name, LIFNAMSIZ);
+ arl->arl_name_length = ill->ill_name_length;
+ ill_refrele(ill);
+ /* Tell modules below us (esp. softmac) the interface name. */
+ arp_ifname_notify(arl);
+ return (0);
+}
+
+/*
+ * Allocate an mblk for a DLPI message of primitive `prim' and total
+ * length `size', zero its payload, and fill in the message type and
+ * primitive. Returns NULL on allocation failure.
+ */
+static mblk_t *
+ip_ar_dlpi_comm(t_uscalar_t prim, size_t size)
+{
+    mblk_t *mp = allocb(size, BPRI_HI);
+
+    if (mp == NULL)
+        return (NULL);
+
+    /*
+     * Per DLPIv2, DL_INFO_REQ (and DL_TOKEN_REQ, which we don't use)
+     * travel as M_PCPROTO; every other primitive is M_PROTO.
+     */
+    DB_TYPE(mp) = (prim == DL_INFO_REQ) ? M_PCPROTO : M_PROTO;
+
+    bzero(mp->b_rptr, size);
+    mp->b_wptr = mp->b_rptr + size;
+    DL_PRIM(mp) = prim;
+    return (mp);
+}
+
+
+/*
+ * Handle IF_UNITSEL on the ARP stream: walk down to the driver queue at
+ * the bottom of the stream to obtain the driver name, form the interface
+ * name from driver name plus ppa, and initialize the arl/ill linkage.
+ * Returns EINVAL if IP was opened as a device rather than a module.
+ */
+int
+ip_sioctl_ifunitsel_arp(queue_t *q, int *ppa)
+{
+    char ill_name[LIFNAMSIZ];
+    const char *drvname;
+    arl_t *arl;
+
+    if (q->q_next == NULL)
+        return (EINVAL);
+
+    /* Descend to the lowest queue in the stream (the driver). */
+    while (q->q_next != NULL)
+        q = q->q_next;
+    drvname = q->q_qinfo->qi_minfo->mi_idname;
+
+    arl = (arl_t *)q->q_ptr;
+    arl->arl_ppa = *ppa;
+    (void) snprintf(ill_name, sizeof (ill_name), "%s%d", drvname, *ppa);
+    return (arl_ill_init(arl, ill_name));
+}
+
+/*
+ * Handle SIOCSLIFNAME on the ARP stream: record the ppa supplied in the
+ * lifreq and set up the arl/ill linkage using the supplied name.
+ * Returns EINVAL if IP was opened as a device rather than a module.
+ */
+int
+ip_sioctl_slifname_arp(queue_t *q, void *lifreq)
+{
+    struct lifreq *lifr = lifreq;
+    arl_t *arl;
+
+    /* ioctl is only meaningful when arp is pushed as a module. */
+    if (q->q_next == NULL)
+        return (EINVAL);
+
+    arl = (arl_t *)q->q_ptr;
+    arl->arl_ppa = lifr->lifr_ppa;
+    return (arl_ill_init(arl, lifr->lifr_name));
+}
+
+/*
+ * Map an ill_t to its associated arl_t, taking a reference on the arl.
+ * Returns NULL if no arl is linked to the ill, or if the arl is
+ * condemned. A non-NULL result must be released with arl_refrele().
+ */
+arl_t *
+ill_to_arl(ill_t *ill)
+{
+ arl_ill_common_t *ai = ill->ill_common;
+ arl_t *arl = NULL;
+
+ if (ai == NULL)
+ return (NULL);
+ /*
+ * Find the arl_t that corresponds to this ill_t from the shared
+ * ill_common structure. We can safely access the ai here as it
+ * will only be freed in arp_modclose() after we have become
+ * single-threaded.
+ */
+ mutex_enter(&ai->ai_lock);
+ if ((arl = ai->ai_arl) != NULL) {
+ mutex_enter(&arl->arl_lock);
+ if (!(arl->arl_state_flags & ARL_CONDEMNED)) {
+ arl_refhold_locked(arl);
+ mutex_exit(&arl->arl_lock);
+ } else {
+ mutex_exit(&arl->arl_lock);
+ arl = NULL;
+ }
+ }
+ mutex_exit(&ai->ai_lock);
+ return (arl);
+}
+
+/*
+ * Map an arl_t to its associated ill_t, taking a reference on the ill.
+ * Returns NULL if the linkage has not been established yet (stream still
+ * opening) or if the ill is condemned. A non-NULL result must be
+ * released with ill_refrele().
+ */
+ill_t *
+arl_to_ill(arl_t *arl)
+{
+ arl_ill_common_t *ai = arl->arl_common;
+ ill_t *ill = NULL;
+
+ if (ai == NULL) {
+ /*
+ * happens when the arp stream is just being opened, and
+ * arl_ill_init has not been executed yet.
+ */
+ return (NULL);
+ }
+ /*
+ * Find the ill_t that corresponds to this arl_t from the shared
+ * arl_common structure. We can safely access the ai here as it
+ * will only be freed in arp_modclose() after we have become
+ * single-threaded.
+ */
+ mutex_enter(&ai->ai_lock);
+ if ((ill = ai->ai_ill) != NULL) {
+ mutex_enter(&ill->ill_lock);
+ if (!ILL_IS_CONDEMNED(ill)) {
+ ill_refhold_locked(ill);
+ mutex_exit(&ill->ill_lock);
+ } else {
+ mutex_exit(&ill->ill_lock);
+ ill = NULL;
+ }
+ }
+ mutex_exit(&ai->ai_lock);
+ return (ill);
+}
+
+/*
+ * Bring the link layer for ARP up on `ill': attach (DL_STYLE2 only) and
+ * bind the arp stream to ETHERTYPE_ARP. The DL_UNBIND_REQ needed for a
+ * later teardown is preallocated here so that arp_ll_down() cannot fail
+ * for lack of memory. Returns 0 if already up, EINPROGRESS once the bind
+ * has been sent (completed asynchronously by the DL_BIND_ACK), ENXIO if
+ * the ill has no associated arl, or ENOMEM on allocation failure.
+ */
+int
+arp_ll_up(ill_t *ill)
+{
+ mblk_t *attach_mp = NULL;
+ mblk_t *bind_mp = NULL;
+ mblk_t *unbind_mp = NULL;
+ arl_t *arl;
+
+ ASSERT(IAM_WRITER_ILL(ill));
+ arl = ill_to_arl(ill);
+
+ DTRACE_PROBE2(ill__downup, char *, "arp_ll_up", ill_t *, ill);
+ if (arl == NULL)
+ return (ENXIO);
+ DTRACE_PROBE2(arl__downup, char *, "arp_ll_up", arl_t *, arl);
+ if ((arl->arl_state_flags & ARL_LL_UP) != 0) {
+ arl_refrele(arl);
+ return (0);
+ }
+ if (arl->arl_needs_attach) { /* DL_STYLE2 */
+ attach_mp =
+ ip_ar_dlpi_comm(DL_ATTACH_REQ, sizeof (dl_attach_req_t));
+ if (attach_mp == NULL)
+ goto bad;
+ ((dl_attach_req_t *)attach_mp->b_rptr)->dl_ppa = arl->arl_ppa;
+ }
+
+ /* Allocate and initialize a bind message. */
+ bind_mp = ip_ar_dlpi_comm(DL_BIND_REQ, sizeof (dl_bind_req_t));
+ if (bind_mp == NULL)
+ goto bad;
+ ((dl_bind_req_t *)bind_mp->b_rptr)->dl_sap = ETHERTYPE_ARP;
+ ((dl_bind_req_t *)bind_mp->b_rptr)->dl_service_mode = DL_CLDLS;
+
+ /* Preallocate the unbind so teardown never needs to allocate. */
+ unbind_mp = ip_ar_dlpi_comm(DL_UNBIND_REQ, sizeof (dl_unbind_req_t));
+ if (unbind_mp == NULL)
+ goto bad;
+ if (arl->arl_needs_attach) {
+ arp_dlpi_send(arl, attach_mp);
+ }
+ arl->arl_unbind_mp = unbind_mp;
+
+ arl->arl_state_flags |= ARL_LL_BIND_PENDING;
+ arp_dlpi_send(arl, bind_mp);
+ arl_refrele(arl);
+ return (EINPROGRESS);
+
+bad:
+ freemsg(attach_mp);
+ freemsg(bind_mp);
+ freemsg(unbind_mp);
+ arl_refrele(arl);
+ return (ENOMEM);
+}
+
+/*
+ * Handle an ARP conflict/change notification for address `src'. The
+ * arcn_code selects the action:
+ * AR_CN_BOGON - conflict with an address we have verified as ours;
+ * defend it (re-announce) or delete the entry.
+ * AR_CN_ANNOUNCE - a sender's hardware address may have changed; flush
+ * any stale cached mappings for it.
+ * AR_CN_FAILED - DAD failed; tear the address down via arp_excl()
+ * as an exclusive (writer) operation.
+ * Consumes/frees mp on all paths (arp_excl frees it for AR_CN_FAILED).
+ */
+static void
+arp_notify(in_addr_t src, mblk_t *mp, uint32_t arcn_code,
+ ip_recv_attr_t *ira, ncec_t *ncec)
+{
+ char hbuf[MAC_STR_LEN];
+ char sbuf[INET_ADDRSTRLEN];
+ ill_t *ill = ira->ira_ill;
+ ip_stack_t *ipst = ill->ill_ipst;
+ arh_t *arh = (arh_t *)mp->b_rptr;
+
+ switch (arcn_code) {
+ case AR_CN_BOGON:
+ /*
+ * Someone is sending ARP packets with a source protocol
+ * address that we have published and for which we believe our
+ * entry is authoritative and verified to be unique on
+ * the network.
+ *
+ * arp_process_packet() sends AR_CN_FAILED for the case when
+ * a DAD probe is received and the hardware address of a
+ * non-authoritative entry has changed. Thus, AR_CN_BOGON
+ * indicates a real conflict, and we have to do resolution.
+ *
+ * We back away quickly from the address if it's from DHCP or
+ * otherwise temporary and hasn't been used recently (or at
+ * all). We'd like to include "deprecated" addresses here as
+ * well (as there's no real reason to defend something we're
+ * discarding), but IPMP "reuses" this flag to mean something
+ * other than the standard meaning.
+ */
+ if (ip_nce_conflict(mp, ira, ncec)) {
+ (void) mac_colon_addr((uint8_t *)(arh + 1),
+ arh->arh_hlen, hbuf, sizeof (hbuf));
+ (void) ip_dot_addr(src, sbuf);
+ cmn_err(CE_WARN,
+ "proxy ARP problem? Node '%s' is using %s on %s",
+ hbuf, sbuf, ill->ill_name);
+ if (!arp_no_defense)
+ (void) arp_announce(ncec);
+ /*
+ * ncec_last_time_defended has been adjusted in
+ * ip_nce_conflict.
+ */
+ } else {
+ ncec_delete(ncec);
+ }
+ freemsg(mp);
+ break;
+ case AR_CN_ANNOUNCE: {
+ nce_hw_map_t hwm;
+ /*
+ * ARP gives us a copy of any packet where it thinks
+ * the address has changed, so that we can update our
+ * caches. We're responsible for caching known answers
+ * in the current design. We check whether the
+ * hardware address really has changed in all of our
+ * entries that have cached this mapping, and if so, we
+ * blow them away. This way we will immediately pick
+ * up the rare case of a host changing hardware
+ * address.
+ */
+ if (src == 0) {
+ freemsg(mp);
+ break;
+ }
+ hwm.hwm_addr = src;
+ hwm.hwm_hwlen = arh->arh_hlen;
+ hwm.hwm_hwaddr = (uchar_t *)(arh + 1);
+ hwm.hwm_flags = 0;
+ ncec_walk_common(ipst->ips_ndp4, NULL,
+ (pfi_t)nce_update_hw_changed, &hwm, B_TRUE);
+ freemsg(mp);
+ break;
+ }
+ case AR_CN_FAILED:
+ if (arp_no_defense) {
+ (void) mac_colon_addr((uint8_t *)(arh + 1),
+ arh->arh_hlen, hbuf, sizeof (hbuf));
+ (void) ip_dot_addr(src, sbuf);
+
+ cmn_err(CE_WARN,
+ "node %s is using our IP address %s on %s",
+ hbuf, sbuf, ill->ill_name);
+ freemsg(mp);
+ break;
+ }
+ /*
+ * mp will be freed by arp_excl.
+ */
+ ill_refhold(ill);
+ qwriter_ip(ill, ill->ill_rq, mp, arp_excl, NEW_OP, B_FALSE);
+ return;
+ default:
+ ASSERT(0);
+ freemsg(mp);
+ break;
+ }
+}
+
+/*
+ * arp_output is called to transmit an ARP Request or Response. The mapping
+ * to RFC 826 variables is:
+ * haddr1 == ar$sha
+ * paddr1 == ar$spa
+ * haddr2 == ar$tha
+ * paddr2 == ar$tpa
+ * The ARP frame is sent to the ether_dst in dst_lladdr.
+ *
+ * A NULL paddr1 or haddr2 is transmitted as all-zeroes (used by probes).
+ * Returns ENOMEM on allocation failure, ENXIO when ILLF_NOARP is set or
+ * the link is down, and 0 otherwise (including when the packet was
+ * consumed by a hook or dropped due to flow control).
+ */
+static int
+arp_output(ill_t *ill, uint32_t operation,
+ const uchar_t *haddr1, const uchar_t *paddr1, const uchar_t *haddr2,
+ const uchar_t *paddr2, uchar_t *dst_lladdr)
+{
+ arh_t *arh;
+ uint8_t *cp;
+ uint_t hlen;
+ uint32_t plen = IPV4_ADDR_LEN; /* ar$pln from RFC 826 */
+ uint32_t proto = IP_ARP_PROTO_TYPE;
+ mblk_t *mp;
+ arl_t *arl;
+
+ ASSERT(dst_lladdr != NULL);
+ hlen = ill->ill_phys_addr_length; /* ar$hln from RFC 826 */
+ /* Build the DL_UNITDATA_REQ addressed to dst_lladdr. */
+ mp = ill_dlur_gen(dst_lladdr, hlen, ETHERTYPE_ARP, ill->ill_sap_length);
+
+ if (mp == NULL)
+ return (ENOMEM);
+
+ /* IFF_NOARP flag is set or link down: do not send arp messages */
+ if ((ill->ill_flags & ILLF_NOARP) || !ill->ill_dl_up) {
+ freemsg(mp);
+ return (ENXIO);
+ }
+
+ mp->b_cont = allocb(AR_LL_HDR_SLACK + ARH_FIXED_LEN + (hlen * 4) +
+ plen + plen, BPRI_MED);
+ if (mp->b_cont == NULL) {
+ freeb(mp);
+ return (ENOMEM);
+ }
+
+ /* Fill in the ARP header. */
+ cp = mp->b_cont->b_rptr + (AR_LL_HDR_SLACK + hlen + hlen);
+ mp->b_cont->b_rptr = cp;
+ arh = (arh_t *)cp;
+ U16_TO_BE16(arp_hw_type(ill->ill_mactype), arh->arh_hardware);
+ U16_TO_BE16(proto, arh->arh_proto);
+ arh->arh_hlen = (uint8_t)hlen;
+ arh->arh_plen = (uint8_t)plen;
+ U16_TO_BE16(operation, arh->arh_operation);
+ cp += ARH_FIXED_LEN;
+ bcopy(haddr1, cp, hlen);
+ cp += hlen;
+ if (paddr1 == NULL)
+ bzero(cp, plen);
+ else
+ bcopy(paddr1, cp, plen);
+ cp += plen;
+ if (haddr2 == NULL)
+ bzero(cp, hlen);
+ else
+ bcopy(haddr2, cp, hlen);
+ cp += hlen;
+ bcopy(paddr2, cp, plen);
+ cp += plen;
+ mp->b_cont->b_wptr = cp;
+
+ DTRACE_PROBE3(arp__physical__out__start,
+ ill_t *, ill, arh_t *, arh, mblk_t *, mp);
+ ARP_HOOK_OUT(ill->ill_ipst->ips_arp_physical_out_event,
+ ill->ill_ipst->ips_arp_physical_out,
+ ill->ill_phyint->phyint_ifindex, arh, mp, mp->b_cont,
+ ill->ill_ipst);
+ DTRACE_PROBE1(arp__physical__out__end, mblk_t *, mp);
+ /* The hook may have consumed the packet. */
+ if (mp == NULL)
+ return (0);
+
+ /* Ship it out. */
+ arl = ill_to_arl(ill);
+ if (arl == NULL) {
+ freemsg(mp);
+ return (0);
+ }
+ /* Drop rather than block if the write side is flow-controlled. */
+ if (canputnext(arl->arl_wq))
+ putnext(arl->arl_wq, mp);
+ else
+ freemsg(mp);
+ arl_refrele(arl);
+ return (0);
+}
+
+/*
+ * Process resolve requests.
+ * If we are not yet reachable then we check and decrease ncec_rcnt; otherwise
+ * we leave it alone (the caller will check and manage ncec_pcnt in those
+ * cases.)
+ *
+ * Sends an ARP request for ncec's address from `sender'; the request is
+ * unicast to the known link-layer address when the entry is reachable,
+ * broadcast otherwise. Returns 0 when no (more) requests should be sent
+ * (retransmits exhausted or ILLF_NOARP), otherwise the ill's
+ * ill_reachable_retrans_time to be used as the next retransmit interval.
+ */
+int
+arp_request(ncec_t *ncec, in_addr_t sender, ill_t *ill)
+{
+ int err;
+ const uchar_t *target_hwaddr;
+ struct in_addr nce_paddr;
+ uchar_t *dst_lladdr;
+ boolean_t use_rcnt = !NCE_ISREACHABLE(ncec);
+
+ ASSERT(MUTEX_HELD(&ncec->ncec_lock));
+ ASSERT(!IS_IPMP(ill));
+
+ if (use_rcnt && ncec->ncec_rcnt == 0) {
+ /* not allowed any more retransmits. */
+ return (0);
+ }
+
+ if ((ill->ill_flags & ILLF_NOARP) != 0)
+ return (0);
+
+ IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &nce_paddr);
+
+ target_hwaddr =
+ ill->ill_bcast_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
+
+ if (NCE_ISREACHABLE(ncec)) {
+ dst_lladdr = ncec->ncec_lladdr;
+ } else {
+ dst_lladdr = ill->ill_bcast_mp->b_rptr +
+ NCE_LL_ADDR_OFFSET(ill);
+ }
+
+ /* ncec_lock is dropped across the transmit and retaken below. */
+ mutex_exit(&ncec->ncec_lock);
+ err = arp_output(ill, ARP_REQUEST,
+ ill->ill_phys_addr, (uchar_t *)&sender, target_hwaddr,
+ (uchar_t *)&nce_paddr, dst_lladdr);
+ mutex_enter(&ncec->ncec_lock);
+
+ if (err != 0) {
+ /*
+ * Some transient error such as ENOMEM or a down link was
+ * encountered. If the link has been taken down permanently,
+ * the ncec will eventually be cleaned up (ipif_down_tail()
+ * will call ipif_nce_down() and flush the ncec), to terminate
+ * recurring attempts to send ARP requests. In all other cases,
+ * allow the caller another chance at success next time.
+ */
+ return (ncec->ncec_ill->ill_reachable_retrans_time);
+ }
+
+ if (use_rcnt)
+ ncec->ncec_rcnt--;
+
+ return (ncec->ncec_ill->ill_reachable_retrans_time);
+}
+
+/*
+ * Broadcast a gratuitous ARP (announcement) for ncec's address, sourced
+ * from its own link-layer address. For IPMP entries the announcement is
+ * transmitted on the cast ill. Returns B_TRUE if the packet was dropped
+ * (no transmit ill available, or arp_output failed).
+ */
+boolean_t
+arp_announce(ncec_t *ncec)
+{
+ ill_t *ill;
+ int err;
+ uchar_t *sphys_addr, *bcast_addr;
+ struct in_addr ncec_addr;
+ boolean_t need_refrele = B_FALSE;
+
+ ASSERT((ncec->ncec_flags & NCE_F_BCAST) == 0);
+ ASSERT((ncec->ncec_flags & NCE_F_MCAST) == 0);
+
+ if (IS_IPMP(ncec->ncec_ill)) {
+ /* sent on the cast_ill */
+ ill = ipmp_ill_get_xmit_ill(ncec->ncec_ill, B_FALSE);
+ if (ill == NULL)
+ return (B_TRUE);
+ need_refrele = B_TRUE;
+ } else {
+ ill = ncec->ncec_ill;
+ }
+
+ /*
+ * broadcast an announce to ill_bcast address.
+ */
+ IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &ncec_addr);
+
+ sphys_addr = ncec->ncec_lladdr;
+ bcast_addr = ill->ill_bcast_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
+
+ /* ar$spa == ar$tpa == our address; ar$tha is the broadcast addr. */
+ err = arp_output(ill, ARP_REQUEST,
+ sphys_addr, (uchar_t *)&ncec_addr, bcast_addr,
+ (uchar_t *)&ncec_addr, bcast_addr);
+
+ if (need_refrele)
+ ill_refrele(ill);
+ return (err != 0);
+}
+
+/*
+ * Broadcast a DAD probe for ncec's address: an ARP request with a zeroed
+ * sender protocol address (ar$spa) and zeroed target hardware address.
+ * For IPMP entries the probe is transmitted on the cast ill. Returns
+ * B_TRUE if the packet was dropped.
+ */
+boolean_t
+arp_probe(ncec_t *ncec)
+{
+ ill_t *ill;
+ int err;
+ struct in_addr ncec_addr;
+ uchar_t *sphys_addr, *dst_lladdr;
+
+ if (IS_IPMP(ncec->ncec_ill)) {
+ ill = ipmp_ill_get_xmit_ill(ncec->ncec_ill, B_FALSE);
+ if (ill == NULL)
+ return (B_TRUE);
+ } else {
+ ill = ncec->ncec_ill;
+ }
+
+ IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &ncec_addr);
+
+ sphys_addr = ncec->ncec_lladdr;
+ dst_lladdr = ill->ill_bcast_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
+ /* NULL paddr1/haddr2 are transmitted as zeroes by arp_output(). */
+ err = arp_output(ill, ARP_REQUEST,
+ sphys_addr, NULL, NULL, (uchar_t *)&ncec_addr, dst_lladdr);
+
+ if (IS_IPMP(ncec->ncec_ill))
+ ill_refrele(ill);
+ return (err != 0);
+}
+
+/*
+ * Detach the preallocated DL_UNBIND_REQ from the arl and mark the unbind
+ * as in progress. Returns the message for the caller to send downstream,
+ * or NULL if no unbind message is pending. Callers hold arl_lock.
+ */
+static mblk_t *
+arl_unbind(arl_t *arl)
+{
+    mblk_t *unbind_mp = arl->arl_unbind_mp;
+
+    if (unbind_mp == NULL)
+        return (NULL);
+
+    arl->arl_unbind_mp = NULL;
+    arl->arl_state_flags |= ARL_DL_UNBIND_IN_PROGRESS;
+    return (unbind_mp);
+}
+
+/*
+ * Take the link layer for ARP down on `ill' by sending the preallocated
+ * DL_UNBIND_REQ downstream. Returns EINPROGRESS if the unbind was
+ * initiated (completed asynchronously by the DL_UNBIND_ACK), 0 if there
+ * was nothing to unbind, or ENXIO if the ill has no associated arl.
+ */
+int
+arp_ll_down(ill_t *ill)
+{
+ arl_t *arl;
+ mblk_t *unbind_mp;
+ int err = 0;
+ boolean_t replumb = (ill->ill_replumbing == 1);
+
+ DTRACE_PROBE2(ill__downup, char *, "arp_ll_down", ill_t *, ill);
+ if ((arl = ill_to_arl(ill)) == NULL)
+ return (ENXIO);
+ DTRACE_PROBE2(arl__downup, char *, "arp_ll_down", arl_t *, arl);
+ mutex_enter(&arl->arl_lock);
+ unbind_mp = arl_unbind(arl);
+ if (unbind_mp != NULL) {
+ ASSERT(arl->arl_state_flags & ARL_DL_UNBIND_IN_PROGRESS);
+ DTRACE_PROBE2(arp__unbinding, mblk_t *, unbind_mp,
+ arl_t *, arl);
+ err = EINPROGRESS;
+ if (replumb)
+ arl->arl_state_flags |= ARL_LL_REPLUMBING;
+ }
+ mutex_exit(&arl->arl_lock);
+ /* Send outside arl_lock. */
+ if (unbind_mp != NULL)
+ arp_dlpi_send(arl, unbind_mp);
+ arl_refrele(arl);
+ return (err);
+}
+
+/*
+ * STREAMS close entry point for ARP. A module close (there is a queue
+ * below the write side) performs the full arl teardown; a device close
+ * just turns the queue pair off and detaches it.
+ */
+/* ARGSUSED */
+int
+arp_close(queue_t *q, int flags)
+{
+    /* Module close: full teardown is in arp_modclose(). */
+    if (WR(q)->q_next != NULL)
+        return (arp_modclose(q->q_ptr));
+
+    qprocsoff(q);
+    q->q_ptr = NULL;
+    WR(q)->q_ptr = NULL;
+    return (0);
+}
+
+/*
+ * Module close of the arp stream. Serializes against IP by entering the
+ * ill's ipsq where possible, or by waiting for the ill to finish
+ * ip_modclose() otherwise. Marks the arl ARL_CONDEMNED, flushes any
+ * deferred DLPI messages, waits for the data paths to quiesce, unbinds
+ * from the driver, breaks the arl/ill association, and frees the arl.
+ */
+static int
+arp_modclose(arl_t *arl)
+{
+ arl_ill_common_t *ai = arl->arl_common;
+ ill_t *ill;
+ queue_t *q = arl->arl_rq;
+ mblk_t *mp, *nextmp;
+ ipsq_t *ipsq = NULL;
+
+ ill = arl_to_ill(arl);
+ if (ill != NULL) {
+ if (!ill_waiter_inc(ill)) {
+ ill_refrele(ill);
+ } else {
+ ill_refrele(ill);
+ if (ipsq_enter(ill, B_FALSE, NEW_OP))
+ ipsq = ill->ill_phyint->phyint_ipsq;
+ ill_waiter_dcr(ill);
+ }
+ if (ipsq == NULL) {
+ /*
+ * could not enter the ipsq because ill is already
+ * marked CONDEMNED.
+ */
+ ill = NULL;
+ }
+ }
+ if (ai != NULL && ipsq == NULL) {
+ /*
+ * Either we did not get an ill because it was marked CONDEMNED
+ * or we could not enter the ipsq because it was unplumbing.
+ * In both cases, wait for the ill to complete ip_modclose().
+ *
+ * If the arp_modclose happened even before SLIFNAME, the ai
+ * itself would be NULL, in which case we can complete the close
+ * without waiting.
+ */
+ mutex_enter(&ai->ai_lock);
+ while (ai->ai_ill != NULL)
+ cv_wait(&ai->ai_ill_unplumb_done, &ai->ai_lock);
+ mutex_exit(&ai->ai_lock);
+ }
+ ASSERT(ill == NULL || IAM_WRITER_ILL(ill));
+
+ mutex_enter(&arl->arl_lock);
+ /*
+ * If the ill had completed unplumbing before arp_modclose(), there
+ * would be no ill (and therefore, no ipsq) to serialize arp_modclose()
+ * so that we need to explicitly check for ARL_CONDEMNED and back off
+ * if it is set.
+ */
+ if ((arl->arl_state_flags & ARL_CONDEMNED) != 0) {
+ mutex_exit(&arl->arl_lock);
+ ASSERT(ipsq == NULL);
+ return (0);
+ }
+ arl->arl_state_flags |= ARL_CONDEMNED;
+
+ /*
+ * send out all pending dlpi messages, don't wait for the ack (which
+ * will be ignored in arp_rput when CONDEMNED is set)
+ *
+ * We have to check for pending DL_UNBIND_REQ because, in the case
+ * that ip_modclose() executed before arp_modclose(), the call to
+ * ill_delete_tail->ipif_arp_down() would have triggered a
+ * DL_UNBIND_REQ. When arp_modclose() executes ipsq_enter() will fail
+ * (since ip_modclose() is in the ipsq) but the DL_UNBIND_ACK may not
+ * have been processed yet. In this scenario, we cannot reset
+ * arl_dlpi_pending, because the setting/clearing of arl_state_flags
+ * related to unbind, and the associated cv_waits must be allowed to
+ * continue.
+ */
+ if (arl->arl_dlpi_pending != DL_UNBIND_REQ)
+ arl->arl_dlpi_pending = DL_PRIM_INVAL;
+ mp = arl->arl_dlpi_deferred;
+ arl->arl_dlpi_deferred = NULL;
+ mutex_exit(&arl->arl_lock);
+
+ /* Flush the deferred DLPI messages straight downstream. */
+ for (; mp != NULL; mp = nextmp) {
+ nextmp = mp->b_next;
+ mp->b_next = NULL;
+ putnext(arl->arl_wq, mp);
+ }
+
+ /* Wait for data paths to quiesce */
+ mutex_enter(&arl->arl_lock);
+ while (arl->arl_refcnt != 0)
+ cv_wait(&arl->arl_cv, &arl->arl_lock);
+
+ /*
+ * unbind, so that nothing else can come up from driver.
+ */
+ mp = arl_unbind(arl);
+ mutex_exit(&arl->arl_lock);
+ if (mp != NULL)
+ arp_dlpi_send(arl, mp);
+ mutex_enter(&arl->arl_lock);
+
+ /* wait for unbind ack */
+ while (arl->arl_state_flags & ARL_DL_UNBIND_IN_PROGRESS)
+ cv_wait(&arl->arl_cv, &arl->arl_lock);
+ mutex_exit(&arl->arl_lock);
+
+ qprocsoff(q);
+
+ if (ill != NULL) {
+ mutex_enter(&ill->ill_lock);
+ ill->ill_arl_dlpi_pending = 0;
+ mutex_exit(&ill->ill_lock);
+ }
+
+ /* Break the arl side of the arl/ill association. */
+ if (ai != NULL) {
+ mutex_enter(&ai->ai_lock);
+ ai->ai_arl = NULL;
+ if (ai->ai_ill == NULL) {
+ /* Last user of the shared structure: free it. */
+ mutex_destroy(&ai->ai_lock);
+ kmem_free(ai, sizeof (*ai));
+ } else {
+ mutex_exit(&ai->ai_lock);
+ }
+ }
+
+ /* free up the rest */
+ arp_mod_close_tail(arl);
+
+ q->q_ptr = WR(q)->q_ptr = NULL;
+
+ if (ipsq != NULL)
+ ipsq_exit(ipsq);
+
+ return (0);
+}
+
+/*
+ * Final teardown of an arl: unlink it from the per-stack instance list,
+ * drop the credential taken at open, free all retained control messages
+ * (the mblk chains between arl_first_mp_to_free and arl_last_mp_to_free),
+ * and free the arl itself. The netstack is held across the teardown.
+ */
+static void
+arp_mod_close_tail(arl_t *arl)
+{
+ ip_stack_t *ipst = arl->arl_ipst;
+ mblk_t **mpp;
+
+ netstack_hold(ipst->ips_netstack);
+
+ mutex_enter(&ipst->ips_ip_mi_lock);
+ mi_close_unlink(&ipst->ips_arp_g_head, (IDP)arl);
+ mutex_exit(&ipst->ips_ip_mi_lock);
+
+ /*
+ * credp could be null if the open didn't succeed and ip_modopen
+ * itself calls ip_close.
+ */
+ if (arl->arl_credp != NULL)
+ crfree(arl->arl_credp);
+
+ /* Free all retained control messages. */
+ mpp = &arl->arl_first_mp_to_free;
+ do {
+ while (mpp[0]) {
+ mblk_t *mp;
+ mblk_t *mp1;
+
+ mp = mpp[0];
+ mpp[0] = mp->b_next;
+ /* Clear stale queue links before freeing the chain. */
+ for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) {
+ mp1->b_next = NULL;
+ mp1->b_prev = NULL;
+ }
+ freemsg(mp);
+ }
+ } while (mpp++ != &arl->arl_last_mp_to_free);
+
+ netstack_rele(ipst->ips_netstack);
+ mi_free(arl->arl_name);
+ mi_close_free((IDP)arl);
+}
+
+/*
+ * DAD failed. Tear down ipifs with the specified source address. Note that
+ * tearing down the ipif also means deleting the ncec through ipif_down,
+ * so it is not possible to use nce_timer for recovery. Instead we start
+ * a timer on the ipif. Caller has to free the mp.
+ */
+void
+arp_failure(mblk_t *mp, ip_recv_attr_t *ira)
+{
+ ill_t *ill = ira->ira_ill;
+
+ /* Work on a copy: the caller retains ownership of mp. */
+ if ((mp = copymsg(mp)) != NULL) {
+ ill_refhold(ill);
+ qwriter_ip(ill, ill->ill_rq, mp, arp_excl, NEW_OP, B_FALSE);
+ }
+}
+
+/*
+ * This is for exclusive changes due to ARP. Tear down an interface due
+ * to AR_CN_FAILED and AR_CN_BOGON. Runs as writer on the ill (queued
+ * via qwriter_ip). The duplicate ipif is marked IPIF_DUPLICATE and
+ * brought down; for eligible (non-DHCP, non-temporary) addresses on
+ * resolver interfaces a recovery timer is started so the address can be
+ * re-probed later. Frees mp.
+ */
+/* ARGSUSED */
+static void
+arp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
+{
+ ill_t *ill = rq->q_ptr;
+ arh_t *arh;
+ ipaddr_t src;
+ ipif_t *ipif;
+ ip_stack_t *ipst = ill->ill_ipst;
+ uchar_t *haddr;
+ uint_t haddrlen;
+
+ /* first try src = ar$spa */
+ arh = (arh_t *)mp->b_rptr;
+ bcopy((char *)&arh[1] + arh->arh_hlen, &src, IP_ADDR_LEN);
+
+ haddrlen = arh->arh_hlen;
+ haddr = (uint8_t *)(arh + 1);
+
+ if (haddrlen == ill->ill_phys_addr_length) {
+ /*
+ * Ignore conflicts generated by misbehaving switches that
+ * just reflect our own messages back to us. For IPMP, we may
+ * see reflections across any ill in the illgrp.
+ */
+ /* For an under ill_grp can change under lock */
+ rw_enter(&ipst->ips_ill_g_lock, RW_READER);
+ /* Reflected from ourselves, or from within our IPMP group? */
+ if (bcmp(haddr, ill->ill_phys_addr, haddrlen) == 0 ||
+ IS_UNDER_IPMP(ill) && ill->ill_grp != NULL &&
+ ipmp_illgrp_find_ill(ill->ill_grp, haddr,
+ haddrlen) != NULL) {
+ rw_exit(&ipst->ips_ill_g_lock);
+ goto ignore_conflict;
+ }
+ rw_exit(&ipst->ips_ill_g_lock);
+ }
+
+ /*
+ * Look up the appropriate ipif.
+ */
+ ipif = ipif_lookup_addr(src, ill, ALL_ZONES, ipst);
+ if (ipif == NULL)
+ goto ignore_conflict;
+
+ /* Reload the ill to match the ipif */
+ ill = ipif->ipif_ill;
+
+ /* If it's already duplicate or ineligible, then don't do anything. */
+ if (ipif->ipif_flags & (IPIF_POINTOPOINT|IPIF_DUPLICATE)) {
+ ipif_refrele(ipif);
+ goto ignore_conflict;
+ }
+
+ /*
+ * If we failed on a recovery probe, then restart the timer to
+ * try again later.
+ */
+ if (!ipif->ipif_was_dup) {
+ char hbuf[MAC_STR_LEN];
+ char sbuf[INET_ADDRSTRLEN];
+ char ibuf[LIFNAMSIZ];
+
+ (void) mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf));
+ (void) ip_dot_addr(src, sbuf);
+ ipif_get_name(ipif, ibuf, sizeof (ibuf));
+
+ cmn_err(CE_WARN, "%s has duplicate address %s (in use by %s);"
+ " disabled", ibuf, sbuf, hbuf);
+ }
+ mutex_enter(&ill->ill_lock);
+ ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
+ ipif->ipif_flags |= IPIF_DUPLICATE;
+ ill->ill_ipif_dup_count++;
+ mutex_exit(&ill->ill_lock);
+ (void) ipif_down(ipif, NULL, NULL);
+ (void) ipif_down_tail(ipif);
+ mutex_enter(&ill->ill_lock);
+ if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
+ ill->ill_net_type == IRE_IF_RESOLVER &&
+ !(ipif->ipif_state_flags & IPIF_CONDEMNED) &&
+ ipst->ips_ip_dup_recovery > 0) {
+ ASSERT(ipif->ipif_recovery_id == 0);
+ ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
+ ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
+ }
+ mutex_exit(&ill->ill_lock);
+ ipif_refrele(ipif);
+
+ignore_conflict:
+ freemsg(mp);
+}
+
+/*
+ * This is a place for a dtrace hook.
+ * Note that mp can be either the DL_UNITDATA_IND with a b_cont payload,
+ * or just the ARP packet payload as an M_DATA.
+ * `str' names the drop reason and `ill' the receiving interface; both
+ * are currently unused (ARGSUSED) - the packet is simply freed.
+ */
+/* ARGSUSED */
+static void
+arp_drop_packet(const char *str, mblk_t *mp, ill_t *ill)
+{
+ freemsg(mp);
+}
+
+/*
+ * Determine whether this arp stream sits directly over the driver
+ * (B_TRUE) rather than over ip or udp (B_FALSE), by inspecting the
+ * module immediately below the stream head.
+ */
+static boolean_t
+arp_over_driver(queue_t *q)
+{
+    queue_t *qnext = STREAM(q)->sd_wrq->q_next;
+    const char *modname;
+
+    ASSERT(qnext != NULL);
+    modname = Q2NAME(qnext);
+    /*
+     * If the module below the stream head is neither ip nor udp,
+     * then arp has been pushed on the driver itself.
+     */
+    return (strcmp(modname, "ip") != 0 && strcmp(modname, "udp") != 0);
+}
+
+/*
+ * STREAMS open entry point for ARP. When arp is pushed over ip or udp
+ * (rather than directly over a driver) it has nothing to do, so the
+ * queue is converted into a transparent dummy module; otherwise the real
+ * module open is performed by arp_modopen().
+ */
+static int
+arp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
+{
+    ASSERT(sflag & MODOPEN);
+
+    if (arp_over_driver(q))
+        return (arp_modopen(q, devp, flag, sflag, credp));
+
+    /* Not over the driver: become a pass-through dummy module. */
+    q->q_qinfo = dummymodinfo.st_rdinit;
+    WR(q)->q_qinfo = dummymodinfo.st_wrinit;
+    return ((*dummymodinfo.st_rdinit->qi_qopen)(q, devp, flag,
+        sflag, credp));
+}
+
+/*
+ * In most cases we must be a writer on the IP stream before coming to
+ * arp_dlpi_send(), to serialize DLPI sends to the driver. The exceptions
+ * when we are not a writer are very early duing initialization (in
+ * arl_init, before the arl has done a SLIFNAME, so that we don't yet know
+ * the associated ill) or during arp_mod_close, when we could not enter the
+ * ipsq because the ill has already unplumbed.
+ *
+ * If a DLPI exchange is already outstanding the message is appended to
+ * the arl_dlpi_deferred queue; otherwise it becomes the pending
+ * primitive and is sent downstream immediately.
+ */
+static void
+arp_dlpi_send(arl_t *arl, mblk_t *mp)
+{
+ mblk_t **mpp;
+ t_uscalar_t prim;
+ arl_ill_common_t *ai;
+
+ ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
+
+#ifdef DEBUG
+ ai = arl->arl_common;
+ if (ai != NULL) {
+ mutex_enter(&ai->ai_lock);
+ if (ai->ai_ill != NULL)
+ ASSERT(IAM_WRITER_ILL(ai->ai_ill));
+ mutex_exit(&ai->ai_lock);
+ }
+#endif /* DEBUG */
+
+ mutex_enter(&arl->arl_lock);
+ if (arl->arl_dlpi_pending != DL_PRIM_INVAL) {
+ /* Must queue message. Tail insertion */
+ mpp = &arl->arl_dlpi_deferred;
+ while (*mpp != NULL)
+ mpp = &((*mpp)->b_next);
+
+ *mpp = mp;
+ mutex_exit(&arl->arl_lock);
+ return;
+ }
+ mutex_exit(&arl->arl_lock);
+ if ((prim = ((union DL_primitives *)mp->b_rptr)->dl_primitive)
+ == DL_BIND_REQ) {
+ ASSERT((arl->arl_state_flags & ARL_DL_UNBIND_IN_PROGRESS) == 0);
+ }
+ /*
+ * No need to take the arl_lock to examine ARL_CONDEMNED at this point
+ * because the only thread that can see ARL_CONDEMNED here is the
+ * closing arp_modclose() thread which sets the flag after becoming a
+ * writer on the ipsq. Threads from IP must have finished and
+ * cannot be active now.
+ */
+ if (!(arl->arl_state_flags & ARL_CONDEMNED) ||
+ (prim == DL_UNBIND_REQ)) {
+ /* DL_NOTIFY_CONF is not acked, so it never becomes pending. */
+ if (prim != DL_NOTIFY_CONF) {
+ ill_t *ill = arl_to_ill(arl);
+
+ arl->arl_dlpi_pending = prim;
+ if (ill != NULL) {
+ mutex_enter(&ill->ill_lock);
+ ill->ill_arl_dlpi_pending = 1;
+ mutex_exit(&ill->ill_lock);
+ ill_refrele(ill);
+ }
+ }
+ }
+ DTRACE_PROBE4(arl__dlpi, char *, "arp_dlpi_send",
+ char *, dl_primstr(prim), char *, "-", arl_t *, arl);
+ putnext(arl->arl_wq, mp);
+}
+
+/*
+ * Record per-link defaults from the driver's DL_INFO_ACK: mac type and
+ * sap length. On the first DL_INFO_ACK, also note whether a DL_ATTACH
+ * is required (DL_STYLE2 provider), mark the dlpi style as settled,
+ * clear ARL_LL_SUBNET_PENDING and wake any thread blocked in
+ * arl_wait_for_info_ack().
+ */
+static void
+arl_defaults_common(arl_t *arl, mblk_t *mp)
+{
+ dl_info_ack_t *dlia = (dl_info_ack_t *)mp->b_rptr;
+ /*
+ * Till the ill is fully up the ill is not globally visible.
+ * So no need for a lock.
+ */
+ arl->arl_mactype = dlia->dl_mac_type;
+ arl->arl_sap_length = dlia->dl_sap_length;
+
+ if (!arl->arl_dlpi_style_set) {
+ if (dlia->dl_provider_style == DL_STYLE2)
+ arl->arl_needs_attach = 1;
+ mutex_enter(&arl->arl_lock);
+ ASSERT(arl->arl_dlpi_style_set == 0);
+ arl->arl_dlpi_style_set = 1;
+ arl->arl_state_flags &= ~ARL_LL_SUBNET_PENDING;
+ cv_broadcast(&arl->arl_cv);
+ mutex_exit(&arl->arl_lock);
+ }
+}
+
+/*
+ * Minimal arl initialization at open time (a subset of ill_init): set up
+ * the lock and queue pointers, allocate the name buffer, turn the queue
+ * procedures on, and send a DL_INFO_REQ down to the driver. Returns
+ * ENOMEM if the info request cannot be allocated, else 0; the caller
+ * then waits for the ack via arl_wait_for_info_ack().
+ */
+int
+arl_init(queue_t *q, arl_t *arl)
+{
+ mblk_t *info_mp;
+ dl_info_req_t *dlir;
+
+ /* subset of ill_init */
+ mutex_init(&arl->arl_lock, NULL, MUTEX_DEFAULT, 0);
+
+ arl->arl_rq = q;
+ arl->arl_wq = WR(q);
+
+ info_mp = allocb(MAX(sizeof (dl_info_req_t), sizeof (dl_info_ack_t)),
+ BPRI_HI);
+ if (info_mp == NULL)
+ return (ENOMEM);
+ /*
+ * allocate sufficient space to contain device name.
+ */
+ arl->arl_name = (char *)(mi_zalloc(2 * LIFNAMSIZ));
+ arl->arl_ppa = UINT_MAX;
+ arl->arl_state_flags |= (ARL_LL_SUBNET_PENDING | ARL_LL_UNBOUND);
+
+ /* Send down the Info Request to the driver. */
+ info_mp->b_datap->db_type = M_PCPROTO;
+ dlir = (dl_info_req_t *)info_mp->b_rptr;
+ info_mp->b_wptr = (uchar_t *)&dlir[1];
+ dlir->dl_primitive = DL_INFO_REQ;
+ arl->arl_dlpi_pending = DL_PRIM_INVAL;
+ qprocson(q);
+
+ arp_dlpi_send(arl, info_mp);
+ return (0);
+}
+
+/*
+ * Wait (interruptibly) for the DL_INFO_ACK triggered by arl_init();
+ * arl_defaults_common() clears ARL_LL_SUBNET_PENDING and broadcasts on
+ * arl_cv when the ack arrives. Returns EINTR if interrupted by a
+ * signal, otherwise whatever error (if any) was recorded in arl_error.
+ */
+int
+arl_wait_for_info_ack(arl_t *arl)
+{
+ int err;
+
+ mutex_enter(&arl->arl_lock);
+ while (arl->arl_state_flags & ARL_LL_SUBNET_PENDING) {
+ /*
+ * Return value of 0 indicates a pending signal.
+ */
+ err = cv_wait_sig(&arl->arl_cv, &arl->arl_lock);
+ if (err == 0) {
+ mutex_exit(&arl->arl_lock);
+ return (EINTR);
+ }
+ }
+ mutex_exit(&arl->arl_lock);
+ /*
+ * ip_rput_other could have set an error in ill_error on
+ * receipt of M_ERROR.
+ */
+ return (arl->arl_error);
+}
+
+/*
+ * Record the STREAMS mux id for the arp stream associated with `ill',
+ * if such a stream exists; otherwise do nothing.
+ */
+void
+arl_set_muxid(ill_t *ill, int muxid)
+{
+    arl_t *arl = ill_to_arl(ill);
+
+    if (arl == NULL)
+        return;
+    arl->arl_muxid = muxid;
+    arl_refrele(arl);
+}
+
+/*
+ * Return the STREAMS mux id recorded for the arp stream associated with
+ * `ill', or 0 when no such stream exists.
+ */
+int
+arl_get_muxid(ill_t *ill)
+{
+    int muxid = 0;
+    arl_t *arl = ill_to_arl(ill);
+
+    if (arl != NULL) {
+        muxid = arl->arl_muxid;
+        arl_refrele(arl);
+    }
+    return (muxid);
+}
+
+/*
+ * Real module open of the arp stream (only used when pushed over a
+ * driver): privilege check, netstack lookup, arl allocation and
+ * initialization, wait for the DL_INFO_ACK, then link the instance into
+ * the per-stack list. On failure after initialization, the partially
+ * constructed stream is torn down via arp_close().
+ */
+static int
+arp_modopen(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
+{
+ int err;
+ zoneid_t zoneid;
+ netstack_t *ns;
+ ip_stack_t *ipst;
+ arl_t *arl = NULL;
+
+ /*
+ * Prevent unprivileged processes from pushing IP so that
+ * they can't send raw IP.
+ */
+ if (secpolicy_net_rawaccess(credp) != 0)
+ return (EPERM);
+
+ ns = netstack_find_by_cred(credp);
+ ASSERT(ns != NULL);
+ ipst = ns->netstack_ip;
+ ASSERT(ipst != NULL);
+
+ /*
+ * For exclusive stacks we set the zoneid to zero
+ * to make IP operate as if in the global zone.
+ */
+ if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID)
+ zoneid = GLOBAL_ZONEID;
+ else
+ zoneid = crgetzoneid(credp);
+
+ arl = (arl_t *)mi_open_alloc_sleep(sizeof (arl_t));
+ q->q_ptr = WR(q)->q_ptr = arl;
+ arl->arl_ipst = ipst;
+ arl->arl_zoneid = zoneid;
+ err = arl_init(q, arl);
+
+ if (err != 0) {
+ mi_free(arl->arl_name);
+ mi_free(arl);
+ netstack_rele(ipst->ips_netstack);
+ q->q_ptr = NULL;
+ WR(q)->q_ptr = NULL;
+ return (err);
+ }
+
+ /*
+ * Wait for the DL_INFO_ACK if a DL_INFO_REQ was sent.
+ */
+ err = arl_wait_for_info_ack(arl);
+ if (err == 0)
+ arl->arl_credp = credp;
+ else
+ goto fail;
+
+ /* Matches the crfree() in arp_mod_close_tail(). */
+ crhold(credp);
+
+ mutex_enter(&ipst->ips_ip_mi_lock);
+ err = mi_open_link(&ipst->ips_arp_g_head, (IDP)q->q_ptr, devp, flag,
+ sflag, credp);
+ mutex_exit(&ipst->ips_ip_mi_lock);
+fail:
+ if (err) {
+ (void) arp_close(q, 0);
+ return (err);
+ }
+ return (0);
+}
+
+/*
+ * Notify any downstream modules (esp softmac and hitbox) of the name
+ * of this interface using an M_CTL.
+ *
+ * Builds an M_CTL-tagged SIOCSLIFNAME iocblk whose payload is a lifreq
+ * carrying the name, ppa and ILLF_IPV4, and passes it downstream.
+ * putnext() consumes the message; allocation failures are silently
+ * ignored (the notification is best-effort).
+ */
+static void
+arp_ifname_notify(arl_t *arl)
+{
+ mblk_t *mp1, *mp2;
+ struct iocblk *iocp;
+ struct lifreq *lifr;
+
+ if ((mp1 = mkiocb(SIOCSLIFNAME)) == NULL)
+ return;
+ if ((mp2 = allocb(sizeof (struct lifreq), BPRI_HI)) == NULL) {
+ freemsg(mp1);
+ return;
+ }
+
+ lifr = (struct lifreq *)mp2->b_rptr;
+ mp2->b_wptr += sizeof (struct lifreq);
+ bzero(lifr, sizeof (struct lifreq));
+
+ (void) strncpy(lifr->lifr_name, arl->arl_name, LIFNAMSIZ);
+ lifr->lifr_ppa = arl->arl_ppa;
+ lifr->lifr_flags = ILLF_IPV4;
+
+ /* Use M_CTL to avoid confusing anyone else who might be listening. */
+ DB_TYPE(mp1) = M_CTL;
+ mp1->b_cont = mp2;
+ iocp = (struct iocblk *)mp1->b_rptr;
+ iocp->ioc_count = msgsize(mp1->b_cont);
+ DTRACE_PROBE4(arl__dlpi, char *, "arp_ifname_notify",
+ char *, "SIOCSLIFNAME", char *, "-", arl_t *, arl);
+ putnext(arl->arl_wq, mp1);
+}
+
+/*
+ * Send a DL_NOTIFY_CONF (DL_NOTE_REPLUMB_DONE) down the arp stream to
+ * tell the layers below that replumbing has completed, then clear the
+ * ARL_LL_REPLUMBING state on the arl. No-op if the ill has no
+ * associated arp stream.
+ */
+void
+arp_send_replumb_conf(ill_t *ill)
+{
+    mblk_t *mp;
+    arl_t *arl = ill_to_arl(ill);
+
+    if (arl == NULL)
+        return;
+    /*
+     * arl_got_replumb and arl_got_unbind to be cleared after we complete
+     * arp_cmd_done.
+     */
+    mp = mexchange(NULL, NULL, sizeof (dl_notify_conf_t), M_PROTO,
+        DL_NOTIFY_CONF);
+    /*
+     * mexchange() returns NULL if its mblk allocation fails; guard the
+     * dereference. On failure the confirmation is simply not sent, but
+     * we must still clear ARL_LL_REPLUMBING and drop our arl reference.
+     */
+    if (mp != NULL) {
+        ((dl_notify_conf_t *)(mp->b_rptr))->dl_notification =
+            DL_NOTE_REPLUMB_DONE;
+        arp_dlpi_send(arl, mp);
+    }
+    mutex_enter(&arl->arl_lock);
+    arl->arl_state_flags &= ~ARL_LL_REPLUMBING;
+    mutex_exit(&arl->arl_lock);
+    arl_refrele(arl);
+}
+
+/*
+ * The unplumb code paths call arp_unbind_complete() to make sure that it is
+ * safe to tear down the ill. We wait for DL_UNBIND_ACK to complete, and also
+ * for the arl_refcnt to fall to one so that, when we return from
+ * arp_unbind_complete(), we know for certain that there are no threads in
+ * arp_rput() that might access the arl_ill. No-op if the ill has no
+ * associated arp stream.
+ */
+void
+arp_unbind_complete(ill_t *ill)
+{
+ arl_t *arl = ill_to_arl(ill);
+
+ if (arl == NULL)
+ return;
+ mutex_enter(&arl->arl_lock);
+ /*
+ * wait for unbind ack and arl_refcnt to drop to 1. Note that the
+ * quiescent arl_refcnt for this function is 1 (and not 0) because
+ * ill_to_arl() will itself return after taking a ref on the arl_t.
+ */
+ while (arl->arl_state_flags & ARL_DL_UNBIND_IN_PROGRESS)
+ cv_wait(&arl->arl_cv, &arl->arl_lock);
+ while (arl->arl_refcnt != 1)
+ cv_wait(&arl->arl_cv, &arl->arl_lock);
+ mutex_exit(&arl->arl_lock);
+ arl_refrele(arl);
+}
diff --git a/usr/src/uts/common/inet/ip/ip_attr.c b/usr/src/uts/common/inet/ip/ip_attr.c
new file mode 100644
index 0000000000..a46a82c85f
--- /dev/null
+++ b/usr/src/uts/common/inet/ip/ip_attr.c
@@ -0,0 +1,1338 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/* Copyright (c) 1990 Mentat Inc. */
+
+#include <sys/types.h>
+#include <sys/stream.h>
+#include <sys/strsun.h>
+#include <sys/zone.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/cmn_err.h>
+#include <sys/debug.h>
+#include <sys/atomic.h>
+
+#include <sys/systm.h>
+#include <sys/param.h>
+#include <sys/kmem.h>
+#include <sys/sdt.h>
+#include <sys/socket.h>
+#include <sys/mac.h>
+#include <net/if.h>
+#include <net/if_arp.h>
+#include <net/route.h>
+#include <sys/sockio.h>
+#include <netinet/in.h>
+#include <net/if_dl.h>
+
+#include <inet/common.h>
+#include <inet/mi.h>
+#include <inet/mib2.h>
+#include <inet/nd.h>
+#include <inet/arp.h>
+#include <inet/snmpcom.h>
+#include <inet/kstatcom.h>
+
+#include <netinet/igmp_var.h>
+#include <netinet/ip6.h>
+#include <netinet/icmp6.h>
+#include <netinet/sctp.h>
+
+#include <inet/ip.h>
+#include <inet/ip_impl.h>
+#include <inet/ip6.h>
+#include <inet/ip6_asp.h>
+#include <inet/tcp.h>
+#include <inet/ip_multi.h>
+#include <inet/ip_if.h>
+#include <inet/ip_ire.h>
+#include <inet/ip_ftable.h>
+#include <inet/ip_rts.h>
+#include <inet/optcom.h>
+#include <inet/ip_ndp.h>
+#include <inet/ip_listutils.h>
+#include <netinet/igmp.h>
+#include <netinet/ip_mroute.h>
+#include <inet/ipp_common.h>
+
+#include <net/pfkeyv2.h>
+#include <inet/sadb.h>
+#include <inet/ipsec_impl.h>
+#include <inet/ipdrop.h>
+#include <inet/ip_netinfo.h>
+#include <sys/squeue_impl.h>
+#include <sys/squeue.h>
+
+#include <inet/ipclassifier.h>
+#include <inet/sctp_ip.h>
+#include <inet/sctp/sctp_impl.h>
+#include <inet/udp_impl.h>
+#include <sys/sunddi.h>
+
+#include <sys/tsol/label.h>
+#include <sys/tsol/tnet.h>
+
+/*
+ * Release a reference on ip_xmit_attr.
+ * The reference is acquired by conn_get_ixa()
+ * When the count drops to zero the attributes are freed via ixa_inactive().
+ */
+#define	IXA_REFRELE(ixa)					\
+{								\
+	if (atomic_add_32_nv(&(ixa)->ixa_refcnt, -1) == 0)	\
+		ixa_inactive(ixa);				\
+}
+
+/* Take an additional reference; the caller must already hold one. */
+#define	IXA_REFHOLD(ixa)					\
+{								\
+	ASSERT((ixa)->ixa_refcnt != 0);				\
+	atomic_add_32(&(ixa)->ixa_refcnt, 1);			\
+}
+
+/*
+ * When we need to handle a transmit side asynchronous operation, then we need
+ * to save sufficient information so that we can call the fragment and postfrag
+ * functions. That information is captured in an mblk containing this structure.
+ *
+ * Since this is currently only used for IPsec, we include information for
+ * the kernel crypto framework.
+ *
+ * Serialized/deserialized by ip_xmit_attr_to_mblk()/ip_xmit_attr_from_mblk();
+ * the mblk is tagged M_BREAK so it can be recognized on the way back.
+ */
+typedef struct ixamblk_s {
+	boolean_t	ixm_inbound;	/* B_FALSE */
+	iaflags_t	ixm_flags;	/* ixa_flags */
+	netstackid_t	ixm_stackid;	/* Verify it didn't go away */
+	uint_t		ixm_ifindex;	/* Used to find the nce */
+	in6_addr_t	ixm_nceaddr_v6;	/* Used to find nce */
+#define	ixm_nceaddr_v4	V4_PART_OF_V6(ixm_nceaddr_v6)
+	uint32_t	ixm_fragsize;
+	uint_t		ixm_pktlen;
+	uint16_t	ixm_ip_hdr_length; /* Points to ULP header */
+	uint8_t		ixm_protocol;	/* Protocol number for ULP cksum */
+	pfirepostfrag_t	ixm_postfragfn;
+
+	zoneid_t	ixm_zoneid;		/* Needed for ipobs */
+	zoneid_t	ixm_no_loop_zoneid;	/* IXAF_NO_LOOP_ZONEID_SET */
+
+	uint_t		ixm_scopeid;		/* For IPv6 link-locals */
+
+	uint32_t	ixm_ident;		/* For IPv6 fragment header */
+	uint32_t	ixm_xmit_hint;
+
+	cred_t		*ixm_cred;	/* For getpeerucred - refhold if set */
+	pid_t		ixm_cpid;	/* For getpeerucred */
+
+	ts_label_t	*ixm_tsl;	/* Refhold if set. */
+
+	/*
+	 * When the pointers below are set they have a refhold on the struct.
+	 */
+	ipsec_latch_t		*ixm_ipsec_latch;
+	struct ipsa_s		*ixm_ipsec_ah_sa;	/* SA for AH */
+	struct ipsa_s		*ixm_ipsec_esp_sa;	/* SA for ESP */
+	struct ipsec_policy_s	*ixm_ipsec_policy;	/* why are we here? */
+	struct ipsec_action_s	*ixm_ipsec_action; /* For reflected packets */
+
+	ipsa_ref_t		ixm_ipsec_ref[2]; /* Soft reference to SA */
+
+	/* Need these while waiting for SA */
+	uint16_t ixm_ipsec_src_port;	/* Source port number of d-gram. */
+	uint16_t ixm_ipsec_dst_port;	/* Destination port number of d-gram. */
+	uint8_t  ixm_ipsec_icmp_type;	/* ICMP type of d-gram */
+	uint8_t  ixm_ipsec_icmp_code;	/* ICMP code of d-gram */
+
+	sa_family_t ixm_ipsec_inaf;	/* Inner address family */
+	uint32_t ixm_ipsec_insrc[IXA_MAX_ADDRLEN];	/* Inner src address */
+	uint32_t ixm_ipsec_indst[IXA_MAX_ADDRLEN];	/* Inner dest address */
+	uint8_t  ixm_ipsec_insrcpfx;	/* Inner source prefix */
+	uint8_t  ixm_ipsec_indstpfx;	/* Inner destination prefix */
+
+	uint8_t ixm_ipsec_proto;	/* IP protocol number for d-gram. */
+} ixamblk_t;
+
+
+/*
+ * When we need to handle a receive side asynchronous operation, then we need
+ * to save sufficient information so that we can call ip_fanout.
+ * That information is captured in an mblk containing this structure.
+ *
+ * Since this is currently only used for IPsec, we include information for
+ * the kernel crypto framework.
+ *
+ * Serialized/deserialized by ip_recv_attr_to_mblk()/ip_recv_attr_from_mblk();
+ * the mblk is tagged M_BREAK (see ip_recv_attr_is_mblk()).
+ */
+typedef struct iramblk_s {
+	boolean_t	irm_inbound;	/* B_TRUE */
+	iaflags_t	irm_flags;	/* ira_flags */
+	netstackid_t	irm_stackid;	/* Verify it didn't go away */
+	uint_t		irm_ifindex;	/* To find ira_ill */
+
+	uint_t		irm_rifindex;	/* ira_rifindex */
+	uint_t		irm_ruifindex;	/* ira_ruifindex */
+	uint_t		irm_pktlen;
+	uint16_t	irm_ip_hdr_length; /* Points to ULP header */
+	uint8_t		irm_protocol;	/* Protocol number for ULP cksum */
+	zoneid_t	irm_zoneid;	/* ALL_ZONES unless local delivery */
+
+	squeue_t	*irm_sqp;
+	ill_rx_ring_t	*irm_ring;
+
+	ipaddr_t	irm_mroute_tunnel;	/* IRAF_MROUTE_TUNNEL_SET */
+	zoneid_t	irm_no_loop_zoneid;	/* IRAF_NO_LOOP_ZONEID_SET */
+	uint32_t	irm_esp_udp_ports;	/* IRAF_ESP_UDP_PORTS */
+
+	char		irm_l2src[IRA_L2SRC_SIZE];	/* If IRAF_L2SRC_SET */
+
+	cred_t		*irm_cred;	/* For getpeerucred - refhold if set */
+	pid_t		irm_cpid;	/* For getpeerucred */
+
+	ts_label_t	*irm_tsl;	/* Refhold if set. */
+
+	/*
+	 * When set these correspond to a refhold on the object.
+	 */
+	struct ipsa_s		*irm_ipsec_ah_sa;	/* SA for AH */
+	struct ipsa_s		*irm_ipsec_esp_sa;	/* SA for ESP */
+	struct ipsec_action_s	*irm_ipsec_action; /* For reflected packets */
+} iramblk_t;
+
+
+/*
+ * Take the information in ip_xmit_attr_t and stick it in an mblk
+ * that can later be passed to ip_xmit_attr_from_mblk to recreate the
+ * ip_xmit_attr_t.
+ *
+ * Pointer members (cred, label, SAs, policy, action, latch) get an extra
+ * refhold here; those references travel with the mblk and are released by
+ * ip_xmit_attr_free_mblk() or transferred by ip_xmit_attr_from_mblk().
+ * ixa_nce must be set; only its ifindex and address are saved so the nce
+ * can be looked up again later.
+ *
+ * Returns NULL on memory allocation failure.
+ */
+mblk_t *
+ip_xmit_attr_to_mblk(ip_xmit_attr_t *ixa)
+{
+	mblk_t		*ixamp;
+	ixamblk_t	*ixm;
+	nce_t		*nce = ixa->ixa_nce;
+
+	ASSERT(nce != NULL);
+	ixamp = allocb(sizeof (*ixm), BPRI_MED);
+	if (ixamp == NULL)
+		return (NULL);
+
+	/* M_BREAK marks this mblk as carrying serialized attributes. */
+	ixamp->b_datap->db_type = M_BREAK;
+	ixamp->b_wptr += sizeof (*ixm);
+	ixm = (ixamblk_t *)ixamp->b_rptr;
+
+	bzero(ixm, sizeof (*ixm));
+	ixm->ixm_inbound = B_FALSE;
+	ixm->ixm_flags = ixa->ixa_flags;
+	ixm->ixm_stackid = ixa->ixa_ipst->ips_netstack->netstack_stackid;
+	ixm->ixm_ifindex = nce->nce_ill->ill_phyint->phyint_ifindex;
+	ixm->ixm_nceaddr_v6 = nce->nce_addr;
+	ixm->ixm_fragsize = ixa->ixa_fragsize;
+	ixm->ixm_pktlen = ixa->ixa_pktlen;
+	ixm->ixm_ip_hdr_length = ixa->ixa_ip_hdr_length;
+	ixm->ixm_protocol = ixa->ixa_protocol;
+	ixm->ixm_postfragfn = ixa->ixa_postfragfn;
+	ixm->ixm_zoneid = ixa->ixa_zoneid;
+	ixm->ixm_no_loop_zoneid = ixa->ixa_no_loop_zoneid;
+	ixm->ixm_scopeid = ixa->ixa_scopeid;
+	ixm->ixm_ident = ixa->ixa_ident;
+	ixm->ixm_xmit_hint = ixa->ixa_xmit_hint;
+
+	if (ixa->ixa_tsl != NULL) {
+		ixm->ixm_tsl = ixa->ixa_tsl;
+		label_hold(ixm->ixm_tsl);
+	}
+	if (ixa->ixa_cred != NULL) {
+		ixm->ixm_cred = ixa->ixa_cred;
+		crhold(ixa->ixa_cred);
+	}
+	ixm->ixm_cpid = ixa->ixa_cpid;
+
+	/* IPsec state is only meaningful when the packet is secure. */
+	if (ixa->ixa_flags & IXAF_IPSEC_SECURE) {
+		if (ixa->ixa_ipsec_ah_sa != NULL) {
+			ixm->ixm_ipsec_ah_sa = ixa->ixa_ipsec_ah_sa;
+			IPSA_REFHOLD(ixa->ixa_ipsec_ah_sa);
+		}
+		if (ixa->ixa_ipsec_esp_sa != NULL) {
+			ixm->ixm_ipsec_esp_sa = ixa->ixa_ipsec_esp_sa;
+			IPSA_REFHOLD(ixa->ixa_ipsec_esp_sa);
+		}
+		if (ixa->ixa_ipsec_policy != NULL) {
+			ixm->ixm_ipsec_policy = ixa->ixa_ipsec_policy;
+			IPPOL_REFHOLD(ixa->ixa_ipsec_policy);
+		}
+		if (ixa->ixa_ipsec_action != NULL) {
+			ixm->ixm_ipsec_action = ixa->ixa_ipsec_action;
+			IPACT_REFHOLD(ixa->ixa_ipsec_action);
+		}
+		if (ixa->ixa_ipsec_latch != NULL) {
+			ixm->ixm_ipsec_latch = ixa->ixa_ipsec_latch;
+			IPLATCH_REFHOLD(ixa->ixa_ipsec_latch);
+		}
+		ixm->ixm_ipsec_ref[0] = ixa->ixa_ipsec_ref[0];
+		ixm->ixm_ipsec_ref[1] = ixa->ixa_ipsec_ref[1];
+		ixm->ixm_ipsec_src_port = ixa->ixa_ipsec_src_port;
+		ixm->ixm_ipsec_dst_port = ixa->ixa_ipsec_dst_port;
+		ixm->ixm_ipsec_icmp_type = ixa->ixa_ipsec_icmp_type;
+		ixm->ixm_ipsec_icmp_code = ixa->ixa_ipsec_icmp_code;
+		ixm->ixm_ipsec_inaf = ixa->ixa_ipsec_inaf;
+		ixm->ixm_ipsec_insrc[0] = ixa->ixa_ipsec_insrc[0];
+		ixm->ixm_ipsec_insrc[1] = ixa->ixa_ipsec_insrc[1];
+		ixm->ixm_ipsec_insrc[2] = ixa->ixa_ipsec_insrc[2];
+		ixm->ixm_ipsec_insrc[3] = ixa->ixa_ipsec_insrc[3];
+		ixm->ixm_ipsec_indst[0] = ixa->ixa_ipsec_indst[0];
+		ixm->ixm_ipsec_indst[1] = ixa->ixa_ipsec_indst[1];
+		ixm->ixm_ipsec_indst[2] = ixa->ixa_ipsec_indst[2];
+		ixm->ixm_ipsec_indst[3] = ixa->ixa_ipsec_indst[3];
+		ixm->ixm_ipsec_insrcpfx = ixa->ixa_ipsec_insrcpfx;
+		ixm->ixm_ipsec_indstpfx = ixa->ixa_ipsec_indstpfx;
+		ixm->ixm_ipsec_proto = ixa->ixa_ipsec_proto;
+	}
+	return (ixamp);
+}
+
+/*
+ * Extract the ip_xmit_attr_t from the mblk, checking that the
+ * ip_stack_t, ill_t, and nce_t still exist. Returns B_FALSE if that is
+ * not the case.
+ *
+ * Otherwise ixa is updated.
+ * Caller needs to release references on the ixa by calling ixa_refrele()
+ * which will immediately call ixa_inactive to release the references.
+ */
+boolean_t
+ip_xmit_attr_from_mblk(mblk_t *ixamp, ip_xmit_attr_t *ixa)
+{
+	ixamblk_t	*ixm;
+	netstack_t	*ns;
+	ip_stack_t	*ipst;
+	ill_t		*ill;
+	nce_t		*nce;
+
+	/* We assume the caller hasn't initialized ixa */
+	bzero(ixa, sizeof (*ixa));
+
+	ASSERT(DB_TYPE(ixamp) == M_BREAK);
+	ASSERT(ixamp->b_cont == NULL);
+
+	ixm = (ixamblk_t *)ixamp->b_rptr;
+	ASSERT(!ixm->ixm_inbound);
+
+	/* Verify the netstack is still around */
+	ns = netstack_find_by_stackid(ixm->ixm_stackid);
+	if (ns == NULL) {
+		/* Disappeared on us */
+		(void) ip_xmit_attr_free_mblk(ixamp);
+		return (B_FALSE);
+	}
+	ipst = ns->netstack_ip;
+
+	/* Verify the ill is still around */
+	ill = ill_lookup_on_ifindex(ixm->ixm_ifindex,
+	    !(ixm->ixm_flags & IXAF_IS_IPV4), ipst);
+
+	/* We have the ill, hence the netstack can't go away */
+	netstack_rele(ns);
+	if (ill == NULL) {
+		/* Disappeared on us */
+		(void) ip_xmit_attr_free_mblk(ixamp);
+		return (B_FALSE);
+	}
+	/*
+	 * Find the nce. We don't load-spread (only lookup nce's on the ill)
+	 * because we want to find the same nce as the one we had when
+	 * ip_xmit_attr_to_mblk was called.
+	 */
+	if (ixm->ixm_flags & IXAF_IS_IPV4) {
+		nce = nce_lookup_v4(ill, &ixm->ixm_nceaddr_v4);
+	} else {
+		nce = nce_lookup_v6(ill, &ixm->ixm_nceaddr_v6);
+	}
+
+	/* We have the nce, hence the ill can't go away */
+	ill_refrele(ill);
+	if (nce == NULL) {
+		/*
+		 * Since this is unusual and we don't know what type of
+		 * nce it was, we drop the packet.
+		 */
+		(void) ip_xmit_attr_free_mblk(ixamp);
+		return (B_FALSE);
+	}
+
+	/*
+	 * From here on the references held in the ixamblk are transferred
+	 * to the ixa rather than released; hence the mblk is freed with
+	 * freeb() below and not with ip_xmit_attr_free_mblk().
+	 */
+	ixa->ixa_flags = ixm->ixm_flags;
+	ixa->ixa_refcnt = 1;
+	ixa->ixa_ipst = ipst;
+	ixa->ixa_fragsize = ixm->ixm_fragsize;
+	ixa->ixa_pktlen = ixm->ixm_pktlen;
+	ixa->ixa_ip_hdr_length = ixm->ixm_ip_hdr_length;
+	ixa->ixa_protocol = ixm->ixm_protocol;
+	ixa->ixa_nce = nce;
+	ixa->ixa_postfragfn = ixm->ixm_postfragfn;
+	ixa->ixa_zoneid = ixm->ixm_zoneid;
+	ixa->ixa_no_loop_zoneid = ixm->ixm_no_loop_zoneid;
+	ixa->ixa_scopeid = ixm->ixm_scopeid;
+	ixa->ixa_ident = ixm->ixm_ident;
+	ixa->ixa_xmit_hint = ixm->ixm_xmit_hint;
+
+	if (ixm->ixm_tsl != NULL) {
+		ixa->ixa_tsl = ixm->ixm_tsl;
+		ixa->ixa_free_flags |= IXA_FREE_TSL;
+	}
+	if (ixm->ixm_cred != NULL) {
+		ixa->ixa_cred = ixm->ixm_cred;
+		ixa->ixa_free_flags |= IXA_FREE_CRED;
+	}
+	ixa->ixa_cpid = ixm->ixm_cpid;
+
+	ixa->ixa_ipsec_ah_sa = ixm->ixm_ipsec_ah_sa;
+	ixa->ixa_ipsec_esp_sa = ixm->ixm_ipsec_esp_sa;
+	ixa->ixa_ipsec_policy = ixm->ixm_ipsec_policy;
+	ixa->ixa_ipsec_action = ixm->ixm_ipsec_action;
+	ixa->ixa_ipsec_latch = ixm->ixm_ipsec_latch;
+
+	ixa->ixa_ipsec_ref[0] = ixm->ixm_ipsec_ref[0];
+	ixa->ixa_ipsec_ref[1] = ixm->ixm_ipsec_ref[1];
+	ixa->ixa_ipsec_src_port = ixm->ixm_ipsec_src_port;
+	ixa->ixa_ipsec_dst_port = ixm->ixm_ipsec_dst_port;
+	ixa->ixa_ipsec_icmp_type = ixm->ixm_ipsec_icmp_type;
+	ixa->ixa_ipsec_icmp_code = ixm->ixm_ipsec_icmp_code;
+	ixa->ixa_ipsec_inaf = ixm->ixm_ipsec_inaf;
+	ixa->ixa_ipsec_insrc[0] = ixm->ixm_ipsec_insrc[0];
+	ixa->ixa_ipsec_insrc[1] = ixm->ixm_ipsec_insrc[1];
+	ixa->ixa_ipsec_insrc[2] = ixm->ixm_ipsec_insrc[2];
+	ixa->ixa_ipsec_insrc[3] = ixm->ixm_ipsec_insrc[3];
+	ixa->ixa_ipsec_indst[0] = ixm->ixm_ipsec_indst[0];
+	ixa->ixa_ipsec_indst[1] = ixm->ixm_ipsec_indst[1];
+	ixa->ixa_ipsec_indst[2] = ixm->ixm_ipsec_indst[2];
+	ixa->ixa_ipsec_indst[3] = ixm->ixm_ipsec_indst[3];
+	ixa->ixa_ipsec_insrcpfx = ixm->ixm_ipsec_insrcpfx;
+	ixa->ixa_ipsec_indstpfx = ixm->ixm_ipsec_indstpfx;
+	ixa->ixa_ipsec_proto = ixm->ixm_ipsec_proto;
+
+	freeb(ixamp);
+	return (B_TRUE);
+}
+
+/*
+ * Free the ixm mblk and any references it holds.
+ * Each refhold taken by ip_xmit_attr_to_mblk() (SAs, policy, action,
+ * latch, label, cred) is released here, and the pointers are cleared
+ * defensively before the mblk is freed.
+ * Returns b_cont.
+ */
+mblk_t *
+ip_xmit_attr_free_mblk(mblk_t *ixamp)
+{
+	ixamblk_t	*ixm;
+	mblk_t		*mp;
+
+	/* Consume mp */
+	ASSERT(DB_TYPE(ixamp) == M_BREAK);
+	mp = ixamp->b_cont;
+
+	ixm = (ixamblk_t *)ixamp->b_rptr;
+	ASSERT(!ixm->ixm_inbound);
+
+	if (ixm->ixm_ipsec_ah_sa != NULL) {
+		IPSA_REFRELE(ixm->ixm_ipsec_ah_sa);
+		ixm->ixm_ipsec_ah_sa = NULL;
+	}
+	if (ixm->ixm_ipsec_esp_sa != NULL) {
+		IPSA_REFRELE(ixm->ixm_ipsec_esp_sa);
+		ixm->ixm_ipsec_esp_sa = NULL;
+	}
+	if (ixm->ixm_ipsec_policy != NULL) {
+		IPPOL_REFRELE(ixm->ixm_ipsec_policy);
+		ixm->ixm_ipsec_policy = NULL;
+	}
+	if (ixm->ixm_ipsec_action != NULL) {
+		IPACT_REFRELE(ixm->ixm_ipsec_action);
+		ixm->ixm_ipsec_action = NULL;
+	}
+	if (ixm->ixm_ipsec_latch != NULL) {
+		IPLATCH_REFRELE(ixm->ixm_ipsec_latch);
+		ixm->ixm_ipsec_latch = NULL;
+	}
+
+	if (ixm->ixm_tsl != NULL) {
+		label_rele(ixm->ixm_tsl);
+		ixm->ixm_tsl = NULL;
+	}
+	if (ixm->ixm_cred != NULL) {
+		crfree(ixm->ixm_cred);
+		ixm->ixm_cred = NULL;
+	}
+	freeb(ixamp);
+	return (mp);
+}
+
+/*
+ * Take the information in ip_recv_attr_t and stick it in an mblk
+ * that can later be passed to ip_recv_attr_from_mblk to recreate the
+ * ip_recv_attr_t.
+ *
+ * Pointer members (cred, label, SAs, action) get an extra refhold here;
+ * those references travel with the mblk and are released by
+ * ip_recv_attr_free_mblk() or transferred by ip_recv_attr_from_mblk().
+ *
+ * Returns NULL on memory allocation failure.
+ */
+mblk_t *
+ip_recv_attr_to_mblk(ip_recv_attr_t *ira)
+{
+	mblk_t		*iramp;
+	iramblk_t	*irm;
+	ill_t		*ill = ira->ira_ill;
+
+	ASSERT(ira->ira_ill != NULL || ira->ira_ruifindex != 0);
+
+	iramp = allocb(sizeof (*irm), BPRI_MED);
+	if (iramp == NULL)
+		return (NULL);
+
+	/* M_BREAK marks this mblk as carrying serialized attributes. */
+	iramp->b_datap->db_type = M_BREAK;
+	iramp->b_wptr += sizeof (*irm);
+	irm = (iramblk_t *)iramp->b_rptr;
+
+	bzero(irm, sizeof (*irm));
+	irm->irm_inbound = B_TRUE;
+	irm->irm_flags = ira->ira_flags;
+	if (ill != NULL) {
+		/* Internal to IP - preserve ip_stack_t, ill and rill */
+		irm->irm_stackid =
+		    ill->ill_ipst->ips_netstack->netstack_stackid;
+		irm->irm_ifindex = ira->ira_ill->ill_phyint->phyint_ifindex;
+		ASSERT(ira->ira_rill->ill_phyint->phyint_ifindex ==
+		    ira->ira_rifindex);
+	} else {
+		/* Let ip_recv_attr_from_stackid know there isn't one */
+		irm->irm_stackid = -1;
+	}
+	irm->irm_rifindex = ira->ira_rifindex;
+	irm->irm_ruifindex = ira->ira_ruifindex;
+	irm->irm_pktlen = ira->ira_pktlen;
+	irm->irm_ip_hdr_length = ira->ira_ip_hdr_length;
+	irm->irm_protocol = ira->ira_protocol;
+
+	irm->irm_sqp = ira->ira_sqp;
+	irm->irm_ring = ira->ira_ring;
+
+	irm->irm_zoneid = ira->ira_zoneid;
+	irm->irm_mroute_tunnel = ira->ira_mroute_tunnel;
+	irm->irm_no_loop_zoneid = ira->ira_no_loop_zoneid;
+	irm->irm_esp_udp_ports = ira->ira_esp_udp_ports;
+
+	if (ira->ira_tsl != NULL) {
+		irm->irm_tsl = ira->ira_tsl;
+		label_hold(irm->irm_tsl);
+	}
+	if (ira->ira_cred != NULL) {
+		irm->irm_cred = ira->ira_cred;
+		crhold(ira->ira_cred);
+	}
+	irm->irm_cpid = ira->ira_cpid;
+
+	/* Link-layer source address only valid when the flag says so. */
+	if (ira->ira_flags & IRAF_L2SRC_SET)
+		bcopy(ira->ira_l2src, irm->irm_l2src, IRA_L2SRC_SIZE);
+
+	/* IPsec state is only meaningful when the packet is secure. */
+	if (ira->ira_flags & IRAF_IPSEC_SECURE) {
+		if (ira->ira_ipsec_ah_sa != NULL) {
+			irm->irm_ipsec_ah_sa = ira->ira_ipsec_ah_sa;
+			IPSA_REFHOLD(ira->ira_ipsec_ah_sa);
+		}
+		if (ira->ira_ipsec_esp_sa != NULL) {
+			irm->irm_ipsec_esp_sa = ira->ira_ipsec_esp_sa;
+			IPSA_REFHOLD(ira->ira_ipsec_esp_sa);
+		}
+		if (ira->ira_ipsec_action != NULL) {
+			irm->irm_ipsec_action = ira->ira_ipsec_action;
+			IPACT_REFHOLD(ira->ira_ipsec_action);
+		}
+	}
+	return (iramp);
+}
+
+/*
+ * Extract the ip_recv_attr_t from the mblk. If we are used inside IP
+ * then irm_stackid is not -1, in which case we check that the
+ * ip_stack_t and ill_t still exist. Returns B_FALSE if that is
+ * not the case.
+ * If irm_stackid is -1 then we are used by an ULP (e.g., squeue_enter)
+ * and we just proceed with ira_ill and ira_rill as NULL.
+ *
+ * The caller needs to release any references on the pointers inside the ira
+ * by calling ira_cleanup.
+ */
+boolean_t
+ip_recv_attr_from_mblk(mblk_t *iramp, ip_recv_attr_t *ira)
+{
+	iramblk_t	*irm;
+	netstack_t	*ns;
+	ip_stack_t	*ipst = NULL;
+	ill_t		*ill = NULL, *rill = NULL;
+
+	/* We assume the caller hasn't initialized ira */
+	bzero(ira, sizeof (*ira));
+
+	ASSERT(DB_TYPE(iramp) == M_BREAK);
+	ASSERT(iramp->b_cont == NULL);
+
+	irm = (iramblk_t *)iramp->b_rptr;
+	ASSERT(irm->irm_inbound);
+
+	if (irm->irm_stackid != -1) {
+		/* Verify the netstack is still around */
+		ns = netstack_find_by_stackid(irm->irm_stackid);
+		if (ns == NULL) {
+			/* Disappeared on us */
+			(void) ip_recv_attr_free_mblk(iramp);
+			return (B_FALSE);
+		}
+		ipst = ns->netstack_ip;
+
+		/* Verify the ill is still around */
+		ill = ill_lookup_on_ifindex(irm->irm_ifindex,
+		    !(irm->irm_flags & IRAF_IS_IPV4), ipst);
+
+		/* Same ifindex means the receive ill is the same ill. */
+		if (irm->irm_ifindex == irm->irm_rifindex) {
+			rill = ill;
+		} else {
+			rill = ill_lookup_on_ifindex(irm->irm_rifindex,
+			    !(irm->irm_flags & IRAF_IS_IPV4), ipst);
+		}
+
+		/* We have the ill, hence the netstack can't go away */
+		netstack_rele(ns);
+		if (ill == NULL || rill == NULL) {
+			/* Disappeared on us */
+			if (ill != NULL)
+				ill_refrele(ill);
+			if (rill != NULL && rill != ill)
+				ill_refrele(rill);
+			(void) ip_recv_attr_free_mblk(iramp);
+			return (B_FALSE);
+		}
+	}
+
+	/*
+	 * From here on the references held in the iramblk are transferred
+	 * to the ira; hence the mblk is freed with freeb() below and not
+	 * with ip_recv_attr_free_mblk().
+	 */
+	ira->ira_flags = irm->irm_flags;
+	/* Caller must ill_refrele(ira_ill) by using ira_cleanup() */
+	ira->ira_ill = ill;
+	ira->ira_rill = rill;
+
+	ira->ira_rifindex = irm->irm_rifindex;
+	ira->ira_ruifindex = irm->irm_ruifindex;
+	ira->ira_pktlen = irm->irm_pktlen;
+	ira->ira_ip_hdr_length = irm->irm_ip_hdr_length;
+	ira->ira_protocol = irm->irm_protocol;
+
+	ira->ira_sqp = irm->irm_sqp;
+	/* The rest of IP assumes that the rings never go away. */
+	ira->ira_ring = irm->irm_ring;
+
+	ira->ira_zoneid = irm->irm_zoneid;
+	ira->ira_mroute_tunnel = irm->irm_mroute_tunnel;
+	ira->ira_no_loop_zoneid = irm->irm_no_loop_zoneid;
+	ira->ira_esp_udp_ports = irm->irm_esp_udp_ports;
+
+	if (irm->irm_tsl != NULL) {
+		ira->ira_tsl = irm->irm_tsl;
+		ira->ira_free_flags |= IRA_FREE_TSL;
+	}
+	if (irm->irm_cred != NULL) {
+		ira->ira_cred = irm->irm_cred;
+		ira->ira_free_flags |= IRA_FREE_CRED;
+	}
+	ira->ira_cpid = irm->irm_cpid;
+
+	if (ira->ira_flags & IRAF_L2SRC_SET)
+		bcopy(irm->irm_l2src, ira->ira_l2src, IRA_L2SRC_SIZE);
+
+	ira->ira_ipsec_ah_sa = irm->irm_ipsec_ah_sa;
+	ira->ira_ipsec_esp_sa = irm->irm_ipsec_esp_sa;
+	ira->ira_ipsec_action = irm->irm_ipsec_action;
+
+	freeb(iramp);
+	return (B_TRUE);
+}
+
+/*
+ * Free the irm mblk and any references it holds.
+ * Each refhold taken by ip_recv_attr_to_mblk() (SAs, action, label, cred)
+ * is released here, and the pointers are cleared defensively before the
+ * mblk is freed.
+ * Returns b_cont.
+ */
+mblk_t *
+ip_recv_attr_free_mblk(mblk_t *iramp)
+{
+	iramblk_t	*irm;
+	mblk_t		*mp;
+
+	/* Consume mp */
+	ASSERT(DB_TYPE(iramp) == M_BREAK);
+	mp = iramp->b_cont;
+
+	irm = (iramblk_t *)iramp->b_rptr;
+	ASSERT(irm->irm_inbound);
+
+	if (irm->irm_ipsec_ah_sa != NULL) {
+		IPSA_REFRELE(irm->irm_ipsec_ah_sa);
+		irm->irm_ipsec_ah_sa = NULL;
+	}
+	if (irm->irm_ipsec_esp_sa != NULL) {
+		IPSA_REFRELE(irm->irm_ipsec_esp_sa);
+		irm->irm_ipsec_esp_sa = NULL;
+	}
+	if (irm->irm_ipsec_action != NULL) {
+		IPACT_REFRELE(irm->irm_ipsec_action);
+		irm->irm_ipsec_action = NULL;
+	}
+	if (irm->irm_tsl != NULL) {
+		label_rele(irm->irm_tsl);
+		irm->irm_tsl = NULL;
+	}
+	if (irm->irm_cred != NULL) {
+		crfree(irm->irm_cred);
+		irm->irm_cred = NULL;
+	}
+
+	freeb(iramp);
+	return (mp);
+}
+
+/*
+ * Returns true if the mblk contains an ip_recv_attr_t
+ * For now we just check db_type.
+ */
+boolean_t
+ip_recv_attr_is_mblk(mblk_t *mp)
+{
+	/*
+	 * Need to handle the various forms of tcp_timermp which are tagged
+	 * with b_wptr and might have a NULL b_datap.
+	 */
+	if (mp->b_wptr == NULL || mp->b_wptr == (uchar_t *)-1)
+		return (B_FALSE);
+
+#ifdef	DEBUG
+	/* On DEBUG kernels additionally verify the direction flag. */
+	iramblk_t	*irm;
+
+	if (DB_TYPE(mp) != M_BREAK)
+		return (B_FALSE);
+
+	irm = (iramblk_t *)mp->b_rptr;
+	ASSERT(irm->irm_inbound);
+	return (B_TRUE);
+#else
+	return (DB_TYPE(mp) == M_BREAK);
+#endif
+}
+
+/*
+ * Common implementation for conn_get_ixa()/conn_get_ixa_tryhard().
+ * Returns conn_ixa directly when no other thread holds a reference to
+ * it; otherwise allocates (with 'kmflag') a safe copy, optionally
+ * installing the copy as the new conn_ixa when 'replace' is set.
+ * Returns NULL only when the allocation fails.
+ */
+static ip_xmit_attr_t *
+conn_get_ixa_impl(conn_t *connp, boolean_t replace, int kmflag)
+{
+	ip_xmit_attr_t	*ixa;
+	ip_xmit_attr_t	*oldixa;
+
+	mutex_enter(&connp->conn_lock);
+	ixa = connp->conn_ixa;
+
+	/* At least one references for the conn_t */
+	ASSERT(ixa->ixa_refcnt >= 1);
+	if (atomic_add_32_nv(&ixa->ixa_refcnt, 1) == 2) {
+		/* No other thread using conn_ixa */
+		mutex_exit(&connp->conn_lock);
+		return (ixa);
+	}
+	ixa = kmem_alloc(sizeof (*ixa), kmflag);
+	if (ixa == NULL) {
+		mutex_exit(&connp->conn_lock);
+		/*
+		 * NOTE(review): conn_ixa is re-read here after conn_lock has
+		 * been dropped; if another thread can replace conn_ixa
+		 * concurrently, the wrong ixa would be released — confirm
+		 * callers serialize, or release the saved pointer instead.
+		 */
+		ixa_refrele(connp->conn_ixa);
+		return (NULL);
+	}
+	ixa_safe_copy(connp->conn_ixa, ixa);
+
+	/* Make sure we drop conn_lock before any refrele */
+	if (replace) {
+		ixa->ixa_refcnt++;	/* No atomic needed - not visible */
+		oldixa = connp->conn_ixa;
+		connp->conn_ixa = ixa;
+		mutex_exit(&connp->conn_lock);
+		IXA_REFRELE(oldixa);	/* Undo refcnt from conn_t */
+	} else {
+		oldixa = connp->conn_ixa;
+		mutex_exit(&connp->conn_lock);
+	}
+	IXA_REFRELE(oldixa);	/* Undo above atomic_add_32_nv */
+
+	return (ixa);
+}
+
+/*
+ * Return an ip_xmit_attr_t to use with a conn_t that ensures that only
+ * the caller can access the ip_xmit_attr_t.
+ *
+ * If nobody else is using conn_ixa we return it.
+ * Otherwise we make a "safe" copy of conn_ixa
+ * and return it. The "safe" copy has the pointers set to NULL
+ * (since the pointers might be changed by another thread using
+ * conn_ixa). The caller needs to check for NULL pointers to see
+ * if ip_set_destination needs to be called to re-establish the pointers.
+ *
+ * If 'replace' is set then we replace conn_ixa with the new ip_xmit_attr_t.
+ * That is used when we connect() the ULP.
+ *
+ * Uses KM_NOSLEEP, so may return NULL under memory pressure.
+ */
+ip_xmit_attr_t *
+conn_get_ixa(conn_t *connp, boolean_t replace)
+{
+	return (conn_get_ixa_impl(connp, replace, KM_NOSLEEP));
+}
+
+/*
+ * Used only when the option is to have the kernel hang due to not
+ * cleaning up ixa references on ills etc.
+ * Same as conn_get_ixa() but with KM_SLEEP, so the allocation blocks
+ * rather than fail.
+ */
+ip_xmit_attr_t *
+conn_get_ixa_tryhard(conn_t *connp, boolean_t replace)
+{
+	return (conn_get_ixa_impl(connp, replace, KM_SLEEP));
+}
+
+/*
+ * Install 'ixa' as the conn's transmit attributes, taking a reference
+ * on it for the conn_t.
+ *
+ * The caller must hold conn_lock.
+ *
+ * The previous conn_ixa is returned; the caller must ixa_refrele() it
+ * once conn_lock has been dropped (refrele under the lock could trigger
+ * ixa_inactive).
+ */
+ip_xmit_attr_t *
+conn_replace_ixa(conn_t *connp, ip_xmit_attr_t *ixa)
+{
+	ip_xmit_attr_t	*prev_ixa = connp->conn_ixa;
+
+	ASSERT(MUTEX_HELD(&connp->conn_lock));
+
+	/* The conn's reference moves to the new attributes. */
+	IXA_REFHOLD(ixa);
+	connp->conn_ixa = ixa;
+
+	return (prev_ixa);
+}
+
+/*
+ * Return a ip_xmit_attr_t to use with a conn_t that is based on but
+ * separate from conn_ixa.
+ *
+ * This "safe" copy has the pointers set to NULL
+ * (since the pointers might be changed by another thread using
+ * conn_ixa). The caller needs to check for NULL pointers to see
+ * if ip_set_destination needs to be called to re-establish the pointers.
+ *
+ * Uses KM_NOSLEEP, so may return NULL under memory pressure.
+ */
+ip_xmit_attr_t *
+conn_get_ixa_exclusive(conn_t *connp)
+{
+	ip_xmit_attr_t	*ixa;
+
+	mutex_enter(&connp->conn_lock);
+	ixa = connp->conn_ixa;
+
+	/* At least one references for the conn_t */
+	ASSERT(ixa->ixa_refcnt >= 1);
+
+	/* Make sure conn_ixa doesn't disappear while we copy it */
+	atomic_add_32(&ixa->ixa_refcnt, 1);
+
+	ixa = kmem_alloc(sizeof (*ixa), KM_NOSLEEP);
+	if (ixa == NULL) {
+		mutex_exit(&connp->conn_lock);
+		ixa_refrele(connp->conn_ixa);
+		return (NULL);
+	}
+	ixa_safe_copy(connp->conn_ixa, ixa);
+	mutex_exit(&connp->conn_lock);
+	/*
+	 * NOTE(review): conn_ixa is re-read after conn_lock has been
+	 * dropped (here and in the failure path above); if another thread
+	 * can replace conn_ixa concurrently, the wrong ixa would be
+	 * released — confirm callers serialize.
+	 */
+	IXA_REFRELE(connp->conn_ixa);
+	return (ixa);
+}
+
+/*
+ * Copy 'src' into 'ixa' in a way that is safe even when other threads
+ * are still using 'src': cached pointers (ire/nce/dce and all IPsec
+ * state) are cleared rather than shared, their generations are set to
+ * VERIFY so they get re-established, and label/cred references that the
+ * copy will own are re-held.
+ */
+void
+ixa_safe_copy(ip_xmit_attr_t *src, ip_xmit_attr_t *ixa)
+{
+	bcopy(src, ixa, sizeof (*ixa));
+	ixa->ixa_refcnt = 1;
+	/*
+	 * Clear any pointers that have references and might be changed
+	 * by ip_set_destination or the ULP
+	 */
+	ixa->ixa_ire = NULL;
+	ixa->ixa_nce = NULL;
+	ixa->ixa_dce = NULL;
+	ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
+	ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
+#ifdef DEBUG
+	ixa->ixa_curthread = NULL;
+#endif
+	/* Clear all the IPsec pointers and the flag as well. */
+	ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
+
+	ixa->ixa_ipsec_latch = NULL;
+	ixa->ixa_ipsec_ah_sa = NULL;
+	ixa->ixa_ipsec_esp_sa = NULL;
+	ixa->ixa_ipsec_policy = NULL;
+	ixa->ixa_ipsec_action = NULL;
+
+	/*
+	 * We leave ixa_tsl unchanged, but if it has a refhold we need
+	 * to get an extra refhold.
+	 */
+	if (ixa->ixa_free_flags & IXA_FREE_TSL)
+		label_hold(ixa->ixa_tsl);
+
+	/*
+	 * We leave ixa_cred unchanged, but if it has a refhold we need
+	 * to get an extra refhold.
+	 */
+	if (ixa->ixa_free_flags & IXA_FREE_CRED)
+		crhold(ixa->ixa_cred);
+}
+
+/*
+ * Duplicate an ip_xmit_attr_t.
+ * Assumes that the caller controls the ixa, hence we do not need to use
+ * a safe copy. We just have to increase the refcnt on any pointers.
+ *
+ * Uses KM_NOSLEEP; returns NULL on allocation failure.
+ */
+ip_xmit_attr_t *
+ip_xmit_attr_duplicate(ip_xmit_attr_t *src_ixa)
+{
+	ip_xmit_attr_t *ixa;
+
+	ixa = kmem_alloc(sizeof (*ixa), KM_NOSLEEP);
+	if (ixa == NULL)
+		return (NULL);
+	bcopy(src_ixa, ixa, sizeof (*ixa));
+	ixa->ixa_refcnt = 1;
+
+	/* Unlike ixa_safe_copy(), cached pointers are kept and re-held. */
+	if (ixa->ixa_ire != NULL)
+		ire_refhold_notr(ixa->ixa_ire);
+	if (ixa->ixa_nce != NULL)
+		nce_refhold(ixa->ixa_nce);
+	if (ixa->ixa_dce != NULL)
+		dce_refhold_notr(ixa->ixa_dce);
+
+#ifdef DEBUG
+	ixa->ixa_curthread = NULL;
+#endif
+
+	if (ixa->ixa_ipsec_latch != NULL)
+		IPLATCH_REFHOLD(ixa->ixa_ipsec_latch);
+	if (ixa->ixa_ipsec_ah_sa != NULL)
+		IPSA_REFHOLD(ixa->ixa_ipsec_ah_sa);
+	if (ixa->ixa_ipsec_esp_sa != NULL)
+		IPSA_REFHOLD(ixa->ixa_ipsec_esp_sa);
+	if (ixa->ixa_ipsec_policy != NULL)
+		IPPOL_REFHOLD(ixa->ixa_ipsec_policy);
+	if (ixa->ixa_ipsec_action != NULL)
+		IPACT_REFHOLD(ixa->ixa_ipsec_action);
+
+	/* The duplicate owns its own label/cred references. */
+	if (ixa->ixa_tsl != NULL) {
+		label_hold(ixa->ixa_tsl);
+		ixa->ixa_free_flags |= IXA_FREE_TSL;
+	}
+	if (ixa->ixa_cred != NULL) {
+		crhold(ixa->ixa_cred);
+		ixa->ixa_free_flags |= IXA_FREE_CRED;
+	}
+	return (ixa);
+}
+
+/*
+ * Used to replace the ixa_tsl field.
+ * The caller should have a reference on the label, which we transfer to
+ * the attributes so that when the attribute is freed/cleaned up
+ * we will release that reference.
+ */
+void
+ip_xmit_attr_replace_tsl(ip_xmit_attr_t *ixa, ts_label_t *tsl)
+{
+	ASSERT(tsl != NULL);
+
+	if ((ixa->ixa_free_flags & IXA_FREE_TSL) == 0) {
+		/* First owned label: mark it for release on cleanup. */
+		ixa->ixa_free_flags |= IXA_FREE_TSL;
+	} else {
+		/* Drop the reference on the label being displaced. */
+		ASSERT(ixa->ixa_tsl != NULL);
+		label_rele(ixa->ixa_tsl);
+	}
+	ixa->ixa_tsl = tsl;
+}
+
+/*
+ * Replace the ip_recv_attr_t's label.
+ * Due to kernel RPC's use of db_credp we also need to replace ira_cred;
+ * TCP/UDP uses ira_cred to set db_credp for non-socket users.
+ * This can fail (and return B_FALSE) due to lack of memory.
+ * Note that even on failure the label itself has already been replaced;
+ * only the cred update is skipped.
+ */
+boolean_t
+ip_recv_attr_replace_label(ip_recv_attr_t *ira, ts_label_t *tsl)
+{
+	cred_t	*newcr;
+
+	if (ira->ira_free_flags & IRA_FREE_TSL) {
+		ASSERT(ira->ira_tsl != NULL);
+		label_rele(ira->ira_tsl);
+	}
+	label_hold(tsl);
+	ira->ira_tsl = tsl;
+	ira->ira_free_flags |= IRA_FREE_TSL;
+
+	/*
+	 * Reset zoneid if we have a shared address. That allows
+	 * ip_fanout_tx_v4/v6 to determine the zoneid again.
+	 */
+	if (ira->ira_flags & IRAF_TX_SHARED_ADDR)
+		ira->ira_zoneid = ALL_ZONES;
+
+	/* We update ira_cred for RPC */
+	newcr = copycred_from_tslabel(ira->ira_cred, ira->ira_tsl, KM_NOSLEEP);
+	if (newcr == NULL)
+		return (B_FALSE);
+	if (ira->ira_free_flags & IRA_FREE_CRED)
+		crfree(ira->ira_cred);
+	ira->ira_cred = newcr;
+	ira->ira_free_flags |= IRA_FREE_CRED;
+	return (B_TRUE);
+}
+
+/*
+ * This needs to be called after ip_set_destination/tsol_check_dest might
+ * have changed ixa_tsl to be specific for a destination, and we now want to
+ * send to a different destination.
+ * We have to restart with crgetlabel() since ip_set_destination/
+ * tsol_check_dest will start with ixa_tsl.
+ *
+ * No-op on unlabeled (non-Trusted Extensions) systems.
+ */
+void
+ip_xmit_attr_restore_tsl(ip_xmit_attr_t *ixa, cred_t *cr)
+{
+	if (!is_system_labeled())
+		return;
+
+	/* Release any destination-specific label we own. */
+	if (ixa->ixa_free_flags & IXA_FREE_TSL) {
+		ASSERT(ixa->ixa_tsl != NULL);
+		label_rele(ixa->ixa_tsl);
+		ixa->ixa_free_flags &= ~IXA_FREE_TSL;
+	}
+	/* Borrow the cred's label; no reference is taken. */
+	ixa->ixa_tsl = crgetlabel(cr);
+}
+
+/*
+ * Function wrapper around IXA_REFRELE for callers outside this file;
+ * frees the ixa via ixa_inactive() when the last reference is dropped.
+ */
+void
+ixa_refrele(ip_xmit_attr_t *ixa)
+{
+	IXA_REFRELE(ixa);
+}
+
+/*
+ * Final teardown once the refcnt has hit zero: release everything the
+ * ixa holds and free the structure itself.
+ */
+void
+ixa_inactive(ip_xmit_attr_t *ixa)
+{
+	ASSERT(ixa->ixa_refcnt == 0);
+
+	ixa_cleanup(ixa);
+	kmem_free(ixa, sizeof (*ixa));
+}
+
+/*
+ * Release any references contained in the ixa.
+ * Also clear any fields that are not controlled by ixa_flags.
+ * The ixa itself is not freed; it can be reused after this.
+ */
+void
+ixa_cleanup(ip_xmit_attr_t *ixa)
+{
+	if (ixa->ixa_ire != NULL) {
+		ire_refrele_notr(ixa->ixa_ire);
+		ixa->ixa_ire = NULL;
+	}
+	if (ixa->ixa_dce != NULL) {
+		dce_refrele_notr(ixa->ixa_dce);
+		ixa->ixa_dce = NULL;
+	}
+	if (ixa->ixa_nce != NULL) {
+		nce_refrele(ixa->ixa_nce);
+		ixa->ixa_nce = NULL;
+	}
+	/* Force re-verification the next time the cached entries are used. */
+	ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
+	ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
+	if (ixa->ixa_flags & IXAF_IPSEC_SECURE) {
+		ipsec_out_release_refs(ixa);
+	}
+	if (ixa->ixa_free_flags & IXA_FREE_TSL) {
+		ASSERT(ixa->ixa_tsl != NULL);
+		label_rele(ixa->ixa_tsl);
+		ixa->ixa_tsl = NULL;
+		ixa->ixa_free_flags &= ~IXA_FREE_TSL;
+	}
+	if (ixa->ixa_free_flags & IXA_FREE_CRED) {
+		ASSERT(ixa->ixa_cred != NULL);
+		crfree(ixa->ixa_cred);
+		ixa->ixa_cred = NULL;
+		ixa->ixa_free_flags &= ~IXA_FREE_CRED;
+	}
+	ixa->ixa_src_preferences = 0;
+	ixa->ixa_ifindex = 0;
+	ixa->ixa_multicast_ifindex = 0;
+	ixa->ixa_multicast_ifaddr = INADDR_ANY;
+}
+
+/*
+ * Release any references contained in the ira.
+ * Callers which use ip_recv_attr_from_mblk() would pass B_TRUE as the second
+ * argument, since in that case the ira owns a reference on ira_ill.
+ */
+void
+ira_cleanup(ip_recv_attr_t *ira, boolean_t refrele_ill)
+{
+	if (ira->ira_ill != NULL) {
+		if (ira->ira_rill != ira->ira_ill) {
+			/* Caused by async processing */
+			ill_refrele(ira->ira_rill);
+		}
+		if (refrele_ill)
+			ill_refrele(ira->ira_ill);
+	}
+	if (ira->ira_flags & IRAF_IPSEC_SECURE) {
+		ipsec_in_release_refs(ira);
+	}
+	if (ira->ira_free_flags & IRA_FREE_TSL) {
+		ASSERT(ira->ira_tsl != NULL);
+		label_rele(ira->ira_tsl);
+		ira->ira_tsl = NULL;
+		ira->ira_free_flags &= ~IRA_FREE_TSL;
+	}
+	if (ira->ira_free_flags & IRA_FREE_CRED) {
+		ASSERT(ira->ira_cred != NULL);
+		crfree(ira->ira_cred);
+		ira->ira_cred = NULL;
+		ira->ira_free_flags &= ~IRA_FREE_CRED;
+	}
+}
+
+/*
+ * Function to help release any IRE, NCE, or DCEs that
+ * have been deleted and are marked as condemned.
+ * Condemned IREs/DCEs are replaced with the blackhole/default entries so
+ * the ixa always keeps a usable pointer; a condemned NCE is simply
+ * dropped.  In each case the generation is set to VERIFY so the entry is
+ * re-established on next use.
+ * The caller is responsible for any serialization which is different
+ * for TCP, SCTP, and others.
+ */
+static void
+ixa_cleanup_stale(ip_xmit_attr_t *ixa)
+{
+	ire_t	*ire;
+	nce_t	*nce;
+	dce_t	*dce;
+
+	ire = ixa->ixa_ire;
+	nce = ixa->ixa_nce;
+	dce = ixa->ixa_dce;
+
+	if (ire != NULL && IRE_IS_CONDEMNED(ire)) {
+		ire_refrele_notr(ire);
+		ire = ire_blackhole(ixa->ixa_ipst,
+		    !(ixa->ixa_flags & IXAF_IS_IPV4));
+		ASSERT(ire != NULL);
+#ifdef DEBUG
+		/* Convert the tracked hold into an untracked one. */
+		ire_refhold_notr(ire);
+		ire_refrele(ire);
+#endif
+		ixa->ixa_ire = ire;
+		ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
+	}
+	if (nce != NULL && nce->nce_is_condemned) {
+		/* Can make it NULL as long as we set IRE_GENERATION_VERIFY */
+		nce_refrele(nce);
+		ixa->ixa_nce = NULL;
+		ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
+	}
+	if (dce != NULL && DCE_IS_CONDEMNED(dce)) {
+		dce_refrele_notr(dce);
+		dce = dce_get_default(ixa->ixa_ipst);
+		ASSERT(dce != NULL);
+#ifdef DEBUG
+		/* Convert the tracked hold into an untracked one. */
+		dce_refhold_notr(dce);
+		dce_refrele(dce);
+#endif
+		ixa->ixa_dce = dce;
+		ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
+	}
+}
+
+/*
+ * Used to run ixa_cleanup_stale inside the tcp squeue.
+ * When done we hand the mp back by assigning it to tcps_ixa_cleanup_mp
+ * and waking up the caller.
+ *
+ * arg is the conn_t whose conn_ixa is to be scrubbed; mp is the
+ * per-stack serialization mblk handed out by conn_ixa_cleanup().
+ */
+/* ARGSUSED2 */
+static void
+tcp_ixa_cleanup(void *arg, mblk_t *mp, void *arg2,
+ ip_recv_attr_t *dummy)
+{
+ conn_t *connp = (conn_t *)arg;
+ tcp_stack_t *tcps;
+
+ tcps = connp->conn_netstack->netstack_tcp;
+
+ ixa_cleanup_stale(connp->conn_ixa);
+
+ /* Return the mblk and wake the thread blocked in conn_ixa_cleanup */
+ mutex_enter(&tcps->tcps_ixa_cleanup_lock);
+ ASSERT(tcps->tcps_ixa_cleanup_mp == NULL);
+ tcps->tcps_ixa_cleanup_mp = mp;
+ cv_signal(&tcps->tcps_ixa_cleanup_cv);
+ mutex_exit(&tcps->tcps_ixa_cleanup_lock);
+}
+
+
+/*
+ * ipcl_walk() function to help release any IRE, NCE, or DCEs that
+ * have been deleted and are marked as condemned.
+ * Note that we can't cleanup the pointers since there can be threads
+ * in conn_ip_output() sending while we are called.
+ *
+ * arg is a pointer-sized boolean (tryhard): B_TRUE when the caller must
+ * succeed (e.g., an unplumb), B_FALSE for best-effort memory reclaim.
+ */
+void
+conn_ixa_cleanup(conn_t *connp, void *arg)
+{
+ boolean_t tryhard = (boolean_t)arg;
+
+ if (IPCL_IS_TCP(connp)) {
+ mblk_t *mp;
+ tcp_stack_t *tcps;
+
+ tcps = connp->conn_netstack->netstack_tcp;
+
+ /*
+ * Serialize on the single per-stack mblk: only the holder
+ * of tcps_ixa_cleanup_mp may enter the squeue below.
+ */
+ mutex_enter(&tcps->tcps_ixa_cleanup_lock);
+ while ((mp = tcps->tcps_ixa_cleanup_mp) == NULL) {
+ /*
+ * Multiple concurrent cleanups; need to have the last
+ * one run since it could be an unplumb.
+ */
+ cv_wait(&tcps->tcps_ixa_cleanup_cv,
+ &tcps->tcps_ixa_cleanup_lock);
+ }
+ tcps->tcps_ixa_cleanup_mp = NULL;
+ mutex_exit(&tcps->tcps_ixa_cleanup_lock);
+
+ if (connp->conn_sqp->sq_run == curthread) {
+ /* Already on squeue */
+ tcp_ixa_cleanup(connp, mp, NULL, NULL);
+ } else {
+ CONN_INC_REF(connp);
+ SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_ixa_cleanup,
+ connp, NULL, SQ_PROCESS, SQTAG_TCP_IXA_CLEANUP);
+
+ /* Wait until tcp_ixa_cleanup has run */
+ mutex_enter(&tcps->tcps_ixa_cleanup_lock);
+ while (tcps->tcps_ixa_cleanup_mp == NULL) {
+ cv_wait(&tcps->tcps_ixa_cleanup_cv,
+ &tcps->tcps_ixa_cleanup_lock);
+ }
+ mutex_exit(&tcps->tcps_ixa_cleanup_lock);
+ }
+ } else if (IPCL_IS_SCTP(connp)) {
+ sctp_t *sctp;
+ sctp_faddr_t *fp;
+
+ /* RUN_SCTP/WAKE_SCTP provide the SCTP serialization */
+ sctp = CONN2SCTP(connp);
+ RUN_SCTP(sctp);
+ ixa_cleanup_stale(connp->conn_ixa);
+ for (fp = sctp->sctp_faddrs; fp != NULL; fp = fp->next)
+ ixa_cleanup_stale(fp->ixa);
+ WAKE_SCTP(sctp);
+ } else {
+ ip_xmit_attr_t *ixa;
+
+ /*
+ * If there is a different thread using conn_ixa then we get a
+ * new copy and cut the old one loose from conn_ixa. Otherwise
+ * we use conn_ixa and prevent any other thread from
+ * using/changing it. Anybody using conn_ixa (e.g., a thread in
+ * conn_ip_output) will do an ixa_refrele which will remove any
+ * references on the ire etc.
+ *
+ * Once we are done other threads can use conn_ixa since the
+ * refcnt will be back at one.
+ *
+ * We are called either because an ill is going away, or
+ * due to memory reclaim. In the former case we wait for
+ * memory since we must remove the refcnts on the ill.
+ */
+ if (tryhard) {
+ ixa = conn_get_ixa_tryhard(connp, B_TRUE);
+ ASSERT(ixa != NULL);
+ } else {
+ ixa = conn_get_ixa(connp, B_TRUE);
+ if (ixa == NULL) {
+ /*
+ * Somebody else was using it and kmem_alloc
+ * failed! Next memory reclaim will try to
+ * clean up.
+ */
+ DTRACE_PROBE1(conn__ixa__cleanup__bail,
+ conn_t *, connp);
+ return;
+ }
+ }
+ ixa_cleanup_stale(ixa);
+ ixa_refrele(ixa);
+ }
+}
+
+/*
+ * ixa needs to be an exclusive copy so that no one changes the cookie
+ * or the ixa_nce.
+ *
+ * Inserts the conn on the appropriate drain list when transmission is
+ * flow-controlled. Returns B_TRUE if the conn was inserted (and, for
+ * STREAMS conns, the write queue disabled), B_FALSE otherwise.
+ */
+boolean_t
+ixa_check_drain_insert(conn_t *connp, ip_xmit_attr_t *ixa)
+{
+ uintptr_t cookie = ixa->ixa_cookie;
+ ill_dld_direct_t *idd;
+ idl_tx_list_t *idl_txl;
+ ill_t *ill = ixa->ixa_nce->nce_ill;
+ boolean_t inserted = B_FALSE;
+
+ idd = &(ill)->ill_dld_capab->idc_direct;
+ /* Cookie zero hashes to the default (non-direct) tx list */
+ idl_txl = &ixa->ixa_ipst->ips_idl_tx_list[IDLHASHINDEX(cookie)];
+ if (cookie == 0) {
+ /*
+ * ip_xmit failed the canputnext check
+ */
+ connp->conn_did_putbq = 1;
+ ASSERT(cookie == 0);
+ conn_drain_insert(connp, idl_txl);
+ if (!IPCL_IS_NONSTR(connp))
+ noenable(connp->conn_wq);
+ return (B_TRUE);
+ }
+ ASSERT(ILL_DIRECT_CAPABLE(ill));
+ mutex_enter(&idl_txl->txl_lock);
+ if (connp->conn_direct_blocked ||
+ (idd->idd_tx_fctl_df(idd->idd_tx_fctl_dh, cookie) == 0)) {
+ /* Already blocked, or the mac layer is no longer blocked */
+ DTRACE_PROBE1(ill__tx__not__blocked, boolean,
+ connp->conn_direct_blocked);
+ } else if (idl_txl->txl_cookie != NULL &&
+ idl_txl->txl_cookie != ixa->ixa_cookie) {
+ /* A different blocked tx ring hashed to the same list */
+ DTRACE_PROBE2(ill__send__tx__collision, uintptr_t, cookie,
+ uintptr_t, idl_txl->txl_cookie);
+ /* bump kstat for cookie collision */
+ } else {
+ connp->conn_direct_blocked = B_TRUE;
+ idl_txl->txl_cookie = cookie;
+ conn_drain_insert(connp, idl_txl);
+ if (!IPCL_IS_NONSTR(connp))
+ noenable(connp->conn_wq);
+ inserted = B_TRUE;
+ }
+ mutex_exit(&idl_txl->txl_lock);
+ return (inserted);
+}
diff --git a/usr/src/uts/common/inet/ip/ip_dce.c b/usr/src/uts/common/inet/ip/ip_dce.c
new file mode 100644
index 0000000000..839c5ae0d0
--- /dev/null
+++ b/usr/src/uts/common/inet/ip/ip_dce.c
@@ -0,0 +1,873 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/types.h>
+#include <sys/stream.h>
+#include <sys/strsun.h>
+#include <sys/zone.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/cmn_err.h>
+#include <sys/debug.h>
+#include <sys/atomic.h>
+#define _SUN_TPI_VERSION 2
+#include <sys/tihdr.h>
+
+#include <inet/common.h>
+#include <inet/mi.h>
+#include <inet/mib2.h>
+#include <inet/snmpcom.h>
+
+#include <netinet/ip6.h>
+#include <netinet/icmp6.h>
+
+#include <inet/ip.h>
+#include <inet/ip_impl.h>
+#include <inet/ip6.h>
+#include <inet/ip6_asp.h>
+#include <inet/ip_multi.h>
+#include <inet/ip_if.h>
+#include <inet/ip_ire.h>
+#include <inet/ip_ftable.h>
+#include <inet/ip_rts.h>
+#include <inet/ip_ndp.h>
+#include <inet/ipclassifier.h>
+#include <inet/ip_listutils.h>
+
+#include <sys/sunddi.h>
+
+/*
+ * Routines for handling destination cache entries.
+ * There is always one DCEF_DEFAULT for each ip_stack_t created at init time.
+ * That entry holds both the IP ident value and the dce generation number.
+ *
+ * Any time a DCE is changed significantly (different path MTU, but NOT
+ * different ULP info!), the dce_generation number is increased.
+ * Also, when a new DCE is created, the dce_generation number in the default
+ * DCE is bumped. That allows the dce_t information to be cached efficiently
+ * as long as the entity caching the dce_t also caches the dce_generation,
+ * and compares the cached generation to detect any changes.
+ * Furthermore, when a DCE is deleted, if there are any outstanding references
+ * to the DCE it will be marked as condemned. The condemned mark is
+ * a designated generation number which is never otherwise used, hence
+ * the single comparison with the generation number captures that as well.
+ *
+ * An example of code which caches is as follows:
+ *
+ * if (mystruct->my_dce_generation != mystruct->my_dce->dce_generation) {
+ * The DCE has changed
+ * mystruct->my_dce = dce_lookup_pkt(mp, ixa,
+ * &mystruct->my_dce_generation);
+ * Not needed in practice, since we have the default DCE:
+ * if (DCE_IS_CONDEMNED(mystruct->my_dce))
+ * return failure;
+ * }
+ *
+ * Note that for IPv6 link-local addresses we record the ifindex since the
+ * link-locals are not globally unique.
+ */
+
+/*
+ * Hash bucket structure for DCEs
+ */
+typedef struct dcb_s {
+ krwlock_t dcb_lock; /* protects dcb_dce chain and additions */
+ uint32_t dcb_cnt; /* number of entries on this chain */
+ dce_t *dcb_dce; /* head of doubly-linked DCE chain */
+} dcb_t;
+
+static void dce_delete_locked(dcb_t *, dce_t *);
+static void dce_make_condemned(dce_t *);
+
+static kmem_cache_t *dce_cache;
+
+
+/* Operates on a uint64_t; folds all 16-bit chunks of a pointer together */
+#define RANDOM_HASH(p) ((p) ^ ((p)>>16) ^ ((p)>>32) ^ ((p)>>48))
+
+/*
+ * Reclaim a fraction of dce's in the dcb.
+ * For now we have a higher probability to delete DCEs without DCE_PMTU.
+ * (fraction_pmtu = fraction*4 means roughly 1/(4*fraction) of the PMTU
+ * entries are deleted versus 1/fraction of the others.)
+ */
+static void
+dcb_reclaim(dcb_t *dcb, ip_stack_t *ipst, uint_t fraction)
+{
+ uint_t fraction_pmtu = fraction*4;
+ uint_t hash;
+ dce_t *dce, *nextdce;
+
+ rw_enter(&dcb->dcb_lock, RW_WRITER);
+ for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) {
+ nextdce = dce->dce_next;
+ /* Clear DCEF_PMTU if the pmtu is too old */
+ mutex_enter(&dce->dce_lock);
+ if ((dce->dce_flags & DCEF_PMTU) &&
+ TICK_TO_SEC(lbolt64) - dce->dce_last_change_time >
+ ipst->ips_ip_pathmtu_interval) {
+ dce->dce_flags &= ~DCEF_PMTU;
+ mutex_exit(&dce->dce_lock);
+ dce_increment_generation(dce);
+ } else {
+ mutex_exit(&dce->dce_lock);
+ }
+ /* Hash the pointer value to pick victims pseudo-randomly */
+ hash = RANDOM_HASH((uint64_t)(uintptr_t)dce);
+ if (dce->dce_flags & DCEF_PMTU) {
+ if (hash % fraction_pmtu != 0)
+ continue;
+ } else {
+ if (hash % fraction != 0)
+ continue;
+ }
+
+ IP_STAT(ipst, ip_dce_reclaim_deleted);
+ dce_delete_locked(dcb, dce);
+ /*
+ * NOTE(review): dce_delete_locked's comment says the refrele
+ * should not happen under dcb_lock, yet it is done here while
+ * holding the lock as writer -- confirm this is safe (it
+ * appears to be, since dce_inactive takes no dcb_lock).
+ */
+ dce_refrele(dce);
+ }
+ rw_exit(&dcb->dcb_lock);
+}
+
+/*
+ * Per-stack portion of the kmem_cache reclaim callback: prune a
+ * fraction of both the IPv4 and IPv6 DCE hash tables, then get all
+ * conns to drop any stale cached references.
+ */
+static void
+ip_dce_reclaim_stack(ip_stack_t *ipst)
+{
+ int i;
+
+ IP_STAT(ipst, ip_dce_reclaim_calls);
+ for (i = 0; i < ipst->ips_dce_hashsize; i++) {
+ dcb_reclaim(&ipst->ips_dce_hash_v4[i], ipst,
+ ipst->ips_ip_dce_reclaim_fraction);
+
+ dcb_reclaim(&ipst->ips_dce_hash_v6[i], ipst,
+ ipst->ips_ip_dce_reclaim_fraction);
+ }
+
+ /*
+ * Walk all CONNs that can have a reference on an ire, nce or dce.
+ * Get them to update any stale references to drop any refholds they
+ * have.
+ */
+ ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
+}
+
+/*
+ * Called by the memory allocator subsystem directly, when the system
+ * is running low on memory. Iterates over every netstack and reclaims
+ * DCEs in each.
+ */
+/* ARGSUSED */
+void
+ip_dce_reclaim(void *args)
+{
+ netstack_handle_t nh;
+ netstack_t *ns;
+
+ netstack_next_init(&nh);
+ while ((ns = netstack_next(&nh)) != NULL) {
+ ip_dce_reclaim_stack(ns->netstack_ip);
+ netstack_rele(ns);
+ }
+ netstack_next_fini(&nh);
+}
+
+/* Global (all-stacks) init: create the dce kmem cache with a reclaim hook */
+void
+dce_g_init(void)
+{
+ dce_cache = kmem_cache_create("dce_cache",
+ sizeof (dce_t), 0, NULL, NULL, ip_dce_reclaim, NULL, NULL, 0);
+}
+
+/* Global teardown counterpart of dce_g_init */
+void
+dce_g_destroy(void)
+{
+ kmem_cache_destroy(dce_cache);
+}
+
+
+/*
+ * Allocate a default DCE and a hash table for per-IP address DCEs
+ */
+void
+dce_stack_init(ip_stack_t *ipst)
+{
+ int i;
+
+ ipst->ips_dce_default = kmem_cache_alloc(dce_cache, KM_SLEEP);
+ bzero(ipst->ips_dce_default, sizeof (dce_t));
+ ipst->ips_dce_default->dce_flags = DCEF_DEFAULT;
+ ipst->ips_dce_default->dce_generation = DCE_GENERATION_INITIAL;
+ ipst->ips_dce_default->dce_last_change_time = TICK_TO_SEC(lbolt64);
+ ipst->ips_dce_default->dce_refcnt = 1; /* Should never go away */
+ ipst->ips_dce_default->dce_ipst = ipst;
+
+ /* This must be a power of two since we are using IRE_ADDR_HASH macro */
+ ipst->ips_dce_hashsize = 256;
+ ipst->ips_dce_hash_v4 = kmem_zalloc(ipst->ips_dce_hashsize *
+ sizeof (dcb_t), KM_SLEEP);
+ ipst->ips_dce_hash_v6 = kmem_zalloc(ipst->ips_dce_hashsize *
+ sizeof (dcb_t), KM_SLEEP);
+ for (i = 0; i < ipst->ips_dce_hashsize; i++) {
+ rw_init(&ipst->ips_dce_hash_v4[i].dcb_lock, NULL, RW_DEFAULT,
+ NULL);
+ rw_init(&ipst->ips_dce_hash_v6[i].dcb_lock, NULL, RW_DEFAULT,
+ NULL);
+ }
+}
+
+/*
+ * Tear down the per-stack DCE state created by dce_stack_init().
+ * By this point all non-default DCEs must already be gone (the default
+ * one must be back to its initial single reference).
+ */
+void
+dce_stack_destroy(ip_stack_t *ipst)
+{
+ int i;
+ for (i = 0; i < ipst->ips_dce_hashsize; i++) {
+ rw_destroy(&ipst->ips_dce_hash_v4[i].dcb_lock);
+ rw_destroy(&ipst->ips_dce_hash_v6[i].dcb_lock);
+ }
+ kmem_free(ipst->ips_dce_hash_v4,
+ ipst->ips_dce_hashsize * sizeof (dcb_t));
+ ipst->ips_dce_hash_v4 = NULL;
+ kmem_free(ipst->ips_dce_hash_v6,
+ ipst->ips_dce_hashsize * sizeof (dcb_t));
+ ipst->ips_dce_hash_v6 = NULL;
+ ipst->ips_dce_hashsize = 0;
+
+ ASSERT(ipst->ips_dce_default->dce_refcnt == 1);
+ kmem_cache_free(dce_cache, ipst->ips_dce_default);
+ ipst->ips_dce_default = NULL;
+}
+
+/* When any DCE is good enough; returns the default DCE with a refhold */
+dce_t *
+dce_get_default(ip_stack_t *ipst)
+{
+ dce_t *dce;
+
+ dce = ipst->ips_dce_default;
+ dce_refhold(dce);
+ return (dce);
+}
+
+/*
+ * Generic for IPv4 and IPv6.
+ *
+ * Used by callers that need to cache e.g., the datapath
+ * Returns the generation number in the last argument.
+ * The lookup key is the packet's final destination (past any source
+ * route / routing header), not necessarily the header destination.
+ */
+dce_t *
+dce_lookup_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp)
+{
+ if (ixa->ixa_flags & IXAF_IS_IPV4) {
+ /*
+ * If we have a source route we need to look for the final
+ * destination in the source route option.
+ */
+ ipaddr_t final_dst;
+ ipha_t *ipha = (ipha_t *)mp->b_rptr;
+
+ final_dst = ip_get_dst(ipha);
+ return (dce_lookup_v4(final_dst, ixa->ixa_ipst, generationp));
+ } else {
+ uint_t ifindex;
+ /*
+ * If we have a routing header we need to look for the final
+ * destination in the routing extension header.
+ */
+ in6_addr_t final_dst;
+ ip6_t *ip6h = (ip6_t *)mp->b_rptr;
+
+ final_dst = ip_get_dst_v6(ip6h, mp, NULL);
+ ifindex = 0;
+ /* Link-locals are not globally unique; qualify by ifindex */
+ if (IN6_IS_ADDR_LINKSCOPE(&final_dst) && ixa->ixa_nce != NULL) {
+ ifindex = ixa->ixa_nce->nce_common->ncec_ill->
+ ill_phyint->phyint_ifindex;
+ }
+ return (dce_lookup_v6(&final_dst, ifindex, ixa->ixa_ipst,
+ generationp));
+ }
+}
+
+/*
+ * Used by callers that need to cache e.g., the datapath
+ * Returns the generation number in the last argument.
+ * Always returns a refheld DCE: the per-address one if present and not
+ * condemned, otherwise the stack's default DCE.
+ */
+dce_t *
+dce_lookup_v4(ipaddr_t dst, ip_stack_t *ipst, uint_t *generationp)
+{
+ uint_t hash;
+ dcb_t *dcb;
+ dce_t *dce;
+
+ /* Set *generationp before dropping the lock(s) that allow additions */
+ if (generationp != NULL)
+ *generationp = ipst->ips_dce_default->dce_generation;
+
+ hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize);
+ dcb = &ipst->ips_dce_hash_v4[hash];
+ rw_enter(&dcb->dcb_lock, RW_READER);
+ for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
+ if (dce->dce_v4addr == dst) {
+ mutex_enter(&dce->dce_lock);
+ if (!DCE_IS_CONDEMNED(dce)) {
+ dce_refhold(dce);
+ if (generationp != NULL)
+ *generationp = dce->dce_generation;
+ mutex_exit(&dce->dce_lock);
+ rw_exit(&dcb->dcb_lock);
+ return (dce);
+ }
+ mutex_exit(&dce->dce_lock);
+ }
+ }
+ rw_exit(&dcb->dcb_lock);
+ /* Not found */
+ dce = ipst->ips_dce_default;
+ dce_refhold(dce);
+ return (dce);
+}
+
+/*
+ * Used by callers that need to cache e.g., the datapath
+ * Returns the generation number in the last argument.
+ * ifindex should only be set for link-locals
+ * Always returns a refheld DCE (the default one when no match exists).
+ */
+dce_t *
+dce_lookup_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst,
+ uint_t *generationp)
+{
+ uint_t hash;
+ dcb_t *dcb;
+ dce_t *dce;
+
+ /* Set *generationp before dropping the lock(s) that allow additions */
+ if (generationp != NULL)
+ *generationp = ipst->ips_dce_default->dce_generation;
+
+ hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize);
+ dcb = &ipst->ips_dce_hash_v6[hash];
+ rw_enter(&dcb->dcb_lock, RW_READER);
+ for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
+ if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) &&
+ dce->dce_ifindex == ifindex) {
+ mutex_enter(&dce->dce_lock);
+ if (!DCE_IS_CONDEMNED(dce)) {
+ dce_refhold(dce);
+ if (generationp != NULL)
+ *generationp = dce->dce_generation;
+ mutex_exit(&dce->dce_lock);
+ rw_exit(&dcb->dcb_lock);
+ return (dce);
+ }
+ mutex_exit(&dce->dce_lock);
+ }
+ }
+ rw_exit(&dcb->dcb_lock);
+ /* Not found */
+ dce = ipst->ips_dce_default;
+ dce_refhold(dce);
+ return (dce);
+}
+
+/*
+ * Atomically looks for a non-default DCE, and if not found tries to create one.
+ * If there is no memory it returns NULL.
+ * When an entry is created we increase the generation number on
+ * the default DCE so that conn_ip_output will detect there is a new DCE.
+ * The returned DCE carries a reference for the caller.
+ */
+dce_t *
+dce_lookup_and_add_v4(ipaddr_t dst, ip_stack_t *ipst)
+{
+ uint_t hash;
+ dcb_t *dcb;
+ dce_t *dce;
+
+ hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize);
+ dcb = &ipst->ips_dce_hash_v4[hash];
+ /* Writer lock makes lookup-then-insert atomic */
+ rw_enter(&dcb->dcb_lock, RW_WRITER);
+ for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
+ if (dce->dce_v4addr == dst) {
+ mutex_enter(&dce->dce_lock);
+ if (!DCE_IS_CONDEMNED(dce)) {
+ dce_refhold(dce);
+ mutex_exit(&dce->dce_lock);
+ rw_exit(&dcb->dcb_lock);
+ return (dce);
+ }
+ mutex_exit(&dce->dce_lock);
+ }
+ }
+ dce = kmem_cache_alloc(dce_cache, KM_NOSLEEP);
+ if (dce == NULL) {
+ rw_exit(&dcb->dcb_lock);
+ return (NULL);
+ }
+ bzero(dce, sizeof (dce_t));
+ dce->dce_ipst = ipst; /* No netstack_hold */
+ dce->dce_v4addr = dst;
+ dce->dce_generation = DCE_GENERATION_INITIAL;
+ dce->dce_ipversion = IPV4_VERSION;
+ dce->dce_last_change_time = TICK_TO_SEC(lbolt64);
+ dce_refhold(dce); /* For the hash list */
+
+ /* Link into list */
+ if (dcb->dcb_dce != NULL)
+ dcb->dcb_dce->dce_ptpn = &dce->dce_next;
+ dce->dce_next = dcb->dcb_dce;
+ dce->dce_ptpn = &dcb->dcb_dce;
+ dcb->dcb_dce = dce;
+ dce->dce_bucket = dcb;
+ /*
+ * Account for the new entry; dce_delete_locked decrements dcb_cnt
+ * unconditionally, so omitting this (as the v6 variant does not)
+ * would underflow the bucket count on deletion.
+ */
+ atomic_add_32(&dcb->dcb_cnt, 1);
+ dce_refhold(dce); /* For the caller */
+ rw_exit(&dcb->dcb_lock);
+
+ /* Initialize dce_ident to be different than for the last packet */
+ dce->dce_ident = ipst->ips_dce_default->dce_ident + 1;
+
+ dce_increment_generation(ipst->ips_dce_default);
+ return (dce);
+}
+
+/*
+ * Atomically looks for a non-default DCE, and if not found tries to create one.
+ * If there is no memory it returns NULL.
+ * When an entry is created we increase the generation number on
+ * the default DCE so that conn_ip_output will detect there is a new DCE.
+ * ifindex should only be used with link-local addresses.
+ * The returned DCE carries a reference for the caller.
+ */
+dce_t *
+dce_lookup_and_add_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst)
+{
+ uint_t hash;
+ dcb_t *dcb;
+ dce_t *dce;
+
+ /* We should not create entries for link-locals w/o an ifindex */
+ ASSERT(!(IN6_IS_ADDR_LINKSCOPE(dst)) || ifindex != 0);
+
+ hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize);
+ dcb = &ipst->ips_dce_hash_v6[hash];
+ /* Writer lock makes lookup-then-insert atomic */
+ rw_enter(&dcb->dcb_lock, RW_WRITER);
+ for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
+ if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) &&
+ dce->dce_ifindex == ifindex) {
+ mutex_enter(&dce->dce_lock);
+ if (!DCE_IS_CONDEMNED(dce)) {
+ dce_refhold(dce);
+ mutex_exit(&dce->dce_lock);
+ rw_exit(&dcb->dcb_lock);
+ return (dce);
+ }
+ mutex_exit(&dce->dce_lock);
+ }
+ }
+
+ dce = kmem_cache_alloc(dce_cache, KM_NOSLEEP);
+ if (dce == NULL) {
+ rw_exit(&dcb->dcb_lock);
+ return (NULL);
+ }
+ bzero(dce, sizeof (dce_t));
+ dce->dce_ipst = ipst; /* No netstack_hold */
+ dce->dce_v6addr = *dst;
+ dce->dce_ifindex = ifindex;
+ dce->dce_generation = DCE_GENERATION_INITIAL;
+ dce->dce_ipversion = IPV6_VERSION;
+ dce->dce_last_change_time = TICK_TO_SEC(lbolt64);
+ dce_refhold(dce); /* For the hash list */
+
+ /* Link into list */
+ if (dcb->dcb_dce != NULL)
+ dcb->dcb_dce->dce_ptpn = &dce->dce_next;
+ dce->dce_next = dcb->dcb_dce;
+ dce->dce_ptpn = &dcb->dcb_dce;
+ dcb->dcb_dce = dce;
+ dce->dce_bucket = dcb;
+ atomic_add_32(&dcb->dcb_cnt, 1);
+ dce_refhold(dce); /* For the caller */
+ rw_exit(&dcb->dcb_lock);
+
+ /* Initialize dce_ident to be different than for the last packet */
+ dce->dce_ident = ipst->ips_dce_default->dce_ident + 1;
+ dce_increment_generation(ipst->ips_dce_default);
+ return (dce);
+}
+
+/*
+ * Set/update uinfo. Creates a per-destination dce if none exists.
+ *
+ * Note that we do not bump the generation number here.
+ * New connections will find the new uinfo.
+ *
+ * The only use of this (tcp, sctp using iulp_t) is to set rtt+rtt_sd.
+ */
+static void
+dce_setuinfo(dce_t *dce, iulp_t *uinfo)
+{
+ /*
+ * Update the round trip time estimate and/or the max frag size
+ * and/or the slow start threshold.
+ *
+ * We serialize multiple advises using dce_lock.
+ */
+ mutex_enter(&dce->dce_lock);
+ /* Guard against setting to zero */
+ if (uinfo->iulp_rtt != 0) {
+ /*
+ * Average with the old cached value if there is one;
+ * otherwise initialize conservatively to 1.5 * new value.
+ */
+ if (dce->dce_uinfo.iulp_rtt != 0) {
+ dce->dce_uinfo.iulp_rtt = (dce->dce_uinfo.iulp_rtt +
+ uinfo->iulp_rtt) >> 1;
+ } else {
+ dce->dce_uinfo.iulp_rtt = uinfo->iulp_rtt +
+ (uinfo->iulp_rtt >> 1);
+ }
+ if (dce->dce_uinfo.iulp_rtt_sd != 0) {
+ dce->dce_uinfo.iulp_rtt_sd =
+ (dce->dce_uinfo.iulp_rtt_sd +
+ uinfo->iulp_rtt_sd) >> 1;
+ } else {
+ dce->dce_uinfo.iulp_rtt_sd = uinfo->iulp_rtt_sd +
+ (uinfo->iulp_rtt_sd >> 1);
+ }
+ }
+ if (uinfo->iulp_mtu != 0) {
+ /* Never raise an already-learned path MTU */
+ if (dce->dce_flags & DCEF_PMTU) {
+ dce->dce_pmtu = MIN(uinfo->iulp_mtu, dce->dce_pmtu);
+ } else {
+ dce->dce_pmtu = MIN(uinfo->iulp_mtu, IP_MAXPACKET);
+ dce->dce_flags |= DCEF_PMTU;
+ }
+ dce->dce_last_change_time = TICK_TO_SEC(lbolt64);
+ }
+ if (uinfo->iulp_ssthresh != 0) {
+ if (dce->dce_uinfo.iulp_ssthresh != 0)
+ dce->dce_uinfo.iulp_ssthresh =
+ (uinfo->iulp_ssthresh +
+ dce->dce_uinfo.iulp_ssthresh) >> 1;
+ else
+ dce->dce_uinfo.iulp_ssthresh = uinfo->iulp_ssthresh;
+ }
+ /* We have uinfo for sure */
+ dce->dce_flags |= DCEF_UINFO;
+ mutex_exit(&dce->dce_lock);
+}
+
+
+/* Record uinfo for an IPv4 destination; returns ENOMEM if no DCE exists */
+int
+dce_update_uinfo_v4(ipaddr_t dst, iulp_t *uinfo, ip_stack_t *ipst)
+{
+ dce_t *dce;
+
+ dce = dce_lookup_and_add_v4(dst, ipst);
+ if (dce == NULL)
+ return (ENOMEM);
+
+ dce_setuinfo(dce, uinfo);
+ dce_refrele(dce);
+ return (0);
+}
+
+/* IPv6 counterpart; ifindex should only be set for link-locals */
+int
+dce_update_uinfo_v6(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo,
+ ip_stack_t *ipst)
+{
+ dce_t *dce;
+
+ dce = dce_lookup_and_add_v6(dst, ifindex, ipst);
+ if (dce == NULL)
+ return (ENOMEM);
+
+ dce_setuinfo(dce, uinfo);
+ dce_refrele(dce);
+ return (0);
+}
+
+/*
+ * Common routine for IPv4 and IPv6: dispatch on whether the address is
+ * a v4-mapped IPv6 address.
+ */
+int
+dce_update_uinfo(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo,
+ ip_stack_t *ipst)
+{
+ ipaddr_t dst4;
+
+ if (IN6_IS_ADDR_V4MAPPED_ANY(dst)) {
+ IN6_V4MAPPED_TO_IPADDR(dst, dst4);
+ return (dce_update_uinfo_v4(dst4, uinfo, ipst));
+ } else {
+ return (dce_update_uinfo_v6(dst, ifindex, uinfo, ipst));
+ }
+}
+
+/*
+ * Mark a DCE as condemned (deleted but still referenced). The condemned
+ * state is encoded as a designated generation number so cached-generation
+ * comparisons detect it without extra checks.
+ */
+static void
+dce_make_condemned(dce_t *dce)
+{
+ ip_stack_t *ipst = dce->dce_ipst;
+
+ mutex_enter(&dce->dce_lock);
+ ASSERT(!DCE_IS_CONDEMNED(dce));
+ dce->dce_generation = DCE_GENERATION_CONDEMNED;
+ mutex_exit(&dce->dce_lock);
+ /* Count how many condemned dces for kmem_cache callback */
+ atomic_add_32(&ipst->ips_num_dce_condemned, 1);
+}
+
+/*
+ * Increment the generation avoiding the special condemned value
+ * (and, by assertion, the VERIFY sentinel). A condemned DCE is left
+ * untouched so it stays condemned.
+ */
+void
+dce_increment_generation(dce_t *dce)
+{
+ uint_t generation;
+
+ mutex_enter(&dce->dce_lock);
+ if (!DCE_IS_CONDEMNED(dce)) {
+ generation = dce->dce_generation + 1;
+ if (generation == DCE_GENERATION_CONDEMNED)
+ generation = DCE_GENERATION_INITIAL;
+ ASSERT(generation != DCE_GENERATION_VERIFY);
+ dce->dce_generation = generation;
+ }
+ mutex_exit(&dce->dce_lock);
+}
+
+/*
+ * Increment the generation number on all dces that have a path MTU and
+ * the default DCE. Used when ill_mtu changes.
+ * (NOTE(review): the loop bumps every non-condemned entry, not only
+ * those with DCEF_PMTU, despite the comment above -- confirm intent.)
+ */
+void
+dce_increment_all_generations(boolean_t isv6, ip_stack_t *ipst)
+{
+ int i;
+ dcb_t *dcb;
+ dce_t *dce;
+
+ for (i = 0; i < ipst->ips_dce_hashsize; i++) {
+ if (isv6)
+ dcb = &ipst->ips_dce_hash_v6[i];
+ else
+ dcb = &ipst->ips_dce_hash_v4[i];
+ rw_enter(&dcb->dcb_lock, RW_WRITER);
+ for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
+ if (DCE_IS_CONDEMNED(dce))
+ continue;
+ dce_increment_generation(dce);
+ }
+ rw_exit(&dcb->dcb_lock);
+ }
+ dce_increment_generation(ipst->ips_dce_default);
+}
+
+/*
+ * Unlink a DCE from its bucket (dcb_lock held as writer) and mark it
+ * condemned. Drops the hash list's reference responsibility onto the
+ * caller: the caller needs to do a dce_refrele since we can't do the
+ * dce_refrele under dcb_lock.
+ */
+static void
+dce_delete_locked(dcb_t *dcb, dce_t *dce)
+{
+ dce->dce_bucket = NULL;
+ *dce->dce_ptpn = dce->dce_next;
+ if (dce->dce_next != NULL)
+ dce->dce_next->dce_ptpn = dce->dce_ptpn;
+ dce->dce_ptpn = NULL;
+ dce->dce_next = NULL;
+ atomic_add_32(&dcb->dcb_cnt, -1);
+ dce_make_condemned(dce);
+}
+
+/*
+ * Final teardown when the last reference is dropped. The DCE must
+ * already be unlinked from its bucket (never the default DCE).
+ */
+static void
+dce_inactive(dce_t *dce)
+{
+ ip_stack_t *ipst = dce->dce_ipst;
+
+ ASSERT(!(dce->dce_flags & DCEF_DEFAULT));
+ ASSERT(dce->dce_ptpn == NULL);
+ ASSERT(dce->dce_bucket == NULL);
+
+ /* Count how many condemned dces for kmem_cache callback */
+ if (DCE_IS_CONDEMNED(dce))
+ atomic_add_32(&ipst->ips_num_dce_condemned, -1);
+
+ kmem_cache_free(dce_cache, dce);
+}
+
+/* Drop a reference; frees the DCE when the count reaches zero */
+void
+dce_refrele(dce_t *dce)
+{
+ ASSERT(dce->dce_refcnt != 0);
+ if (atomic_add_32_nv(&dce->dce_refcnt, -1) == 0)
+ dce_inactive(dce);
+}
+
+/* Take a reference on a DCE the caller already holds safely */
+void
+dce_refhold(dce_t *dce)
+{
+ atomic_add_32(&dce->dce_refcnt, 1);
+ ASSERT(dce->dce_refcnt != 0);
+}
+
+/* No tracing support yet hence the same as the above functions */
+void
+dce_refrele_notr(dce_t *dce)
+{
+ ASSERT(dce->dce_refcnt != 0);
+ if (atomic_add_32_nv(&dce->dce_refcnt, -1) == 0)
+ dce_inactive(dce);
+}
+
+void
+dce_refhold_notr(dce_t *dce)
+{
+ atomic_add_32(&dce->dce_refcnt, 1);
+ ASSERT(dce->dce_refcnt != 0);
+}
+
+/*
+ * Report both the IPv4 and IPv6 DCEs (EXPER_IP_DCE).
+ * Replies with the IPv4 table on mpctl, then reuses a copy (mp2ctl)
+ * for the IPv6 table; returns the second copy for the caller to
+ * continue the snmp walk (NULL if copymsg failed).
+ */
+mblk_t *
+ip_snmp_get_mib2_ip_dce(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
+{
+ struct opthdr *optp;
+ mblk_t *mp2ctl;
+ dest_cache_entry_t dest_cache;
+ mblk_t *mp_tail = NULL;
+ dce_t *dce;
+ dcb_t *dcb;
+ int i;
+ uint64_t current_time;
+
+ current_time = TICK_TO_SEC(lbolt64);
+
+ /*
+ * make a copy of the original message
+ */
+ mp2ctl = copymsg(mpctl);
+
+ /* First we do IPv4 entries */
+ optp = (struct opthdr *)&mpctl->b_rptr[
+ sizeof (struct T_optmgmt_ack)];
+ optp->level = MIB2_IP;
+ optp->name = EXPER_IP_DCE;
+
+ for (i = 0; i < ipst->ips_dce_hashsize; i++) {
+ dcb = &ipst->ips_dce_hash_v4[i];
+ rw_enter(&dcb->dcb_lock, RW_READER);
+ for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
+ dest_cache.DestIpv4Address = dce->dce_v4addr;
+ dest_cache.DestFlags = dce->dce_flags;
+ if (dce->dce_flags & DCEF_PMTU)
+ dest_cache.DestPmtu = dce->dce_pmtu;
+ else
+ dest_cache.DestPmtu = 0;
+ dest_cache.DestIdent = dce->dce_ident;
+ dest_cache.DestIfindex = 0;
+ dest_cache.DestAge = current_time -
+ dce->dce_last_change_time;
+ if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
+ (char *)&dest_cache, (int)sizeof (dest_cache))) {
+ ip1dbg(("ip_snmp_get_mib2_ip_dce: "
+ "failed to allocate %u bytes\n",
+ (uint_t)sizeof (dest_cache)));
+ }
+ }
+ rw_exit(&dcb->dcb_lock);
+ }
+ optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
+ ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
+ (int)optp->level, (int)optp->name, (int)optp->len));
+ qreply(q, mpctl);
+
+ if (mp2ctl == NULL) {
+ /* Copymsg failed above */
+ return (NULL);
+ }
+
+ /* Now for IPv6 */
+ mpctl = mp2ctl;
+ mp_tail = NULL;
+ mp2ctl = copymsg(mpctl);
+ optp = (struct opthdr *)&mpctl->b_rptr[
+ sizeof (struct T_optmgmt_ack)];
+ optp->level = MIB2_IP6;
+ optp->name = EXPER_IP_DCE;
+
+ for (i = 0; i < ipst->ips_dce_hashsize; i++) {
+ dcb = &ipst->ips_dce_hash_v6[i];
+ rw_enter(&dcb->dcb_lock, RW_READER);
+ for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
+ dest_cache.DestIpv6Address = dce->dce_v6addr;
+ dest_cache.DestFlags = dce->dce_flags;
+ if (dce->dce_flags & DCEF_PMTU)
+ dest_cache.DestPmtu = dce->dce_pmtu;
+ else
+ dest_cache.DestPmtu = 0;
+ dest_cache.DestIdent = dce->dce_ident;
+ /* ifindex is only meaningful for link-locals */
+ if (IN6_IS_ADDR_LINKSCOPE(&dce->dce_v6addr))
+ dest_cache.DestIfindex = dce->dce_ifindex;
+ else
+ dest_cache.DestIfindex = 0;
+ dest_cache.DestAge = current_time -
+ dce->dce_last_change_time;
+ if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
+ (char *)&dest_cache, (int)sizeof (dest_cache))) {
+ ip1dbg(("ip_snmp_get_mib2_ip_dce: "
+ "failed to allocate %u bytes\n",
+ (uint_t)sizeof (dest_cache)));
+ }
+ }
+ rw_exit(&dcb->dcb_lock);
+ }
+ optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
+ ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
+ (int)optp->level, (int)optp->name, (int)optp->len));
+ qreply(q, mpctl);
+
+ return (mp2ctl);
+}
+
+/*
+ * Remove IPv6 DCEs which refer to an ifindex that is going away.
+ * This is not required for correctness, but it avoids netstat -d
+ * showing stale stuff that will never be used.
+ * (Only the v6 table is walked; v4 DCEs carry no ifindex.)
+ */
+void
+dce_cleanup(uint_t ifindex, ip_stack_t *ipst)
+{
+ uint_t i;
+ dcb_t *dcb;
+ dce_t *dce, *nextdce;
+
+ for (i = 0; i < ipst->ips_dce_hashsize; i++) {
+ dcb = &ipst->ips_dce_hash_v6[i];
+ rw_enter(&dcb->dcb_lock, RW_WRITER);
+
+ for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) {
+ nextdce = dce->dce_next;
+ if (dce->dce_ifindex == ifindex) {
+ dce_delete_locked(dcb, dce);
+ /* Drop the hash list's reference */
+ dce_refrele(dce);
+ }
+ }
+ rw_exit(&dcb->dcb_lock);
+ }
+}
diff --git a/usr/src/uts/common/inet/ip/ip_ftable.c b/usr/src/uts/common/inet/ip/ip_ftable.c
index 9e228c2925..771dd9f62f 100644
--- a/usr/src/uts/common/inet/ip/ip_ftable.c
+++ b/usr/src/uts/common/inet/ip/ip_ftable.c
@@ -42,7 +42,6 @@
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/strsubr.h>
-#include <sys/pattr.h>
#include <net/if.h>
#include <net/route.h>
#include <netinet/in.h>
@@ -50,6 +49,7 @@
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
+#include <inet/ipsec_impl.h>
#include <inet/common.h>
#include <inet/mi.h>
#include <inet/mib2.h>
@@ -65,7 +65,6 @@
#include <inet/nd.h>
#include <net/pfkeyv2.h>
-#include <inet/ipsec_info.h>
#include <inet/sadb.h>
#include <inet/tcp.h>
#include <inet/ipclassifier.h>
@@ -78,87 +77,34 @@
(((ire)->ire_type & IRE_DEFAULT) || \
(((ire)->ire_type & IRE_INTERFACE) && ((ire)->ire_addr == 0)))
-/*
- * structure for passing args between ire_ftable_lookup and ire_find_best_route
- */
-typedef struct ire_ftable_args_s {
- ipaddr_t ift_addr;
- ipaddr_t ift_mask;
- ipaddr_t ift_gateway;
- int ift_type;
- const ipif_t *ift_ipif;
- zoneid_t ift_zoneid;
- uint32_t ift_ihandle;
- const ts_label_t *ift_tsl;
- int ift_flags;
- ire_t *ift_best_ire;
-} ire_ftable_args_t;
-
static ire_t *route_to_dst(const struct sockaddr *, zoneid_t, ip_stack_t *);
-static ire_t *ire_round_robin(irb_t *, zoneid_t, ire_ftable_args_t *,
- ip_stack_t *);
-static void ire_del_host_redir(ire_t *, char *);
-static boolean_t ire_find_best_route(struct radix_node *, void *);
-static int ip_send_align_hcksum_flags(mblk_t *, ill_t *);
-static ire_t *ire_ftable_lookup_simple(ipaddr_t,
- ire_t **, zoneid_t, int, ip_stack_t *);
+static void ire_del_host_redir(ire_t *, char *);
+static boolean_t ire_find_best_route(struct radix_node *, void *);
/*
* Lookup a route in forwarding table. A specific lookup is indicated by
* passing the required parameters and indicating the match required in the
* flag field.
*
- * Looking for default route can be done in three ways
- * 1) pass mask as 0 and set MATCH_IRE_MASK in flags field
- * along with other matches.
- * 2) pass type as IRE_DEFAULT and set MATCH_IRE_TYPE in flags
- * field along with other matches.
- * 3) if the destination and mask are passed as zeros.
- *
- * A request to return a default route if no route
- * is found, can be specified by setting MATCH_IRE_DEFAULT
- * in flags.
- *
- * It does not support recursion more than one level. It
- * will do recursive lookup only when the lookup maps to
- * a prefix or default route and MATCH_IRE_RECURSIVE flag is passed.
- *
- * If the routing table is setup to allow more than one level
- * of recursion, the cleaning up cache table will not work resulting
- * in invalid routing.
- *
* Supports IP_BOUND_IF by following the ipif/ill when recursing.
- *
- * NOTE : When this function returns NULL, pire has already been released.
- * pire is valid only when this function successfully returns an
- * ire.
*/
ire_t *
-ire_ftable_lookup(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway,
- int type, const ipif_t *ipif, ire_t **pire, zoneid_t zoneid,
- uint32_t ihandle, const ts_label_t *tsl, int flags, ip_stack_t *ipst)
+ire_ftable_lookup_v4(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway,
+ int type, const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl,
+ int flags, uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp)
{
- ire_t *ire = NULL;
- ipaddr_t gw_addr;
+ ire_t *ire;
struct rt_sockaddr rdst, rmask;
struct rt_entry *rt;
ire_ftable_args_t margs;
- boolean_t found_incomplete = B_FALSE;
- ASSERT(ipif == NULL || !ipif->ipif_isv6);
+ ASSERT(ill == NULL || !ill->ill_isv6);
/*
- * When we return NULL from this function, we should make
- * sure that *pire is NULL so that the callers will not
- * wrongly REFRELE the pire.
- */
- if (pire != NULL)
- *pire = NULL;
- /*
- * ire_match_args() will dereference ipif MATCH_IRE_SRC or
- * MATCH_IRE_ILL is set.
+ * ire_match_args() will dereference ill if MATCH_IRE_ILL
+ * is set.
*/
- if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && (ipif == NULL))
+ if ((flags & MATCH_IRE_ILL) && (ill == NULL))
return (NULL);
(void) memset(&rdst, 0, sizeof (rdst));
@@ -176,9 +122,8 @@ ire_ftable_lookup(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway,
margs.ift_mask = mask;
margs.ift_gateway = gateway;
margs.ift_type = type;
- margs.ift_ipif = ipif;
+ margs.ift_ill = ill;
margs.ift_zoneid = zoneid;
- margs.ift_ihandle = ihandle;
margs.ift_tsl = tsl;
margs.ift_flags = flags;
@@ -191,232 +136,93 @@ ire_ftable_lookup(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway,
* each matching leaf in the radix tree. ire_match_args is
* invoked by the callback function ire_find_best_route()
* We hold the global tree lock in read mode when calling
- * rn_match_args.Before dropping the global tree lock, ensure
+ * rn_match_args. Before dropping the global tree lock, ensure
* that the radix node can't be deleted by incrementing ire_refcnt.
*/
RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst,
ipst->ips_ip_ftable, ire_find_best_route, &margs);
ire = margs.ift_best_ire;
- RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
-
if (rt == NULL) {
+ RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
return (NULL);
- } else {
- ASSERT(ire != NULL);
}
+ ASSERT(ire != NULL);
DTRACE_PROBE2(ire__found, ire_ftable_args_t *, &margs, ire_t *, ire);
- if (!IS_DEFAULT_ROUTE(ire))
- goto found_ire_held;
- /*
- * If default route is found, see if default matching criteria
- * are satisfied.
- */
- if (flags & MATCH_IRE_MASK) {
- /*
- * we were asked to match a 0 mask, and came back with
- * a default route. Ok to return it.
- */
- goto found_default_ire;
- }
- if ((flags & MATCH_IRE_TYPE) &&
- (type & (IRE_DEFAULT | IRE_INTERFACE))) {
- /*
- * we were asked to match a default ire type. Ok to return it.
- */
- goto found_default_ire;
- }
- if (flags & MATCH_IRE_DEFAULT) {
- goto found_default_ire;
- }
- /*
- * we found a default route, but default matching criteria
- * are not specified and we are not explicitly looking for
- * default.
- */
- IRE_REFRELE(ire);
- return (NULL);
-found_default_ire:
/*
* round-robin only if we have more than one route in the bucket.
+ * ips_ip_ecmp_behavior controls when we do ECMP
+ * 2: always
+ * 1: for IRE_DEFAULT and /0 IRE_INTERFACE
+ * 0: never
*/
- if ((ire->ire_bucket->irb_ire_cnt > 1) &&
- IS_DEFAULT_ROUTE(ire) &&
- ((flags & (MATCH_IRE_DEFAULT | MATCH_IRE_MASK)) ==
- MATCH_IRE_DEFAULT)) {
- ire_t *next_ire;
-
- next_ire = ire_round_robin(ire->ire_bucket, zoneid, &margs,
- ipst);
- IRE_REFRELE(ire);
- if (next_ire != NULL) {
+ if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) {
+ if (ipst->ips_ip_ecmp_behavior == 2 ||
+ (ipst->ips_ip_ecmp_behavior == 1 &&
+ IS_DEFAULT_ROUTE(ire))) {
+ ire_t *next_ire;
+
+ margs.ift_best_ire = NULL;
+ next_ire = ire_round_robin(ire->ire_bucket, &margs,
+ xmit_hint, ire, ipst);
+ if (next_ire == NULL) {
+ /* keep ire if next_ire is null */
+ goto done;
+ }
+ ire_refrele(ire);
ire = next_ire;
- } else {
- /* no route */
- return (NULL);
}
}
-found_ire_held:
- if ((flags & MATCH_IRE_RJ_BHOLE) &&
- (ire->ire_flags & (RTF_BLACKHOLE | RTF_REJECT))) {
- return (ire);
- }
- /*
- * At this point, IRE that was found must be an IRE_FORWARDTABLE
- * type. If this is a recursive lookup and an IRE_INTERFACE type was
- * found, return that. If it was some other IRE_FORWARDTABLE type of
- * IRE (one of the prefix types), then it is necessary to fill in the
- * parent IRE pointed to by pire, and then lookup the gateway address of
- * the parent. For backwards compatiblity, if this lookup returns an
- * IRE other than a IRE_CACHETABLE or IRE_INTERFACE, then one more level
- * of lookup is done.
- */
- if (flags & MATCH_IRE_RECURSIVE) {
- ipif_t *gw_ipif;
- int match_flags = MATCH_IRE_DSTONLY;
- ire_t *save_ire;
- if (ire->ire_type & IRE_INTERFACE)
- return (ire);
- if (pire != NULL)
- *pire = ire;
- /*
- * If we can't find an IRE_INTERFACE or the caller has not
- * asked for pire, we need to REFRELE the save_ire.
- */
- save_ire = ire;
+done:
+ /* Return generation before dropping lock */
+ if (generationp != NULL)
+ *generationp = ire->ire_generation;
- if (ire->ire_ipif != NULL)
- match_flags |= MATCH_IRE_ILL;
+ RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
- /*
- * ire_ftable_lookup may end up with an incomplete IRE_CACHE
- * entry for the gateway (i.e., one for which the
- * ire_nce->nce_state is not yet ND_REACHABLE). If the caller
- * has specified MATCH_IRE_COMPLETE, such entries will not
- * be returned; instead, we return the IF_RESOLVER ire.
- */
- ire = ire_route_lookup(ire->ire_gateway_addr, 0, 0, 0,
- ire->ire_ipif, NULL, zoneid, tsl, match_flags, ipst);
- DTRACE_PROBE2(ftable__route__lookup1, (ire_t *), ire,
- (ire_t *), save_ire);
- if (ire == NULL ||
- ((ire->ire_type & IRE_CACHE) && ire->ire_nce &&
- ire->ire_nce->nce_state != ND_REACHABLE &&
- (flags & MATCH_IRE_COMPLETE))) {
- /*
- * Do not release the parent ire if MATCH_IRE_PARENT
- * is set. Also return it via ire.
- */
- if (ire != NULL) {
- ire_refrele(ire);
- ire = NULL;
- found_incomplete = B_TRUE;
- }
- if (flags & MATCH_IRE_PARENT) {
- if (pire != NULL) {
- /*
- * Need an extra REFHOLD, if the parent
- * ire is returned via both ire and
- * pire.
- */
- IRE_REFHOLD(save_ire);
- }
- ire = save_ire;
- } else {
- ire_refrele(save_ire);
- if (pire != NULL)
- *pire = NULL;
- }
- if (!found_incomplete)
- return (ire);
- }
- if (ire->ire_type & (IRE_CACHETABLE | IRE_INTERFACE)) {
- /*
- * If the caller did not ask for pire, release
- * it now.
- */
- if (pire == NULL) {
- ire_refrele(save_ire);
- }
- return (ire);
- }
- match_flags |= MATCH_IRE_TYPE;
- gw_addr = ire->ire_gateway_addr;
- gw_ipif = ire->ire_ipif;
- ire_refrele(ire);
- ire = ire_route_lookup(gw_addr, 0, 0,
- (found_incomplete? IRE_INTERFACE :
- (IRE_CACHETABLE | IRE_INTERFACE)),
- gw_ipif, NULL, zoneid, tsl, match_flags, ipst);
- DTRACE_PROBE2(ftable__route__lookup2, (ire_t *), ire,
- (ire_t *), save_ire);
- if (ire == NULL ||
- ((ire->ire_type & IRE_CACHE) && ire->ire_nce &&
- ire->ire_nce->nce_state != ND_REACHABLE &&
- (flags & MATCH_IRE_COMPLETE))) {
- /*
- * Do not release the parent ire if MATCH_IRE_PARENT
- * is set. Also return it via ire.
- */
- if (ire != NULL) {
- ire_refrele(ire);
- ire = NULL;
- }
- if (flags & MATCH_IRE_PARENT) {
- if (pire != NULL) {
- /*
- * Need an extra REFHOLD, if the
- * parent ire is returned via both
- * ire and pire.
- */
- IRE_REFHOLD(save_ire);
- }
- ire = save_ire;
- } else {
- ire_refrele(save_ire);
- if (pire != NULL)
- *pire = NULL;
- }
- return (ire);
- } else if (pire == NULL) {
- /*
- * If the caller did not ask for pire, release
- * it now.
- */
- ire_refrele(save_ire);
- }
- return (ire);
+ /*
+ * For shared-IP zones we need additional checks to what was
+ * done in ire_match_args to make sure IRE_LOCALs are handled.
+ *
+ * When ip_restrict_interzone_loopback is set, then
+ * we ensure that IRE_LOCAL are only used for loopback
+ * between zones when the logical "Ethernet" would
+ * have looped them back. That is, if in the absense of
+ * the IRE_LOCAL we would have sent to packet out the
+ * same ill.
+ */
+ if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES &&
+ ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES &&
+ ipst->ips_ip_restrict_interzone_loopback) {
+ ire = ire_alt_local(ire, zoneid, tsl, ill, generationp);
+ ASSERT(ire != NULL);
}
- ASSERT(pire == NULL || *pire == NULL);
return (ire);
}
/*
* This function is called by
- * ip_fast_forward->ire_forward_simple
+ * ip_input/ire_route_recursive when doing a route lookup on only the
+ * destination address.
+ *
* The optimizations of this function over ire_ftable_lookup are:
* o removing unnecessary flag matching
* o doing longest prefix match instead of overloading it further
* with the unnecessary "best_prefix_match"
- * o Does not do round robin of default route for every packet
- * o inlines code of ire_ctable_lookup to look for nexthop cache
- * entry before calling ire_route_lookup
+ *
+ * If no route is found we return IRE_NOROUTE.
*/
-static ire_t *
-ire_ftable_lookup_simple(ipaddr_t addr,
- ire_t **pire, zoneid_t zoneid, int flags,
- ip_stack_t *ipst)
+ire_t *
+ire_ftable_lookup_simple_v4(ipaddr_t addr, uint32_t xmit_hint, ip_stack_t *ipst,
+ uint_t *generationp)
{
- ire_t *ire = NULL;
- ire_t *tmp_ire = NULL;
+ ire_t *ire;
struct rt_sockaddr rdst;
struct rt_entry *rt;
- irb_t *irb_ptr;
- ire_t *save_ire;
- int match_flags;
+ irb_t *irb;
rdst.rt_sin_len = sizeof (rdst);
rdst.rt_sin_family = AF_INET;
@@ -430,263 +236,125 @@ ire_ftable_lookup_simple(ipaddr_t addr,
rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst,
ipst->ips_ip_ftable, NULL, NULL);
- if (rt == NULL) {
- RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
- return (NULL);
- }
- irb_ptr = &rt->rt_irb;
- if (irb_ptr == NULL || irb_ptr->irb_ire_cnt == 0) {
- RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
- return (NULL);
- }
+ if (rt == NULL)
+ goto bad;
- rw_enter(&irb_ptr->irb_lock, RW_READER);
- for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) {
- if (ire->ire_zoneid == zoneid)
- break;
- }
+ irb = &rt->rt_irb;
+ if (irb->irb_ire_cnt == 0)
+ goto bad;
- if (ire == NULL || (ire->ire_marks & IRE_MARK_CONDEMNED)) {
- rw_exit(&irb_ptr->irb_lock);
- RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
- return (NULL);
+ rw_enter(&irb->irb_lock, RW_READER);
+ ire = irb->irb_ire;
+ if (ire == NULL) {
+ rw_exit(&irb->irb_lock);
+ goto bad;
}
- /* we have a ire that matches */
- if (ire != NULL)
- IRE_REFHOLD(ire);
- rw_exit(&irb_ptr->irb_lock);
- RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
-
- if ((flags & MATCH_IRE_RJ_BHOLE) &&
- (ire->ire_flags & (RTF_BLACKHOLE | RTF_REJECT))) {
- return (ire);
+ while (IRE_IS_CONDEMNED(ire)) {
+ ire = ire->ire_next;
+ if (ire == NULL) {
+ rw_exit(&irb->irb_lock);
+ goto bad;
+ }
}
- /*
- * At this point, IRE that was found must be an IRE_FORWARDTABLE
- * type. If this is a recursive lookup and an IRE_INTERFACE type was
- * found, return that. If it was some other IRE_FORWARDTABLE type of
- * IRE (one of the prefix types), then it is necessary to fill in the
- * parent IRE pointed to by pire, and then lookup the gateway address of
- * the parent. For backwards compatiblity, if this lookup returns an
- * IRE other than a IRE_CACHETABLE or IRE_INTERFACE, then one more level
- * of lookup is done.
- */
- match_flags = MATCH_IRE_DSTONLY;
- if (ire->ire_type & IRE_INTERFACE)
- return (ire);
- *pire = ire;
- /*
- * If we can't find an IRE_INTERFACE or the caller has not
- * asked for pire, we need to REFRELE the save_ire.
- */
- save_ire = ire;
+	/* we have an ire that matches */
+ ire_refhold(ire);
+ rw_exit(&irb->irb_lock);
/*
- * Currently MATCH_IRE_ILL is never used with
- * (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT) while
- * sending out packets as MATCH_IRE_ILL is used only
- * for communicating with on-link hosts. We can't assert
- * that here as RTM_GET calls this function with
- * MATCH_IRE_ILL | MATCH_IRE_DEFAULT | MATCH_IRE_RECURSIVE.
- * We have already used the MATCH_IRE_ILL in determining
- * the right prefix route at this point. To match the
- * behavior of how we locate routes while sending out
- * packets, we don't want to use MATCH_IRE_ILL below
- * while locating the interface route.
+ * round-robin only if we have more than one route in the bucket.
+ * ips_ip_ecmp_behavior controls when we do ECMP
+ * 2: always
+ * 1: for IRE_DEFAULT and /0 IRE_INTERFACE
+ * 0: never
*
- * ire_ftable_lookup may end up with an incomplete IRE_CACHE
- * entry for the gateway (i.e., one for which the
- * ire_nce->nce_state is not yet ND_REACHABLE). If the caller
- * has specified MATCH_IRE_COMPLETE, such entries will not
- * be returned; instead, we return the IF_RESOLVER ire.
+ * Note: if we found an IRE_IF_CLONE we won't look at the bucket with
+	 * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /32 match
+	 * and the IRE_INTERFACEs are likely to be shorter matches.
*/
-
- if (ire->ire_ipif == NULL) {
- tmp_ire = ire;
- /*
- * Look to see if the nexthop entry is in the cachetable
- */
- ire = ire_cache_lookup(ire->ire_gateway_addr, zoneid, NULL,
- ipst);
- if (ire == NULL) {
- /* Try ire_route_lookup */
- ire = tmp_ire;
- } else {
- goto solved;
- }
- }
- if (ire->ire_ipif != NULL)
- match_flags |= MATCH_IRE_ILL;
-
- ire = ire_route_lookup(ire->ire_gateway_addr, 0,
- 0, 0, ire->ire_ipif, NULL, zoneid, NULL, match_flags, ipst);
-solved:
- DTRACE_PROBE2(ftable__route__lookup1, (ire_t *), ire,
- (ire_t *), save_ire);
- if (ire == NULL) {
- /*
- * Do not release the parent ire if MATCH_IRE_PARENT
- * is set. Also return it via ire.
- */
- ire_refrele(save_ire);
- *pire = NULL;
- return (ire);
- }
- if (ire->ire_type & (IRE_CACHETABLE | IRE_INTERFACE)) {
- /*
- * If the caller did not ask for pire, release
- * it now.
- */
- if (pire == NULL) {
- ire_refrele(save_ire);
+ if (ire->ire_bucket->irb_ire_cnt > 1) {
+ if (ipst->ips_ip_ecmp_behavior == 2 ||
+ (ipst->ips_ip_ecmp_behavior == 1 &&
+ IS_DEFAULT_ROUTE(ire))) {
+ ire_t *next_ire;
+ ire_ftable_args_t margs;
+
+ (void) memset(&margs, 0, sizeof (margs));
+ margs.ift_addr = addr;
+ margs.ift_zoneid = ALL_ZONES;
+
+ next_ire = ire_round_robin(ire->ire_bucket, &margs,
+ xmit_hint, ire, ipst);
+ if (next_ire == NULL) {
+ /* keep ire if next_ire is null */
+ if (generationp != NULL)
+ *generationp = ire->ire_generation;
+ RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
+ return (ire);
+ }
+ ire_refrele(ire);
+ ire = next_ire;
}
}
- return (ire);
-}
-
-/*
- * Find an IRE_OFFSUBNET IRE entry for the multicast address 'group'
- * that goes through 'ipif'. As a fallback, a route that goes through
- * ipif->ipif_ill can be returned.
- */
-ire_t *
-ipif_lookup_multi_ire(ipif_t *ipif, ipaddr_t group)
-{
- ire_t *ire;
- ire_t *save_ire = NULL;
- ire_t *gw_ire;
- irb_t *irb;
- ipaddr_t gw_addr;
- int match_flags = MATCH_IRE_TYPE | MATCH_IRE_ILL;
- ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
-
- ASSERT(CLASSD(group));
-
- ire = ire_ftable_lookup(group, 0, 0, 0, NULL, NULL, ALL_ZONES, 0,
- NULL, MATCH_IRE_DEFAULT, ipst);
-
- if (ire == NULL)
- return (NULL);
+ /* Return generation before dropping lock */
+ if (generationp != NULL)
+ *generationp = ire->ire_generation;
- irb = ire->ire_bucket;
- ASSERT(irb);
+ RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
- IRB_REFHOLD(irb);
- ire_refrele(ire);
- for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
- if (ire->ire_addr != group ||
- ipif->ipif_zoneid != ire->ire_zoneid &&
- ire->ire_zoneid != ALL_ZONES) {
- continue;
- }
+ /*
+ * Since we only did ALL_ZONES matches there is no special handling
+ * of IRE_LOCALs needed here. ire_ftable_lookup_v4 has to handle that.
+ */
+ return (ire);
- switch (ire->ire_type) {
- case IRE_DEFAULT:
- case IRE_PREFIX:
- case IRE_HOST:
- gw_addr = ire->ire_gateway_addr;
- gw_ire = ire_ftable_lookup(gw_addr, 0, 0, IRE_INTERFACE,
- ipif, NULL, ALL_ZONES, 0, NULL, match_flags, ipst);
-
- if (gw_ire != NULL) {
- if (save_ire != NULL) {
- ire_refrele(save_ire);
- }
- IRE_REFHOLD(ire);
- if (gw_ire->ire_ipif == ipif) {
- ire_refrele(gw_ire);
-
- IRB_REFRELE(irb);
- return (ire);
- }
- ire_refrele(gw_ire);
- save_ire = ire;
- }
- break;
- case IRE_IF_NORESOLVER:
- case IRE_IF_RESOLVER:
- if (ire->ire_ipif == ipif) {
- if (save_ire != NULL) {
- ire_refrele(save_ire);
- }
- IRE_REFHOLD(ire);
-
- IRB_REFRELE(irb);
- return (ire);
- }
- break;
- }
- }
- IRB_REFRELE(irb);
+bad:
+ if (generationp != NULL)
+ *generationp = IRE_GENERATION_VERIFY;
- return (save_ire);
+ RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
+ return (ire_reject(ipst, B_FALSE));
}
/*
- * Find an IRE_INTERFACE for the multicast group.
+ * Find the ill matching a multicast group.
* Allows different routes for multicast addresses
* in the unicast routing table (akin to 224.0.0.0 but could be more specific)
* which point at different interfaces. This is used when IP_MULTICAST_IF
* isn't specified (when sending) and when IP_ADD_MEMBERSHIP doesn't
* specify the interface to join on.
*
- * Supports IP_BOUND_IF by following the ipif/ill when recursing.
+ * Supports link-local addresses by using ire_route_recursive which follows
+ * the ill when recursing.
+ *
+ * To handle CGTP, since we don't have a separate IRE_MULTICAST for each group
+ * and the MULTIRT property can be different for different groups, we
+ * extract RTF_MULTIRT from the special unicast route added for a group
+ * with CGTP and pass that back in the multirtp argument.
+ * This is used in ip_set_destination etc to set ixa_postfragfn for multicast.
+ * We have a setsrcp argument for the same reason.
*/
-ire_t *
-ire_lookup_multi(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst)
+ill_t *
+ire_lookup_multi_ill_v4(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst,
+ boolean_t *multirtp, ipaddr_t *setsrcp)
{
ire_t *ire;
- ipif_t *ipif = NULL;
- int match_flags = MATCH_IRE_TYPE;
- ipaddr_t gw_addr;
-
- ire = ire_ftable_lookup(group, 0, 0, 0, NULL, NULL, zoneid,
- 0, NULL, MATCH_IRE_DEFAULT, ipst);
+ ill_t *ill;
- /* We search a resolvable ire in case of multirouting. */
- if ((ire != NULL) && (ire->ire_flags & RTF_MULTIRT)) {
- ire_t *cire = NULL;
- /*
- * If the route is not resolvable, the looked up ire
- * may be changed here. In that case, ire_multirt_lookup()
- * IRE_REFRELE the original ire and change it.
- */
- (void) ire_multirt_lookup(&cire, &ire, MULTIRT_CACHEGW, NULL,
- NULL, ipst);
- if (cire != NULL)
- ire_refrele(cire);
- }
- if (ire == NULL)
- return (NULL);
- /*
- * Make sure we follow ire_ipif.
- *
- * We need to determine the interface route through
- * which the gateway will be reached.
- */
- if (ire->ire_ipif != NULL) {
- ipif = ire->ire_ipif;
- match_flags |= MATCH_IRE_ILL;
- }
-
- switch (ire->ire_type) {
- case IRE_DEFAULT:
- case IRE_PREFIX:
- case IRE_HOST:
- gw_addr = ire->ire_gateway_addr;
- ire_refrele(ire);
- ire = ire_ftable_lookup(gw_addr, 0, 0,
- IRE_INTERFACE, ipif, NULL, zoneid, 0,
- NULL, match_flags, ipst);
- return (ire);
- case IRE_IF_NORESOLVER:
- case IRE_IF_RESOLVER:
- return (ire);
- default:
+ ire = ire_route_recursive_v4(group, 0, NULL, zoneid, NULL,
+ MATCH_IRE_DSTONLY, B_FALSE, 0, ipst, setsrcp, NULL, NULL);
+ ASSERT(ire != NULL);
+ if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
ire_refrele(ire);
return (NULL);
}
+
+ if (multirtp != NULL)
+ *multirtp = (ire->ire_flags & RTF_MULTIRT) != 0;
+
+ ill = ire_nexthop_ill(ire);
+ ire_refrele(ire);
+ return (ill);
}
/*
@@ -701,7 +369,7 @@ ire_del_host_redir(ire_t *ire, char *gateway)
}
/*
- * Search for all HOST REDIRECT routes that are
+ * Search for all IRE_HOST RTF_DYNAMIC (aka redirect) routes that are
* pointing at the specified gateway and
* delete them. This routine is called only
* when a default gateway is going away.
@@ -718,732 +386,6 @@ ire_delete_host_redirects(ipaddr_t gateway, ip_stack_t *ipst)
rtfunc, &rtfarg, irb_refhold_rn, irb_refrele_rn);
}
-struct ihandle_arg {
- uint32_t ihandle;
- ire_t *ire;
-};
-
-static int
-ire_ihandle_onlink_match(struct radix_node *rn, void *arg)
-{
- struct rt_entry *rt;
- irb_t *irb;
- ire_t *ire;
- struct ihandle_arg *ih = arg;
-
- rt = (struct rt_entry *)rn;
- ASSERT(rt != NULL);
- irb = &rt->rt_irb;
- for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
- if ((ire->ire_type & IRE_INTERFACE) &&
- (ire->ire_ihandle == ih->ihandle)) {
- ih->ire = ire;
- IRE_REFHOLD(ire);
- return (1);
- }
- }
- return (0);
-}
-
-/*
- * Locate the interface ire that is tied to the cache ire 'cire' via
- * cire->ire_ihandle.
- *
- * We are trying to create the cache ire for an onlink destn. or
- * gateway in 'cire'. We are called from ire_add_v4() in the IRE_IF_RESOLVER
- * case, after the ire has come back from ARP.
- */
-ire_t *
-ire_ihandle_lookup_onlink(ire_t *cire)
-{
- ire_t *ire;
- int match_flags;
- struct ihandle_arg ih;
- ip_stack_t *ipst;
-
- ASSERT(cire != NULL);
- ipst = cire->ire_ipst;
-
- /*
- * We don't need to specify the zoneid to ire_ftable_lookup() below
- * because the ihandle refers to an ipif which can be in only one zone.
- */
- match_flags = MATCH_IRE_TYPE | MATCH_IRE_IHANDLE | MATCH_IRE_MASK;
- /*
- * We know that the mask of the interface ire equals cire->ire_cmask.
- * (When ip_newroute() created 'cire' for an on-link destn. it set its
- * cmask from the interface ire's mask)
- */
- ire = ire_ftable_lookup(cire->ire_addr, cire->ire_cmask, 0,
- IRE_INTERFACE, NULL, NULL, ALL_ZONES, cire->ire_ihandle,
- NULL, match_flags, ipst);
- if (ire != NULL)
- return (ire);
- /*
- * If we didn't find an interface ire above, we can't declare failure.
- * For backwards compatibility, we need to support prefix routes
- * pointing to next hop gateways that are not on-link.
- *
- * In the resolver/noresolver case, ip_newroute() thinks it is creating
- * the cache ire for an onlink destination in 'cire'. But 'cire' is
- * not actually onlink, because ire_ftable_lookup() cheated it, by
- * doing ire_route_lookup() twice and returning an interface ire.
- *
- * Eg. default - gw1 (line 1)
- * gw1 - gw2 (line 2)
- * gw2 - hme0 (line 3)
- *
- * In the above example, ip_newroute() tried to create the cache ire
- * 'cire' for gw1, based on the interface route in line 3. The
- * ire_ftable_lookup() above fails, because there is no interface route
- * to reach gw1. (it is gw2). We fall thru below.
- *
- * Do a brute force search based on the ihandle in a subset of the
- * forwarding tables, corresponding to cire->ire_cmask. Otherwise
- * things become very complex, since we don't have 'pire' in this
- * case. (Also note that this method is not possible in the offlink
- * case because we don't know the mask)
- */
- (void) memset(&ih, 0, sizeof (ih));
- ih.ihandle = cire->ire_ihandle;
- (void) ipst->ips_ip_ftable->rnh_walktree_mt(ipst->ips_ip_ftable,
- ire_ihandle_onlink_match, &ih, irb_refhold_rn, irb_refrele_rn);
- return (ih.ire);
-}
-
-/*
- * IRE iterator used by ire_ftable_lookup[_v6]() to process multiple default
- * routes. Given a starting point in the hash list (ire_origin), walk the IREs
- * in the bucket skipping default interface routes and deleted entries.
- * Returns the next IRE (unheld), or NULL when we're back to the starting point.
- * Assumes that the caller holds a reference on the IRE bucket.
- */
-ire_t *
-ire_get_next_default_ire(ire_t *ire, ire_t *ire_origin)
-{
- ASSERT(ire_origin->ire_bucket != NULL);
- ASSERT(ire != NULL);
-
- do {
- ire = ire->ire_next;
- if (ire == NULL)
- ire = ire_origin->ire_bucket->irb_ire;
- if (ire == ire_origin)
- return (NULL);
- } while ((ire->ire_type & IRE_INTERFACE) ||
- (ire->ire_marks & IRE_MARK_CONDEMNED));
- ASSERT(ire != NULL);
- return (ire);
-}
-
-static ipif_t *
-ire_forward_src_ipif(ipaddr_t dst, ire_t *sire, ire_t *ire,
- int zoneid, ushort_t *marks)
-{
- ipif_t *src_ipif;
- ill_t *ill = ire->ire_ipif->ipif_ill;
- ip_stack_t *ipst = ill->ill_ipst;
-
- /*
- * Pick the best source address from ill.
- *
- * 1) Try to pick the source address from the destination
- * route. Clustering assumes that when we have multiple
- * prefixes hosted on an interface, the prefix of the
- * source address matches the prefix of the destination
- * route. We do this only if the address is not
- * DEPRECATED.
- *
- * 2) If the conn is in a different zone than the ire, we
- * need to pick a source address from the right zone.
- */
- if ((sire != NULL) && (sire->ire_flags & RTF_SETSRC)) {
- /*
- * The RTF_SETSRC flag is set in the parent ire (sire).
- * Check that the ipif matching the requested source
- * address still exists.
- */
- src_ipif = ipif_lookup_addr(sire->ire_src_addr, NULL,
- zoneid, NULL, NULL, NULL, NULL, ipst);
- return (src_ipif);
- }
- *marks |= IRE_MARK_USESRC_CHECK;
- if (IS_IPMP(ill) ||
- (ire->ire_ipif->ipif_flags & IPIF_DEPRECATED) ||
- (ill->ill_usesrc_ifindex != 0)) {
- src_ipif = ipif_select_source(ill, dst, zoneid);
- } else {
- src_ipif = ire->ire_ipif;
- ASSERT(src_ipif != NULL);
- /* hold src_ipif for uniformity */
- ipif_refhold(src_ipif);
- }
- return (src_ipif);
-}
-
-/*
- * This function is called by ip_rput_noire() and ip_fast_forward()
- * to resolve the route of incoming packet that needs to be forwarded.
- * If the ire of the nexthop is not already in the cachetable, this
- * routine will insert it to the table, but won't trigger ARP resolution yet.
- * Thus unlike ip_newroute, this function adds incomplete ires to
- * the cachetable. ARP resolution for these ires are delayed until
- * after all of the packet processing is completed and its ready to
- * be sent out on the wire, Eventually, the packet transmit routine
- * ip_xmit_v4() attempts to send a packet to the driver. If it finds
- * that there is no link layer information, it will do the arp
- * resolution and queue the packet in ire->ire_nce->nce_qd_mp and
- * then send it out once the arp resolution is over
- * (see ip_xmit_v4()->ire_arpresolve()). This scheme is similar to
- * the model of BSD/SunOS 4
- *
- * In future, the insertion of incomplete ires in the cachetable should
- * be implemented in hostpath as well, as doing so will greatly reduce
- * the existing complexity for code paths that depend on the context of
- * the sender (such as IPsec).
- *
- * Thus this scheme of adding incomplete ires in cachetable in forwarding
- * path can be used as a template for simplifying the hostpath.
- */
-
-ire_t *
-ire_forward(ipaddr_t dst, enum ire_forward_action *ret_action,
- ire_t *supplied_ire, ire_t *supplied_sire, const struct ts_label_s *tsl,
- ip_stack_t *ipst)
-{
- ipaddr_t gw = 0;
- ire_t *ire = NULL;
- ire_t *sire = NULL, *save_ire;
- ill_t *dst_ill = NULL;
- int error;
- zoneid_t zoneid;
- ipif_t *src_ipif = NULL;
- mblk_t *res_mp;
- ushort_t ire_marks = 0;
- tsol_gcgrp_t *gcgrp = NULL;
- tsol_gcgrp_addr_t ga;
-
- zoneid = GLOBAL_ZONEID;
-
- if (supplied_ire != NULL) {
- /* We have arrived here from ipfil_sendpkt */
- ire = supplied_ire;
- sire = supplied_sire;
- goto create_irecache;
- }
-
- ire = ire_ftable_lookup(dst, 0, 0, 0, NULL, &sire, zoneid, 0,
- tsl, MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT |
- MATCH_IRE_RJ_BHOLE | MATCH_IRE_PARENT|MATCH_IRE_SECATTR, ipst);
-
- if (ire == NULL) {
- ip_rts_change(RTM_MISS, dst, 0, 0, 0, 0, 0, 0, RTA_DST, ipst);
- goto icmp_err_ret;
- }
-
- /*
- * If we encounter CGTP, we should have the caller use
- * ip_newroute to resolve multirt instead of this function.
- * CGTP specs explicitly state that it can't be used with routers.
- * This essentially prevents insertion of incomplete RTF_MULTIRT
- * ires in cachetable.
- */
- if (ipst->ips_ip_cgtp_filter &&
- ((ire->ire_flags & RTF_MULTIRT) ||
- ((sire != NULL) && (sire->ire_flags & RTF_MULTIRT)))) {
- ip3dbg(("ire_forward: packet is to be multirouted- "
- "handing it to ip_newroute\n"));
- if (sire != NULL)
- ire_refrele(sire);
- ire_refrele(ire);
- /*
- * Inform caller about encountering of multirt so that
- * ip_newroute() can be called.
- */
- *ret_action = Forward_check_multirt;
- return (NULL);
- }
-
- /*
- * Verify that the returned IRE does not have either
- * the RTF_REJECT or RTF_BLACKHOLE flags set and that the IRE is
- * either an IRE_CACHE, IRE_IF_NORESOLVER or IRE_IF_RESOLVER.
- */
- if ((ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) ||
- (ire->ire_type & (IRE_CACHE | IRE_INTERFACE)) == 0) {
- ip3dbg(("ire 0x%p is not cache/resolver/noresolver\n",
- (void *)ire));
- goto icmp_err_ret;
- }
-
- /*
- * If we already have a fully resolved IRE CACHE of the
- * nexthop router, just hand over the cache entry
- * and we are done.
- */
-
- if (ire->ire_type & IRE_CACHE) {
-
- /*
- * If we are using this ire cache entry as a
- * gateway to forward packets, chances are we
- * will be using it again. So turn off
- * the temporary flag, thus reducing its
- * chances of getting deleted frequently.
- */
- if (ire->ire_marks & IRE_MARK_TEMPORARY) {
- irb_t *irb = ire->ire_bucket;
- rw_enter(&irb->irb_lock, RW_WRITER);
- /*
- * We need to recheck for IRE_MARK_TEMPORARY after
- * acquiring the lock in order to guarantee
- * irb_tmp_ire_cnt
- */
- if (ire->ire_marks & IRE_MARK_TEMPORARY) {
- ire->ire_marks &= ~IRE_MARK_TEMPORARY;
- irb->irb_tmp_ire_cnt--;
- }
- rw_exit(&irb->irb_lock);
- }
-
- if (sire != NULL) {
- UPDATE_OB_PKT_COUNT(sire);
- sire->ire_last_used_time = lbolt;
- ire_refrele(sire);
- }
- *ret_action = Forward_ok;
- return (ire);
- }
-create_irecache:
- /*
- * Increment the ire_ob_pkt_count field for ire if it is an
- * INTERFACE (IF_RESOLVER or IF_NORESOLVER) IRE type, and
- * increment the same for the parent IRE, sire, if it is some
- * sort of prefix IRE (which includes DEFAULT, PREFIX, and HOST).
- */
- if ((ire->ire_type & IRE_INTERFACE) != 0) {
- UPDATE_OB_PKT_COUNT(ire);
- ire->ire_last_used_time = lbolt;
- }
-
- /*
- * sire must be either IRE_CACHETABLE OR IRE_INTERFACE type
- */
- if (sire != NULL) {
- gw = sire->ire_gateway_addr;
- ASSERT((sire->ire_type &
- (IRE_CACHETABLE | IRE_INTERFACE)) == 0);
- UPDATE_OB_PKT_COUNT(sire);
- sire->ire_last_used_time = lbolt;
- }
-
- dst_ill = ire->ire_ipif->ipif_ill;
- if (IS_IPMP(dst_ill))
- dst_ill = ipmp_illgrp_hold_next_ill(dst_ill->ill_grp);
- else
- ill_refhold(dst_ill);
-
- if (dst_ill == NULL) {
- ip2dbg(("ire_forward no dst ill; ire 0x%p\n", (void *)ire));
- goto icmp_err_ret;
- }
-
- ASSERT(src_ipif == NULL);
- /* Now obtain the src_ipif */
- src_ipif = ire_forward_src_ipif(dst, sire, ire, zoneid, &ire_marks);
- if (src_ipif == NULL)
- goto icmp_err_ret;
-
- switch (ire->ire_type) {
- case IRE_IF_NORESOLVER:
- /* create ire_cache for ire_addr endpoint */
- if (dst_ill->ill_resolver_mp == NULL) {
- ip1dbg(("ire_forward: dst_ill %p "
- "for IRE_IF_NORESOLVER ire %p has "
- "no ill_resolver_mp\n",
- (void *)dst_ill, (void *)ire));
- goto icmp_err_ret;
- }
- /* FALLTHRU */
- case IRE_IF_RESOLVER:
- /*
- * We have the IRE_IF_RESOLVER of the nexthop gateway
- * and now need to build a IRE_CACHE for it.
- * In this case, we have the following :
- *
- * 1) src_ipif - used for getting a source address.
- *
- * 2) dst_ill - from which we derive ire_stq/ire_rfq. This
- * means packets using the IRE_CACHE that we will build
- * here will go out on dst_ill.
- *
- * 3) sire may or may not be NULL. But, the IRE_CACHE that is
- * to be created will only be tied to the IRE_INTERFACE
- * that was derived from the ire_ihandle field.
- *
- * If sire is non-NULL, it means the destination is
- * off-link and we will first create the IRE_CACHE for the
- * gateway.
- */
- res_mp = dst_ill->ill_resolver_mp;
- if (ire->ire_type == IRE_IF_RESOLVER &&
- (!OK_RESOLVER_MP(res_mp))) {
- goto icmp_err_ret;
- }
- /*
- * To be at this point in the code with a non-zero gw
- * means that dst is reachable through a gateway that
- * we have never resolved. By changing dst to the gw
- * addr we resolve the gateway first.
- */
- if (gw != INADDR_ANY) {
- /*
- * The source ipif that was determined above was
- * relative to the destination address, not the
- * gateway's. If src_ipif was not taken out of
- * the IRE_IF_RESOLVER entry, we'll need to call
- * ipif_select_source() again.
- */
- if (src_ipif != ire->ire_ipif) {
- ipif_refrele(src_ipif);
- src_ipif = ipif_select_source(dst_ill,
- gw, zoneid);
- if (src_ipif == NULL)
- goto icmp_err_ret;
- }
- dst = gw;
- gw = INADDR_ANY;
- }
- /*
- * dst has been set to the address of the nexthop.
- *
- * TSol note: get security attributes of the nexthop;
- * Note that the nexthop may either be a gateway, or the
- * packet destination itself; Detailed explanation of
- * issues involved is provided in the IRE_IF_NORESOLVER
- * logic in ip_newroute().
- */
- ga.ga_af = AF_INET;
- IN6_IPADDR_TO_V4MAPPED(dst, &ga.ga_addr);
- gcgrp = gcgrp_lookup(&ga, B_FALSE);
-
- if (ire->ire_type == IRE_IF_NORESOLVER)
- dst = ire->ire_addr; /* ire_cache for tunnel endpoint */
-
- save_ire = ire;
- /*
- * create an incomplete IRE_CACHE.
- * An areq_mp will be generated in ire_arpresolve() for
- * RESOLVER interfaces.
- */
- ire = ire_create(
- (uchar_t *)&dst, /* dest address */
- (uchar_t *)&ip_g_all_ones, /* mask */
- (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */
- (uchar_t *)&gw, /* gateway address */
- (save_ire->ire_type == IRE_IF_RESOLVER ? NULL:
- &save_ire->ire_max_frag),
- NULL,
- dst_ill->ill_rq, /* recv-from queue */
- dst_ill->ill_wq, /* send-to queue */
- IRE_CACHE, /* IRE type */
- src_ipif,
- ire->ire_mask, /* Parent mask */
- 0,
- ire->ire_ihandle, /* Interface handle */
- 0,
- &(ire->ire_uinfo),
- NULL,
- gcgrp,
- ipst);
- ip1dbg(("incomplete ire_cache 0x%p\n", (void *)ire));
- if (ire != NULL) {
- gcgrp = NULL; /* reference now held by IRE */
- ire->ire_marks |= ire_marks;
- /* add the incomplete ire: */
- error = ire_add(&ire, NULL, NULL, NULL, B_TRUE);
- if (error == 0 && ire != NULL) {
- ire->ire_max_frag = save_ire->ire_max_frag;
- ip1dbg(("setting max_frag to %d in ire 0x%p\n",
- ire->ire_max_frag, (void *)ire));
- } else {
- ire_refrele(save_ire);
- goto icmp_err_ret;
- }
- } else {
- if (gcgrp != NULL) {
- GCGRP_REFRELE(gcgrp);
- gcgrp = NULL;
- }
- }
-
- ire_refrele(save_ire);
- break;
- default:
- break;
- }
-
- *ret_action = Forward_ok;
- if (sire != NULL)
- ire_refrele(sire);
- if (dst_ill != NULL)
- ill_refrele(dst_ill);
- if (src_ipif != NULL)
- ipif_refrele(src_ipif);
- return (ire);
-icmp_err_ret:
- *ret_action = Forward_ret_icmp_err;
- if (sire != NULL)
- ire_refrele(sire);
- if (dst_ill != NULL)
- ill_refrele(dst_ill);
- if (src_ipif != NULL)
- ipif_refrele(src_ipif);
- if (ire != NULL) {
- if (ire->ire_flags & RTF_BLACKHOLE)
- *ret_action = Forward_blackhole;
- ire_refrele(ire);
- }
- return (NULL);
-}
-
-/*
- * Since caller is ip_fast_forward, there is no CGTP or Tsol test
- * Also we dont call ftable lookup with MATCH_IRE_PARENT
- */
-
-ire_t *
-ire_forward_simple(ipaddr_t dst, enum ire_forward_action *ret_action,
- ip_stack_t *ipst)
-{
- ipaddr_t gw = 0;
- ire_t *ire = NULL;
- ire_t *sire = NULL, *save_ire;
- ill_t *dst_ill = NULL;
- int error;
- zoneid_t zoneid = GLOBAL_ZONEID;
- ipif_t *src_ipif = NULL;
- mblk_t *res_mp;
- ushort_t ire_marks = 0;
-
- ire = ire_ftable_lookup_simple(dst, &sire, zoneid,
- MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE, ipst);
- if (ire == NULL) {
- ip_rts_change(RTM_MISS, dst, 0, 0, 0, 0, 0, 0, RTA_DST, ipst);
- goto icmp_err_ret;
- }
-
- /*
- * Verify that the returned IRE does not have either
- * the RTF_REJECT or RTF_BLACKHOLE flags set and that the IRE is
- * either an IRE_CACHE, IRE_IF_NORESOLVER or IRE_IF_RESOLVER.
- */
- if ((ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) ||
- ((ire->ire_type & (IRE_CACHE | IRE_INTERFACE)) == 0)) {
- ip3dbg(("ire 0x%p is not cache/resolver/noresolver\n",
- (void *)ire));
- goto icmp_err_ret;
- }
-
- /*
- * If we already have a fully resolved IRE CACHE of the
- * nexthop router, just hand over the cache entry
- * and we are done.
- */
- if (ire->ire_type & IRE_CACHE) {
- /*
- * If we are using this ire cache entry as a
- * gateway to forward packets, chances are we
- * will be using it again. So turn off
- * the temporary flag, thus reducing its
- * chances of getting deleted frequently.
- */
- if (ire->ire_marks & IRE_MARK_TEMPORARY) {
- irb_t *irb = ire->ire_bucket;
- rw_enter(&irb->irb_lock, RW_WRITER);
- ire->ire_marks &= ~IRE_MARK_TEMPORARY;
- irb->irb_tmp_ire_cnt--;
- rw_exit(&irb->irb_lock);
- }
-
- if (sire != NULL) {
- UPDATE_OB_PKT_COUNT(sire);
- ire_refrele(sire);
- }
- *ret_action = Forward_ok;
- return (ire);
- }
- /*
- * Increment the ire_ob_pkt_count field for ire if it is an
- * INTERFACE (IF_RESOLVER or IF_NORESOLVER) IRE type, and
- * increment the same for the parent IRE, sire, if it is some
- * sort of prefix IRE (which includes DEFAULT, PREFIX, and HOST).
- */
- if ((ire->ire_type & IRE_INTERFACE) != 0) {
- UPDATE_OB_PKT_COUNT(ire);
- ire->ire_last_used_time = lbolt;
- }
-
- /*
- * sire must be either IRE_CACHETABLE OR IRE_INTERFACE type
- */
- if (sire != NULL) {
- gw = sire->ire_gateway_addr;
- ASSERT((sire->ire_type &
- (IRE_CACHETABLE | IRE_INTERFACE)) == 0);
- UPDATE_OB_PKT_COUNT(sire);
- }
-
- dst_ill = ire->ire_ipif->ipif_ill;
- if (IS_IPMP(dst_ill))
- dst_ill = ipmp_illgrp_hold_next_ill(dst_ill->ill_grp);
- else
- ill_refhold(dst_ill); /* for symmetry */
-
- if (dst_ill == NULL) {
- ip2dbg(("ire_forward_simple: no dst ill; ire 0x%p\n",
- (void *)ire));
- goto icmp_err_ret;
- }
-
- ASSERT(src_ipif == NULL);
- /* Now obtain the src_ipif */
- src_ipif = ire_forward_src_ipif(dst, sire, ire, zoneid, &ire_marks);
- if (src_ipif == NULL)
- goto icmp_err_ret;
-
- switch (ire->ire_type) {
- case IRE_IF_NORESOLVER:
- /* create ire_cache for ire_addr endpoint */
- case IRE_IF_RESOLVER:
- /*
- * We have the IRE_IF_RESOLVER of the nexthop gateway
- * and now need to build a IRE_CACHE for it.
- * In this case, we have the following :
- *
- * 1) src_ipif - used for getting a source address.
- *
- * 2) dst_ill - from which we derive ire_stq/ire_rfq. This
- * means packets using the IRE_CACHE that we will build
- * here will go out on dst_ill.
- *
- * 3) sire may or may not be NULL. But, the IRE_CACHE that is
- * to be created will only be tied to the IRE_INTERFACE
- * that was derived from the ire_ihandle field.
- *
- * If sire is non-NULL, it means the destination is
- * off-link and we will first create the IRE_CACHE for the
- * gateway.
- */
- res_mp = dst_ill->ill_resolver_mp;
- if (ire->ire_type == IRE_IF_RESOLVER &&
- (!OK_RESOLVER_MP(res_mp))) {
- ire_refrele(ire);
- ire = NULL;
- goto out;
- }
- /*
- * To be at this point in the code with a non-zero gw
- * means that dst is reachable through a gateway that
- * we have never resolved. By changing dst to the gw
- * addr we resolve the gateway first.
- */
- if (gw != INADDR_ANY) {
- /*
- * The source ipif that was determined above was
- * relative to the destination address, not the
- * gateway's. If src_ipif was not taken out of
- * the IRE_IF_RESOLVER entry, we'll need to call
- * ipif_select_source() again.
- */
- if (src_ipif != ire->ire_ipif) {
- ipif_refrele(src_ipif);
- src_ipif = ipif_select_source(dst_ill,
- gw, zoneid);
- if (src_ipif == NULL)
- goto icmp_err_ret;
- }
- dst = gw;
- gw = INADDR_ANY;
- }
-
- if (ire->ire_type == IRE_IF_NORESOLVER)
- dst = ire->ire_addr; /* ire_cache for tunnel endpoint */
-
- save_ire = ire;
- /*
- * create an incomplete IRE_CACHE.
- * An areq_mp will be generated in ire_arpresolve() for
- * RESOLVER interfaces.
- */
- ire = ire_create(
- (uchar_t *)&dst, /* dest address */
- (uchar_t *)&ip_g_all_ones, /* mask */
- (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */
- (uchar_t *)&gw, /* gateway address */
- (save_ire->ire_type == IRE_IF_RESOLVER ? NULL:
- &save_ire->ire_max_frag),
- NULL,
- dst_ill->ill_rq, /* recv-from queue */
- dst_ill->ill_wq, /* send-to queue */
- IRE_CACHE, /* IRE type */
- src_ipif,
- ire->ire_mask, /* Parent mask */
- 0,
- ire->ire_ihandle, /* Interface handle */
- 0,
- &(ire->ire_uinfo),
- NULL,
- NULL,
- ipst);
- ip1dbg(("incomplete ire_cache 0x%p\n", (void *)ire));
- if (ire != NULL) {
- ire->ire_marks |= ire_marks;
- /* add the incomplete ire: */
- error = ire_add(&ire, NULL, NULL, NULL, B_TRUE);
- if (error == 0 && ire != NULL) {
- ire->ire_max_frag = save_ire->ire_max_frag;
- ip1dbg(("setting max_frag to %d in ire 0x%p\n",
- ire->ire_max_frag, (void *)ire));
- } else {
- ire_refrele(save_ire);
- goto icmp_err_ret;
- }
- }
-
- ire_refrele(save_ire);
- break;
- default:
- break;
- }
-
-out:
- *ret_action = Forward_ok;
- if (sire != NULL)
- ire_refrele(sire);
- if (dst_ill != NULL)
- ill_refrele(dst_ill);
- if (src_ipif != NULL)
- ipif_refrele(src_ipif);
- return (ire);
-icmp_err_ret:
- *ret_action = Forward_ret_icmp_err;
- if (src_ipif != NULL)
- ipif_refrele(src_ipif);
- if (dst_ill != NULL)
- ill_refrele(dst_ill);
- if (sire != NULL)
- ire_refrele(sire);
- if (ire != NULL) {
- if (ire->ire_flags & RTF_BLACKHOLE)
- *ret_action = Forward_blackhole;
- ire_refrele(ire);
- }
- /* caller needs to send icmp error message */
- return (NULL);
-
-}
-
/*
* Obtain the rt_entry and rt_irb for the route to be added to
* the ips_ip_ftable.
@@ -1489,7 +431,7 @@ ire_get_bucket(ire_t *ire)
rt->rt_nodes->rn_key = (char *)&rt->rt_dst;
rt->rt_dst = rdst;
irb = &rt->rt_irb;
- irb->irb_marks |= IRB_MARK_FTABLE; /* dynamically allocated/freed */
+ irb->irb_marks |= IRB_MARK_DYNAMIC; /* dynamically allocated/freed */
irb->irb_ipst = ipst;
rw_init(&irb->irb_lock, NULL, RW_DEFAULT, NULL);
RADIX_NODE_HEAD_WLOCK(ipst->ips_ip_ftable);
@@ -1510,7 +452,7 @@ ire_get_bucket(ire_t *ire)
}
if (rt != NULL) {
irb = &rt->rt_irb;
- IRB_REFHOLD(irb);
+ irb_refhold(irb);
}
RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
return (irb);
@@ -1551,10 +493,12 @@ ifindex_lookup(const struct sockaddr *ipaddr, zoneid_t zoneid)
ASSERT(ipaddr->sa_family == AF_INET || ipaddr->sa_family == AF_INET6);
- if ((ire = route_to_dst(ipaddr, zoneid, ipst)) != NULL) {
- ill = ire_to_ill(ire);
- if (ill != NULL)
+ if ((ire = route_to_dst(ipaddr, zoneid, ipst)) != NULL) {
+ ill = ire_nexthop_ill(ire);
+ if (ill != NULL) {
ifindex = ill->ill_phyint->phyint_ifindex;
+ ill_refrele(ill);
+ }
ire_refrele(ire);
}
netstack_rele(ns);
@@ -1563,7 +507,7 @@ ifindex_lookup(const struct sockaddr *ipaddr, zoneid_t zoneid)
/*
* Routine to find the route to a destination. If a ifindex is supplied
- * it tries to match the the route to the corresponding ipif for the ifindex
+ * it tries to match the route to the corresponding ipif for the ifindex
*/
static ire_t *
route_to_dst(const struct sockaddr *dst_addr, zoneid_t zoneid, ip_stack_t *ipst)
@@ -1571,27 +515,33 @@ route_to_dst(const struct sockaddr *dst_addr, zoneid_t zoneid, ip_stack_t *ipst)
ire_t *ire = NULL;
int match_flags;
- match_flags = (MATCH_IRE_DSTONLY | MATCH_IRE_DEFAULT |
- MATCH_IRE_RECURSIVE | MATCH_IRE_RJ_BHOLE);
+ match_flags = MATCH_IRE_DSTONLY;
/* XXX pass NULL tsl for now */
if (dst_addr->sa_family == AF_INET) {
- ire = ire_route_lookup(
- ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr,
- 0, 0, 0, NULL, NULL, zoneid, NULL, match_flags, ipst);
+ ire = ire_route_recursive_v4(
+ ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr, 0, NULL,
+ zoneid, NULL, match_flags, B_TRUE, 0, ipst, NULL, NULL,
+ NULL);
} else {
- ire = ire_route_lookup_v6(
- &((struct sockaddr_in6 *)dst_addr)->sin6_addr,
- 0, 0, 0, NULL, NULL, zoneid, NULL, match_flags, ipst);
+ ire = ire_route_recursive_v6(
+ &((struct sockaddr_in6 *)dst_addr)->sin6_addr, 0, NULL,
+ zoneid, NULL, match_flags, B_TRUE, 0, ipst, NULL, NULL,
+ NULL);
+ }
+ ASSERT(ire != NULL);
+ if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
+ ire_refrele(ire);
+ return (NULL);
}
return (ire);
}
/*
* This routine is called by IP Filter to send a packet out on the wire
- * to a specified V4 dst (which may be onlink or offlink). The ifindex may or
- * may not be 0. A non-null ifindex indicates IP Filter has stipulated
+ * to a specified dstination (which may be onlink or offlink). The ifindex may
+ * or may not be 0. A non-null ifindex indicates IP Filter has stipulated
* an outgoing interface and requires the nexthop to be on that interface.
* IP WILL NOT DO the following to the data packet before sending it out:
* a. manipulate ttl
@@ -1611,21 +561,18 @@ route_to_dst(const struct sockaddr *dst_addr, zoneid_t zoneid, ip_stack_t *ipst)
* of the offlink dst's nexthop needs to get
* resolved before packet can be sent to dst.
* Thus transmission is not guaranteed.
- *
+ * Note: No longer have visibility to the ARP queue
+ * hence no EINPROGRESS.
*/
-
int
ipfil_sendpkt(const struct sockaddr *dst_addr, mblk_t *mp, uint_t ifindex,
zoneid_t zoneid)
{
- ire_t *ire = NULL, *sire = NULL;
- ire_t *ire_cache = NULL;
- int value;
- int match_flags;
- ipaddr_t dst;
+ ipaddr_t nexthop;
netstack_t *ns;
ip_stack_t *ipst;
- enum ire_forward_action ret_action;
+ ip_xmit_attr_t ixas;
+ int error;
ASSERT(mp != NULL);
@@ -1646,429 +593,57 @@ ipfil_sendpkt(const struct sockaddr *dst_addr, mblk_t *mp, uint_t ifindex,
ASSERT(dst_addr->sa_family == AF_INET ||
dst_addr->sa_family == AF_INET6);
- if (dst_addr->sa_family == AF_INET) {
- dst = ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr;
- } else {
- /*
- * We dont have support for V6 yet. It will be provided
- * once RFE 6399103 has been delivered.
- * Until then, for V6 dsts, IP Filter will not call
- * this function. Instead the netinfo framework provides
- * its own code path, in ip_inject_impl(), to achieve
- * what it needs to do, for the time being.
- */
- ip1dbg(("ipfil_sendpkt: no V6 support \n"));
- value = ECOMM;
- freemsg(mp);
- goto discard;
- }
-
- /*
- * Lets get the ire. We might get the ire cache entry,
- * or the ire,sire pair needed to create the cache entry.
- * XXX pass NULL tsl for now.
- */
-
- if (ifindex == 0) {
- /* There is no supplied index. So use the FIB info */
-
- match_flags = (MATCH_IRE_DSTONLY | MATCH_IRE_DEFAULT |
- MATCH_IRE_RECURSIVE | MATCH_IRE_RJ_BHOLE);
- ire = ire_route_lookup(dst,
- 0, 0, 0, NULL, &sire, zoneid, msg_getlabel(mp),
- match_flags, ipst);
- } else {
- ipif_t *supplied_ipif;
- ill_t *ill;
-
- match_flags = (MATCH_IRE_DSTONLY | MATCH_IRE_DEFAULT |
- MATCH_IRE_RECURSIVE| MATCH_IRE_RJ_BHOLE|
- MATCH_IRE_SECATTR | MATCH_IRE_ILL);
-
- /*
- * If supplied ifindex is non-null, the only valid
- * nexthop is one off of the interface corresponding
- * to the specified ifindex.
- */
- ill = ill_lookup_on_ifindex(ifindex, B_FALSE,
- NULL, NULL, NULL, NULL, ipst);
- if (ill == NULL) {
- ip1dbg(("ipfil_sendpkt: Could not find"
- " route to dst\n"));
- value = ECOMM;
- freemsg(mp);
- goto discard;
- }
-
- supplied_ipif = ipif_get_next_ipif(NULL, ill);
- ire = ire_route_lookup(dst, 0, 0, 0, supplied_ipif,
- &sire, zoneid, msg_getlabel(mp), match_flags, ipst);
- if (supplied_ipif != NULL)
- ipif_refrele(supplied_ipif);
- ill_refrele(ill);
- }
-
+ bzero(&ixas, sizeof (ixas));
/*
- * Verify that the returned IRE is non-null and does
- * not have either the RTF_REJECT or RTF_BLACKHOLE
- * flags set and that the IRE is either an IRE_CACHE,
- * IRE_IF_NORESOLVER or IRE_IF_RESOLVER.
+ * No IPsec, no fragmentation, and don't let any hooks see
+ * the packet.
*/
- if (ire == NULL ||
- ((ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) ||
- (ire->ire_type & (IRE_CACHE | IRE_INTERFACE)) == 0)) {
- /*
- * Either ire could not be found or we got
- * an invalid one
- */
- ip1dbg(("ipfil_sendpkt: Could not find route to dst\n"));
- value = ENONET;
- freemsg(mp);
- goto discard;
- }
-
- /* IP Filter and CGTP dont mix. So bail out if CGTP is on */
- if (ipst->ips_ip_cgtp_filter &&
- ((ire->ire_flags & RTF_MULTIRT) ||
- ((sire != NULL) && (sire->ire_flags & RTF_MULTIRT)))) {
- ip1dbg(("ipfil_sendpkt: IPFilter does not work with CGTP\n"));
- value = ECOMM;
- freemsg(mp);
- goto discard;
- }
+ ixas.ixa_flags = IXAF_NO_IPSEC | IXAF_DONTFRAG | IXAF_NO_PFHOOK;
+ ixas.ixa_cred = kcred;
+ ixas.ixa_cpid = NOPID;
+ ixas.ixa_tsl = NULL;
+ ixas.ixa_ipst = ipst;
+ ixas.ixa_ifindex = ifindex;
- ASSERT(ire->ire_type != IRE_CACHE || ire->ire_nce != NULL);
-
- /*
- * If needed, we will create the ire cache entry for the
- * nexthop, resolve its link-layer address and then send
- * the packet out without ttl or IPSec processing.
- */
- switch (ire->ire_type) {
- case IRE_CACHE:
- if (sire != NULL) {
- UPDATE_OB_PKT_COUNT(sire);
- sire->ire_last_used_time = lbolt;
- ire_refrele(sire);
- }
- ire_cache = ire;
- break;
- case IRE_IF_NORESOLVER:
- case IRE_IF_RESOLVER:
- /*
- * Call ire_forward(). This function
- * will, create the ire cache entry of the
- * the nexthop and adds this incomplete ire
- * to the ire cache table
- */
- ire_cache = ire_forward(dst, &ret_action, ire, sire,
- msg_getlabel(mp), ipst);
- if (ire_cache == NULL) {
- ip1dbg(("ipfil_sendpkt: failed to create the"
- " ire cache entry \n"));
- value = ENONET;
- freemsg(mp);
- sire = NULL;
- ire = NULL;
- goto discard;
- }
- break;
- }
-
- if (DB_CKSUMFLAGS(mp)) {
- if (ip_send_align_hcksum_flags(mp, ire_to_ill(ire_cache)))
- goto cleanup;
- }
-
- /*
- * Now that we have the ire cache entry of the nexthop, call
- * ip_xmit_v4() to trigger mac addr resolution
- * if necessary and send it once ready.
- */
-
- value = ip_xmit_v4(mp, ire_cache, NULL, B_FALSE, NULL);
-cleanup:
- ire_refrele(ire_cache);
- /*
- * At this point, the reference for these have already been
- * released within ire_forward() and/or ip_xmit_v4(). So we set
- * them to NULL to make sure we dont drop the references
- * again in case ip_xmit_v4() returns with either SEND_FAILED
- * or LLHDR_RESLV_FAILED
- */
- sire = NULL;
- ire = NULL;
-
- switch (value) {
- case SEND_FAILED:
- ip1dbg(("ipfil_sendpkt: Send failed\n"));
- value = ECOMM;
- break;
- case LLHDR_RESLV_FAILED:
- ip1dbg(("ipfil_sendpkt: Link-layer resolution"
- " failed\n"));
- value = ECOMM;
- break;
- case LOOKUP_IN_PROGRESS:
- netstack_rele(ns);
- return (EINPROGRESS);
- case SEND_PASSED:
- netstack_rele(ns);
- return (0);
- }
-discard:
if (dst_addr->sa_family == AF_INET) {
- BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
- } else {
- BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
- }
- if (ire != NULL)
- ire_refrele(ire);
- if (sire != NULL)
- ire_refrele(sire);
- netstack_rele(ns);
- return (value);
-}
-
-
-/*
- * We don't check for dohwcksum in here because it should be being used
- * elsewhere to control what flags are being set on the mblk. That is,
- * if DB_CKSUMFLAGS() is non-zero then we assume dohwcksum to be true
- * for this packet.
- *
- * This function assumes that it is *only* being called for TCP or UDP
- * packets and nothing else.
- */
-static int
-ip_send_align_hcksum_flags(mblk_t *mp, ill_t *ill)
-{
- int illhckflags;
- int mbhckflags;
- uint16_t *up;
- uint32_t cksum;
- ipha_t *ipha;
- ip6_t *ip6;
- int proto;
- int ipversion;
- int length;
- int start;
- ip6_pkt_t ipp;
-
- mbhckflags = DB_CKSUMFLAGS(mp);
- ASSERT(mbhckflags != 0);
- ASSERT(mp->b_datap->db_type == M_DATA);
- /*
- * Since this function only knows how to manage the hardware checksum
- * issue, reject and packets that have flags set on the aside from
- * checksum related attributes as we cannot necessarily safely map
- * that packet onto the new NIC. Packets that can be potentially
- * dropped here include those marked for LSO.
- */
- if ((mbhckflags &
- ~(HCK_FULLCKSUM|HCK_PARTIALCKSUM|HCK_IPV4_HDRCKSUM)) != 0) {
- DTRACE_PROBE2(pbr__incapable, (mblk_t *), mp, (ill_t *), ill);
- freemsg(mp);
- return (-1);
- }
-
- ipha = (ipha_t *)mp->b_rptr;
-
- /*
- * Find out what the new NIC is capable of, if anything, and
- * only allow it to be used with M_DATA mblks being sent out.
- */
- if (ILL_HCKSUM_CAPABLE(ill)) {
- illhckflags = ill->ill_hcksum_capab->ill_hcksum_txflags;
- } else {
- /*
- * No capabilities, so turn off everything.
- */
- illhckflags = 0;
- (void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0, 0, 0);
- mp->b_datap->db_struioflag &= ~STRUIO_IP;
- }
-
- DTRACE_PROBE4(pbr__info__a, (mblk_t *), mp, (ill_t *), ill,
- uint32_t, illhckflags, uint32_t, mbhckflags);
- /*
- * This block of code that looks for the position of the TCP/UDP
- * checksum is early in this function because we need to know
- * what needs to be blanked out for the hardware checksum case.
- *
- * That we're in this function implies that the packet is either
- * TCP or UDP on Solaris, so checks are made for one protocol and
- * if that fails, the other is therefore implied.
- */
- ipversion = IPH_HDR_VERSION(ipha);
+ ipha_t *ipha = (ipha_t *)mp->b_rptr;
- if (ipversion == IPV4_VERSION) {
- proto = ipha->ipha_protocol;
- if (proto == IPPROTO_TCP) {
- up = IPH_TCPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH);
- } else {
- up = IPH_UDPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH);
+ ixas.ixa_flags |= IXAF_IS_IPV4;
+ nexthop = ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr;
+ if (nexthop != ipha->ipha_dst) {
+ ixas.ixa_flags |= IXAF_NEXTHOP_SET;
+ ixas.ixa_nexthop_v4 = nexthop;
}
+ ixas.ixa_multicast_ttl = ipha->ipha_ttl;
} else {
- uint8_t lasthdr;
-
- /*
- * Nothing I've seen indicates that IPv6 checksum'ing
- * precludes the presence of extension headers, so we
- * can't just look at the next header value in the IPv6
- * packet header to see if it is TCP/UDP.
- */
- ip6 = (ip6_t *)ipha;
- (void) memset(&ipp, 0, sizeof (ipp));
- start = ip_find_hdr_v6(mp, ip6, &ipp, &lasthdr);
- proto = lasthdr;
-
- if (proto == IPPROTO_TCP) {
- up = IPH_TCPH_CHECKSUMP(ipha, start);
- } else {
- up = IPH_UDPH_CHECKSUMP(ipha, start);
- }
- }
+ ip6_t *ip6h = (ip6_t *)mp->b_rptr;
+ in6_addr_t *nexthop6;
- /*
- * The first case here is easiest:
- * mblk hasn't asked for full checksum, but the card supports it.
- *
- * In addition, check for IPv4 header capability. Note that only
- * the mblk flag is checked and not ipversion.
- */
- if ((((illhckflags & HCKSUM_INET_FULL_V4) && (ipversion == 4)) ||
- (((illhckflags & HCKSUM_INET_FULL_V6) && (ipversion == 6)))) &&
- ((mbhckflags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) != 0)) {
- int newflags = HCK_FULLCKSUM;
-
- if ((mbhckflags & HCK_IPV4_HDRCKSUM) != 0) {
- if ((illhckflags & HCKSUM_IPHDRCKSUM) != 0) {
- newflags |= HCK_IPV4_HDRCKSUM;
- } else {
- /*
- * Rather than call a function, just inline
- * the computation of the basic IPv4 header.
- */
- cksum = (ipha->ipha_dst >> 16) +
- (ipha->ipha_dst & 0xFFFF) +
- (ipha->ipha_src >> 16) +
- (ipha->ipha_src & 0xFFFF);
- IP_HDR_CKSUM(ipha, cksum,
- ((uint32_t *)ipha)[0],
- ((uint16_t *)ipha)[4]);
- }
+ nexthop6 = &((struct sockaddr_in6 *)dst_addr)->sin6_addr;
+ if (!IN6_ARE_ADDR_EQUAL(nexthop6, &ip6h->ip6_dst)) {
+ ixas.ixa_flags |= IXAF_NEXTHOP_SET;
+ ixas.ixa_nexthop_v6 = *nexthop6;
}
-
- *up = 0;
- (void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0,
- newflags, 0);
- return (0);
- }
-
- DTRACE_PROBE2(pbr__info__b, int, ipversion, int, proto);
-
- /*
- * Start calculating the pseudo checksum over the IP packet header.
- * Although the final pseudo checksum used by TCP/UDP consists of
- * more than just the address fields, we can use the result of
- * adding those together a little bit further down for IPv4.
- */
- if (ipversion == IPV4_VERSION) {
- cksum = (ipha->ipha_dst >> 16) + (ipha->ipha_dst & 0xFFFF) +
- (ipha->ipha_src >> 16) + (ipha->ipha_src & 0xFFFF);
- start = IP_SIMPLE_HDR_LENGTH;
- length = ntohs(ipha->ipha_length);
- DTRACE_PROBE3(pbr__info__e, uint32_t, ipha->ipha_src,
- uint32_t, ipha->ipha_dst, int, cksum);
- } else {
- uint16_t *pseudo;
-
- pseudo = (uint16_t *)&ip6->ip6_src;
-
- /* calculate pseudo-header checksum */
- cksum = pseudo[0] + pseudo[1] + pseudo[2] + pseudo[3] +
- pseudo[4] + pseudo[5] + pseudo[6] + pseudo[7] +
- pseudo[8] + pseudo[9] + pseudo[10] + pseudo[11] +
- pseudo[12] + pseudo[13] + pseudo[14] + pseudo[15];
-
- length = ntohs(ip6->ip6_plen) + sizeof (ip6_t);
- }
-
- /* Fold the initial sum */
- cksum = (cksum & 0xffff) + (cksum >> 16);
-
- /*
- * If the packet was asking for an IPv4 header checksum to be
- * calculated but the interface doesn't support that, fill it in
- * using our pseudo checksum as a starting point.
- */
- if (((mbhckflags & HCK_IPV4_HDRCKSUM) != 0) &&
- ((illhckflags & HCKSUM_IPHDRCKSUM) == 0)) {
- /*
- * IP_HDR_CKSUM uses the 2rd arg to the macro in a destructive
- * way so pass in a copy of the checksum calculated thus far.
- */
- uint32_t ipsum = cksum;
-
- DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM;
-
- IP_HDR_CKSUM(ipha, ipsum, ((uint32_t *)ipha)[0],
- ((uint16_t *)ipha)[4]);
- }
-
- DTRACE_PROBE3(pbr__info__c, int, start, int, length, int, cksum);
-
- if (proto == IPPROTO_TCP) {
- cksum += IP_TCP_CSUM_COMP;
- } else {
- cksum += IP_UDP_CSUM_COMP;
+ ixas.ixa_multicast_ttl = ip6h->ip6_hops;
}
- cksum += htons(length - start);
- cksum = (cksum & 0xffff) + (cksum >> 16);
-
- /*
- * For TCP/UDP, we either want to setup the packet for partial
- * checksum or we want to do it all ourselves because the NIC
- * offers no support for either partial or full checksum.
- */
- if ((illhckflags & HCKSUM_INET_PARTIAL) != 0) {
- /*
- * The only case we care about here is if the mblk was
- * previously set for full checksum offload. If it was
- * marked for partial (and the NIC does partial), then
- * we have nothing to do. Similarly if the packet was
- * not set for partial or full, we do nothing as this
- * is cheaper than more work to set something up.
- */
- if ((mbhckflags & HCK_FULLCKSUM) != 0) {
- uint32_t offset;
-
- if (proto == IPPROTO_TCP) {
- offset = TCP_CHECKSUM_OFFSET;
- } else {
- offset = UDP_CHECKSUM_OFFSET;
- }
- *up = cksum;
-
- DTRACE_PROBE3(pbr__info__f, int, length - start, int,
- cksum, int, offset);
+ error = ip_output_simple(mp, &ixas);
+ ixa_cleanup(&ixas);
- (void) hcksum_assoc(mp, NULL, NULL, start,
- start + offset, length, 0,
- DB_CKSUMFLAGS(mp) | HCK_PARTIALCKSUM, 0);
- }
+ netstack_rele(ns);
+ switch (error) {
+ case 0:
+ break;
- } else if (mbhckflags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) {
- DB_CKSUMFLAGS(mp) &= ~(HCK_PARTIALCKSUM|HCK_FULLCKSUM);
+ case EHOSTUNREACH:
+ case ENETUNREACH:
+ error = ENONET;
+ break;
- *up = 0;
- *up = IP_CSUM(mp, start, cksum);
+ default:
+ error = ECOMM;
+ break;
}
-
- DTRACE_PROBE4(pbr__info__d, (mblk_t *), mp, (ipha_t *), ipha,
- (uint16_t *), up, int, cksum);
- return (0);
+ return (error);
}
/*
@@ -2094,18 +669,18 @@ ire_find_best_route(struct radix_node *rn, void *arg)
rw_enter(&irb_ptr->irb_lock, RW_READER);
for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) {
- if (ire->ire_marks & IRE_MARK_CONDEMNED)
+ if (IRE_IS_CONDEMNED(ire))
continue;
- if (margs->ift_flags & MATCH_IRE_MASK)
+ if (margs->ift_flags & (MATCH_IRE_MASK|MATCH_IRE_SHORTERMASK))
match_mask = margs->ift_mask;
else
match_mask = ire->ire_mask;
if (ire_match_args(ire, margs->ift_addr, match_mask,
- margs->ift_gateway, margs->ift_type, margs->ift_ipif,
- margs->ift_zoneid, margs->ift_ihandle, margs->ift_tsl,
- margs->ift_flags, NULL)) {
- IRE_REFHOLD(ire);
+ margs->ift_gateway, margs->ift_type, margs->ift_ill,
+ margs->ift_zoneid, margs->ift_tsl,
+ margs->ift_flags)) {
+ ire_refhold(ire);
rw_exit(&irb_ptr->irb_lock);
margs->ift_best_ire = ire;
return (B_TRUE);
@@ -2198,107 +773,182 @@ irb_refrele_ftable(irb_t *irb)
}
/*
- * IRE iterator used by ire_ftable_lookup() to process multiple default
- * routes. Given a starting point in the hash list (ire_origin), walk the IREs
- * in the bucket skipping default interface routes and deleted entries.
- * Returns the next IRE (unheld), or NULL when we're back to the starting point.
- * Assumes that the caller holds a reference on the IRE bucket.
+ * IRE iterator used by ire_ftable_lookup to process multiple equal
+ * routes. Given a starting point in the hash list (hash), walk the IREs
+ * in the bucket skipping deleted entries. We treat the bucket as a circular
+ * list for the purposes of walking it.
+ * Returns the IRE (held) that corresponds to the hash value. If that IRE is
+ * not applicable (ire_match_args failed) then it returns a subsequent one.
+ * If we fail to find an IRE we return NULL.
*
- * In the absence of good IRE_DEFAULT routes, this function will return
- * the first IRE_INTERFACE route found (if any).
+ * Assumes that the caller holds a reference on the IRE bucket and a read lock
+ * on the radix_node_head (for IPv4) or the ip6_ire_head (for IPv6).
+ *
+ * Applies to IPv4 and IPv6.
+ *
+ * For CGTP, where an IRE_BROADCAST and IRE_HOST can exist for the same
+ * address and bucket, we compare against ire_type for the orig_ire. We also
+ * have IRE_BROADCASTs with and without RTF_MULTIRT, with the former being
+ * first in the bucket. Thus we compare that ire_flags match the orig_ire.
+ *
+ * Due to shared-IP zones we check that an IRE_OFFLINK has a gateway that is
+ * reachable from the zone i.e., that the ire_gateway_addr is in a subnet
+ * in which the zone has an IP address. We check this for the global zone
+ * even if no shared-IP zones are configured.
*/
ire_t *
-ire_round_robin(irb_t *irb_ptr, zoneid_t zoneid, ire_ftable_args_t *margs,
- ip_stack_t *ipst)
+ire_round_robin(irb_t *irb_ptr, ire_ftable_args_t *margs, uint_t hash,
+ ire_t *orig_ire, ip_stack_t *ipst)
{
- ire_t *ire_origin;
- ire_t *ire, *maybe_ire = NULL;
+ ire_t *ire, *maybe_ire = NULL;
+ uint_t maybe_badcnt;
+ uint_t maxwalk;
- rw_enter(&irb_ptr->irb_lock, RW_WRITER);
- ire_origin = irb_ptr->irb_rr_origin;
- if (ire_origin != NULL) {
- ire_origin = ire_origin->ire_next;
- IRE_FIND_NEXT_ORIGIN(ire_origin);
- }
+ /* Fold in more bits from the hint/hash */
+ hash = hash ^ (hash >> 8) ^ (hash >> 16);
- if (ire_origin == NULL) {
- /*
- * first time through routine, or we dropped off the end
- * of list.
- */
- ire_origin = irb_ptr->irb_ire;
- IRE_FIND_NEXT_ORIGIN(ire_origin);
- }
- irb_ptr->irb_rr_origin = ire_origin;
- IRB_REFHOLD_LOCKED(irb_ptr);
+ rw_enter(&irb_ptr->irb_lock, RW_WRITER);
+ maxwalk = irb_ptr->irb_ire_cnt; /* Excludes condemned */
+ hash %= maxwalk;
+ irb_refhold_locked(irb_ptr);
rw_exit(&irb_ptr->irb_lock);
- DTRACE_PROBE2(ire__rr__origin, (irb_t *), irb_ptr,
- (ire_t *), ire_origin);
-
/*
* Round-robin the routers list looking for a route that
* matches the passed in parameters.
- * We start with the ire we found above and we walk the hash
- * list until we're back where we started. It doesn't matter if
- * routes are added or deleted by other threads - we know this
- * ire will stay in the list because we hold a reference on the
- * ire bucket.
+ * First we skip "hash" number of non-condemned IREs.
+ * Then we match the IRE.
+ * If we find an ire which has a non-zero ire_badcnt then we remember
+ * it and keep on looking for a lower ire_badcnt.
+ * If we come to the end of the list we continue (treat the
+ * bucket list as a circular list) but we match less than "max"
+ * entries.
*/
- ire = ire_origin;
- while (ire != NULL) {
- int match_flags = MATCH_IRE_TYPE | MATCH_IRE_SECATTR;
- ire_t *rire;
+ ire = irb_ptr->irb_ire;
+ while (maxwalk > 0) {
+ if (IRE_IS_CONDEMNED(ire))
+ goto next_ire_skip;
+
+ /* Skip the first "hash" entries to do ECMP */
+ if (hash != 0) {
+ hash--;
+ goto next_ire_skip;
+ }
- if (ire->ire_marks & IRE_MARK_CONDEMNED)
+ /* See CGTP comment above */
+ if (ire->ire_type != orig_ire->ire_type ||
+ ire->ire_flags != orig_ire->ire_flags)
goto next_ire;
- if (!ire_match_args(ire, margs->ift_addr, (ipaddr_t)0,
- margs->ift_gateway, margs->ift_type, margs->ift_ipif,
- margs->ift_zoneid, margs->ift_ihandle, margs->ift_tsl,
- margs->ift_flags, NULL))
+ /*
+ * Note: Since IPv6 has hash buckets instead of radix
+ * buckers we need to explicitly compare the addresses.
+ * That makes this less efficient since we will be called
+ * even if there is no alternatives just because the
+ * bucket has multiple IREs for different addresses.
+ */
+ if (ire->ire_ipversion == IPV6_VERSION) {
+ if (!IN6_ARE_ADDR_EQUAL(&orig_ire->ire_addr_v6,
+ &ire->ire_addr_v6))
+ goto next_ire;
+ }
+
+ /*
+ * For some reason find_best_route uses ire_mask. We do
+ * the same.
+ */
+ if (ire->ire_ipversion == IPV4_VERSION ?
+ !ire_match_args(ire, margs->ift_addr,
+ ire->ire_mask, margs->ift_gateway,
+ margs->ift_type, margs->ift_ill, margs->ift_zoneid,
+ margs->ift_tsl, margs->ift_flags) :
+ !ire_match_args_v6(ire, &margs->ift_addr_v6,
+ &ire->ire_mask_v6, &margs->ift_gateway_v6,
+ margs->ift_type, margs->ift_ill, margs->ift_zoneid,
+ margs->ift_tsl, margs->ift_flags))
goto next_ire;
- if (ire->ire_type & IRE_INTERFACE) {
+ if (margs->ift_zoneid != ALL_ZONES &&
+ (ire->ire_type & IRE_OFFLINK)) {
/*
- * keep looking to see if there is a non-interface
- * default ire, but save this one as a last resort.
+ * When we're in a zone, we're only
+ * interested in routers that are
+ * reachable through ipifs within our zone.
*/
- if (maybe_ire == NULL)
- maybe_ire = ire;
- goto next_ire;
+ if (ire->ire_ipversion == IPV4_VERSION) {
+ if (!ire_gateway_ok_zone_v4(
+ ire->ire_gateway_addr, margs->ift_zoneid,
+ ire->ire_ill, margs->ift_tsl, ipst,
+ B_TRUE))
+ goto next_ire;
+ } else {
+ if (!ire_gateway_ok_zone_v6(
+ &ire->ire_gateway_addr_v6,
+ margs->ift_zoneid, ire->ire_ill,
+ margs->ift_tsl, ipst, B_TRUE))
+ goto next_ire;
+ }
}
-
- if (zoneid == ALL_ZONES) {
- IRE_REFHOLD(ire);
- IRB_REFRELE(irb_ptr);
+ mutex_enter(&ire->ire_lock);
+ /* Look for stale ire_badcnt and clear */
+ if (ire->ire_badcnt != 0 &&
+ (TICK_TO_SEC(lbolt64) - ire->ire_last_badcnt >
+ ipst->ips_ip_ire_badcnt_lifetime))
+ ire->ire_badcnt = 0;
+ mutex_exit(&ire->ire_lock);
+
+ if (ire->ire_badcnt == 0) {
+ /* We found one with a zero badcnt; done */
+ ire_refhold(ire);
+ /*
+ * Care needed since irb_refrele grabs WLOCK to free
+ * the irb_t.
+ */
+ if (ire->ire_ipversion == IPV4_VERSION) {
+ RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
+ irb_refrele(irb_ptr);
+ RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
+ } else {
+ rw_exit(&ipst->ips_ip6_ire_head_lock);
+ irb_refrele(irb_ptr);
+ rw_enter(&ipst->ips_ip6_ire_head_lock,
+ RW_READER);
+ }
return (ire);
}
/*
- * When we're in a non-global zone, we're only
- * interested in routers that are
- * reachable through ipifs within our zone.
+ * keep looking to see if there is a better (lower
+ * badcnt) matching IRE, but save this one as a last resort.
+ * If we find a lower badcnt pick that one as the last* resort.
*/
- if (ire->ire_ipif != NULL)
- match_flags |= MATCH_IRE_ILL;
-
- rire = ire_route_lookup(ire->ire_gateway_addr, 0, 0,
- IRE_INTERFACE, ire->ire_ipif, NULL, zoneid, margs->ift_tsl,
- match_flags, ipst);
- if (rire != NULL) {
- ire_refrele(rire);
- IRE_REFHOLD(ire);
- IRB_REFRELE(irb_ptr);
- return (ire);
+ if (maybe_ire == NULL) {
+ maybe_ire = ire;
+ maybe_badcnt = ire->ire_badcnt;
+ } else if (ire->ire_badcnt < maybe_badcnt) {
+ maybe_ire = ire;
+ maybe_badcnt = ire->ire_badcnt;
}
+
next_ire:
- ire = (ire->ire_next ? ire->ire_next : irb_ptr->irb_ire);
- if (ire == ire_origin)
- break;
+ maxwalk--;
+next_ire_skip:
+ ire = ire->ire_next;
+ if (ire == NULL)
+ ire = irb_ptr->irb_ire;
}
if (maybe_ire != NULL)
- IRE_REFHOLD(maybe_ire);
- IRB_REFRELE(irb_ptr);
+ ire_refhold(maybe_ire);
+
+ /* Care needed since irb_refrele grabs WLOCK to free the irb_t. */
+ if (ire->ire_ipversion == IPV4_VERSION) {
+ RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
+ irb_refrele(irb_ptr);
+ RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
+ } else {
+ rw_exit(&ipst->ips_ip6_ire_head_lock);
+ irb_refrele(irb_ptr);
+ rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER);
+ }
return (maybe_ire);
}
@@ -2306,7 +956,7 @@ void
irb_refhold_rn(struct radix_node *rn)
{
if ((rn->rn_flags & RNF_ROOT) == 0)
- IRB_REFHOLD(&((rt_t *)(rn))->rt_irb);
+ irb_refhold(&((rt_t *)(rn))->rt_irb);
}
void
@@ -2315,3 +965,587 @@ irb_refrele_rn(struct radix_node *rn)
if ((rn->rn_flags & RNF_ROOT) == 0)
irb_refrele_ftable(&((rt_t *)(rn))->rt_irb);
}
+
+/*
+ * Select a route for IPv4 and IPv6. Except for multicast, loopback and reject
+ * routes this routine sets up a ire_nce_cache as well. The caller needs to
+ * lookup an nce for the multicast case.
+ */
+ire_t *
+ip_select_route(const in6_addr_t *v6dst, ip_xmit_attr_t *ixa,
+ uint_t *generationp, in6_addr_t *setsrcp, int *errorp, boolean_t *multirtp)
+{
+ uint_t match_args;
+ uint_t ire_type;
+ ill_t *ill;
+ ire_t *ire;
+ ip_stack_t *ipst = ixa->ixa_ipst;
+ ipaddr_t v4dst;
+ in6_addr_t v6nexthop;
+ iaflags_t ixaflags = ixa->ixa_flags;
+ nce_t *nce;
+
+ match_args = MATCH_IRE_SECATTR;
+ IN6_V4MAPPED_TO_IPADDR(v6dst, v4dst);
+ if (setsrcp != NULL)
+ ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp));
+ if (errorp != NULL)
+ ASSERT(*errorp == 0);
+
+ /*
+ * The content of the ixa will be different if IP_NEXTHOP,
+ * SO_DONTROUTE, IP_BOUND_IF, IP_PKTINFO etc are set
+ */
+
+ if ((ixaflags & IXAF_IS_IPV4) ? CLASSD(v4dst) :
+ IN6_IS_ADDR_MULTICAST(v6dst)) {
+ /* Pick up the IRE_MULTICAST for the ill */
+ if (ixa->ixa_multicast_ifindex != 0) {
+ ill = ill_lookup_on_ifindex(ixa->ixa_multicast_ifindex,
+ !(ixaflags & IXAF_IS_IPV4), ipst);
+ } else if (ixaflags & IXAF_SCOPEID_SET) {
+ /* sin6_scope_id takes precedence over ixa_ifindex */
+ ASSERT(ixa->ixa_scopeid != 0);
+ ill = ill_lookup_on_ifindex(ixa->ixa_scopeid,
+ !(ixaflags & IXAF_IS_IPV4), ipst);
+ } else if (ixa->ixa_ifindex != 0) {
+ /*
+ * In the ipmp case, the ixa_ifindex is set to
+ * point at an under_ill and we would return the
+ * ire_multicast() corresponding to that under_ill.
+ */
+ ill = ill_lookup_on_ifindex(ixa->ixa_ifindex,
+ !(ixaflags & IXAF_IS_IPV4), ipst);
+ } else if (ixaflags & IXAF_IS_IPV4) {
+ ipaddr_t v4setsrc = INADDR_ANY;
+
+ ill = ill_lookup_group_v4(v4dst, ixa->ixa_zoneid, ipst,
+ multirtp, &v4setsrc);
+ if (setsrcp != NULL)
+ IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp);
+ } else {
+ ill = ill_lookup_group_v6(v6dst, ixa->ixa_zoneid, ipst,
+ multirtp, setsrcp);
+ }
+ if (ill != NULL && IS_VNI(ill)) {
+ ill_refrele(ill);
+ ill = NULL;
+ }
+ if (ill == NULL) {
+ if (errorp != NULL)
+ *errorp = ENXIO;
+ /* Get a hold on the IRE_NOROUTE */
+ ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4));
+ return (ire);
+ }
+ if (!(ill->ill_flags & ILLF_MULTICAST)) {
+ ill_refrele(ill);
+ if (errorp != NULL)
+ *errorp = EHOSTUNREACH;
+ /* Get a hold on the IRE_NOROUTE */
+ ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4));
+ return (ire);
+ }
+ /* Get a refcnt on the single IRE_MULTICAST per ill */
+ ire = ire_multicast(ill);
+ ill_refrele(ill);
+ if (generationp != NULL)
+ *generationp = ire->ire_generation;
+ if (errorp != NULL &&
+ (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
+ *errorp = EHOSTUNREACH;
+ }
+ return (ire);
+ }
+
+ if (ixa->ixa_ifindex != 0 || (ixaflags & IXAF_SCOPEID_SET)) {
+ if (ixaflags & IXAF_SCOPEID_SET) {
+ /* sin6_scope_id takes precedence over ixa_ifindex */
+ ASSERT(ixa->ixa_scopeid != 0);
+ ill = ill_lookup_on_ifindex(ixa->ixa_scopeid,
+ !(ixaflags & IXAF_IS_IPV4), ipst);
+ } else {
+ ASSERT(ixa->ixa_ifindex != 0);
+ ill = ill_lookup_on_ifindex(ixa->ixa_ifindex,
+ !(ixaflags & IXAF_IS_IPV4), ipst);
+ }
+ if (ill != NULL && IS_VNI(ill)) {
+ ill_refrele(ill);
+ ill = NULL;
+ }
+ if (ill == NULL) {
+ if (errorp != NULL)
+ *errorp = ENXIO;
+ /* Get a hold on the IRE_NOROUTE */
+ ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4));
+ return (ire);
+ }
+ /*
+ * icmp_send_reply_v6 uses scopeid, and mpathd sets IP*_BOUND_IF
+ * so for both of them we need to be able to look for an under
+ * interface.
+ */
+ if (IS_UNDER_IPMP(ill))
+ match_args |= MATCH_IRE_TESTHIDDEN;
+ } else {
+ ill = NULL;
+ }
+
+ if (ixaflags & IXAF_NEXTHOP_SET) {
+ /* IP_NEXTHOP was set */
+ v6nexthop = ixa->ixa_nexthop_v6;
+ } else {
+ v6nexthop = *v6dst;
+ }
+
+ ire_type = 0;
+ /* If ill is null then ire_route_recursive will set MATCH_IRE_ILL */
+
+ /*
+ * If SO_DONTROUTE is set or if IP_NEXTHOP is set, then
+ * we only look for an onlink IRE.
+ */
+ if (ixaflags & (IXAF_DONTROUTE|IXAF_NEXTHOP_SET)) {
+ match_args |= MATCH_IRE_TYPE;
+ ire_type = IRE_ONLINK;
+ }
+
+ if (ixaflags & IXAF_IS_IPV4) {
+ ipaddr_t v4nexthop;
+ ipaddr_t v4setsrc = INADDR_ANY;
+
+ IN6_V4MAPPED_TO_IPADDR(&v6nexthop, v4nexthop);
+ ire = ire_route_recursive_v4(v4nexthop, ire_type, ill,
+ ixa->ixa_zoneid, ixa->ixa_tsl, match_args, B_TRUE,
+ ixa->ixa_xmit_hint, ipst, &v4setsrc, NULL, generationp);
+ if (setsrcp != NULL)
+ IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp);
+ } else {
+ ire = ire_route_recursive_v6(&v6nexthop, ire_type, ill,
+ ixa->ixa_zoneid, ixa->ixa_tsl, match_args, B_TRUE,
+ ixa->ixa_xmit_hint, ipst, setsrcp, NULL, generationp);
+ }
+
+#ifdef DEBUG
+ if (match_args & MATCH_IRE_TESTHIDDEN) {
+ ip3dbg(("looking for hidden; dst %x ire %p\n",
+ v4dst, (void *)ire));
+ }
+#endif
+
+ if (ill != NULL)
+ ill_refrele(ill);
+
+ if ((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
+ (ire->ire_type & IRE_MULTICAST)) {
+ /* No ire_nce_cache */
+ return (ire);
+ }
+
+ /* Setup ire_nce_cache if it doesn't exist or is condemned. */
+ mutex_enter(&ire->ire_lock);
+ nce = ire->ire_nce_cache;
+ if (nce == NULL || nce->nce_is_condemned) {
+ mutex_exit(&ire->ire_lock);
+ (void) ire_revalidate_nce(ire);
+ } else {
+ mutex_exit(&ire->ire_lock);
+ }
+ return (ire);
+}
+
+/*
+ * Find a route given some xmit attributes and a packet.
+ * Generic for IPv4 and IPv6
+ *
+ * This never returns NULL. But when it returns the IRE_NOROUTE
+ * it might set errorp.
+ */
+ire_t *
+ip_select_route_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp,
+ int *errorp, boolean_t *multirtp)
+{
+ if (ixa->ixa_flags & IXAF_IS_IPV4) {
+ ipha_t *ipha = (ipha_t *)mp->b_rptr;
+ in6_addr_t v6dst;
+
+ IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst);
+
+ return (ip_select_route(&v6dst, ixa, generationp,
+ NULL, errorp, multirtp));
+ } else {
+ ip6_t *ip6h = (ip6_t *)mp->b_rptr;
+
+ return (ip_select_route(&ip6h->ip6_dst, ixa, generationp,
+ NULL, errorp, multirtp));
+ }
+}
+
+ire_t *
+ip_select_route_v4(ipaddr_t dst, ip_xmit_attr_t *ixa, uint_t *generationp,
+ ipaddr_t *v4setsrcp, int *errorp, boolean_t *multirtp)
+{
+ in6_addr_t v6dst;
+ ire_t *ire;
+ in6_addr_t setsrc;
+
+ ASSERT(ixa->ixa_flags & IXAF_IS_IPV4);
+
+ IN6_IPADDR_TO_V4MAPPED(dst, &v6dst);
+
+ setsrc = ipv6_all_zeros;
+ ire = ip_select_route(&v6dst, ixa, generationp, &setsrc, errorp,
+ multirtp);
+ if (v4setsrcp != NULL)
+ IN6_V4MAPPED_TO_IPADDR(&setsrc, *v4setsrcp);
+ return (ire);
+}
+
+/*
+ * Recursively look for a route to the destination. Can also match on
+ * the zoneid, ill, and label. Used for the data paths. See also
+ * ire_route_recursive.
+ *
+ * If ill is set this means we will match it by adding MATCH_IRE_ILL.
+ *
+ * Note that this function never returns NULL. It returns an IRE_NOROUTE
+ * instead.
+ *
+ * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
+ * is an error.
+ * Allow at most one RTF_INDIRECT.
+ */
+ire_t *
+ire_route_recursive_impl_v4(ire_t *ire,
+ ipaddr_t nexthop, uint_t ire_type, const ill_t *ill_arg,
+ zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
+ boolean_t allocate, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp,
+ tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
+{
+ int i, j;
+ ire_t *ires[MAX_IRE_RECURSION];
+ uint_t generation;
+ uint_t generations[MAX_IRE_RECURSION];
+ boolean_t need_refrele = B_FALSE;
+ boolean_t invalidate = B_FALSE;
+ int prefs[MAX_IRE_RECURSION];
+ ill_t *ill = NULL;
+
+ if (setsrcp != NULL)
+ ASSERT(*setsrcp == INADDR_ANY);
+ if (gwattrp != NULL)
+ ASSERT(*gwattrp == NULL);
+
+ if (ill_arg != NULL)
+ match_args |= MATCH_IRE_ILL;
+
+ /*
+ * We iterate up to three times to resolve a route, even though
+ * we have four slots in the array. The extra slot is for an
+ * IRE_IF_CLONE we might need to create.
+ */
+ i = 0;
+ while (i < MAX_IRE_RECURSION - 1) {
+ /* ire_ftable_lookup handles round-robin/ECMP */
+ if (ire == NULL) {
+ ire = ire_ftable_lookup_v4(nexthop, 0, 0, ire_type,
+ (ill_arg != NULL ? ill_arg : ill), zoneid, tsl,
+ match_args, xmit_hint, ipst, &generation);
+ } else {
+ /* Caller passed it; extra hold since we will rele */
+ ire_refhold(ire);
+ if (generationp != NULL)
+ generation = *generationp;
+ else
+ generation = IRE_GENERATION_VERIFY;
+ }
+ if (ire == NULL)
+ ire = ire_reject(ipst, B_FALSE);
+
+ /* Need to return the ire with RTF_REJECT|BLACKHOLE */
+ if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
+ goto error;
+
+ ASSERT(!(ire->ire_type & IRE_MULTICAST)); /* Not in ftable */
+
+ prefs[i] = ire_pref(ire);
+ if (i != 0) {
+ /*
+ * Don't allow anything unusual past the first
+ * iteration.
+ */
+ if ((ire->ire_type &
+ (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST)) ||
+ prefs[i] <= prefs[i-1]) {
+ ire_refrele(ire);
+ ire = ire_reject(ipst, B_FALSE);
+ goto error;
+ }
+ }
+ /* We have a usable IRE */
+ ires[i] = ire;
+ generations[i] = generation;
+ i++;
+
+ /* The first RTF_SETSRC address is passed back if setsrcp */
+ if ((ire->ire_flags & RTF_SETSRC) &&
+ setsrcp != NULL && *setsrcp == INADDR_ANY) {
+ ASSERT(ire->ire_setsrc_addr != INADDR_ANY);
+ *setsrcp = ire->ire_setsrc_addr;
+ }
+
+ /* The first ire_gw_secattr is passed back if gwattrp */
+ if (ire->ire_gw_secattr != NULL &&
+ gwattrp != NULL && *gwattrp == NULL)
+ *gwattrp = ire->ire_gw_secattr;
+
+ /*
+ * Check if we have a short-cut pointer to an IRE for this
+ * destination, and that the cached dependency isn't stale.
+ * In that case we've rejoined an existing tree towards a
+ * parent, thus we don't need to continue the loop to
+ * discover the rest of the tree.
+ */
+ mutex_enter(&ire->ire_lock);
+ if (ire->ire_dep_parent != NULL &&
+ ire->ire_dep_parent->ire_generation ==
+ ire->ire_dep_parent_generation) {
+ mutex_exit(&ire->ire_lock);
+ ire = NULL;
+ goto done;
+ }
+ mutex_exit(&ire->ire_lock);
+
+ /*
+ * If this type should have an ire_nce_cache (even if it
+ * doesn't yet have one) then we are done. Includes
+ * IRE_INTERFACE with a full 32 bit mask.
+ */
+ if (ire->ire_nce_capable) {
+ ire = NULL;
+ goto done;
+ }
+ ASSERT(!(ire->ire_type & IRE_IF_CLONE));
+ /*
+ * For an IRE_INTERFACE we create an IRE_IF_CLONE for this
+ * particular destination
+ */
+ if (ire->ire_type & IRE_INTERFACE) {
+ in6_addr_t v6nexthop;
+ ire_t *clone;
+
+ ASSERT(ire->ire_masklen != IPV4_ABITS);
+
+ /*
+ * In the case of ip_input and ILLF_FORWARDING not
+ * being set, and in the case of RTM_GET,
+ * there is no point in allocating
+ * an IRE_IF_CLONE. We return the IRE_INTERFACE.
+ * Note that !allocate can result in a ire_dep_parent
+ * which is IRE_IF_* without an IRE_IF_CLONE.
+ * We recover from that when we need to send packets
+ * by ensuring that the generations become
+ * IRE_GENERATION_VERIFY in this case.
+ */
+ if (!allocate) {
+ invalidate = B_TRUE;
+ ire = NULL;
+ goto done;
+ }
+
+ IN6_IPADDR_TO_V4MAPPED(nexthop, &v6nexthop);
+
+ clone = ire_create_if_clone(ire, &v6nexthop,
+ &generation);
+ if (clone == NULL) {
+ /*
+ * Temporary failure - no memory.
+ * Don't want caller to cache IRE_NOROUTE.
+ */
+ invalidate = B_TRUE;
+ ire = ire_blackhole(ipst, B_FALSE);
+ goto error;
+ }
+ /*
+ * Make clone next to last entry and the
+ * IRE_INTERFACE the last in the dependency
+ * chain since the clone depends on the
+ * IRE_INTERFACE.
+ */
+ ASSERT(i >= 1);
+ ASSERT(i < MAX_IRE_RECURSION);
+
+ ires[i] = ires[i-1];
+ generations[i] = generations[i-1];
+ ires[i-1] = clone;
+ generations[i-1] = generation;
+ i++;
+
+ ire = NULL;
+ goto done;
+ }
+
+ /*
+ * We only match on the type and optionally ILL when
+ * recursing. The type match is used by some callers
+ * to exclude certain types (such as IRE_IF_CLONE or
+ * IRE_LOCAL|IRE_LOOPBACK).
+ */
+ match_args &= MATCH_IRE_TYPE;
+ nexthop = ire->ire_gateway_addr;
+ if (ill == NULL && ire->ire_ill != NULL) {
+ ill = ire->ire_ill;
+ need_refrele = B_TRUE;
+ ill_refhold(ill);
+ match_args |= MATCH_IRE_ILL;
+ }
+ ire = NULL;
+ }
+ ASSERT(ire == NULL);
+ ire = ire_reject(ipst, B_FALSE);
+
+error:
+ ASSERT(ire != NULL);
+ if (need_refrele)
+ ill_refrele(ill);
+
+ /*
+ * In the case of MULTIRT we want to try a different IRE the next
+ * time. We let the next packet retry in that case.
+ */
+ if (i > 0 && (ires[0]->ire_flags & RTF_MULTIRT))
+ (void) ire_no_good(ires[0]);
+
+cleanup:
+ /* cleanup ires[i] */
+ ire_dep_unbuild(ires, i);
+ for (j = 0; j < i; j++)
+ ire_refrele(ires[j]);
+
+ ASSERT(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE));
+ /*
+ * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the
+ * ip_select_route since the reject or lack of memory might be gone.
+ */
+ if (generationp != NULL)
+ *generationp = IRE_GENERATION_VERIFY;
+ return (ire);
+
+done:
+ ASSERT(ire == NULL);
+ if (need_refrele) {
+ ill_refrele(ill);
+ ill = NULL;
+ }
+
+ /* Build dependencies */
+ if (!ire_dep_build(ires, generations, i)) {
+ /* Something in chain was condemned; tear it apart */
+ ire = ire_reject(ipst, B_FALSE);
+ goto cleanup;
+ }
+
+ /*
+ * Release all refholds except the one for ires[0] that we
+ * will return to the caller.
+ */
+ for (j = 1; j < i; j++)
+ ire_refrele(ires[j]);
+
+ if (invalidate) {
+ /*
+ * Since we needed to allocate but couldn't we need to make
+ * sure that the dependency chain is rebuilt the next time.
+ */
+ ire_dep_invalidate_generations(ires[0]);
+ generation = IRE_GENERATION_VERIFY;
+ } else {
+ /*
+ * IREs can have been added or deleted while we did the
+ * recursive lookup and we can't catch those until we've built
+ * the dependencies. We verify the stored
+ * ire_dep_parent_generation to catch any such changes and
+ * return IRE_GENERATION_VERIFY (which will cause
+ * ip_select_route to be called again so we can redo the
+ * recursive lookup next time we send a packet).
+ */
+ generation = ire_dep_validate_generations(ires[0]);
+ if (generations[0] != ires[0]->ire_generation) {
+ /* Something changed at the top */
+ generation = IRE_GENERATION_VERIFY;
+ }
+ }
+ if (generationp != NULL)
+ *generationp = generation;
+
+ return (ires[0]);
+}
+
+ire_t *
+ire_route_recursive_v4(ipaddr_t nexthop, uint_t ire_type, const ill_t *ill,
+ zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
+ boolean_t allocate, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp,
+ tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
+{
+ return (ire_route_recursive_impl_v4(NULL, nexthop, ire_type, ill,
+ zoneid, tsl, match_args, allocate, xmit_hint, ipst, setsrcp,
+ gwattrp, generationp));
+}
+
+/*
+ * Recursively look for a route to the destination.
+ * We only handle a destination match here, yet we have the same arguments
+ * as the full match to allow function pointers to select between the two.
+ *
+ * Note that this function never returns NULL. It returns an IRE_NOROUTE
+ * instead.
+ *
+ * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
+ * is an error.
+ * Allow at most one RTF_INDIRECT.
+ */
+ire_t *
+ire_route_recursive_dstonly_v4(ipaddr_t nexthop, boolean_t allocate,
+ uint32_t xmit_hint, ip_stack_t *ipst)
+{
+ ire_t *ire;
+ ire_t *ire1;
+ uint_t generation;
+
+ /* ire_ftable_lookup handles round-robin/ECMP */
+ ire = ire_ftable_lookup_simple_v4(nexthop, xmit_hint, ipst,
+ &generation);
+ ASSERT(ire != NULL);
+
+ /*
+ * If this type should have an ire_nce_cache (even if it
+ * doesn't yet have one) then we are done. Includes
+ * IRE_INTERFACE with a full 32 bit mask.
+ */
+ if (ire->ire_nce_capable)
+ return (ire);
+
+ /*
+ * If the IRE has a current cached parent we know that the whole
+ * parent chain is current, hence we don't need to discover and
+ * build any dependencies by doing a recursive lookup.
+ */
+ mutex_enter(&ire->ire_lock);
+ if (ire->ire_dep_parent != NULL &&
+ ire->ire_dep_parent->ire_generation ==
+ ire->ire_dep_parent_generation) {
+ mutex_exit(&ire->ire_lock);
+ return (ire);
+ }
+ mutex_exit(&ire->ire_lock);
+
+ /*
+ * Fallback to loop in the normal code starting with the ire
+ * we found. Normally this would return the same ire.
+ */
+ ire1 = ire_route_recursive_impl_v4(ire, nexthop, 0, NULL, ALL_ZONES,
+ NULL, MATCH_IRE_DSTONLY, allocate, xmit_hint, ipst, NULL, NULL,
+ &generation);
+ ire_refrele(ire);
+ return (ire1);
+}
diff --git a/usr/src/uts/common/inet/ip/ip_helper_stream.c b/usr/src/uts/common/inet/ip/ip_helper_stream.c
index 6f5608e950..3fa6364417 100644
--- a/usr/src/uts/common/inet/ip/ip_helper_stream.c
+++ b/usr/src/uts/common/inet/ip/ip_helper_stream.c
@@ -58,14 +58,14 @@ static struct qinit ip_helper_stream_winit = {
&ip_helper_stream_info, NULL, NULL, NULL, STRUIOT_NONE
};
-#define IP_USE_HELPER_CACHE (ip_helper_stream_cache != NULL)
-
/*
* set the q_ptr of the 'q' to the conn_t pointer passed in
*/
static void
ip_helper_share_conn(queue_t *q, mblk_t *mp, cred_t *crp)
{
+ conn_t *connp = *((conn_t **)mp->b_cont->b_rptr);
+
/*
* This operation is allowed only on helper streams with kcred
*/
@@ -75,24 +75,12 @@ ip_helper_share_conn(queue_t *q, mblk_t *mp, cred_t *crp)
return;
}
- if (IP_USE_HELPER_CACHE) {
- ip_helper_stream_info_t *ip_helper_info;
-
- ip_helper_info = *((ip_helper_stream_info_t **)
- mp->b_cont->b_rptr);
- ip_helper_info->iphs_minfo = q->q_ptr;
- ip_helper_info->iphs_rq = RD(q);
- ip_helper_info->iphs_wq = WR(q);
- } else {
- conn_t *connp = *((conn_t **)mp->b_cont->b_rptr);
-
- connp->conn_helper_info->iphs_minfo = q->q_ptr;
- connp->conn_helper_info->iphs_rq = RD(q);
- connp->conn_helper_info->iphs_wq = WR(q);
- WR(q)->q_ptr = RD(q)->q_ptr = (void *)connp;
- connp->conn_rq = RD(q);
- connp->conn_wq = WR(q);
- }
+ connp->conn_helper_info->iphs_minfo = q->q_ptr;
+ connp->conn_helper_info->iphs_rq = RD(q);
+ connp->conn_helper_info->iphs_wq = WR(q);
+ WR(q)->q_ptr = RD(q)->q_ptr = (void *)connp;
+ connp->conn_rq = RD(q);
+ connp->conn_wq = WR(q);
miocack(q, mp, 0, 0);
}
@@ -104,17 +92,13 @@ ip_helper_wput(queue_t *q, mblk_t *mp)
iocp->ioc_cmd == SIOCSQPTR) {
ip_helper_share_conn(q, mp, iocp->ioc_cr);
} else {
- conn_t *connp = (conn_t *)q->q_ptr;
-
- if (connp->conn_af_isv6) {
- ip_wput_v6(q, mp);
- } else {
- ip_wput(q, mp);
- }
+ /* We only handle ioctl related messages here */
+ ASSERT(DB_TYPE(mp) != M_DATA);
+ ip_wput_nondata(q, mp);
}
}
-/* ARGSUSED */
+/* ARGSUSED3 */
int
ip_helper_stream_setup(queue_t *q, dev_t *devp, int flag, int sflag,
cred_t *credp, boolean_t isv6)
@@ -126,10 +110,8 @@ ip_helper_stream_setup(queue_t *q, dev_t *devp, int flag, int sflag,
ASSERT(RD(q) == q);
- ip_minfop = kmem_alloc(sizeof (ip_helper_minfo_t), KM_NOSLEEP);
- if (ip_minfop == NULL) {
- return (ENOMEM);
- }
+ ip_minfop = kmem_alloc(sizeof (ip_helper_minfo_t), KM_SLEEP);
+ ASSERT(ip_minfop != NULL);
ip_minfop->ip_minfo_dev = 0;
ip_minfop->ip_minfo_arena = NULL;
@@ -171,7 +153,7 @@ ip_helper_stream_setup(queue_t *q, dev_t *devp, int flag, int sflag,
return (0);
}
-/* ARGSUSED */
+/* ARGSUSED1 */
static int
ip_helper_stream_close(queue_t *q, int flag)
{
@@ -189,305 +171,91 @@ ip_helper_stream_close(queue_t *q, int flag)
/*
* Public interface for creating an IP stream with shared conn_t
+ * Handles multiple callers in parallel by using conn_lock.
+ * Note that we allocate the helper stream without any locks, which means
+ * we might need to free it if we had two threads doing this concurrently
+ * for the conn_t.
*/
-/* ARGSUSED */
int
ip_create_helper_stream(conn_t *connp, ldi_ident_t li)
{
+ ip_helper_stream_info_t *helper;
int error;
int ret;
ASSERT(!servicing_interrupt());
- error = 0;
- if (IP_USE_HELPER_CACHE) {
- connp->conn_helper_info = kmem_cache_alloc(
- ip_helper_stream_cache, KM_NOSLEEP);
- if (connp->conn_helper_info == NULL)
- return (EAGAIN);
- connp->conn_rq = connp->conn_helper_info->iphs_rq;
- connp->conn_wq = connp->conn_helper_info->iphs_wq;
- /*
- * Doesn't need to hold the QLOCK for there is no one else
- * should have a pointer to this queue.
- */
- connp->conn_rq->q_flag |= QWANTR;
- connp->conn_wq->q_flag |= QWANTR;
-
- connp->conn_rq->q_ptr = connp;
- connp->conn_wq->q_ptr = connp;
- } else {
- ASSERT(connp->conn_helper_info == NULL);
- connp->conn_helper_info = kmem_alloc(
- sizeof (ip_helper_stream_info_t), KM_SLEEP);
- /*
- * open ip device via the layered interface.
- * pass in kcred as some threads do not have the
- * priviledge to open /dev/ip and the check in
- * secpolicy_spec_open() will fail the open
- */
- error = ldi_open_by_name(connp->conn_af_isv6 ?
- DEV_IP6 : DEV_IP, IP_HELPER_STR,
- kcred, &connp->conn_helper_info->iphs_handle, li);
-
- if (error != 0) {
- kmem_free(connp->conn_helper_info,
- (sizeof (ip_helper_stream_info_t)));
- connp->conn_helper_info = NULL;
- return (error);
- }
- /*
- * Share connp with the helper stream
- */
- error = ldi_ioctl(connp->conn_helper_info->iphs_handle,
- SIOCSQPTR, (intptr_t)connp, FKIOCTL, kcred, &ret);
-
- if (error != 0) {
- /*
- * Passing in a zero flag indicates that an error
- * occured and stream was not shared
- */
- (void) ldi_close(connp->conn_helper_info->iphs_handle,
- 0, kcred);
- kmem_free(connp->conn_helper_info,
- (sizeof (ip_helper_stream_info_t)));
- connp->conn_helper_info = NULL;
- }
+ if (connp->conn_helper_info != NULL) {
+ /* Already allocated */
+ return (0);
}
- return (error);
-}
-
-/*
- * Public interface for freeing IP helper stream
- */
-/* ARGSUSED */
-void
-ip_free_helper_stream(conn_t *connp)
-{
- ASSERT(!servicing_interrupt());
- if (IP_USE_HELPER_CACHE) {
-
- if (connp->conn_helper_info == NULL)
- return;
- ASSERT(connp->conn_helper_info->iphs_rq != NULL);
- ASSERT(connp->conn_helper_info->iphs_wq != NULL);
-
- /* Prevent service procedures from being called */
- disable_svc(connp->conn_helper_info->iphs_rq);
-
- /* Wait until service procedure of each queue is run */
- wait_svc(connp->conn_helper_info->iphs_rq);
-
- /* Cleanup any pending ioctls */
- conn_ioctl_cleanup(connp);
-
- /* Allow service procedures to be called again */
- enable_svc(connp->conn_helper_info->iphs_rq);
-
- /* Flush the queues */
- flushq(connp->conn_helper_info->iphs_rq, FLUSHALL);
- flushq(connp->conn_helper_info->iphs_wq, FLUSHALL);
-
- connp->conn_helper_info->iphs_rq->q_ptr = NULL;
- connp->conn_helper_info->iphs_wq->q_ptr = NULL;
-
- kmem_cache_free(ip_helper_stream_cache,
- connp->conn_helper_info);
- } else {
- ASSERT(
- connp->conn_helper_info->iphs_handle != NULL);
-
- connp->conn_helper_info->iphs_rq->q_ptr =
- connp->conn_helper_info->iphs_wq->q_ptr =
- connp->conn_helper_info->iphs_minfo;
- (void) ldi_close(connp->conn_helper_info->iphs_handle,
- IP_HELPER_STR, kcred);
- kmem_free(connp->conn_helper_info,
- sizeof (ip_helper_stream_info_t));
- }
- connp->conn_helper_info = NULL;
-}
-
-/*
- * create a T_SVR4_OPTMGMT_REQ TPI message and send down the IP stream
- */
-static int
-ip_send_option_request(conn_t *connp, uint_t optset_context, int level,
- int option_name, const void *optval, t_uscalar_t optlen, cred_t *cr)
-{
- struct T_optmgmt_req *optmgmt_reqp;
- struct opthdr *ohp;
- ssize_t size;
- mblk_t *mp;
-
- size = sizeof (struct T_optmgmt_req) + sizeof (struct opthdr) + optlen;
- /* Not used to generate UCRED, thus don't need correct pid */
- mp = allocb_cred(size, cr, NOPID);
- if (mp == NULL)
- return (ENOMEM);
-
- mp->b_datap->db_type = M_PROTO;
- optmgmt_reqp = (struct T_optmgmt_req *)mp->b_wptr;
-
- optmgmt_reqp->PRIM_type = T_SVR4_OPTMGMT_REQ;
- optmgmt_reqp->MGMT_flags = optset_context;
- optmgmt_reqp->OPT_length = (t_scalar_t)sizeof (struct opthdr) + optlen;
- optmgmt_reqp->OPT_offset = (t_scalar_t)sizeof (struct T_optmgmt_req);
-
- mp->b_wptr += sizeof (struct T_optmgmt_req);
-
- ohp = (struct opthdr *)mp->b_wptr;
- ohp->level = level;
- ohp->name = option_name;
- ohp->len = optlen;
-
- mp->b_wptr += sizeof (struct opthdr);
-
- if (optval != NULL) {
- bcopy(optval, mp->b_wptr, optlen);
- } else {
- bzero(mp->b_wptr, optlen);
- }
- mp->b_wptr += optlen;
+ error = 0;
+ helper = kmem_alloc(sizeof (ip_helper_stream_info_t), KM_SLEEP);
/*
- * Send down the primitive
+ * open ip device via the layered interface.
+ * pass in kcred as some threads do not have the
+ * priviledge to open /dev/ip and the check in
+ * secpolicy_spec_open() will fail the open
*/
- return (ldi_putmsg(connp->conn_helper_info->iphs_handle, mp));
-}
+ error = ldi_open_by_name((connp->conn_family == AF_INET6 ? DEV_IP6 :
+ DEV_IP), IP_HELPER_STR, kcred, &helper->iphs_handle, li);
-/*
- * wait/process the response to T_SVR4_OPTMGMT_REQ TPI message
- */
-static int
-ip_get_option_response(conn_t *connp, uint_t optset_context, void *optval,
- t_uscalar_t *optlenp)
-{
- union T_primitives *tpr;
- int error;
- mblk_t *mp;
-
- mp = NULL;
-
- ASSERT(optset_context == T_CHECK || optset_context == T_NEGOTIATE);
- error = ldi_getmsg(connp->conn_helper_info->iphs_handle, &mp, NULL);
if (error != 0) {
+ kmem_free(helper, sizeof (ip_helper_stream_info_t));
return (error);
}
-
- if (DB_TYPE(mp) != M_PCPROTO || MBLKL(mp) < sizeof (tpr->type)) {
- error = EPROTO;
- goto done;
- }
-
- tpr = (union T_primitives *)mp->b_rptr;
-
- switch (tpr->type) {
- case T_OPTMGMT_ACK:
- if (MBLKL(mp) < TOPTMGMTACKSZ)
- error = EPROTO;
- break;
- case T_ERROR_ACK:
- if (MBLKL(mp) < TERRORACKSZ) {
- error = EPROTO;
- break;
- }
-
- if (tpr->error_ack.TLI_error == TSYSERR)
- error = tpr->error_ack.UNIX_error;
- else
- error = proto_tlitosyserr(tpr->error_ack.TLI_error);
- break;
- default:
- error = EPROTO;
- break;
+ /* Make sure we are the only one */
+ mutex_enter(&connp->conn_lock);
+ if (connp->conn_helper_info != NULL) {
+ /* Some other thread won - discard this stream */
+ mutex_exit(&connp->conn_lock);
+ (void) ldi_close(helper->iphs_handle, 0, kcred);
+ kmem_free(helper, sizeof (ip_helper_stream_info_t));
+ return (0);
}
+ connp->conn_helper_info = helper;
+ /*
+ * Share connp with the helper stream. We hold conn_lock across this
+ * operation.
+ */
+ error = ldi_ioctl(helper->iphs_handle, SIOCSQPTR, (intptr_t)connp,
+ FKIOCTL, kcred, &ret);
- if ((optset_context == T_CHECK) && (error == 0)) {
- struct opthdr *opt_res;
- t_uscalar_t len;
- t_uscalar_t size;
- t_uscalar_t maxlen = *optlenp;
- void *option;
- struct T_optmgmt_ack *optmgmt_ack;
-
- optmgmt_ack = (struct T_optmgmt_ack *)mp->b_rptr;
- opt_res = (struct opthdr *)
- ((uintptr_t)mp->b_rptr + optmgmt_ack->OPT_offset);
- /*
- * Check mblk boundary
- */
- if (!MBLKIN(mp, optmgmt_ack->OPT_offset,
- optmgmt_ack->OPT_length)) {
- error = EPROTO;
- goto done;
- }
-
- /*
- * Check alignment
- */
- if ((((uintptr_t)opt_res) & (__TPI_ALIGN_SIZE - 1)) != 0) {
- error = EPROTO;
- goto done;
- }
-
- option = &opt_res[1];
-
- /* check to ensure that the option is within bounds */
- if ((((uintptr_t)option + opt_res->len) < (uintptr_t)option) ||
- !MBLKIN(mp, sizeof (struct opthdr), opt_res->len)) {
- error = EPROTO;
- goto done;
- }
-
- len = opt_res->len;
- size = MIN(len, maxlen);
-
+ if (error != 0) {
/*
- * Copy data
+ * Passing in a zero flag indicates that an error
+ * occurred and stream was not shared
*/
- bcopy(option, optval, size);
- bcopy(&size, optlenp, sizeof (size));
+ (void) ldi_close(helper->iphs_handle, 0, kcred);
+ kmem_free(helper, sizeof (ip_helper_stream_info_t));
+ connp->conn_helper_info = NULL;
}
-
-done:
- freemsg(mp);
+ mutex_exit(&connp->conn_lock);
return (error);
}
/*
- * Public interface to get socketoptions via the ip helper stream.
- */
-int
-ip_get_options(conn_t *connp, int level, int option_name, void *optval,
- t_uscalar_t *optlenp, cred_t *cr)
-{
- int error;
-
- error = ip_send_option_request(connp, T_CHECK, level, option_name, NULL,
- *optlenp, cr);
- if (error)
- return (error);
-
- return (ip_get_option_response(connp, T_CHECK, optval, optlenp));
-}
-
-/*
- * Public interface to set socket options via the ip helper stream.
+ * Public interface for freeing IP helper stream
+ * Caller must ensure no concurrent use of the conn_t, which is normally
+ * done by calling this from the close routine when the conn_t is quiesced.
*/
-int
-ip_set_options(conn_t *connp, int level, int option_name, const void *optval,
- t_uscalar_t optlen, cred_t *cr)
+void
+ip_free_helper_stream(conn_t *connp)
{
+ ASSERT(!servicing_interrupt());
- int error;
+ if (connp->conn_helper_info == NULL)
+ return;
- error = ip_send_option_request(connp, T_NEGOTIATE, level, option_name,
- optval, optlen, cr);
- if (error)
- return (error);
+ ASSERT(connp->conn_helper_info->iphs_handle != NULL);
- return (ip_get_option_response(connp, T_NEGOTIATE, (void *)optval,
- &optlen));
+ connp->conn_helper_info->iphs_rq->q_ptr =
+ connp->conn_helper_info->iphs_wq->q_ptr =
+ connp->conn_helper_info->iphs_minfo;
+ (void) ldi_close(connp->conn_helper_info->iphs_handle,
+ IP_HELPER_STR, kcred);
+ kmem_free(connp->conn_helper_info, sizeof (ip_helper_stream_info_t));
+ connp->conn_helper_info = NULL;
}
diff --git a/usr/src/uts/common/inet/ip/ip_if.c b/usr/src/uts/common/inet/ip/ip_if.c
index b175f4530f..6066da35b4 100644
--- a/usr/src/uts/common/inet/ip/ip_if.c
+++ b/usr/src/uts/common/inet/ip/ip_if.c
@@ -72,6 +72,7 @@
#include <inet/mi.h>
#include <inet/nd.h>
#include <inet/arp.h>
+#include <inet/ip_arp.h>
#include <inet/mib2.h>
#include <inet/ip.h>
#include <inet/ip6.h>
@@ -88,12 +89,6 @@
#include <inet/ip_netinfo.h>
#include <inet/ilb_ip.h>
-#include <net/pfkeyv2.h>
-#include <inet/ipsec_info.h>
-#include <inet/sadb.h>
-#include <inet/ipsec_impl.h>
-#include <sys/iphada.h>
-
#include <netinet/igmp.h>
#include <inet/ip_listutils.h>
#include <inet/ipclassifier.h>
@@ -119,15 +114,6 @@ typedef struct ipft_s {
#define IPFT_F_NO_REPLY 0x1 /* IP ioctl does not expect any reply */
#define IPFT_F_SELF_REPLY 0x2 /* ioctl callee does the ioctl reply */
-typedef struct ip_sock_ar_s {
- union {
- area_t ip_sock_area;
- ared_t ip_sock_ared;
- areq_t ip_sock_areq;
- } ip_sock_ar_u;
- queue_t *ip_sock_ar_q;
-} ip_sock_ar_t;
-
static int nd_ill_forward_get(queue_t *, mblk_t *, caddr_t, cred_t *);
static int nd_ill_forward_set(queue_t *q, mblk_t *mp,
char *value, caddr_t cp, cred_t *ioc_cr);
@@ -148,7 +134,7 @@ static int ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
static int ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t, in6_addr_t,
queue_t *q, mblk_t *mp, boolean_t need_up);
static int ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp,
- int ioccmd, struct linkblk *li, boolean_t doconsist);
+ int ioccmd, struct linkblk *li);
static ipaddr_t ip_subnet_mask(ipaddr_t addr, ipif_t **, ip_stack_t *);
static void ip_wput_ioctl(queue_t *q, mblk_t *mp);
static void ipsq_flush(ill_t *ill);
@@ -159,17 +145,14 @@ static void ipsq_delete(ipsq_t *);
static ipif_t *ipif_allocate(ill_t *ill, int id, uint_t ire_type,
boolean_t initialize, boolean_t insert);
-static void ipif_check_bcast_ires(ipif_t *test_ipif);
static ire_t **ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep);
+static void ipif_delete_bcast_ires(ipif_t *ipif);
+static int ipif_add_ires_v4(ipif_t *, boolean_t);
static boolean_t ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif,
boolean_t isv6);
-static void ipif_down_delete_ire(ire_t *ire, char *ipif);
-static void ipif_delete_cache_ire(ire_t *, char *);
static int ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp);
static void ipif_free(ipif_t *ipif);
static void ipif_free_tail(ipif_t *ipif);
-static void ipif_mtu_change(ire_t *ire, char *ipif_arg);
-static void ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif);
static void ipif_set_default(ipif_t *ipif);
static int ipif_set_values(queue_t *q, mblk_t *mp,
char *interf_name, uint_t *ppa);
@@ -177,17 +160,13 @@ static int ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp,
queue_t *q);
static ipif_t *ipif_lookup_on_name(char *name, size_t namelen,
boolean_t do_alloc, boolean_t *exists, boolean_t isv6, zoneid_t zoneid,
- queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *);
-static void ipif_update_other_ipifs(ipif_t *old_ipif);
+ ip_stack_t *);
static int ill_alloc_ppa(ill_if_t *, ill_t *);
-static int ill_arp_off(ill_t *ill);
-static int ill_arp_on(ill_t *ill);
static void ill_delete_interface_type(ill_if_t *);
static int ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q);
static void ill_dl_down(ill_t *ill);
static void ill_down(ill_t *ill);
-static void ill_downi(ire_t *ire, char *ill_arg);
static void ill_free_mib(ill_t *ill);
static void ill_glist_delete(ill_t *);
static void ill_phyint_reinit(ill_t *ill);
@@ -199,38 +178,22 @@ static ip_v6intfid_func_t ip_ether_v6intfid, ip_ib_v6intfid;
static ip_v6intfid_func_t ip_ipv4_v6intfid, ip_ipv6_v6intfid;
static ip_v6intfid_func_t ip_ipmp_v6intfid, ip_nodef_v6intfid;
static ip_v6intfid_func_t ip_ipv4_v6destintfid, ip_ipv6_v6destintfid;
-static ip_v6mapinfo_func_t ip_ether_v6mapinfo, ip_ib_v6mapinfo;
-static ip_v6mapinfo_func_t ip_nodef_v6mapinfo;
-static ip_v4mapinfo_func_t ip_ether_v4mapinfo, ip_ib_v4mapinfo;
-static ip_v4mapinfo_func_t ip_nodef_v4mapinfo;
-static void ipif_save_ire(ipif_t *, ire_t *);
-static void ipif_remove_ire(ipif_t *, ire_t *);
-static void ip_cgtp_bcast_add(ire_t *, ire_t *, ip_stack_t *);
+static ip_v4mapinfo_func_t ip_ether_v4_mapping;
+static ip_v6mapinfo_func_t ip_ether_v6_mapping;
+static ip_v4mapinfo_func_t ip_ib_v4_mapping;
+static ip_v6mapinfo_func_t ip_ib_v6_mapping;
+static ip_v4mapinfo_func_t ip_mbcast_mapping;
+static void ip_cgtp_bcast_add(ire_t *, ip_stack_t *);
static void ip_cgtp_bcast_delete(ire_t *, ip_stack_t *);
static void phyint_free(phyint_t *);
-/*
- * Per-ill IPsec capabilities management.
- */
-static ill_ipsec_capab_t *ill_ipsec_capab_alloc(void);
-static void ill_ipsec_capab_free(ill_ipsec_capab_t *);
-static void ill_ipsec_capab_add(ill_t *, uint_t, boolean_t);
-static void ill_ipsec_capab_delete(ill_t *, uint_t);
-static boolean_t ill_ipsec_capab_resize_algparm(ill_ipsec_capab_t *, int);
-static void ill_capability_dispatch(ill_t *, mblk_t *, dl_capability_sub_t *,
- boolean_t);
+static void ill_capability_dispatch(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_id_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
-static void ill_capability_mdt_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
-static void ill_capability_mdt_reset_fill(ill_t *, mblk_t *);
-static void ill_capability_ipsec_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
-static void ill_capability_ipsec_reset_fill(ill_t *, mblk_t *);
static void ill_capability_hcksum_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_hcksum_reset_fill(ill_t *, mblk_t *);
static void ill_capability_zerocopy_ack(ill_t *, mblk_t *,
dl_capability_sub_t *);
static void ill_capability_zerocopy_reset_fill(ill_t *, mblk_t *);
-static int ill_capability_ipsec_reset_size(ill_t *, int *, int *, int *,
- int *);
static void ill_capability_dld_reset_fill(ill_t *, mblk_t *);
static void ill_capability_dld_ack(ill_t *, mblk_t *,
dl_capability_sub_t *);
@@ -242,11 +205,11 @@ static void ill_capability_send(ill_t *, mblk_t *);
static ill_t *ill_prev_usesrc(ill_t *);
static int ill_relink_usesrc_ills(ill_t *, ill_t *, uint_t);
static void ill_disband_usesrc_group(ill_t *);
-static void conn_cleanup_stale_ire(conn_t *, caddr_t);
+static void ip_sioctl_garp_reply(mblk_t *, ill_t *, void *, int);
#ifdef DEBUG
-static void ill_trace_cleanup(const ill_t *);
-static void ipif_trace_cleanup(const ipif_t *);
+static void ill_trace_cleanup(const ill_t *);
+static void ipif_trace_cleanup(const ipif_t *);
#endif
/*
@@ -255,182 +218,10 @@ static void ipif_trace_cleanup(const ipif_t *);
*/
int ip_min_frag_prune_time = 0;
-/*
- * max # of IPsec algorithms supported. Limited to 1 byte by PF_KEY
- * and the IPsec DOI
- */
-#define MAX_IPSEC_ALGS 256
-
-#define BITSPERBYTE 8
-#define BITS(type) (BITSPERBYTE * (long)sizeof (type))
-
-#define IPSEC_ALG_ENABLE(algs, algid) \
- ((algs)[(algid) / BITS(ipsec_capab_elem_t)] |= \
- (1 << ((algid) % BITS(ipsec_capab_elem_t))))
-
-#define IPSEC_ALG_IS_ENABLED(algid, algs) \
- ((algs)[(algid) / BITS(ipsec_capab_elem_t)] & \
- (1 << ((algid) % BITS(ipsec_capab_elem_t))))
-
-typedef uint8_t ipsec_capab_elem_t;
-
-/*
- * Per-algorithm parameters. Note that at present, only encryption
- * algorithms have variable keysize (IKE does not provide a way to negotiate
- * auth algorithm keysize).
- *
- * All sizes here are in bits.
- */
-typedef struct
-{
- uint16_t minkeylen;
- uint16_t maxkeylen;
-} ipsec_capab_algparm_t;
-
-/*
- * Per-ill capabilities.
- */
-struct ill_ipsec_capab_s {
- ipsec_capab_elem_t *encr_hw_algs;
- ipsec_capab_elem_t *auth_hw_algs;
- uint32_t algs_size; /* size of _hw_algs in bytes */
- /* algorithm key lengths */
- ipsec_capab_algparm_t *encr_algparm;
- uint32_t encr_algparm_size;
- uint32_t encr_algparm_end;
-};
-
-/*
- * The field values are larger than strictly necessary for simple
- * AR_ENTRY_ADDs but the padding lets us accomodate the socket ioctls.
- */
-static area_t ip_area_template = {
- AR_ENTRY_ADD, /* area_cmd */
- sizeof (ip_sock_ar_t) + (IP_ADDR_LEN*2) + sizeof (struct sockaddr_dl),
- /* area_name_offset */
- /* area_name_length temporarily holds this structure length */
- sizeof (area_t), /* area_name_length */
- IP_ARP_PROTO_TYPE, /* area_proto */
- sizeof (ip_sock_ar_t), /* area_proto_addr_offset */
- IP_ADDR_LEN, /* area_proto_addr_length */
- sizeof (ip_sock_ar_t) + IP_ADDR_LEN,
- /* area_proto_mask_offset */
- 0, /* area_flags */
- sizeof (ip_sock_ar_t) + IP_ADDR_LEN + IP_ADDR_LEN,
- /* area_hw_addr_offset */
- /* Zero length hw_addr_length means 'use your idea of the address' */
- 0 /* area_hw_addr_length */
-};
-
-/*
- * AR_ENTRY_ADD/DELETE templates have been added for IPv6 external resolver
- * support
- */
-static area_t ip6_area_template = {
- AR_ENTRY_ADD, /* area_cmd */
- sizeof (ip_sock_ar_t) + (IPV6_ADDR_LEN*2) + sizeof (sin6_t),
- /* area_name_offset */
- /* area_name_length temporarily holds this structure length */
- sizeof (area_t), /* area_name_length */
- IP_ARP_PROTO_TYPE, /* area_proto */
- sizeof (ip_sock_ar_t), /* area_proto_addr_offset */
- IPV6_ADDR_LEN, /* area_proto_addr_length */
- sizeof (ip_sock_ar_t) + IPV6_ADDR_LEN,
- /* area_proto_mask_offset */
- 0, /* area_flags */
- sizeof (ip_sock_ar_t) + IPV6_ADDR_LEN + IPV6_ADDR_LEN,
- /* area_hw_addr_offset */
- /* Zero length hw_addr_length means 'use your idea of the address' */
- 0 /* area_hw_addr_length */
-};
-
-static ared_t ip_ared_template = {
- AR_ENTRY_DELETE,
- sizeof (ared_t) + IP_ADDR_LEN,
- sizeof (ared_t),
- IP_ARP_PROTO_TYPE,
- sizeof (ared_t),
- IP_ADDR_LEN,
- 0
-};
-
-static ared_t ip6_ared_template = {
- AR_ENTRY_DELETE,
- sizeof (ared_t) + IPV6_ADDR_LEN,
- sizeof (ared_t),
- IP_ARP_PROTO_TYPE,
- sizeof (ared_t),
- IPV6_ADDR_LEN,
- 0
-};
-
-/*
- * A template for an IPv6 AR_ENTRY_QUERY template has not been created, as
- * as the areq doesn't include an IP address in ill_dl_up() (the only place a
- * areq is used).
- */
-static areq_t ip_areq_template = {
- AR_ENTRY_QUERY, /* cmd */
- sizeof (areq_t)+(2*IP_ADDR_LEN), /* name offset */
- sizeof (areq_t), /* name len (filled by ill_arp_alloc) */
- IP_ARP_PROTO_TYPE, /* protocol, from arps perspective */
- sizeof (areq_t), /* target addr offset */
- IP_ADDR_LEN, /* target addr_length */
- 0, /* flags */
- sizeof (areq_t) + IP_ADDR_LEN, /* sender addr offset */
- IP_ADDR_LEN, /* sender addr length */
- AR_EQ_DEFAULT_XMIT_COUNT, /* xmit_count */
- AR_EQ_DEFAULT_XMIT_INTERVAL, /* (re)xmit_interval in milliseconds */
- AR_EQ_DEFAULT_MAX_BUFFERED /* max # of requests to buffer */
- /* anything else filled in by the code */
-};
-
-static arc_t ip_aru_template = {
- AR_INTERFACE_UP,
- sizeof (arc_t), /* Name offset */
- sizeof (arc_t) /* Name length (set by ill_arp_alloc) */
-};
-
-static arc_t ip_ard_template = {
- AR_INTERFACE_DOWN,
- sizeof (arc_t), /* Name offset */
- sizeof (arc_t) /* Name length (set by ill_arp_alloc) */
-};
-
-static arc_t ip_aron_template = {
- AR_INTERFACE_ON,
- sizeof (arc_t), /* Name offset */
- sizeof (arc_t) /* Name length (set by ill_arp_alloc) */
-};
-
-static arc_t ip_aroff_template = {
- AR_INTERFACE_OFF,
- sizeof (arc_t), /* Name offset */
- sizeof (arc_t) /* Name length (set by ill_arp_alloc) */
-};
-
-static arma_t ip_arma_multi_template = {
- AR_MAPPING_ADD,
- sizeof (arma_t) + 3*IP_ADDR_LEN + IP_MAX_HW_LEN,
- /* Name offset */
- sizeof (arma_t), /* Name length (set by ill_arp_alloc) */
- IP_ARP_PROTO_TYPE,
- sizeof (arma_t), /* proto_addr_offset */
- IP_ADDR_LEN, /* proto_addr_length */
- sizeof (arma_t) + IP_ADDR_LEN, /* proto_mask_offset */
- sizeof (arma_t) + 2*IP_ADDR_LEN, /* proto_extract_mask_offset */
- ACE_F_PERMANENT | ACE_F_MAPPING, /* flags */
- sizeof (arma_t) + 3*IP_ADDR_LEN, /* hw_addr_offset */
- IP_MAX_HW_LEN, /* hw_addr_length */
- 0, /* hw_mapping_start */
-};
-
static ipft_t ip_ioctl_ftbl[] = {
{ IP_IOC_IRE_DELETE, ip_ire_delete, sizeof (ipid_t), 0 },
{ IP_IOC_IRE_DELETE_NO_REPLY, ip_ire_delete, sizeof (ipid_t),
IPFT_F_NO_REPLY },
- { IP_IOC_IRE_ADVISE_NO_REPLY, ip_ire_advise, sizeof (ipic_t),
- IPFT_F_NO_REPLY },
{ IP_IOC_RTS_REQUEST, ip_rts_request, 0, IPFT_F_SELF_REPLY },
{ 0 }
};
@@ -444,35 +235,38 @@ static uchar_t ip_six_byte_all_ones[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
static ip_m_t ip_m_tbl[] = {
{ DL_ETHER, IFT_ETHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
- ip_ether_v4mapinfo, ip_ether_v6mapinfo, ip_ether_v6intfid,
+ ip_ether_v4_mapping, ip_ether_v6_mapping, ip_ether_v6intfid,
ip_nodef_v6intfid },
{ DL_CSMACD, IFT_ISO88023, ETHERTYPE_IP, ETHERTYPE_IPV6,
- ip_ether_v4mapinfo, ip_ether_v6mapinfo, ip_nodef_v6intfid,
+ ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
ip_nodef_v6intfid },
{ DL_TPB, IFT_ISO88024, ETHERTYPE_IP, ETHERTYPE_IPV6,
- ip_ether_v4mapinfo, ip_ether_v6mapinfo, ip_nodef_v6intfid,
+ ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
ip_nodef_v6intfid },
{ DL_TPR, IFT_ISO88025, ETHERTYPE_IP, ETHERTYPE_IPV6,
- ip_ether_v4mapinfo, ip_ether_v6mapinfo, ip_nodef_v6intfid,
+ ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
ip_nodef_v6intfid },
{ DL_FDDI, IFT_FDDI, ETHERTYPE_IP, ETHERTYPE_IPV6,
- ip_ether_v4mapinfo, ip_ether_v6mapinfo, ip_ether_v6intfid,
+ ip_ether_v4_mapping, ip_ether_v6_mapping, ip_ether_v6intfid,
ip_nodef_v6intfid },
{ DL_IB, IFT_IB, ETHERTYPE_IP, ETHERTYPE_IPV6,
- ip_ib_v4mapinfo, ip_ib_v6mapinfo, ip_ib_v6intfid,
+ ip_ib_v4_mapping, ip_ib_v6_mapping, ip_ib_v6intfid,
+ ip_nodef_v6intfid },
+ { DL_IPV4, IFT_IPV4, IPPROTO_ENCAP, IPPROTO_IPV6,
+ ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv4_v6intfid,
+ ip_ipv4_v6destintfid },
+ { DL_IPV6, IFT_IPV6, IPPROTO_ENCAP, IPPROTO_IPV6,
+ ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv6_v6intfid,
+ ip_ipv6_v6destintfid },
+ { DL_6TO4, IFT_6TO4, IPPROTO_ENCAP, IPPROTO_IPV6,
+ ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv4_v6intfid,
ip_nodef_v6intfid },
- { DL_IPV4, IFT_IPV4, IPPROTO_ENCAP, IPPROTO_IPV6, ip_nodef_v4mapinfo,
- ip_nodef_v6mapinfo, ip_ipv4_v6intfid, ip_ipv4_v6destintfid },
- { DL_IPV6, IFT_IPV6, IPPROTO_ENCAP, IPPROTO_IPV6, ip_nodef_v4mapinfo,
- ip_nodef_v6mapinfo, ip_ipv6_v6intfid, ip_ipv6_v6destintfid },
- { DL_6TO4, IFT_6TO4, IPPROTO_ENCAP, IPPROTO_IPV6, ip_nodef_v4mapinfo,
- ip_nodef_v6mapinfo, ip_ipv4_v6intfid, ip_nodef_v6intfid },
{ SUNW_DL_VNI, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
NULL, NULL, ip_nodef_v6intfid, ip_nodef_v6intfid },
{ SUNW_DL_IPMP, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
NULL, NULL, ip_ipmp_v6intfid, ip_nodef_v6intfid },
{ DL_OTHER, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
- ip_ether_v4mapinfo, ip_ether_v6mapinfo, ip_nodef_v6intfid,
+ ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
ip_nodef_v6intfid }
};
@@ -567,149 +361,6 @@ ill_allocate_mibs(ill_t *ill)
}
/*
- * Common code for preparation of ARP commands. Two points to remember:
- * 1) The ill_name is tacked on at the end of the allocated space so
- * the templates name_offset field must contain the total space
- * to allocate less the name length.
- *
- * 2) The templates name_length field should contain the *template*
- * length. We use it as a parameter to bcopy() and then write
- * the real ill_name_length into the name_length field of the copy.
- * (Always called as writer.)
- */
-mblk_t *
-ill_arp_alloc(ill_t *ill, const uchar_t *template, caddr_t addr)
-{
- arc_t *arc = (arc_t *)template;
- char *cp;
- int len;
- mblk_t *mp;
- uint_t name_length = ill->ill_name_length;
- uint_t template_len = arc->arc_name_length;
-
- len = arc->arc_name_offset + name_length;
- mp = allocb(len, BPRI_HI);
- if (mp == NULL)
- return (NULL);
- cp = (char *)mp->b_rptr;
- mp->b_wptr = (uchar_t *)&cp[len];
- if (template_len)
- bcopy(template, cp, template_len);
- if (len > template_len)
- bzero(&cp[template_len], len - template_len);
- mp->b_datap->db_type = M_PROTO;
-
- arc = (arc_t *)cp;
- arc->arc_name_length = name_length;
- cp = (char *)arc + arc->arc_name_offset;
- bcopy(ill->ill_name, cp, name_length);
-
- if (addr) {
- area_t *area = (area_t *)mp->b_rptr;
-
- cp = (char *)area + area->area_proto_addr_offset;
- bcopy(addr, cp, area->area_proto_addr_length);
- if (area->area_cmd == AR_ENTRY_ADD) {
- cp = (char *)area;
- len = area->area_proto_addr_length;
- if (area->area_proto_mask_offset)
- cp += area->area_proto_mask_offset;
- else
- cp += area->area_proto_addr_offset + len;
- while (len-- > 0)
- *cp++ = (char)~0;
- }
- }
- return (mp);
-}
-
-mblk_t *
-ipif_area_alloc(ipif_t *ipif, uint_t optflags)
-{
- caddr_t addr;
- mblk_t *mp;
- area_t *area;
- uchar_t *areap;
- ill_t *ill = ipif->ipif_ill;
-
- if (ill->ill_isv6) {
- ASSERT(ill->ill_flags & ILLF_XRESOLV);
- addr = (caddr_t)&ipif->ipif_v6lcl_addr;
- areap = (uchar_t *)&ip6_area_template;
- } else {
- addr = (caddr_t)&ipif->ipif_lcl_addr;
- areap = (uchar_t *)&ip_area_template;
- }
-
- if ((mp = ill_arp_alloc(ill, areap, addr)) == NULL)
- return (NULL);
-
- /*
- * IPMP requires that the hardware address be included in all
- * AR_ENTRY_ADD requests so that ARP can deduce the arl to send on.
- * If there are no active underlying ills in the group (and thus no
- * hardware address, DAD will be deferred until an underlying ill
- * becomes active.
- */
- if (IS_IPMP(ill)) {
- if ((ill = ipmp_ipif_hold_bound_ill(ipif)) == NULL) {
- freemsg(mp);
- return (NULL);
- }
- } else {
- ill_refhold(ill);
- }
-
- area = (area_t *)mp->b_rptr;
- area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH | ACE_F_MYADDR;
- area->area_flags |= optflags;
- area->area_hw_addr_length = ill->ill_phys_addr_length;
- bcopy(ill->ill_phys_addr, mp->b_rptr + area->area_hw_addr_offset,
- area->area_hw_addr_length);
-
- ill_refrele(ill);
- return (mp);
-}
-
-mblk_t *
-ipif_ared_alloc(ipif_t *ipif)
-{
- caddr_t addr;
- uchar_t *aredp;
-
- if (ipif->ipif_ill->ill_isv6) {
- ASSERT(ipif->ipif_ill->ill_flags & ILLF_XRESOLV);
- addr = (caddr_t)&ipif->ipif_v6lcl_addr;
- aredp = (uchar_t *)&ip6_ared_template;
- } else {
- addr = (caddr_t)&ipif->ipif_lcl_addr;
- aredp = (uchar_t *)&ip_ared_template;
- }
-
- return (ill_arp_alloc(ipif->ipif_ill, aredp, addr));
-}
-
-mblk_t *
-ill_ared_alloc(ill_t *ill, ipaddr_t addr)
-{
- return (ill_arp_alloc(ill, (uchar_t *)&ip_ared_template,
- (char *)&addr));
-}
-
-mblk_t *
-ill_arie_alloc(ill_t *ill, const char *grifname, const void *template)
-{
- mblk_t *mp = ill_arp_alloc(ill, template, 0);
- arie_t *arie;
-
- if (mp != NULL) {
- arie = (arie_t *)mp->b_rptr;
- (void) strlcpy(arie->arie_grifname, grifname, LIFNAMSIZ);
- }
- return (mp);
-}
-
-/*
* Completely vaporize a lower level tap and all associated interfaces.
* ill_delete is called only out of ip_close when the device control
* stream is being closed.
@@ -735,8 +386,8 @@ ill_delete(ill_t *ill)
* remove it from the list, and free the data structure.
* Walk down the ipif list and remove the logical interfaces
* first before removing the main ipif. We can't unplumb
- * zeroth interface first in the case of IPv6 as reset_conn_ill
- * -> ip_ll_delmulti_v6 de-references ill_ipif for checking
+ * zeroth interface first in the case of IPv6 as update_conn_ill
+ * -> ip_ll_multireq de-references ill_ipif for checking
* POINTOPOINT.
*
* If ill_ipif was not properly initialized (i.e low on memory),
@@ -747,22 +398,15 @@ ill_delete(ill_t *ill)
ipif_free(ipif);
/*
- * Used only by ill_arp_on and ill_arp_off, which are writers.
- * So nobody can be using this mp now. Free the mp allocated for
- * honoring ILLF_NOARP
+ * clean out all the nce_t entries that depend on this
+ * ill for the ill_phys_addr.
*/
- freemsg(ill->ill_arp_on_mp);
- ill->ill_arp_on_mp = NULL;
+ nce_flush(ill, B_TRUE);
/* Clean up msgs on pending upcalls for mrouted */
reset_mrt_ill(ill);
- /*
- * ipif_free -> reset_conn_ipif will remove all multicast
- * references for IPv4. For IPv6, we need to do it here as
- * it points only at ills.
- */
- reset_conn_ill(ill);
+ update_conn_ill(ill, ipst);
/*
* Remove multicast references added as a result of calls to
@@ -786,6 +430,16 @@ ill_delete(ill_t *ill)
sctp_update_ill(ill, SCTP_ILL_REMOVE);
/*
+ * Walk all CONNs that can have a reference on an ire or nce for this
+ * ill (we actually walk all that now have stale references).
+ */
+ ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ipst);
+
+ /* With IPv6 we have dce_ifindex. Cleanup for neatness */
+ if (ill->ill_isv6)
+ dce_cleanup(ill->ill_phyint->phyint_ifindex, ipst);
+
+ /*
* If an address on this ILL is being used as a source address then
* clear out the pointers in other ILLs that point to this ILL.
*/
@@ -828,12 +482,10 @@ ill_delete_tail(ill_t *ill)
for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
ipif_non_duplicate(ipif);
- ipif_down_tail(ipif);
+ (void) ipif_down_tail(ipif);
}
- ASSERT(ill->ill_ipif_dup_count == 0 &&
- ill->ill_arp_down_mp == NULL &&
- ill->ill_arp_del_mapping_mp == NULL);
+ ASSERT(ill->ill_ipif_dup_count == 0);
/*
* If polling capability is enabled (which signifies direct
@@ -864,23 +516,6 @@ ill_delete_tail(ill_t *ill)
/*
* Free capabilities.
*/
- if (ill->ill_ipsec_capab_ah != NULL) {
- ill_ipsec_capab_delete(ill, DL_CAPAB_IPSEC_AH);
- ill_ipsec_capab_free(ill->ill_ipsec_capab_ah);
- ill->ill_ipsec_capab_ah = NULL;
- }
-
- if (ill->ill_ipsec_capab_esp != NULL) {
- ill_ipsec_capab_delete(ill, DL_CAPAB_IPSEC_ESP);
- ill_ipsec_capab_free(ill->ill_ipsec_capab_esp);
- ill->ill_ipsec_capab_esp = NULL;
- }
-
- if (ill->ill_mdt_capab != NULL) {
- kmem_free(ill->ill_mdt_capab, sizeof (ill_mdt_capab_t));
- ill->ill_mdt_capab = NULL;
- }
-
if (ill->ill_hcksum_capab != NULL) {
kmem_free(ill->ill_hcksum_capab, sizeof (ill_hcksum_capab_t));
ill->ill_hcksum_capab = NULL;
@@ -911,11 +546,10 @@ ill_delete_tail(ill_t *ill)
*
* We don't walk conns, mrts and ires because
*
- * 1) reset_conn_ill and reset_mrt_ill cleans up conns and mrts.
+ * 1) update_conn_ill and reset_mrt_ill cleans up conns and mrts.
* 2) ill_down ->ill_downi walks all the ires and cleans up
* ill references.
*/
- ASSERT(ilm_walk_ill(ill) == 0);
/*
* If this ill is an IPMP meta-interface, blow away the illgrp. This
@@ -974,6 +608,9 @@ ill_delete_tail(ill_t *ill)
ill_trace_cleanup(ill);
#endif
+ /* The default multicast interface might have changed */
+ ire_increment_multicast_generation(ipst, ill->ill_isv6);
+
/* Drop refcnt here */
netstack_rele(ill->ill_ipst->ips_netstack);
ill->ill_ipst = NULL;
@@ -1077,97 +714,6 @@ ill_dlur_gen(uchar_t *addr, uint_t addr_length, t_uscalar_t sap,
}
/*
- * Add the 'mp' to the list of pending mp's headed by ill_pending_mp. Return
- * an error if we already have 1 or more ioctls in progress. This is only
- * needed for SIOCG*ARP.
- */
-boolean_t
-ill_pending_mp_add(ill_t *ill, conn_t *connp, mblk_t *add_mp)
-{
- ASSERT(MUTEX_HELD(&ill->ill_lock));
- ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL));
- /* We should only see M_IOCDATA arp ioctls here. */
- ASSERT(add_mp->b_datap->db_type == M_IOCDATA);
-
- ASSERT(MUTEX_HELD(&connp->conn_lock));
- /*
- * Return error if the conn has started closing. The conn
- * could have finished cleaning up the pending mp list,
- * If so we should not add another mp to the list negating
- * the cleanup.
- */
- if (connp->conn_state_flags & CONN_CLOSING)
- return (B_FALSE);
- /*
- * Add the pending mp to the head of the list, chained by b_next.
- * Note down the conn on which the ioctl request came, in b_prev.
- * This will be used to later get the conn, when we get a response
- * on the ill queue, from some other module (typically arp)
- */
- add_mp->b_next = (void *)ill->ill_pending_mp;
- add_mp->b_queue = CONNP_TO_WQ(connp);
- ill->ill_pending_mp = add_mp;
- if (connp != NULL)
- connp->conn_oper_pending_ill = ill;
- return (B_TRUE);
-}
-
-/*
- * Retrieve the ill_pending_mp and return it. We have to walk the list
- * of mblks starting at ill_pending_mp, and match based on the ioc_id.
- */
-mblk_t *
-ill_pending_mp_get(ill_t *ill, conn_t **connpp, uint_t ioc_id)
-{
- mblk_t *prev = NULL;
- mblk_t *curr = NULL;
- uint_t id;
- conn_t *connp;
-
- /*
- * When the conn closes, conn_ioctl_cleanup needs to clean
- * up the pending mp, but it does not know the ioc_id and
- * passes in a zero for it.
- */
- mutex_enter(&ill->ill_lock);
- if (ioc_id != 0)
- *connpp = NULL;
-
- /* Search the list for the appropriate ioctl based on ioc_id */
- for (prev = NULL, curr = ill->ill_pending_mp; curr != NULL;
- prev = curr, curr = curr->b_next) {
- id = ((struct iocblk *)curr->b_rptr)->ioc_id;
- connp = Q_TO_CONN(curr->b_queue);
- /* Match based on the ioc_id or based on the conn */
- if ((id == ioc_id) || (ioc_id == 0 && connp == *connpp))
- break;
- }
-
- if (curr != NULL) {
- /* Unlink the mblk from the pending mp list */
- if (prev != NULL) {
- prev->b_next = curr->b_next;
- } else {
- ASSERT(ill->ill_pending_mp == curr);
- ill->ill_pending_mp = curr->b_next;
- }
-
- /*
- * conn refcnt must have been bumped up at the start of
- * the ioctl. So we can safely access the conn.
- */
- ASSERT(CONN_Q(curr->b_queue));
- *connpp = Q_TO_CONN(curr->b_queue);
- curr->b_next = NULL;
- curr->b_queue = NULL;
- }
-
- mutex_exit(&ill->ill_lock);
-
- return (curr);
-}
-
-/*
* Add the pending mp to the list. There can be only 1 pending mp
* in the list. Any exclusive ioctl that needs to wait for a response
* from another module or driver needs to use this function to set
@@ -1283,6 +829,7 @@ ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp)
ipxop_t *ipx;
queue_t *q;
ipif_t *ipif;
+ int cmd;
ASSERT(IAM_WRITER_ILL(ill));
ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop;
@@ -1312,11 +859,16 @@ ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp)
ipx->ipx_pending_ipif = NULL;
ipx->ipx_waitfor = 0;
ipx->ipx_current_ipif = NULL;
+ cmd = ipx->ipx_current_ioctl;
ipx->ipx_current_ioctl = 0;
ipx->ipx_current_done = B_TRUE;
mutex_exit(&ipx->ipx_lock);
if (DB_TYPE(mp) == M_IOCTL || DB_TYPE(mp) == M_IOCDATA) {
+ DTRACE_PROBE4(ipif__ioctl,
+ char *, "ipsq_pending_mp_cleanup",
+ int, cmd, ill_t *, ipif == NULL ? NULL : ipif->ipif_ill,
+ ipif_t *, ipif);
if (connp == NULL) {
ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL);
} else {
@@ -1337,43 +889,6 @@ ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp)
}
/*
- * The ill is closing. Cleanup all the pending mps. Called exclusively
- * towards the end of ill_delete. The refcount has gone to 0. So nobody
- * knows this ill, and hence nobody can add an mp to this list
- */
-static void
-ill_pending_mp_cleanup(ill_t *ill)
-{
- mblk_t *mp;
- queue_t *q;
-
- ASSERT(IAM_WRITER_ILL(ill));
-
- mutex_enter(&ill->ill_lock);
- /*
- * Every mp on the pending mp list originating from an ioctl
- * added 1 to the conn refcnt, at the start of the ioctl.
- * So bump it down now. See comments in ip_wput_nondata()
- */
- while (ill->ill_pending_mp != NULL) {
- mp = ill->ill_pending_mp;
- ill->ill_pending_mp = mp->b_next;
- mutex_exit(&ill->ill_lock);
-
- q = mp->b_queue;
- ASSERT(CONN_Q(q));
- mp->b_next = NULL;
- mp->b_prev = NULL;
- mp->b_queue = NULL;
- ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL);
- mutex_enter(&ill->ill_lock);
- }
- ill->ill_pending_ipif = NULL;
-
- mutex_exit(&ill->ill_lock);
-}
-
-/*
* Called in the conn close path and ill delete path
*/
static void
@@ -1435,6 +950,9 @@ ipsq_xopq_mp_cleanup(ill_t *ill, conn_t *connp)
curr->b_prev = NULL;
curr->b_queue = NULL;
if (DB_TYPE(curr) == M_IOCTL || DB_TYPE(curr) == M_IOCDATA) {
+ DTRACE_PROBE4(ipif__ioctl,
+ char *, "ipsq_xopq_mp_cleanup",
+ int, 0, ill_t *, NULL, ipif_t *, NULL);
ip_ioctl_finish(q, curr, ENXIO, connp != NULL ?
CONN_CLOSE : NO_COPYOUT, NULL);
} else {
@@ -1455,7 +973,6 @@ ipsq_xopq_mp_cleanup(ill_t *ill, conn_t *connp)
void
conn_ioctl_cleanup(conn_t *connp)
{
- mblk_t *curr;
ipsq_t *ipsq;
ill_t *ill;
boolean_t refheld;
@@ -1476,13 +993,6 @@ conn_ioctl_cleanup(conn_t *connp)
return;
}
- curr = ill_pending_mp_get(ill, &connp, 0);
- if (curr != NULL) {
- mutex_exit(&connp->conn_lock);
- CONN_DEC_REF(connp);
- inet_freemsg(curr);
- return;
- }
/*
* We may not be able to refhold the ill if the ill/ipif
* is changing. But we need to make sure that the ill will
@@ -1522,58 +1032,43 @@ conn_ioctl_cleanup(conn_t *connp)
/*
* ipcl_walk function for cleaning up conn_*_ill fields.
+ * Note that we leave ixa_multicast_ifindex, conn_incoming_ifindex, and
+ * conn_bound_if in place. We prefer dropping
+ * packets instead of sending them out the wrong interface, or accepting
+ * packets from the wrong ifindex.
*/
static void
conn_cleanup_ill(conn_t *connp, caddr_t arg)
{
ill_t *ill = (ill_t *)arg;
- ire_t *ire;
mutex_enter(&connp->conn_lock);
- if (connp->conn_multicast_ill == ill) {
- /* Revert to late binding */
- connp->conn_multicast_ill = NULL;
- }
- if (connp->conn_incoming_ill == ill)
- connp->conn_incoming_ill = NULL;
- if (connp->conn_outgoing_ill == ill)
- connp->conn_outgoing_ill = NULL;
if (connp->conn_dhcpinit_ill == ill) {
connp->conn_dhcpinit_ill = NULL;
ASSERT(ill->ill_dhcpinit != 0);
atomic_dec_32(&ill->ill_dhcpinit);
- }
- if (connp->conn_ire_cache != NULL) {
- ire = connp->conn_ire_cache;
- /*
- * Source address selection makes it possible for IRE_CACHE
- * entries to be created with ire_stq coming from interface X
- * and ipif coming from interface Y. Thus whenever interface
- * X goes down, remove all references to it by checking both
- * on ire_ipif and ire_stq.
- */
- if ((ire->ire_ipif != NULL && ire->ire_ipif->ipif_ill == ill) ||
- (ire->ire_type == IRE_CACHE &&
- ire->ire_stq == ill->ill_wq)) {
- connp->conn_ire_cache = NULL;
- mutex_exit(&connp->conn_lock);
- ire_refrele_notr(ire);
- return;
- }
+ ill_set_inputfn(ill);
}
mutex_exit(&connp->conn_lock);
}
-static void
+static int
ill_down_ipifs_tail(ill_t *ill)
{
ipif_t *ipif;
+ int err;
ASSERT(IAM_WRITER_ILL(ill));
for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
ipif_non_duplicate(ipif);
- ipif_down_tail(ipif);
+ /*
+ * ipif_down_tail will call arp_ll_down on the last ipif
+ * and typically return EINPROGRESS when the DL_UNBIND is sent.
+ */
+ if ((err = ipif_down_tail(ipif)) != 0)
+ return (err);
}
+ return (0);
}
/* ARGSUSED */
@@ -1581,7 +1076,7 @@ void
ipif_all_down_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
{
ASSERT(IAM_WRITER_IPSQ(ipsq));
- ill_down_ipifs_tail(q->q_ptr);
+ (void) ill_down_ipifs_tail(q->q_ptr);
freemsg(mp);
ipsq_current_finish(ipsq);
}
@@ -1598,12 +1093,27 @@ ill_down_start(queue_t *q, mblk_t *mp)
ipif_t *ipif;
ASSERT(IAM_WRITER_ILL(ill));
+ mutex_enter(&ill->ill_lock);
+ ill->ill_state_flags |= ILL_DOWN_IN_PROGRESS;
+ /* no more nce addition allowed */
+ mutex_exit(&ill->ill_lock);
for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
(void) ipif_down(ipif, NULL, NULL);
ill_down(ill);
+ /*
+ * Walk all CONNs that can have a reference on an ire or nce for this
+ * ill (we actually walk all that now have stale references).
+ */
+ ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ill->ill_ipst);
+
+ /* With IPv6 we have dce_ifindex. Cleanup for neatness */
+ if (ill->ill_isv6)
+ dce_cleanup(ill->ill_phyint->phyint_ifindex, ill->ill_ipst);
+
+
(void) ipsq_pending_mp_cleanup(ill, NULL);
ipsq_current_start(ill->ill_phyint->phyint_ipsq, ill->ill_ipif, 0);
@@ -1626,44 +1136,68 @@ ill_down_start(queue_t *q, mblk_t *mp)
static void
ill_down(ill_t *ill)
{
+ mblk_t *mp;
ip_stack_t *ipst = ill->ill_ipst;
- /* Blow off any IREs dependent on this ILL. */
- ire_walk(ill_downi, ill, ipst);
+ /*
+ * Blow off any IREs dependent on this ILL.
+ * The caller needs to handle conn_ixa_cleanup
+ */
+ ill_delete_ires(ill);
+
+ ire_walk_ill(0, 0, ill_downi, ill, ill);
/* Remove any conn_*_ill depending on this ill */
ipcl_walk(conn_cleanup_ill, (caddr_t)ill, ipst);
+
+ /*
+ * Free state for additional IREs.
+ */
+ mutex_enter(&ill->ill_saved_ire_lock);
+ mp = ill->ill_saved_ire_mp;
+ ill->ill_saved_ire_mp = NULL;
+ ill->ill_saved_ire_cnt = 0;
+ mutex_exit(&ill->ill_saved_ire_lock);
+ freemsg(mp);
}
/*
- * ire_walk routine used to delete every IRE that depends on queues
- * associated with 'ill'. (Always called as writer.)
+ * ire_walk routine used to delete every IRE that depends on
+ * 'ill'. (Always called as writer.)
+ *
+ * Note: since the routes added by the kernel are deleted separately,
+ * this will only be 1) IRE_IF_CLONE and 2) manually added IRE_INTERFACE.
+ *
+ * We also remove references on ire_nce_cache entries that refer to the ill.
*/
-static void
+void
ill_downi(ire_t *ire, char *ill_arg)
{
ill_t *ill = (ill_t *)ill_arg;
+ nce_t *nce;
- /*
- * Source address selection makes it possible for IRE_CACHE
- * entries to be created with ire_stq coming from interface X
- * and ipif coming from interface Y. Thus whenever interface
- * X goes down, remove all references to it by checking both
- * on ire_ipif and ire_stq.
- */
- if ((ire->ire_ipif != NULL && ire->ire_ipif->ipif_ill == ill) ||
- (ire->ire_type == IRE_CACHE && ire->ire_stq == ill->ill_wq)) {
+ mutex_enter(&ire->ire_lock);
+ nce = ire->ire_nce_cache;
+ if (nce != NULL && nce->nce_ill == ill)
+ ire->ire_nce_cache = NULL;
+ else
+ nce = NULL;
+ mutex_exit(&ire->ire_lock);
+ if (nce != NULL)
+ nce_refrele(nce);
+ if (ire->ire_ill == ill)
ire_delete(ire);
- }
}
-/*
- * Remove ire/nce from the fastpath list.
- */
+/* Remove IRE_IF_CLONE on this ill */
void
-ill_fastpath_nack(ill_t *ill)
+ill_downi_if_clone(ire_t *ire, char *ill_arg)
{
- nce_fastpath_list_dispatch(ill, NULL, NULL);
+ ill_t *ill = (ill_t *)ill_arg;
+
+ ASSERT(ire->ire_type & IRE_IF_CLONE);
+ if (ire->ire_ill == ill)
+ ire_delete(ire);
}
/* Consume an M_IOCACK of the fastpath probe. */
@@ -1685,20 +1219,11 @@ ill_fastpath_ack(ill_t *ill, mblk_t *mp)
freeb(mp1);
if (mp == NULL)
return;
- if (mp->b_cont != NULL) {
- /*
- * Update all IRE's or NCE's that are waiting for
- * fastpath update.
- */
- nce_fastpath_list_dispatch(ill, ndp_fastpath_update, mp);
- mp1 = mp->b_cont;
- freeb(mp);
- mp = mp1;
- } else {
+ if (mp->b_cont != NULL)
+ nce_fastpath_update(ill, mp);
+ else
ip0dbg(("ill_fastpath_ack: no b_cont\n"));
- }
-
- freeb(mp);
+ freemsg(mp);
}
/*
@@ -1745,6 +1270,8 @@ ill_fastpath_probe(ill_t *ill, mblk_t *dlur_mp)
ioc = (struct iocblk *)mp->b_rptr;
ioc->ioc_count = msgdsize(mp->b_cont);
+ DTRACE_PROBE3(ill__dlpi, char *, "ill_fastpath_probe",
+ char *, "DL_IOC_HDR_INFO", ill_t *, ill);
putnext(ill->ill_wq, mp);
return (0);
}
@@ -1797,8 +1324,7 @@ ill_capability_reset(ill_t *ill, boolean_t reneg)
* direct function call capabilities viz. ILL_CAPAB_DLD*
* which will be turned off by the corresponding reset functions.
*/
- ill->ill_capabilities &= ~(ILL_CAPAB_MDT | ILL_CAPAB_HCKSUM |
- ILL_CAPAB_ZEROCOPY | ILL_CAPAB_AH | ILL_CAPAB_ESP);
+ ill->ill_capabilities &= ~(ILL_CAPAB_HCKSUM | ILL_CAPAB_ZEROCOPY);
}
static void
@@ -1812,9 +1338,6 @@ ill_capability_reset_alloc(ill_t *ill)
ASSERT(IAM_WRITER_ILL(ill));
ASSERT(ill->ill_capab_reset_mp == NULL);
- if (ILL_MDT_CAPABLE(ill))
- size += sizeof (dl_capability_sub_t) + sizeof (dl_capab_mdt_t);
-
if (ILL_HCKSUM_CAPABLE(ill)) {
size += sizeof (dl_capability_sub_t) +
sizeof (dl_capab_hcksum_t);
@@ -1825,12 +1348,6 @@ ill_capability_reset_alloc(ill_t *ill)
sizeof (dl_capab_zerocopy_t);
}
- if (ill->ill_capabilities & (ILL_CAPAB_AH | ILL_CAPAB_ESP)) {
- size += sizeof (dl_capability_sub_t);
- size += ill_capability_ipsec_reset_size(ill, NULL, NULL,
- NULL, NULL);
- }
-
if (ill->ill_capabilities & ILL_CAPAB_DLD) {
size += sizeof (dl_capability_sub_t) +
sizeof (dl_capab_dld_t);
@@ -1853,10 +1370,8 @@ ill_capability_reset_alloc(ill_t *ill)
* Each handler fills in the corresponding dl_capability_sub_t
* inside the mblk,
*/
- ill_capability_mdt_reset_fill(ill, mp);
ill_capability_hcksum_reset_fill(ill, mp);
ill_capability_zerocopy_reset_fill(ill, mp);
- ill_capability_ipsec_reset_fill(ill, mp);
ill_capability_dld_reset_fill(ill, mp);
ill->ill_capab_reset_mp = mp;
@@ -1906,162 +1421,7 @@ ill_capability_id_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *outers)
}
/* Process the encapsulated sub-capability */
- ill_capability_dispatch(ill, mp, inners, B_TRUE);
-}
-
-/*
- * Process Multidata Transmit capability negotiation ack received from a
- * DLS Provider. isub must point to the sub-capability (DL_CAPAB_MDT) of a
- * DL_CAPABILITY_ACK message.
- */
-static void
-ill_capability_mdt_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
-{
- mblk_t *nmp = NULL;
- dl_capability_req_t *oc;
- dl_capab_mdt_t *mdt_ic, *mdt_oc;
- ill_mdt_capab_t **ill_mdt_capab;
- uint_t sub_dl_cap = isub->dl_cap;
- uint8_t *capend;
-
- ASSERT(sub_dl_cap == DL_CAPAB_MDT);
-
- ill_mdt_capab = (ill_mdt_capab_t **)&ill->ill_mdt_capab;
-
- /*
- * Note: range checks here are not absolutely sufficient to
- * make us robust against malformed messages sent by drivers;
- * this is in keeping with the rest of IP's dlpi handling.
- * (Remember, it's coming from something else in the kernel
- * address space)
- */
-
- capend = (uint8_t *)(isub + 1) + isub->dl_length;
- if (capend > mp->b_wptr) {
- cmn_err(CE_WARN, "ill_capability_mdt_ack: "
- "malformed sub-capability too long for mblk");
- return;
- }
-
- mdt_ic = (dl_capab_mdt_t *)(isub + 1);
-
- if (mdt_ic->mdt_version != MDT_VERSION_2) {
- cmn_err(CE_CONT, "ill_capability_mdt_ack: "
- "unsupported MDT sub-capability (version %d, expected %d)",
- mdt_ic->mdt_version, MDT_VERSION_2);
- return;
- }
-
- if (!dlcapabcheckqid(&mdt_ic->mdt_mid, ill->ill_lmod_rq)) {
- ip1dbg(("ill_capability_mdt_ack: mid token for MDT "
- "capability isn't as expected; pass-thru module(s) "
- "detected, discarding capability\n"));
- return;
- }
-
- if (mdt_ic->mdt_flags & DL_CAPAB_MDT_ENABLE) {
-
- if (*ill_mdt_capab == NULL) {
- *ill_mdt_capab = kmem_zalloc(sizeof (ill_mdt_capab_t),
- KM_NOSLEEP);
- if (*ill_mdt_capab == NULL) {
- cmn_err(CE_WARN, "ill_capability_mdt_ack: "
- "could not enable MDT version %d "
- "for %s (ENOMEM)\n", MDT_VERSION_2,
- ill->ill_name);
- return;
- }
- }
-
- ip1dbg(("ill_capability_mdt_ack: interface %s supports "
- "MDT version %d (%d bytes leading, %d bytes trailing "
- "header spaces, %d max pld bufs, %d span limit)\n",
- ill->ill_name, MDT_VERSION_2,
- mdt_ic->mdt_hdr_head, mdt_ic->mdt_hdr_tail,
- mdt_ic->mdt_max_pld, mdt_ic->mdt_span_limit));
-
- (*ill_mdt_capab)->ill_mdt_version = MDT_VERSION_2;
- (*ill_mdt_capab)->ill_mdt_on = 1;
- /*
- * Round the following values to the nearest 32-bit; ULP
- * may further adjust them to accomodate for additional
- * protocol headers. We pass these values to ULP during
- * bind time.
- */
- (*ill_mdt_capab)->ill_mdt_hdr_head =
- roundup(mdt_ic->mdt_hdr_head, 4);
- (*ill_mdt_capab)->ill_mdt_hdr_tail =
- roundup(mdt_ic->mdt_hdr_tail, 4);
- (*ill_mdt_capab)->ill_mdt_max_pld = mdt_ic->mdt_max_pld;
- (*ill_mdt_capab)->ill_mdt_span_limit = mdt_ic->mdt_span_limit;
-
- ill->ill_capabilities |= ILL_CAPAB_MDT;
- } else {
- uint_t size;
- uchar_t *rptr;
-
- size = sizeof (dl_capability_req_t) +
- sizeof (dl_capability_sub_t) + sizeof (dl_capab_mdt_t);
-
- if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) {
- cmn_err(CE_WARN, "ill_capability_mdt_ack: "
- "could not enable MDT for %s (ENOMEM)\n",
- ill->ill_name);
- return;
- }
-
- rptr = nmp->b_rptr;
- /* initialize dl_capability_req_t */
- oc = (dl_capability_req_t *)nmp->b_rptr;
- oc->dl_sub_offset = sizeof (dl_capability_req_t);
- oc->dl_sub_length = sizeof (dl_capability_sub_t) +
- sizeof (dl_capab_mdt_t);
- nmp->b_rptr += sizeof (dl_capability_req_t);
-
- /* initialize dl_capability_sub_t */
- bcopy(isub, nmp->b_rptr, sizeof (*isub));
- nmp->b_rptr += sizeof (*isub);
-
- /* initialize dl_capab_mdt_t */
- mdt_oc = (dl_capab_mdt_t *)nmp->b_rptr;
- bcopy(mdt_ic, mdt_oc, sizeof (*mdt_ic));
-
- nmp->b_rptr = rptr;
-
- ip1dbg(("ill_capability_mdt_ack: asking interface %s "
- "to enable MDT version %d\n", ill->ill_name,
- MDT_VERSION_2));
-
- /* set ENABLE flag */
- mdt_oc->mdt_flags |= DL_CAPAB_MDT_ENABLE;
-
- /* nmp points to a DL_CAPABILITY_REQ message to enable MDT */
- ill_capability_send(ill, nmp);
- }
-}
-
-static void
-ill_capability_mdt_reset_fill(ill_t *ill, mblk_t *mp)
-{
- dl_capab_mdt_t *mdt_subcap;
- dl_capability_sub_t *dl_subcap;
-
- if (!ILL_MDT_CAPABLE(ill))
- return;
-
- ASSERT(ill->ill_mdt_capab != NULL);
-
- dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
- dl_subcap->dl_cap = DL_CAPAB_MDT;
- dl_subcap->dl_length = sizeof (*mdt_subcap);
-
- mdt_subcap = (dl_capab_mdt_t *)(dl_subcap + 1);
- mdt_subcap->mdt_version = ill->ill_mdt_capab->ill_mdt_version;
- mdt_subcap->mdt_flags = 0;
- mdt_subcap->mdt_hdr_head = 0;
- mdt_subcap->mdt_hdr_tail = 0;
-
- mp->b_wptr += sizeof (*dl_subcap) + sizeof (*mdt_subcap);
+ ill_capability_dispatch(ill, mp, inners);
}
static void
@@ -2083,503 +1443,10 @@ ill_capability_dld_reset_fill(ill_t *ill, mblk_t *mp)
mp->b_wptr += sizeof (dl_capability_sub_t) + sizeof (dl_capab_dld_t);
}
-/*
- * Allocate an IPsec capability request which will be filled by our
- * caller to turn on support for one or more algorithms.
- */
-/* ARGSUSED */
-static mblk_t *
-ill_alloc_ipsec_cap_req(ill_t *ill, dl_capability_sub_t *isub)
-{
- mblk_t *nmp;
- dl_capability_req_t *ocap;
- dl_capab_ipsec_t *ocip;
- dl_capab_ipsec_t *icip;
- uint8_t *ptr;
- icip = (dl_capab_ipsec_t *)(isub + 1);
-
- /*
- * Allocate new mblk which will contain a new capability
- * request to enable the capabilities.
- */
-
- nmp = ip_dlpi_alloc(sizeof (dl_capability_req_t) +
- sizeof (dl_capability_sub_t) + isub->dl_length, DL_CAPABILITY_REQ);
- if (nmp == NULL)
- return (NULL);
-
- ptr = nmp->b_rptr;
-
- /* initialize dl_capability_req_t */
- ocap = (dl_capability_req_t *)ptr;
- ocap->dl_sub_offset = sizeof (dl_capability_req_t);
- ocap->dl_sub_length = sizeof (dl_capability_sub_t) + isub->dl_length;
- ptr += sizeof (dl_capability_req_t);
-
- /* initialize dl_capability_sub_t */
- bcopy(isub, ptr, sizeof (*isub));
- ptr += sizeof (*isub);
-
- /* initialize dl_capab_ipsec_t */
- ocip = (dl_capab_ipsec_t *)ptr;
- bcopy(icip, ocip, sizeof (*icip));
-
- nmp->b_wptr = (uchar_t *)(&ocip->cip_data[0]);
- return (nmp);
-}
-
-/*
- * Process an IPsec capability negotiation ack received from a DLS Provider.
- * isub must point to the sub-capability (DL_CAPAB_IPSEC_AH or
- * DL_CAPAB_IPSEC_ESP) of a DL_CAPABILITY_ACK message.
- */
static void
-ill_capability_ipsec_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
-{
- dl_capab_ipsec_t *icip;
- dl_capab_ipsec_alg_t *ialg; /* ptr to input alg spec. */
- dl_capab_ipsec_alg_t *oalg; /* ptr to output alg spec. */
- uint_t cipher, nciphers;
- mblk_t *nmp;
- uint_t alg_len;
- boolean_t need_sadb_dump;
- uint_t sub_dl_cap = isub->dl_cap;
- ill_ipsec_capab_t **ill_capab;
- uint64_t ill_capab_flag;
- uint8_t *capend, *ciphend;
- boolean_t sadb_resync;
-
- ASSERT(sub_dl_cap == DL_CAPAB_IPSEC_AH ||
- sub_dl_cap == DL_CAPAB_IPSEC_ESP);
-
- if (sub_dl_cap == DL_CAPAB_IPSEC_AH) {
- ill_capab = (ill_ipsec_capab_t **)&ill->ill_ipsec_capab_ah;
- ill_capab_flag = ILL_CAPAB_AH;
- } else {
- ill_capab = (ill_ipsec_capab_t **)&ill->ill_ipsec_capab_esp;
- ill_capab_flag = ILL_CAPAB_ESP;
- }
-
- /*
- * If the ill capability structure exists, then this incoming
- * DL_CAPABILITY_ACK is a response to a "renegotiation" cycle.
- * If this is so, then we'd need to resynchronize the SADB
- * after re-enabling the offloaded ciphers.
- */
- sadb_resync = (*ill_capab != NULL);
-
- /*
- * Note: range checks here are not absolutely sufficient to
- * make us robust against malformed messages sent by drivers;
- * this is in keeping with the rest of IP's dlpi handling.
- * (Remember, it's coming from something else in the kernel
- * address space)
- */
-
- capend = (uint8_t *)(isub + 1) + isub->dl_length;
- if (capend > mp->b_wptr) {
- cmn_err(CE_WARN, "ill_capability_ipsec_ack: "
- "malformed sub-capability too long for mblk");
- return;
- }
-
- /*
- * There are two types of acks we process here:
- * 1. acks in reply to a (first form) generic capability req
- * (no ENABLE flag set)
- * 2. acks in reply to a ENABLE capability req.
- * (ENABLE flag set)
- *
- * We process the subcapability passed as argument as follows:
- * 1 do initializations
- * 1.1 initialize nmp = NULL
- * 1.2 set need_sadb_dump to B_FALSE
- * 2 for each cipher in subcapability:
- * 2.1 if ENABLE flag is set:
- * 2.1.1 update per-ill ipsec capabilities info
- * 2.1.2 set need_sadb_dump to B_TRUE
- * 2.2 if ENABLE flag is not set:
- * 2.2.1 if nmp is NULL:
- * 2.2.1.1 allocate and initialize nmp
- * 2.2.1.2 init current pos in nmp
- * 2.2.2 copy current cipher to current pos in nmp
- * 2.2.3 set ENABLE flag in nmp
- * 2.2.4 update current pos
- * 3 if nmp is not equal to NULL, send enable request
- * 3.1 send capability request
- * 4 if need_sadb_dump is B_TRUE
- * 4.1 enable promiscuous on/off notifications
- * 4.2 call ill_dlpi_send(isub->dlcap) to send all
- * AH or ESP SA's to interface.
- */
-
- nmp = NULL;
- oalg = NULL;
- need_sadb_dump = B_FALSE;
- icip = (dl_capab_ipsec_t *)(isub + 1);
- ialg = (dl_capab_ipsec_alg_t *)(&icip->cip_data[0]);
-
- nciphers = icip->cip_nciphers;
- ciphend = (uint8_t *)(ialg + icip->cip_nciphers);
-
- if (ciphend > capend) {
- cmn_err(CE_WARN, "ill_capability_ipsec_ack: "
- "too many ciphers for sub-capability len");
- return;
- }
-
- for (cipher = 0; cipher < nciphers; cipher++) {
- alg_len = sizeof (dl_capab_ipsec_alg_t);
-
- if (ialg->alg_flag & DL_CAPAB_ALG_ENABLE) {
- /*
- * TBD: when we provide a way to disable capabilities
- * from above, need to manage the request-pending state
- * and fail if we were not expecting this ACK.
- */
- IPSECHW_DEBUG(IPSECHW_CAPAB,
- ("ill_capability_ipsec_ack: got ENABLE ACK\n"));
-
- /*
- * Update IPsec capabilities for this ill
- */
-
- if (*ill_capab == NULL) {
- IPSECHW_DEBUG(IPSECHW_CAPAB,
- ("ill_capability_ipsec_ack: "
- "allocating ipsec_capab for ill\n"));
- *ill_capab = ill_ipsec_capab_alloc();
-
- if (*ill_capab == NULL) {
- cmn_err(CE_WARN,
- "ill_capability_ipsec_ack: "
- "could not enable IPsec Hardware "
- "acceleration for %s (ENOMEM)\n",
- ill->ill_name);
- return;
- }
- }
-
- ASSERT(ialg->alg_type == DL_CAPAB_IPSEC_ALG_AUTH ||
- ialg->alg_type == DL_CAPAB_IPSEC_ALG_ENCR);
-
- if (ialg->alg_prim >= MAX_IPSEC_ALGS) {
- cmn_err(CE_WARN,
- "ill_capability_ipsec_ack: "
- "malformed IPsec algorithm id %d",
- ialg->alg_prim);
- continue;
- }
-
- if (ialg->alg_type == DL_CAPAB_IPSEC_ALG_AUTH) {
- IPSEC_ALG_ENABLE((*ill_capab)->auth_hw_algs,
- ialg->alg_prim);
- } else {
- ipsec_capab_algparm_t *alp;
-
- IPSEC_ALG_ENABLE((*ill_capab)->encr_hw_algs,
- ialg->alg_prim);
- if (!ill_ipsec_capab_resize_algparm(*ill_capab,
- ialg->alg_prim)) {
- cmn_err(CE_WARN,
- "ill_capability_ipsec_ack: "
- "no space for IPsec alg id %d",
- ialg->alg_prim);
- continue;
- }
- alp = &((*ill_capab)->encr_algparm[
- ialg->alg_prim]);
- alp->minkeylen = ialg->alg_minbits;
- alp->maxkeylen = ialg->alg_maxbits;
- }
- ill->ill_capabilities |= ill_capab_flag;
- /*
- * indicate that a capability was enabled, which
- * will be used below to kick off a SADB dump
- * to the ill.
- */
- need_sadb_dump = B_TRUE;
- } else {
- IPSECHW_DEBUG(IPSECHW_CAPAB,
- ("ill_capability_ipsec_ack: enabling alg 0x%x\n",
- ialg->alg_prim));
-
- if (nmp == NULL) {
- nmp = ill_alloc_ipsec_cap_req(ill, isub);
- if (nmp == NULL) {
- /*
- * Sending the PROMISC_ON/OFF
- * notification request failed.
- * We cannot enable the algorithms
- * since the Provider will not
- * notify IP of promiscous mode
- * changes, which could lead
- * to leakage of packets.
- */
- cmn_err(CE_WARN,
- "ill_capability_ipsec_ack: "
- "could not enable IPsec Hardware "
- "acceleration for %s (ENOMEM)\n",
- ill->ill_name);
- return;
- }
- /* ptr to current output alg specifier */
- oalg = (dl_capab_ipsec_alg_t *)nmp->b_wptr;
- }
-
- /*
- * Copy current alg specifier, set ENABLE
- * flag, and advance to next output alg.
- * For now we enable all IPsec capabilities.
- */
- ASSERT(oalg != NULL);
- bcopy(ialg, oalg, alg_len);
- oalg->alg_flag |= DL_CAPAB_ALG_ENABLE;
- nmp->b_wptr += alg_len;
- oalg = (dl_capab_ipsec_alg_t *)nmp->b_wptr;
- }
-
- /* move to next input algorithm specifier */
- ialg = (dl_capab_ipsec_alg_t *)
- ((char *)ialg + alg_len);
- }
-
- if (nmp != NULL)
- /*
- * nmp points to a DL_CAPABILITY_REQ message to enable
- * IPsec hardware acceleration.
- */
- ill_capability_send(ill, nmp);
-
- if (need_sadb_dump)
- /*
- * An acknowledgement corresponding to a request to
- * enable acceleration was received, notify SADB.
- */
- ill_ipsec_capab_add(ill, sub_dl_cap, sadb_resync);
-}
-
-/*
- * Given an mblk with enough space in it, create sub-capability entries for
- * DL_CAPAB_IPSEC_{AH,ESP} types which consist of previously-advertised
- * offloaded ciphers (both AUTH and ENCR) with their enable flags cleared,
- * in preparation for the reset the DL_CAPABILITY_REQ message.
- */
-static void
-ill_fill_ipsec_reset(uint_t nciphers, int stype, uint_t slen,
- ill_ipsec_capab_t *ill_cap, mblk_t *mp)
-{
- dl_capab_ipsec_t *oipsec;
- dl_capab_ipsec_alg_t *oalg;
- dl_capability_sub_t *dl_subcap;
- int i, k;
-
- ASSERT(nciphers > 0);
- ASSERT(ill_cap != NULL);
- ASSERT(mp != NULL);
- ASSERT(MBLKTAIL(mp) >= sizeof (*dl_subcap) + sizeof (*oipsec) + slen);
-
- /* dl_capability_sub_t for "stype" */
- dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
- dl_subcap->dl_cap = stype;
- dl_subcap->dl_length = sizeof (dl_capab_ipsec_t) + slen;
- mp->b_wptr += sizeof (dl_capability_sub_t);
-
- /* dl_capab_ipsec_t for "stype" */
- oipsec = (dl_capab_ipsec_t *)mp->b_wptr;
- oipsec->cip_version = 1;
- oipsec->cip_nciphers = nciphers;
- mp->b_wptr = (uchar_t *)&oipsec->cip_data[0];
-
- /* create entries for "stype" AUTH ciphers */
- for (i = 0; i < ill_cap->algs_size; i++) {
- for (k = 0; k < BITSPERBYTE; k++) {
- if ((ill_cap->auth_hw_algs[i] & (1 << k)) == 0)
- continue;
-
- oalg = (dl_capab_ipsec_alg_t *)mp->b_wptr;
- bzero((void *)oalg, sizeof (*oalg));
- oalg->alg_type = DL_CAPAB_IPSEC_ALG_AUTH;
- oalg->alg_prim = k + (BITSPERBYTE * i);
- mp->b_wptr += sizeof (dl_capab_ipsec_alg_t);
- }
- }
- /* create entries for "stype" ENCR ciphers */
- for (i = 0; i < ill_cap->algs_size; i++) {
- for (k = 0; k < BITSPERBYTE; k++) {
- if ((ill_cap->encr_hw_algs[i] & (1 << k)) == 0)
- continue;
-
- oalg = (dl_capab_ipsec_alg_t *)mp->b_wptr;
- bzero((void *)oalg, sizeof (*oalg));
- oalg->alg_type = DL_CAPAB_IPSEC_ALG_ENCR;
- oalg->alg_prim = k + (BITSPERBYTE * i);
- mp->b_wptr += sizeof (dl_capab_ipsec_alg_t);
- }
- }
-}
-
-/*
- * Macro to count number of 1s in a byte (8-bit word). The total count is
- * accumulated into the passed-in argument (sum). We could use SPARCv9's
- * POPC instruction, but our macro is more flexible for an arbitrary length
- * of bytes, such as {auth,encr}_hw_algs. These variables are currently
- * 256-bits long (MAX_IPSEC_ALGS), so if we know for sure that the length
- * stays that way, we can reduce the number of iterations required.
- */
-#define COUNT_1S(val, sum) { \
- uint8_t x = val & 0xff; \
- x = (x & 0x55) + ((x >> 1) & 0x55); \
- x = (x & 0x33) + ((x >> 2) & 0x33); \
- sum += (x & 0xf) + ((x >> 4) & 0xf); \
-}
-
-/* ARGSUSED */
-static int
-ill_capability_ipsec_reset_size(ill_t *ill, int *ah_cntp, int *ah_lenp,
- int *esp_cntp, int *esp_lenp)
-{
- ill_ipsec_capab_t *cap_ah = ill->ill_ipsec_capab_ah;
- ill_ipsec_capab_t *cap_esp = ill->ill_ipsec_capab_esp;
- uint64_t ill_capabilities = ill->ill_capabilities;
- int ah_cnt = 0, esp_cnt = 0;
- int ah_len = 0, esp_len = 0;
- int i, size = 0;
-
- if (!(ill_capabilities & (ILL_CAPAB_AH | ILL_CAPAB_ESP)))
- return (0);
-
- ASSERT(cap_ah != NULL || !(ill_capabilities & ILL_CAPAB_AH));
- ASSERT(cap_esp != NULL || !(ill_capabilities & ILL_CAPAB_ESP));
-
- /* Find out the number of ciphers for AH */
- if (cap_ah != NULL) {
- for (i = 0; i < cap_ah->algs_size; i++) {
- COUNT_1S(cap_ah->auth_hw_algs[i], ah_cnt);
- COUNT_1S(cap_ah->encr_hw_algs[i], ah_cnt);
- }
- if (ah_cnt > 0) {
- size += sizeof (dl_capability_sub_t) +
- sizeof (dl_capab_ipsec_t);
- /* dl_capab_ipsec_t contains one dl_capab_ipsec_alg_t */
- ah_len = (ah_cnt - 1) * sizeof (dl_capab_ipsec_alg_t);
- size += ah_len;
- }
- }
-
- /* Find out the number of ciphers for ESP */
- if (cap_esp != NULL) {
- for (i = 0; i < cap_esp->algs_size; i++) {
- COUNT_1S(cap_esp->auth_hw_algs[i], esp_cnt);
- COUNT_1S(cap_esp->encr_hw_algs[i], esp_cnt);
- }
- if (esp_cnt > 0) {
- size += sizeof (dl_capability_sub_t) +
- sizeof (dl_capab_ipsec_t);
- /* dl_capab_ipsec_t contains one dl_capab_ipsec_alg_t */
- esp_len = (esp_cnt - 1) * sizeof (dl_capab_ipsec_alg_t);
- size += esp_len;
- }
- }
-
- if (ah_cntp != NULL)
- *ah_cntp = ah_cnt;
- if (ah_lenp != NULL)
- *ah_lenp = ah_len;
- if (esp_cntp != NULL)
- *esp_cntp = esp_cnt;
- if (esp_lenp != NULL)
- *esp_lenp = esp_len;
-
- return (size);
-}
-
-/* ARGSUSED */
-static void
-ill_capability_ipsec_reset_fill(ill_t *ill, mblk_t *mp)
+ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp)
{
- ill_ipsec_capab_t *cap_ah = ill->ill_ipsec_capab_ah;
- ill_ipsec_capab_t *cap_esp = ill->ill_ipsec_capab_esp;
- int ah_cnt = 0, esp_cnt = 0;
- int ah_len = 0, esp_len = 0;
- int size;
-
- size = ill_capability_ipsec_reset_size(ill, &ah_cnt, &ah_len,
- &esp_cnt, &esp_len);
- if (size == 0)
- return;
-
- /*
- * Clear the capability flags for IPsec HA but retain the ill
- * capability structures since it's possible that another thread
- * is still referring to them. The structures only get deallocated
- * when we destroy the ill.
- *
- * Various places check the flags to see if the ill is capable of
- * hardware acceleration, and by clearing them we ensure that new
- * outbound IPsec packets are sent down encrypted.
- */
-
- /* Fill in DL_CAPAB_IPSEC_AH sub-capability entries */
- if (ah_cnt > 0) {
- ill_fill_ipsec_reset(ah_cnt, DL_CAPAB_IPSEC_AH, ah_len,
- cap_ah, mp);
- }
-
- /* Fill in DL_CAPAB_IPSEC_ESP sub-capability entries */
- if (esp_cnt > 0) {
- ill_fill_ipsec_reset(esp_cnt, DL_CAPAB_IPSEC_ESP, esp_len,
- cap_esp, mp);
- }
-
- /*
- * At this point we've composed a bunch of sub-capabilities to be
- * encapsulated in a DL_CAPABILITY_REQ and later sent downstream
- * by the caller. Upon receiving this reset message, the driver
- * must stop inbound decryption (by destroying all inbound SAs)
- * and let the corresponding packets come in encrypted.
- */
-}
-
-static void
-ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp,
- boolean_t encapsulated)
-{
- boolean_t legacy = B_FALSE;
-
- /*
- * Note that only the following two sub-capabilities may be
- * considered as "legacy", since their original definitions
- * do not incorporate the dl_mid_t module ID token, and hence
- * may require the use of the wrapper sub-capability.
- */
switch (subp->dl_cap) {
- case DL_CAPAB_IPSEC_AH:
- case DL_CAPAB_IPSEC_ESP:
- legacy = B_TRUE;
- break;
- }
-
- /*
- * For legacy sub-capabilities which don't incorporate a queue_t
- * pointer in their structures, discard them if we detect that
- * there are intermediate modules in between IP and the driver.
- */
- if (!encapsulated && legacy && ill->ill_lmod_cnt > 1) {
- ip1dbg(("ill_capability_dispatch: unencapsulated capab type "
- "%d discarded; %d module(s) present below IP\n",
- subp->dl_cap, ill->ill_lmod_cnt));
- return;
- }
-
- switch (subp->dl_cap) {
- case DL_CAPAB_IPSEC_AH:
- case DL_CAPAB_IPSEC_ESP:
- ill_capability_ipsec_ack(ill, mp, subp);
- break;
- case DL_CAPAB_MDT:
- ill_capability_mdt_ack(ill, mp, subp);
- break;
case DL_CAPAB_HCKSUM:
ill_capability_hcksum_ack(ill, mp, subp);
break;
@@ -3104,7 +1971,7 @@ ill_capability_lso_enable(ill_t *ill)
DLD_ENABLE)) == 0) {
ill->ill_lso_capab->ill_lso_flags = lso.lso_flags;
ill->ill_lso_capab->ill_lso_max = lso.lso_max;
- ill->ill_capabilities |= ILL_CAPAB_DLD_LSO;
+ ill->ill_capabilities |= ILL_CAPAB_LSO;
ip1dbg(("ill_capability_lso_enable: interface %s "
"has enabled LSO\n ", ill->ill_name));
} else {
@@ -3180,7 +2047,7 @@ ill_capability_dld_disable(ill_t *ill)
NULL, DLD_DISABLE);
}
- if ((ill->ill_capabilities & ILL_CAPAB_DLD_LSO) != 0) {
+ if ((ill->ill_capabilities & ILL_CAPAB_LSO) != 0) {
ASSERT(ill->ill_lso_capab != NULL);
/*
* Clear the capability flag for LSO but retain the
@@ -3189,7 +2056,7 @@ ill_capability_dld_disable(ill_t *ill)
* deallocated when we destroy the ill.
*/
- ill->ill_capabilities &= ~ILL_CAPAB_DLD_LSO;
+ ill->ill_capabilities &= ~ILL_CAPAB_LSO;
(void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO,
NULL, DLD_DISABLE);
}
@@ -3335,7 +2202,7 @@ ill_capability_ack_thr(void *arg)
ill_capability_id_ack(ill, mp, subp);
break;
default:
- ill_capability_dispatch(ill, mp, subp, B_FALSE);
+ ill_capability_dispatch(ill, mp, subp);
break;
}
}
@@ -3410,8 +2277,14 @@ ill_frag_timeout(ill_t *ill, time_t dead_interval)
uint32_t hdr_length;
mblk_t *send_icmp_head;
mblk_t *send_icmp_head_v6;
- zoneid_t zoneid;
ip_stack_t *ipst = ill->ill_ipst;
+ ip_recv_attr_t iras;
+
+ bzero(&iras, sizeof (iras));
+ iras.ira_flags = 0;
+ iras.ira_ill = iras.ira_rill = ill;
+ iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
+ iras.ira_rifindex = iras.ira_ruifindex;
ipfb = ill->ill_frag_hash_tbl;
if (ipfb == NULL)
@@ -3483,6 +2356,7 @@ ill_frag_timeout(ill_t *ill, time_t dead_interval)
}
}
BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails);
+ ip_drop_input("ipIfStatsReasmFails", ipf->ipf_mp, ill);
freeb(ipf->ipf_mp);
}
mutex_exit(&ipfb->ipfb_lock);
@@ -3496,19 +2370,21 @@ ill_frag_timeout(ill_t *ill, time_t dead_interval)
mp = send_icmp_head_v6;
send_icmp_head_v6 = send_icmp_head_v6->b_next;
mp->b_next = NULL;
- if (mp->b_datap->db_type == M_CTL)
- ip6h = (ip6_t *)mp->b_cont->b_rptr;
- else
- ip6h = (ip6_t *)mp->b_rptr;
- zoneid = ipif_lookup_addr_zoneid_v6(&ip6h->ip6_dst,
+ ip6h = (ip6_t *)mp->b_rptr;
+ iras.ira_flags = 0;
+ /*
+ * This will result in an incorrect ALL_ZONES zoneid
+ * for multicast packets, but we
+ * don't send ICMP errors for those in any case.
+ */
+ iras.ira_zoneid =
+ ipif_lookup_addr_zoneid_v6(&ip6h->ip6_dst,
ill, ipst);
- if (zoneid == ALL_ZONES) {
- freemsg(mp);
- } else {
- icmp_time_exceeded_v6(ill->ill_wq, mp,
- ICMP_REASSEMBLY_TIME_EXCEEDED, B_FALSE,
- B_FALSE, zoneid, ipst);
- }
+ ip_drop_input("ICMP_TIME_EXCEEDED reass", mp, ill);
+ icmp_time_exceeded_v6(mp,
+ ICMP_REASSEMBLY_TIME_EXCEEDED, B_FALSE,
+ &iras);
+ ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
}
while (send_icmp_head != NULL) {
ipaddr_t dst;
@@ -3517,19 +2393,20 @@ ill_frag_timeout(ill_t *ill, time_t dead_interval)
send_icmp_head = send_icmp_head->b_next;
mp->b_next = NULL;
- if (mp->b_datap->db_type == M_CTL)
- dst = ((ipha_t *)mp->b_cont->b_rptr)->ipha_dst;
- else
- dst = ((ipha_t *)mp->b_rptr)->ipha_dst;
+ dst = ((ipha_t *)mp->b_rptr)->ipha_dst;
- zoneid = ipif_lookup_addr_zoneid(dst, ill, ipst);
- if (zoneid == ALL_ZONES) {
- freemsg(mp);
- } else {
- icmp_time_exceeded(ill->ill_wq, mp,
- ICMP_REASSEMBLY_TIME_EXCEEDED, zoneid,
- ipst);
- }
+ iras.ira_flags = IRAF_IS_IPV4;
+ /*
+ * This will result in an incorrect ALL_ZONES zoneid
+ * for broadcast and multicast packets, but we
+ * don't send ICMP errors for those in any case.
+ */
+ iras.ira_zoneid = ipif_lookup_addr_zoneid(dst,
+ ill, ipst);
+ ip_drop_input("ICMP_TIME_EXCEEDED reass", mp, ill);
+ icmp_time_exceeded(mp,
+ ICMP_REASSEMBLY_TIME_EXCEEDED, &iras);
+ ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
}
}
/*
@@ -3647,8 +2524,9 @@ ill_frag_free_pkts(ill_t *ill, ipfb_t *ipfb, ipf_t *ipf, int free_cnt)
ipfb->ipfb_count -= count;
ASSERT(ipfb->ipfb_frag_pkts > 0);
ipfb->ipfb_frag_pkts--;
- freemsg(mp);
BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails);
+ ip_drop_input("ipIfStatsReasmFails", mp, ill);
+ freemsg(mp);
}
if (ipf)
@@ -3776,6 +2654,7 @@ static void
ill_set_nce_router_flags(ill_t *ill, boolean_t enable)
{
ipif_t *ipif;
+ ncec_t *ncec;
nce_t *nce;
for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
@@ -3784,16 +2663,16 @@ ill_set_nce_router_flags(ill_t *ill, boolean_t enable)
* addresses on IPMP interfaces have an nce_ill that points to
* the bound underlying ill.
*/
- nce = ndp_lookup_v6(ill, B_TRUE, &ipif->ipif_v6lcl_addr,
- B_FALSE);
+ nce = nce_lookup_v6(ill, &ipif->ipif_v6lcl_addr);
if (nce != NULL) {
- mutex_enter(&nce->nce_lock);
+ ncec = nce->nce_common;
+ mutex_enter(&ncec->ncec_lock);
if (enable)
- nce->nce_flags |= NCE_F_ISROUTER;
+ ncec->ncec_flags |= NCE_F_ISROUTER;
else
- nce->nce_flags &= ~NCE_F_ISROUTER;
- mutex_exit(&nce->nce_lock);
- NCE_REFRELE(nce);
+ ncec->ncec_flags &= ~NCE_F_ISROUTER;
+ mutex_exit(&ncec->ncec_lock);
+ nce_refrele(nce);
}
}
}
@@ -3986,8 +2865,7 @@ ill_get_ppa_ptr(char *name)
* use avl tree to locate the ill.
*/
static ill_t *
-ill_find_by_name(char *name, boolean_t isv6, queue_t *q, mblk_t *mp,
- ipsq_func_t func, int *error, ip_stack_t *ipst)
+ill_find_by_name(char *name, boolean_t isv6, ip_stack_t *ipst)
{
char *ppa_ptr = NULL;
int len;
@@ -3995,10 +2873,6 @@ ill_find_by_name(char *name, boolean_t isv6, queue_t *q, mblk_t *mp,
ill_t *ill = NULL;
ill_if_t *ifp;
int list;
- ipsq_t *ipsq;
-
- if (error != NULL)
- *error = 0;
/*
* get ppa ptr
@@ -4009,8 +2883,6 @@ ill_find_by_name(char *name, boolean_t isv6, queue_t *q, mblk_t *mp,
list = IP_V4_G_HEAD;
if ((ppa_ptr = ill_get_ppa_ptr(name)) == NULL) {
- if (error != NULL)
- *error = ENXIO;
return (NULL);
}
@@ -4038,42 +2910,19 @@ ill_find_by_name(char *name, boolean_t isv6, queue_t *q, mblk_t *mp,
/*
* Even the interface type does not exist.
*/
- if (error != NULL)
- *error = ENXIO;
return (NULL);
}
ill = avl_find(&ifp->illif_avl_by_ppa, (void *) &ppa, NULL);
if (ill != NULL) {
- /*
- * The block comment at the start of ipif_down
- * explains the use of the macros used below
- */
- GRAB_CONN_LOCK(q);
mutex_enter(&ill->ill_lock);
if (ILL_CAN_LOOKUP(ill)) {
ill_refhold_locked(ill);
mutex_exit(&ill->ill_lock);
- RELEASE_CONN_LOCK(q);
return (ill);
- } else if (ILL_CAN_WAIT(ill, q)) {
- ipsq = ill->ill_phyint->phyint_ipsq;
- mutex_enter(&ipsq->ipsq_lock);
- mutex_enter(&ipsq->ipsq_xop->ipx_lock);
- mutex_exit(&ill->ill_lock);
- ipsq_enq(ipsq, q, mp, func, NEW_OP, ill);
- mutex_exit(&ipsq->ipsq_xop->ipx_lock);
- mutex_exit(&ipsq->ipsq_lock);
- RELEASE_CONN_LOCK(q);
- if (error != NULL)
- *error = EINPROGRESS;
- return (NULL);
}
mutex_exit(&ill->ill_lock);
- RELEASE_CONN_LOCK(q);
}
- if (error != NULL)
- *error = ENXIO;
return (NULL);
}
@@ -4474,6 +3323,8 @@ ill_init(queue_t *q, ill_t *ill)
* ip_open(), before we reach here.
*/
mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, 0);
+ mutex_init(&ill->ill_saved_ire_lock, NULL, MUTEX_DEFAULT, NULL);
+ ill->ill_saved_ire_cnt = 0;
ill->ill_rq = q;
ill->ill_wq = WR(q);
@@ -4521,7 +3372,9 @@ ill_init(queue_t *q, ill_t *ill)
*/
ill->ill_phyint->phyint_illv4 = ill;
ill->ill_ppa = UINT_MAX;
- ill->ill_fastpath_list = &ill->ill_fastpath_list;
+ list_create(&ill->ill_nce, sizeof (nce_t), offsetof(nce_t, nce_node));
+
+ ill_set_inputfn(ill);
if (!ipsq_init(ill, B_TRUE)) {
freemsg(info_mp);
@@ -4536,6 +3389,8 @@ ill_init(queue_t *q, ill_t *ill)
ill->ill_frag_count = 0;
ill->ill_ipf_gen = 0;
+ rw_init(&ill->ill_mcast_lock, NULL, RW_DEFAULT, NULL);
+ mutex_init(&ill->ill_mcast_serializer, NULL, MUTEX_DEFAULT, NULL);
ill->ill_global_timer = INFINITY;
ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0;
ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0;
@@ -4550,7 +3405,6 @@ ill_init(queue_t *q, ill_t *ill)
* IPv6.
*/
ill->ill_reachable_time = ND_REACHABLE_TIME;
- ill->ill_reachable_retrans_time = ND_RETRANS_TIMER;
ill->ill_xmit_count = ND_MAX_MULTICAST_SOLICIT;
ill->ill_max_buf = ND_MAX_Q;
ill->ill_refcnt = 0;
@@ -4574,15 +3428,14 @@ ill_init(queue_t *q, ill_t *ill)
* creates datalink socket info from the device.
*/
int
-ill_dls_info(struct sockaddr_dl *sdl, const ipif_t *ipif)
+ill_dls_info(struct sockaddr_dl *sdl, const ill_t *ill)
{
size_t len;
- ill_t *ill = ipif->ipif_ill;
sdl->sdl_family = AF_LINK;
- sdl->sdl_index = ill->ill_phyint->phyint_ifindex;
+ sdl->sdl_index = ill_get_upper_ifindex(ill);
sdl->sdl_type = ill->ill_type;
- ipif_get_name(ipif, sdl->sdl_data, sizeof (sdl->sdl_data));
+ ill_get_name(ill, sdl->sdl_data, sizeof (sdl->sdl_data));
len = strlen(sdl->sdl_data);
ASSERT(len < 256);
sdl->sdl_nlen = (uchar_t)len;
@@ -4604,7 +3457,7 @@ ill_xarp_info(struct sockaddr_dl *sdl, ill_t *ill)
sdl->sdl_family = AF_LINK;
sdl->sdl_index = ill->ill_phyint->phyint_ifindex;
sdl->sdl_type = ill->ill_type;
- ipif_get_name(ill->ill_ipif, sdl->sdl_data, sizeof (sdl->sdl_data));
+ ill_get_name(ill, sdl->sdl_data, sizeof (sdl->sdl_data));
sdl->sdl_nlen = (uchar_t)mi_strlen(sdl->sdl_data);
sdl->sdl_alen = ill->ill_phys_addr_length;
sdl->sdl_slen = 0;
@@ -4646,7 +3499,7 @@ loopback_kstat_update(kstat_t *ksp, int rw)
/*
* Has ifindex been plumbed already?
*/
-boolean_t
+static boolean_t
phyint_exists(uint_t index, ip_stack_t *ipst)
{
ASSERT(index != 0);
@@ -4749,8 +3602,7 @@ phyint_flags_init(phyint_t *phyi, t_uscalar_t mactype)
*/
ill_t *
ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6,
- queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, boolean_t *did_alloc,
- ip_stack_t *ipst)
+ boolean_t *did_alloc, ip_stack_t *ipst)
{
ill_t *ill;
ipif_t *ipif;
@@ -4762,9 +3614,9 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6,
isloopback = mi_strcmp(name, ipif_loopback_name) == 0;
rw_enter(&ipst->ips_ill_g_lock, RW_READER);
- ill = ill_find_by_name(name, isv6, q, mp, func, error, ipst);
+ ill = ill_find_by_name(name, isv6, ipst);
rw_exit(&ipst->ips_ill_g_lock);
- if (ill != NULL || (error != NULL && *error == EINPROGRESS))
+ if (ill != NULL)
return (ill);
/*
@@ -4775,9 +3627,8 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6,
return (NULL);
rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
-
- ill = ill_find_by_name(name, isv6, q, mp, func, error, ipst);
- if (ill != NULL || (error != NULL && *error == EINPROGRESS)) {
+ ill = ill_find_by_name(name, isv6, ipst);
+ if (ill != NULL) {
rw_exit(&ipst->ips_ill_g_lock);
return (ill);
}
@@ -4791,6 +3642,7 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6,
*ill = ill_null;
mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, NULL);
ill->ill_ipst = ipst;
+ list_create(&ill->ill_nce, sizeof (nce_t), offsetof(nce_t, nce_node));
netstack_hold(ipst->ips_netstack);
/*
* For exclusive stacks we set the zoneid to zero
@@ -4809,17 +3661,16 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6,
mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0);
phyint_flags_init(ill->ill_phyint, DL_LOOP);
- ill->ill_max_frag = IP_LOOPBACK_MTU;
- /* Add room for tcp+ip headers */
if (isv6) {
ill->ill_isv6 = B_TRUE;
- ill->ill_max_frag += IPV6_HDR_LEN + 20; /* for TCP */
+ ill->ill_max_frag = ip_loopback_mtu_v6plus;
} else {
- ill->ill_max_frag += IP_SIMPLE_HDR_LENGTH + 20;
+ ill->ill_max_frag = ip_loopback_mtuplus;
}
if (!ill_allocate_mibs(ill))
goto done;
- ill->ill_max_mtu = ill->ill_max_frag;
+ ill->ill_current_frag = ill->ill_max_frag;
+ ill->ill_mtu = ill->ill_max_frag; /* Initial value */
/*
* ipif_loopback_name can't be pointed at directly because its used
* by both the ipv4 and ipv6 interfaces. When the ill is removed
@@ -4832,6 +3683,8 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6,
/* Set ill_dlpi_pending for ipsq_current_finish() to work properly */
ill->ill_dlpi_pending = DL_PRIM_INVAL;
+ rw_init(&ill->ill_mcast_lock, NULL, RW_DEFAULT, NULL);
+ mutex_init(&ill->ill_mcast_serializer, NULL, MUTEX_DEFAULT, NULL);
ill->ill_global_timer = INFINITY;
ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0;
ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0;
@@ -4857,14 +3710,12 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6,
ipaddr_t inaddr_loopback = htonl(INADDR_LOOPBACK);
IN6_IPADDR_TO_V4MAPPED(inaddr_loopback, &ipif->ipif_v6lcl_addr);
- ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr;
V4MASK_TO_V6(htonl(IN_CLASSA_NET), ipif->ipif_v6net_mask);
V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
ipif->ipif_v6subnet);
ill->ill_flags |= ILLF_IPV4;
} else {
ipif->ipif_v6lcl_addr = ipv6_loopback;
- ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr;
ipif->ipif_v6net_mask = ipv6_all_ones;
V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
ipif->ipif_v6subnet);
@@ -4884,6 +3735,8 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6,
ipsq = ill->ill_phyint->phyint_ipsq;
+ ill_set_inputfn(ill);
+
if (ill_glist_insert(ill, "lo", isv6) != 0)
cmn_err(CE_PANIC, "cannot insert loopback interface");
@@ -4924,8 +3777,6 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6,
}
}
- if (error != NULL)
- *error = 0;
*did_alloc = B_TRUE;
rw_exit(&ipst->ips_ill_g_lock);
ill_nic_event_dispatch(ill, MAP_IPIF_ID(ill->ill_ipif->ipif_id),
@@ -4947,8 +3798,6 @@ done:
mi_free(ill);
}
rw_exit(&ipst->ips_ill_g_lock);
- if (error != NULL)
- *error = ENOMEM;
return (NULL);
}
@@ -4956,8 +3805,7 @@ done:
* For IPP calls - use the ip_stack_t for global stack.
*/
ill_t *
-ill_lookup_on_ifindex_global_instance(uint_t index, boolean_t isv6,
- queue_t *q, mblk_t *mp, ipsq_func_t func, int *err)
+ill_lookup_on_ifindex_global_instance(uint_t index, boolean_t isv6)
{
ip_stack_t *ipst;
ill_t *ill;
@@ -4968,7 +3816,7 @@ ill_lookup_on_ifindex_global_instance(uint_t index, boolean_t isv6,
return (NULL);
}
- ill = ill_lookup_on_ifindex(index, isv6, q, mp, func, err, ipst);
+ ill = ill_lookup_on_ifindex(index, isv6, ipst);
netstack_rele(ipst->ips_netstack);
return (ill);
}
@@ -4977,19 +3825,11 @@ ill_lookup_on_ifindex_global_instance(uint_t index, boolean_t isv6,
* Return a pointer to the ill which matches the index and IP version type.
*/
ill_t *
-ill_lookup_on_ifindex(uint_t index, boolean_t isv6, queue_t *q, mblk_t *mp,
- ipsq_func_t func, int *err, ip_stack_t *ipst)
+ill_lookup_on_ifindex(uint_t index, boolean_t isv6, ip_stack_t *ipst)
{
ill_t *ill;
- ipsq_t *ipsq;
phyint_t *phyi;
- ASSERT((q == NULL && mp == NULL && func == NULL && err == NULL) ||
- (q != NULL && mp != NULL && func != NULL && err != NULL));
-
- if (err != NULL)
- *err = 0;
-
/*
* Indexes are stored in the phyint - a common structure
* to both IPv4 and IPv6.
@@ -5000,43 +3840,45 @@ ill_lookup_on_ifindex(uint_t index, boolean_t isv6, queue_t *q, mblk_t *mp,
if (phyi != NULL) {
ill = isv6 ? phyi->phyint_illv6: phyi->phyint_illv4;
if (ill != NULL) {
- /*
- * The block comment at the start of ipif_down
- * explains the use of the macros used below
- */
- GRAB_CONN_LOCK(q);
mutex_enter(&ill->ill_lock);
- if (ILL_CAN_LOOKUP(ill)) {
+ if (!ILL_IS_CONDEMNED(ill)) {
ill_refhold_locked(ill);
mutex_exit(&ill->ill_lock);
- RELEASE_CONN_LOCK(q);
rw_exit(&ipst->ips_ill_g_lock);
return (ill);
- } else if (ILL_CAN_WAIT(ill, q)) {
- ipsq = ill->ill_phyint->phyint_ipsq;
- mutex_enter(&ipsq->ipsq_lock);
- mutex_enter(&ipsq->ipsq_xop->ipx_lock);
- rw_exit(&ipst->ips_ill_g_lock);
- mutex_exit(&ill->ill_lock);
- ipsq_enq(ipsq, q, mp, func, NEW_OP, ill);
- mutex_exit(&ipsq->ipsq_xop->ipx_lock);
- mutex_exit(&ipsq->ipsq_lock);
- RELEASE_CONN_LOCK(q);
- if (err != NULL)
- *err = EINPROGRESS;
- return (NULL);
}
- RELEASE_CONN_LOCK(q);
mutex_exit(&ill->ill_lock);
}
}
rw_exit(&ipst->ips_ill_g_lock);
- if (err != NULL)
- *err = ENXIO;
return (NULL);
}
/*
+ * Verify whether or not an interface index is valid.
+ * It can be zero (meaning "reset") or an interface index assigned
+ * to a non-VNI interface. (We don't use VNI interface to send packets.)
+ */
+boolean_t
+ip_ifindex_valid(uint_t ifindex, boolean_t isv6, ip_stack_t *ipst)
+{
+ ill_t *ill;
+
+ if (ifindex == 0)
+ return (B_TRUE);
+
+ ill = ill_lookup_on_ifindex(ifindex, isv6, ipst);
+ if (ill == NULL)
+ return (B_FALSE);
+ if (IS_VNI(ill)) {
+ ill_refrele(ill);
+ return (B_FALSE);
+ }
+ ill_refrele(ill);
+ return (B_TRUE);
+}
+
+/*
* Return the ifindex next in sequence after the passed in ifindex.
* If there is no next ifindex for the given protocol, return 0.
*/
@@ -5118,6 +3960,20 @@ ill_get_ifindex_by_name(char *name, ip_stack_t *ipst)
}
/*
+ * Return the ifindex to be used by upper layer protocols for instance
+ * for IPV6_RECVPKTINFO. If IPMP this is the one for the upper ill.
+ */
+uint_t
+ill_get_upper_ifindex(const ill_t *ill)
+{
+ if (IS_UNDER_IPMP(ill))
+ return (ipmp_ill_get_ipmp_ifindex(ill));
+ else
+ return (ill->ill_phyint->phyint_ifindex);
+}
+
+
+/*
* Obtain a reference to the ill. The ill_refcnt is a dynamic refcnt
* that gives a running thread a reference to the ill. This reference must be
* released by the thread when it is done accessing the ill and related
@@ -5145,17 +4001,18 @@ ill_refhold_locked(ill_t *ill)
ILL_TRACE_REF(ill);
}
-int
+/* Returns true if we managed to get a refhold */
+boolean_t
ill_check_and_refhold(ill_t *ill)
{
mutex_enter(&ill->ill_lock);
- if (ILL_CAN_LOOKUP(ill)) {
+ if (!ILL_IS_CONDEMNED(ill)) {
ill_refhold_locked(ill);
mutex_exit(&ill->ill_lock);
- return (0);
+ return (B_TRUE);
}
mutex_exit(&ill->ill_lock);
- return (ILL_LOOKUP_FAILED);
+ return (B_FALSE);
}
/*
@@ -5234,8 +4091,8 @@ ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp)
ASSERT(IAM_WRITER_ILL(ill));
/*
- * Till the ill is fully up ILL_CHANGING will be set and
- * the ill is not globally visible. So no need for a lock.
+ * Till the ill is fully up the ill is not globally visible.
+ * So no need for a lock.
*/
dlia = (dl_info_ack_t *)mp->b_rptr;
ill->ill_mactype = dlia->dl_mac_type;
@@ -5279,8 +4136,9 @@ ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp)
* IP will fly apart otherwise.
*/
min_mtu = ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU;
- ill->ill_max_frag = MAX(min_mtu, dlia->dl_max_sdu);
- ill->ill_max_mtu = ill->ill_max_frag;
+ ill->ill_max_frag = MAX(min_mtu, dlia->dl_max_sdu);
+ ill->ill_current_frag = ill->ill_max_frag;
+ ill->ill_mtu = ill->ill_max_frag;
ill->ill_type = ipm->ip_m_type;
@@ -5320,14 +4178,6 @@ ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp)
*/
ill->ill_sap = (ill->ill_isv6) ? ipm->ip_m_ipv6sap : ipm->ip_m_ipv4sap;
/*
- * Set ipif_mtu which is used to set the IRE's
- * ire_max_frag value. The driver could have sent
- * a different mtu from what it sent last time. No
- * need to call ipif_mtu_change because IREs have
- * not yet been created.
- */
- ill->ill_ipif->ipif_mtu = ill->ill_max_mtu;
- /*
* Clear all the flags that were set based on ill_bcast_addr_length
* and ill_phys_addr_length (in ipif_set_values) as these could have
* changed now and we need to re-evaluate.
@@ -5336,8 +4186,7 @@ ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp)
ill->ill_ipif->ipif_flags &= ~(IPIF_BROADCAST | IPIF_POINTOPOINT);
/*
- * Free ill_resolver_mp and ill_bcast_mp as things could have
- * changed now.
+ * Free ill_bcast_mp as things could have changed now.
*
* NOTE: The IPMP meta-interface is special-cased because it starts
* with no underlying interfaces (and thus an unknown broadcast
@@ -5345,19 +4194,14 @@ ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp)
* capable as part of allowing it to join a group.
*/
if (ill->ill_bcast_addr_length == 0 && !IS_IPMP(ill)) {
- if (ill->ill_resolver_mp != NULL)
- freemsg(ill->ill_resolver_mp);
if (ill->ill_bcast_mp != NULL)
freemsg(ill->ill_bcast_mp);
- if (ill->ill_flags & ILLF_XRESOLV)
- ill->ill_net_type = IRE_IF_RESOLVER;
- else
- ill->ill_net_type = IRE_IF_NORESOLVER;
- ill->ill_resolver_mp = ill_dlur_gen(NULL,
+ ill->ill_net_type = IRE_IF_NORESOLVER;
+
+ ill->ill_bcast_mp = ill_dlur_gen(NULL,
ill->ill_phys_addr_length,
ill->ill_sap,
ill->ill_sap_length);
- ill->ill_bcast_mp = copymsg(ill->ill_resolver_mp);
if (ill->ill_isv6)
/*
@@ -5520,7 +4364,7 @@ ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif, boolean_t isv6)
* 3b. link local, but deprecated
* 4. loopback.
*/
-ipif_t *
+static ipif_t *
ipif_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6)
{
ill_t *ill;
@@ -5537,7 +4381,8 @@ ipif_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6)
for (; ill != NULL; ill = ill_next(&ctx, ill)) {
mutex_enter(&ill->ill_lock);
- if (IS_VNI(ill) || IS_UNDER_IPMP(ill) || !ILL_CAN_LOOKUP(ill) ||
+ if (IS_VNI(ill) || IS_UNDER_IPMP(ill) ||
+ ILL_IS_CONDEMNED(ill) ||
!(ill->ill_flags & ILLF_MULTICAST)) {
mutex_exit(&ill->ill_lock);
continue;
@@ -5550,7 +4395,7 @@ ipif_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6)
continue;
}
if (!(ipif->ipif_flags & IPIF_UP) ||
- !IPIF_CAN_LOOKUP(ipif)) {
+ IPIF_IS_CONDEMNED(ipif)) {
continue;
}
@@ -5618,6 +4463,22 @@ ipif_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6)
}
}
+ill_t *
+ill_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6)
+{
+ ipif_t *ipif;
+ ill_t *ill;
+
+ ipif = ipif_lookup_multicast(ipst, zoneid, isv6);
+ if (ipif == NULL)
+ return (NULL);
+
+ ill = ipif->ipif_ill;
+ ill_refhold(ill);
+ ipif_refrele(ipif);
+ return (ill);
+}
+
/*
* This function is called when an application does not specify an interface
* to be used for multicast traffic (joining a group/sending data). It
@@ -5629,22 +4490,21 @@ ipif_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6)
* anything in between. If there is no such multicast route, we just find
* any multicast capable interface and return it. The returned ipif
* is refhold'ed.
+ *
+ * We support MULTIRT and RTF_SETSRC on the multicast routes added to the
+ * unicast table. This is used by CGTP.
*/
-ipif_t *
-ipif_lookup_group(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst)
+ill_t *
+ill_lookup_group_v4(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst,
+ boolean_t *multirtp, ipaddr_t *setsrcp)
{
- ire_t *ire;
- ipif_t *ipif;
+ ill_t *ill;
- ire = ire_lookup_multi(group, zoneid, ipst);
- if (ire != NULL) {
- ipif = ire->ire_ipif;
- ipif_refhold(ipif);
- ire_refrele(ire);
- return (ipif);
- }
+ ill = ire_lookup_multi_ill_v4(group, zoneid, ipst, multirtp, setsrcp);
+ if (ill != NULL)
+ return (ill);
- return (ipif_lookup_multicast(ipst, zoneid, B_FALSE));
+ return (ill_lookup_multicast(ipst, zoneid, B_FALSE));
}
/*
@@ -5652,16 +4512,11 @@ ipif_lookup_group(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst)
* The destination address is used only for matching point-to-point interfaces.
*/
ipif_t *
-ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, queue_t *q, mblk_t *mp,
- ipsq_func_t func, int *error, ip_stack_t *ipst)
+ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, ip_stack_t *ipst)
{
ipif_t *ipif;
ill_t *ill;
ill_walk_context_t ctx;
- ipsq_t *ipsq;
-
- if (error != NULL)
- *error = 0;
/*
* First match all the point-to-point interfaces
@@ -5672,7 +4527,6 @@ ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, queue_t *q, mblk_t *mp,
rw_enter(&ipst->ips_ill_g_lock, RW_READER);
ill = ILL_START_WALK_V4(&ctx, ipst);
for (; ill != NULL; ill = ill_next(&ctx, ill)) {
- GRAB_CONN_LOCK(q);
mutex_enter(&ill->ill_lock);
for (ipif = ill->ill_ipif; ipif != NULL;
ipif = ipif->ipif_next) {
@@ -5680,41 +4534,20 @@ ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, queue_t *q, mblk_t *mp,
if ((ipif->ipif_flags & IPIF_POINTOPOINT) &&
(ipif->ipif_lcl_addr == if_addr) &&
(ipif->ipif_pp_dst_addr == dst)) {
- /*
- * The block comment at the start of ipif_down
- * explains the use of the macros used below
- */
- if (IPIF_CAN_LOOKUP(ipif)) {
+ if (!IPIF_IS_CONDEMNED(ipif)) {
ipif_refhold_locked(ipif);
mutex_exit(&ill->ill_lock);
- RELEASE_CONN_LOCK(q);
rw_exit(&ipst->ips_ill_g_lock);
return (ipif);
- } else if (IPIF_CAN_WAIT(ipif, q)) {
- ipsq = ill->ill_phyint->phyint_ipsq;
- mutex_enter(&ipsq->ipsq_lock);
- mutex_enter(&ipsq->ipsq_xop->ipx_lock);
- mutex_exit(&ill->ill_lock);
- rw_exit(&ipst->ips_ill_g_lock);
- ipsq_enq(ipsq, q, mp, func, NEW_OP,
- ill);
- mutex_exit(&ipsq->ipsq_xop->ipx_lock);
- mutex_exit(&ipsq->ipsq_lock);
- RELEASE_CONN_LOCK(q);
- if (error != NULL)
- *error = EINPROGRESS;
- return (NULL);
}
}
}
mutex_exit(&ill->ill_lock);
- RELEASE_CONN_LOCK(q);
}
rw_exit(&ipst->ips_ill_g_lock);
/* lookup the ipif based on interface address */
- ipif = ipif_lookup_addr(if_addr, NULL, ALL_ZONES, q, mp, func, error,
- ipst);
+ ipif = ipif_lookup_addr(if_addr, NULL, ALL_ZONES, ipst);
ASSERT(ipif == NULL || !ipif->ipif_isv6);
return (ipif);
}
@@ -5723,18 +4556,15 @@ ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, queue_t *q, mblk_t *mp,
* Common function for ipif_lookup_addr() and ipif_lookup_addr_exact().
*/
static ipif_t *
-ipif_lookup_addr_common(ipaddr_t addr, ill_t *match_ill, boolean_t match_illgrp,
- zoneid_t zoneid, queue_t *q, mblk_t *mp, ipsq_func_t func, int *error,
- ip_stack_t *ipst)
+ipif_lookup_addr_common(ipaddr_t addr, ill_t *match_ill, uint32_t match_flags,
+ zoneid_t zoneid, ip_stack_t *ipst)
{
ipif_t *ipif;
ill_t *ill;
boolean_t ptp = B_FALSE;
- ipsq_t *ipsq;
ill_walk_context_t ctx;
-
- if (error != NULL)
- *error = 0;
+ boolean_t match_illgrp = (match_flags & IPIF_MATCH_ILLGRP);
+ boolean_t no_duplicate = (match_flags & IPIF_MATCH_NONDUP);
rw_enter(&ipst->ips_ill_g_lock, RW_READER);
/*
@@ -5748,7 +4578,6 @@ repeat:
(!match_illgrp || !IS_IN_SAME_ILLGRP(ill, match_ill))) {
continue;
}
- GRAB_CONN_LOCK(q);
mutex_enter(&ill->ill_lock);
for (ipif = ill->ill_ipif; ipif != NULL;
ipif = ipif->ipif_next) {
@@ -5756,47 +4585,29 @@ repeat:
zoneid != ipif->ipif_zoneid &&
ipif->ipif_zoneid != ALL_ZONES)
continue;
+
+ if (no_duplicate && !(ipif->ipif_flags & IPIF_UP))
+ continue;
+
/* Allow the ipif to be down */
if ((!ptp && (ipif->ipif_lcl_addr == addr) &&
((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) ||
(ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) &&
(ipif->ipif_pp_dst_addr == addr))) {
- /*
- * The block comment at the start of ipif_down
- * explains the use of the macros used below
- */
- if (IPIF_CAN_LOOKUP(ipif)) {
+ if (!IPIF_IS_CONDEMNED(ipif)) {
ipif_refhold_locked(ipif);
mutex_exit(&ill->ill_lock);
- RELEASE_CONN_LOCK(q);
rw_exit(&ipst->ips_ill_g_lock);
return (ipif);
- } else if (IPIF_CAN_WAIT(ipif, q)) {
- ipsq = ill->ill_phyint->phyint_ipsq;
- mutex_enter(&ipsq->ipsq_lock);
- mutex_enter(&ipsq->ipsq_xop->ipx_lock);
- mutex_exit(&ill->ill_lock);
- rw_exit(&ipst->ips_ill_g_lock);
- ipsq_enq(ipsq, q, mp, func, NEW_OP,
- ill);
- mutex_exit(&ipsq->ipsq_xop->ipx_lock);
- mutex_exit(&ipsq->ipsq_lock);
- RELEASE_CONN_LOCK(q);
- if (error != NULL)
- *error = EINPROGRESS;
- return (NULL);
}
}
}
mutex_exit(&ill->ill_lock);
- RELEASE_CONN_LOCK(q);
}
/* If we already did the ptp case, then we are done */
if (ptp) {
rw_exit(&ipst->ips_ill_g_lock);
- if (error != NULL)
- *error = ENXIO;
return (NULL);
}
ptp = B_TRUE;
@@ -5804,55 +4615,6 @@ repeat:
}
/*
- * Check if the address exists in the system.
- * We don't hold the conn_lock as we will not perform defered ipsqueue
- * operation.
- */
-boolean_t
-ip_addr_exists(ipaddr_t addr, zoneid_t zoneid, ip_stack_t *ipst)
-{
- ipif_t *ipif;
- ill_t *ill;
- ill_walk_context_t ctx;
-
- rw_enter(&ipst->ips_ill_g_lock, RW_READER);
-
- ill = ILL_START_WALK_V4(&ctx, ipst);
- for (; ill != NULL; ill = ill_next(&ctx, ill)) {
- mutex_enter(&ill->ill_lock);
- for (ipif = ill->ill_ipif; ipif != NULL;
- ipif = ipif->ipif_next) {
- if (zoneid != ALL_ZONES &&
- zoneid != ipif->ipif_zoneid &&
- ipif->ipif_zoneid != ALL_ZONES)
- continue;
- /* Allow the ipif to be down */
- /*
- * XXX Different from ipif_lookup_addr(), we don't do
- * twice lookups. As from bind()'s point of view, we
- * may return once we find a match.
- */
- if (((ipif->ipif_lcl_addr == addr) &&
- ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) ||
- ((ipif->ipif_flags & IPIF_POINTOPOINT) &&
- (ipif->ipif_pp_dst_addr == addr))) {
- /*
- * Allow bind() to be successful even if the
- * ipif is with IPIF_CHANGING bit set.
- */
- mutex_exit(&ill->ill_lock);
- rw_exit(&ipst->ips_ill_g_lock);
- return (B_TRUE);
- }
- }
- mutex_exit(&ill->ill_lock);
- }
-
- rw_exit(&ipst->ips_ill_g_lock);
- return (B_FALSE);
-}
-
-/*
* Lookup an ipif with the specified address. For point-to-point links we
* look for matches on either the destination address or the local address,
* but we skip the local address check if IPIF_UNNUMBERED is set. If the
@@ -5860,11 +4622,25 @@ ip_addr_exists(ipaddr_t addr, zoneid_t zoneid, ip_stack_t *ipst)
* (or illgrp if `match_ill' is in an IPMP group).
*/
ipif_t *
-ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, queue_t *q,
- mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst)
+ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid,
+ ip_stack_t *ipst)
+{
+ return (ipif_lookup_addr_common(addr, match_ill, IPIF_MATCH_ILLGRP,
+ zoneid, ipst));
+}
+
+/*
+ * Lookup an ipif with the specified address. Similar to ipif_lookup_addr,
+ * except that we will only return an address if it is not marked as
+ * IPIF_DUPLICATE
+ */
+ipif_t *
+ipif_lookup_addr_nondup(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid,
+ ip_stack_t *ipst)
{
- return (ipif_lookup_addr_common(addr, match_ill, B_TRUE, zoneid, q, mp,
- func, error, ipst));
+ return (ipif_lookup_addr_common(addr, match_ill,
+ (IPIF_MATCH_ILLGRP | IPIF_MATCH_NONDUP),
+ zoneid, ipst));
}
/*
@@ -5872,12 +4648,12 @@ ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, queue_t *q,
* `match_ill' across the IPMP group. This function is only needed in some
* corner-cases; almost everything should use ipif_lookup_addr().
*/
-static ipif_t *
+ipif_t *
ipif_lookup_addr_exact(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst)
{
ASSERT(match_ill != NULL);
- return (ipif_lookup_addr_common(addr, match_ill, B_FALSE, ALL_ZONES,
- NULL, NULL, NULL, NULL, ipst));
+ return (ipif_lookup_addr_common(addr, match_ill, 0, ALL_ZONES,
+ ipst));
}
/*
@@ -5951,13 +4727,13 @@ repeat:
* IRE lookup and pick the first ipif corresponding to the source address in the
* ire.
* Returns: held ipif
+ *
+ * This is only used for ICMP_ADDRESS_MASK_REQUESTs
*/
ipif_t *
ipif_lookup_remote(ill_t *ill, ipaddr_t addr, zoneid_t zoneid)
{
ipif_t *ipif;
- ire_t *ire;
- ip_stack_t *ipst = ill->ill_ipst;
ASSERT(!ill->ill_isv6);
@@ -5970,7 +4746,7 @@ ipif_lookup_remote(ill_t *ill, ipaddr_t addr, zoneid_t zoneid)
*/
mutex_enter(&ill->ill_lock);
for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
- if (!IPIF_CAN_LOOKUP(ipif))
+ if (IPIF_IS_CONDEMNED(ipif))
continue;
if (zoneid != ALL_ZONES && zoneid != ipif->ipif_zoneid &&
ipif->ipif_zoneid != ALL_ZONES)
@@ -5991,24 +4767,11 @@ ipif_lookup_remote(ill_t *ill, ipaddr_t addr, zoneid_t zoneid)
}
}
mutex_exit(&ill->ill_lock);
- ire = ire_route_lookup(addr, 0, 0, 0, NULL, NULL, zoneid,
- NULL, MATCH_IRE_RECURSIVE, ipst);
- if (ire != NULL) {
- /*
- * The callers of this function wants to know the
- * interface on which they have to send the replies
- * back. For IREs that have ire_stq and ire_ipif
- * derived from different ills, we really don't care
- * what we return here.
- */
- ipif = ire->ire_ipif;
- if (ipif != NULL) {
- ipif_refhold(ipif);
- ire_refrele(ire);
- return (ipif);
- }
- ire_refrele(ire);
- }
+ /*
+ * For a remote destination it isn't possible to nail down a particular
+ * ipif.
+ */
+
/* Pick the first interface */
ipif = ipif_get_next_ipif(NULL, ill);
return (ipif);
@@ -6027,9 +4790,8 @@ ill_is_quiescent(ill_t *ill)
ASSERT(MUTEX_HELD(&ill->ill_lock));
for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
- if (ipif->ipif_refcnt != 0 || !IPIF_DOWN_OK(ipif)) {
+ if (ipif->ipif_refcnt != 0)
return (B_FALSE);
- }
}
if (!ILL_DOWN_OK(ill) || ill->ill_refcnt != 0) {
return (B_FALSE);
@@ -6045,7 +4807,7 @@ ill_is_freeable(ill_t *ill)
ASSERT(MUTEX_HELD(&ill->ill_lock));
for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
- if (ipif->ipif_refcnt != 0 || !IPIF_FREE_OK(ipif)) {
+ if (ipif->ipif_refcnt != 0) {
return (B_FALSE);
}
}
@@ -6067,9 +4829,8 @@ ipif_is_quiescent(ipif_t *ipif)
ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
- if (ipif->ipif_refcnt != 0 || !IPIF_DOWN_OK(ipif)) {
+ if (ipif->ipif_refcnt != 0)
return (B_FALSE);
- }
ill = ipif->ipif_ill;
if (ill->ill_ipif_up_count != 0 || ill->ill_ipif_dup_count != 0 ||
@@ -6078,7 +4839,7 @@ ipif_is_quiescent(ipif_t *ipif)
}
/* This is the last ipif going down or being deleted on this ill */
- if (!ILL_DOWN_OK(ill) || ill->ill_refcnt != 0) {
+ if (ill->ill_ire_cnt != 0 || ill->ill_refcnt != 0) {
return (B_FALSE);
}
@@ -6087,14 +4848,14 @@ ipif_is_quiescent(ipif_t *ipif)
/*
* return true if the ipif can be destroyed: the ipif has to be quiescent
- * with zero references from ire/nce/ilm to it.
+ * with zero references from ire/ilm to it.
*/
static boolean_t
ipif_is_freeable(ipif_t *ipif)
{
ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
ASSERT(ipif->ipif_id != 0);
- return (ipif->ipif_refcnt == 0 && IPIF_FREE_OK(ipif));
+ return (ipif->ipif_refcnt == 0);
}
/*
@@ -6275,7 +5036,7 @@ th_trace_gethash(ip_stack_t *ipst)
* block.
*/
objsize = MAX(MAX(sizeof (ill_t), sizeof (ipif_t)),
- MAX(sizeof (ire_t), sizeof (nce_t)));
+ MAX(sizeof (ire_t), sizeof (ncec_t)));
rshift = highbit(objsize);
mh = mod_hash_create_extended(name, 64, mod_hash_null_keydtor,
th_trace_free, mod_hash_byptr, (void *)rshift,
@@ -6509,7 +5270,7 @@ ipif_get_next_ipif(ipif_t *curr, ill_t *ill)
mutex_enter(&ill->ill_lock);
for (ipif = (curr == NULL ? ill->ill_ipif : curr->ipif_next);
ipif != NULL; ipif = ipif->ipif_next) {
- if (!IPIF_CAN_LOOKUP(ipif))
+ if (IPIF_IS_CONDEMNED(ipif))
continue;
ipif_refhold_locked(ipif);
mutex_exit(&ill->ill_lock);
@@ -6535,28 +5296,53 @@ ip_m_lookup(t_uscalar_t mac_type)
}
/*
+ * Make a link layer address from the multicast IP address *addr.
+ * To form the link layer address, invoke the ip_m_v*mapping function
+ * associated with the link-layer type.
+ */
+void
+ip_mcast_mapping(ill_t *ill, uchar_t *addr, uchar_t *hwaddr)
+{
+ ip_m_t *ipm;
+
+ if (ill->ill_net_type == IRE_IF_NORESOLVER)
+ return;
+
+ ASSERT(addr != NULL);
+
+ ipm = ip_m_lookup(ill->ill_mactype);
+ if (ipm == NULL ||
+ (ill->ill_isv6 && ipm->ip_m_v6mapping == NULL) ||
+ (!ill->ill_isv6 && ipm->ip_m_v4mapping == NULL)) {
+ ip0dbg(("no mapping for ill %s mactype 0x%x\n",
+ ill->ill_name, ill->ill_mactype));
+ return;
+ }
+ if (ill->ill_isv6)
+ (*ipm->ip_m_v6mapping)(ill, addr, hwaddr);
+ else
+ (*ipm->ip_m_v4mapping)(ill, addr, hwaddr);
+}
+
+/*
* ip_rt_add is called to add an IPv4 route to the forwarding table.
- * ipif_arg is passed in to associate it with the correct interface.
- * We may need to restart this operation if the ipif cannot be looked up
- * due to an exclusive operation that is currently in progress. The restart
- * entry point is specified by 'func'
+ * ill is passed in to associate it with the correct interface.
+ * If ire_arg is set, then we return the held IRE in that location.
*/
int
ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr,
- ipaddr_t src_addr, int flags, ipif_t *ipif_arg, ire_t **ire_arg,
- boolean_t ioctl_msg, queue_t *q, mblk_t *mp, ipsq_func_t func,
- struct rtsa_s *sp, ip_stack_t *ipst)
+ ipaddr_t src_addr, int flags, ill_t *ill, ire_t **ire_arg,
+ boolean_t ioctl_msg, struct rtsa_s *sp, ip_stack_t *ipst, zoneid_t zoneid)
{
- ire_t *ire;
+ ire_t *ire, *nire;
ire_t *gw_ire = NULL;
ipif_t *ipif = NULL;
- boolean_t ipif_refheld = B_FALSE;
uint_t type;
int match_flags = MATCH_IRE_TYPE;
- int error;
tsol_gc_t *gc = NULL;
tsol_gcgrp_t *gcgrp = NULL;
boolean_t gcgrp_xtraref = B_FALSE;
+ boolean_t cgtp_broadcast;
ip1dbg(("ip_rt_add:"));
@@ -6579,27 +5365,19 @@ ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr,
return (ENETUNREACH);
/*
* Get the ipif, if any, corresponding to the gw_addr
+ * If -ifp was specified we restrict ourselves to the ill, otherwise
+ * we match on the gatway and destination to handle unnumbered pt-pt
+ * interfaces.
*/
- ipif = ipif_lookup_interface(gw_addr, dst_addr, q, mp, func, &error,
- ipst);
+ if (ill != NULL)
+ ipif = ipif_lookup_addr(gw_addr, ill, ALL_ZONES, ipst);
+ else
+ ipif = ipif_lookup_interface(gw_addr, dst_addr, ipst);
if (ipif != NULL) {
if (IS_VNI(ipif->ipif_ill)) {
ipif_refrele(ipif);
return (EINVAL);
}
- ipif_refheld = B_TRUE;
- } else if (error == EINPROGRESS) {
- ip1dbg(("ip_rt_add: null and EINPROGRESS"));
- return (EINPROGRESS);
- } else {
- error = 0;
- }
-
- if (ipif != NULL) {
- ip1dbg(("ip_rt_add: ipif_lookup_interface done ipif nonnull"));
- ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock));
- } else {
- ip1dbg(("ip_rt_add: ipif_lookup_interface done ipif is null"));
}
/*
@@ -6612,12 +5390,12 @@ ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr,
flags &= ~RTF_GATEWAY;
if (gw_addr == INADDR_LOOPBACK && dst_addr == INADDR_LOOPBACK &&
mask == IP_HOST_MASK) {
- ire = ire_ctable_lookup(dst_addr, 0, IRE_LOOPBACK, ipif,
- ALL_ZONES, NULL, match_flags, ipst);
+ ire = ire_ftable_lookup_v4(dst_addr, 0, 0, IRE_LOOPBACK,
+ NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, ipst,
+ NULL);
if (ire != NULL) {
ire_refrele(ire);
- if (ipif_refheld)
- ipif_refrele(ipif);
+ ipif_refrele(ipif);
return (EEXIST);
}
ip1dbg(("ip_rt_add: 0x%p creating IRE 0x%x"
@@ -6627,40 +5405,58 @@ ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr,
ire = ire_create(
(uchar_t *)&dst_addr, /* dest address */
(uchar_t *)&mask, /* mask */
- (uchar_t *)&ipif->ipif_src_addr,
NULL, /* no gateway */
- &ipif->ipif_mtu,
- NULL,
- ipif->ipif_rq, /* recv-from queue */
- NULL, /* no send-to queue */
ipif->ipif_ire_type, /* LOOPBACK */
- ipif,
- 0,
- 0,
- 0,
- (ipif->ipif_flags & IPIF_PRIVATE) ?
- RTF_PRIVATE : 0,
- &ire_uinfo_null,
- NULL,
+ ipif->ipif_ill,
+ zoneid,
+ (ipif->ipif_flags & IPIF_PRIVATE) ? RTF_PRIVATE : 0,
NULL,
ipst);
if (ire == NULL) {
- if (ipif_refheld)
- ipif_refrele(ipif);
+ ipif_refrele(ipif);
return (ENOMEM);
}
- error = ire_add(&ire, q, mp, func, B_FALSE);
- if (error == 0)
- goto save_ire;
- if (ipif_refheld)
- ipif_refrele(ipif);
- return (error);
+ /* src address assigned by the caller? */
+ if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC))
+ ire->ire_setsrc_addr = src_addr;
+ nire = ire_add(ire);
+ if (nire == NULL) {
+ /*
+ * In the result of failure, ire_add() will have
+ * already deleted the ire in question, so there
+ * is no need to do that here.
+ */
+ ipif_refrele(ipif);
+ return (ENOMEM);
+ }
+ /*
+ * Check if it was a duplicate entry. This handles
+ * the case of two racing route adds for the same route
+ */
+ if (nire != ire) {
+ ASSERT(nire->ire_identical_ref > 1);
+ ire_delete(nire);
+ ire_refrele(nire);
+ ipif_refrele(ipif);
+ return (EEXIST);
+ }
+ ire = nire;
+ goto save_ire;
}
}
/*
+ * The routes for multicast with CGTP are quite special in that
+ * the gateway is the local interface address, yet RTF_GATEWAY
+ * is set. We turn off RTF_GATEWAY to provide compatibility with
+ * this undocumented and unusual use of multicast routes.
+ */
+ if ((flags & RTF_MULTIRT) && ipif != NULL)
+ flags &= ~RTF_GATEWAY;
+
+ /*
* Traditionally, interface routes are ones where RTF_GATEWAY isn't set
* and the gateway address provided is one of the system's interface
* addresses. By using the routing socket interface and supplying an
@@ -6694,8 +5490,8 @@ ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr,
* logical interfaces
*
* 192.0.2.32 255.255.255.224 192.0.2.33 U if0
- * 192.0.2.32 255.255.255.224 192.0.2.34 U if0:1
- * 192.0.2.32 255.255.255.224 192.0.2.35 U if0:2
+ * 192.0.2.32 255.255.255.224 192.0.2.34 U if0
+ * 192.0.2.32 255.255.255.224 192.0.2.35 U if0
*
* the ipif's corresponding to each of these interface routes can be
* uniquely identified by the "gateway" (actually interface address).
@@ -6710,47 +5506,37 @@ ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr,
/* RTF_GATEWAY not set */
if (!(flags & RTF_GATEWAY)) {
- queue_t *stq;
-
if (sp != NULL) {
ip2dbg(("ip_rt_add: gateway security attributes "
"cannot be set with interface route\n"));
- if (ipif_refheld)
+ if (ipif != NULL)
ipif_refrele(ipif);
return (EINVAL);
}
/*
- * As the interface index specified with the RTA_IFP sockaddr is
- * the same for all ipif's off of an ill, the matching logic
- * below uses MATCH_IRE_ILL if such an index was specified.
- * This means that routes sharing the same prefix when added
- * using a RTA_IFP sockaddr must have distinct interface
- * indices (namely, they must be on distinct ill's).
- *
- * On the other hand, since the gateway address will usually be
- * different for each ipif on the system, the matching logic
- * uses MATCH_IRE_IPIF in the case of a traditional interface
- * route. This means that interface routes for the same prefix
- * can be created if they belong to distinct ipif's and if a
- * RTA_IFP sockaddr is not present.
+ * Whether or not ill (RTA_IFP) is set, we require that
+ * the gateway is one of our local addresses.
*/
- if (ipif_arg != NULL) {
- if (ipif_refheld) {
- ipif_refrele(ipif);
- ipif_refheld = B_FALSE;
- }
- ipif = ipif_arg;
- match_flags |= MATCH_IRE_ILL;
- } else {
- /*
- * Check the ipif corresponding to the gw_addr
- */
- if (ipif == NULL)
- return (ENETUNREACH);
- match_flags |= MATCH_IRE_IPIF;
+ if (ipif == NULL)
+ return (ENETUNREACH);
+
+ /*
+ * We use MATCH_IRE_ILL here. If the caller specified an
+ * interface (from the RTA_IFP sockaddr) we use it, otherwise
+ * we use the ill derived from the gateway address.
+ * We can always match the gateway address since we record it
+ * in ire_gateway_addr.
+ * We don't allow RTA_IFP to specify a different ill than the
+ * one matching the ipif to make sure we can delete the route.
+ */
+ match_flags |= MATCH_IRE_GW | MATCH_IRE_ILL;
+ if (ill == NULL) {
+ ill = ipif->ipif_ill;
+ } else if (ill != ipif->ipif_ill) {
+ ipif_refrele(ipif);
+ return (EINVAL);
}
- ASSERT(ipif != NULL);
/*
* We check for an existing entry at this point.
@@ -6761,45 +5547,32 @@ ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr,
*/
if (!ioctl_msg)
match_flags |= MATCH_IRE_MASK;
- ire = ire_ftable_lookup(dst_addr, mask, 0, IRE_INTERFACE, ipif,
- NULL, ALL_ZONES, 0, NULL, match_flags, ipst);
+ ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr,
+ IRE_INTERFACE, ill, ALL_ZONES, NULL, match_flags, 0, ipst,
+ NULL);
if (ire != NULL) {
ire_refrele(ire);
- if (ipif_refheld)
- ipif_refrele(ipif);
+ ipif_refrele(ipif);
return (EEXIST);
}
- stq = (ipif->ipif_net_type == IRE_IF_RESOLVER)
- ? ipif->ipif_rq : ipif->ipif_wq;
-
/*
- * Create a copy of the IRE_LOOPBACK,
- * IRE_IF_NORESOLVER or IRE_IF_RESOLVER with
- * the modified address and netmask.
+ * Create a copy of the IRE_LOOPBACK, IRE_IF_NORESOLVER or
+ * IRE_IF_RESOLVER with the modified address, netmask, and
+ * gateway.
*/
ire = ire_create(
(uchar_t *)&dst_addr,
(uint8_t *)&mask,
- (uint8_t *)&ipif->ipif_src_addr,
- NULL,
- &ipif->ipif_mtu,
- NULL,
- NULL,
- stq,
- ipif->ipif_net_type,
- ipif,
- 0,
- 0,
- 0,
+ (uint8_t *)&gw_addr,
+ ill->ill_net_type,
+ ill,
+ zoneid,
flags,
- &ire_uinfo_null,
- NULL,
NULL,
ipst);
if (ire == NULL) {
- if (ipif_refheld)
- ipif_refrele(ipif);
+ ipif_refrele(ipif);
return (ENOMEM);
}
@@ -6810,7 +5583,7 @@ ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr,
* set up prefixes with the RTF_REJECT flag set (for example,
* when generating aggregate routes.)
*
- * If the IRE type (as defined by ipif->ipif_net_type) is
+ * If the IRE type (as defined by ill->ill_net_type) is
* IRE_LOOPBACK, then we map the request into a
* IRE_IF_NORESOLVER. We also OR in the RTF_BLACKHOLE flag as
* these interface routes, by definition, can only be that.
@@ -6819,27 +5592,37 @@ ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr,
* routine, but rather using ire_create() directly.
*
*/
- if (ipif->ipif_net_type == IRE_LOOPBACK) {
+ if (ill->ill_net_type == IRE_LOOPBACK) {
ire->ire_type = IRE_IF_NORESOLVER;
ire->ire_flags |= RTF_BLACKHOLE;
}
- error = ire_add(&ire, q, mp, func, B_FALSE);
- if (error == 0)
- goto save_ire;
+ /* src address assigned by the caller? */
+ if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC))
+ ire->ire_setsrc_addr = src_addr;
+ nire = ire_add(ire);
+ if (nire == NULL) {
+ /*
+ * In the event of failure, ire_add() will have
+ * already deleted the ire in question, so there
+ * is no need to do that here.
+ */
+ ipif_refrele(ipif);
+ return (ENOMEM);
+ }
/*
- * In the result of failure, ire_add() will have already
- * deleted the ire in question, so there is no need to
- * do that here.
+ * Check if it was a duplicate entry. This handles
+ * the case of two racing route adds for the same route
*/
- if (ipif_refheld)
+ if (nire != ire) {
+ ire_delete(nire);
+ ire_refrele(nire);
ipif_refrele(ipif);
- return (error);
- }
- if (ipif_refheld) {
- ipif_refrele(ipif);
- ipif_refheld = B_FALSE;
+ return (EEXIST);
+ }
+ ire = nire;
+ goto save_ire;
}
/*
@@ -6847,13 +5630,19 @@ ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr,
* If we don't have an IRE_IF_NORESOLVER or IRE_IF_RESOLVER for the
* gateway, it is currently unreachable and we fail the request
* accordingly.
+ * If RTA_IFP was specified we look on that particular ill.
*/
- ipif = ipif_arg;
- if (ipif_arg != NULL)
+ if (ill != NULL)
match_flags |= MATCH_IRE_ILL;
+
+ /* Check whether the gateway is reachable. */
again:
- gw_ire = ire_ftable_lookup(gw_addr, 0, 0, IRE_INTERFACE, ipif_arg, NULL,
- ALL_ZONES, 0, NULL, match_flags, ipst);
+ type = IRE_INTERFACE;
+ if (flags & RTF_INDIRECT)
+ type |= IRE_OFFLINK;
+
+ gw_ire = ire_ftable_lookup_v4(gw_addr, 0, 0, type, ill,
+ ALL_ZONES, NULL, match_flags, 0, ipst, NULL);
if (gw_ire == NULL) {
/*
* With IPMP, we allow host routes to influence in.mpathd's
@@ -6862,10 +5651,13 @@ again:
* underlying IRE_INTERFACEs are marked hidden. So allow
* hidden test IREs to be found and try again.
*/
- if (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN)) {
- match_flags |= MATCH_IRE_MARK_TESTHIDDEN;
+ if (!(match_flags & MATCH_IRE_TESTHIDDEN)) {
+ match_flags |= MATCH_IRE_TESTHIDDEN;
goto again;
}
+
+ if (ipif != NULL)
+ ipif_refrele(ipif);
return (ENETUNREACH);
}
@@ -6885,10 +5677,12 @@ again:
type = IRE_PREFIX;
/* check for a duplicate entry */
- ire = ire_ftable_lookup(dst_addr, mask, gw_addr, type, ipif_arg,
- NULL, ALL_ZONES, 0, NULL,
- match_flags | MATCH_IRE_MASK | MATCH_IRE_GW, ipst);
+ ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, type, ill,
+ ALL_ZONES, NULL, match_flags | MATCH_IRE_MASK | MATCH_IRE_GW,
+ 0, ipst, NULL);
if (ire != NULL) {
+ if (ipif != NULL)
+ ipif_refrele(ipif);
ire_refrele(gw_ire);
ire_refrele(ire);
return (EEXIST);
@@ -6905,6 +5699,8 @@ again:
/* we hold reference to it upon success */
gcgrp = gcgrp_lookup(&ga, B_TRUE);
if (gcgrp == NULL) {
+ if (ipif != NULL)
+ ipif_refrele(ipif);
ire_refrele(gw_ire);
return (ENOMEM);
}
@@ -6918,6 +5714,8 @@ again:
*/
gc = gc_create(sp, gcgrp, &gcgrp_xtraref);
if (gc == NULL) {
+ if (ipif != NULL)
+ ipif_refrele(ipif);
/* release reference held by gcgrp_lookup */
GCGRP_REFRELE(gcgrp);
ire_refrele(gw_ire);
@@ -6929,23 +5727,12 @@ again:
ire = ire_create(
(uchar_t *)&dst_addr, /* dest address */
(uchar_t *)&mask, /* mask */
- /* src address assigned by the caller? */
- (uchar_t *)(((src_addr != INADDR_ANY) &&
- (flags & RTF_SETSRC)) ? &src_addr : NULL),
(uchar_t *)&gw_addr, /* gateway address */
- &gw_ire->ire_max_frag,
- NULL, /* no src nce */
- NULL, /* no recv-from queue */
- NULL, /* no send-to queue */
(ushort_t)type, /* IRE type */
- ipif_arg,
- 0,
- 0,
- 0,
+ ill,
+ zoneid,
flags,
- &gw_ire->ire_uinfo, /* Inherit ULP info from gw */
gc, /* security attribute */
- NULL,
ipst);
/*
@@ -6958,26 +5745,51 @@ again:
if (ire == NULL) {
if (gc != NULL)
GC_REFRELE(gc);
+ if (ipif != NULL)
+ ipif_refrele(ipif);
ire_refrele(gw_ire);
return (ENOMEM);
}
+ /* Before we add, check if an extra CGTP broadcast is needed */
+ cgtp_broadcast = ((flags & RTF_MULTIRT) &&
+ ip_type_v4(ire->ire_addr, ipst) == IRE_BROADCAST);
+
+ /* src address assigned by the caller? */
+ if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC))
+ ire->ire_setsrc_addr = src_addr;
+
/*
* POLICY: should we allow an RTF_HOST with address INADDR_ANY?
* SUN/OS socket stuff does but do we really want to allow 0.0.0.0?
*/
/* Add the new IRE. */
- error = ire_add(&ire, q, mp, func, B_FALSE);
- if (error != 0) {
+ nire = ire_add(ire);
+ if (nire == NULL) {
/*
- * In the result of failure, ire_add() will have already
- * deleted the ire in question, so there is no need to
- * do that here.
+ * In the event of failure, ire_add() will have
+ * already deleted the ire in question, so there
+ * is no need to do that here.
*/
+ if (ipif != NULL)
+ ipif_refrele(ipif);
ire_refrele(gw_ire);
- return (error);
+ return (ENOMEM);
+ }
+ /*
+ * Check if it was a duplicate entry. This handles
+ * the case of two racing route adds for the same route.
+ */
+ if (nire != ire) {
+ ire_delete(nire);
+ ire_refrele(nire);
+ if (ipif != NULL)
+ ipif_refrele(ipif);
+ ire_refrele(gw_ire);
+ return (EEXIST);
}
+ ire = nire;
if (flags & RTF_MULTIRT) {
/*
@@ -6990,45 +5802,47 @@ again:
* because an IP source address cannot be a broadcast
* or a multicast.
*/
- ire_t *ire_dst = ire_ctable_lookup(ire->ire_addr, 0,
- IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
- if (ire_dst != NULL) {
- ip_cgtp_bcast_add(ire, ire_dst, ipst);
- ire_refrele(ire_dst);
+ if (cgtp_broadcast) {
+ ip_cgtp_bcast_add(ire, ipst);
goto save_ire;
}
if (ipst->ips_ip_cgtp_filter_ops != NULL &&
!CLASSD(ire->ire_addr)) {
- int res = ipst->ips_ip_cgtp_filter_ops->cfo_add_dest_v4(
- ipst->ips_netstack->netstack_stackid,
- ire->ire_addr,
- ire->ire_gateway_addr,
- ire->ire_src_addr,
- gw_ire->ire_src_addr);
+ int res;
+ ipif_t *src_ipif;
+
+ /* Find the source address corresponding to gw_ire */
+ src_ipif = ipif_lookup_addr(gw_ire->ire_gateway_addr,
+ NULL, zoneid, ipst);
+ if (src_ipif != NULL) {
+ res = ipst->ips_ip_cgtp_filter_ops->
+ cfo_add_dest_v4(
+ ipst->ips_netstack->netstack_stackid,
+ ire->ire_addr,
+ ire->ire_gateway_addr,
+ ire->ire_setsrc_addr,
+ src_ipif->ipif_lcl_addr);
+ ipif_refrele(src_ipif);
+ } else {
+ res = EADDRNOTAVAIL;
+ }
if (res != 0) {
+ if (ipif != NULL)
+ ipif_refrele(ipif);
ire_refrele(gw_ire);
ire_delete(ire);
+ ire_refrele(ire); /* Held in ire_add */
return (res);
}
}
}
- /*
- * Now that the prefix IRE entry has been created, delete any
- * existing gateway IRE cache entries as well as any IRE caches
- * using the gateway, and force them to be created through
- * ip_newroute.
- */
- if (gc != NULL) {
- ASSERT(gcgrp != NULL);
- ire_clookup_delete_cache_gw(gw_addr, ALL_ZONES, ipst);
- }
-
save_ire:
if (gw_ire != NULL) {
ire_refrele(gw_ire);
+ gw_ire = NULL;
}
- if (ipif != NULL) {
+ if (ill != NULL) {
/*
* Save enough information so that we can recreate the IRE if
* the interface goes down and then up. The metrics associated
@@ -7037,7 +5851,7 @@ save_ire:
* memory cannot be allocated, none of this information will be
* saved.
*/
- ipif_save_ire(ipif, ire);
+ ill_save_ire(ill, ire);
}
if (ioctl_msg)
ip_rts_rtmsg(RTM_OLDADD, ire, 0, ipst);
@@ -7052,27 +5866,23 @@ save_ire:
} else {
ire_refrele(ire); /* Held in ire_add */
}
- if (ipif_refheld)
+ if (ipif != NULL)
ipif_refrele(ipif);
return (0);
}
/*
* ip_rt_delete is called to delete an IPv4 route.
- * ipif_arg is passed in to associate it with the correct interface.
- * We may need to restart this operation if the ipif cannot be looked up
- * due to an exclusive operation that is currently in progress. The restart
- * entry point is specified by 'func'
+ * ill is passed in to associate it with the correct interface.
*/
/* ARGSUSED4 */
int
ip_rt_delete(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr,
- uint_t rtm_addrs, int flags, ipif_t *ipif_arg, boolean_t ioctl_msg,
- queue_t *q, mblk_t *mp, ipsq_func_t func, ip_stack_t *ipst)
+ uint_t rtm_addrs, int flags, ill_t *ill, boolean_t ioctl_msg,
+ ip_stack_t *ipst, zoneid_t zoneid)
{
ire_t *ire = NULL;
ipif_t *ipif;
- boolean_t ipif_refheld = B_FALSE;
uint_t type;
uint_t match_flags = MATCH_IRE_TYPE;
int err = 0;
@@ -7096,52 +5906,47 @@ ip_rt_delete(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr,
*
* This makes it possible to delete an original
* IRE_IF_NORESOLVER/IRE_IF_RESOLVER - consistent with SunOS 4.1.
+ * However, we have RTF_KERNEL set on the ones created by ipif_up
+ * and those cannot be deleted here.
*
- * As the interface index specified with the RTA_IFP sockaddr is the
- * same for all ipif's off of an ill, the matching logic below uses
- * MATCH_IRE_ILL if such an index was specified. This means a route
- * sharing the same prefix and interface index as the the route
- * intended to be deleted might be deleted instead if a RTA_IFP sockaddr
- * is specified in the request.
- *
- * On the other hand, since the gateway address will usually be
- * different for each ipif on the system, the matching logic
- * uses MATCH_IRE_IPIF in the case of a traditional interface
- * route. This means that interface routes for the same prefix can be
- * uniquely identified if they belong to distinct ipif's and if a
- * RTA_IFP sockaddr is not present.
+ * We use MATCH_IRE_ILL if we know the interface. If the caller
+ * specified an interface (from the RTA_IFP sockaddr) we use it,
+ * otherwise we use the ill derived from the gateway address.
+ * We can always match the gateway address since we record it
+ * in ire_gateway_addr.
*
* For more detail on specifying routes by gateway address and by
* interface index, see the comments in ip_rt_add().
*/
- ipif = ipif_lookup_interface(gw_addr, dst_addr, q, mp, func, &err,
- ipst);
- if (ipif != NULL)
- ipif_refheld = B_TRUE;
- else if (err == EINPROGRESS)
- return (err);
- else
- err = 0;
+ ipif = ipif_lookup_interface(gw_addr, dst_addr, ipst);
if (ipif != NULL) {
- if (ipif_arg != NULL) {
- if (ipif_refheld) {
- ipif_refrele(ipif);
- ipif_refheld = B_FALSE;
- }
- ipif = ipif_arg;
- match_flags |= MATCH_IRE_ILL;
- } else {
- match_flags |= MATCH_IRE_IPIF;
- }
+ ill_t *ill_match;
+
+ if (ill != NULL)
+ ill_match = ill;
+ else
+ ill_match = ipif->ipif_ill;
+
+ match_flags |= MATCH_IRE_ILL;
if (ipif->ipif_ire_type == IRE_LOOPBACK) {
- ire = ire_ctable_lookup(dst_addr, 0, IRE_LOOPBACK, ipif,
- ALL_ZONES, NULL, match_flags, ipst);
+ ire = ire_ftable_lookup_v4(dst_addr, 0, 0, IRE_LOOPBACK,
+ ill_match, ALL_ZONES, NULL, match_flags, 0, ipst,
+ NULL);
}
if (ire == NULL) {
- ire = ire_ftable_lookup(dst_addr, mask, 0,
- IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, NULL,
- match_flags, ipst);
+ match_flags |= MATCH_IRE_GW;
+ ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr,
+ IRE_INTERFACE, ill_match, ALL_ZONES, NULL,
+ match_flags, 0, ipst, NULL);
}
+ /* Avoid deleting routes created by kernel from an ipif */
+ if (ire != NULL && (ire->ire_flags & RTF_KERNEL)) {
+ ire_refrele(ire);
+ ire = NULL;
+ }
+
+ /* Restore in case we didn't find a match */
+ match_flags &= ~(MATCH_IRE_GW|MATCH_IRE_ILL);
}
if (ire == NULL) {
@@ -7151,15 +5956,11 @@ ip_rt_delete(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr,
* set the IRE type to lookup based on whether
* this is a host route, a default route or just a prefix.
*
- * If an ipif_arg was passed in, then the lookup is based on an
+ * If an ill was passed in, then the lookup is based on an
* interface index so MATCH_IRE_ILL is added to match_flags.
- * In any case, MATCH_IRE_IPIF is cleared and MATCH_IRE_GW is
- * set as the route being looked up is not a traditional
- * interface route.
*/
- match_flags &= ~MATCH_IRE_IPIF;
match_flags |= MATCH_IRE_GW;
- if (ipif_arg != NULL)
+ if (ill != NULL)
match_flags |= MATCH_IRE_ILL;
if (mask == IP_HOST_MASK)
type = IRE_HOST;
@@ -7167,14 +5968,15 @@ ip_rt_delete(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr,
type = IRE_DEFAULT;
else
type = IRE_PREFIX;
- ire = ire_ftable_lookup(dst_addr, mask, gw_addr, type, ipif_arg,
- NULL, ALL_ZONES, 0, NULL, match_flags, ipst);
+ ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, type, ill,
+ ALL_ZONES, NULL, match_flags, 0, ipst, NULL);
}
- if (ipif_refheld)
+ if (ipif != NULL) {
ipif_refrele(ipif);
+ ipif = NULL;
+ }
- /* ipif is not refheld anymore */
if (ire == NULL)
return (ESRCH);
@@ -7193,9 +5995,9 @@ ip_rt_delete(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr,
ip_cgtp_bcast_delete(ire, ipst);
}
- ipif = ire->ire_ipif;
- if (ipif != NULL)
- ipif_remove_ire(ipif, ire);
+ ill = ire->ire_ill;
+ if (ill != NULL)
+ ill_remove_saved_ire(ill, ire);
if (ioctl_msg)
ip_rts_rtmsg(RTM_OLDDEL, ire, 0, ipst);
ire_delete(ire);
@@ -7249,7 +6051,7 @@ ip_siocaddrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
}
error = ip_rt_add(dst_addr, mask, gw_addr, 0, rt->rt_flags, NULL, NULL,
- B_TRUE, q, mp, ip_process_ioctl, NULL, ipst);
+ B_TRUE, NULL, ipst, ALL_ZONES);
if (ipif != NULL)
ipif_refrele(ipif);
return (error);
@@ -7301,8 +6103,8 @@ ip_siocdelrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
}
error = ip_rt_delete(dst_addr, mask, gw_addr,
- RTA_DST | RTA_GATEWAY | RTA_NETMASK, rt->rt_flags, NULL, B_TRUE, q,
- mp, ip_process_ioctl, ipst);
+ RTA_DST | RTA_GATEWAY | RTA_NETMASK, rt->rt_flags, NULL, B_TRUE,
+ ipst, ALL_ZONES);
if (ipif != NULL)
ipif_refrele(ipif);
return (error);
@@ -7655,7 +6457,8 @@ ipsq_dlpi_done(ipsq_t *ipsq)
if (phyi != NULL) {
ill = phyi->phyint_illv4;
if (ill != NULL &&
- ill->ill_dlpi_pending != DL_PRIM_INVAL)
+ (ill->ill_dlpi_pending != DL_PRIM_INVAL ||
+ ill->ill_arl_dlpi_pending))
return (B_FALSE);
ill = phyi->phyint_illv6;
@@ -7819,8 +6622,8 @@ ipsq_try_enter_internal(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func,
/*
* The ipsq_t (ipsq) is the synchronization data structure used to serialize
- * certain critical operations like plumbing (i.e. most set ioctls), multicast
- * joins, igmp/mld timers, etc. There is one ipsq per phyint. The ipsq
+ * certain critical operations like plumbing (i.e. most set ioctls), etc.
+ * There is one ipsq per phyint. The ipsq
* serializes exclusive ioctls issued by applications on a per ipsq basis in
* ipsq_xopq_mphead. It also protects against multiple threads executing in
* the ipsq. Responses from the driver pertain to the current ioctl (say a
@@ -7838,7 +6641,7 @@ ipsq_try_enter_internal(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func,
* proceeds to dequeue the next ioctl in ipsq_xopq_mphead and start the next
* ioctl if the current ioctl has completed. If the current ioctl is still
* in progress it simply returns. The current ioctl could be waiting for
- * a response from another module (arp or the driver or could be waiting for
+ * a response from another module (the driver or could be waiting for
* the ipif/ill/ire refcnts to drop to zero. In such a case the ipx_pending_mp
* and ipx_pending_ipif are set. ipx_current_ipif is set throughout the
* execution of the ioctl and ipsq_exit does not start the next ioctl unless
@@ -7959,6 +6762,38 @@ ipsq_exit(ipsq_t *ipsq)
}
/*
+ * Used to start any igmp or mld timers that could not be started
+ * while holding ill_mcast_lock. The timers can't be started while holding
+ * the lock, since mld/igmp_start_timers may need to call untimeout(),
+ * which can't be done while holding the lock that the timeout handler
+ * acquires; otherwise there could be a deadlock, since the timeout
+ * handlers (mld_timeout_handler_per_ill and
+ * igmp_timeout_handler_per_ill) also acquire
+ * ill_mcast_lock.
+ */
+void
+ill_mcast_timer_start(ip_stack_t *ipst)
+{
+ int next;
+
+ mutex_enter(&ipst->ips_igmp_timer_lock);
+ next = ipst->ips_igmp_deferred_next;
+ ipst->ips_igmp_deferred_next = INFINITY;
+ mutex_exit(&ipst->ips_igmp_timer_lock);
+
+ if (next != INFINITY)
+ igmp_start_timers(next, ipst);
+
+ mutex_enter(&ipst->ips_mld_timer_lock);
+ next = ipst->ips_mld_deferred_next;
+ ipst->ips_mld_deferred_next = INFINITY;
+ mutex_exit(&ipst->ips_mld_timer_lock);
+
+ if (next != INFINITY)
+ mld_start_timers(next, ipst);
+}
+
+/*
* Start the current exclusive operation on `ipsq'; associate it with `ipif'
* and `ioccmd'.
*/
@@ -8101,7 +6936,6 @@ ipsq_flush(ill_t *ill)
mutex_exit(&ipx->ipx_lock);
(void) ipsq_pending_mp_cleanup(ill, NULL);
ipsq_xopq_mp_cleanup(ill, NULL);
- ill_pending_mp_cleanup(ill);
}
/*
@@ -8114,7 +6948,7 @@ ipsq_flush(ill_t *ill)
*/
int
ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip,
- cmd_info_t *ci, ipsq_func_t func)
+ cmd_info_t *ci)
{
char *name;
struct ifreq *ifr;
@@ -8124,7 +6958,6 @@ ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip,
conn_t *connp;
boolean_t isv6;
boolean_t exists;
- int err;
mblk_t *mp1;
zoneid_t zoneid;
ip_stack_t *ipst;
@@ -8138,7 +6971,7 @@ ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip,
} else {
ill = NULL;
connp = Q_TO_CONN(q);
- isv6 = connp->conn_af_isv6;
+ isv6 = (connp->conn_family == AF_INET6);
zoneid = connp->conn_zoneid;
if (zoneid == GLOBAL_ZONEID) {
/* global zone can access ipifs in all zones */
@@ -8195,13 +7028,38 @@ ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip,
ipif_refhold(ipif);
} else {
ipif = ipif_lookup_on_name(name, mi_strlen(name), B_FALSE,
- &exists, isv6, zoneid,
- (connp == NULL) ? q : CONNP_TO_WQ(connp), mp, func, &err,
- ipst);
- if (ipif == NULL) {
- if (err == EINPROGRESS)
- return (err);
- err = 0; /* Ensure we don't use it below */
+ &exists, isv6, zoneid, ipst);
+
+ /*
+ * Ensure that get ioctls don't see any internal state changes
+ * caused by set ioctls by deferring them if IPIF_CHANGING is
+ * set.
+ */
+ if (ipif != NULL && !(ipip->ipi_flags & IPI_WR) &&
+ !IAM_WRITER_IPIF(ipif)) {
+ ipsq_t *ipsq;
+
+ if (connp != NULL)
+ mutex_enter(&connp->conn_lock);
+ mutex_enter(&ipif->ipif_ill->ill_lock);
+ if (IPIF_IS_CHANGING(ipif) &&
+ !IPIF_IS_CONDEMNED(ipif)) {
+ ipsq = ipif->ipif_ill->ill_phyint->phyint_ipsq;
+ mutex_enter(&ipsq->ipsq_lock);
+ mutex_enter(&ipsq->ipsq_xop->ipx_lock);
+ mutex_exit(&ipif->ipif_ill->ill_lock);
+ ipsq_enq(ipsq, q, mp, ip_process_ioctl,
+ NEW_OP, ipif->ipif_ill);
+ mutex_exit(&ipsq->ipsq_xop->ipx_lock);
+ mutex_exit(&ipsq->ipsq_lock);
+ if (connp != NULL)
+ mutex_exit(&connp->conn_lock);
+ ipif_refrele(ipif);
+ return (EINPROGRESS);
+ }
+ mutex_exit(&ipif->ipif_ill->ill_lock);
+ if (connp != NULL)
+ mutex_exit(&connp->conn_lock);
}
}
@@ -8226,6 +7084,9 @@ ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip,
if (ipif == NULL)
return (ENXIO);
+ DTRACE_PROBE4(ipif__ioctl, char *, "ip_extract_lifreq",
+ int, ipip->ipi_cmd, ill_t *, ipif->ipif_ill, ipif_t *, ipif);
+
ci->ci_ipif = ipif;
return (0);
}
@@ -8544,7 +7405,6 @@ ip_sioctl_get_lifsrcof(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
uint_t ifindex;
zoneid_t zoneid;
- int err = 0;
boolean_t isv6 = B_FALSE;
struct sockaddr_in *sin;
struct sockaddr_in6 *sin6;
@@ -8571,13 +7431,12 @@ ip_sioctl_get_lifsrcof(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
return (EINVAL);
ifindex = STRUCT_FGET(lifs, lifs_ifindex);
- isv6 = (Q_TO_CONN(q))->conn_af_isv6;
- ipif = ipif_lookup_on_ifindex(ifindex, isv6, zoneid, q, mp,
- ip_process_ioctl, &err, ipst);
+ isv6 = (Q_TO_CONN(q))->conn_family == AF_INET6;
+ ipif = ipif_lookup_on_ifindex(ifindex, isv6, zoneid, ipst);
if (ipif == NULL) {
ip1dbg(("ip_sioctl_get_lifsrcof: no ipif for ifindex %d\n",
ifindex));
- return (err);
+ return (ENXIO);
}
/* Allocate a buffer to hold requested information */
@@ -8943,17 +7802,19 @@ ip_sioctl_dstinfo(queue_t *q, mblk_t *mp)
in6_addr_t *daddr, *saddr;
ipaddr_t v4daddr;
ire_t *ire;
+ ipaddr_t v4setsrc;
+ in6_addr_t v6setsrc;
char *slabel, *dlabel;
boolean_t isipv4;
int match_ire;
ill_t *dst_ill;
- ipif_t *src_ipif, *ire_ipif;
struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
- zoneid_t zoneid;
- ip_stack_t *ipst = CONNQ_TO_IPST(q);
+ conn_t *connp = Q_TO_CONN(q);
+ zoneid_t zoneid = IPCL_ZONEID(connp);
+ ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
+ uint64_t ipif_flags;
ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */
- zoneid = Q_TO_CONN(q)->conn_zoneid;
/*
* This ioctl is I_STR only, and must have a
@@ -8976,7 +7837,7 @@ ip_sioctl_dstinfo(queue_t *q, mblk_t *mp)
data_mp = new_data_mp;
mp->b_cont = data_mp;
}
- match_ire = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | MATCH_IRE_PARENT;
+ match_ire = MATCH_IRE_DSTONLY;
for (cur = data_mp->b_rptr, end = data_mp->b_wptr;
end - cur >= sizeof (struct dstinforeq);
@@ -8987,8 +7848,8 @@ ip_sioctl_dstinfo(queue_t *q, mblk_t *mp)
/*
* ip_addr_scope_v6() and ip6_asp_lookup() handle
- * v4 mapped addresses; ire_ftable_lookup[_v6]()
- * and ipif_select_source[_v6]() do not.
+ * v4 mapped addresses; ire_ftable_lookup_v6()
+ * and ip_select_source_v6() do not.
*/
dir->dir_dscope = ip_addr_scope_v6(daddr);
dlabel = ip6_asp_lookup(daddr, &dir->dir_precedence, ipst);
@@ -8996,13 +7857,19 @@ ip_sioctl_dstinfo(queue_t *q, mblk_t *mp)
isipv4 = IN6_IS_ADDR_V4MAPPED(daddr);
if (isipv4) {
IN6_V4MAPPED_TO_IPADDR(daddr, v4daddr);
- ire = ire_ftable_lookup(v4daddr, NULL, NULL,
- 0, NULL, NULL, zoneid, 0, NULL, match_ire, ipst);
+ v4setsrc = INADDR_ANY;
+ ire = ire_route_recursive_v4(v4daddr, 0, NULL, zoneid,
+ NULL, match_ire, B_TRUE, 0, ipst, &v4setsrc, NULL,
+ NULL);
} else {
- ire = ire_ftable_lookup_v6(daddr, NULL, NULL,
- 0, NULL, NULL, zoneid, 0, NULL, match_ire, ipst);
+ v6setsrc = ipv6_all_zeros;
+ ire = ire_route_recursive_v6(daddr, 0, NULL, zoneid,
+ NULL, match_ire, B_TRUE, 0, ipst, &v6setsrc, NULL,
+ NULL);
}
- if (ire == NULL) {
+ ASSERT(ire != NULL);
+ if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
+ ire_refrele(ire);
dir->dir_dreachable = 0;
/* move on to next dst addr */
@@ -9010,36 +7877,40 @@ ip_sioctl_dstinfo(queue_t *q, mblk_t *mp)
}
dir->dir_dreachable = 1;
- ire_ipif = ire->ire_ipif;
- if (ire_ipif == NULL)
- goto next_dst;
+ dst_ill = ire_nexthop_ill(ire);
+ if (dst_ill == NULL) {
+ ire_refrele(ire);
+ continue;
+ }
- /*
- * We expect to get back an interface ire or a
- * gateway ire cache entry. For both types, the
- * output interface is ire_ipif->ipif_ill.
- */
- dst_ill = ire_ipif->ipif_ill;
+ /* With ipmp we most likely look at the ipmp ill here */
dir->dir_dmactype = dst_ill->ill_mactype;
if (isipv4) {
- src_ipif = ipif_select_source(dst_ill, v4daddr, zoneid);
+ ipaddr_t v4saddr;
+
+ if (ip_select_source_v4(dst_ill, v4setsrc, v4daddr,
+ connp->conn_ixa->ixa_multicast_ifaddr, zoneid, ipst,
+ &v4saddr, NULL, &ipif_flags) != 0) {
+ v4saddr = INADDR_ANY;
+ ipif_flags = 0;
+ }
+ IN6_IPADDR_TO_V4MAPPED(v4saddr, saddr);
} else {
- src_ipif = ipif_select_source_v6(dst_ill,
- daddr, B_FALSE, IPV6_PREFER_SRC_DEFAULT, zoneid);
+ if (ip_select_source_v6(dst_ill, &v6setsrc, daddr,
+ zoneid, ipst, B_FALSE, IPV6_PREFER_SRC_DEFAULT,
+ saddr, NULL, &ipif_flags) != 0) {
+ *saddr = ipv6_all_zeros;
+ ipif_flags = 0;
+ }
}
- if (src_ipif == NULL)
- goto next_dst;
- *saddr = src_ipif->ipif_v6lcl_addr;
dir->dir_sscope = ip_addr_scope_v6(saddr);
slabel = ip6_asp_lookup(saddr, NULL, ipst);
dir->dir_labelmatch = ip6_asp_labelcmp(dlabel, slabel);
- dir->dir_sdeprecated =
- (src_ipif->ipif_flags & IPIF_DEPRECATED) ? 1 : 0;
- ipif_refrele(src_ipif);
-next_dst:
+ dir->dir_sdeprecated = (ipif_flags & IPIF_DEPRECATED) ? 1 : 0;
ire_refrele(ire);
+ ill_refrele(dst_ill);
}
miocack(q, mp, iocp->ioc_count, 0);
}
@@ -9088,16 +7959,16 @@ ip_sioctl_tmyaddr(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr,
v4_addr);
- ire = ire_ctable_lookup(v4_addr, 0,
- IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid,
- NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, ipst);
+ ire = ire_ftable_lookup_v4(v4_addr, 0, 0,
+ IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, NULL,
+ MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, 0, ipst, NULL);
} else {
in6_addr_t v6addr;
v6addr = sin6->sin6_addr;
- ire = ire_ctable_lookup_v6(&v6addr, 0,
- IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid,
- NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, ipst);
+ ire = ire_ftable_lookup_v6(&v6addr, 0, 0,
+ IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, NULL,
+ MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, 0, ipst, NULL);
}
break;
}
@@ -9105,9 +7976,9 @@ ip_sioctl_tmyaddr(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
ipaddr_t v4addr;
v4addr = sin->sin_addr.s_addr;
- ire = ire_ctable_lookup(v4addr, 0,
+ ire = ire_ftable_lookup_v4(v4addr, 0, 0,
IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid,
- NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, ipst);
+ NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, 0, ipst, NULL);
break;
}
default:
@@ -9160,9 +8031,8 @@ ip_sioctl_tonlink(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
sin = (sin_t *)&sia->sa_addr;
/*
- * Match addresses with a zero gateway field to avoid
- * routes going through a router.
- * Exclude broadcast and multicast addresses.
+ * We check for IRE_ONLINK and exclude IRE_BROADCAST|IRE_MULTICAST
+ * to make sure we only look at on-link unicast address.
*/
switch (sin->sin_family) {
case AF_INET6: {
@@ -9174,20 +8044,18 @@ ip_sioctl_tonlink(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr,
v4_addr);
if (!CLASSD(v4_addr)) {
- ire = ire_route_lookup(v4_addr, 0, 0, 0,
- NULL, NULL, zoneid, NULL,
- MATCH_IRE_GW, ipst);
+ ire = ire_ftable_lookup_v4(v4_addr, 0, 0, 0,
+ NULL, zoneid, NULL, MATCH_IRE_DSTONLY,
+ 0, ipst, NULL);
}
} else {
in6_addr_t v6addr;
- in6_addr_t v6gw;
v6addr = sin6->sin6_addr;
- v6gw = ipv6_all_zeros;
if (!IN6_IS_ADDR_MULTICAST(&v6addr)) {
- ire = ire_route_lookup_v6(&v6addr, 0,
- &v6gw, 0, NULL, NULL, zoneid,
- NULL, MATCH_IRE_GW, ipst);
+ ire = ire_ftable_lookup_v6(&v6addr, 0, 0, 0,
+ NULL, zoneid, NULL, MATCH_IRE_DSTONLY, 0,
+ ipst, NULL);
}
}
break;
@@ -9197,9 +8065,8 @@ ip_sioctl_tonlink(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
v4addr = sin->sin_addr.s_addr;
if (!CLASSD(v4addr)) {
- ire = ire_route_lookup(v4addr, 0, 0, 0,
- NULL, NULL, zoneid, NULL,
- MATCH_IRE_GW, ipst);
+ ire = ire_ftable_lookup_v4(v4addr, 0, 0, 0, NULL,
+ zoneid, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL);
}
break;
}
@@ -9208,10 +8075,11 @@ ip_sioctl_tonlink(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
}
sia->sa_res = 0;
if (ire != NULL) {
- if (ire->ire_type & (IRE_INTERFACE|IRE_CACHE|
- IRE_LOCAL|IRE_LOOPBACK)) {
+ ASSERT(!(ire->ire_type & IRE_MULTICAST));
+
+ if ((ire->ire_type & IRE_ONLINK) &&
+ !(ire->ire_type & IRE_BROADCAST))
sia->sa_res = 1;
- }
ire_refrele(ire);
}
return (0);
@@ -9228,54 +8096,40 @@ ip_sioctl_tmysite(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
return (ENXIO);
}
-/*
- * ARP IOCTLs.
- * How does IP get in the business of fronting ARP configuration/queries?
- * Well it's like this, the Berkeley ARP IOCTLs (SIOCGARP, SIOCDARP, SIOCSARP)
- * are by tradition passed in through a datagram socket. That lands in IP.
- * As it happens, this is just as well since the interface is quite crude in
- * that it passes in no information about protocol or hardware types, or
- * interface association. After making the protocol assumption, IP is in
- * the position to look up the name of the ILL, which ARP will need, and
- * format a request that can be handled by ARP. The request is passed up
- * stream to ARP, and the original IOCTL is completed by IP when ARP passes
- * back a response. ARP supports its own set of more general IOCTLs, in
- * case anyone is interested.
- */
+/* ARP IOCTLs. */
/* ARGSUSED */
int
ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
{
- mblk_t *mp1;
- mblk_t *mp2;
- mblk_t *pending_mp;
- ipaddr_t ipaddr;
- area_t *area;
- struct iocblk *iocp;
- conn_t *connp;
- struct arpreq *ar;
- struct xarpreq *xar;
- int flags, alength;
- uchar_t *lladdr;
- ire_t *ire;
- ip_stack_t *ipst;
- ill_t *ill = ipif->ipif_ill;
- ill_t *proxy_ill = NULL;
- ipmp_arpent_t *entp = NULL;
- boolean_t if_arp_ioctl = B_FALSE;
- boolean_t proxyarp = B_FALSE;
+ int err;
+ ipaddr_t ipaddr;
+ struct iocblk *iocp;
+ conn_t *connp;
+ struct arpreq *ar;
+ struct xarpreq *xar;
+ int arp_flags, flags, alength;
+ uchar_t *lladdr;
+ ip_stack_t *ipst;
+ ill_t *ill = ipif->ipif_ill;
+ ill_t *proxy_ill = NULL;
+ ipmp_arpent_t *entp = NULL;
+ boolean_t proxyarp = B_FALSE;
+ boolean_t if_arp_ioctl = B_FALSE;
+ ncec_t *ncec = NULL;
+ nce_t *nce;
ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL);
connp = Q_TO_CONN(q);
ipst = connp->conn_netstack->netstack_ip;
+ iocp = (struct iocblk *)mp->b_rptr;
if (ipip->ipi_cmd_type == XARP_CMD) {
/* We have a chain - M_IOCTL-->MI_COPY_MBLK-->XARPREQ_MBLK */
xar = (struct xarpreq *)mp->b_cont->b_cont->b_rptr;
ar = NULL;
- flags = xar->xarp_flags;
+ arp_flags = xar->xarp_flags;
lladdr = (uchar_t *)LLADDR(&xar->xarp_ha);
if_arp_ioctl = (xar->xarp_ha.sdl_nlen != 0);
/*
@@ -9294,7 +8148,7 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
ar = (struct arpreq *)mp->b_cont->b_cont->b_rptr;
xar = NULL;
- flags = ar->arp_flags;
+ arp_flags = ar->arp_flags;
lladdr = (uchar_t *)ar->arp_ha.sa_data;
/*
* Theoretically, the sa_family could tell us what link
@@ -9315,7 +8169,14 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
}
}
- ipaddr = sin->sin_addr.s_addr;
+ /* Translate ATF* flags to NCE* flags */
+ flags = 0;
+ if (arp_flags & ATF_AUTHORITY)
+ flags |= NCE_F_AUTHORITY;
+ if (arp_flags & ATF_PERM)
+ flags |= NCE_F_NONUD; /* not subject to aging */
+ if (arp_flags & ATF_PUBL)
+ flags |= NCE_F_PUBLISH;
/*
* IPMP ARP special handling:
@@ -9349,171 +8210,120 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
lladdr = proxy_ill->ill_phys_addr;
}
/* FALLTHRU */
- case SIOCDARP:
- case SIOCDXARP:
- ire = ire_ctable_lookup(ipaddr, 0, IRE_LOCAL, NULL,
- ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
- if (ire != NULL) {
- ire_refrele(ire);
- return (EPERM);
- }
}
}
+ ipaddr = sin->sin_addr.s_addr;
/*
- * We are going to pass up to ARP a packet chain that looks
- * like:
- *
- * M_IOCTL-->ARP_op_MBLK-->ORIG_M_IOCTL-->MI_COPY_MBLK-->[X]ARPREQ_MBLK
- *
- * Get a copy of the original IOCTL mblk to head the chain,
- * to be sent up (in mp1). Also get another copy to store
- * in the ill_pending_mp list, for matching the response
- * when it comes back from ARP.
- */
- mp1 = copyb(mp);
- pending_mp = copymsg(mp);
- if (mp1 == NULL || pending_mp == NULL) {
- if (mp1 != NULL)
- freeb(mp1);
- if (pending_mp != NULL)
- inet_freemsg(pending_mp);
- return (ENOMEM);
- }
-
- mp2 = ill_arp_alloc(ill, (uchar_t *)&ip_area_template,
- (caddr_t)&ipaddr);
- if (mp2 == NULL) {
- freeb(mp1);
- inet_freemsg(pending_mp);
- return (ENOMEM);
- }
- /* Put together the chain. */
- mp1->b_cont = mp2;
- mp1->b_datap->db_type = M_IOCTL;
- mp2->b_cont = mp;
- mp2->b_datap->db_type = M_DATA;
-
- iocp = (struct iocblk *)mp1->b_rptr;
-
- /*
- * An M_IOCDATA's payload (struct copyresp) is mostly the same as an
- * M_IOCTL's payload (struct iocblk), but 'struct copyresp' has a
- * cp_private field (or cp_rval on 32-bit systems) in place of the
- * ioc_count field; set ioc_count to be correct.
+ * don't match across illgrp per case (1) and (2).
+ * XXX use IS_IPMP(ill) like ndp_sioc_update?
*/
- iocp->ioc_count = MBLKL(mp1->b_cont);
+ nce = nce_lookup_v4(ill, &ipaddr);
+ if (nce != NULL)
+ ncec = nce->nce_common;
- /*
- * Set the proper command in the ARP message.
- * Convert the SIOC{G|S|D}ARP calls into our
- * AR_ENTRY_xxx calls.
- */
- area = (area_t *)mp2->b_rptr;
switch (iocp->ioc_cmd) {
case SIOCDARP:
- case SIOCDXARP:
+ case SIOCDXARP: {
/*
- * We defer deleting the corresponding IRE until
- * we return from arp.
+ * Delete the NCE if any.
+ */
+ if (ncec == NULL) {
+ iocp->ioc_error = ENXIO;
+ break;
+ }
+ /* Don't allow changes to arp mappings of local addresses. */
+ if (NCE_MYADDR(ncec)) {
+ nce_refrele(nce);
+ return (ENOTSUP);
+ }
+ iocp->ioc_error = 0;
+
+ /*
+ * Delete the nce_common which has ncec_ill set to ipmp_ill.
+ * This will delete all the nce entries on the under_ills.
+ */
+ ncec_delete(ncec);
+ /*
+ * Once the NCE has been deleted, then the ire_dep* consistency
+ * mechanism will find any IRE which depended on the now
+ * condemned NCE (as part of sending packets).
+ * That mechanism handles redirects by deleting redirects
+ * that refer to UNREACHABLE nces.
*/
- area->area_cmd = AR_ENTRY_DELETE;
- area->area_proto_mask_offset = 0;
break;
+ }
case SIOCGARP:
case SIOCGXARP:
- area->area_cmd = AR_ENTRY_SQUERY;
- area->area_proto_mask_offset = 0;
+ if (ncec != NULL) {
+ lladdr = ncec->ncec_lladdr;
+ flags = ncec->ncec_flags;
+ iocp->ioc_error = 0;
+ ip_sioctl_garp_reply(mp, ncec->ncec_ill, lladdr, flags);
+ } else {
+ iocp->ioc_error = ENXIO;
+ }
break;
case SIOCSARP:
case SIOCSXARP:
- /*
- * Delete the corresponding ire to make sure IP will
- * pick up any change from arp.
- */
+ /* Don't allow changes to arp mappings of local addresses. */
+ if (ncec != NULL && NCE_MYADDR(ncec)) {
+ nce_refrele(nce);
+ return (ENOTSUP);
+ }
+
+ /* static arp entries will undergo NUD if ATF_PERM is not set */
+ flags |= NCE_F_STATIC;
if (!if_arp_ioctl) {
- (void) ip_ire_clookup_and_delete(ipaddr, NULL, ipst);
+ ip_nce_lookup_and_update(&ipaddr, NULL, ipst,
+ lladdr, alength, flags);
} else {
ipif_t *ipif = ipif_get_next_ipif(NULL, ill);
if (ipif != NULL) {
- (void) ip_ire_clookup_and_delete(ipaddr, ipif,
- ipst);
+ ip_nce_lookup_and_update(&ipaddr, ipif, ipst,
+ lladdr, alength, flags);
ipif_refrele(ipif);
}
}
- break;
- }
- iocp->ioc_cmd = area->area_cmd;
-
- /*
- * Fill in the rest of the ARP operation fields.
- */
- area->area_hw_addr_length = alength;
- bcopy(lladdr, (char *)area + area->area_hw_addr_offset, alength);
-
- /* Translate the flags. */
- if (flags & ATF_PERM)
- area->area_flags |= ACE_F_PERMANENT;
- if (flags & ATF_PUBL)
- area->area_flags |= ACE_F_PUBLISH;
- if (flags & ATF_AUTHORITY)
- area->area_flags |= ACE_F_AUTHORITY;
-
- /*
- * If this is a permanent AR_ENTRY_ADD on the IPMP interface, track it
- * so that IP can update ARP as the active ills in the group change.
- */
- if (IS_IPMP(ill) && area->area_cmd == AR_ENTRY_ADD &&
- (area->area_flags & ACE_F_PERMANENT)) {
- entp = ipmp_illgrp_create_arpent(ill->ill_grp, mp2, proxyarp);
-
+ if (nce != NULL) {
+ nce_refrele(nce);
+ nce = NULL;
+ }
/*
- * The second part of the conditional below handles a corner
- * case: if this is proxy ARP and the IPMP group has no active
- * interfaces, we can't send the request to ARP now since it
- * won't be able to build an ACE. So we return success and
- * notify ARP about the proxy ARP entry once an interface
- * becomes active.
+ * NCE_F_STATIC entries will be added in state ND_REACHABLE
+ * by nce_add_common()
*/
- if (entp == NULL || (proxyarp && proxy_ill == NULL)) {
- mp2->b_cont = NULL;
- inet_freemsg(mp1);
- inet_freemsg(pending_mp);
- return (entp == NULL ? ENOMEM : 0);
+ err = nce_lookup_then_add_v4(ill, lladdr,
+ ill->ill_phys_addr_length, &ipaddr, flags, ND_UNCHANGED,
+ &nce);
+ if (err == EEXIST) {
+ ncec = nce->nce_common;
+ mutex_enter(&ncec->ncec_lock);
+ ncec->ncec_state = ND_REACHABLE;
+ ncec->ncec_flags = flags;
+ nce_update(ncec, ND_UNCHANGED, lladdr);
+ mutex_exit(&ncec->ncec_lock);
+ err = 0;
+ }
+ if (nce != NULL) {
+ nce_refrele(nce);
+ nce = NULL;
+ }
+ if (IS_IPMP(ill) && err == 0) {
+ entp = ipmp_illgrp_create_arpent(ill->ill_grp,
+ proxyarp, ipaddr, lladdr, ill->ill_phys_addr_length,
+ flags);
+ if (entp == NULL || (proxyarp && proxy_ill == NULL)) {
+ iocp->ioc_error = (entp == NULL ? ENOMEM : 0);
+ break;
+ }
}
+ iocp->ioc_error = err;
}
- /*
- * Before sending 'mp' to ARP, we have to clear the b_next
- * and b_prev. Otherwise if STREAMS encounters such a message
- * in freemsg(), (because ARP can close any time) it can cause
- * a panic. But mi code needs the b_next and b_prev values of
- * mp->b_cont, to complete the ioctl. So we store it here
- * in pending_mp->bcont, and restore it in ip_sioctl_iocack()
- * when the response comes down from ARP.
- */
- pending_mp->b_cont->b_next = mp->b_cont->b_next;
- pending_mp->b_cont->b_prev = mp->b_cont->b_prev;
- mp->b_cont->b_next = NULL;
- mp->b_cont->b_prev = NULL;
-
- mutex_enter(&connp->conn_lock);
- mutex_enter(&ill->ill_lock);
- /* conn has not yet started closing, hence this can't fail */
- if (ipip->ipi_flags & IPI_WR) {
- VERIFY(ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp),
- pending_mp, 0) != 0);
- } else {
- VERIFY(ill_pending_mp_add(ill, connp, pending_mp) != 0);
+ if (nce != NULL) {
+ nce_refrele(nce);
}
- mutex_exit(&ill->ill_lock);
- mutex_exit(&connp->conn_lock);
-
- /*
- * Up to ARP it goes. The response will come back in ip_wput() as an
- * M_IOCACK, and will be handed to ip_sioctl_iocack() for completion.
- */
- putnext(ill->ill_rq, mp1);
/*
* If we created an IPMP ARP entry, mark that we've notified ARP.
@@ -9521,7 +8331,7 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
if (entp != NULL)
ipmp_illgrp_mark_arpent(ill->ill_grp, entp);
- return (EINPROGRESS);
+ return (iocp->ioc_error);
}
/*
@@ -9530,10 +8340,9 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
*/
int
ip_extract_arpreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip,
- cmd_info_t *ci, ipsq_func_t func)
+ cmd_info_t *ci)
{
mblk_t *mp1;
- int err;
sin_t *sin;
conn_t *connp;
ipif_t *ipif;
@@ -9548,7 +8357,7 @@ ip_extract_arpreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip,
/* ioctl comes down on a conn */
ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL);
connp = Q_TO_CONN(q);
- if (connp->conn_af_isv6)
+ if (connp->conn_family == AF_INET6)
return (ENXIO);
ipst = connp->conn_netstack->netstack_ip;
@@ -9575,10 +8384,9 @@ ip_extract_arpreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip,
if (ipip->ipi_cmd_type == XARP_CMD && sdl->sdl_nlen != 0) {
ipif = ipif_lookup_on_name(sdl->sdl_data, sdl->sdl_nlen,
- B_FALSE, &exists, B_FALSE, ALL_ZONES, CONNP_TO_WQ(connp),
- mp, func, &err, ipst);
+ B_FALSE, &exists, B_FALSE, ALL_ZONES, ipst);
if (ipif == NULL)
- return (err);
+ return (ENXIO);
if (ipif->ipif_id != 0) {
ipif_refrele(ipif);
return (ENXIO);
@@ -9591,23 +8399,24 @@ ip_extract_arpreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip,
* find the wrong ill, so we first do an ipif_lookup_addr().
*/
ipif = ipif_lookup_addr(sin->sin_addr.s_addr, NULL, ALL_ZONES,
- CONNP_TO_WQ(connp), mp, func, &err, ipst);
+ ipst);
if (ipif == NULL) {
- ire = ire_ftable_lookup(sin->sin_addr.s_addr, 0, 0,
- IRE_IF_RESOLVER, NULL, NULL, ALL_ZONES, 0, NULL,
- MATCH_IRE_TYPE, ipst);
- if (ire == NULL || ((ill = ire_to_ill(ire)) == NULL)) {
+ ire = ire_ftable_lookup_v4(sin->sin_addr.s_addr,
+ 0, 0, IRE_IF_RESOLVER, NULL, ALL_ZONES,
+ NULL, MATCH_IRE_TYPE, 0, ipst, NULL);
+ if (ire == NULL || ((ill = ire->ire_ill) == NULL)) {
if (ire != NULL)
ire_refrele(ire);
return (ENXIO);
}
+ ASSERT(ire != NULL && ill != NULL);
ipif = ill->ill_ipif;
ipif_refhold(ipif);
ire_refrele(ire);
}
}
- if (ipif->ipif_net_type != IRE_IF_RESOLVER) {
+ if (ipif->ipif_ill->ill_net_type != IRE_IF_RESOLVER) {
ipif_refrele(ipif);
return (ENXIO);
}
@@ -9700,123 +8509,20 @@ ip_sioctl_plink_ipmp(ill_t *ill, int ioccmd)
void
ip_sioctl_plink(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
{
- mblk_t *mp1, *mp2;
+ mblk_t *mp1;
struct linkblk *li;
- struct ipmx_s *ipmxp;
- ill_t *ill;
int ioccmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd;
int err = 0;
- boolean_t entered_ipsq = B_FALSE;
- boolean_t islink;
- ip_stack_t *ipst;
-
- if (CONN_Q(q))
- ipst = CONNQ_TO_IPST(q);
- else
- ipst = ILLQ_TO_IPST(q);
ASSERT(ioccmd == I_PLINK || ioccmd == I_PUNLINK ||
ioccmd == I_LINK || ioccmd == I_UNLINK);
- islink = (ioccmd == I_PLINK || ioccmd == I_LINK);
-
mp1 = mp->b_cont; /* This is the linkblk info */
li = (struct linkblk *)mp1->b_rptr;
- /*
- * ARP has added this special mblk, and the utility is asking us
- * to perform consistency checks, and also atomically set the
- * muxid. Ifconfig is an example. It achieves this by using
- * /dev/arp as the mux to plink the arp stream, and pushes arp on
- * to /dev/udp[6] stream for use as the mux when plinking the IP
- * stream. SIOCSLIFMUXID is not required. See ifconfig.c, arp.c
- * and other comments in this routine for more details.
- */
- mp2 = mp1->b_cont; /* This is added by ARP */
-
- /*
- * If I_{P}LINK/I_{P}UNLINK is issued by a utility other than
- * ifconfig which didn't push ARP on top of the dummy mux, we won't
- * get the special mblk above. For backward compatibility, we
- * request ip_sioctl_plink_ipmod() to skip the consistency checks.
- * The utility will use SIOCSLIFMUXID to store the muxids. This is
- * not atomic, and can leave the streams unplumbable if the utility
- * is interrupted before it does the SIOCSLIFMUXID.
- */
- if (mp2 == NULL) {
- err = ip_sioctl_plink_ipmod(ipsq, q, mp, ioccmd, li, B_FALSE);
- if (err == EINPROGRESS)
- return;
- goto done;
- }
-
- /*
- * This is an I_{P}LINK sent down by ifconfig through the ARP module;
- * ARP has appended this last mblk to tell us whether the lower stream
- * is an arp-dev stream or an IP module stream.
- */
- ipmxp = (struct ipmx_s *)mp2->b_rptr;
- if (ipmxp->ipmx_arpdev_stream) {
- /*
- * The lower stream is the arp-dev stream.
- */
- ill = ill_lookup_on_name(ipmxp->ipmx_name, B_FALSE, B_FALSE,
- q, mp, ip_sioctl_plink, &err, NULL, ipst);
- if (ill == NULL) {
- if (err == EINPROGRESS)
- return;
- err = EINVAL;
- goto done;
- }
-
- if (ipsq == NULL) {
- ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink,
- NEW_OP, B_FALSE);
- if (ipsq == NULL) {
- ill_refrele(ill);
- return;
- }
- entered_ipsq = B_TRUE;
- }
- ASSERT(IAM_WRITER_ILL(ill));
- ill_refrele(ill);
-
- /*
- * To ensure consistency between IP and ARP, the following
- * LIFO scheme is used in plink/punlink. (IP first, ARP last).
- * This is because the muxid's are stored in the IP stream on
- * the ill.
- *
- * I_{P}LINK: ifconfig plinks the IP stream before plinking
- * the ARP stream. On an arp-dev stream, IP checks that it is
- * not yet plinked, and it also checks that the corresponding
- * IP stream is already plinked.
- *
- * I_{P}UNLINK: ifconfig punlinks the ARP stream before
- * punlinking the IP stream. IP does not allow punlink of the
- * IP stream unless the arp stream has been punlinked.
- */
- if ((islink &&
- (ill->ill_arp_muxid != 0 || ill->ill_ip_muxid == 0)) ||
- (!islink && ill->ill_arp_muxid != li->l_index)) {
- err = EINVAL;
- goto done;
- }
-
- if (IS_IPMP(ill) &&
- (err = ip_sioctl_plink_ipmp(ill, ioccmd)) != 0)
- goto done;
-
- ill->ill_arp_muxid = islink ? li->l_index : 0;
- } else {
- /*
- * The lower stream is probably an IP module stream. Do
- * consistency checking.
- */
- err = ip_sioctl_plink_ipmod(ipsq, q, mp, ioccmd, li, B_TRUE);
- if (err == EINPROGRESS)
- return;
- }
+ err = ip_sioctl_plink_ipmod(ipsq, q, mp, ioccmd, li);
+ if (err == EINPROGRESS)
+ return;
done:
if (err == 0)
miocack(q, mp, 0, 0);
@@ -9826,21 +8532,19 @@ done:
/* Conn was refheld in ip_sioctl_copyin_setup */
if (CONN_Q(q))
CONN_OPER_PENDING_DONE(Q_TO_CONN(q));
- if (entered_ipsq)
- ipsq_exit(ipsq);
}
/*
* Process I_{P}LINK and I_{P}UNLINK requests named by `ioccmd' and pointed to
* by `mp' and `li' for the IP module stream (if li->q_bot is in fact an IP
* module stream). If `doconsist' is set, then do the extended consistency
- * checks requested by ifconfig(1M) and (atomically) set ill_ip_muxid here.
+ * checks requested by ifconfig(1M) and (atomically) set ill_muxid here.
* Returns zero on success, EINPROGRESS if the operation is still pending, or
* an error code on failure.
*/
static int
ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd,
- struct linkblk *li, boolean_t doconsist)
+ struct linkblk *li)
{
int err = 0;
ill_t *ill;
@@ -9849,6 +8553,8 @@ ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd,
struct qinit *qinfo;
boolean_t islink = (ioccmd == I_PLINK || ioccmd == I_LINK);
boolean_t entered_ipsq = B_FALSE;
+ boolean_t is_ip = B_FALSE;
+ arl_t *arl;
/*
* Walk the lower stream to verify it's the IP module stream.
@@ -9861,6 +8567,11 @@ ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd,
name = qinfo->qi_minfo->mi_idname;
if (name != NULL && strcmp(name, ip_mod_info.mi_idname) == 0 &&
qinfo->qi_putp != (pfi_t)ip_lwput && ipwq->q_next != NULL) {
+ is_ip = B_TRUE;
+ break;
+ }
+ if (name != NULL && strcmp(name, arp_mod_info.mi_idname) == 0 &&
+ qinfo->qi_putp != (pfi_t)ip_lwput && ipwq->q_next != NULL) {
break;
}
}
@@ -9871,30 +8582,46 @@ ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd,
if (ipwq == NULL)
return (0);
- ill = ipwq->q_ptr;
+ if (!is_ip) {
+ arl = (arl_t *)ipwq->q_ptr;
+ ill = arl_to_ill(arl);
+ if (ill == NULL)
+ return (0);
+ } else {
+ ill = ipwq->q_ptr;
+ }
ASSERT(ill != NULL);
if (ipsq == NULL) {
ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink,
NEW_OP, B_FALSE);
- if (ipsq == NULL)
+ if (ipsq == NULL) {
+ if (!is_ip)
+ ill_refrele(ill);
return (EINPROGRESS);
+ }
entered_ipsq = B_TRUE;
}
ASSERT(IAM_WRITER_ILL(ill));
-
- if (doconsist) {
- /*
- * Consistency checking requires that I_{P}LINK occurs
- * prior to setting ill_ip_muxid, and that I_{P}UNLINK
- * occurs prior to clearing ill_arp_muxid.
- */
- if ((islink && ill->ill_ip_muxid != 0) ||
- (!islink && ill->ill_arp_muxid != 0)) {
- err = EINVAL;
- goto done;
+ mutex_enter(&ill->ill_lock);
+ if (!is_ip) {
+ if (islink && ill->ill_muxid == 0) {
+ /*
+ * Plumbing has to be done with IP plumbed first, arp
+ * second, but here we have arp being plumbed first.
+ */
+ mutex_exit(&ill->ill_lock);
+ ipsq_exit(ipsq);
+ ill_refrele(ill);
+ return (EINVAL);
}
}
+ mutex_exit(&ill->ill_lock);
+ if (!is_ip) {
+ arl->arl_muxid = islink ? li->l_index : 0;
+ ill_refrele(ill);
+ goto done;
+ }
if (IS_IPMP(ill) && (err = ip_sioctl_plink_ipmp(ill, ioccmd)) != 0)
goto done;
@@ -9912,8 +8639,7 @@ ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd,
ill->ill_lmod_cnt++;
}
- if (doconsist)
- ill->ill_ip_muxid = islink ? li->l_index : 0;
+ ill->ill_muxid = islink ? li->l_index : 0;
/*
* Mark the ipsq busy until the capability operations initiated below
@@ -9997,11 +8723,11 @@ ip_sioctl_copyin_resume(ipsq_t *dummy_ipsq, queue_t *q, mblk_t *mp,
}
/*
- * ip_sioctl_copyin_setup is called by ip_wput with any M_IOCTL message
+ * ip_sioctl_copyin_setup is called by ip_wput_nondata with any M_IOCTL message
* that arrives. Most of the IOCTLs are "socket" IOCTLs which we handle
* in either I_STR or TRANSPARENT form, using the mi_copy facility.
* We establish here the size of the block to be copied in. mi_copyin
- * arranges for this to happen, an processing continues in ip_wput with
+ * arranges for this to happen, and processing continues in ip_wput_nondata with
* an M_IOCDATA message.
*/
void
@@ -10054,17 +8780,7 @@ ip_sioctl_copyin_setup(queue_t *q, mblk_t *mp)
* will fail all ioctls).
*/
if ((q->q_next != NULL) && !(ipip->ipi_flags & IPI_MODOK)) {
- if (ipip->ipi_flags & IPI_PASS_DOWN) {
- /*
- * Pass common Streams ioctls which the IP
- * module does not own or consume along to
- * be processed down stream.
- */
- putnext(q, mp);
- return;
- } else {
- goto nak;
- }
+ goto nak;
}
/* Make sure we have ioctl data to process. */
@@ -10216,286 +8932,62 @@ nak:
qreply(q, mp);
}
-/* ip_wput hands off ARP IOCTL responses to us */
-/* ARGSUSED3 */
-void
-ip_sioctl_iocack(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
+static void
+ip_sioctl_garp_reply(mblk_t *mp, ill_t *ill, void *hwaddr, int flags)
{
struct arpreq *ar;
struct xarpreq *xar;
- area_t *area;
- mblk_t *area_mp;
+ mblk_t *tmp;
struct iocblk *iocp;
- mblk_t *orig_ioc_mp, *tmp;
- struct iocblk *orig_iocp;
- ill_t *ill;
- conn_t *connp = NULL;
- mblk_t *pending_mp;
- int x_arp_ioctl = B_FALSE, ifx_arp_ioctl = B_FALSE;
+ int x_arp_ioctl = B_FALSE;
int *flagsp;
char *storage = NULL;
- sin_t *sin;
- ipaddr_t addr;
- int err;
- ip_stack_t *ipst;
- ASSERT(ipsq == NULL || IAM_WRITER_IPSQ(ipsq));
- ill = q->q_ptr;
ASSERT(ill != NULL);
- ipst = ill->ill_ipst;
-
- /*
- * We should get back from ARP a packet chain that looks like:
- * M_IOCACK-->ARP_op_MBLK-->ORIG_M_IOCTL-->MI_COPY_MBLK-->[X]ARPREQ_MBLK
- */
- if (!(area_mp = mp->b_cont) ||
- (area_mp->b_wptr - area_mp->b_rptr) < sizeof (ip_sock_ar_t) ||
- !(orig_ioc_mp = area_mp->b_cont) ||
- !orig_ioc_mp->b_cont || !orig_ioc_mp->b_cont->b_cont) {
- freemsg(mp);
- return;
- }
- orig_iocp = (struct iocblk *)orig_ioc_mp->b_rptr;
+ iocp = (struct iocblk *)mp->b_rptr;
+ ASSERT(iocp->ioc_cmd == SIOCGXARP || iocp->ioc_cmd == SIOCGARP);
- tmp = (orig_ioc_mp->b_cont)->b_cont;
- if ((orig_iocp->ioc_cmd == SIOCGXARP) ||
- (orig_iocp->ioc_cmd == SIOCSXARP) ||
- (orig_iocp->ioc_cmd == SIOCDXARP)) {
+ tmp = (mp->b_cont)->b_cont; /* xarpreq/arpreq */
+ if ((iocp->ioc_cmd == SIOCGXARP) ||
+ (iocp->ioc_cmd == SIOCSXARP)) {
x_arp_ioctl = B_TRUE;
xar = (struct xarpreq *)tmp->b_rptr;
- sin = (sin_t *)&xar->xarp_pa;
flagsp = &xar->xarp_flags;
storage = xar->xarp_ha.sdl_data;
- if (xar->xarp_ha.sdl_nlen != 0)
- ifx_arp_ioctl = B_TRUE;
} else {
ar = (struct arpreq *)tmp->b_rptr;
- sin = (sin_t *)&ar->arp_pa;
flagsp = &ar->arp_flags;
storage = ar->arp_ha.sa_data;
}
- iocp = (struct iocblk *)mp->b_rptr;
-
- /*
- * Find the pending message; if we're exclusive, it'll be on our IPSQ.
- * Otherwise, we can find it from our ioc_id.
- */
- if (ipsq != NULL)
- pending_mp = ipsq_pending_mp_get(ipsq, &connp);
- else
- pending_mp = ill_pending_mp_get(ill, &connp, iocp->ioc_id);
-
- if (pending_mp == NULL) {
- ASSERT(connp == NULL);
- inet_freemsg(mp);
- return;
- }
- ASSERT(connp != NULL);
- q = CONNP_TO_WQ(connp);
-
- /* Uncouple the internally generated IOCTL from the original one */
- area = (area_t *)area_mp->b_rptr;
- area_mp->b_cont = NULL;
-
- /*
- * Restore the b_next and b_prev used by mi code. This is needed
- * to complete the ioctl using mi* functions. We stored them in
- * the pending mp prior to sending the request to ARP.
- */
- orig_ioc_mp->b_cont->b_next = pending_mp->b_cont->b_next;
- orig_ioc_mp->b_cont->b_prev = pending_mp->b_cont->b_prev;
- inet_freemsg(pending_mp);
-
/*
- * We're done if there was an error or if this is not an SIOCG{X}ARP
- * Catch the case where there is an IRE_CACHE by no entry in the
- * arp table.
- */
- addr = sin->sin_addr.s_addr;
- if (iocp->ioc_error && iocp->ioc_cmd == AR_ENTRY_SQUERY) {
- ire_t *ire;
- dl_unitdata_req_t *dlup;
- mblk_t *llmp;
- int addr_len;
- ill_t *ipsqill = NULL;
-
- if (ifx_arp_ioctl) {
- /*
- * There's no need to lookup the ill, since
- * we've already done that when we started
- * processing the ioctl and sent the message
- * to ARP on that ill. So use the ill that
- * is stored in q->q_ptr.
- */
- ipsqill = ill;
- ire = ire_ctable_lookup(addr, 0, IRE_CACHE,
- ipsqill->ill_ipif, ALL_ZONES,
- NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst);
- } else {
- ire = ire_ctable_lookup(addr, 0, IRE_CACHE,
- NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
- if (ire != NULL)
- ipsqill = ire_to_ill(ire);
- }
-
- if ((x_arp_ioctl) && (ipsqill != NULL))
- storage += ill_xarp_info(&xar->xarp_ha, ipsqill);
-
- if (ire != NULL) {
- /*
- * Since the ire obtained from cachetable is used for
- * mac addr copying below, treat an incomplete ire as if
- * as if we never found it.
- */
- if (ire->ire_nce != NULL &&
- ire->ire_nce->nce_state != ND_REACHABLE) {
- ire_refrele(ire);
- ire = NULL;
- ipsqill = NULL;
- goto errack;
- }
- *flagsp = ATF_INUSE;
- llmp = (ire->ire_nce != NULL ?
- ire->ire_nce->nce_res_mp : NULL);
- if (llmp != NULL && ipsqill != NULL) {
- uchar_t *macaddr;
-
- addr_len = ipsqill->ill_phys_addr_length;
- if (x_arp_ioctl && ((addr_len +
- ipsqill->ill_name_length) >
- sizeof (xar->xarp_ha.sdl_data))) {
- ire_refrele(ire);
- freemsg(mp);
- ip_ioctl_finish(q, orig_ioc_mp,
- EINVAL, NO_COPYOUT, ipsq);
- return;
- }
- *flagsp |= ATF_COM;
- dlup = (dl_unitdata_req_t *)llmp->b_rptr;
- if (ipsqill->ill_sap_length < 0)
- macaddr = llmp->b_rptr +
- dlup->dl_dest_addr_offset;
- else
- macaddr = llmp->b_rptr +
- dlup->dl_dest_addr_offset +
- ipsqill->ill_sap_length;
- /*
- * For SIOCGARP, MAC address length
- * validation has already been done
- * before the ioctl was issued to ARP to
- * allow it to progress only on 6 byte
- * addressable (ethernet like) media. Thus
- * the mac address copying can not overwrite
- * the sa_data area below.
- */
- bcopy(macaddr, storage, addr_len);
- }
- /* Ditch the internal IOCTL. */
- freemsg(mp);
- ire_refrele(ire);
- ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, ipsq);
- return;
- }
- }
-
- /*
- * If this was a failed AR_ENTRY_ADD or a successful AR_ENTRY_DELETE
- * on the IPMP meta-interface, ensure any ARP entries added in
- * ip_sioctl_arp() are deleted.
- */
- if (IS_IPMP(ill) &&
- ((iocp->ioc_error != 0 && iocp->ioc_cmd == AR_ENTRY_ADD) ||
- ((iocp->ioc_error == 0 && iocp->ioc_cmd == AR_ENTRY_DELETE)))) {
- ipmp_illgrp_t *illg = ill->ill_grp;
- ipmp_arpent_t *entp;
-
- if ((entp = ipmp_illgrp_lookup_arpent(illg, &addr)) != NULL)
- ipmp_illgrp_destroy_arpent(illg, entp);
- }
-
- /*
- * Delete the coresponding IRE_CACHE if any.
- * Reset the error if there was one (in case there was no entry
- * in arp.)
- */
- if (iocp->ioc_cmd == AR_ENTRY_DELETE) {
- ipif_t *ipintf = NULL;
-
- if (ifx_arp_ioctl) {
- /*
- * There's no need to lookup the ill, since
- * we've already done that when we started
- * processing the ioctl and sent the message
- * to ARP on that ill. So use the ill that
- * is stored in q->q_ptr.
- */
- ipintf = ill->ill_ipif;
- }
- if (ip_ire_clookup_and_delete(addr, ipintf, ipst)) {
- /*
- * The address in "addr" may be an entry for a
- * router. If that's true, then any off-net
- * IRE_CACHE entries that go through the router
- * with address "addr" must be clobbered. Use
- * ire_walk to achieve this goal.
- */
- if (ifx_arp_ioctl)
- ire_walk_ill_v4(MATCH_IRE_ILL, 0,
- ire_delete_cache_gw, (char *)&addr, ill);
- else
- ire_walk_v4(ire_delete_cache_gw, (char *)&addr,
- ALL_ZONES, ipst);
- iocp->ioc_error = 0;
- }
- }
-errack:
- if (iocp->ioc_error || iocp->ioc_cmd != AR_ENTRY_SQUERY) {
- err = iocp->ioc_error;
- freemsg(mp);
- ip_ioctl_finish(q, orig_ioc_mp, err, NO_COPYOUT, ipsq);
- return;
- }
-
- /*
- * Completion of an SIOCG{X}ARP. Translate the information from
- * the area_t into the struct {x}arpreq.
+ * We're done if this is not an SIOCG{X}ARP
*/
if (x_arp_ioctl) {
storage += ill_xarp_info(&xar->xarp_ha, ill);
if ((ill->ill_phys_addr_length + ill->ill_name_length) >
sizeof (xar->xarp_ha.sdl_data)) {
- freemsg(mp);
- ip_ioctl_finish(q, orig_ioc_mp, EINVAL, NO_COPYOUT,
- ipsq);
+ iocp->ioc_error = EINVAL;
return;
}
}
*flagsp = ATF_INUSE;
- if (area->area_flags & ACE_F_PERMANENT)
- *flagsp |= ATF_PERM;
- if (area->area_flags & ACE_F_PUBLISH)
- *flagsp |= ATF_PUBL;
- if (area->area_flags & ACE_F_AUTHORITY)
+ /*
+ * If /sbin/arp told us we are the authority using the "permanent"
+ * flag, or if this is one of my addresses, print "permanent"
+ * in the /sbin/arp output.
+ */
+ if ((flags & NCE_F_MYADDR) || (flags & NCE_F_AUTHORITY))
*flagsp |= ATF_AUTHORITY;
- if (area->area_hw_addr_length != 0) {
+ if (flags & NCE_F_NONUD)
+ *flagsp |= ATF_PERM; /* not subject to aging */
+ if (flags & NCE_F_PUBLISH)
+ *flagsp |= ATF_PUBL;
+ if (hwaddr != NULL) {
*flagsp |= ATF_COM;
- /*
- * For SIOCGARP, MAC address length validation has
- * already been done before the ioctl was issued to ARP
- * to allow it to progress only on 6 byte addressable
- * (ethernet like) media. Thus the mac address copying
- * can not overwrite the sa_data area below.
- */
- bcopy((char *)area + area->area_hw_addr_offset,
- storage, area->area_hw_addr_length);
+ bcopy((char *)hwaddr, storage, ill->ill_phys_addr_length);
}
-
- /* Ditch the internal IOCTL. */
- freemsg(mp);
- /* Complete the original. */
- ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, ipsq);
}
/*
@@ -10552,7 +9044,7 @@ ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
name = lifr->lifr_name;
ASSERT(CONN_Q(q));
connp = Q_TO_CONN(q);
- isv6 = connp->conn_af_isv6;
+ isv6 = (connp->conn_family == AF_INET6);
zoneid = connp->conn_zoneid;
namelen = mi_strlen(name);
if (namelen == 0)
@@ -10567,7 +9059,7 @@ ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
* for the last 4 args to ipif_lookup_name.
*/
ipif = ipif_lookup_on_name(lifr->lifr_name, namelen, B_TRUE,
- &exists, isv6, zoneid, NULL, NULL, NULL, NULL, ipst);
+ &exists, isv6, zoneid, ipst);
/* Prevent any further action */
if (ipif == NULL) {
return (ENOBUFS);
@@ -10605,12 +9097,11 @@ ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
break;
}
}
- ill = ill_lookup_on_name(name, B_FALSE, isv6,
- CONNP_TO_WQ(connp), mp, ip_process_ioctl, &err, NULL, ipst);
+ ill = ill_lookup_on_name(name, B_FALSE, isv6, NULL, ipst);
if (found_sep)
*cp = IPIF_SEPARATOR_CHAR;
if (ill == NULL)
- return (err);
+ return (ENXIO);
}
ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_process_ioctl, NEW_OP,
@@ -10687,7 +9178,7 @@ ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
ASSERT(q->q_next == NULL);
ip1dbg(("ip_sioctl_remove_if(%s:%u %p)\n",
- ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
+ ill->ill_name, ipif->ipif_id, (void *)ipif));
ASSERT(IAM_WRITER_IPIF(ipif));
connp = Q_TO_CONN(q);
@@ -10703,7 +9194,7 @@ ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
* same as any other interface (meaning it skips the code directly
* below).
*/
- if (ipif->ipif_id == 0 && ipif->ipif_net_type == IRE_LOOPBACK) {
+ if (ipif->ipif_id == 0 && ill->ill_net_type == IRE_LOOPBACK) {
if (sin->sin_family == AF_UNSPEC &&
(IN6_IS_ADDR_UNSPECIFIED(&((sin6_t *)sin)->sin6_addr))) {
/*
@@ -10802,7 +9293,7 @@ ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
mutex_exit(&ill->ill_lock);
mutex_exit(&connp->conn_lock);
ipif_non_duplicate(ipif);
- ipif_down_tail(ipif);
+ (void) ipif_down_tail(ipif);
ipif_free_tail(ipif); /* frees ipif */
return (0);
}
@@ -10833,7 +9324,7 @@ ip_sioctl_removeif_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q,
ip1dbg(("ip_sioctl_removeif_restart(%s:%u %p)\n",
ill->ill_name, ipif->ipif_id, (void *)ipif));
- if (ipif->ipif_id == 0 && ipif->ipif_net_type == IRE_LOOPBACK) {
+ if (ipif->ipif_id == 0 && ill->ill_net_type == IRE_LOOPBACK) {
ASSERT(ill->ill_state_flags & ILL_CONDEMNED);
ill_delete_tail(ill);
mi_free(ill);
@@ -10841,10 +9332,9 @@ ip_sioctl_removeif_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q,
}
ipif_non_duplicate(ipif);
- ipif_down_tail(ipif);
+ (void) ipif_down_tail(ipif);
ipif_free_tail(ipif);
- ILL_UNMARK_CHANGING(ill);
return (0);
}
@@ -10930,8 +9420,6 @@ ip_sioctl_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
* we have net and subnet bcast ire's for
* the old address if we need them.
*/
- if (!ipif->ipif_isv6)
- ipif_check_bcast_ires(ipif);
/*
* If the interface is already marked up,
* we call ipif_down which will take care
@@ -10941,7 +9429,7 @@ ip_sioctl_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
err = ipif_logical_down(ipif, q, mp);
if (err == EINPROGRESS)
return (err);
- ipif_down_tail(ipif);
+ (void) ipif_down_tail(ipif);
need_up = 1;
}
@@ -10988,11 +9476,6 @@ ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
ov6addr = ipif->ipif_v6lcl_addr;
ipif->ipif_v6lcl_addr = v6addr;
sctp_update_ipif_addr(ipif, ov6addr);
- if (ipif->ipif_flags & (IPIF_ANYCAST | IPIF_NOLOCAL)) {
- ipif->ipif_v6src_addr = ipv6_all_zeros;
- } else {
- ipif->ipif_v6src_addr = v6addr;
- }
ipif->ipif_addr_ready = 0;
/*
@@ -11050,12 +9533,22 @@ ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
* ip_rput_dlpi when we see the DL_BIND_ACK.
*/
err = ipif_up(ipif, q, mp);
+ } else {
+ /* Perhaps ilgs should use this ill */
+ update_conn_ill(NULL, ill->ill_ipst);
}
if (need_dl_down)
ill_dl_down(ill);
- if (need_arp_down)
- ipif_resolver_down(ipif);
+
+ if (need_arp_down && !ill->ill_isv6)
+ (void) ipif_arp_down(ipif);
+
+ /*
+ * The default multicast interface might have changed (for
+ * instance if the IPv6 scope of the address changed)
+ */
+ ire_increment_multicast_generation(ill->ill_ipst, ill->ill_isv6);
return (err);
}
@@ -11072,7 +9565,7 @@ ip_sioctl_addr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
ip1dbg(("ip_sioctl_addr_restart(%s:%u %p)\n",
ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
ASSERT(IAM_WRITER_IPIF(ipif));
- ipif_down_tail(ipif);
+ (void) ipif_down_tail(ipif);
return (ip_sioctl_addr_tail(ipif, sin, q, mp, B_TRUE));
}
@@ -11162,7 +9655,7 @@ ip_sioctl_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
err = ipif_logical_down(ipif, q, mp);
if (err == EINPROGRESS)
return (err);
- ipif_down_tail(ipif);
+ (void) ipif_down_tail(ipif);
need_up = B_TRUE;
}
/*
@@ -11254,8 +9747,8 @@ ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
if (need_dl_down)
ill_dl_down(ill);
- if (need_arp_down)
- ipif_resolver_down(ipif);
+ if (need_arp_down && !ipif->ipif_isv6)
+ (void) ipif_arp_down(ipif);
return (err);
}
@@ -11271,7 +9764,7 @@ ip_sioctl_dstaddr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
{
ip1dbg(("ip_sioctl_dstaddr_restart(%s:%u %p)\n",
ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
- ipif_down_tail(ipif);
+ (void) ipif_down_tail(ipif);
return (ip_sioctl_dstaddr_tail(ipif, sin, q, mp, B_TRUE));
}
@@ -11333,7 +9826,6 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
struct ifreq *ifr;
struct lifreq *lifr;
boolean_t set_linklocal = B_FALSE;
- boolean_t zero_source = B_FALSE;
ip1dbg(("ip_sioctl_flags(%s:%u %p)\n",
ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
@@ -11345,7 +9837,7 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
if (ipip->ipi_cmd_type == IF_CMD) {
ifr = (struct ifreq *)if_req;
- flags = (uint64_t)(ifr->ifr_flags & 0x0000ffff);
+ flags = (uint64_t)(ifr->ifr_flags & 0x0000ffff);
} else {
lifr = (struct lifreq *)if_req;
flags = lifr->lifr_flags;
@@ -11425,10 +9917,10 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
}
/*
- * Only allow the IFF_XRESOLV and IFF_TEMPORARY flags to be set on
+ * Only allow the IFF_TEMPORARY flag to be set on
* IPv6 interfaces.
*/
- if ((turn_on & (IFF_XRESOLV|IFF_TEMPORARY)) && !(ipif->ipif_isv6))
+ if ((turn_on & IFF_TEMPORARY) && !(ipif->ipif_isv6))
return (EINVAL);
/*
@@ -11444,9 +9936,6 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
if ((turn_on & IFF_ROUTER) && (phyi->phyint_flags & PHYI_LOOPBACK))
return (EINVAL);
- if (flags & (IFF_NOLOCAL|IFF_ANYCAST))
- zero_source = B_TRUE;
-
/*
* For IPv6 ipif_id 0, don't allow the interface to be up without
* a link local address if IFF_NOLOCAL or IFF_ANYCAST are not set.
@@ -11454,7 +9943,7 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
* set later on in this function.
*/
if (ipif->ipif_id == 0 && ipif->ipif_isv6 &&
- (flags & IFF_UP) && !zero_source &&
+ (flags & IFF_UP) && !(flags & (IFF_NOLOCAL|IFF_ANYCAST)) &&
IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) {
if (ipif_cant_setlinklocal(ipif))
return (EINVAL);
@@ -11560,13 +10049,15 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
ill_ipif, RTSQ_DEFAULT);
}
}
+ /* The default multicast interface might have changed */
+ ire_increment_multicast_generation(ill->ill_ipst,
+ ill->ill_isv6);
+
return (0);
- } else if (set_linklocal || zero_source) {
+ } else if (set_linklocal) {
mutex_enter(&ill->ill_lock);
if (set_linklocal)
ipif->ipif_state_flags |= IPIF_SET_LINKLOCAL;
- if (zero_source)
- ipif->ipif_state_flags |= IPIF_ZERO_SOURCE;
mutex_exit(&ill->ill_lock);
}
@@ -11610,13 +10101,10 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
ILLF_NONUD|IPIF_PRIVATE|IPIF_ANYCAST|IPIF_PREFERRED|
IPIF_NOFAILOVER)) {
/*
- * Taking this ipif down, make sure we have
- * valid net and subnet bcast ire's for other
- * logical interfaces, if we need them.
+ * ipif_down() will ire_delete bcast ire's for the subnet,
+ * while the ire_identical_ref tracks the case of IRE_BROADCAST
+ * entries shared between multiple ipifs on the same subnet.
*/
- if (!ipif->ipif_isv6)
- ipif_check_bcast_ires(ipif);
-
if (((ipif->ipif_flags | turn_on) & IPIF_UP) &&
!(turn_off & IPIF_UP)) {
if (ipif->ipif_flags & IPIF_UP)
@@ -11627,7 +10115,7 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
ip1dbg(("ipif_down returns %d err ", err));
if (err == EINPROGRESS)
return (err);
- ipif_down_tail(ipif);
+ (void) ipif_down_tail(ipif);
}
return (ip_sioctl_flags_tail(ipif, flags, q, mp));
}
@@ -11642,7 +10130,6 @@ ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp)
boolean_t phyint_flags_modified = B_FALSE;
int err = 0;
boolean_t set_linklocal = B_FALSE;
- boolean_t zero_source = B_FALSE;
ip1dbg(("ip_sioctl_flags_tail(%s:%u)\n",
ipif->ipif_ill->ill_name, ipif->ipif_id));
@@ -11680,21 +10167,13 @@ ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp)
set_linklocal = B_TRUE;
ipif->ipif_state_flags &= ~IPIF_SET_LINKLOCAL;
}
- if (ipif->ipif_state_flags & IPIF_ZERO_SOURCE) {
- zero_source = B_TRUE;
- ipif->ipif_state_flags &= ~IPIF_ZERO_SOURCE;
- }
+
mutex_exit(&ill->ill_lock);
mutex_exit(&phyi->phyint_lock);
if (set_linklocal)
(void) ipif_setlinklocal(ipif);
- if (zero_source)
- ipif->ipif_v6src_addr = ipv6_all_zeros;
- else
- ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr;
-
/*
* PHYI_FAILED, PHYI_INACTIVE, and PHYI_OFFLINE are all the same to
* the kernel: if any of them has been set by userland, the interface
@@ -11744,6 +10223,9 @@ ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp)
*/
sctp_update_ipif(ipif, SCTP_IPIF_UPDATE);
}
+
+ /* The default multicast interface might have changed */
+ ire_increment_multicast_generation(ill->ill_ipst, ill->ill_isv6);
return (err);
}
@@ -11762,7 +10244,7 @@ ip_sioctl_flags_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
ip1dbg(("ip_sioctl_flags_restart(%s:%u %p)\n",
ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
- ipif_down_tail(ipif);
+ (void) ipif_down_tail(ipif);
if (ipip->ipi_cmd_type == IF_CMD) {
/* cast to uint16_t prevents unwanted sign extension */
flags = (uint16_t)ifr->ifr_flags;
@@ -11814,6 +10296,10 @@ ip_sioctl_get_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
return (0);
}
+/*
+ * We allow the MTU to be set on an ILL, but not have it be different
+ * for different IPIFs since we don't actually send packets on IPIFs.
+ */
/* ARGSUSED */
int
ip_sioctl_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
@@ -11823,8 +10309,7 @@ ip_sioctl_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
int ip_min_mtu;
struct ifreq *ifr;
struct lifreq *lifr;
- ire_t *ire;
- ip_stack_t *ipst;
+ ill_t *ill;
ip1dbg(("ip_sioctl_mtu(%s:%u %p)\n", ipif->ipif_ill->ill_name,
ipif->ipif_id, (void *)ipif));
@@ -11835,48 +10320,35 @@ ip_sioctl_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
lifr = (struct lifreq *)if_req;
mtu = lifr->lifr_mtu;
}
+ /* Only allow for logical unit zero i.e. not on "bge0:17" */
+ if (ipif->ipif_id != 0)
+ return (EINVAL);
+ ill = ipif->ipif_ill;
if (ipif->ipif_isv6)
ip_min_mtu = IPV6_MIN_MTU;
else
ip_min_mtu = IP_MIN_MTU;
- if (mtu > ipif->ipif_ill->ill_max_frag || mtu < ip_min_mtu)
+ mutex_enter(&ill->ill_lock);
+ if (mtu > ill->ill_max_frag || mtu < ip_min_mtu) {
+ mutex_exit(&ill->ill_lock);
return (EINVAL);
+ }
+ /*
+ * The dce and fragmentation code can handle changes to ill_mtu
+ * concurrent with sending/fragmenting packets.
+ */
+ ill->ill_mtu = mtu;
+ ill->ill_flags |= ILLF_FIXEDMTU;
+ mutex_exit(&ill->ill_lock);
/*
- * Change the MTU size in all relevant ire's.
- * Mtu change Vs. new ire creation - protocol below.
- * First change ipif_mtu and the ire_max_frag of the
- * interface ire. Then do an ire walk and change the
- * ire_max_frag of all affected ires. During ire_add
- * under the bucket lock, set the ire_max_frag of the
- * new ire being created from the ipif/ire from which
- * it is being derived. If an mtu change happens after
- * the ire is added, the new ire will be cleaned up.
- * Conversely if the mtu change happens before the ire
- * is added, ire_add will see the new value of the mtu.
+ * Make sure all dce_generation checks find out
+ * that ill_mtu has changed.
*/
- ipif->ipif_mtu = mtu;
- ipif->ipif_flags |= IPIF_FIXEDMTU;
+ dce_increment_all_generations(ill->ill_isv6, ill->ill_ipst);
- if (ipif->ipif_isv6)
- ire = ipif_to_ire_v6(ipif);
- else
- ire = ipif_to_ire(ipif);
- if (ire != NULL) {
- ire->ire_max_frag = ipif->ipif_mtu;
- ire_refrele(ire);
- }
- ipst = ipif->ipif_ill->ill_ipst;
- if (ipif->ipif_flags & IPIF_UP) {
- if (ipif->ipif_isv6)
- ire_walk_v6(ipif_mtu_change, (char *)ipif, ALL_ZONES,
- ipst);
- else
- ire_walk_v4(ipif_mtu_change, (char *)ipif, ALL_ZONES,
- ipst);
- }
/* Update the MTU in SCTP's list */
sctp_update_ipif(ipif, SCTP_IPIF_UPDATE);
return (0);
@@ -11893,12 +10365,17 @@ ip_sioctl_get_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
ip1dbg(("ip_sioctl_get_mtu(%s:%u %p)\n",
ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
+
+ /*
+ * We allow a get on any logical interface even though the set
+ * can only be done on logical unit 0.
+ */
if (ipip->ipi_cmd_type == IF_CMD) {
ifr = (struct ifreq *)if_req;
- ifr->ifr_metric = ipif->ipif_mtu;
+ ifr->ifr_metric = ipif->ipif_ill->ill_mtu;
} else {
lifr = (struct lifreq *)if_req;
- lifr->lifr_mtu = ipif->ipif_mtu;
+ lifr->lifr_mtu = ipif->ipif_ill->ill_mtu;
}
return (0);
}
@@ -11911,9 +10388,10 @@ ip_sioctl_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
{
ipaddr_t addr;
ire_t *ire;
- ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
+ ill_t *ill = ipif->ipif_ill;
+ ip_stack_t *ipst = ill->ill_ipst;
- ip1dbg(("ip_sioctl_brdaddr(%s:%u)\n", ipif->ipif_ill->ill_name,
+ ip1dbg(("ip_sioctl_brdaddr(%s:%u)\n", ill->ill_name,
ipif->ipif_id));
ASSERT(IAM_WRITER_IPIF(ipif));
@@ -11931,12 +10409,10 @@ ip_sioctl_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
* If we are already up, make sure the new
* broadcast address makes sense. If it does,
* there should be an IRE for it already.
- * Don't match on ipif, only on the ill
- * since we are sharing these now.
*/
- ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST,
- ipif, ALL_ZONES, NULL,
- (MATCH_IRE_ILL | MATCH_IRE_TYPE), ipst);
+ ire = ire_ftable_lookup_v4(addr, 0, 0, IRE_BROADCAST,
+ ill, ipif->ipif_zoneid, NULL,
+ (MATCH_IRE_ILL | MATCH_IRE_TYPE), 0, ipst, NULL);
if (ire == NULL) {
return (EINVAL);
} else {
@@ -11944,13 +10420,13 @@ ip_sioctl_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
}
}
/*
- * Changing the broadcast addr for this ipif.
- * Make sure we have valid net and subnet bcast
- * ire's for other logical interfaces, if needed.
+ * Changing the broadcast addr for this ipif. Since the IRE_BROADCAST
+ * needs to already exist we never need to change the set of
+ * IRE_BROADCASTs when we are UP.
*/
if (addr != ipif->ipif_brd_addr)
- ipif_check_bcast_ires(ipif);
- IN6_IPADDR_TO_V4MAPPED(addr, &ipif->ipif_v6brd_addr);
+ IN6_IPADDR_TO_V4MAPPED(addr, &ipif->ipif_v6brd_addr);
+
return (0);
}
@@ -12026,13 +10502,10 @@ ip_sioctl_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
* Make sure we have valid net and subnet broadcast ire's
* for the old netmask, if needed by other logical interfaces.
*/
- if (!ipif->ipif_isv6)
- ipif_check_bcast_ires(ipif);
-
err = ipif_logical_down(ipif, q, mp);
if (err == EINPROGRESS)
return (err);
- ipif_down_tail(ipif);
+ (void) ipif_down_tail(ipif);
err = ip_sioctl_netmask_tail(ipif, sin, q, mp);
return (err);
}
@@ -12087,7 +10560,7 @@ ip_sioctl_netmask_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
{
ip1dbg(("ip_sioctl_netmask_restart(%s:%u %p)\n",
ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
- ipif_down_tail(ipif);
+ (void) ipif_down_tail(ipif);
return (ip_sioctl_netmask_tail(ipif, sin, q, mp));
}
@@ -12188,6 +10661,7 @@ int
ip_sioctl_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
ip_ioctl_cmd_t *ipip, void *if_req)
{
+ int arp_muxid;
ip1dbg(("ip_sioctl_muxid(%s:%u %p)\n",
ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
@@ -12197,14 +10671,15 @@ ip_sioctl_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
if (ipip->ipi_cmd_type == IF_CMD) {
struct ifreq *ifr = (struct ifreq *)if_req;
- ipif->ipif_ill->ill_ip_muxid = ifr->ifr_ip_muxid;
- ipif->ipif_ill->ill_arp_muxid = ifr->ifr_arp_muxid;
+ ipif->ipif_ill->ill_muxid = ifr->ifr_ip_muxid;
+ arp_muxid = ifr->ifr_arp_muxid;
} else {
struct lifreq *lifr = (struct lifreq *)if_req;
- ipif->ipif_ill->ill_ip_muxid = lifr->lifr_ip_muxid;
- ipif->ipif_ill->ill_arp_muxid = lifr->lifr_arp_muxid;
+ ipif->ipif_ill->ill_muxid = lifr->lifr_ip_muxid;
+ arp_muxid = lifr->lifr_arp_muxid;
}
+ arl_set_muxid(ipif->ipif_ill, arp_muxid);
return (0);
}
@@ -12213,22 +10688,24 @@ int
ip_sioctl_get_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
ip_ioctl_cmd_t *ipip, void *if_req)
{
+ int arp_muxid = 0;
ip1dbg(("ip_sioctl_get_muxid(%s:%u %p)\n",
ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
/*
* Get the muxid saved in ill for I_PUNLINK.
*/
+ arp_muxid = arl_get_muxid(ipif->ipif_ill);
if (ipip->ipi_cmd_type == IF_CMD) {
struct ifreq *ifr = (struct ifreq *)if_req;
- ifr->ifr_ip_muxid = ipif->ipif_ill->ill_ip_muxid;
- ifr->ifr_arp_muxid = ipif->ipif_ill->ill_arp_muxid;
+ ifr->ifr_ip_muxid = ipif->ipif_ill->ill_muxid;
+ ifr->ifr_arp_muxid = arp_muxid;
} else {
struct lifreq *lifr = (struct lifreq *)if_req;
- lifr->lifr_ip_muxid = ipif->ipif_ill->ill_ip_muxid;
- lifr->lifr_arp_muxid = ipif->ipif_ill->ill_arp_muxid;
+ lifr->lifr_ip_muxid = ipif->ipif_ill->ill_muxid;
+ lifr->lifr_arp_muxid = arp_muxid;
}
return (0);
}
@@ -12298,7 +10775,7 @@ ip_sioctl_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
err = ipif_logical_down(ipif, q, mp);
if (err == EINPROGRESS)
return (err);
- ipif_down_tail(ipif);
+ (void) ipif_down_tail(ipif);
need_up = B_TRUE;
}
@@ -12353,7 +10830,7 @@ ip_sioctl_subnet_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
ip1dbg(("ip_sioctl_subnet_restart(%s:%u %p)\n",
ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
- ipif_down_tail(ipif);
+ (void) ipif_down_tail(ipif);
addrlen = lifr->lifr_addrlen;
if (ipif->ipif_isv6) {
@@ -12454,7 +10931,7 @@ ip_sioctl_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
err = ipif_logical_down(ipif, q, mp);
if (err == EINPROGRESS)
return (err);
- ipif_down_tail(ipif);
+ (void) ipif_down_tail(ipif);
need_up = B_TRUE;
}
err = ip_sioctl_token_tail(ipif, sin6, addrlen, q, mp, need_up);
@@ -12538,24 +11015,6 @@ ip_sioctl_get_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
/*
* Set (hardware) link specific information that might override
* what was acquired through the DL_INFO_ACK.
- * The logic is as follows.
- *
- * become exclusive
- * set CHANGING flag
- * change mtu on affected IREs
- * clear CHANGING flag
- *
- * An ire add that occurs before the CHANGING flag is set will have its mtu
- * changed by the ip_sioctl_lnkinfo.
- *
- * During the time the CHANGING flag is set, no new ires will be added to the
- * bucket, and ire add will fail (due the CHANGING flag).
- *
- * An ire add that occurs after the CHANGING flag is set will have the right mtu
- * before it is added to the bucket.
- *
- * Obviously only 1 thread can set the CHANGING flag and we need to become
- * exclusive to set the flag.
*/
/* ARGSUSED */
int
@@ -12563,19 +11022,16 @@ ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
ip_ioctl_cmd_t *ipi, void *if_req)
{
ill_t *ill = ipif->ipif_ill;
- ipif_t *nipif;
int ip_min_mtu;
- boolean_t mtu_walk = B_FALSE;
struct lifreq *lifr = (struct lifreq *)if_req;
lif_ifinfo_req_t *lir;
- ire_t *ire;
ip1dbg(("ip_sioctl_lnkinfo(%s:%u %p)\n",
ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
lir = &lifr->lifr_ifinfo;
ASSERT(IAM_WRITER_IPIF(ipif));
- /* Only allow for logical unit zero i.e. not on "le0:17" */
+ /* Only allow for logical unit zero i.e. not on "bge0:17" */
if (ipif->ipif_id != 0)
return (EINVAL);
@@ -12588,9 +11044,20 @@ ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
/*
* Verify values before we set anything. Allow zero to
* mean unspecified.
+ *
+ * XXX We should be able to set the user-defined lir_mtu to some value
+ * that is greater than ill_current_frag but less than ill_max_frag - the
+ * ill_max_frag value tells us the max MTU that can be handled by the
+ * datalink, whereas the ill_current_frag is dynamically computed for
+ * some link-types like tunnels, based on the tunnel PMTU. However,
+ * since there is currently no way of distinguishing between
+ * administratively fixed link mtu values (e.g., those set via
+ * /sbin/dladm) and dynamically discovered MTUs (e.g., those discovered
+ * for tunnels) we conservatively choose the ill_current_frag as the
+ * upper-bound.
*/
if (lir->lir_maxmtu != 0 &&
- (lir->lir_maxmtu > ill->ill_max_frag ||
+ (lir->lir_maxmtu > ill->ill_current_frag ||
lir->lir_maxmtu < ip_min_mtu))
return (EINVAL);
if (lir->lir_reachtime != 0 &&
@@ -12601,18 +11068,12 @@ ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
return (EINVAL);
mutex_enter(&ill->ill_lock);
- ill->ill_state_flags |= ILL_CHANGING;
- for (nipif = ill->ill_ipif; nipif != NULL;
- nipif = nipif->ipif_next) {
- nipif->ipif_state_flags |= IPIF_CHANGING;
- }
-
- if (lir->lir_maxmtu != 0) {
- ill->ill_max_mtu = lir->lir_maxmtu;
+ /*
+ * The dce and fragmentation code can handle changes to ill_mtu
+ * concurrent with sending/fragmenting packets.
+ */
+ if (lir->lir_maxmtu != 0)
ill->ill_user_mtu = lir->lir_maxmtu;
- mtu_walk = B_TRUE;
- }
- mutex_exit(&ill->ill_lock);
if (lir->lir_reachtime != 0)
ill->ill_reachable_time = lir->lir_reachtime;
@@ -12621,47 +11082,29 @@ ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
ill->ill_reachable_retrans_time = lir->lir_reachretrans;
ill->ill_max_hops = lir->lir_maxhops;
-
ill->ill_max_buf = ND_MAX_Q;
-
- if (mtu_walk) {
+ if (!(ill->ill_flags & ILLF_FIXEDMTU) && ill->ill_user_mtu != 0) {
/*
- * Set the MTU on all ipifs associated with this ill except
- * for those whose MTU was fixed via SIOCSLIFMTU.
+ * ill_mtu is the actual interface MTU, obtained as the min
+ * of user-configured mtu and the value announced by the
+ * driver (via DL_NOTE_SDU_SIZE/DL_INFO_ACK). Note that since
+ * we have already made the choice of requiring
+ * ill_user_mtu < ill_current_frag by the time we get here,
+ * the ill_mtu effectively gets assigned to the ill_user_mtu
+ * here.
*/
- for (nipif = ill->ill_ipif; nipif != NULL;
- nipif = nipif->ipif_next) {
- if (nipif->ipif_flags & IPIF_FIXEDMTU)
- continue;
-
- nipif->ipif_mtu = ill->ill_max_mtu;
-
- if (!(nipif->ipif_flags & IPIF_UP))
- continue;
-
- if (nipif->ipif_isv6)
- ire = ipif_to_ire_v6(nipif);
- else
- ire = ipif_to_ire(nipif);
- if (ire != NULL) {
- ire->ire_max_frag = ipif->ipif_mtu;
- ire_refrele(ire);
- }
-
- ire_walk_ill(MATCH_IRE_ILL, 0, ipif_mtu_change,
- nipif, ill);
- }
- }
-
- mutex_enter(&ill->ill_lock);
- for (nipif = ill->ill_ipif; nipif != NULL;
- nipif = nipif->ipif_next) {
- nipif->ipif_state_flags &= ~IPIF_CHANGING;
+ ill->ill_mtu = MIN(ill->ill_current_frag, ill->ill_user_mtu);
}
- ILL_UNMARK_CHANGING(ill);
mutex_exit(&ill->ill_lock);
/*
+ * Make sure all dce_generation checks find out
+ * that ill_mtu has changed.
+ */
+ if (!(ill->ill_flags & ILLF_FIXEDMTU) && (lir->lir_maxmtu != 0))
+ dce_increment_all_generations(ill->ill_isv6, ill->ill_ipst);
+
+ /*
* Refresh IPMP meta-interface MTU if necessary.
*/
if (IS_UNDER_IPMP(ill))
@@ -12687,7 +11130,7 @@ ip_sioctl_get_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
lir->lir_maxhops = ill->ill_max_hops;
lir->lir_reachtime = ill->ill_reachable_time;
lir->lir_reachretrans = ill->ill_reachable_retrans_time;
- lir->lir_maxmtu = ill->ill_max_mtu;
+ lir->lir_maxmtu = ill->ill_mtu;
return (0);
}
@@ -12722,7 +11165,7 @@ ip_subnet_mask(ipaddr_t addr, ipif_t **ipifp, ip_stack_t *ipst)
mutex_enter(&ill->ill_lock);
for (ipif = ill->ill_ipif; ipif != NULL;
ipif = ipif->ipif_next) {
- if (!IPIF_CAN_LOOKUP(ipif))
+ if (IPIF_IS_CONDEMNED(ipif))
continue;
if (!(ipif->ipif_flags & IPIF_UP))
continue;
@@ -12848,29 +11291,9 @@ done:
}
/*
- * Lookup an ipif using the sequence id (ipif_seqid)
+ * Assign a unique id for the ipif. This is used by sctp_addr.c
+ * Note: remove if sctp_addr.c is redone to not shadow ill/ipif data structures.
*/
-ipif_t *
-ipif_lookup_seqid(ill_t *ill, uint_t seqid)
-{
- ipif_t *ipif;
-
- ASSERT(MUTEX_HELD(&ill->ill_lock));
-
- for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
- if (ipif->ipif_seqid == seqid && IPIF_CAN_LOOKUP(ipif))
- return (ipif);
- }
- return (NULL);
-}
-
-/*
- * Assign a unique id for the ipif. This is used later when we send
- * IRES to ARP for resolution where we initialize ire_ipif_seqid
- * to the value pointed by ire_ipif->ipif_seqid. Later when the
- * IRE is added, we verify that ipif has not disappeared.
- */
-
static void
ipif_assign_seqid(ipif_t *ipif)
{
@@ -12893,41 +11316,21 @@ ipif_clone(const ipif_t *sipif, ipif_t *dipif)
ASSERT(!(sipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE)));
ASSERT(!(dipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE)));
ASSERT(sipif->ipif_ire_type == dipif->ipif_ire_type);
- ASSERT(sipif->ipif_arp_del_mp == NULL);
- ASSERT(dipif->ipif_arp_del_mp == NULL);
- ASSERT(sipif->ipif_igmp_rpt == NULL);
- ASSERT(dipif->ipif_igmp_rpt == NULL);
- ASSERT(sipif->ipif_multicast_up == 0);
- ASSERT(dipif->ipif_multicast_up == 0);
- ASSERT(sipif->ipif_joined_allhosts == 0);
- ASSERT(dipif->ipif_joined_allhosts == 0);
-
- dipif->ipif_mtu = sipif->ipif_mtu;
+
dipif->ipif_flags = sipif->ipif_flags;
dipif->ipif_metric = sipif->ipif_metric;
dipif->ipif_zoneid = sipif->ipif_zoneid;
dipif->ipif_v6subnet = sipif->ipif_v6subnet;
dipif->ipif_v6lcl_addr = sipif->ipif_v6lcl_addr;
- dipif->ipif_v6src_addr = sipif->ipif_v6src_addr;
dipif->ipif_v6net_mask = sipif->ipif_v6net_mask;
dipif->ipif_v6brd_addr = sipif->ipif_v6brd_addr;
dipif->ipif_v6pp_dst_addr = sipif->ipif_v6pp_dst_addr;
/*
- * While dipif is down right now, it might've been up before. Since
- * it's changing identity, its packet counters need to be reset.
- */
- dipif->ipif_ib_pkt_count = 0;
- dipif->ipif_ob_pkt_count = 0;
- dipif->ipif_fo_pkt_count = 0;
-
- /*
* As per the comment atop the function, we assume that these sipif
* fields will be changed before sipif is unlocked.
*/
dipif->ipif_seqid = sipif->ipif_seqid;
- dipif->ipif_saved_ire_mp = sipif->ipif_saved_ire_mp;
- dipif->ipif_saved_ire_cnt = sipif->ipif_saved_ire_cnt;
dipif->ipif_state_flags = sipif->ipif_state_flags;
}
@@ -12951,13 +11354,6 @@ ipif_transfer(ipif_t *sipif, ipif_t *dipif, ipif_t *virgipif)
* Grab all of the locks that protect the ipif in a defined order.
*/
GRAB_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill);
- if (sipif > dipif) {
- mutex_enter(&sipif->ipif_saved_ire_lock);
- mutex_enter(&dipif->ipif_saved_ire_lock);
- } else {
- mutex_enter(&dipif->ipif_saved_ire_lock);
- mutex_enter(&sipif->ipif_saved_ire_lock);
- }
ipif_clone(sipif, dipif);
if (virgipif != NULL) {
@@ -12965,8 +11361,6 @@ ipif_transfer(ipif_t *sipif, ipif_t *dipif, ipif_t *virgipif)
mi_free(virgipif);
}
- mutex_exit(&sipif->ipif_saved_ire_lock);
- mutex_exit(&dipif->ipif_saved_ire_lock);
RELEASE_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill);
/*
@@ -13115,10 +11509,7 @@ ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize,
*/
ipif->ipif_zoneid = ill->ill_zoneid;
- mutex_init(&ipif->ipif_saved_ire_lock, NULL, MUTEX_DEFAULT, NULL);
-
ipif->ipif_refcnt = 0;
- ipif->ipif_saved_ire_cnt = 0;
if (insert) {
if (ipif_insert(ipif, ire_type != IRE_LOOPBACK) != 0) {
@@ -13171,8 +11562,6 @@ ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize,
IN6_IPADDR_TO_V4MAPPED(inaddr_any,
&ipif->ipif_v6lcl_addr);
IN6_IPADDR_TO_V4MAPPED(inaddr_any,
- &ipif->ipif_v6src_addr);
- IN6_IPADDR_TO_V4MAPPED(inaddr_any,
&ipif->ipif_v6subnet);
IN6_IPADDR_TO_V4MAPPED(inaddr_any,
&ipif->ipif_v6net_mask);
@@ -13189,8 +11578,6 @@ ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize,
if (!initialize)
goto out;
- ipif->ipif_mtu = ill->ill_max_mtu;
-
/*
* NOTE: The IPMP meta-interface is special-cased because it starts
* with no underlying interfaces (and thus an unknown broadcast
@@ -13236,207 +11623,47 @@ out:
}
/*
- * If appropriate, send a message up to the resolver delete the entry
- * for the address of this interface which is going out of business.
- * (Always called as writer).
- *
- * NOTE : We need to check for NULL mps as some of the fields are
- * initialized only for some interface types. See ipif_resolver_up()
- * for details.
+ * Remove the neighbor cache entries associated with this logical
+ * interface.
*/
-void
-ipif_resolver_down(ipif_t *ipif)
+int
+ipif_arp_down(ipif_t *ipif)
{
- mblk_t *mp;
ill_t *ill = ipif->ipif_ill;
+ int err = 0;
- ip1dbg(("ipif_resolver_down(%s:%u)\n", ill->ill_name, ipif->ipif_id));
+ ip1dbg(("ipif_arp_down(%s:%u)\n", ill->ill_name, ipif->ipif_id));
ASSERT(IAM_WRITER_IPIF(ipif));
- if (ill->ill_isv6 && !(ill->ill_flags & ILLF_XRESOLV))
- return;
-
- /* Delete the mapping for the local address */
- mp = ipif->ipif_arp_del_mp;
- if (mp != NULL) {
- ip1dbg(("ipif_resolver_down: arp cmd %x for %s:%u\n",
- *(unsigned *)mp->b_rptr, ill->ill_name, ipif->ipif_id));
- putnext(ill->ill_rq, mp);
- ipif->ipif_arp_del_mp = NULL;
- }
-
- /*
- * Make IPMP aware of the deleted data address.
- */
- if (IS_IPMP(ill))
- ipmp_illgrp_del_ipif(ill->ill_grp, ipif);
+ DTRACE_PROBE3(ipif__downup, char *, "ipif_arp_down",
+ ill_t *, ill, ipif_t *, ipif);
+ ipif_nce_down(ipif);
/*
* If this is the last ipif that is going down and there are no
* duplicate addresses we may yet attempt to re-probe, then we need to
* clean up ARP completely.
*/
- if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0) {
+ if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0 &&
+ !ill->ill_logical_down && ill->ill_net_type == IRE_IF_RESOLVER) {
/*
* If this was the last ipif on an IPMP interface, purge any
- * IPMP ARP entries associated with it.
+ * static ARP entries associated with it.
*/
if (IS_IPMP(ill))
ipmp_illgrp_refresh_arpent(ill->ill_grp);
- /* Send up AR_INTERFACE_DOWN message */
- mp = ill->ill_arp_down_mp;
- if (mp != NULL) {
- ip1dbg(("ipif_resolver_down: arp cmd %x for %s:%u\n",
- *(unsigned *)mp->b_rptr, ill->ill_name,
- ipif->ipif_id));
- putnext(ill->ill_rq, mp);
- ill->ill_arp_down_mp = NULL;
- }
-
- /* Tell ARP to delete the multicast mappings */
- mp = ill->ill_arp_del_mapping_mp;
- if (mp != NULL) {
- ip1dbg(("ipif_resolver_down: arp cmd %x for %s:%u\n",
- *(unsigned *)mp->b_rptr, ill->ill_name,
- ipif->ipif_id));
- putnext(ill->ill_rq, mp);
- ill->ill_arp_del_mapping_mp = NULL;
- }
+ /* UNBIND, DETACH */
+ err = arp_ll_down(ill);
}
-}
-
-/*
- * Set up the multicast mappings for `ipif' in ARP. If `arp_add_mapping_mp'
- * is non-NULL, then upon success it will contain an mblk that can be passed
- * to ARP to create the mapping. Otherwise, if it's NULL, upon success ARP
- * will have already been notified to create the mapping. Returns zero on
- * success, -1 upon failure.
- */
-int
-ipif_arp_setup_multicast(ipif_t *ipif, mblk_t **arp_add_mapping_mp)
-{
- mblk_t *del_mp = NULL;
- mblk_t *add_mp = NULL;
- mblk_t *mp;
- ill_t *ill = ipif->ipif_ill;
- phyint_t *phyi = ill->ill_phyint;
- ipaddr_t addr, mask, extract_mask = 0;
- arma_t *arma;
- uint8_t *maddr, *bphys_addr;
- uint32_t hw_start;
- dl_unitdata_req_t *dlur;
-
- ASSERT(IAM_WRITER_IPIF(ipif));
- if (ipif->ipif_flags & IPIF_POINTOPOINT)
- return (0);
-
- /*
- * IPMP meta-interfaces don't have any inherent multicast mappings,
- * and instead use the ones on the underlying interfaces.
- */
- if (IS_IPMP(ill))
- return (0);
-
- /*
- * Delete the existing mapping from ARP. Normally, ipif_down() ->
- * ipif_resolver_down() will send this up to ARP, but it may be that
- * we are enabling PHYI_MULTI_BCAST via ip_rput_dlpi_writer().
- */
- mp = ill->ill_arp_del_mapping_mp;
- if (mp != NULL) {
- ip1dbg(("ipif_arp_setup_multicast: arp cmd %x for %s:%u\n",
- *(unsigned *)mp->b_rptr, ill->ill_name, ipif->ipif_id));
- putnext(ill->ill_rq, mp);
- ill->ill_arp_del_mapping_mp = NULL;
- }
-
- if (arp_add_mapping_mp != NULL)
- *arp_add_mapping_mp = NULL;
-
- /*
- * Check that the address is not to long for the constant
- * length reserved in the template arma_t.
- */
- if (ill->ill_phys_addr_length > IP_MAX_HW_LEN)
- return (-1);
-
- /* Add mapping mblk */
- addr = (ipaddr_t)htonl(INADDR_UNSPEC_GROUP);
- mask = (ipaddr_t)htonl(IN_CLASSD_NET);
- add_mp = ill_arp_alloc(ill, (uchar_t *)&ip_arma_multi_template,
- (caddr_t)&addr);
- if (add_mp == NULL)
- return (-1);
- arma = (arma_t *)add_mp->b_rptr;
- maddr = (uint8_t *)arma + arma->arma_hw_addr_offset;
- bcopy(&mask, (char *)arma + arma->arma_proto_mask_offset, IP_ADDR_LEN);
- arma->arma_hw_addr_length = ill->ill_phys_addr_length;
- /*
- * Determine the broadcast address.
- */
- dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr;
- if (ill->ill_sap_length < 0)
- bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset;
- else
- bphys_addr = (uchar_t *)dlur +
- dlur->dl_dest_addr_offset + ill->ill_sap_length;
- /*
- * Check PHYI_MULTI_BCAST and length of physical
- * address to determine if we use the mapping or the
- * broadcast address.
- */
- if (!(phyi->phyint_flags & PHYI_MULTI_BCAST))
- if (!MEDIA_V4MINFO(ill->ill_media, ill->ill_phys_addr_length,
- bphys_addr, maddr, &hw_start, &extract_mask))
- phyi->phyint_flags |= PHYI_MULTI_BCAST;
-
- if ((phyi->phyint_flags & PHYI_MULTI_BCAST) ||
- (ill->ill_flags & ILLF_MULTICAST)) {
- /* Make sure this will not match the "exact" entry. */
- addr = (ipaddr_t)htonl(INADDR_ALLHOSTS_GROUP);
- del_mp = ill_arp_alloc(ill, (uchar_t *)&ip_ared_template,
- (caddr_t)&addr);
- if (del_mp == NULL) {
- freemsg(add_mp);
- return (-1);
- }
- bcopy(&extract_mask, (char *)arma +
- arma->arma_proto_extract_mask_offset, IP_ADDR_LEN);
- if (phyi->phyint_flags & PHYI_MULTI_BCAST) {
- /* Use link-layer broadcast address for MULTI_BCAST */
- bcopy(bphys_addr, maddr, ill->ill_phys_addr_length);
- ip2dbg(("ipif_arp_setup_multicast: adding"
- " MULTI_BCAST ARP setup for %s\n", ill->ill_name));
- } else {
- arma->arma_hw_mapping_start = hw_start;
- ip2dbg(("ipif_arp_setup_multicast: adding multicast"
- " ARP setup for %s\n", ill->ill_name));
- }
- } else {
- freemsg(add_mp);
- ASSERT(del_mp == NULL);
- /* It is neither MULTICAST nor MULTI_BCAST */
- return (0);
- }
- ASSERT(add_mp != NULL && del_mp != NULL);
- ASSERT(ill->ill_arp_del_mapping_mp == NULL);
- ill->ill_arp_del_mapping_mp = del_mp;
- if (arp_add_mapping_mp != NULL) {
- /* The caller just wants the mblks allocated */
- *arp_add_mapping_mp = add_mp;
- } else {
- /* The caller wants us to send it to arp */
- putnext(ill->ill_rq, add_mp);
- }
- return (0);
+ return (err);
}
/*
* Get the resolver set up for a new IP address. (Always called as writer.)
- * Called both for IPv4 and IPv6 interfaces, though it only sets up the
- * resolver for v6 if it's an ILLF_XRESOLV interface. Honors ILLF_NOARP.
+ * Called both for IPv4 and IPv6 interfaces, though it only does some
+ * basic DAD related initialization for IPv6. Honors ILLF_NOARP.
*
* The enumerated value res_act tunes the behavior:
* * Res_act_initial: set up all the resolver structures for a new
@@ -13451,17 +11678,9 @@ ipif_arp_setup_multicast(ipif_t *ipif, mblk_t **arp_add_mapping_mp)
int
ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act)
{
- mblk_t *arp_up_mp = NULL;
- mblk_t *arp_down_mp = NULL;
- mblk_t *arp_add_mp = NULL;
- mblk_t *arp_del_mp = NULL;
- mblk_t *arp_add_mapping_mp = NULL;
- mblk_t *arp_del_mapping_mp = NULL;
- ill_t *ill = ipif->ipif_ill;
- int err = ENOMEM;
- boolean_t added_ipif = B_FALSE;
- boolean_t publish;
- boolean_t was_dup;
+ ill_t *ill = ipif->ipif_ill;
+ int err;
+ boolean_t was_dup;
ip1dbg(("ipif_resolver_up(%s:%u) flags 0x%x\n",
ill->ill_name, ipif->ipif_id, (uint_t)ipif->ipif_flags));
@@ -13490,231 +11709,55 @@ ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act)
return (0);
}
/* NDP will set the ipif_addr_ready flag when it's ready */
- if (ill->ill_isv6 && !(ill->ill_flags & ILLF_XRESOLV))
+ if (ill->ill_isv6)
return (0);
- if (ill->ill_isv6) {
- /*
- * External resolver for IPv6
- */
- ASSERT(res_act == Res_act_initial);
- publish = !IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr);
- } else {
- /*
- * IPv4 arp case. If the ARP stream has already started
- * closing, fail this request for ARP bringup. Else
- * record the fact that an ARP bringup is pending.
- */
- mutex_enter(&ill->ill_lock);
- if (ill->ill_arp_closing) {
- mutex_exit(&ill->ill_lock);
- err = EINVAL;
- goto failed;
- } else {
- if (ill->ill_ipif_up_count == 0 &&
- ill->ill_ipif_dup_count == 0 && !was_dup)
- ill->ill_arp_bringup_pending = 1;
- mutex_exit(&ill->ill_lock);
- }
- publish = (ipif->ipif_lcl_addr != INADDR_ANY);
- }
-
- if (IS_IPMP(ill) && publish) {
- /*
- * If we're here via ipif_up(), then the ipif won't be bound
- * yet -- add it to the group, which will bind it if possible.
- * (We would add it in ipif_up(), but deleting on failure
- * there is gruesome.) If we're here via ipmp_ill_bind_ipif(),
- * then the ipif has already been added to the group and we
- * just need to use the binding.
- */
- if (ipmp_ipif_bound_ill(ipif) == NULL) {
- if (ipmp_illgrp_add_ipif(ill->ill_grp, ipif) == NULL) {
- /*
- * We couldn't bind the ipif to an ill yet,
- * so we have nothing to publish.
- */
- publish = B_FALSE;
- }
- added_ipif = B_TRUE;
- }
- }
-
- /*
- * Add an entry for the local address in ARP only if it
- * is not UNNUMBERED and it is suitable for publishing.
- */
- if (!(ipif->ipif_flags & IPIF_UNNUMBERED) && publish) {
- if (res_act == Res_act_defend) {
- arp_add_mp = ipif_area_alloc(ipif, ACE_F_DEFEND);
- if (arp_add_mp == NULL)
- goto failed;
- /*
- * If we're just defending our address now, then
- * there's no need to set up ARP multicast mappings.
- * The publish command is enough.
- */
- goto done;
- }
-
- /*
- * Allocate an ARP add message and an ARP delete message (the
- * latter is saved for use when the address goes down).
- */
- if ((arp_add_mp = ipif_area_alloc(ipif, 0)) == NULL)
- goto failed;
-
- if ((arp_del_mp = ipif_ared_alloc(ipif)) == NULL)
- goto failed;
-
- if (res_act != Res_act_initial)
- goto arp_setup_multicast;
- } else {
- if (res_act != Res_act_initial)
- goto done;
- }
- /*
- * Need to bring up ARP or setup multicast mapping only
- * when the first interface is coming UP.
- */
- if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0 || was_dup)
- goto done;
-
- /*
- * Allocate an ARP down message (to be saved) and an ARP up message.
- */
- arp_down_mp = ill_arp_alloc(ill, (uchar_t *)&ip_ard_template, 0);
- if (arp_down_mp == NULL)
- goto failed;
-
- arp_up_mp = ill_arp_alloc(ill, (uchar_t *)&ip_aru_template, 0);
- if (arp_up_mp == NULL)
- goto failed;
-
- if (ipif->ipif_flags & IPIF_POINTOPOINT)
- goto done;
-
-arp_setup_multicast:
- /*
- * Setup the multicast mappings. This function initializes
- * ill_arp_del_mapping_mp also. This does not need to be done for
- * IPv6, or for the IPMP interface (since it has no link-layer).
- */
- if (!ill->ill_isv6 && !IS_IPMP(ill)) {
- err = ipif_arp_setup_multicast(ipif, &arp_add_mapping_mp);
- if (err != 0)
- goto failed;
- ASSERT(ill->ill_arp_del_mapping_mp != NULL);
- ASSERT(arp_add_mapping_mp != NULL);
- }
-done:
- if (arp_up_mp != NULL) {
- ip1dbg(("ipif_resolver_up: ARP_UP for %s:%u\n",
- ill->ill_name, ipif->ipif_id));
- putnext(ill->ill_rq, arp_up_mp);
- arp_up_mp = NULL;
- }
- if (arp_add_mp != NULL) {
- ip1dbg(("ipif_resolver_up: ARP_ADD for %s:%u\n",
- ill->ill_name, ipif->ipif_id));
- /*
- * If it's an extended ARP implementation, then we'll wait to
- * hear that DAD has finished before using the interface.
- */
- if (!ill->ill_arp_extend)
- ipif->ipif_addr_ready = 1;
- putnext(ill->ill_rq, arp_add_mp);
- arp_add_mp = NULL;
- } else {
- ipif->ipif_addr_ready = 1;
- }
- if (arp_add_mapping_mp != NULL) {
- ip1dbg(("ipif_resolver_up: MAPPING_ADD for %s:%u\n",
- ill->ill_name, ipif->ipif_id));
- putnext(ill->ill_rq, arp_add_mapping_mp);
- arp_add_mapping_mp = NULL;
- }
-
- if (res_act == Res_act_initial) {
- if (ill->ill_flags & ILLF_NOARP)
- err = ill_arp_off(ill);
- else
- err = ill_arp_on(ill);
- if (err != 0) {
- ip0dbg(("ipif_resolver_up: arp_on/off failed %d\n",
- err));
- goto failed;
- }
- }
-
- if (arp_del_mp != NULL) {
- ASSERT(ipif->ipif_arp_del_mp == NULL);
- ipif->ipif_arp_del_mp = arp_del_mp;
- }
- if (arp_down_mp != NULL) {
- ASSERT(ill->ill_arp_down_mp == NULL);
- ill->ill_arp_down_mp = arp_down_mp;
- }
- if (arp_del_mapping_mp != NULL) {
- ASSERT(ill->ill_arp_del_mapping_mp == NULL);
- ill->ill_arp_del_mapping_mp = arp_del_mapping_mp;
- }
-
- return ((ill->ill_ipif_up_count != 0 || was_dup ||
- ill->ill_ipif_dup_count != 0) ? 0 : EINPROGRESS);
-failed:
- ip1dbg(("ipif_resolver_up: FAILED\n"));
- if (added_ipif)
- ipmp_illgrp_del_ipif(ill->ill_grp, ipif);
- freemsg(arp_add_mp);
- freemsg(arp_del_mp);
- freemsg(arp_add_mapping_mp);
- freemsg(arp_up_mp);
- freemsg(arp_down_mp);
- ill->ill_arp_bringup_pending = 0;
+ err = ipif_arp_up(ipif, res_act, was_dup);
return (err);
}
/*
- * This routine restarts IPv4 duplicate address detection (DAD) when a link has
- * just gone back up.
+ * This routine restarts IPv4/IPv6 duplicate address detection (DAD)
+ * when a link has just gone back up.
*/
static void
-ipif_arp_start_dad(ipif_t *ipif)
+ipif_nce_start_dad(ipif_t *ipif)
{
+ ncec_t *ncec;
ill_t *ill = ipif->ipif_ill;
- mblk_t *arp_add_mp;
+ boolean_t isv6 = ill->ill_isv6;
- /* ACE_F_UNVERIFIED restarts DAD */
- if (ill->ill_net_type != IRE_IF_RESOLVER || ill->ill_arp_closing ||
- (ipif->ipif_flags & IPIF_UNNUMBERED) ||
- ipif->ipif_lcl_addr == INADDR_ANY ||
- (arp_add_mp = ipif_area_alloc(ipif, ACE_F_UNVERIFIED)) == NULL) {
- /*
- * If we can't contact ARP for some reason, that's not really a
- * problem. Just send out the routing socket notification that
- * DAD completion would have done, and continue.
- */
- ipif_mask_reply(ipif);
- ipif_up_notify(ipif);
- ipif->ipif_addr_ready = 1;
- return;
- }
+ if (isv6) {
+ ncec = ncec_lookup_illgrp_v6(ipif->ipif_ill,
+ &ipif->ipif_v6lcl_addr);
+ } else {
+ ipaddr_t v4addr;
- putnext(ill->ill_rq, arp_add_mp);
-}
+ if (ill->ill_net_type != IRE_IF_RESOLVER ||
+ (ipif->ipif_flags & IPIF_UNNUMBERED) ||
+ ipif->ipif_lcl_addr == INADDR_ANY) {
+ /*
+ * If we can't contact ARP for some reason,
+ * that's not really a problem. Just send
+ * out the routing socket notification that
+ * DAD completion would have done, and continue.
+ */
+ ipif_mask_reply(ipif);
+ ipif_up_notify(ipif);
+ ipif->ipif_addr_ready = 1;
+ return;
+ }
-static void
-ipif_ndp_start_dad(ipif_t *ipif)
-{
- nce_t *nce;
+ IN6_V4MAPPED_TO_IPADDR(&ipif->ipif_v6lcl_addr, v4addr);
+ ncec = ncec_lookup_illgrp_v4(ipif->ipif_ill, &v4addr);
+ }
- nce = ndp_lookup_v6(ipif->ipif_ill, B_TRUE, &ipif->ipif_v6lcl_addr,
- B_FALSE);
- if (nce == NULL)
+ if (ncec == NULL) {
+ ip1dbg(("couldn't find ncec for ipif %p leaving !ready\n",
+ (void *)ipif));
return;
-
- if (!ndp_restart_dad(nce)) {
+ }
+ if (!nce_restart_dad(ncec)) {
/*
* If we can't restart DAD for some reason, that's not really a
* problem. Just send out the routing socket notification that
@@ -13723,7 +11766,7 @@ ipif_ndp_start_dad(ipif_t *ipif)
ipif_up_notify(ipif);
ipif->ipif_addr_ready = 1;
}
- NCE_REFRELE(nce);
+ ncec_refrele(ncec);
}
/*
@@ -13749,30 +11792,21 @@ ill_restart_dad(ill_t *ill, boolean_t went_up)
* If layer two doesn't support duplicate address detection, then just
* send the routing socket message now and be done with it.
*/
- if ((ill->ill_isv6 && (ill->ill_flags & ILLF_XRESOLV)) ||
- (!ill->ill_isv6 && !ill->ill_arp_extend)) {
+ if (!ill->ill_isv6 && arp_no_defense) {
ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT);
return;
}
for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
if (went_up) {
+
if (ipif->ipif_flags & IPIF_UP) {
- if (ill->ill_isv6)
- ipif_ndp_start_dad(ipif);
- else
- ipif_arp_start_dad(ipif);
- } else if (ill->ill_isv6 &&
- (ipif->ipif_flags & IPIF_DUPLICATE)) {
+ ipif_nce_start_dad(ipif);
+ } else if (ipif->ipif_flags & IPIF_DUPLICATE) {
/*
- * For IPv4, the ARP module itself will
- * automatically start the DAD process when it
- * sees DL_NOTE_LINK_UP. We respond to the
- * AR_CN_READY at the completion of that task.
- * For IPv6, we must kick off the bring-up
- * process now.
+ * kick off the bring-up process now.
*/
- ndp_do_recovery(ipif);
+ ipif_do_recovery(ipif);
} else {
/*
* Unfortunately, the first ipif is "special"
@@ -13822,7 +11856,7 @@ ipsq_delete(ipsq_t *ipsq)
static int
ill_up_ipifs_on_ill(ill_t *ill, queue_t *q, mblk_t *mp)
{
- int err;
+ int err = 0;
ipif_t *ipif;
if (ill == NULL)
@@ -13841,9 +11875,6 @@ ill_up_ipifs_on_ill(ill_t *ill, queue_t *q, mblk_t *mp)
}
}
}
- mutex_enter(&ill->ill_lock);
- ill->ill_state_flags &= ~ILL_CHANGING;
- mutex_exit(&ill->ill_lock);
ill->ill_up_ipifs = B_FALSE;
return (0);
}
@@ -13859,6 +11890,15 @@ ill_up_ipifs(ill_t *ill, queue_t *q, mblk_t *mp)
ASSERT(IAM_WRITER_ILL(ill));
+ if (ill->ill_replumbing) {
+ ill->ill_replumbing = 0;
+ /*
+ * Send down REPLUMB_DONE notification followed by the
+ * BIND_REQ on the arp stream.
+ */
+ if (!ill->ill_isv6)
+ arp_send_replumb_conf(ill);
+ }
err = ill_up_ipifs_on_ill(ill->ill_phyint->phyint_illv4, q, mp);
if (err != 0)
return (err);
@@ -13887,16 +11927,10 @@ ill_down_ipifs(ill_t *ill, boolean_t logical)
if (ipif->ipif_flags & IPIF_UP)
ipif->ipif_was_up = B_TRUE;
- /*
- * Need to re-create net/subnet bcast ires if
- * they are dependent on ipif.
- */
- if (!ipif->ipif_isv6)
- ipif_check_bcast_ires(ipif);
if (logical) {
(void) ipif_logical_down(ipif, NULL, NULL);
ipif_non_duplicate(ipif);
- ipif_down_tail(ipif);
+ (void) ipif_down_tail(ipif);
} else {
(void) ipif_down(ipif, NULL, NULL);
}
@@ -13904,29 +11938,18 @@ ill_down_ipifs(ill_t *ill, boolean_t logical)
}
/*
- * Redo source address selection. This is called when a
- * non-NOLOCAL/DEPRECATED/ANYCAST ipif comes up.
+ * Redo source address selection. This makes IXAF_VERIFY_SOURCE take
+ * a look again at valid source addresses.
+ * This should be called each time after the set of source addresses has been
+ * changed.
*/
void
-ill_update_source_selection(ill_t *ill)
+ip_update_source_selection(ip_stack_t *ipst)
{
- ipif_t *ipif;
-
- ASSERT(IAM_WRITER_ILL(ill));
-
- /*
- * Underlying interfaces are only used for test traffic and thus
- * should always send with their (deprecated) source addresses.
- */
- if (IS_UNDER_IPMP(ill))
- return;
-
- for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
- if (ill->ill_isv6)
- ipif_recreate_interface_routes_v6(NULL, ipif);
- else
- ipif_recreate_interface_routes(NULL, ipif);
- }
+ /* We skip past SRC_GENERATION_VERIFY */
+ if (atomic_add_32_nv(&ipst->ips_src_generation, 1) ==
+ SRC_GENERATION_VERIFY)
+ atomic_add_32(&ipst->ips_src_generation, 1);
}
/*
@@ -14154,6 +12177,8 @@ ip_sioctl_groupinfo(ipif_t *dummy_ipif, sin_t *sin, queue_t *q, mblk_t *mp,
static void
ill_dl_down(ill_t *ill)
{
+ DTRACE_PROBE2(ill__downup, char *, "ill_dl_down", ill_t *, ill);
+
/*
* The ill is down; unbind but stay attached since we're still
* associated with a PPA. If we have negotiated DLPI capabilites
@@ -14167,6 +12192,13 @@ ill_dl_down(ill_t *ill)
ip1dbg(("ill_dl_down(%s)\n", ill->ill_name));
+ if (!ill->ill_replumbing) {
+ /* Free all ilms for this ill */
+ update_conn_ill(ill, ill->ill_ipst);
+ } else {
+ ill_leave_multicast(ill);
+ }
+
ill->ill_unbind_mp = NULL;
if (mp != NULL) {
ip1dbg(("ill_dl_down: %s (%u) for %s\n",
@@ -14191,23 +12223,13 @@ ill_dl_down(ill_t *ill)
ill_capability_reset(ill, B_FALSE);
ill_dlpi_send(ill, mp);
}
-
- /*
- * Toss all of our multicast memberships. We could keep them, but
- * then we'd have to do bookkeeping of any joins and leaves performed
- * by the application while the the interface is down (we can't just
- * issue them because arp cannot currently process AR_ENTRY_SQUERY's
- * on a downed interface).
- */
- ill_leave_multicast(ill);
-
mutex_enter(&ill->ill_lock);
ill->ill_dl_up = 0;
ill_nic_event_dispatch(ill, 0, NE_DOWN, NULL, 0);
mutex_exit(&ill->ill_lock);
}
-static void
+void
ill_dlpi_dispatch(ill_t *ill, mblk_t *mp)
{
union DL_primitives *dlp;
@@ -14249,6 +12271,8 @@ ill_dlpi_dispatch(ill_t *ill, mblk_t *mp)
}
mutex_exit(&ill->ill_lock);
+ DTRACE_PROBE3(ill__dlpi, char *, "ill_dlpi_dispatch",
+ char *, dl_primstr(prim), ill_t *, ill);
putnext(ill->ill_wq, mp);
/*
@@ -14301,8 +12325,9 @@ ill_dlpi_send(ill_t *ill, mblk_t *mp)
while (*mpp != NULL)
mpp = &((*mpp)->b_next);
- ip1dbg(("ill_dlpi_send: deferring request for %s\n",
- ill->ill_name));
+ ip1dbg(("ill_dlpi_send: deferring request for %s "
+ "while %s pending\n", ill->ill_name,
+ dl_primstr(ill->ill_dlpi_pending)));
*mpp = mp;
mutex_exit(&ill->ill_lock);
@@ -14437,51 +12462,237 @@ ill_dlpi_done(ill_t *ill, t_uscalar_t prim)
ill_dlpi_dispatch(ill, mp);
}
+/*
+ * Queue a (multicast) DLPI control message to be sent to the driver by
+ * later calling ill_dlpi_send_queued.
+ * We queue them while holding a lock (ill_mcast_lock) to ensure that they
+ * are sent in order i.e., prevent a DL_DISABMULTI_REQ and DL_ENABMULTI_REQ
+ * for the same group to race.
+ * We send DLPI control messages in order using ill_lock.
+ * For IPMP we should be called on the cast_ill.
+ */
void
-conn_delete_ire(conn_t *connp, caddr_t arg)
+ill_dlpi_queue(ill_t *ill, mblk_t *mp)
{
- ipif_t *ipif = (ipif_t *)arg;
- ire_t *ire;
+ mblk_t **mpp;
- /*
- * Look at the cached ires on conns which has pointers to ipifs.
- * We just call ire_refrele which clears up the reference
- * to ire. Called when a conn closes. Also called from ipif_free
- * to cleanup indirect references to the stale ipif via the cached ire.
- */
- mutex_enter(&connp->conn_lock);
- ire = connp->conn_ire_cache;
- if (ire != NULL && (ipif == NULL || ire->ire_ipif == ipif)) {
- connp->conn_ire_cache = NULL;
- mutex_exit(&connp->conn_lock);
- IRE_REFRELE_NOTR(ire);
- return;
+ ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
+
+ mutex_enter(&ill->ill_lock);
+ /* Must queue message. Tail insertion */
+ mpp = &ill->ill_dlpi_deferred;
+ while (*mpp != NULL)
+ mpp = &((*mpp)->b_next);
+
+ *mpp = mp;
+ mutex_exit(&ill->ill_lock);
+}
+
+/*
+ * Send the messages that were queued. Make sure there is only
+ * one outstanding message. ip_rput_dlpi_writer calls ill_dlpi_done()
+ * when an ACK or a NAK is received to process the next queued message.
+ * For IPMP we are called on the upper ill, but we send what is queued
+ * on the cast_ill.
+ */
+void
+ill_dlpi_send_queued(ill_t *ill)
+{
+ mblk_t *mp;
+ union DL_primitives *dlp;
+ t_uscalar_t prim;
+ ill_t *release_ill = NULL;
+
+ if (IS_IPMP(ill)) {
+ /* On the upper IPMP ill. */
+ release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp);
+ if (release_ill == NULL) {
+ /* Avoid ever sending anything down to the ipmpstub */
+ return;
+ }
+ ill = release_ill;
}
- mutex_exit(&connp->conn_lock);
+ mutex_enter(&ill->ill_lock);
+ while ((mp = ill->ill_dlpi_deferred) != NULL) {
+ if (ill->ill_dlpi_pending != DL_PRIM_INVAL) {
+ /* Can't send. Somebody else will send it */
+ mutex_exit(&ill->ill_lock);
+ goto done;
+ }
+ ill->ill_dlpi_deferred = mp->b_next;
+ mp->b_next = NULL;
+ if (!ill->ill_dl_up) {
+ /*
+ * Nobody there. All multicast addresses will be
+ * re-joined when we get the DL_BIND_ACK bringing the
+ * interface up.
+ */
+ freemsg(mp);
+ continue;
+ }
+ dlp = (union DL_primitives *)mp->b_rptr;
+ prim = dlp->dl_primitive;
+
+ if (!(ill->ill_state_flags & ILL_CONDEMNED) ||
+ (prim == DL_UNBIND_REQ)) {
+ ill->ill_dlpi_pending = prim;
+ }
+ mutex_exit(&ill->ill_lock);
+ DTRACE_PROBE3(ill__dlpi, char *, "ill_dlpi_send_queued",
+ char *, dl_primstr(prim), ill_t *, ill);
+ putnext(ill->ill_wq, mp);
+ mutex_enter(&ill->ill_lock);
+ }
+ mutex_exit(&ill->ill_lock);
+done:
+ if (release_ill != NULL)
+ ill_refrele(release_ill);
}
/*
- * Some operations (e.g., ipif_down()) conditionally delete a number
- * of IREs. Those IREs may have been previously cached in the conn structure.
- * This ipcl_walk() walker function releases all references to such IREs based
- * on the condemned flag.
+ * Queue an IP (IGMP/MLD) message to be sent by IP from
+ * ill_mcast_send_queued
+ * We queue them while holding a lock (ill_mcast_lock) to ensure that they
+ * are sent in order i.e., prevent an IGMP leave and IGMP join for the same
+ * group to race.
+ * We send them in order using ill_lock.
+ * For IPMP we are called on the upper ill, but we queue on the cast_ill.
*/
-/* ARGSUSED */
void
-conn_cleanup_stale_ire(conn_t *connp, caddr_t arg)
+ill_mcast_queue(ill_t *ill, mblk_t *mp)
{
- ire_t *ire;
+ mblk_t **mpp;
+ ill_t *release_ill = NULL;
- mutex_enter(&connp->conn_lock);
- ire = connp->conn_ire_cache;
- if (ire != NULL && (ire->ire_marks & IRE_MARK_CONDEMNED)) {
- connp->conn_ire_cache = NULL;
- mutex_exit(&connp->conn_lock);
- IRE_REFRELE_NOTR(ire);
- return;
+ ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock));
+
+ if (IS_IPMP(ill)) {
+ /* On the upper IPMP ill. */
+ release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp);
+ if (release_ill == NULL) {
+ /* Discard instead of queuing for the ipmp interface */
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
+ ip_drop_output("ipIfStatsOutDiscards - no cast_ill",
+ mp, ill);
+ freemsg(mp);
+ return;
+ }
+ ill = release_ill;
}
- mutex_exit(&connp->conn_lock);
+
+ mutex_enter(&ill->ill_lock);
+ /* Must queue message. Tail insertion */
+ mpp = &ill->ill_mcast_deferred;
+ while (*mpp != NULL)
+ mpp = &((*mpp)->b_next);
+
+ *mpp = mp;
+ mutex_exit(&ill->ill_lock);
+ if (release_ill != NULL)
+ ill_refrele(release_ill);
+}
+
+/*
+ * Send the IP packets that were queued by ill_mcast_queue.
+ * These are IGMP/MLD packets.
+ *
+ * For IPMP we are called on the upper ill, but we send what is queued
+ * on the cast_ill.
+ *
+ * Request loopback of the report if we are acting as a multicast
+ * router, so that the process-level routing demon can hear it.
+ * This will run multiple times for the same group if there are members
+ * on the same group for multiple ipif's on the same ill. The
+ * igmp_input/mld_input code will suppress this due to the loopback thus we
+ * always loopback membership report.
+ *
+ * We also need to make sure that this does not get load balanced
+ * by IPMP. We do this by passing an ill to ip_output_simple.
+ */
+void
+ill_mcast_send_queued(ill_t *ill)
+{
+ mblk_t *mp;
+ ip_xmit_attr_t ixas;
+ ill_t *release_ill = NULL;
+
+ if (IS_IPMP(ill)) {
+ /* On the upper IPMP ill. */
+ release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp);
+ if (release_ill == NULL) {
+ /*
+ * We should have no messages on the ipmp interface
+ * but no point in trying to send them.
+ */
+ return;
+ }
+ ill = release_ill;
+ }
+ bzero(&ixas, sizeof (ixas));
+ ixas.ixa_zoneid = ALL_ZONES;
+ ixas.ixa_cred = kcred;
+ ixas.ixa_cpid = NOPID;
+ ixas.ixa_tsl = NULL;
+ /*
+ * Here we set ixa_ifindex. If IPMP it will be the lower ill which
+ * makes ip_select_route pick the IRE_MULTICAST for the cast_ill.
+ * That is necessary to handle IGMP/MLD snooping switches.
+ */
+ ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex;
+ ixas.ixa_ipst = ill->ill_ipst;
+
+ mutex_enter(&ill->ill_lock);
+ while ((mp = ill->ill_mcast_deferred) != NULL) {
+ ill->ill_mcast_deferred = mp->b_next;
+ mp->b_next = NULL;
+ if (!ill->ill_dl_up) {
+ /*
+ * Nobody there. Just drop the ip packets.
+ * IGMP/MLD will resend later, if this is a replumb.
+ */
+ freemsg(mp);
+ continue;
+ }
+ mutex_enter(&ill->ill_phyint->phyint_lock);
+ if (IS_UNDER_IPMP(ill) && !ipmp_ill_is_active(ill)) {
+ /*
+ * When the ill is getting deactivated, we only want to
+ * send the DLPI messages, so drop IGMP/MLD packets.
+ * DLPI messages are handled by ill_dlpi_send_queued()
+ */
+ mutex_exit(&ill->ill_phyint->phyint_lock);
+ freemsg(mp);
+ continue;
+ }
+ mutex_exit(&ill->ill_phyint->phyint_lock);
+ mutex_exit(&ill->ill_lock);
+
+ /* Check whether we are sending IPv4 or IPv6. */
+ if (ill->ill_isv6) {
+ ip6_t *ip6h = (ip6_t *)mp->b_rptr;
+
+ ixas.ixa_multicast_ttl = ip6h->ip6_hops;
+ ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6;
+ } else {
+ ipha_t *ipha = (ipha_t *)mp->b_rptr;
+
+ ixas.ixa_multicast_ttl = ipha->ipha_ttl;
+ ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
+ ixas.ixa_flags &= ~IXAF_SET_ULP_CKSUM;
+ }
+
+ ixas.ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_SOURCE;
+ (void) ip_output_simple(mp, &ixas);
+ ixa_cleanup(&ixas);
+
+ mutex_enter(&ill->ill_lock);
+ }
+ mutex_exit(&ill->ill_lock);
+
+done:
+ if (release_ill != NULL)
+ ill_refrele(release_ill);
}
/*
@@ -14494,7 +12705,7 @@ conn_cleanup_stale_ire(conn_t *connp, caddr_t arg)
* that both Solaris and 4.3 BSD have exhibited this behaviour for a long
* time. We go thru the cleanup in order to remove these routes.
* b. The bringup of the interface could fail in ill_dl_up i.e. we get
- * DL_ERROR_ACK in response to the the DL_BIND request. The interface is
+ * DL_ERROR_ACK in response to the DL_BIND request. The interface is
* down, but we need to cleanup i.e. do ill_dl_down and
* ip_rput_dlpi_writer (DL_ERROR_ACK) -> ipif_down.
*
@@ -14504,12 +12715,11 @@ conn_cleanup_stale_ire(conn_t *connp, caddr_t arg)
*
* The following members in ipif_t track references to the ipif.
* int ipif_refcnt; Active reference count
- * uint_t ipif_ire_cnt; Number of ire's referencing this ipif
- * uint_t ipif_ilm_cnt; Number of ilms's references this ipif.
*
* The following members in ill_t track references to the ill.
* int ill_refcnt; active refcnt
* uint_t ill_ire_cnt; Number of ires referencing ill
+ * uint_t ill_ncec_cnt; Number of ncecs referencing ill
* uint_t ill_nce_cnt; Number of nces referencing ill
* uint_t ill_ilm_cnt; Number of ilms referencing ill
*
@@ -14525,21 +12735,25 @@ conn_cleanup_stale_ire(conn_t *connp, caddr_t arg)
* references to the ipif / ill. Pointers from other structures do not
* count towards this reference count.
*
- * ipif_ire_cnt/ill_ire_cnt is the number of ire's
- * associated with the ipif/ill. This is incremented whenever a new
- * ire is created referencing the ipif/ill. This is done atomically inside
- * ire_add_v[46] where the ire is actually added to the ire hash table.
- * The count is decremented in ire_inactive where the ire is destroyed.
+ * ill_ire_cnt is the number of ire's associated with the
+ * ill. This is incremented whenever a new ire is created referencing the
+ * ill. This is done atomically inside ire_add_v[46] where the ire is
+ * actually added to the ire hash table. The count is decremented in
+ * ire_inactive where the ire is destroyed.
*
- * nce's reference ill's thru nce_ill and the count of nce's associated with
- * an ill is recorded in ill_nce_cnt. This is incremented atomically in
+ * ill_ncec_cnt is the number of ncec's referencing the ill thru ncec_ill.
+ * This is incremented atomically in
* ndp_add_v4()/ndp_add_v6() where the nce is actually added to the
- * table. Similarly it is decremented in ndp_inactive() where the nce
+ * table. Similarly it is decremented in ncec_inactive() where the ncec
+ * is destroyed.
+ *
+ * ill_nce_cnt is the number of nce's referencing the ill thru nce_ill. This is
+ * incremented atomically in nce_add() where the nce is actually added to the
+ * ill_nce. Similarly it is decremented in nce_inactive() where the nce
* is destroyed.
*
- * ilm's reference to the ipif (for IPv4 ilm's) or the ill (for IPv6 ilm's)
- * is incremented in ilm_add_v6() and decremented before the ilm is freed
- * in ilm_walker_cleanup() or ilm_delete().
+ * ill_ilm_cnt is the ilm's reference to the ill. It is incremented in
+ * ilm_add() and decremented before the ilm is freed in ilm_delete().
*
* Flow of ioctls involving interface down/up
*
@@ -14555,50 +12769,22 @@ conn_cleanup_stale_ire(conn_t *connp, caddr_t arg)
* to the above. All the *tail functions are called after the refcounts have
* dropped to the appropriate values.
*
- * The mechanism to quiesce an ipif is as follows.
- *
- * Mark the ipif as IPIF_CHANGING. No more lookups will be allowed
- * on the ipif. Callers either pass a flag requesting wait or the lookup
- * functions will return NULL.
- *
- * Delete all ires referencing this ipif
+ * SIOC ioctls during the IPIF_CHANGING interval.
*
- * Any thread attempting to do an ipif_refhold on an ipif that has been
- * obtained thru a cached pointer will first make sure that
- * the ipif can be refheld using the macro IPIF_CAN_LOOKUP and only then
- * increment the refcount.
- *
- * The above guarantees that the ipif refcount will eventually come down to
- * zero and the ipif will quiesce, once all threads that currently hold a
- * reference to the ipif refrelease the ipif. The ipif is quiescent after the
- * ipif_refcount has dropped to zero and all ire's associated with this ipif
- * have also been ire_inactive'd. i.e. when ipif_{ire, ill}_cnt and
- * ipif_refcnt both drop to zero. See also: comments above IPIF_DOWN_OK()
- * in ip.h
- *
- * Lookups during the IPIF_CHANGING/ILL_CHANGING interval.
- *
- * Threads trying to lookup an ipif or ill can pass a flag requesting
- * wait and restart if the ipif / ill cannot be looked up currently.
- * For eg. bind, and route operations (Eg. route add / delete) cannot return
- * failure if the ipif is currently undergoing an exclusive operation, and
- * hence pass the flag. The mblk is then enqueued in the ipsq and the operation
- * is restarted by ipsq_exit() when the current exclusive operation completes.
- * The lookup and enqueue is atomic using the ill_lock and ipsq_lock. The
+ * Threads handling SIOC set ioctls serialize on the squeue, but this
+ * is not done for SIOC get ioctls. Since a set ioctl can cause several
+ * steps of internal changes to the state, some of which are visible in
+ * ipif_flags (such as IFF_UP being cleared and later set), and we want
+ * the set ioctl to be atomic related to the get ioctls, the SIOC get code
+ * will wait and restart ioctls if IPIF_CHANGING is set. The mblk is then
+ * enqueued in the ipsq and the operation is restarted by ipsq_exit() when
+ * the current exclusive operation completes. The IPIF_CHANGING check
+ * and enqueue is atomic using the ill_lock and ipsq_lock. The
* lookup is done holding the ill_lock. Hence the ill/ipif state flags can't
* change while the ill_lock is held. Before dropping the ill_lock we acquire
* the ipsq_lock and call ipsq_enq. This ensures that ipsq_exit can't finish
- * until we release the ipsq_lock, even though the the ill/ipif state flags
+ * until we release the ipsq_lock, even though the ill/ipif state flags
* can change after we drop the ill_lock.
- *
- * An attempt to send out a packet using an ipif that is currently
- * IPIF_CHANGING will fail. No attempt is made in this case to enqueue this
- * operation and restart it later when the exclusive condition on the ipif ends.
- * This is an example of not passing the wait flag to the lookup functions. For
- * example an attempt to refhold and use conn->conn_multicast_ipif and send
- * out a multicast packet on that ipif will fail while the ipif is
- * IPIF_CHANGING. An attempt to create an IRE_CACHE using an ipif that is
- * currently IPIF_CHANGING will also fail.
*/
int
ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp)
@@ -14613,6 +12799,9 @@ ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp)
ip1dbg(("ipif_down(%s:%u)\n", ill->ill_name, ipif->ipif_id));
+ DTRACE_PROBE3(ipif__downup, char *, "ipif_down",
+ ill_t *, ill, ipif_t *, ipif);
+
if (ipif->ipif_flags & IPIF_UP) {
mutex_enter(&ill->ill_lock);
ipif->ipif_flags &= ~IPIF_UP;
@@ -14649,15 +12838,12 @@ ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp)
}
}
- /*
- * Delete all IRE's pointing at this ipif or its source address.
- */
- if (ipif->ipif_isv6) {
- ire_walk_v6(ipif_down_delete_ire, (char *)ipif, ALL_ZONES,
- ipst);
- } else {
- ire_walk_v4(ipif_down_delete_ire, (char *)ipif, ALL_ZONES,
- ipst);
+ if (ipif_was_up) {
+ /* only delete if we'd added ire's before */
+ if (ipif->ipif_isv6)
+ ipif_delete_ires_v6(ipif);
+ else
+ ipif_delete_ires_v4(ipif);
}
if (ipif_was_up && ill->ill_ipif_up_count == 0) {
@@ -14672,30 +12858,28 @@ ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp)
}
/*
- * Cleaning up the conn_ire_cache or conns must be done only after the
- * ires have been deleted above. Otherwise a thread could end up
- * caching an ire in a conn after we have finished the cleanup of the
- * conn. The caching is done after making sure that the ire is not yet
- * condemned. Also documented in the block comment above ip_output
+ * neighbor-discovery or arp entries for this interface. The ipif
+ * has to be quiesced, so we walk all the nce's and delete those
+ * that point at the ipif->ipif_ill. At the same time, we also
+ * update IPMP so that ipifs for data addresses are unbound. We dont
+ * call ipif_arp_down to DL_UNBIND the arp stream itself here, but defer
+ * that for ipif_down_tail()
*/
- ipcl_walk(conn_cleanup_stale_ire, NULL, ipst);
- /* Also, delete the ires cached in SCTP */
- sctp_ire_cache_flush(ipif);
+ ipif_nce_down(ipif);
/*
- * Update any other ipifs which have used "our" local address as
- * a source address. This entails removing and recreating IRE_INTERFACE
- * entries for such ipifs.
+ * If this is the last ipif on the ill, we also need to remove
+ * any IREs with ire_ill set. Otherwise ipif_is_quiescent() will
+ * never succeed.
*/
- if (ipif->ipif_isv6)
- ipif_update_other_ipifs_v6(ipif);
- else
- ipif_update_other_ipifs(ipif);
+ if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0)
+ ire_walk_ill(0, 0, ill_downi, ill, ill);
/*
- * neighbor-discovery or arp entries for this interface.
+ * Walk all CONNs that can have a reference on an ire for this
+ * ipif (we actually walk all that now have stale references).
*/
- ipif_ndp_down(ipif);
+ ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ipst);
/*
* If mp is NULL the caller will wait for the appropriate refcnt.
@@ -14748,10 +12932,14 @@ ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp)
return (EINPROGRESS);
}
-void
+int
ipif_down_tail(ipif_t *ipif)
{
ill_t *ill = ipif->ipif_ill;
+ int err = 0;
+
+ DTRACE_PROBE3(ipif__downup, char *, "ipif_down_tail",
+ ill_t *, ill, ipif_t *, ipif);
/*
* Skip any loopback interface (null wq).
@@ -14766,15 +12954,14 @@ ipif_down_tail(ipif_t *ipif)
ill->ill_dl_up) {
ill_dl_down(ill);
}
- ill->ill_logical_down = 0;
+ if (!ipif->ipif_isv6)
+ err = ipif_arp_down(ipif);
- /*
- * Has to be after removing the routes in ipif_down_delete_ire.
- */
- ipif_resolver_down(ipif);
+ ill->ill_logical_down = 0;
ip_rts_ifmsg(ipif, RTSQ_DEFAULT);
ip_rts_newaddrmsg(RTM_DELETE, 0, ipif, RTSQ_DEFAULT);
+ return (err);
}
/*
@@ -14785,6 +12972,9 @@ ipif_down_tail(ipif_t *ipif)
static int
ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp)
{
+ DTRACE_PROBE3(ipif__downup, char *, "ipif_logical_down",
+ ill_t *, ipif->ipif_ill, ipif_t *, ipif);
+
/*
* The ill_logical_down flag is a transient flag. It is set here
* and is cleared once the down has completed in ipif_down_tail.
@@ -14799,152 +12989,6 @@ ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp)
}
/*
- * This is called when the SIOCSLIFUSESRC ioctl is processed in IP.
- * If the usesrc client ILL is already part of a usesrc group or not,
- * in either case a ire_stq with the matching usesrc client ILL will
- * locate the IRE's that need to be deleted. We want IREs to be created
- * with the new source address.
- */
-static void
-ipif_delete_cache_ire(ire_t *ire, char *ill_arg)
-{
- ill_t *ucill = (ill_t *)ill_arg;
-
- ASSERT(IAM_WRITER_ILL(ucill));
-
- if (ire->ire_stq == NULL)
- return;
-
- if ((ire->ire_type == IRE_CACHE) &&
- ((ill_t *)ire->ire_stq->q_ptr == ucill))
- ire_delete(ire);
-}
-
-/*
- * ire_walk routine to delete every IRE dependent on the interface
- * address that is going down. (Always called as writer.)
- * Works for both v4 and v6.
- * In addition for checking for ire_ipif matches it also checks for
- * IRE_CACHE entries which have the same source address as the
- * disappearing ipif since ipif_select_source might have picked
- * that source. Note that ipif_down/ipif_update_other_ipifs takes
- * care of any IRE_INTERFACE with the disappearing source address.
- */
-static void
-ipif_down_delete_ire(ire_t *ire, char *ipif_arg)
-{
- ipif_t *ipif = (ipif_t *)ipif_arg;
-
- ASSERT(IAM_WRITER_IPIF(ipif));
- if (ire->ire_ipif == NULL)
- return;
-
- if (ire->ire_ipif != ipif) {
- /*
- * Look for a matching source address.
- */
- if (ire->ire_type != IRE_CACHE)
- return;
- if (ipif->ipif_flags & IPIF_NOLOCAL)
- return;
-
- if (ire->ire_ipversion == IPV4_VERSION) {
- if (ire->ire_src_addr != ipif->ipif_src_addr)
- return;
- } else {
- if (!IN6_ARE_ADDR_EQUAL(&ire->ire_src_addr_v6,
- &ipif->ipif_v6lcl_addr))
- return;
- }
- ire_delete(ire);
- return;
- }
- /*
- * ire_delete() will do an ire_flush_cache which will delete
- * all ire_ipif matches
- */
- ire_delete(ire);
-}
-
-/*
- * ire_walk_ill function for deleting all IRE_CACHE entries for an ill when
- * 1) an ipif (on that ill) changes the IPIF_DEPRECATED flags, or
- * 2) when an interface is brought up or down (on that ill).
- * This ensures that the IRE_CACHE entries don't retain stale source
- * address selection results.
- */
-void
-ill_ipif_cache_delete(ire_t *ire, char *ill_arg)
-{
- ill_t *ill = (ill_t *)ill_arg;
-
- ASSERT(IAM_WRITER_ILL(ill));
- ASSERT(ire->ire_type == IRE_CACHE);
-
- /*
- * We are called for IRE_CACHEs whose ire_stq or ire_ipif matches
- * ill, but we only want to delete the IRE if ire_ipif matches.
- */
- ASSERT(ire->ire_ipif != NULL);
- if (ill == ire->ire_ipif->ipif_ill)
- ire_delete(ire);
-}
-
-/*
- * Delete all the IREs whose ire_stq's reference `ill_arg'. IPMP uses this
- * instead of ill_ipif_cache_delete() because ire_ipif->ipif_ill references
- * the IPMP ill.
- */
-void
-ill_stq_cache_delete(ire_t *ire, char *ill_arg)
-{
- ill_t *ill = (ill_t *)ill_arg;
-
- ASSERT(IAM_WRITER_ILL(ill));
- ASSERT(ire->ire_type == IRE_CACHE);
-
- /*
- * We are called for IRE_CACHEs whose ire_stq or ire_ipif matches
- * ill, but we only want to delete the IRE if ire_stq matches.
- */
- if (ire->ire_stq->q_ptr == ill_arg)
- ire_delete(ire);
-}
-
-/*
- * Delete all the IREs whose ire_stq's reference any ill in the same IPMP
- * group as `ill_arg'. Used by ipmp_ill_deactivate() to flush all IRE_CACHE
- * entries for the illgrp.
- */
-void
-ill_grp_cache_delete(ire_t *ire, char *ill_arg)
-{
- ill_t *ill = (ill_t *)ill_arg;
-
- ASSERT(IAM_WRITER_ILL(ill));
-
- if (ire->ire_type == IRE_CACHE &&
- IS_IN_SAME_ILLGRP((ill_t *)ire->ire_stq->q_ptr, ill)) {
- ire_delete(ire);
- }
-}
-
-/*
- * Delete all broadcast IREs with a source address on `ill_arg'.
- */
-static void
-ill_broadcast_delete(ire_t *ire, char *ill_arg)
-{
- ill_t *ill = (ill_t *)ill_arg;
-
- ASSERT(IAM_WRITER_ILL(ill));
- ASSERT(ire->ire_type == IRE_BROADCAST);
-
- if (ire->ire_ipif->ipif_ill == ill)
- ire_delete(ire);
-}
-
-/*
* Initiate deallocate of an IPIF. Always called as writer. Called by
* ill_delete or ip_sioctl_removeif.
*/
@@ -14959,16 +13003,6 @@ ipif_free(ipif_t *ipif)
(void) untimeout(ipif->ipif_recovery_id);
ipif->ipif_recovery_id = 0;
- /* Remove conn references */
- reset_conn_ipif(ipif);
-
- /*
- * Make sure we have valid net and subnet broadcast ire's for the
- * other ipif's which share them with this ipif.
- */
- if (!ipif->ipif_isv6)
- ipif_check_bcast_ires(ipif);
-
/*
* Take down the interface. We can be called either from ill_delete
* or from ip_sioctl_removeif.
@@ -14996,27 +13030,15 @@ ipif_free(ipif_t *ipif)
static void
ipif_free_tail(ipif_t *ipif)
{
- mblk_t *mp;
ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
/*
- * Free state for addition IRE_IF_[NO]RESOLVER ire's.
- */
- mutex_enter(&ipif->ipif_saved_ire_lock);
- mp = ipif->ipif_saved_ire_mp;
- ipif->ipif_saved_ire_mp = NULL;
- mutex_exit(&ipif->ipif_saved_ire_lock);
- freemsg(mp);
-
- /*
* Need to hold both ill_g_lock and ill_lock while
* inserting or removing an ipif from the linked list
* of ipifs hanging off the ill.
*/
rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
- ASSERT(ilm_walk_ipif(ipif) == 0);
-
#ifdef DEBUG
ipif_trace_cleanup(ipif);
#endif
@@ -15028,10 +13050,9 @@ ipif_free_tail(ipif_t *ipif)
ipif_remove(ipif);
rw_exit(&ipst->ips_ill_g_lock);
- mutex_destroy(&ipif->ipif_saved_ire_lock);
-
ASSERT(!(ipif->ipif_flags & (IPIF_UP | IPIF_DUPLICATE)));
ASSERT(ipif->ipif_recovery_id == 0);
+ ASSERT(ipif->ipif_ire_local == NULL);
/* Free the memory. */
mi_free(ipif);
@@ -15064,6 +13085,23 @@ ipif_get_name(const ipif_t *ipif, char *buf, int len)
}
/*
+ * Sets `buf' to an ill name.
+ */
+void
+ill_get_name(const ill_t *ill, char *buf, int len)
+{
+ char *name;
+ size_t name_len;
+
+ name = ill->ill_name;
+ name_len = ill->ill_name_length;
+ len -= 1;
+ buf[len] = '\0';
+ len = MIN(len, name_len);
+ bcopy(name, buf, len);
+}
+
+/*
* Find an IPIF based on the name passed in. Names can be of the form <phys>
* (e.g., le0) or <phys>:<#> (e.g., le0:1). When there is no colon, the
* implied unit id is zero. <phys> must correspond to the name of an ILL.
@@ -15071,8 +13109,7 @@ ipif_get_name(const ipif_t *ipif, char *buf, int len)
*/
static ipif_t *
ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc,
- boolean_t *exists, boolean_t isv6, zoneid_t zoneid, queue_t *q,
- mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst)
+ boolean_t *exists, boolean_t isv6, zoneid_t zoneid, ip_stack_t *ipst)
{
char *cp;
char *endp;
@@ -15081,10 +13118,6 @@ ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc,
ipif_t *ipif;
uint_t ire_type;
boolean_t did_alloc = B_FALSE;
- ipsq_t *ipsq;
-
- if (error != NULL)
- *error = 0;
/*
* If the caller wants to us to create the ipif, make sure we have a
@@ -15093,8 +13126,6 @@ ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc,
ASSERT(!do_alloc || zoneid != ALL_ZONES);
if (namelen == 0) {
- if (error != NULL)
- *error = ENXIO;
return (NULL);
}
@@ -15121,8 +13152,6 @@ ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc,
* is zero, fail.
*/
if (&cp[2] < endp && cp[1] == '0') {
- if (error != NULL)
- *error = EINVAL;
return (NULL);
}
}
@@ -15140,7 +13169,7 @@ ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc,
* ill_lookup_on_name will clear it.
*/
ill = ill_lookup_on_name(name, do_alloc, isv6,
- q, mp, func, error, &did_alloc, ipst);
+ &did_alloc, ipst);
if (cp != endp)
*cp = IPIF_SEPARATOR_CHAR;
if (ill == NULL)
@@ -15153,13 +13182,10 @@ ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc,
cp++;
if (ddi_strtol(cp, NULL, 0, &id) != 0) {
ill_refrele(ill);
- if (error != NULL)
- *error = ENXIO;
return (NULL);
}
}
- GRAB_CONN_LOCK(q);
mutex_enter(&ill->ill_lock);
/* Now see if there is an IPIF with this unit number. */
for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
@@ -15168,16 +13194,9 @@ ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc,
zoneid != ipif->ipif_zoneid &&
ipif->ipif_zoneid != ALL_ZONES) {
mutex_exit(&ill->ill_lock);
- RELEASE_CONN_LOCK(q);
ill_refrele(ill);
- if (error != NULL)
- *error = ENXIO;
return (NULL);
}
- /*
- * The block comment at the start of ipif_down
- * explains the use of the macros used below
- */
if (IPIF_CAN_LOOKUP(ipif)) {
ipif_refhold_locked(ipif);
mutex_exit(&ill->ill_lock);
@@ -15189,32 +13208,15 @@ ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc,
* ipif_ill_refrele_tail which can end up
* in trying to acquire any lock.
*/
- RELEASE_CONN_LOCK(q);
ill_refrele(ill);
return (ipif);
- } else if (IPIF_CAN_WAIT(ipif, q)) {
- ipsq = ill->ill_phyint->phyint_ipsq;
- mutex_enter(&ipsq->ipsq_lock);
- mutex_enter(&ipsq->ipsq_xop->ipx_lock);
- mutex_exit(&ill->ill_lock);
- ipsq_enq(ipsq, q, mp, func, NEW_OP, ill);
- mutex_exit(&ipsq->ipsq_xop->ipx_lock);
- mutex_exit(&ipsq->ipsq_lock);
- RELEASE_CONN_LOCK(q);
- ill_refrele(ill);
- if (error != NULL)
- *error = EINPROGRESS;
- return (NULL);
}
}
}
- RELEASE_CONN_LOCK(q);
if (!do_alloc) {
mutex_exit(&ill->ill_lock);
ill_refrele(ill);
- if (error != NULL)
- *error = ENXIO;
return (NULL);
}
@@ -15236,8 +13238,6 @@ ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc,
ipif = ipif_allocate(ill, id, ire_type, B_TRUE, B_TRUE);
if (ipif != NULL)
ipif_refhold_locked(ipif);
- else if (error != NULL)
- *error = ENOMEM;
mutex_exit(&ill->ill_lock);
ill_refrele(ill);
return (ipif);
@@ -15258,6 +13258,7 @@ ipif_mask_reply(ipif_t *ipif)
ipha_t *ipha;
mblk_t *mp;
ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
+ ip_xmit_attr_t ixas;
#define REPLY_LEN (sizeof (icmp_ipha) + sizeof (icmph_t) + IP_ADDR_LEN)
@@ -15269,6 +13270,9 @@ ipif_mask_reply(ipif_t *ipif)
/* ICMP mask reply is not for a loopback interface */
ASSERT(ipif->ipif_ill->ill_wq != NULL);
+ if (ipif->ipif_lcl_addr == INADDR_ANY)
+ return;
+
mp = allocb(REPLY_LEN, BPRI_HI);
if (mp == NULL)
return;
@@ -15278,7 +13282,7 @@ ipif_mask_reply(ipif_t *ipif)
bzero(ipha, REPLY_LEN);
*ipha = icmp_ipha;
ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl;
- ipha->ipha_src = ipif->ipif_src_addr;
+ ipha->ipha_src = ipif->ipif_lcl_addr;
ipha->ipha_dst = ipif->ipif_brd_addr;
ipha->ipha_length = htons(REPLY_LEN);
ipha->ipha_ident = 0;
@@ -15288,64 +13292,19 @@ ipif_mask_reply(ipif_t *ipif)
bcopy(&ipif->ipif_net_mask, &icmph[1], IP_ADDR_LEN);
icmph->icmph_checksum = IP_CSUM(mp, sizeof (ipha_t), 0);
- put(ipif->ipif_wq, mp);
-
+ bzero(&ixas, sizeof (ixas));
+ ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
+ ixas.ixa_flags |= IXAF_SET_SOURCE;
+ ixas.ixa_zoneid = ALL_ZONES;
+ ixas.ixa_ifindex = 0;
+ ixas.ixa_ipst = ipst;
+ ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
+ (void) ip_output_simple(mp, &ixas);
+ ixa_cleanup(&ixas);
#undef REPLY_LEN
}
/*
- * When the mtu in the ipif changes, we call this routine through ire_walk
- * to update all the relevant IREs.
- * Skip IRE_LOCAL and "loopback" IRE_BROADCAST by checking ire_stq.
- */
-static void
-ipif_mtu_change(ire_t *ire, char *ipif_arg)
-{
- ipif_t *ipif = (ipif_t *)ipif_arg;
-
- if (ire->ire_stq == NULL || ire->ire_ipif != ipif)
- return;
-
- mutex_enter(&ire->ire_lock);
- if (ire->ire_marks & IRE_MARK_PMTU) {
- /* Avoid increasing the PMTU */
- ire->ire_max_frag = MIN(ipif->ipif_mtu, ire->ire_max_frag);
- if (ire->ire_max_frag == ipif->ipif_mtu)
- ire->ire_marks &= ~IRE_MARK_PMTU;
- } else {
- ire->ire_max_frag = MIN(ipif->ipif_mtu, IP_MAXPACKET);
- }
- mutex_exit(&ire->ire_lock);
-}
-
-/*
- * When the mtu in the ill changes, we call this routine through ire_walk
- * to update all the relevant IREs.
- * Skip IRE_LOCAL and "loopback" IRE_BROADCAST by checking ire_stq.
- */
-void
-ill_mtu_change(ire_t *ire, char *ill_arg)
-{
- ill_t *ill = (ill_t *)ill_arg;
-
- if (ire->ire_stq == NULL || ire->ire_ipif->ipif_ill != ill)
- return;
-
- mutex_enter(&ire->ire_lock);
- if (ire->ire_marks & IRE_MARK_PMTU) {
- /* Avoid increasing the PMTU */
- ire->ire_max_frag = MIN(ire->ire_ipif->ipif_mtu,
- ire->ire_max_frag);
- if (ire->ire_max_frag == ire->ire_ipif->ipif_mtu) {
- ire->ire_marks &= ~IRE_MARK_PMTU;
- }
- } else {
- ire->ire_max_frag = MIN(ire->ire_ipif->ipif_mtu, IP_MAXPACKET);
- }
- mutex_exit(&ire->ire_lock);
-}
-
-/*
* Join the ipif specific multicast groups.
* Must be called after a mapping has been set up in the resolver. (Always
* called as writer.)
@@ -15355,13 +13314,15 @@ ipif_multicast_up(ipif_t *ipif)
{
int err;
ill_t *ill;
+ ilm_t *ilm;
ASSERT(IAM_WRITER_IPIF(ipif));
ill = ipif->ipif_ill;
ip1dbg(("ipif_multicast_up\n"));
- if (!(ill->ill_flags & ILLF_MULTICAST) || ipif->ipif_multicast_up)
+ if (!(ill->ill_flags & ILLF_MULTICAST) ||
+ ipif->ipif_allhosts_ilm != NULL)
return;
if (ipif->ipif_isv6) {
@@ -15380,228 +13341,147 @@ ipif_multicast_up(ipif_t *ipif)
* underlying IPMP interfaces since they should be invisible.
*/
if (!IS_UNDER_IPMP(ill)) {
- err = ip_addmulti_v6(&v6allmc, ill, ipif->ipif_zoneid,
- ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL);
- if (err != 0) {
+ ilm = ip_addmulti(&v6allmc, ill, ipif->ipif_zoneid,
+ &err);
+ if (ilm == NULL) {
+ ASSERT(err != 0);
ip0dbg(("ipif_multicast_up: "
"all_hosts_mcast failed %d\n", err));
return;
}
- ipif->ipif_joined_allhosts = 1;
+ ipif->ipif_allhosts_ilm = ilm;
}
/*
- * Enable multicast for the solicited node multicast address
+ * Enable multicast for the solicited node multicast address.
+ * If IPMP we need to put the membership on the upper ill.
*/
if (!(ipif->ipif_flags & IPIF_NOLOCAL)) {
- err = ip_addmulti_v6(&v6solmc, ill, ipif->ipif_zoneid,
- ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL);
- if (err != 0) {
+ ill_t *mcast_ill = NULL;
+ boolean_t need_refrele;
+
+ if (IS_UNDER_IPMP(ill) &&
+ (mcast_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) {
+ need_refrele = B_TRUE;
+ } else {
+ mcast_ill = ill;
+ need_refrele = B_FALSE;
+ }
+
+ ilm = ip_addmulti(&v6solmc, mcast_ill,
+ ipif->ipif_zoneid, &err);
+ if (need_refrele)
+ ill_refrele(mcast_ill);
+
+ if (ilm == NULL) {
+ ASSERT(err != 0);
ip0dbg(("ipif_multicast_up: solicited MC"
" failed %d\n", err));
- if (ipif->ipif_joined_allhosts) {
- (void) ip_delmulti_v6(&v6allmc, ill,
- ipif->ipif_zoneid, B_TRUE, B_TRUE);
- ipif->ipif_joined_allhosts = 0;
+ if ((ilm = ipif->ipif_allhosts_ilm) != NULL) {
+ ipif->ipif_allhosts_ilm = NULL;
+ (void) ip_delmulti(ilm);
}
return;
}
+ ipif->ipif_solmulti_ilm = ilm;
}
} else {
+ in6_addr_t v6group;
+
if (ipif->ipif_lcl_addr == INADDR_ANY || IS_UNDER_IPMP(ill))
return;
/* Join the all hosts multicast address */
ip1dbg(("ipif_multicast_up - addmulti\n"));
- err = ip_addmulti(htonl(INADDR_ALLHOSTS_GROUP), ipif,
- ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL);
- if (err) {
+ IN6_IPADDR_TO_V4MAPPED(htonl(INADDR_ALLHOSTS_GROUP), &v6group);
+
+ ilm = ip_addmulti(&v6group, ill, ipif->ipif_zoneid, &err);
+ if (ilm == NULL) {
+ ASSERT(err != 0);
ip0dbg(("ipif_multicast_up: failed %d\n", err));
return;
}
+ ipif->ipif_allhosts_ilm = ilm;
}
- ipif->ipif_multicast_up = 1;
}
/*
* Blow away any multicast groups that we joined in ipif_multicast_up().
- * (Explicit memberships are blown away in ill_leave_multicast() when the
- * ill is brought down.)
+ * (ilms from explicit memberships are handled in conn_update_ill.)
*/
void
ipif_multicast_down(ipif_t *ipif)
{
- int err;
-
ASSERT(IAM_WRITER_IPIF(ipif));
ip1dbg(("ipif_multicast_down\n"));
- if (!ipif->ipif_multicast_up)
- return;
-
- ip1dbg(("ipif_multicast_down - delmulti\n"));
-
- if (!ipif->ipif_isv6) {
- err = ip_delmulti(htonl(INADDR_ALLHOSTS_GROUP), ipif, B_TRUE,
- B_TRUE);
- if (err != 0)
- ip0dbg(("ipif_multicast_down: failed %d\n", err));
-
- ipif->ipif_multicast_up = 0;
- return;
- }
- /*
- * Leave the all-hosts multicast address.
- */
- if (ipif->ipif_joined_allhosts) {
- err = ip_delmulti_v6(&ipv6_all_hosts_mcast, ipif->ipif_ill,
- ipif->ipif_zoneid, B_TRUE, B_TRUE);
- if (err != 0) {
- ip0dbg(("ipif_multicast_down: all_hosts_mcast "
- "failed %d\n", err));
- }
- ipif->ipif_joined_allhosts = 0;
+ if (ipif->ipif_allhosts_ilm != NULL) {
+ (void) ip_delmulti(ipif->ipif_allhosts_ilm);
+ ipif->ipif_allhosts_ilm = NULL;
}
-
- /*
- * Disable multicast for the solicited node multicast address
- */
- if (!(ipif->ipif_flags & IPIF_NOLOCAL)) {
- in6_addr_t ipv6_multi = ipv6_solicited_node_mcast;
-
- ipv6_multi.s6_addr32[3] |=
- ipif->ipif_v6lcl_addr.s6_addr32[3];
-
- err = ip_delmulti_v6(&ipv6_multi, ipif->ipif_ill,
- ipif->ipif_zoneid, B_TRUE, B_TRUE);
- if (err != 0) {
- ip0dbg(("ipif_multicast_down: sol MC failed %d\n",
- err));
- }
+ if (ipif->ipif_solmulti_ilm != NULL) {
+ (void) ip_delmulti(ipif->ipif_solmulti_ilm);
+ ipif->ipif_solmulti_ilm = NULL;
}
-
- ipif->ipif_multicast_up = 0;
}
/*
* Used when an interface comes up to recreate any extra routes on this
* interface.
*/
-static ire_t **
-ipif_recover_ire(ipif_t *ipif)
+int
+ill_recover_saved_ire(ill_t *ill)
{
- mblk_t *mp;
- ire_t **ipif_saved_irep;
- ire_t **irep;
- ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
-
- ip1dbg(("ipif_recover_ire(%s:%u)", ipif->ipif_ill->ill_name,
- ipif->ipif_id));
+ mblk_t *mp;
+ ip_stack_t *ipst = ill->ill_ipst;
- mutex_enter(&ipif->ipif_saved_ire_lock);
- ipif_saved_irep = (ire_t **)kmem_zalloc(sizeof (ire_t *) *
- ipif->ipif_saved_ire_cnt, KM_NOSLEEP);
- if (ipif_saved_irep == NULL) {
- mutex_exit(&ipif->ipif_saved_ire_lock);
- return (NULL);
- }
+ ip1dbg(("ill_recover_saved_ire(%s)", ill->ill_name));
- irep = ipif_saved_irep;
- for (mp = ipif->ipif_saved_ire_mp; mp != NULL; mp = mp->b_cont) {
- ire_t *ire;
- queue_t *rfq;
- queue_t *stq;
+ mutex_enter(&ill->ill_saved_ire_lock);
+ for (mp = ill->ill_saved_ire_mp; mp != NULL; mp = mp->b_cont) {
+ ire_t *ire, *nire;
ifrt_t *ifrt;
- uchar_t *src_addr;
- uchar_t *gateway_addr;
- ushort_t type;
- /*
- * When the ire was initially created and then added in
- * ip_rt_add(), it was created either using ipif->ipif_net_type
- * in the case of a traditional interface route, or as one of
- * the IRE_OFFSUBNET types (with the exception of
- * IRE_HOST types ire which is created by icmp_redirect() and
- * which we don't need to save or recover). In the case where
- * ipif->ipif_net_type was IRE_LOOPBACK, ip_rt_add() will update
- * the ire_type to IRE_IF_NORESOLVER before calling ire_add()
- * to satisfy software like GateD and Sun Cluster which creates
- * routes using the the loopback interface's address as a
- * gateway.
- *
- * As ifrt->ifrt_type reflects the already updated ire_type,
- * ire_create() will be called in the same way here as
- * in ip_rt_add(), namely using ipif->ipif_net_type when
- * the route looks like a traditional interface route (where
- * ifrt->ifrt_type & IRE_INTERFACE is true) and otherwise using
- * the saved ifrt->ifrt_type. This means that in the case where
- * ipif->ipif_net_type is IRE_LOOPBACK, the ire created by
- * ire_create() will be an IRE_LOOPBACK, it will then be turned
- * into an IRE_IF_NORESOLVER and then added by ire_add().
- */
ifrt = (ifrt_t *)mp->b_rptr;
- ASSERT(ifrt->ifrt_type != IRE_CACHE);
- if (ifrt->ifrt_type & IRE_INTERFACE) {
- rfq = NULL;
- stq = (ipif->ipif_net_type == IRE_IF_RESOLVER)
- ? ipif->ipif_rq : ipif->ipif_wq;
- src_addr = (ifrt->ifrt_flags & RTF_SETSRC)
- ? (uint8_t *)&ifrt->ifrt_src_addr
- : (uint8_t *)&ipif->ipif_src_addr;
- gateway_addr = NULL;
- type = ipif->ipif_net_type;
- } else if (ifrt->ifrt_type & IRE_BROADCAST) {
- /* Recover multiroute broadcast IRE. */
- rfq = ipif->ipif_rq;
- stq = ipif->ipif_wq;
- src_addr = (ifrt->ifrt_flags & RTF_SETSRC)
- ? (uint8_t *)&ifrt->ifrt_src_addr
- : (uint8_t *)&ipif->ipif_src_addr;
- gateway_addr = (uint8_t *)&ifrt->ifrt_gateway_addr;
- type = ifrt->ifrt_type;
- } else {
- rfq = NULL;
- stq = NULL;
- src_addr = (ifrt->ifrt_flags & RTF_SETSRC)
- ? (uint8_t *)&ifrt->ifrt_src_addr : NULL;
- gateway_addr = (uint8_t *)&ifrt->ifrt_gateway_addr;
- type = ifrt->ifrt_type;
- }
-
/*
* Create a copy of the IRE with the saved address and netmask.
*/
- ip1dbg(("ipif_recover_ire: creating IRE %s (%d) for "
- "0x%x/0x%x\n",
- ip_nv_lookup(ire_nv_tbl, ifrt->ifrt_type), ifrt->ifrt_type,
- ntohl(ifrt->ifrt_addr),
- ntohl(ifrt->ifrt_mask)));
- ire = ire_create(
- (uint8_t *)&ifrt->ifrt_addr,
- (uint8_t *)&ifrt->ifrt_mask,
- src_addr,
- gateway_addr,
- &ifrt->ifrt_max_frag,
- NULL,
- rfq,
- stq,
- type,
- ipif,
- 0,
- 0,
- 0,
- ifrt->ifrt_flags,
- &ifrt->ifrt_iulp_info,
- NULL,
- NULL,
- ipst);
-
+ if (ill->ill_isv6) {
+ ire = ire_create_v6(
+ &ifrt->ifrt_v6addr,
+ &ifrt->ifrt_v6mask,
+ &ifrt->ifrt_v6gateway_addr,
+ ifrt->ifrt_type,
+ ill,
+ ifrt->ifrt_zoneid,
+ ifrt->ifrt_flags,
+ NULL,
+ ipst);
+ } else {
+ ire = ire_create(
+ (uint8_t *)&ifrt->ifrt_addr,
+ (uint8_t *)&ifrt->ifrt_mask,
+ (uint8_t *)&ifrt->ifrt_gateway_addr,
+ ifrt->ifrt_type,
+ ill,
+ ifrt->ifrt_zoneid,
+ ifrt->ifrt_flags,
+ NULL,
+ ipst);
+ }
if (ire == NULL) {
- mutex_exit(&ipif->ipif_saved_ire_lock);
- kmem_free(ipif_saved_irep,
- ipif->ipif_saved_ire_cnt * sizeof (ire_t *));
- return (NULL);
+ mutex_exit(&ill->ill_saved_ire_lock);
+ return (ENOMEM);
+ }
+
+ if (ifrt->ifrt_flags & RTF_SETSRC) {
+ if (ill->ill_isv6) {
+ ire->ire_setsrc_addr_v6 =
+ ifrt->ifrt_v6setsrc_addr;
+ } else {
+ ire->ire_setsrc_addr = ifrt->ifrt_setsrc_addr;
+ }
}
/*
@@ -15611,23 +13491,37 @@ ipif_recover_ire(ipif_t *ipif)
* set up prefixes with the RTF_REJECT flag set (for example,
* when generating aggregate routes.)
*
- * If the IRE type (as defined by ipif->ipif_net_type) is
+ * If the IRE type (as defined by ill->ill_net_type) is
* IRE_LOOPBACK, then we map the request into a
* IRE_IF_NORESOLVER.
*/
- if (ipif->ipif_net_type == IRE_LOOPBACK)
+ if (ill->ill_net_type == IRE_LOOPBACK)
ire->ire_type = IRE_IF_NORESOLVER;
+
/*
* ire held by ire_add, will be refreled' towards the
* the end of ipif_up_done
*/
- (void) ire_add(&ire, NULL, NULL, NULL, B_FALSE);
- *irep = ire;
- irep++;
- ip1dbg(("ipif_recover_ire: added ire %p\n", (void *)ire));
+ nire = ire_add(ire);
+ /*
+ * Check if it was a duplicate entry. This handles
+ * the case of two racing route adds for the same route
+ */
+ if (nire == NULL) {
+ ip1dbg(("ill_recover_saved_ire: FAILED\n"));
+ } else if (nire != ire) {
+ ip1dbg(("ill_recover_saved_ire: duplicate ire %p\n",
+ (void *)nire));
+ ire_delete(nire);
+ } else {
+ ip1dbg(("ill_recover_saved_ire: added ire %p\n",
+ (void *)nire));
+ }
+ if (nire != NULL)
+ ire_refrele(nire);
}
- mutex_exit(&ipif->ipif_saved_ire_lock);
- return (ipif_saved_irep);
+ mutex_exit(&ill->ill_saved_ire_lock);
+ return (0);
}
/*
@@ -15766,6 +13660,8 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp)
ASSERT(IAM_WRITER_IPIF(ipif));
ip1dbg(("ipif_up(%s:%u)\n", ill->ill_name, ipif->ipif_id));
+ DTRACE_PROBE3(ipif__downup, char *, "ipif_up",
+ ill_t *, ill, ipif_t *, ipif);
/* Shouldn't get here if it is already up. */
if (ipif->ipif_flags & IPIF_UP)
@@ -15786,7 +13682,7 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp)
/*
* The ipif being brought up should be quiesced. If it's not,
* something has gone amiss and we need to bail out. (If it's
- * quiesced, we know it will remain so via IPIF_CHANGING.)
+ * quiesced, we know it will remain so via IPIF_CONDEMNED.)
*/
mutex_enter(&ill->ill_lock);
if (!ipif_is_quiescent(ipif)) {
@@ -15868,8 +13764,8 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp)
/*
* If the ipif being brought up was on slot zero, then we
* first need to bring up the placeholder we stuck there. In
- * ip_rput_dlpi_writer(), ip_arp_done(), or the recursive call
- * to ipif_up() itself, if we successfully bring up the
+ * ip_rput_dlpi_writer(), arp_bringup_done(), or the recursive
+ * call to ipif_up() itself, if we successfully bring up the
* placeholder, we'll check ill_move_ipif and bring it up too.
*/
if (ipif_orig_id == 0) {
@@ -15907,13 +13803,13 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp)
}
/*
- * ipif_resolver_up may end up sending an
- * AR_INTERFACE_UP message to ARP, which would, in
- * turn send a DLPI message to the driver. ioctls are
+ * ipif_resolver_up may end up needeing to bind/attach
+ * the ARP stream, which in turn necessitates a
+ * DLPI message exchange with the driver. ioctls are
* serialized and so we cannot send more than one
* interface up message at a time. If ipif_resolver_up
- * does send an interface up message to ARP, we get
- * EINPROGRESS and we will complete in ip_arp_done.
+ * does need to wait for the DLPI handshake for the ARP stream,
+ * we get EINPROGRESS and we will complete in arp_bringup_done.
*/
ASSERT(connp != NULL || !CONN_Q(q));
@@ -15928,18 +13824,12 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp)
return (EINTR);
/*
- * Crank up the resolver. For IPv6, this cranks up the
- * external resolver if one is configured, but even if an
- * external resolver isn't configured, it must be called to
- * reset DAD state. For IPv6, if an external resolver is not
- * being used, ipif_resolver_up() will never return
- * EINPROGRESS, so we can always call ipif_ndp_up() here.
- * Note that if an external resolver is being used, there's no
- * need to call ipif_ndp_up() since it will do nothing.
+ * Crank up IPv6 neighbor discovery. Unlike ARP, this should
+ * complete when ipif_ndp_up returns.
*/
err = ipif_resolver_up(ipif, Res_act_initial);
if (err == EINPROGRESS) {
- /* We will complete it in ip_arp_done() */
+ /* We will complete it in arp_bringup_done() */
return (err);
}
@@ -15958,9 +13848,13 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp)
*/
ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
ipif->ipif_addr_ready = 1;
+ err = ill_add_ires(ill);
+ /* allocation failure? */
+ if (err != 0)
+ return (err);
}
- err = isv6 ? ipif_up_done_v6(ipif) : ipif_up_done(ipif);
+ err = (isv6 ? ipif_up_done_v6(ipif) : ipif_up_done(ipif));
if (err == 0 && ill->ill_move_ipif != NULL) {
ipif = ill->ill_move_ipif;
ill->ill_move_ipif = NULL;
@@ -15970,6 +13864,53 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp)
}
/*
+ * Add any IREs tied to the ill. For now this is just an IRE_MULTICAST.
+ * The identical set of IREs need to be removed in ill_delete_ires().
+ */
+int
+ill_add_ires(ill_t *ill)
+{
+ ire_t *ire;
+ in6_addr_t dummy6 = {(uint32_t)V6_MCAST, 0, 0, 1};
+ in_addr_t dummy4 = htonl(INADDR_ALLHOSTS_GROUP);
+
+ if (ill->ill_ire_multicast != NULL)
+ return (0);
+
+ /*
+ * provide some dummy ire_addr for creating the ire.
+ */
+ if (ill->ill_isv6) {
+ ire = ire_create_v6(&dummy6, 0, 0, IRE_MULTICAST, ill,
+ ALL_ZONES, RTF_UP, NULL, ill->ill_ipst);
+ } else {
+ ire = ire_create((uchar_t *)&dummy4, 0, 0, IRE_MULTICAST, ill,
+ ALL_ZONES, RTF_UP, NULL, ill->ill_ipst);
+ }
+ if (ire == NULL)
+ return (ENOMEM);
+
+ ill->ill_ire_multicast = ire;
+ return (0);
+}
+
+void
+ill_delete_ires(ill_t *ill)
+{
+ if (ill->ill_ire_multicast != NULL) {
+ /*
+ * BIND/ATTACH completed; Release the ref for ill_ire_multicast
+ * which was taken without any th_tracing enabled.
+ * We also mark it as condemned (note that it was never added)
+ * so that caching conn's can move off of it.
+ */
+ ire_make_condemned(ill->ill_ire_multicast);
+ ire_refrele_notr(ill->ill_ire_multicast);
+ ill->ill_ire_multicast = NULL;
+ }
+}
+
+/*
* Perform a bind for the physical device.
* When the routine returns EINPROGRESS then mp has been consumed and
* the ioctl will be acked from ip_rput_dlpi.
@@ -15978,30 +13919,26 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp)
static int
ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q)
{
- areq_t *areq;
- mblk_t *areq_mp = NULL;
mblk_t *bind_mp = NULL;
mblk_t *unbind_mp = NULL;
conn_t *connp;
boolean_t success;
- uint16_t sap_addr;
+ int err;
+
+ DTRACE_PROBE2(ill__downup, char *, "ill_dl_up", ill_t *, ill);
ip1dbg(("ill_dl_up(%s)\n", ill->ill_name));
ASSERT(IAM_WRITER_ILL(ill));
ASSERT(mp != NULL);
- /* Create a resolver cookie for ARP */
- if (!ill->ill_isv6 && ill->ill_net_type == IRE_IF_RESOLVER) {
- areq_mp = ill_arp_alloc(ill, (uchar_t *)&ip_areq_template, 0);
- if (areq_mp == NULL)
- return (ENOMEM);
+ /*
+ * Make sure we have an IRE_MULTICAST in case we immediately
+ * start receiving packets.
+ */
+ err = ill_add_ires(ill);
+ if (err != 0)
+ goto bad;
- freemsg(ill->ill_resolver_mp);
- ill->ill_resolver_mp = areq_mp;
- areq = (areq_t *)areq_mp->b_rptr;
- sap_addr = ill->ill_sap;
- bcopy(&sap_addr, areq->areq_sap, sizeof (sap_addr));
- }
bind_mp = ip_dlpi_alloc(sizeof (dl_bind_req_t) + sizeof (long),
DL_BIND_REQ);
if (bind_mp == NULL)
@@ -16067,46 +14004,39 @@ bad:
return (ENOMEM);
}
+/* Add room for tcp+ip headers */
uint_t ip_loopback_mtuplus = IP_LOOPBACK_MTU + IP_SIMPLE_HDR_LENGTH + 20;
/*
* DLPI and ARP is up.
- * Create all the IREs associated with an interface bring up multicast.
+ * Create all the IREs associated with an interface. Bring up multicast.
* Set the interface flag and finish other initialization
- * that potentially had to be differed to after DL_BIND_ACK.
+ * that potentially had to be deferred to after DL_BIND_ACK.
*/
int
ipif_up_done(ipif_t *ipif)
{
- ire_t *ire_array[20];
- ire_t **irep = ire_array;
- ire_t **irep1;
- ipaddr_t net_mask = 0;
- ipaddr_t subnet_mask, route_mask;
- ill_t *ill = ipif->ipif_ill;
- queue_t *stq;
- ipif_t *src_ipif;
- ipif_t *tmp_ipif;
- boolean_t flush_ire_cache = B_TRUE;
- int err = 0;
- ire_t **ipif_saved_irep = NULL;
- int ipif_saved_ire_cnt;
- int cnt;
- boolean_t src_ipif_held = B_FALSE;
+ ill_t *ill = ipif->ipif_ill;
+ int err = 0;
boolean_t loopback = B_FALSE;
- ip_stack_t *ipst = ill->ill_ipst;
+ boolean_t update_src_selection = B_TRUE;
+ ipif_t *tmp_ipif;
ip1dbg(("ipif_up_done(%s:%u)\n",
ipif->ipif_ill->ill_name, ipif->ipif_id));
+ DTRACE_PROBE3(ipif__downup, char *, "ipif_up_done",
+ ill_t *, ill, ipif_t *, ipif);
+
/* Check if this is a loopback interface */
if (ipif->ipif_ill->ill_wq == NULL)
loopback = B_TRUE;
ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock));
+
/*
* If all other interfaces for this ill are down or DEPRECATED,
- * or otherwise unsuitable for source address selection, remove
- * any IRE_CACHE entries for this ill to make sure source
+ * or otherwise unsuitable for source address selection,
+ * reset the src generation numbers to make sure source
* address selection gets to take this new ipif into account.
* No need to hold ill_lock while traversing the ipif list since
* we are writer
@@ -16119,31 +14049,16 @@ ipif_up_done(ipif_t *ipif)
(tmp_ipif == ipif))
continue;
/* first useable pre-existing interface */
- flush_ire_cache = B_FALSE;
+ update_src_selection = B_FALSE;
break;
}
- if (flush_ire_cache)
- ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE,
- IRE_CACHE, ill_ipif_cache_delete, (char *)ill, ill);
+ if (update_src_selection)
+ ip_update_source_selection(ill->ill_ipst);
- /*
- * Figure out which way the send-to queue should go. Only
- * IRE_IF_RESOLVER or IRE_IF_NORESOLVER or IRE_LOOPBACK
- * should show up here.
- */
- switch (ill->ill_net_type) {
- case IRE_IF_RESOLVER:
- stq = ill->ill_rq;
- break;
- case IRE_IF_NORESOLVER:
- case IRE_LOOPBACK:
- stq = ill->ill_wq;
- break;
- default:
- return (EINVAL);
- }
+ if (IS_LOOPBACK(ill) || ill->ill_net_type == IRE_IF_NORESOLVER) {
+ nce_t *loop_nce = NULL;
+ uint16_t flags = (NCE_F_MYADDR | NCE_F_AUTHORITY | NCE_F_NONUD);
- if (IS_LOOPBACK(ill)) {
/*
* lo0:1 and subsequent ipifs were marked IRE_LOCAL in
* ipif_lookup_on_name(), but in the case of zones we can have
@@ -16155,29 +14070,130 @@ ipif_up_done(ipif_t *ipif)
ipif->ipif_ire_type = IRE_LOOPBACK;
else
ipif->ipif_ire_type = IRE_LOCAL;
+ if (ill->ill_net_type != IRE_LOOPBACK)
+ flags |= NCE_F_PUBLISH;
+
+ /* add unicast nce for the local addr */
+ err = nce_lookup_then_add_v4(ill, NULL,
+ ill->ill_phys_addr_length, &ipif->ipif_lcl_addr, flags,
+ ND_REACHABLE, &loop_nce);
+ /* A shared-IP zone sees EEXIST for lo0:N */
+ if (err == 0 || err == EEXIST) {
+ ipif->ipif_added_nce = 1;
+ loop_nce->nce_ipif_cnt++;
+ nce_refrele(loop_nce);
+ err = 0;
+ } else {
+ ASSERT(loop_nce == NULL);
+ return (err);
+ }
}
- if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST) ||
- ((ipif->ipif_flags & IPIF_DEPRECATED) &&
- !(ipif->ipif_flags & IPIF_NOFAILOVER))) {
+ /* Create all the IREs associated with this interface */
+ err = ipif_add_ires_v4(ipif, loopback);
+ if (err != 0) {
/*
- * Can't use our source address. Select a different
- * source address for the IRE_INTERFACE and IRE_LOCAL
+ * see comments about return value from
+ * ip_addr_availability_check() in ipif_add_ires_v4().
*/
- src_ipif = ipif_select_source(ipif->ipif_ill,
- ipif->ipif_subnet, ipif->ipif_zoneid);
- if (src_ipif == NULL)
- src_ipif = ipif; /* Last resort */
- else
- src_ipif_held = B_TRUE;
- } else {
- src_ipif = ipif;
+ if (err != EADDRINUSE) {
+ (void) ipif_arp_down(ipif);
+ } else {
+ /*
+ * Make IPMP aware of the deleted ipif so that
+ * the needed ipmp cleanup (e.g., of ipif_bound_ill)
+ * can be completed. Note that we do not want to
+ * destroy the nce that was created on the ipmp_ill
+ * for the active copy of the duplicate address in
+ * use.
+ */
+ if (IS_IPMP(ill))
+ ipmp_illgrp_del_ipif(ill->ill_grp, ipif);
+ err = EADDRNOTAVAIL;
+ }
+ return (err);
}
- /* Create all the IREs associated with this interface */
+ if (ill->ill_ipif_up_count == 1 && !loopback) {
+ /* Recover any additional IREs entries for this ill */
+ (void) ill_recover_saved_ire(ill);
+ }
+
+ if (ill->ill_need_recover_multicast) {
+ /*
+ * Need to recover all multicast memberships in the driver.
+ * This had to be deferred until we had attached. The same
+ * code exists in ipif_up_done_v6() to recover IPv6
+ * memberships.
+ *
+ * Note that it would be preferable to unconditionally do the
+ * ill_recover_multicast() in ill_dl_up(), but we cannot do
+ * that since ill_join_allmulti() depends on ill_dl_up being
+ * set, and it is not set until we receive a DL_BIND_ACK after
+ * having called ill_dl_up().
+ */
+ ill_recover_multicast(ill);
+ }
+
+ if (ill->ill_ipif_up_count == 1) {
+ /*
+ * Since the interface is now up, it may now be active.
+ */
+ if (IS_UNDER_IPMP(ill))
+ ipmp_ill_refresh_active(ill);
+
+ /*
+ * If this is an IPMP interface, we may now be able to
+ * establish ARP entries.
+ */
+ if (IS_IPMP(ill))
+ ipmp_illgrp_refresh_arpent(ill->ill_grp);
+ }
+
+ /* Join the allhosts multicast address */
+ ipif_multicast_up(ipif);
+
+ if (!loopback && !update_src_selection &&
+ !(ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED)))
+ ip_update_source_selection(ill->ill_ipst);
+
+ if (!loopback && ipif->ipif_addr_ready) {
+ /* Broadcast an address mask reply. */
+ ipif_mask_reply(ipif);
+ }
+ /* Perhaps ilgs should use this ill */
+ update_conn_ill(NULL, ill->ill_ipst);
+
+ /*
+ * This had to be deferred until we had bound. Tell routing sockets and
+ * others that this interface is up if it looks like the address has
+ * been validated. Otherwise, if it isn't ready yet, wait for
+ * duplicate address detection to do its thing.
+ */
+ if (ipif->ipif_addr_ready)
+ ipif_up_notify(ipif);
+ return (0);
+}
+
+/*
+ * Add the IREs associated with the ipif.
+ * Those MUST be explicitly removed in ipif_delete_ires_v4.
+ */
+static int
+ipif_add_ires_v4(ipif_t *ipif, boolean_t loopback)
+{
+ ill_t *ill = ipif->ipif_ill;
+ ip_stack_t *ipst = ill->ill_ipst;
+ ire_t *ire_array[20];
+ ire_t **irep = ire_array;
+ ire_t **irep1;
+ ipaddr_t net_mask = 0;
+ ipaddr_t subnet_mask, route_mask;
+ int err;
+ ire_t *ire_local = NULL; /* LOCAL or LOOPBACK */
+
if ((ipif->ipif_lcl_addr != INADDR_ANY) &&
!(ipif->ipif_flags & IPIF_NOLOCAL)) {
-
/*
* If we're on a labeled system then make sure that zone-
* private addresses have proper remote host database entries.
@@ -16191,38 +14207,34 @@ ipif_up_done(ipif_t *ipif)
err = ip_srcid_insert(&ipif->ipif_v6lcl_addr,
ipif->ipif_zoneid, ipst);
if (err != 0) {
- ip0dbg(("ipif_up_done: srcid_insert %d\n", err));
+ ip0dbg(("ipif_add_ires: srcid_insert %d\n", err));
return (err);
}
/* If the interface address is set, create the local IRE. */
- ip1dbg(("ipif_up_done: 0x%p creating IRE 0x%x for 0x%x\n",
- (void *)ipif,
- ipif->ipif_ire_type,
- ntohl(ipif->ipif_lcl_addr)));
- *irep++ = ire_create(
+ ire_local = ire_create(
(uchar_t *)&ipif->ipif_lcl_addr, /* dest address */
(uchar_t *)&ip_g_all_ones, /* mask */
- (uchar_t *)&src_ipif->ipif_src_addr, /* source address */
NULL, /* no gateway */
- &ip_loopback_mtuplus, /* max frag size */
- NULL,
- ipif->ipif_rq, /* recv-from queue */
- NULL, /* no send-to queue */
ipif->ipif_ire_type, /* LOCAL or LOOPBACK */
- ipif,
- 0,
- 0,
- 0,
- (ipif->ipif_flags & IPIF_PRIVATE) ?
- RTF_PRIVATE : 0,
- &ire_uinfo_null,
- NULL,
+ ipif->ipif_ill,
+ ipif->ipif_zoneid,
+ ((ipif->ipif_flags & IPIF_PRIVATE) ?
+ RTF_PRIVATE : 0) | RTF_KERNEL,
NULL,
ipst);
+ ip1dbg(("ipif_add_ires: 0x%p creating IRE %p type 0x%x"
+ " for 0x%x\n", (void *)ipif, (void *)ire_local,
+ ipif->ipif_ire_type,
+ ntohl(ipif->ipif_lcl_addr)));
+ if (ire_local == NULL) {
+ ip1dbg(("ipif_up_done: NULL ire_local\n"));
+ err = ENOMEM;
+ goto bad;
+ }
} else {
ip1dbg((
- "ipif_up_done: not creating IRE %d for 0x%x: flags 0x%x\n",
+ "ipif_add_ires: not creating IRE %d for 0x%x: flags 0x%x\n",
ipif->ipif_ire_type,
ntohl(ipif->ipif_lcl_addr),
(uint_t)ipif->ipif_flags));
@@ -16249,7 +14261,7 @@ ipif_up_done(ipif_t *ipif)
}
/* Set up the IRE_IF_RESOLVER or IRE_IF_NORESOLVER, as appropriate. */
- if (stq != NULL && !(ipif->ipif_flags & IPIF_NOXMIT) &&
+ if (!loopback && !(ipif->ipif_flags & IPIF_NOXMIT) &&
ipif->ipif_subnet != INADDR_ANY) {
/* ipif_subnet is ipif_pp_dst_addr for pt-pt */
@@ -16259,7 +14271,7 @@ ipif_up_done(ipif_t *ipif)
route_mask = subnet_mask;
}
- ip1dbg(("ipif_up_done: ipif 0x%p ill 0x%p "
+ ip1dbg(("ipif_add_ires: ipif 0x%p ill 0x%p "
"creating if IRE ill_net_type 0x%x for 0x%x\n",
(void *)ipif, (void *)ill,
ill->ill_net_type,
@@ -16267,20 +14279,12 @@ ipif_up_done(ipif_t *ipif)
*irep++ = ire_create(
(uchar_t *)&ipif->ipif_subnet, /* dest address */
(uchar_t *)&route_mask, /* mask */
- (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */
- NULL, /* no gateway */
- &ipif->ipif_mtu, /* max frag */
- NULL,
- NULL, /* no recv queue */
- stq, /* send-to queue */
+ (uchar_t *)&ipif->ipif_lcl_addr, /* gateway */
ill->ill_net_type, /* IF_[NO]RESOLVER */
- ipif,
- 0,
- 0,
- 0,
- (ipif->ipif_flags & IPIF_PRIVATE) ? RTF_PRIVATE: 0,
- &ire_uinfo_null,
- NULL,
+ ill,
+ ipif->ipif_zoneid,
+ ((ipif->ipif_flags & IPIF_PRIVATE) ?
+ RTF_PRIVATE: 0) | RTF_KERNEL,
NULL,
ipst);
}
@@ -16288,11 +14292,10 @@ ipif_up_done(ipif_t *ipif)
/*
* Create any necessary broadcast IREs.
*/
- if (ipif->ipif_flags & IPIF_BROADCAST)
+ if ((ipif->ipif_flags & IPIF_BROADCAST) &&
+ !(ipif->ipif_flags & IPIF_NOXMIT))
irep = ipif_create_bcast_ires(ipif, irep);
- ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock));
-
/* If an earlier ire_create failed, get out now */
for (irep1 = irep; irep1 > ire_array; ) {
irep1--;
@@ -16324,14 +14327,9 @@ ipif_up_done(ipif_t *ipif)
* ipif. So we don't want to delete it (otherwise the other ipif
* would be unable to send packets).
* ip_addr_availability_check() identifies this case for us and
- * returns EADDRINUSE; we need to turn it into EADDRNOTAVAIL
+ * returns EADDRINUSE; Caller should turn it into EADDRNOTAVAIL
* which is the expected error code.
*/
- if (err == EADDRINUSE) {
- freemsg(ipif->ipif_arp_del_mp);
- ipif->ipif_arp_del_mp = NULL;
- err = EADDRNOTAVAIL;
- }
ill->ill_ipif_up_count--;
ipif->ipif_flags &= ~IPIF_UP;
goto bad;
@@ -16341,19 +14339,33 @@ ipif_up_done(ipif_t *ipif)
* Add in all newly created IREs. ire_create_bcast() has
* already checked for duplicates of the IRE_BROADCAST type.
*/
+ if (ire_local != NULL) {
+ ire_local = ire_add(ire_local);
+#ifdef DEBUG
+ if (ire_local != NULL) {
+ ire_refhold_notr(ire_local);
+ ire_refrele(ire_local);
+ }
+#endif
+ }
+
+ rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
+ if (ire_local != NULL)
+ ipif->ipif_ire_local = ire_local;
+ rw_exit(&ipst->ips_ill_g_lock);
+ ire_local = NULL;
+
for (irep1 = irep; irep1 > ire_array; ) {
irep1--;
- ASSERT(!MUTEX_HELD(&((*irep1)->ire_ipif->ipif_ill->ill_lock)));
- /*
- * refheld by ire_add. refele towards the end of the func
- */
- (void) ire_add(irep1, NULL, NULL, NULL, B_FALSE);
+ ASSERT(!MUTEX_HELD(&((*irep1)->ire_ill->ill_lock)));
+ /* refheld by ire_add. */
+ *irep1 = ire_add(*irep1);
+ if (*irep1 != NULL) {
+ ire_refrele(*irep1);
+ *irep1 = NULL;
+ }
}
- /* Recover any additional IRE_IF_[NO]RESOLVER entries for this ipif */
- ipif_saved_ire_cnt = ipif->ipif_saved_ire_cnt;
- ipif_saved_irep = ipif_recover_ire(ipif);
-
if (!loopback) {
/*
* If the broadcast address has been set, make sure it makes
@@ -16364,9 +14376,9 @@ ipif_up_done(ipif_t *ipif)
(ipif->ipif_flags & IPIF_BROADCAST)) {
ire_t *ire;
- ire = ire_ctable_lookup(ipif->ipif_brd_addr, 0,
- IRE_BROADCAST, ipif, ALL_ZONES,
- NULL, (MATCH_IRE_TYPE | MATCH_IRE_ILL), ipst);
+ ire = ire_ftable_lookup_v4(ipif->ipif_brd_addr, 0, 0,
+ IRE_BROADCAST, ipif->ipif_ill, ALL_ZONES, NULL,
+ (MATCH_IRE_TYPE | MATCH_IRE_ILL), 0, ipst, NULL);
if (ire == NULL) {
/*
@@ -16383,176 +14395,113 @@ ipif_up_done(ipif_t *ipif)
}
}
-
- if (ill->ill_need_recover_multicast) {
- /*
- * Need to recover all multicast memberships in the driver.
- * This had to be deferred until we had attached. The same
- * code exists in ipif_up_done_v6() to recover IPv6
- * memberships.
- *
- * Note that it would be preferable to unconditionally do the
- * ill_recover_multicast() in ill_dl_up(), but we cannot do
- * that since ill_join_allmulti() depends on ill_dl_up being
- * set, and it is not set until we receive a DL_BIND_ACK after
- * having called ill_dl_up().
- */
- ill_recover_multicast(ill);
- }
-
- if (ill->ill_ipif_up_count == 1) {
- /*
- * Since the interface is now up, it may now be active.
- */
- if (IS_UNDER_IPMP(ill))
- ipmp_ill_refresh_active(ill);
-
- /*
- * If this is an IPMP interface, we may now be able to
- * establish ARP entries.
- */
- if (IS_IPMP(ill))
- ipmp_illgrp_refresh_arpent(ill->ill_grp);
- }
-
- /* Join the allhosts multicast address */
- ipif_multicast_up(ipif);
-
- /*
- * See if anybody else would benefit from our new ipif.
- */
- if (!loopback &&
- !(ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED))) {
- ill_update_source_selection(ill);
- }
-
- for (irep1 = irep; irep1 > ire_array; ) {
- irep1--;
- if (*irep1 != NULL) {
- /* was held in ire_add */
- ire_refrele(*irep1);
- }
- }
-
- cnt = ipif_saved_ire_cnt;
- for (irep1 = ipif_saved_irep; cnt > 0; irep1++, cnt--) {
- if (*irep1 != NULL) {
- /* was held in ire_add */
- ire_refrele(*irep1);
- }
- }
-
- if (!loopback && ipif->ipif_addr_ready) {
- /* Broadcast an address mask reply. */
- ipif_mask_reply(ipif);
- }
- if (ipif_saved_irep != NULL) {
- kmem_free(ipif_saved_irep,
- ipif_saved_ire_cnt * sizeof (ire_t *));
- }
- if (src_ipif_held)
- ipif_refrele(src_ipif);
-
- /*
- * This had to be deferred until we had bound. Tell routing sockets and
- * others that this interface is up if it looks like the address has
- * been validated. Otherwise, if it isn't ready yet, wait for
- * duplicate address detection to do its thing.
- */
- if (ipif->ipif_addr_ready)
- ipif_up_notify(ipif);
return (0);
bad:
- ip1dbg(("ipif_up_done: FAILED \n"));
-
+ ip1dbg(("ipif_add_ires: FAILED \n"));
+ if (ire_local != NULL)
+ ire_delete(ire_local);
while (irep > ire_array) {
irep--;
- if (*irep != NULL)
+ if (*irep != NULL) {
ire_delete(*irep);
+ }
}
(void) ip_srcid_remove(&ipif->ipif_v6lcl_addr, ipif->ipif_zoneid, ipst);
- if (ipif_saved_irep != NULL) {
- kmem_free(ipif_saved_irep,
- ipif_saved_ire_cnt * sizeof (ire_t *));
- }
- if (src_ipif_held)
- ipif_refrele(src_ipif);
-
- ipif_resolver_down(ipif);
return (err);
}
-/*
- * Turn off the ARP with the ILLF_NOARP flag.
- */
-static int
-ill_arp_off(ill_t *ill)
+/* Remove all the IREs created by ipif_add_ires_v4 */
+void
+ipif_delete_ires_v4(ipif_t *ipif)
{
- mblk_t *arp_off_mp = NULL;
- mblk_t *arp_on_mp = NULL;
+ ill_t *ill = ipif->ipif_ill;
+ ip_stack_t *ipst = ill->ill_ipst;
+ ipaddr_t net_mask = 0;
+ ipaddr_t subnet_mask, route_mask;
+ int match_args;
+ ire_t *ire;
+ boolean_t loopback;
- ip1dbg(("ill_arp_off(%s)\n", ill->ill_name));
+ /* Check if this is a loopback interface */
+ loopback = (ipif->ipif_ill->ill_wq == NULL);
- ASSERT(IAM_WRITER_ILL(ill));
- ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
+ match_args = MATCH_IRE_TYPE | MATCH_IRE_ILL | MATCH_IRE_MASK |
+ MATCH_IRE_ZONEONLY;
- /*
- * If the on message is still around we've already done
- * an arp_off without doing an arp_on thus there is no
- * work needed.
- */
- if (ill->ill_arp_on_mp != NULL)
- return (0);
+ rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
+ if ((ire = ipif->ipif_ire_local) != NULL) {
+ ipif->ipif_ire_local = NULL;
+ rw_exit(&ipst->ips_ill_g_lock);
+ /*
+	 * Move count to ipif so we don't lose the count due to
+ * a down/up dance.
+ */
+ atomic_add_32(&ipif->ipif_ib_pkt_count, ire->ire_ib_pkt_count);
- /*
- * Allocate an ARP on message (to be saved) and an ARP off message
- */
- arp_off_mp = ill_arp_alloc(ill, (uchar_t *)&ip_aroff_template, 0);
- if (!arp_off_mp)
- return (ENOMEM);
+ ire_delete(ire);
+ ire_refrele_notr(ire);
+ } else {
+ rw_exit(&ipst->ips_ill_g_lock);
+ }
+
+ match_args |= MATCH_IRE_GW;
- arp_on_mp = ill_arp_alloc(ill, (uchar_t *)&ip_aron_template, 0);
- if (!arp_on_mp)
- goto failed;
+ if ((ipif->ipif_lcl_addr != INADDR_ANY) &&
+ !(ipif->ipif_flags & IPIF_NOLOCAL)) {
+ net_mask = ip_net_mask(ipif->ipif_lcl_addr);
+ } else {
+ net_mask = htonl(IN_CLASSA_NET); /* fallback */
+ }
- ASSERT(ill->ill_arp_on_mp == NULL);
- ill->ill_arp_on_mp = arp_on_mp;
+ subnet_mask = ipif->ipif_net_mask;
- /* Send an AR_INTERFACE_OFF request */
- putnext(ill->ill_rq, arp_off_mp);
- return (0);
-failed:
+ /*
+ * If mask was not specified, use natural netmask of
+ * interface address. Also, store this mask back into the
+ * ipif struct.
+ */
+ if (subnet_mask == 0)
+ subnet_mask = net_mask;
- if (arp_off_mp)
- freemsg(arp_off_mp);
- return (ENOMEM);
-}
+ /* Delete the IRE_IF_RESOLVER or IRE_IF_NORESOLVER, as appropriate. */
+ if (IS_UNDER_IPMP(ill))
+ match_args |= MATCH_IRE_TESTHIDDEN;
-/*
- * Turn on ARP by turning off the ILLF_NOARP flag.
- */
-static int
-ill_arp_on(ill_t *ill)
-{
- mblk_t *mp;
+ if (!loopback && !(ipif->ipif_flags & IPIF_NOXMIT) &&
+ ipif->ipif_subnet != INADDR_ANY) {
+ /* ipif_subnet is ipif_pp_dst_addr for pt-pt */
- ip1dbg(("ipif_arp_on(%s)\n", ill->ill_name));
+ if (ipif->ipif_flags & IPIF_POINTOPOINT) {
+ route_mask = IP_HOST_MASK;
+ } else {
+ route_mask = subnet_mask;
+ }
- ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
+ ire = ire_ftable_lookup_v4(
+ ipif->ipif_subnet, /* dest address */
+ route_mask, /* mask */
+ ipif->ipif_lcl_addr, /* gateway */
+ ill->ill_net_type, /* IF_[NO]RESOLVER */
+ ill,
+ ipif->ipif_zoneid,
+ NULL,
+ match_args,
+ 0,
+ ipst,
+ NULL);
+ ASSERT(ire != NULL);
+ ire_delete(ire);
+ ire_refrele(ire);
+ }
- ASSERT(IAM_WRITER_ILL(ill));
/*
- * Send an AR_INTERFACE_ON request if we have already done
- * an arp_off (which allocated the message).
+ * Create any necessary broadcast IREs.
*/
- if (ill->ill_arp_on_mp != NULL) {
- mp = ill->ill_arp_on_mp;
- ill->ill_arp_on_mp = NULL;
- putnext(ill->ill_rq, mp);
- }
- return (0);
+ if ((ipif->ipif_flags & IPIF_BROADCAST) &&
+ !(ipif->ipif_flags & IPIF_NOXMIT))
+ ipif_delete_bcast_ires(ipif);
}
/*
@@ -16561,49 +14510,72 @@ ill_arp_on(ill_t *ill)
* this selection is done regardless of the destination.
*/
boolean_t
-ipif_usesrc_avail(ill_t *ill, zoneid_t zoneid)
+ipif_zone_avail(uint_t ifindex, boolean_t isv6, zoneid_t zoneid,
+ ip_stack_t *ipst)
{
- uint_t ifindex;
- ipif_t *ipif = NULL;
- ill_t *uill;
- boolean_t isv6;
- ip_stack_t *ipst = ill->ill_ipst;
+ ipif_t *ipif = NULL;
+ ill_t *uill;
- ASSERT(ill != NULL);
+ ASSERT(ifindex != 0);
- isv6 = ill->ill_isv6;
- ifindex = ill->ill_usesrc_ifindex;
- if (ifindex != 0) {
- uill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL,
- NULL, ipst);
- if (uill == NULL)
- return (B_FALSE);
- mutex_enter(&uill->ill_lock);
- for (ipif = uill->ill_ipif; ipif != NULL;
- ipif = ipif->ipif_next) {
- if (!IPIF_CAN_LOOKUP(ipif))
- continue;
- if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST))
- continue;
- if (!(ipif->ipif_flags & IPIF_UP))
- continue;
- if (ipif->ipif_zoneid != zoneid)
- continue;
- if ((isv6 &&
- IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) ||
- (ipif->ipif_lcl_addr == INADDR_ANY))
- continue;
- mutex_exit(&uill->ill_lock);
- ill_refrele(uill);
- return (B_TRUE);
- }
+ uill = ill_lookup_on_ifindex(ifindex, isv6, ipst);
+ if (uill == NULL)
+ return (B_FALSE);
+
+ mutex_enter(&uill->ill_lock);
+ for (ipif = uill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
+ if (IPIF_IS_CONDEMNED(ipif))
+ continue;
+ if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST))
+ continue;
+ if (!(ipif->ipif_flags & IPIF_UP))
+ continue;
+ if (ipif->ipif_zoneid != zoneid)
+ continue;
+ if (isv6 ? IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) :
+ ipif->ipif_lcl_addr == INADDR_ANY)
+ continue;
mutex_exit(&uill->ill_lock);
ill_refrele(uill);
+ return (B_TRUE);
}
+ mutex_exit(&uill->ill_lock);
+ ill_refrele(uill);
return (B_FALSE);
}
/*
+ * Find an ipif with a good local address on the ill+zoneid.
+ */
+ipif_t *
+ipif_good_addr(ill_t *ill, zoneid_t zoneid)
+{
+ ipif_t *ipif;
+
+ mutex_enter(&ill->ill_lock);
+ for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
+ if (IPIF_IS_CONDEMNED(ipif))
+ continue;
+ if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST))
+ continue;
+ if (!(ipif->ipif_flags & IPIF_UP))
+ continue;
+ if (ipif->ipif_zoneid != zoneid &&
+ ipif->ipif_zoneid != ALL_ZONES && zoneid != ALL_ZONES)
+ continue;
+ if (ill->ill_isv6 ?
+ IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) :
+ ipif->ipif_lcl_addr == INADDR_ANY)
+ continue;
+ ipif_refhold_locked(ipif);
+ mutex_exit(&ill->ill_lock);
+ return (ipif);
+ }
+ mutex_exit(&ill->ill_lock);
+ return (NULL);
+}
+
+/*
* IP source address type, sorted from worst to best. For a given type,
* always prefer IP addresses on the same subnet. All-zones addresses are
* suboptimal because they pose problems with unlabeled destinations.
@@ -16615,7 +14587,8 @@ typedef enum {
IPIF_DIFFNET_ALLZONES, /* allzones and different subnet */
IPIF_SAMENET_ALLZONES, /* allzones and same subnet */
IPIF_DIFFNET, /* normal and different subnet */
- IPIF_SAMENET /* normal and same subnet */
+ IPIF_SAMENET, /* normal and same subnet */
+ IPIF_LOCALADDR /* local loopback */
} ipif_type_t;
/*
@@ -16629,7 +14602,8 @@ typedef enum {
* This only occurs when there is no valid source address for the ill.
*/
ipif_t *
-ipif_select_source(ill_t *ill, ipaddr_t dst, zoneid_t zoneid)
+ipif_select_source_v4(ill_t *ill, ipaddr_t dst, zoneid_t zoneid,
+ boolean_t allow_usesrc, boolean_t *notreadyp)
{
ill_t *usill = NULL;
ill_t *ipmp_ill = NULL;
@@ -16639,9 +14613,9 @@ ipif_select_source(ill_t *ill, ipaddr_t dst, zoneid_t zoneid)
ip_stack_t *ipst = ill->ill_ipst;
boolean_t samenet;
- if (ill->ill_usesrc_ifindex != 0) {
+ if (ill->ill_usesrc_ifindex != 0 && allow_usesrc) {
usill = ill_lookup_on_ifindex(ill->ill_usesrc_ifindex,
- B_FALSE, NULL, NULL, NULL, NULL, ipst);
+ B_FALSE, ipst);
if (usill != NULL)
ill = usill; /* Select source from usesrc ILL */
else
@@ -16705,14 +14679,22 @@ retry:
if ((next_ipif = ipif->ipif_next) == NULL)
next_ipif = ill->ill_ipif;
- if (!IPIF_CAN_LOOKUP(ipif))
+ if (IPIF_IS_CONDEMNED(ipif))
continue;
/* Always skip NOLOCAL and ANYCAST interfaces */
if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST))
continue;
- if (!(ipif->ipif_flags & IPIF_UP) || !ipif->ipif_addr_ready)
+ if (!(ipif->ipif_flags & IPIF_UP))
continue;
- if (ipif->ipif_zoneid != zoneid &&
+
+ if (!ipif->ipif_addr_ready) {
+ if (notreadyp != NULL)
+ *notreadyp = B_TRUE;
+ continue;
+ }
+
+ if (zoneid != ALL_ZONES &&
+ ipif->ipif_zoneid != zoneid &&
ipif->ipif_zoneid != ALL_ZONES)
continue;
@@ -16749,7 +14731,9 @@ retry:
samenet = ((ipif->ipif_net_mask & dst) == ipif->ipif_subnet);
- if (ipif->ipif_flags & IPIF_DEPRECATED) {
+ if (ipif->ipif_lcl_addr == dst) {
+ type = IPIF_LOCALADDR;
+ } else if (ipif->ipif_flags & IPIF_DEPRECATED) {
type = samenet ? IPIF_SAMENET_DEPRECATED :
IPIF_DIFFNET_DEPRECATED;
} else if (ipif->ipif_zoneid == ALL_ZONES) {
@@ -16762,14 +14746,14 @@ retry:
if (type > best_type) {
best_type = type;
best_ipif = ipif;
- if (best_type == IPIF_SAMENET)
+ if (best_type == IPIF_LOCALADDR)
break; /* can't get better */
}
} while ((ipif = next_ipif) != start_ipif);
if ((ipif = best_ipif) != NULL) {
mutex_enter(&ipif->ipif_ill->ill_lock);
- if (!IPIF_CAN_LOOKUP(ipif)) {
+ if (IPIF_IS_CONDEMNED(ipif)) {
mutex_exit(&ipif->ipif_ill->ill_lock);
goto retry;
}
@@ -16783,7 +14767,7 @@ retry:
*/
if (IS_IPMP(ill) && ipif != NULL) {
next_ipif = ipif->ipif_next;
- if (next_ipif != NULL && IPIF_CAN_LOOKUP(next_ipif))
+ if (next_ipif != NULL && !IPIF_IS_CONDEMNED(next_ipif))
ill->ill_src_ipif = next_ipif;
else
ill->ill_src_ipif = NULL;
@@ -16803,14 +14787,14 @@ retry:
if (ipif == NULL) {
char buf1[INET6_ADDRSTRLEN];
- ip1dbg(("ipif_select_source(%s, %s) -> NULL\n",
+ ip1dbg(("ipif_select_source_v4(%s, %s) -> NULL\n",
ill->ill_name,
inet_ntop(AF_INET, &dst, buf1, sizeof (buf1))));
} else {
char buf1[INET6_ADDRSTRLEN];
char buf2[INET6_ADDRSTRLEN];
- ip1dbg(("ipif_select_source(%s, %s) -> %s\n",
+ ip1dbg(("ipif_select_source_v4(%s, %s) -> %s\n",
ipif->ipif_ill->ill_name,
inet_ntop(AF_INET, &dst, buf1, sizeof (buf1)),
inet_ntop(AF_INET, &ipif->ipif_lcl_addr,
@@ -16821,172 +14805,80 @@ retry:
}
/*
- * If old_ipif is not NULL, see if ipif was derived from old
- * ipif and if so, recreate the interface route by re-doing
- * source address selection. This happens when ipif_down ->
- * ipif_update_other_ipifs calls us.
+ * Pick a source address based on the destination ill and an optional setsrc
+ * address.
+ * The result is stored in srcp. If generation is set, then put the source
+ * generation number there before we look for the source address (to avoid
+ * missing changes in the set of source addresses).
+ * If flagsp is set, then use it to pass back ipif_flags.
*
- * If old_ipif is NULL, just redo the source address selection
- * if needed. This happens when ipif_up_done calls us.
+ * If the caller wants to cache the returned source address and detect when
+ * that might be stale, the caller should pass in a generation argument,
+ * which the caller can later compare against ips_src_generation
+ *
+ * The precedence order for selecting an IPv4 source address is:
+ * - RTF_SETSRC on the offlink ire always wins.
+ * - If usesrc is set, swap the ill to be the usesrc one.
+ * - If IPMP is used on the ill, select a random address from the most
+ * preferred ones below:
+ * 1. If onlink destination, same subnet and not deprecated, not ALL_ZONES
+ * 2. Not deprecated, not ALL_ZONES
+ * 3. If onlink destination, same subnet and not deprecated, ALL_ZONES
+ * 4. Not deprecated, ALL_ZONES
+ * 5. If onlink destination, same subnet and deprecated
+ * 6. Deprecated.
+ *
+ * We have lower preference for ALL_ZONES IP addresses,
+ * as they pose problems with unlabeled destinations.
+ *
+ * Note that when multiple IP addresses match e.g., #1 we pick
+ * the first one if IPMP is not in use. With IPMP we randomize.
*/
-static void
-ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif)
+int
+ip_select_source_v4(ill_t *ill, ipaddr_t setsrc, ipaddr_t dst,
+ ipaddr_t multicast_ifaddr,
+ zoneid_t zoneid, ip_stack_t *ipst, ipaddr_t *srcp,
+ uint32_t *generation, uint64_t *flagsp)
{
- ire_t *ire;
- ire_t *ipif_ire;
- queue_t *stq;
- ipif_t *nipif;
- ill_t *ill;
- boolean_t need_rele = B_FALSE;
- ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
-
- ASSERT(old_ipif == NULL || IAM_WRITER_IPIF(old_ipif));
- ASSERT(IAM_WRITER_IPIF(ipif));
+ ipif_t *ipif;
+ boolean_t notready = B_FALSE; /* Set if !ipif_addr_ready found */
- ill = ipif->ipif_ill;
- if (!(ipif->ipif_flags &
- (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED))) {
- /*
- * Can't possibly have borrowed the source
- * from old_ipif.
- */
- return;
- }
+ if (flagsp != NULL)
+ *flagsp = 0;
/*
- * Is there any work to be done? No work if the address
- * is INADDR_ANY, loopback or NOLOCAL or ANYCAST (
- * ipif_select_source() does not borrow addresses from
- * NOLOCAL and ANYCAST interfaces).
+ * Need to grab the generation number before we check to
+ * avoid a race with a change to the set of local addresses.
+ * No lock needed since the thread which updates the set of local
+ * addresses use ipif/ill locks and exit those (hence a store memory
+ * barrier) before doing the atomic increase of ips_src_generation.
*/
- if ((old_ipif != NULL) &&
- ((old_ipif->ipif_lcl_addr == INADDR_ANY) ||
- (old_ipif->ipif_ill->ill_wq == NULL) ||
- (old_ipif->ipif_flags &
- (IPIF_NOLOCAL|IPIF_ANYCAST)))) {
- return;
+ if (generation != NULL) {
+ *generation = ipst->ips_src_generation;
}
- /*
- * Perform the same checks as when creating the
- * IRE_INTERFACE in ipif_up_done.
- */
- if (!(ipif->ipif_flags & IPIF_UP))
- return;
-
- if ((ipif->ipif_flags & IPIF_NOXMIT) ||
- (ipif->ipif_subnet == INADDR_ANY))
- return;
-
- ipif_ire = ipif_to_ire(ipif);
- if (ipif_ire == NULL)
- return;
-
- /*
- * We know that ipif uses some other source for its
- * IRE_INTERFACE. Is it using the source of this
- * old_ipif?
- */
- if (old_ipif != NULL &&
- old_ipif->ipif_lcl_addr != ipif_ire->ire_src_addr) {
- ire_refrele(ipif_ire);
- return;
- }
- if (ip_debug > 2) {
- /* ip1dbg */
- pr_addr_dbg("ipif_recreate_interface_routes: deleting IRE for"
- " src %s\n", AF_INET, &ipif_ire->ire_src_addr);
- }
-
- stq = ipif_ire->ire_stq;
-
- /*
- * Can't use our source address. Select a different
- * source address for the IRE_INTERFACE.
- */
- nipif = ipif_select_source(ill, ipif->ipif_subnet, ipif->ipif_zoneid);
- if (nipif == NULL) {
- /* Last resort - all ipif's have IPIF_NOLOCAL */
- nipif = ipif;
- } else {
- need_rele = B_TRUE;
+ if (CLASSD(dst) && multicast_ifaddr != INADDR_ANY) {
+ *srcp = multicast_ifaddr;
+ return (0);
}
- ire = ire_create(
- (uchar_t *)&ipif->ipif_subnet, /* dest pref */
- (uchar_t *)&ipif->ipif_net_mask, /* mask */
- (uchar_t *)&nipif->ipif_src_addr, /* src addr */
- NULL, /* no gateway */
- &ipif->ipif_mtu, /* max frag */
- NULL, /* no src nce */
- NULL, /* no recv from queue */
- stq, /* send-to queue */
- ill->ill_net_type, /* IF_[NO]RESOLVER */
- ipif,
- 0,
- 0,
- 0,
- 0,
- &ire_uinfo_null,
- NULL,
- NULL,
- ipst);
-
- if (ire != NULL) {
- ire_t *ret_ire;
- int error;
-
- /*
- * We don't need ipif_ire anymore. We need to delete
- * before we add so that ire_add does not detect
- * duplicates.
- */
- ire_delete(ipif_ire);
- ret_ire = ire;
- error = ire_add(&ret_ire, NULL, NULL, NULL, B_FALSE);
- ASSERT(error == 0);
- ASSERT(ire == ret_ire);
- /* Held in ire_add */
- ire_refrele(ret_ire);
+ /* Was RTF_SETSRC set on the first IRE in the recursive lookup? */
+ if (setsrc != INADDR_ANY) {
+ *srcp = setsrc;
+ return (0);
}
- /*
- * Either we are falling through from above or could not
- * allocate a replacement.
- */
- ire_refrele(ipif_ire);
- if (need_rele)
- ipif_refrele(nipif);
-}
-
-/*
- * This old_ipif is going away.
- *
- * Determine if any other ipif's are using our address as
- * ipif_lcl_addr (due to those being IPIF_NOLOCAL, IPIF_ANYCAST, or
- * IPIF_DEPRECATED).
- * Find the IRE_INTERFACE for such ipifs and recreate them
- * to use an different source address following the rules in
- * ipif_up_done.
- */
-static void
-ipif_update_other_ipifs(ipif_t *old_ipif)
-{
- ipif_t *ipif;
- ill_t *ill;
- char buf[INET6_ADDRSTRLEN];
-
- ASSERT(IAM_WRITER_IPIF(old_ipif));
-
- ill = old_ipif->ipif_ill;
-
- ip1dbg(("ipif_update_other_ipifs(%s, %s)\n", ill->ill_name,
- inet_ntop(AF_INET, &old_ipif->ipif_lcl_addr, buf, sizeof (buf))));
-
- for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
- if (ipif == old_ipif)
- continue;
- ipif_recreate_interface_routes(old_ipif, ipif);
+ ipif = ipif_select_source_v4(ill, dst, zoneid, B_TRUE, &notready);
+ if (ipif == NULL) {
+ if (notready)
+ return (ENETDOWN);
+ else
+ return (EADDRNOTAVAIL);
}
+ *srcp = ipif->ipif_lcl_addr;
+ if (flagsp != NULL)
+ *flagsp = ipif->ipif_flags;
+ ipif_refrele(ipif);
+ return (0);
}
/* ARGSUSED */
@@ -17049,51 +14941,12 @@ ip_sioctl_sifname(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
}
/*
- * Refresh all IRE_BROADCAST entries associated with `ill' to ensure the
- * minimum (but complete) set exist. This is necessary when adding or
- * removing an interface to/from an IPMP group, since interfaces in an
- * IPMP group use the IRE_BROADCAST entries for the IPMP group (whenever
- * its test address subnets overlap with IPMP data addresses). It's also
- * used to refresh the IRE_BROADCAST entries associated with the IPMP
- * interface when the nominated broadcast interface changes.
- */
-void
-ill_refresh_bcast(ill_t *ill)
-{
- ire_t *ire_array[12]; /* max ipif_create_bcast_ires() can create */
- ire_t **irep;
- ipif_t *ipif;
-
- ASSERT(!ill->ill_isv6);
- ASSERT(IAM_WRITER_ILL(ill));
-
- /*
- * Remove any old broadcast IREs.
- */
- ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_BROADCAST,
- ill_broadcast_delete, ill, ill);
-
- /*
- * Create new ones for any ipifs that are up and broadcast-capable.
- */
- for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
- if ((ipif->ipif_flags & (IPIF_UP|IPIF_BROADCAST)) !=
- (IPIF_UP|IPIF_BROADCAST))
- continue;
-
- irep = ipif_create_bcast_ires(ipif, ire_array);
- while (irep-- > ire_array) {
- (void) ire_add(irep, NULL, NULL, NULL, B_FALSE);
- if (*irep != NULL)
- ire_refrele(*irep);
- }
- }
-}
-
-/*
* Create any IRE_BROADCAST entries for `ipif', and store those entries in
- * `irep'. Returns a pointer to the next free `irep' entry (just like
- * ire_check_and_create_bcast()).
+ * `irep'. Returns a pointer to the next free `irep' entry
+ * A mirror exists in ipif_delete_bcast_ires().
+ *
+ * The management of any "extra" or seemingly duplicate IRE_BROADCASTs is
+ * done in ire_add.
*/
static ire_t **
ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep)
@@ -17101,18 +14954,20 @@ ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep)
ipaddr_t addr;
ipaddr_t netmask = ip_net_mask(ipif->ipif_lcl_addr);
ipaddr_t subnetmask = ipif->ipif_net_mask;
- int flags = MATCH_IRE_TYPE | MATCH_IRE_ILL;
+ ill_t *ill = ipif->ipif_ill;
+ zoneid_t zoneid = ipif->ipif_zoneid;
ip1dbg(("ipif_create_bcast_ires: creating broadcast IREs\n"));
ASSERT(ipif->ipif_flags & IPIF_BROADCAST);
+ ASSERT(!(ipif->ipif_flags & IPIF_NOXMIT));
if (ipif->ipif_lcl_addr == INADDR_ANY ||
(ipif->ipif_flags & IPIF_NOLOCAL))
netmask = htonl(IN_CLASSA_NET); /* fallback */
- irep = ire_check_and_create_bcast(ipif, 0, irep, flags);
- irep = ire_check_and_create_bcast(ipif, INADDR_BROADCAST, irep, flags);
+ irep = ire_create_bcast(ill, 0, zoneid, irep);
+ irep = ire_create_bcast(ill, INADDR_BROADCAST, zoneid, irep);
/*
* For backward compatibility, we create net broadcast IREs based on
@@ -17125,9 +14980,8 @@ ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep)
*/
if (netmask < subnetmask) {
addr = netmask & ipif->ipif_subnet;
- irep = ire_check_and_create_bcast(ipif, addr, irep, flags);
- irep = ire_check_and_create_bcast(ipif, ~netmask | addr, irep,
- flags);
+ irep = ire_create_bcast(ill, addr, zoneid, irep);
+ irep = ire_create_bcast(ill, ~netmask | addr, zoneid, irep);
}
/*
@@ -17138,282 +14992,73 @@ ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep)
*/
if (subnetmask != 0xFFFFFFFF) {
addr = ipif->ipif_subnet;
- irep = ire_check_and_create_bcast(ipif, addr, irep, flags);
- irep = ire_check_and_create_bcast(ipif, ~subnetmask | addr,
- irep, flags);
+ irep = ire_create_bcast(ill, addr, zoneid, irep);
+ irep = ire_create_bcast(ill, ~subnetmask | addr, zoneid, irep);
}
return (irep);
}
/*
- * Broadcast IRE info structure used in the functions below. Since we
- * allocate BCAST_COUNT of them on the stack, keep the bit layout compact.
- */
-typedef struct bcast_ireinfo {
- uchar_t bi_type; /* BCAST_* value from below */
- uchar_t bi_willdie:1, /* will this IRE be going away? */
- bi_needrep:1, /* do we need to replace it? */
- bi_haverep:1, /* have we replaced it? */
- bi_pad:5;
- ipaddr_t bi_addr; /* IRE address */
- ipif_t *bi_backup; /* last-ditch ipif to replace it on */
-} bcast_ireinfo_t;
-
-enum { BCAST_ALLONES, BCAST_ALLZEROES, BCAST_NET, BCAST_SUBNET, BCAST_COUNT };
-
-/*
- * Check if `ipif' needs the dying broadcast IRE described by `bireinfop', and
- * return B_TRUE if it should immediately be used to recreate the IRE.
- */
-static boolean_t
-ipif_consider_bcast(ipif_t *ipif, bcast_ireinfo_t *bireinfop)
-{
- ipaddr_t addr;
-
- ASSERT(!bireinfop->bi_haverep && bireinfop->bi_willdie);
-
- switch (bireinfop->bi_type) {
- case BCAST_NET:
- addr = ipif->ipif_subnet & ip_net_mask(ipif->ipif_subnet);
- if (addr != bireinfop->bi_addr)
- return (B_FALSE);
- break;
- case BCAST_SUBNET:
- if (ipif->ipif_subnet != bireinfop->bi_addr)
- return (B_FALSE);
- break;
- }
-
- bireinfop->bi_needrep = 1;
- if (ipif->ipif_flags & (IPIF_DEPRECATED|IPIF_NOLOCAL|IPIF_ANYCAST)) {
- if (bireinfop->bi_backup == NULL)
- bireinfop->bi_backup = ipif;
- return (B_FALSE);
- }
- return (B_TRUE);
-}
-
-/*
- * Create the broadcast IREs described by `bireinfop' on `ipif', and return
- * them ala ire_check_and_create_bcast().
- */
-static ire_t **
-ipif_create_bcast(ipif_t *ipif, bcast_ireinfo_t *bireinfop, ire_t **irep)
-{
- ipaddr_t mask, addr;
-
- ASSERT(!bireinfop->bi_haverep && bireinfop->bi_needrep);
-
- addr = bireinfop->bi_addr;
- irep = ire_create_bcast(ipif, addr, irep);
-
- switch (bireinfop->bi_type) {
- case BCAST_NET:
- mask = ip_net_mask(ipif->ipif_subnet);
- irep = ire_create_bcast(ipif, addr | ~mask, irep);
- break;
- case BCAST_SUBNET:
- mask = ipif->ipif_net_mask;
- irep = ire_create_bcast(ipif, addr | ~mask, irep);
- break;
- }
-
- bireinfop->bi_haverep = 1;
- return (irep);
-}
-
-/*
- * Walk through all of the ipifs on `ill' that will be affected by `test_ipif'
- * going away, and determine if any of the broadcast IREs (named by `bireinfop')
- * that are going away are still needed. If so, have ipif_create_bcast()
- * recreate them (except for the deprecated case, as explained below).
- */
-static ire_t **
-ill_create_bcast(ill_t *ill, ipif_t *test_ipif, bcast_ireinfo_t *bireinfo,
- ire_t **irep)
-{
- int i;
- ipif_t *ipif;
-
- ASSERT(!ill->ill_isv6);
- for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
- /*
- * Skip this ipif if it's (a) the one being taken down, (b)
- * not in the same zone, or (c) has no valid local address.
- */
- if (ipif == test_ipif ||
- ipif->ipif_zoneid != test_ipif->ipif_zoneid ||
- ipif->ipif_subnet == 0 ||
- (ipif->ipif_flags & (IPIF_UP|IPIF_BROADCAST|IPIF_NOXMIT)) !=
- (IPIF_UP|IPIF_BROADCAST))
- continue;
-
- /*
- * For each dying IRE that hasn't yet been replaced, see if
- * `ipif' needs it and whether the IRE should be recreated on
- * `ipif'. If `ipif' is deprecated, ipif_consider_bcast()
- * will return B_FALSE even if `ipif' needs the IRE on the
- * hopes that we'll later find a needy non-deprecated ipif.
- * However, the ipif is recorded in bi_backup for possible
- * subsequent use by ipif_check_bcast_ires().
- */
- for (i = 0; i < BCAST_COUNT; i++) {
- if (!bireinfo[i].bi_willdie || bireinfo[i].bi_haverep)
- continue;
- if (!ipif_consider_bcast(ipif, &bireinfo[i]))
- continue;
- irep = ipif_create_bcast(ipif, &bireinfo[i], irep);
- }
-
- /*
- * If we've replaced all of the broadcast IREs that are going
- * to be taken down, we know we're done.
- */
- for (i = 0; i < BCAST_COUNT; i++) {
- if (bireinfo[i].bi_willdie && !bireinfo[i].bi_haverep)
- break;
- }
- if (i == BCAST_COUNT)
- break;
- }
- return (irep);
-}
-
-/*
- * Check if `test_ipif' (which is going away) is associated with any existing
- * broadcast IREs, and whether any other ipifs (e.g., on the same ill) were
- * using those broadcast IREs. If so, recreate the broadcast IREs on one or
- * more of those other ipifs. (The old IREs will be deleted in ipif_down().)
- *
- * This is necessary because broadcast IREs are shared. In particular, a
- * given ill has one set of all-zeroes and all-ones broadcast IREs (for every
- * zone), plus one set of all-subnet-ones, all-subnet-zeroes, all-net-ones,
- * and all-net-zeroes for every net/subnet (and every zone) it has IPIF_UP
- * ipifs on. Thus, if there are two IPIF_UP ipifs on the same subnet with the
- * same zone, they will share the same set of broadcast IREs.
- *
- * Note: the upper bound of 12 IREs comes from the worst case of replacing all
- * six pairs (loopback and non-loopback) of broadcast IREs (all-zeroes,
- * all-ones, subnet-zeroes, subnet-ones, net-zeroes, and net-ones).
+ * Mirror of ipif_create_bcast_ires()
*/
static void
-ipif_check_bcast_ires(ipif_t *test_ipif)
+ipif_delete_bcast_ires(ipif_t *ipif)
{
- ill_t *ill = test_ipif->ipif_ill;
- ire_t *ire, *ire_array[12]; /* see note above */
- ire_t **irep1, **irep = &ire_array[0];
- uint_t i, willdie;
- ipaddr_t mask = ip_net_mask(test_ipif->ipif_subnet);
- bcast_ireinfo_t bireinfo[BCAST_COUNT];
-
- ASSERT(!test_ipif->ipif_isv6);
- ASSERT(IAM_WRITER_IPIF(test_ipif));
-
- /*
- * No broadcast IREs for the LOOPBACK interface
- * or others such as point to point and IPIF_NOXMIT.
- */
- if (!(test_ipif->ipif_flags & IPIF_BROADCAST) ||
- (test_ipif->ipif_flags & IPIF_NOXMIT))
- return;
-
- bzero(bireinfo, sizeof (bireinfo));
- bireinfo[0].bi_type = BCAST_ALLZEROES;
- bireinfo[0].bi_addr = 0;
-
- bireinfo[1].bi_type = BCAST_ALLONES;
- bireinfo[1].bi_addr = INADDR_BROADCAST;
-
- bireinfo[2].bi_type = BCAST_NET;
- bireinfo[2].bi_addr = test_ipif->ipif_subnet & mask;
-
- if (test_ipif->ipif_net_mask != 0)
- mask = test_ipif->ipif_net_mask;
- bireinfo[3].bi_type = BCAST_SUBNET;
- bireinfo[3].bi_addr = test_ipif->ipif_subnet & mask;
-
- /*
- * Figure out what (if any) broadcast IREs will die as a result of
- * `test_ipif' going away. If none will die, we're done.
- */
- for (i = 0, willdie = 0; i < BCAST_COUNT; i++) {
- ire = ire_ctable_lookup(bireinfo[i].bi_addr, 0, IRE_BROADCAST,
- test_ipif, ALL_ZONES, NULL,
- (MATCH_IRE_TYPE | MATCH_IRE_IPIF), ill->ill_ipst);
- if (ire != NULL) {
- willdie++;
- bireinfo[i].bi_willdie = 1;
- ire_refrele(ire);
- }
- }
-
- if (willdie == 0)
- return;
-
- /*
- * Walk through all the ipifs that will be affected by the dying IREs,
- * and recreate the IREs as necessary. Note that all interfaces in an
- * IPMP illgrp share the same broadcast IREs, and thus the entire
- * illgrp must be walked, starting with the IPMP meta-interface (so
- * that broadcast IREs end up on it whenever possible).
- */
- if (IS_UNDER_IPMP(ill))
- ill = ipmp_illgrp_ipmp_ill(ill->ill_grp);
-
- irep = ill_create_bcast(ill, test_ipif, bireinfo, irep);
+ ipaddr_t addr;
+ ipaddr_t netmask = ip_net_mask(ipif->ipif_lcl_addr);
+ ipaddr_t subnetmask = ipif->ipif_net_mask;
+ ill_t *ill = ipif->ipif_ill;
+ zoneid_t zoneid = ipif->ipif_zoneid;
+ ire_t *ire;
- if (IS_IPMP(ill) || IS_UNDER_IPMP(ill)) {
- ipmp_illgrp_t *illg = ill->ill_grp;
+ ASSERT(ipif->ipif_flags & IPIF_BROADCAST);
+ ASSERT(!(ipif->ipif_flags & IPIF_NOXMIT));
- ill = list_head(&illg->ig_if);
- for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) {
- for (i = 0; i < BCAST_COUNT; i++) {
- if (bireinfo[i].bi_willdie &&
- !bireinfo[i].bi_haverep)
- break;
- }
- if (i == BCAST_COUNT)
- break;
+ if (ipif->ipif_lcl_addr == INADDR_ANY ||
+ (ipif->ipif_flags & IPIF_NOLOCAL))
+ netmask = htonl(IN_CLASSA_NET); /* fallback */
- irep = ill_create_bcast(ill, test_ipif, bireinfo, irep);
- }
- }
+ ire = ire_lookup_bcast(ill, 0, zoneid);
+ ASSERT(ire != NULL);
+ ire_delete(ire); ire_refrele(ire);
+ ire = ire_lookup_bcast(ill, INADDR_BROADCAST, zoneid);
+ ASSERT(ire != NULL);
+ ire_delete(ire); ire_refrele(ire);
/*
- * Scan through the set of broadcast IREs and see if there are any
- * that we need to replace that have not yet been replaced. If so,
- * replace them using the appropriate backup ipif.
+ * For backward compatibility, we create net broadcast IREs based on
+ * the old "IP address class system", since some old machines only
+ * respond to these class derived net broadcast. However, we must not
+ * create these net broadcast IREs if the subnetmask is shorter than
+ * the IP address class based derived netmask. Otherwise, we may
+ * create a net broadcast address which is the same as an IP address
+ * on the subnet -- and then TCP will refuse to talk to that address.
*/
- for (i = 0; i < BCAST_COUNT; i++) {
- if (bireinfo[i].bi_needrep && !bireinfo[i].bi_haverep)
- irep = ipif_create_bcast(bireinfo[i].bi_backup,
- &bireinfo[i], irep);
+ if (netmask < subnetmask) {
+ addr = netmask & ipif->ipif_subnet;
+ ire = ire_lookup_bcast(ill, addr, zoneid);
+ ASSERT(ire != NULL);
+ ire_delete(ire); ire_refrele(ire);
+ ire = ire_lookup_bcast(ill, ~netmask | addr, zoneid);
+ ASSERT(ire != NULL);
+ ire_delete(ire); ire_refrele(ire);
}
/*
- * If we can't create all of them, don't add any of them. (Code in
- * ip_wput_ire() and ire_to_ill() assumes that we always have a
- * non-loopback copy and loopback copy for a given address.)
+ * Don't create IRE_BROADCAST IREs for the interface if the subnetmask
+ * is 0xFFFFFFFF, as an IRE_LOCAL for that interface is already
+ * created. Creating these broadcast IREs will only create confusion
+ * as `addr' will be the same as the IP address.
*/
- for (irep1 = irep; irep1 > ire_array; ) {
- irep1--;
- if (*irep1 == NULL) {
- ip0dbg(("ipif_check_bcast_ires: can't create "
- "IRE_BROADCAST, memory allocation failure\n"));
- while (irep > ire_array) {
- irep--;
- if (*irep != NULL)
- ire_delete(*irep);
- }
- return;
- }
- }
-
- for (irep1 = irep; irep1 > ire_array; ) {
- irep1--;
- if (ire_add(irep1, NULL, NULL, NULL, B_FALSE) == 0)
- ire_refrele(*irep1); /* Held in ire_add */
+ if (subnetmask != 0xFFFFFFFF) {
+ addr = ipif->ipif_subnet;
+ ire = ire_lookup_bcast(ill, addr, zoneid);
+ ASSERT(ire != NULL);
+ ire_delete(ire); ire_refrele(ire);
+ ire = ire_lookup_bcast(ill, ~subnetmask | addr, zoneid);
+ ASSERT(ire != NULL);
+ ire_delete(ire); ire_refrele(ire);
}
}
@@ -17423,7 +15068,7 @@ ipif_check_bcast_ires(ipif_t *test_ipif)
* Set IFF_IPV* and ill_isv6 prior to doing the lookup
* since ipif_lookup_on_name uses the _isv6 flags when matching.
* Returns EINPROGRESS when mp has been consumed by queueing it on
- * ill_pending_mp and the ioctl will complete in ip_rput.
+ * ipx_pending_mp and the ioctl will complete in ip_rput.
*
* Can operate on either a module or a driver queue.
* Returns an error if not a module queue.
@@ -17485,7 +15130,7 @@ ip_sioctl_slifname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
* We start off as IFF_IPV4 in ipif_allocate and become
* IFF_IPV4 or IFF_IPV6 here depending on lifr_flags value.
* The only flags that we read from user space are IFF_IPV4,
- * IFF_IPV6, IFF_XRESOLV and IFF_BROADCAST.
+ * IFF_IPV6, and IFF_BROADCAST.
*
* This ill has not been inserted into the global list.
* So we are still single threaded and don't need any lock
@@ -17502,22 +15147,13 @@ ip_sioctl_slifname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
}
new_flags =
- lifr->lifr_flags & (IFF_IPV6|IFF_IPV4|IFF_XRESOLV|IFF_BROADCAST);
+ lifr->lifr_flags & (IFF_IPV6|IFF_IPV4|IFF_BROADCAST);
if ((new_flags ^ (IFF_IPV6|IFF_IPV4)) == 0) {
ip1dbg(("ip_sioctl_slifname: flags must be exactly one of "
"IFF_IPV4 or IFF_IPV6\n"));
return (EINVAL);
}
- /*
- * Only allow the IFF_XRESOLV flag to be set on IPv6 interfaces.
- */
- if ((new_flags & IFF_XRESOLV) && !(new_flags & IFF_IPV6) &&
- !(ipif->ipif_isv6)) {
- ip1dbg(("ip_sioctl_slifname: XRESOLV only allowed on "
- "IPv6 interface\n"));
- return (EINVAL);
- }
/*
* We always start off as IPv4, so only need to check for IPv6.
@@ -17532,11 +15168,6 @@ ip_sioctl_slifname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
else
ipif->ipif_flags &= ~IPIF_BROADCAST;
- if ((new_flags & IFF_XRESOLV) != 0)
- ill->ill_flags |= ILLF_XRESOLV;
- else
- ill->ill_flags &= ~ILLF_XRESOLV;
-
/* We started off as V4. */
if (ill->ill_flags & ILLF_IPV6) {
ill->ill_phyint->phyint_illv6 = ill;
@@ -17566,23 +15197,17 @@ ip_sioctl_slifname_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
*/
ipif_t *
ipif_lookup_on_ifindex(uint_t index, boolean_t isv6, zoneid_t zoneid,
- queue_t *q, mblk_t *mp, ipsq_func_t func, int *err, ip_stack_t *ipst)
+ ip_stack_t *ipst)
{
ill_t *ill;
ipif_t *ipif = NULL;
- ASSERT((q == NULL && mp == NULL && func == NULL && err == NULL) ||
- (q != NULL && mp != NULL && func != NULL && err != NULL));
-
- if (err != NULL)
- *err = 0;
-
- ill = ill_lookup_on_ifindex(index, isv6, q, mp, func, err, ipst);
+ ill = ill_lookup_on_ifindex(index, isv6, ipst);
if (ill != NULL) {
mutex_enter(&ill->ill_lock);
for (ipif = ill->ill_ipif; ipif != NULL;
ipif = ipif->ipif_next) {
- if (IPIF_CAN_LOOKUP(ipif) && (zoneid == ALL_ZONES ||
+ if (!IPIF_IS_CONDEMNED(ipif) && (zoneid == ALL_ZONES ||
zoneid == ipif->ipif_zoneid ||
ipif->ipif_zoneid == ALL_ZONES)) {
ipif_refhold_locked(ipif);
@@ -17591,8 +15216,6 @@ ipif_lookup_on_ifindex(uint_t index, boolean_t isv6, zoneid_t zoneid,
}
mutex_exit(&ill->ill_lock);
ill_refrele(ill);
- if (ipif == NULL && err != NULL)
- *err = ENXIO;
}
return (ipif);
}
@@ -17673,6 +15296,8 @@ ip_sioctl_slifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
if (ILL_OTHER(ill))
ip_rts_ifmsg(ILL_OTHER(ill)->ill_ipif, RTSQ_DEFAULT);
+ /* Perhaps ilgs should use this ill */
+ update_conn_ill(NULL, ill->ill_ipst);
return (0);
}
@@ -17764,7 +15389,7 @@ ip_sioctl_slifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
err = ipif_logical_down(ipif, q, mp);
if (err == EINPROGRESS)
return (err);
- ipif_down_tail(ipif);
+ (void) ipif_down_tail(ipif);
need_up = B_TRUE;
}
@@ -17801,6 +15426,9 @@ ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid,
/* Update sctp list */
sctp_update_ipif(ipif, SCTP_IPIF_UPDATE);
+ /* The default multicast interface might have changed */
+ ire_increment_multicast_generation(ipst, ipif->ipif_ill->ill_isv6);
+
if (need_up) {
/*
* Now bring the interface back up. If this
@@ -17825,7 +15453,6 @@ ip_sioctl_slifzone_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
zone_t *zptr;
zone_status_t status;
- ASSERT(ipif->ipif_id != 0);
ASSERT(ipip->ipi_cmd_type == LIF_CMD);
if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES)
zoneid = GLOBAL_ZONEID;
@@ -17863,7 +15490,7 @@ ip_sioctl_slifzone_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
return (EINVAL);
}
- ipif_down_tail(ipif);
+ (void) ipif_down_tail(ipif);
return (ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp,
B_TRUE));
@@ -17943,6 +15570,16 @@ ill_prev_usesrc(ill_t *uill)
* Release all members of the usesrc group. This routine is called
* from ill_delete when the interface being unplumbed is the
* group head.
+ *
+ * This silently clears the usesrc that ifconfig setup.
+ * An alternative would be to keep that ifindex, and drop packets on the floor
+ * since no source address can be selected.
+ * Even if we keep the current semantics, don't need a lock and a linked list.
+ * Can walk all the ills checking if they have a ill_usesrc_ifindex matching
+ * the one that is being removed. Issue is how we return the usesrc users
+ * (SIOCGLIFSRCOF). We want to be able to find the ills which have an
+ * ill_usesrc_ifindex matching a target ill. We could also do that with an
+ * ill walk, but the walker would need to insert in the ioctl response.
*/
static void
ill_disband_usesrc_group(ill_t *uill)
@@ -18023,8 +15660,7 @@ ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
ip_ioctl_cmd_t *ipip, void *ifreq)
{
struct lifreq *lifr = (struct lifreq *)ifreq;
- boolean_t isv6 = B_FALSE, reset_flg = B_FALSE,
- ill_flag_changed = B_FALSE;
+ boolean_t isv6 = B_FALSE, reset_flg = B_FALSE;
ill_t *usesrc_ill, *usesrc_cli_ill = ipif->ipif_ill;
int err = 0, ret;
uint_t ifindex;
@@ -18035,7 +15671,7 @@ ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
ASSERT(q->q_next == NULL);
ASSERT(CONN_Q(q));
- isv6 = (Q_TO_CONN(q))->conn_af_isv6;
+ isv6 = (Q_TO_CONN(q))->conn_family == AF_INET6;
ifindex = lifr->lifr_index;
if (ifindex == 0) {
@@ -18048,10 +15684,9 @@ ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
reset_flg = B_TRUE;
}
- usesrc_ill = ill_lookup_on_ifindex(ifindex, isv6, q, mp,
- ip_process_ioctl, &err, ipst);
+ usesrc_ill = ill_lookup_on_ifindex(ifindex, isv6, ipst);
if (usesrc_ill == NULL) {
- return (err);
+ return (ENXIO);
}
ipsq = ipsq_try_enter(NULL, usesrc_ill, q, mp, ip_process_ioctl,
@@ -18101,31 +15736,6 @@ ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
usesrc_ill->ill_isv6));
/*
- * The next step ensures that no new ires will be created referencing
- * the client ill, until the ILL_CHANGING flag is cleared. Then
- * we go through an ire walk deleting all ire caches that reference
- * the client ill. New ires referencing the client ill that are added
- * to the ire table before the ILL_CHANGING flag is set, will be
- * cleaned up by the ire walk below. Attempt to add new ires referencing
- * the client ill while the ILL_CHANGING flag is set will be failed
- * during the ire_add in ire_atomic_start. ire_atomic_start atomically
- * checks (under the ill_g_usesrc_lock) that the ire being added
- * is not stale, i.e the ire_stq and ire_ipif are consistent and
- * belong to the same usesrc group.
- */
- mutex_enter(&usesrc_cli_ill->ill_lock);
- usesrc_cli_ill->ill_state_flags |= ILL_CHANGING;
- mutex_exit(&usesrc_cli_ill->ill_lock);
- ill_flag_changed = B_TRUE;
-
- if (ipif->ipif_isv6)
- ire_walk_v6(ipif_delete_cache_ire, (char *)usesrc_cli_ill,
- ALL_ZONES, ipst);
- else
- ire_walk_v4(ipif_delete_cache_ire, (char *)usesrc_cli_ill,
- ALL_ZONES, ipst);
-
- /*
* ill_g_usesrc_lock global lock protects the ill_usesrc_grp_next
* and the ill_usesrc_ifindex fields
*/
@@ -18169,15 +15779,14 @@ ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
rw_exit(&ipst->ips_ill_g_usesrc_lock);
done:
- if (ill_flag_changed) {
- mutex_enter(&usesrc_cli_ill->ill_lock);
- usesrc_cli_ill->ill_state_flags &= ~ILL_CHANGING;
- mutex_exit(&usesrc_cli_ill->ill_lock);
- }
if (ipsq != NULL)
ipsq_exit(ipsq);
/* The refrele on the lifr_name ipif is done by ip_process_ioctl */
ill_refrele(usesrc_ill);
+
+ /* Let conn_ixa caching know that source address selection changed */
+ ip_update_source_selection(ipst);
+
return (err);
}
@@ -18384,7 +15993,6 @@ ill_phyint_reinit(ill_t *ill)
* Now that the phyint's ifindex has been assigned, complete the
* remaining
*/
-
ill->ill_ip_mib->ipIfStatsIfIndex = ill->ill_phyint->phyint_ifindex;
if (ill->ill_isv6) {
ill->ill_icmp6_mib->ipv6IfIcmpIfIndex =
@@ -18449,6 +16057,8 @@ ip_ifname_notify(ill_t *ill, queue_t *q)
lifr->lifr_ppa = ill->ill_ppa;
lifr->lifr_flags = (ill->ill_flags & (ILLF_IPV4|ILLF_IPV6));
+ DTRACE_PROBE3(ill__dlpi, char *, "ip_ifname_notify",
+ char *, "SIOCSLIFNAME", ill_t *, ill);
putnext(q, mp1);
}
@@ -18503,23 +16113,6 @@ ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q)
*/
err = ill_dl_phys(ill, ipif, mp, q);
- /*
- * If there is no IRE expiration timer running, get one started.
- * igmp and mld timers will be triggered by the first multicast
- */
- if (ipst->ips_ip_ire_expire_id == 0) {
- /*
- * acquire the lock and check again.
- */
- mutex_enter(&ipst->ips_ip_trash_timer_lock);
- if (ipst->ips_ip_ire_expire_id == 0) {
- ipst->ips_ip_ire_expire_id = timeout(
- ip_trash_timer_expire, ipst,
- MSEC_TO_TICK(ipst->ips_ip_timer_interval));
- }
- mutex_exit(&ipst->ips_ip_trash_timer_lock);
- }
-
if (ill->ill_isv6) {
mutex_enter(&ipst->ips_mld_slowtimeout_lock);
if (ipst->ips_mld_slowtimeout_id == 0) {
@@ -18545,7 +16138,7 @@ ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q)
* Common routine for ppa and ifname setting. Should be called exclusive.
*
* Returns EINPROGRESS when mp has been consumed by queueing it on
- * ill_pending_mp and the ioctl will complete in ip_rput.
+ * ipx_pending_mp and the ioctl will complete in ip_rput.
*
* NOTE : If ppa is UNIT_MAX, we assign the next valid ppa and return
* the new name and new ppa in lifr_name and lifr_ppa respectively.
@@ -18576,6 +16169,7 @@ ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *new_ppa_ptr)
ASSERT((mi_strlen(interf_name) + 1) <= LIFNAMSIZ);
ASSERT(ill->ill_ppa == UINT_MAX);
+ ill->ill_defend_start = ill->ill_defend_count = 0;
/* The ppa is sent down by ifconfig or is chosen */
if ((ppa_ptr = ill_get_ppa_ptr(interf_name)) == NULL) {
return (EINVAL);
@@ -18630,18 +16224,18 @@ ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *new_ppa_ptr)
if (ill->ill_flags & ILLF_IPV6) {
ill->ill_isv6 = B_TRUE;
+ ill_set_inputfn(ill);
if (ill->ill_rq != NULL) {
ill->ill_rq->q_qinfo = &iprinitv6;
- ill->ill_wq->q_qinfo = &ipwinitv6;
}
/* Keep the !IN6_IS_ADDR_V4MAPPED assertions happy */
ipif->ipif_v6lcl_addr = ipv6_all_zeros;
- ipif->ipif_v6src_addr = ipv6_all_zeros;
ipif->ipif_v6subnet = ipv6_all_zeros;
ipif->ipif_v6net_mask = ipv6_all_zeros;
ipif->ipif_v6brd_addr = ipv6_all_zeros;
ipif->ipif_v6pp_dst_addr = ipv6_all_zeros;
+ ill->ill_reachable_retrans_time = ND_RETRANS_TIMER;
/*
* point-to-point or Non-mulicast capable
* interfaces won't do NUD unless explicitly
@@ -18670,8 +16264,9 @@ ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *new_ppa_ptr)
ill->ill_flags |= ILLF_ROUTER;
} else if (ill->ill_flags & ILLF_IPV4) {
ill->ill_isv6 = B_FALSE;
+ ill_set_inputfn(ill);
+ ill->ill_reachable_retrans_time = ARP_RETRANS_TIMER;
IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6lcl_addr);
- IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6src_addr);
IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6subnet);
IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6net_mask);
IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6brd_addr);
@@ -18783,6 +16378,7 @@ ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *new_ppa_ptr)
* restore previous values
*/
ill->ill_isv6 = B_FALSE;
+ ill_set_inputfn(ill);
}
return (error);
}
@@ -18810,95 +16406,11 @@ ipif_init(ip_stack_t *ipst)
}
/*
- * Lookup the ipif corresponding to the onlink destination address. For
- * point-to-point interfaces, it matches with remote endpoint destination
- * address. For point-to-multipoint interfaces it only tries to match the
- * destination with the interface's subnet address. The longest, most specific
- * match is found to take care of such rare network configurations like -
- * le0: 129.146.1.1/16
- * le1: 129.146.2.2/24
- *
- * This is used by SO_DONTROUTE and IP_NEXTHOP. Since neither of those are
- * supported on underlying interfaces in an IPMP group, underlying interfaces
- * are ignored when looking up a match. (If we didn't ignore them, we'd
- * risk using a test address as a source for outgoing traffic.)
- */
-ipif_t *
-ipif_lookup_onlink_addr(ipaddr_t addr, zoneid_t zoneid, ip_stack_t *ipst)
-{
- ipif_t *ipif, *best_ipif;
- ill_t *ill;
- ill_walk_context_t ctx;
-
- ASSERT(zoneid != ALL_ZONES);
- best_ipif = NULL;
-
- rw_enter(&ipst->ips_ill_g_lock, RW_READER);
- ill = ILL_START_WALK_V4(&ctx, ipst);
- for (; ill != NULL; ill = ill_next(&ctx, ill)) {
- if (IS_UNDER_IPMP(ill))
- continue;
- mutex_enter(&ill->ill_lock);
- for (ipif = ill->ill_ipif; ipif != NULL;
- ipif = ipif->ipif_next) {
- if (!IPIF_CAN_LOOKUP(ipif))
- continue;
- if (ipif->ipif_zoneid != zoneid &&
- ipif->ipif_zoneid != ALL_ZONES)
- continue;
- /*
- * Point-to-point case. Look for exact match with
- * destination address.
- */
- if (ipif->ipif_flags & IPIF_POINTOPOINT) {
- if (ipif->ipif_pp_dst_addr == addr) {
- ipif_refhold_locked(ipif);
- mutex_exit(&ill->ill_lock);
- rw_exit(&ipst->ips_ill_g_lock);
- if (best_ipif != NULL)
- ipif_refrele(best_ipif);
- return (ipif);
- }
- } else if (ipif->ipif_subnet == (addr &
- ipif->ipif_net_mask)) {
- /*
- * Point-to-multipoint case. Looping through to
- * find the most specific match. If there are
- * multiple best match ipif's then prefer ipif's
- * that are UP. If there is only one best match
- * ipif and it is DOWN we must still return it.
- */
- if ((best_ipif == NULL) ||
- (ipif->ipif_net_mask >
- best_ipif->ipif_net_mask) ||
- ((ipif->ipif_net_mask ==
- best_ipif->ipif_net_mask) &&
- ((ipif->ipif_flags & IPIF_UP) &&
- (!(best_ipif->ipif_flags & IPIF_UP))))) {
- ipif_refhold_locked(ipif);
- mutex_exit(&ill->ill_lock);
- rw_exit(&ipst->ips_ill_g_lock);
- if (best_ipif != NULL)
- ipif_refrele(best_ipif);
- best_ipif = ipif;
- rw_enter(&ipst->ips_ill_g_lock,
- RW_READER);
- mutex_enter(&ill->ill_lock);
- }
- }
- }
- mutex_exit(&ill->ill_lock);
- }
- rw_exit(&ipst->ips_ill_g_lock);
- return (best_ipif);
-}
-
-/*
* Save enough information so that we can recreate the IRE if
* the interface goes down and then up.
*/
-static void
-ipif_save_ire(ipif_t *ipif, ire_t *ire)
+void
+ill_save_ire(ill_t *ill, ire_t *ire)
{
mblk_t *save_mp;
@@ -18910,115 +16422,148 @@ ipif_save_ire(ipif_t *ipif, ire_t *ire)
ifrt = (ifrt_t *)save_mp->b_rptr;
bzero(ifrt, sizeof (ifrt_t));
ifrt->ifrt_type = ire->ire_type;
- ifrt->ifrt_addr = ire->ire_addr;
- ifrt->ifrt_gateway_addr = ire->ire_gateway_addr;
- ifrt->ifrt_src_addr = ire->ire_src_addr;
- ifrt->ifrt_mask = ire->ire_mask;
+ if (ire->ire_ipversion == IPV4_VERSION) {
+ ASSERT(!ill->ill_isv6);
+ ifrt->ifrt_addr = ire->ire_addr;
+ ifrt->ifrt_gateway_addr = ire->ire_gateway_addr;
+ ifrt->ifrt_setsrc_addr = ire->ire_setsrc_addr;
+ ifrt->ifrt_mask = ire->ire_mask;
+ } else {
+ ASSERT(ill->ill_isv6);
+ ifrt->ifrt_v6addr = ire->ire_addr_v6;
+ /* ire_gateway_addr_v6 can change due to RTM_CHANGE */
+ mutex_enter(&ire->ire_lock);
+ ifrt->ifrt_v6gateway_addr = ire->ire_gateway_addr_v6;
+ mutex_exit(&ire->ire_lock);
+ ifrt->ifrt_v6setsrc_addr = ire->ire_setsrc_addr_v6;
+ ifrt->ifrt_v6mask = ire->ire_mask_v6;
+ }
ifrt->ifrt_flags = ire->ire_flags;
- ifrt->ifrt_max_frag = ire->ire_max_frag;
- mutex_enter(&ipif->ipif_saved_ire_lock);
- save_mp->b_cont = ipif->ipif_saved_ire_mp;
- ipif->ipif_saved_ire_mp = save_mp;
- ipif->ipif_saved_ire_cnt++;
- mutex_exit(&ipif->ipif_saved_ire_lock);
+ ifrt->ifrt_zoneid = ire->ire_zoneid;
+ mutex_enter(&ill->ill_saved_ire_lock);
+ save_mp->b_cont = ill->ill_saved_ire_mp;
+ ill->ill_saved_ire_mp = save_mp;
+ ill->ill_saved_ire_cnt++;
+ mutex_exit(&ill->ill_saved_ire_lock);
}
}
-static void
-ipif_remove_ire(ipif_t *ipif, ire_t *ire)
+/*
+ * Remove one entry from ill_saved_ire_mp.
+ */
+void
+ill_remove_saved_ire(ill_t *ill, ire_t *ire)
{
mblk_t **mpp;
mblk_t *mp;
ifrt_t *ifrt;
- /* Remove from ipif_saved_ire_mp list if it is there */
- mutex_enter(&ipif->ipif_saved_ire_lock);
- for (mpp = &ipif->ipif_saved_ire_mp; *mpp != NULL;
+ /* Remove from ill_saved_ire_mp list if it is there */
+ mutex_enter(&ill->ill_saved_ire_lock);
+ for (mpp = &ill->ill_saved_ire_mp; *mpp != NULL;
mpp = &(*mpp)->b_cont) {
+ in6_addr_t gw_addr_v6;
+
/*
- * On a given ipif, the triple of address, gateway and
- * mask is unique for each saved IRE (in the case of
- * ordinary interface routes, the gateway address is
- * all-zeroes).
+ * On a given ill, the tuple of address, gateway, mask,
+ * ire_type, and zoneid is unique for each saved IRE.
*/
mp = *mpp;
ifrt = (ifrt_t *)mp->b_rptr;
- if (ifrt->ifrt_addr == ire->ire_addr &&
+ /* ire_gateway_addr_v6 can change - need lock */
+ mutex_enter(&ire->ire_lock);
+ gw_addr_v6 = ire->ire_gateway_addr_v6;
+ mutex_exit(&ire->ire_lock);
+
+ if (ifrt->ifrt_zoneid != ire->ire_zoneid ||
+ ifrt->ifrt_type != ire->ire_type)
+ continue;
+
+ if (ill->ill_isv6 ?
+ (IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6addr,
+ &ire->ire_addr_v6) &&
+ IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6gateway_addr,
+ &gw_addr_v6) &&
+ IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6mask,
+ &ire->ire_mask_v6)) :
+ (ifrt->ifrt_addr == ire->ire_addr &&
ifrt->ifrt_gateway_addr == ire->ire_gateway_addr &&
- ifrt->ifrt_mask == ire->ire_mask) {
+ ifrt->ifrt_mask == ire->ire_mask)) {
*mpp = mp->b_cont;
- ipif->ipif_saved_ire_cnt--;
+ ill->ill_saved_ire_cnt--;
freeb(mp);
break;
}
}
- mutex_exit(&ipif->ipif_saved_ire_lock);
+ mutex_exit(&ill->ill_saved_ire_lock);
}
/*
* IP multirouting broadcast routes handling
* Append CGTP broadcast IREs to regular ones created
* at ifconfig time.
+ * The usage is a route add <cgtp_bc> <nic_bc> -multirt i.e., both
+ * the destination and the gateway are broadcast addresses.
+ * The caller has verified that the destination is an IRE_BROADCAST and that
+ * RTF_MULTIRT was set. Here if the gateway is a broadcast address, then
+ * we create a MULTIRT IRE_BROADCAST.
+ * Note that the IRE_HOST created by ire_rt_add doesn't get found by anything
+ * since the IRE_BROADCAST takes precedence; ire_add_v4 does head insertion.
*/
static void
-ip_cgtp_bcast_add(ire_t *ire, ire_t *ire_dst, ip_stack_t *ipst)
+ip_cgtp_bcast_add(ire_t *ire, ip_stack_t *ipst)
{
ire_t *ire_prim;
ASSERT(ire != NULL);
- ASSERT(ire_dst != NULL);
- ire_prim = ire_ctable_lookup(ire->ire_gateway_addr, 0,
- IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
+ ire_prim = ire_ftable_lookup_v4(ire->ire_gateway_addr, 0, 0,
+ IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, ipst,
+ NULL);
if (ire_prim != NULL) {
/*
* We are in the special case of broadcasts for
* CGTP. We add an IRE_BROADCAST that holds
* the RTF_MULTIRT flag, the destination
- * address of ire_dst and the low level
+ * address and the low level
* info of ire_prim. In other words, CGTP
* broadcast is added to the redundant ipif.
*/
- ipif_t *ipif_prim;
+ ill_t *ill_prim;
ire_t *bcast_ire;
- ipif_prim = ire_prim->ire_ipif;
+ ill_prim = ire_prim->ire_ill;
- ip2dbg(("ip_cgtp_filter_bcast_add: "
- "ire_dst %p, ire_prim %p, ipif_prim %p\n",
- (void *)ire_dst, (void *)ire_prim,
- (void *)ipif_prim));
+ ip2dbg(("ip_cgtp_filter_bcast_add: ire_prim %p, ill_prim %p\n",
+ (void *)ire_prim, (void *)ill_prim));
bcast_ire = ire_create(
(uchar_t *)&ire->ire_addr,
(uchar_t *)&ip_g_all_ones,
- (uchar_t *)&ire_dst->ire_src_addr,
(uchar_t *)&ire->ire_gateway_addr,
- &ipif_prim->ipif_mtu,
- NULL,
- ipif_prim->ipif_rq,
- ipif_prim->ipif_wq,
IRE_BROADCAST,
- ipif_prim,
- 0,
- 0,
- 0,
- ire->ire_flags,
- &ire_uinfo_null,
- NULL,
+ ill_prim,
+ GLOBAL_ZONEID, /* CGTP is only for the global zone */
+ ire->ire_flags | RTF_KERNEL,
NULL,
ipst);
+ /*
+ * Here we assume that ire_add does head insertion so that
+ * the added IRE_BROADCAST comes before the existing IRE_HOST.
+ */
if (bcast_ire != NULL) {
-
- if (ire_add(&bcast_ire, NULL, NULL, NULL,
- B_FALSE) == 0) {
+ if (ire->ire_flags & RTF_SETSRC) {
+ bcast_ire->ire_setsrc_addr =
+ ire->ire_setsrc_addr;
+ }
+ bcast_ire = ire_add(bcast_ire);
+ if (bcast_ire != NULL) {
ip2dbg(("ip_cgtp_filter_bcast_add: "
"added bcast_ire %p\n",
(void *)bcast_ire));
- ipif_save_ire(bcast_ire->ire_ipif,
- bcast_ire);
+ ill_save_ire(ill_prim, bcast_ire);
ire_refrele(bcast_ire);
}
}
@@ -19028,430 +16573,52 @@ ip_cgtp_bcast_add(ire_t *ire, ire_t *ire_dst, ip_stack_t *ipst)
/*
* IP multirouting broadcast routes handling
- * Remove the broadcast ire
+ * Remove the broadcast ire.
+ * The usage is a route delete <cgtp_bc> <nic_bc> -multirt i.e., both
+ * the destination and the gateway are broadcast addresses.
+ * The caller has only verified that RTF_MULTIRT was set. We check
+ * that the destination is broadcast and that the gateway is a broadcast
+ * address, and if so delete the IRE added by ip_cgtp_bcast_add().
*/
static void
ip_cgtp_bcast_delete(ire_t *ire, ip_stack_t *ipst)
{
- ire_t *ire_dst;
-
ASSERT(ire != NULL);
- ire_dst = ire_ctable_lookup(ire->ire_addr, 0, IRE_BROADCAST,
- NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
- if (ire_dst != NULL) {
+
+ if (ip_type_v4(ire->ire_addr, ipst) == IRE_BROADCAST) {
ire_t *ire_prim;
- ire_prim = ire_ctable_lookup(ire->ire_gateway_addr, 0,
- IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
+ ire_prim = ire_ftable_lookup_v4(ire->ire_gateway_addr, 0, 0,
+ IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 0,
+ ipst, NULL);
if (ire_prim != NULL) {
- ipif_t *ipif_prim;
+ ill_t *ill_prim;
ire_t *bcast_ire;
- ipif_prim = ire_prim->ire_ipif;
+ ill_prim = ire_prim->ire_ill;
ip2dbg(("ip_cgtp_filter_bcast_delete: "
- "ire_dst %p, ire_prim %p, ipif_prim %p\n",
- (void *)ire_dst, (void *)ire_prim,
- (void *)ipif_prim));
-
- bcast_ire = ire_ctable_lookup(ire->ire_addr,
- ire->ire_gateway_addr,
- IRE_BROADCAST,
- ipif_prim, ALL_ZONES,
- NULL,
- MATCH_IRE_TYPE | MATCH_IRE_GW | MATCH_IRE_IPIF |
- MATCH_IRE_MASK, ipst);
+ "ire_prim %p, ill_prim %p\n",
+ (void *)ire_prim, (void *)ill_prim));
+
+ bcast_ire = ire_ftable_lookup_v4(ire->ire_addr, 0,
+ ire->ire_gateway_addr, IRE_BROADCAST,
+ ill_prim, ALL_ZONES, NULL,
+ MATCH_IRE_TYPE | MATCH_IRE_GW | MATCH_IRE_ILL |
+ MATCH_IRE_MASK, 0, ipst, NULL);
if (bcast_ire != NULL) {
ip2dbg(("ip_cgtp_filter_bcast_delete: "
"looked up bcast_ire %p\n",
(void *)bcast_ire));
- ipif_remove_ire(bcast_ire->ire_ipif,
+ ill_remove_saved_ire(bcast_ire->ire_ill,
bcast_ire);
ire_delete(bcast_ire);
ire_refrele(bcast_ire);
}
ire_refrele(ire_prim);
}
- ire_refrele(ire_dst);
- }
-}
-
-/*
- * IPsec hardware acceleration capabilities related functions.
- */
-
-/*
- * Free a per-ill IPsec capabilities structure.
- */
-static void
-ill_ipsec_capab_free(ill_ipsec_capab_t *capab)
-{
- if (capab->auth_hw_algs != NULL)
- kmem_free(capab->auth_hw_algs, capab->algs_size);
- if (capab->encr_hw_algs != NULL)
- kmem_free(capab->encr_hw_algs, capab->algs_size);
- if (capab->encr_algparm != NULL)
- kmem_free(capab->encr_algparm, capab->encr_algparm_size);
- kmem_free(capab, sizeof (ill_ipsec_capab_t));
-}
-
-/*
- * Allocate a new per-ill IPsec capabilities structure. This structure
- * is specific to an IPsec protocol (AH or ESP). It is implemented as
- * an array which specifies, for each algorithm, whether this algorithm
- * is supported by the ill or not.
- */
-static ill_ipsec_capab_t *
-ill_ipsec_capab_alloc(void)
-{
- ill_ipsec_capab_t *capab;
- uint_t nelems;
-
- capab = kmem_zalloc(sizeof (ill_ipsec_capab_t), KM_NOSLEEP);
- if (capab == NULL)
- return (NULL);
-
- /* we need one bit per algorithm */
- nelems = MAX_IPSEC_ALGS / BITS(ipsec_capab_elem_t);
- capab->algs_size = nelems * sizeof (ipsec_capab_elem_t);
-
- /* allocate memory to store algorithm flags */
- capab->encr_hw_algs = kmem_zalloc(capab->algs_size, KM_NOSLEEP);
- if (capab->encr_hw_algs == NULL)
- goto nomem;
- capab->auth_hw_algs = kmem_zalloc(capab->algs_size, KM_NOSLEEP);
- if (capab->auth_hw_algs == NULL)
- goto nomem;
- /*
- * Leave encr_algparm NULL for now since we won't need it half
- * the time
- */
- return (capab);
-
-nomem:
- ill_ipsec_capab_free(capab);
- return (NULL);
-}
-
-/*
- * Resize capability array. Since we're exclusive, this is OK.
- */
-static boolean_t
-ill_ipsec_capab_resize_algparm(ill_ipsec_capab_t *capab, int algid)
-{
- ipsec_capab_algparm_t *nalp, *oalp;
- uint32_t olen, nlen;
-
- oalp = capab->encr_algparm;
- olen = capab->encr_algparm_size;
-
- if (oalp != NULL) {
- if (algid < capab->encr_algparm_end)
- return (B_TRUE);
- }
-
- nlen = (algid + 1) * sizeof (*nalp);
- nalp = kmem_zalloc(nlen, KM_NOSLEEP);
- if (nalp == NULL)
- return (B_FALSE);
-
- if (oalp != NULL) {
- bcopy(oalp, nalp, olen);
- kmem_free(oalp, olen);
- }
- capab->encr_algparm = nalp;
- capab->encr_algparm_size = nlen;
- capab->encr_algparm_end = algid + 1;
-
- return (B_TRUE);
-}
-
-/*
- * Compare the capabilities of the specified ill with the protocol
- * and algorithms specified by the SA passed as argument.
- * If they match, returns B_TRUE, B_FALSE if they do not match.
- *
- * The ill can be passed as a pointer to it, or by specifying its index
- * and whether it is an IPv6 ill (ill_index and ill_isv6 arguments).
- *
- * Called by ipsec_out_is_accelerated() do decide whether an outbound
- * packet is eligible for hardware acceleration, and by
- * ill_ipsec_capab_send_all() to decide whether a SA must be sent down
- * to a particular ill.
- */
-boolean_t
-ipsec_capab_match(ill_t *ill, uint_t ill_index, boolean_t ill_isv6,
- ipsa_t *sa, netstack_t *ns)
-{
- boolean_t sa_isv6;
- uint_t algid;
- struct ill_ipsec_capab_s *cpp;
- boolean_t need_refrele = B_FALSE;
- ip_stack_t *ipst = ns->netstack_ip;
-
- if (ill == NULL) {
- ill = ill_lookup_on_ifindex(ill_index, ill_isv6, NULL,
- NULL, NULL, NULL, ipst);
- if (ill == NULL) {
- ip0dbg(("ipsec_capab_match: ill doesn't exist\n"));
- return (B_FALSE);
- }
- need_refrele = B_TRUE;
- }
-
- /*
- * Use the address length specified by the SA to determine
- * if it corresponds to a IPv6 address, and fail the matching
- * if the isv6 flag passed as argument does not match.
- * Note: this check is used for SADB capability checking before
- * sending SA information to an ill.
- */
- sa_isv6 = (sa->ipsa_addrfam == AF_INET6);
- if (sa_isv6 != ill_isv6)
- /* protocol mismatch */
- goto done;
-
- /*
- * Check if the ill supports the protocol, algorithm(s) and
- * key size(s) specified by the SA, and get the pointers to
- * the algorithms supported by the ill.
- */
- switch (sa->ipsa_type) {
-
- case SADB_SATYPE_ESP:
- if (!(ill->ill_capabilities & ILL_CAPAB_ESP))
- /* ill does not support ESP acceleration */
- goto done;
- cpp = ill->ill_ipsec_capab_esp;
- algid = sa->ipsa_auth_alg;
- if (!IPSEC_ALG_IS_ENABLED(algid, cpp->auth_hw_algs))
- goto done;
- algid = sa->ipsa_encr_alg;
- if (!IPSEC_ALG_IS_ENABLED(algid, cpp->encr_hw_algs))
- goto done;
- if (algid < cpp->encr_algparm_end) {
- ipsec_capab_algparm_t *alp = &cpp->encr_algparm[algid];
- if (sa->ipsa_encrkeybits < alp->minkeylen)
- goto done;
- if (sa->ipsa_encrkeybits > alp->maxkeylen)
- goto done;
- }
- break;
-
- case SADB_SATYPE_AH:
- if (!(ill->ill_capabilities & ILL_CAPAB_AH))
- /* ill does not support AH acceleration */
- goto done;
- if (!IPSEC_ALG_IS_ENABLED(sa->ipsa_auth_alg,
- ill->ill_ipsec_capab_ah->auth_hw_algs))
- goto done;
- break;
}
-
- if (need_refrele)
- ill_refrele(ill);
- return (B_TRUE);
-done:
- if (need_refrele)
- ill_refrele(ill);
- return (B_FALSE);
-}
-
-/*
- * Add a new ill to the list of IPsec capable ills.
- * Called from ill_capability_ipsec_ack() when an ACK was received
- * indicating that IPsec hardware processing was enabled for an ill.
- *
- * ill must point to the ill for which acceleration was enabled.
- * dl_cap must be set to DL_CAPAB_IPSEC_AH or DL_CAPAB_IPSEC_ESP.
- */
-static void
-ill_ipsec_capab_add(ill_t *ill, uint_t dl_cap, boolean_t sadb_resync)
-{
- ipsec_capab_ill_t **ills, *cur_ill, *new_ill;
- uint_t sa_type;
- uint_t ipproto;
- ip_stack_t *ipst = ill->ill_ipst;
-
- ASSERT((dl_cap == DL_CAPAB_IPSEC_AH) ||
- (dl_cap == DL_CAPAB_IPSEC_ESP));
-
- switch (dl_cap) {
- case DL_CAPAB_IPSEC_AH:
- sa_type = SADB_SATYPE_AH;
- ills = &ipst->ips_ipsec_capab_ills_ah;
- ipproto = IPPROTO_AH;
- break;
- case DL_CAPAB_IPSEC_ESP:
- sa_type = SADB_SATYPE_ESP;
- ills = &ipst->ips_ipsec_capab_ills_esp;
- ipproto = IPPROTO_ESP;
- break;
- }
-
- rw_enter(&ipst->ips_ipsec_capab_ills_lock, RW_WRITER);
-
- /*
- * Add ill index to list of hardware accelerators. If
- * already in list, do nothing.
- */
- for (cur_ill = *ills; cur_ill != NULL &&
- (cur_ill->ill_index != ill->ill_phyint->phyint_ifindex ||
- cur_ill->ill_isv6 != ill->ill_isv6); cur_ill = cur_ill->next)
- ;
-
- if (cur_ill == NULL) {
- /* if this is a new entry for this ill */
- new_ill = kmem_zalloc(sizeof (ipsec_capab_ill_t), KM_NOSLEEP);
- if (new_ill == NULL) {
- rw_exit(&ipst->ips_ipsec_capab_ills_lock);
- return;
- }
-
- new_ill->ill_index = ill->ill_phyint->phyint_ifindex;
- new_ill->ill_isv6 = ill->ill_isv6;
- new_ill->next = *ills;
- *ills = new_ill;
- } else if (!sadb_resync) {
- /* not resync'ing SADB and an entry exists for this ill */
- rw_exit(&ipst->ips_ipsec_capab_ills_lock);
- return;
- }
-
- rw_exit(&ipst->ips_ipsec_capab_ills_lock);
-
- if (ipst->ips_ipcl_proto_fanout_v6[ipproto].connf_head != NULL)
- /*
- * IPsec module for protocol loaded, initiate dump
- * of the SADB to this ill.
- */
- sadb_ill_download(ill, sa_type);
-}
-
-/*
- * Remove an ill from the list of IPsec capable ills.
- */
-static void
-ill_ipsec_capab_delete(ill_t *ill, uint_t dl_cap)
-{
- ipsec_capab_ill_t **ills, *cur_ill, *prev_ill;
- ip_stack_t *ipst = ill->ill_ipst;
-
- ASSERT(dl_cap == DL_CAPAB_IPSEC_AH ||
- dl_cap == DL_CAPAB_IPSEC_ESP);
-
- ills = (dl_cap == DL_CAPAB_IPSEC_AH) ? &ipst->ips_ipsec_capab_ills_ah :
- &ipst->ips_ipsec_capab_ills_esp;
-
- rw_enter(&ipst->ips_ipsec_capab_ills_lock, RW_WRITER);
-
- prev_ill = NULL;
- for (cur_ill = *ills; cur_ill != NULL && (cur_ill->ill_index !=
- ill->ill_phyint->phyint_ifindex || cur_ill->ill_isv6 !=
- ill->ill_isv6); prev_ill = cur_ill, cur_ill = cur_ill->next)
- ;
- if (cur_ill == NULL) {
- /* entry not found */
- rw_exit(&ipst->ips_ipsec_capab_ills_lock);
- return;
- }
- if (prev_ill == NULL) {
- /* entry at front of list */
- *ills = NULL;
- } else {
- prev_ill->next = cur_ill->next;
- }
- kmem_free(cur_ill, sizeof (ipsec_capab_ill_t));
- rw_exit(&ipst->ips_ipsec_capab_ills_lock);
-}
-
-/*
- * Called by SADB to send a DL_CONTROL_REQ message to every ill
- * supporting the specified IPsec protocol acceleration.
- * sa_type must be SADB_SATYPE_AH or SADB_SATYPE_ESP.
- * We free the mblk and, if sa is non-null, release the held referece.
- */
-void
-ill_ipsec_capab_send_all(uint_t sa_type, mblk_t *mp, ipsa_t *sa,
- netstack_t *ns)
-{
- ipsec_capab_ill_t *ici, *cur_ici;
- ill_t *ill;
- mblk_t *nmp, *mp_ship_list = NULL, *next_mp;
- ip_stack_t *ipst = ns->netstack_ip;
-
- ici = (sa_type == SADB_SATYPE_AH) ? ipst->ips_ipsec_capab_ills_ah :
- ipst->ips_ipsec_capab_ills_esp;
-
- rw_enter(&ipst->ips_ipsec_capab_ills_lock, RW_READER);
-
- for (cur_ici = ici; cur_ici != NULL; cur_ici = cur_ici->next) {
- ill = ill_lookup_on_ifindex(cur_ici->ill_index,
- cur_ici->ill_isv6, NULL, NULL, NULL, NULL, ipst);
-
- /*
- * Handle the case where the ill goes away while the SADB is
- * attempting to send messages. If it's going away, it's
- * nuking its shadow SADB, so we don't care..
- */
-
- if (ill == NULL)
- continue;
-
- if (sa != NULL) {
- /*
- * Make sure capabilities match before
- * sending SA to ill.
- */
- if (!ipsec_capab_match(ill, cur_ici->ill_index,
- cur_ici->ill_isv6, sa, ipst->ips_netstack)) {
- ill_refrele(ill);
- continue;
- }
-
- mutex_enter(&sa->ipsa_lock);
- sa->ipsa_flags |= IPSA_F_HW;
- mutex_exit(&sa->ipsa_lock);
- }
-
- /*
- * Copy template message, and add it to the front
- * of the mblk ship list. We want to avoid holding
- * the ipsec_capab_ills_lock while sending the
- * message to the ills.
- *
- * The b_next and b_prev are temporarily used
- * to build a list of mblks to be sent down, and to
- * save the ill to which they must be sent.
- */
- nmp = copymsg(mp);
- if (nmp == NULL) {
- ill_refrele(ill);
- continue;
- }
- ASSERT(nmp->b_next == NULL && nmp->b_prev == NULL);
- nmp->b_next = mp_ship_list;
- mp_ship_list = nmp;
- nmp->b_prev = (mblk_t *)ill;
- }
-
- rw_exit(&ipst->ips_ipsec_capab_ills_lock);
-
- for (nmp = mp_ship_list; nmp != NULL; nmp = next_mp) {
- /* restore the mblk to a sane state */
- next_mp = nmp->b_next;
- nmp->b_next = NULL;
- ill = (ill_t *)nmp->b_prev;
- nmp->b_prev = NULL;
-
- ill_dlpi_send(ill, nmp);
- ill_refrele(ill);
- }
-
- if (sa != NULL)
- IPSA_REFRELE(sa);
- freemsg(mp);
}
/*
@@ -19531,71 +16698,79 @@ ip_ipmp_v6intfid(ill_t *ill, in6_addr_t *v6addr)
addr[0] &= ~0x2; /* set local bit */
}
-/* ARGSUSED */
-static boolean_t
-ip_ether_v6mapinfo(uint_t lla_length, uint8_t *bphys_addr, uint8_t *maddr,
- uint32_t *hw_start, in6_addr_t *v6_extract_mask)
+/*
+ * Map the multicast in6_addr_t in m_ip6addr to the physaddr for ethernet.
+ */
+static void
+ip_ether_v6_mapping(ill_t *ill, uchar_t *m_ip6addr, uchar_t *m_physaddr)
{
- /*
- * Multicast address mappings used over Ethernet/802.X.
- * This address is used as a base for mappings.
- */
- static uint8_t ipv6_g_phys_multi_addr[] = {0x33, 0x33, 0x00,
- 0x00, 0x00, 0x00};
+ phyint_t *phyi = ill->ill_phyint;
/*
- * Extract low order 32 bits from IPv6 multicast address.
- * Or that into the link layer address, starting from the
- * second byte.
+ * Check PHYI_MULTI_BCAST and length of physical
+ * address to determine if we use the mapping or the
+ * broadcast address.
*/
- *hw_start = 2;
- v6_extract_mask->s6_addr32[0] = 0;
- v6_extract_mask->s6_addr32[1] = 0;
- v6_extract_mask->s6_addr32[2] = 0;
- v6_extract_mask->s6_addr32[3] = 0xffffffffU;
- bcopy(ipv6_g_phys_multi_addr, maddr, lla_length);
- return (B_TRUE);
+ if ((phyi->phyint_flags & PHYI_MULTI_BCAST) != 0 ||
+ ill->ill_phys_addr_length != ETHERADDRL) {
+ ip_mbcast_mapping(ill, m_ip6addr, m_physaddr);
+ return;
+ }
+ m_physaddr[0] = 0x33;
+ m_physaddr[1] = 0x33;
+ m_physaddr[2] = m_ip6addr[12];
+ m_physaddr[3] = m_ip6addr[13];
+ m_physaddr[4] = m_ip6addr[14];
+ m_physaddr[5] = m_ip6addr[15];
}
/*
- * Indicate by return value whether multicast is supported. If not,
- * this code should not touch/change any parameters.
+ * Map the multicast ipaddr_t in m_ipaddr to the physaddr for ethernet.
*/
-/* ARGSUSED */
-static boolean_t
-ip_ether_v4mapinfo(uint_t phys_length, uint8_t *bphys_addr, uint8_t *maddr,
- uint32_t *hw_start, ipaddr_t *extract_mask)
+static void
+ip_ether_v4_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr)
{
+ phyint_t *phyi = ill->ill_phyint;
+
/*
- * Multicast address mappings used over Ethernet/802.X.
- * This address is used as a base for mappings.
+ * Check PHYI_MULTI_BCAST and length of physical
+ * address to determine if we use the mapping or the
+ * broadcast address.
*/
- static uint8_t ip_g_phys_multi_addr[] = { 0x01, 0x00, 0x5e,
- 0x00, 0x00, 0x00 };
-
- if (phys_length != ETHERADDRL)
- return (B_FALSE);
-
- *extract_mask = htonl(0x007fffff);
- *hw_start = 2;
- bcopy(ip_g_phys_multi_addr, maddr, ETHERADDRL);
- return (B_TRUE);
+ if ((phyi->phyint_flags & PHYI_MULTI_BCAST) != 0 ||
+ ill->ill_phys_addr_length != ETHERADDRL) {
+ ip_mbcast_mapping(ill, m_ipaddr, m_physaddr);
+ return;
+ }
+ m_physaddr[0] = 0x01;
+ m_physaddr[1] = 0x00;
+ m_physaddr[2] = 0x5e;
+ m_physaddr[3] = m_ipaddr[1] & 0x7f;
+ m_physaddr[4] = m_ipaddr[2];
+ m_physaddr[5] = m_ipaddr[3];
}
/* ARGSUSED */
-static boolean_t
-ip_nodef_v4mapinfo(uint_t phys_length, uint8_t *bphys_addr, uint8_t *maddr,
- uint32_t *hw_start, ipaddr_t *extract_mask)
+static void
+ip_mbcast_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr)
{
- return (B_FALSE);
-}
+ /*
+ * for the MULTI_BCAST case and other cases when we want to
+ * use the link-layer broadcast address for multicast.
+ */
+ uint8_t *bphys_addr;
+ dl_unitdata_req_t *dlur;
-/* ARGSUSED */
-static boolean_t
-ip_nodef_v6mapinfo(uint_t lla_length, uint8_t *bphys_addr, uint8_t *maddr,
- uint32_t *hw_start, in6_addr_t *v6_extract_mask)
-{
- return (B_FALSE);
+ dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr;
+ if (ill->ill_sap_length < 0) {
+ bphys_addr = (uchar_t *)dlur +
+ dlur->dl_dest_addr_offset;
+ } else {
+ bphys_addr = (uchar_t *)dlur +
+ dlur->dl_dest_addr_offset + ill->ill_sap_length;
+ }
+
+ bcopy(bphys_addr, m_physaddr, ill->ill_phys_addr_length);
}
/*
@@ -19624,6 +16799,7 @@ ip_ib_v6intfid(ill_t *ill, in6_addr_t *v6addr)
}
/*
+ * Map the multicast ipaddr_t in m_ipaddr to the physaddr for InfiniBand.
* Note on mapping from multicast IP addresses to IPoIB multicast link
* addresses. IPoIB multicast link addresses are based on IBA link addresses.
* The format of an IPoIB multicast address is:
@@ -19637,72 +16813,70 @@ ip_ib_v6intfid(ill_t *ill, in6_addr_t *v6addr)
* network interface. They can be ascertained from the broadcast address.
* The Sign. part is the signature, and is 401B for IPv4 and 601B for IPv6.
*/
-
-static boolean_t
-ip_ib_v6mapinfo(uint_t lla_length, uint8_t *bphys_addr, uint8_t *maddr,
- uint32_t *hw_start, in6_addr_t *v6_extract_mask)
+static void
+ip_ib_v4_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr)
{
- /*
- * Base IPoIB IPv6 multicast address used for mappings.
- * Does not contain the IBA scope/Pkey values.
- */
- static uint8_t ipv6_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff,
- 0xff, 0x10, 0x60, 0x1b, 0x00, 0x00, 0x00, 0x00,
+ static uint8_t ipv4_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff,
+ 0xff, 0x10, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
+ uint8_t *bphys_addr;
+ dl_unitdata_req_t *dlur;
+
+ bcopy(ipv4_g_phys_ibmulti_addr, m_physaddr, ill->ill_phys_addr_length);
/*
- * Extract low order 80 bits from IPv6 multicast address.
- * Or that into the link layer address, starting from the
- * sixth byte.
+ * RFC 4391: IPv4 MGID is 28-bit long.
*/
- *hw_start = 6;
- bcopy(ipv6_g_phys_ibmulti_addr, maddr, lla_length);
+ m_physaddr[16] = m_ipaddr[0] & 0x0f;
+ m_physaddr[17] = m_ipaddr[1];
+ m_physaddr[18] = m_ipaddr[2];
+ m_physaddr[19] = m_ipaddr[3];
+
+ dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr;
+ if (ill->ill_sap_length < 0) {
+ bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset;
+ } else {
+ bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset +
+ ill->ill_sap_length;
+ }
/*
* Now fill in the IBA scope/Pkey values from the broadcast address.
*/
- *(maddr + 5) = *(bphys_addr + 5);
- *(maddr + 8) = *(bphys_addr + 8);
- *(maddr + 9) = *(bphys_addr + 9);
-
- v6_extract_mask->s6_addr32[0] = 0;
- v6_extract_mask->s6_addr32[1] = htonl(0x0000ffff);
- v6_extract_mask->s6_addr32[2] = 0xffffffffU;
- v6_extract_mask->s6_addr32[3] = 0xffffffffU;
- return (B_TRUE);
+ m_physaddr[5] = bphys_addr[5];
+ m_physaddr[8] = bphys_addr[8];
+ m_physaddr[9] = bphys_addr[9];
}
-static boolean_t
-ip_ib_v4mapinfo(uint_t phys_length, uint8_t *bphys_addr, uint8_t *maddr,
- uint32_t *hw_start, ipaddr_t *extract_mask)
+static void
+ip_ib_v6_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr)
{
- /*
- * Base IPoIB IPv4 multicast address used for mappings.
- * Does not contain the IBA scope/Pkey values.
- */
static uint8_t ipv4_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff,
- 0xff, 0x10, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
+ 0xff, 0x10, 0x60, 0x1b, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
+ uint8_t *bphys_addr;
+ dl_unitdata_req_t *dlur;
- if (phys_length != sizeof (ipv4_g_phys_ibmulti_addr))
- return (B_FALSE);
+ bcopy(ipv4_g_phys_ibmulti_addr, m_physaddr, ill->ill_phys_addr_length);
/*
- * Extract low order 28 bits from IPv4 multicast address.
- * Or that into the link layer address, starting from the
- * sixteenth byte.
+ * RFC 4391: IPv4 MGID is 80-bit long.
*/
- *extract_mask = htonl(0x0fffffff);
- *hw_start = 16;
- bcopy(ipv4_g_phys_ibmulti_addr, maddr, phys_length);
+ bcopy(&m_ipaddr[6], &m_physaddr[10], 10);
+ dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr;
+ if (ill->ill_sap_length < 0) {
+ bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset;
+ } else {
+ bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset +
+ ill->ill_sap_length;
+ }
/*
* Now fill in the IBA scope/Pkey values from the broadcast address.
*/
- *(maddr + 5) = *(bphys_addr + 5);
- *(maddr + 8) = *(bphys_addr + 8);
- *(maddr + 9) = *(bphys_addr + 9);
- return (B_TRUE);
+ m_physaddr[5] = bphys_addr[5];
+ m_physaddr[8] = bphys_addr[8];
+ m_physaddr[9] = bphys_addr[9];
}
/*
@@ -19758,56 +16932,34 @@ ip_ipv4_v6destintfid(ill_t *ill, in6_addr_t *v6addr)
}
/*
- * Returns B_TRUE if an ipif is present in the given zone, matching some flags
- * (typically IPIF_UP). If ipifp is non-null, the held ipif is returned there.
- * This works for both IPv4 and IPv6; if the passed-in ill is v6, the ipif with
- * the link-local address is preferred.
+ * Lookup an ill and verify that the zoneid has an ipif on that ill.
+ * Returns an held ill, or NULL.
*/
-boolean_t
-ipif_lookup_zoneid(ill_t *ill, zoneid_t zoneid, int flags, ipif_t **ipifp)
+ill_t *
+ill_lookup_on_ifindex_zoneid(uint_t index, zoneid_t zoneid, boolean_t isv6,
+ ip_stack_t *ipst)
{
+ ill_t *ill;
ipif_t *ipif;
- ipif_t *maybe_ipif = NULL;
- mutex_enter(&ill->ill_lock);
- if (ill->ill_state_flags & ILL_CONDEMNED) {
- mutex_exit(&ill->ill_lock);
- if (ipifp != NULL)
- *ipifp = NULL;
- return (B_FALSE);
- }
+ ill = ill_lookup_on_ifindex(index, isv6, ipst);
+ if (ill == NULL)
+ return (NULL);
+ mutex_enter(&ill->ill_lock);
for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
- if (!IPIF_CAN_LOOKUP(ipif))
+ if (IPIF_IS_CONDEMNED(ipif))
continue;
if (zoneid != ALL_ZONES && ipif->ipif_zoneid != zoneid &&
ipif->ipif_zoneid != ALL_ZONES)
continue;
- if ((ipif->ipif_flags & flags) != flags)
- continue;
- if (ipifp == NULL) {
- mutex_exit(&ill->ill_lock);
- ASSERT(maybe_ipif == NULL);
- return (B_TRUE);
- }
- if (!ill->ill_isv6 ||
- IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6src_addr)) {
- ipif_refhold_locked(ipif);
- mutex_exit(&ill->ill_lock);
- *ipifp = ipif;
- return (B_TRUE);
- }
- if (maybe_ipif == NULL)
- maybe_ipif = ipif;
- }
- if (ipifp != NULL) {
- if (maybe_ipif != NULL)
- ipif_refhold_locked(maybe_ipif);
- *ipifp = maybe_ipif;
+ mutex_exit(&ill->ill_lock);
+ return (ill);
}
mutex_exit(&ill->ill_lock);
- return (maybe_ipif != NULL);
+ ill_refrele(ill);
+ return (NULL);
}
/*
@@ -19822,8 +16974,7 @@ ipif_getby_indexes(uint_t ifindex, uint_t lifidx, boolean_t isv6,
ipif_t *ipif;
ill_t *ill;
- ill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL, NULL,
- ipst);
+ ill = ill_lookup_on_ifindex(ifindex, isv6, ipst);
if (ill == NULL)
return (NULL);
@@ -19849,19 +17000,52 @@ ipif_getby_indexes(uint_t ifindex, uint_t lifidx, boolean_t isv6,
}
/*
- * Flush the fastpath by deleting any nce's that are waiting for the fastpath,
- * There is one exceptions IRE_BROADCAST are difficult to recreate,
- * so instead we just nuke their nce_fp_mp's; see ndp_fastpath_flush()
- * for details.
+ * Set ill_inputfn based on the current know state.
+ * This needs to be called when any of the factors taken into
+ * account changes.
*/
void
-ill_fastpath_flush(ill_t *ill)
+ill_set_inputfn(ill_t *ill)
{
- ip_stack_t *ipst = ill->ill_ipst;
+ ip_stack_t *ipst = ill->ill_ipst;
- nce_fastpath_list_dispatch(ill, NULL, NULL);
- ndp_walk_common((ill->ill_isv6 ? ipst->ips_ndp6 : ipst->ips_ndp4),
- ill, (pfi_t)ndp_fastpath_flush, NULL, B_TRUE);
+ if (ill->ill_isv6) {
+ if (is_system_labeled())
+ ill->ill_inputfn = ill_input_full_v6;
+ else
+ ill->ill_inputfn = ill_input_short_v6;
+ } else {
+ if (is_system_labeled())
+ ill->ill_inputfn = ill_input_full_v4;
+ else if (ill->ill_dhcpinit != 0)
+ ill->ill_inputfn = ill_input_full_v4;
+ else if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_RSVP].connf_head
+ != NULL)
+ ill->ill_inputfn = ill_input_full_v4;
+ else if (ipst->ips_ip_cgtp_filter &&
+ ipst->ips_ip_cgtp_filter_ops != NULL)
+ ill->ill_inputfn = ill_input_full_v4;
+ else
+ ill->ill_inputfn = ill_input_short_v4;
+ }
+}
+
+/*
+ * Re-evaluate ill_inputfn for all the IPv4 ills.
+ * Used when RSVP and CGTP comes and goes.
+ */
+void
+ill_set_inputfn_all(ip_stack_t *ipst)
+{
+ ill_walk_context_t ctx;
+ ill_t *ill;
+
+ rw_enter(&ipst->ips_ill_g_lock, RW_READER);
+ ill = ILL_START_WALK_V4(&ctx, ipst);
+ for (; ill != NULL; ill = ill_next(&ctx, ill))
+ ill_set_inputfn(ill);
+
+ rw_exit(&ipst->ips_ill_g_lock);
}
/*
@@ -19897,6 +17081,10 @@ ill_set_phys_addr(ill_t *ill, mblk_t *mp)
}
ipsq_current_start(ipsq, ill->ill_ipif, 0);
+ mutex_enter(&ill->ill_lock);
+ ill->ill_state_flags |= ILL_DOWN_IN_PROGRESS;
+ /* no more nce addition allowed */
+ mutex_exit(&ill->ill_lock);
/*
* If we can quiesce the ill, then set the address. If not, then
@@ -19923,8 +17111,8 @@ ill_set_phys_addr(ill_t *ill, mblk_t *mp)
* are passed (linked by b_cont), since we sometimes need to save two distinct
* copies in the ill_t, and our context doesn't permit sleeping or allocation
* failure (we'll free the other copy if it's not needed). Since the ill_t
- * is quiesced, we know any stale IREs with the old address information have
- * already been removed, so we don't need to call ill_fastpath_flush().
+ * is quiesced, we know any stale nce's with the old address information have
+ * already been removed, so we don't need to call nce_flush().
*/
/* ARGSUSED */
static void
@@ -19934,6 +17122,7 @@ ill_set_phys_addr_tail(ipsq_t *ipsq, queue_t *q, mblk_t *addrmp, void *dummy)
mblk_t *addrmp2 = unlinkb(addrmp);
dl_notify_ind_t *dlindp = (dl_notify_ind_t *)addrmp->b_rptr;
uint_t addrlen, addroff;
+ int status;
ASSERT(IAM_WRITER_IPSQ(ipsq));
@@ -19962,7 +17151,7 @@ ill_set_phys_addr_tail(ipsq_t *ipsq, queue_t *q, mblk_t *addrmp, void *dummy)
ill->ill_phys_addr = addrmp->b_rptr + addroff;
ill->ill_phys_addr_mp = addrmp;
ill->ill_phys_addr_length = addrlen;
- if (ill->ill_isv6 && !(ill->ill_flags & ILLF_XRESOLV))
+ if (ill->ill_isv6)
ill_set_ndmp(ill, addrmp2, addroff, addrlen);
else
freemsg(addrmp2);
@@ -19978,10 +17167,15 @@ ill_set_phys_addr_tail(ipsq_t *ipsq, queue_t *q, mblk_t *addrmp, void *dummy)
/*
* If there are ipifs to bring up, ill_up_ipifs() will return
* EINPROGRESS, and ipsq_current_finish() will be called by
- * ip_rput_dlpi_writer() or ip_arp_done() when the last ipif is
+ * ip_rput_dlpi_writer() or arp_bringup_done() when the last ipif is
* brought up.
*/
- if (ill_up_ipifs(ill, q, addrmp) != EINPROGRESS)
+ status = ill_up_ipifs(ill, q, addrmp);
+ mutex_enter(&ill->ill_lock);
+ if (ill->ill_dl_up)
+ ill->ill_state_flags &= ~ILL_DOWN_IN_PROGRESS;
+ mutex_exit(&ill->ill_lock);
+ if (status != EINPROGRESS)
ipsq_current_finish(ipsq);
}
@@ -20009,6 +17203,11 @@ ill_replumb(ill_t *ill, mblk_t *mp)
ipsq_current_start(ipsq, ill->ill_ipif, 0);
+ mutex_enter(&ill->ill_lock);
+ ill->ill_state_flags |= ILL_DOWN_IN_PROGRESS;
+ /* no more nce addition allowed */
+ mutex_exit(&ill->ill_lock);
+
/*
* If we can quiesce the ill, then continue. If not, then
* ill_replumb_tail() will be called from ipif_ill_refrele_tail().
@@ -20034,14 +17233,32 @@ static void
ill_replumb_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy)
{
ill_t *ill = q->q_ptr;
+ int err;
+ conn_t *connp = NULL;
ASSERT(IAM_WRITER_IPSQ(ipsq));
-
- ill_down_ipifs_tail(ill);
-
freemsg(ill->ill_replumb_mp);
ill->ill_replumb_mp = copyb(mp);
+ if (ill->ill_replumb_mp == NULL) {
+ /* out of memory */
+ ipsq_current_finish(ipsq);
+ return;
+ }
+
+ mutex_enter(&ill->ill_lock);
+ ill->ill_up_ipifs = ipsq_pending_mp_add(NULL, ill->ill_ipif,
+ ill->ill_rq, ill->ill_replumb_mp, 0);
+ mutex_exit(&ill->ill_lock);
+
+ if (!ill->ill_up_ipifs) {
+ /* already closing */
+ ipsq_current_finish(ipsq);
+ return;
+ }
+ ill->ill_replumbing = 1;
+ err = ill_down_ipifs_tail(ill);
+
/*
* Successfully quiesced and brought down the interface, now we send
* the DL_NOTE_REPLUMB_DONE message down to the driver. Reuse the
@@ -20055,15 +17272,23 @@ ill_replumb_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy)
ill_dlpi_send(ill, mp);
/*
- * If there are ipifs to bring up, ill_up_ipifs() will return
- * EINPROGRESS, and ipsq_current_finish() will be called by
- * ip_rput_dlpi_writer() or ip_arp_done() when the last ipif is
- * brought up.
+ * For IPv4, we would usually get EINPROGRESS because the ETHERTYPE_ARP
+ * streams have to be unbound. When all the DLPI exchanges are done,
+ * ipsq_current_finish() will be called by arp_bringup_done(). The
+ * remainder of ipif bringup via ill_up_ipifs() will also be done in
+ * arp_bringup_done().
*/
- if (ill->ill_replumb_mp == NULL ||
- ill_up_ipifs(ill, q, ill->ill_replumb_mp) != EINPROGRESS) {
- ipsq_current_finish(ipsq);
+ ASSERT(ill->ill_replumb_mp != NULL);
+ if (err == EINPROGRESS)
+ return;
+ else
+ ill->ill_replumb_mp = ipsq_pending_mp_get(ipsq, &connp);
+ ASSERT(connp == NULL);
+ if (err == 0 && ill->ill_replumb_mp != NULL &&
+ ill_up_ipifs(ill, q, ill->ill_replumb_mp) == EINPROGRESS) {
+ return;
}
+ ipsq_current_finish(ipsq);
}
/*
@@ -20342,6 +17567,338 @@ fail:
"information for %s (ENOMEM)\n", str, ill->ill_name));
}
+static int
+ipif_arp_up_done_tail(ipif_t *ipif, enum ip_resolver_action res_act)
+{
+ int err = 0;
+ const in_addr_t *addr = NULL;
+ nce_t *nce = NULL;
+ ill_t *ill = ipif->ipif_ill;
+ ill_t *bound_ill;
+ boolean_t added_ipif = B_FALSE;
+ uint16_t state;
+ uint16_t flags;
+
+ DTRACE_PROBE3(ipif__downup, char *, "ipif_arp_up_done_tail",
+ ill_t *, ill, ipif_t *, ipif);
+ if (ipif->ipif_lcl_addr != INADDR_ANY) {
+ addr = &ipif->ipif_lcl_addr;
+ }
+
+ if ((ipif->ipif_flags & IPIF_UNNUMBERED) || addr == NULL) {
+ if (res_act != Res_act_initial)
+ return (EINVAL);
+ }
+
+ if (addr != NULL) {
+ ipmp_illgrp_t *illg = ill->ill_grp;
+
+ /* add unicast nce for the local addr */
+
+ if (IS_IPMP(ill)) {
+ /*
+ * If we're here via ipif_up(), then the ipif
+ * won't be bound yet -- add it to the group,
+ * which will bind it if possible. (We would
+ * add it in ipif_up(), but deleting on failure
+ * there is gruesome.) If we're here via
+ * ipmp_ill_bind_ipif(), then the ipif has
+ * already been added to the group and we
+ * just need to use the binding.
+ */
+ if ((bound_ill = ipmp_ipif_bound_ill(ipif)) == NULL) {
+ bound_ill = ipmp_illgrp_add_ipif(illg, ipif);
+ if (bound_ill == NULL) {
+ /*
+ * We couldn't bind the ipif to an ill
+ * yet, so we have nothing to publish.
+ * Mark the address as ready and return.
+ */
+ ipif->ipif_addr_ready = 1;
+ return (0);
+ }
+ added_ipif = B_TRUE;
+ }
+ } else {
+ bound_ill = ill;
+ }
+
+ flags = (NCE_F_MYADDR | NCE_F_PUBLISH | NCE_F_AUTHORITY |
+ NCE_F_NONUD);
+ /*
+ * If this is an initial bring-up (or the ipif was never
+ * completely brought up), do DAD. Otherwise, we're here
+ * because IPMP has rebound an address to this ill: send
+ * unsolicited advertisements (ARP announcements) to
+ * inform others.
+ */
+ if (res_act == Res_act_initial || !ipif->ipif_addr_ready) {
+ state = ND_UNCHANGED; /* compute in nce_add_common() */
+ } else {
+ state = ND_REACHABLE;
+ flags |= NCE_F_UNSOL_ADV;
+ }
+
+retry:
+ err = nce_lookup_then_add_v4(ill,
+ bound_ill->ill_phys_addr, bound_ill->ill_phys_addr_length,
+ addr, flags, state, &nce);
+
+ /*
+ * note that we may encounter EEXIST if we are moving
+ * the nce as a result of a rebind operation.
+ */
+ switch (err) {
+ case 0:
+ ipif->ipif_added_nce = 1;
+ nce->nce_ipif_cnt++;
+ break;
+ case EEXIST:
+ ip1dbg(("ipif_arp_up: NCE already exists for %s\n",
+ ill->ill_name));
+ if (!NCE_MYADDR(nce->nce_common)) {
+ /*
+ * A leftover nce from before this address
+ * existed
+ */
+ ncec_delete(nce->nce_common);
+ nce_refrele(nce);
+ nce = NULL;
+ goto retry;
+ }
+ if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) {
+ nce_refrele(nce);
+ nce = NULL;
+ ip1dbg(("ipif_arp_up: NCE already exists "
+ "for %s:%u\n", ill->ill_name,
+ ipif->ipif_id));
+ goto arp_up_done;
+ }
+ /*
+ * Duplicate local addresses are permissible for
+ * IPIF_POINTOPOINT interfaces which will get marked
+ * IPIF_UNNUMBERED later in
+ * ip_addr_availability_check().
+ *
+ * The nce_ipif_cnt field tracks the number of
+ * ipifs that have nce_addr as their local address.
+ */
+ ipif->ipif_addr_ready = 1;
+ ipif->ipif_added_nce = 1;
+ nce->nce_ipif_cnt++;
+ err = 0;
+ break;
+ default:
+ ASSERT(nce == NULL);
+ goto arp_up_done;
+ }
+ if (arp_no_defense) {
+ if ((ipif->ipif_flags & IPIF_UP) &&
+ !ipif->ipif_addr_ready)
+ ipif_up_notify(ipif);
+ ipif->ipif_addr_ready = 1;
+ }
+ } else {
+ /* zero address. nothing to publish */
+ ipif->ipif_addr_ready = 1;
+ }
+ if (nce != NULL)
+ nce_refrele(nce);
+arp_up_done:
+ if (added_ipif && err != 0)
+ ipmp_illgrp_del_ipif(ill->ill_grp, ipif);
+ return (err);
+}
+
+int
+ipif_arp_up(ipif_t *ipif, enum ip_resolver_action res_act, boolean_t was_dup)
+{
+ int err = 0;
+ ill_t *ill = ipif->ipif_ill;
+ boolean_t first_interface, wait_for_dlpi = B_FALSE;
+
+ DTRACE_PROBE3(ipif__downup, char *, "ipif_arp_up",
+ ill_t *, ill, ipif_t *, ipif);
+
+ /*
+ * need to bring up ARP or setup mcast mapping only
+ * when the first interface is coming UP.
+ */
+ first_interface = (ill->ill_ipif_up_count == 0 &&
+ ill->ill_ipif_dup_count == 0 && !was_dup);
+
+ if (res_act == Res_act_initial && first_interface) {
+ /*
+ * Send ATTACH + BIND
+ */
+ err = arp_ll_up(ill);
+ if (err != EINPROGRESS && err != 0)
+ return (err);
+
+ /*
+ * Add NCE for local address. Start DAD.
+ * we'll wait to hear that DAD has finished
+ * before using the interface.
+ */
+ if (err == EINPROGRESS)
+ wait_for_dlpi = B_TRUE;
+ }
+
+ if (!wait_for_dlpi)
+ (void) ipif_arp_up_done_tail(ipif, res_act);
+
+ return (!wait_for_dlpi ? 0 : EINPROGRESS);
+}
+
+/*
+ * Finish processing of "arp_up" after all the DLPI message
+ * exchanges have completed between arp and the driver.
+ */
+void
+arp_bringup_done(ill_t *ill, int err)
+{
+ mblk_t *mp1;
+ ipif_t *ipif;
+ conn_t *connp = NULL;
+ ipsq_t *ipsq;
+ queue_t *q;
+
+ ip1dbg(("arp_bringup_done(%s)\n", ill->ill_name));
+
+ ASSERT(IAM_WRITER_ILL(ill));
+
+ ipsq = ill->ill_phyint->phyint_ipsq;
+ ipif = ipsq->ipsq_xop->ipx_pending_ipif;
+ mp1 = ipsq_pending_mp_get(ipsq, &connp);
+ ASSERT(!((mp1 != NULL) ^ (ipif != NULL)));
+ if (mp1 == NULL) /* bringup was aborted by the user */
+ return;
+
+ /*
+ * If an IOCTL is waiting on this (ipsq_current_ioctl != 0), then we
+ * must have an associated conn_t. Otherwise, we're bringing this
+ * interface back up as part of handling an asynchronous event (e.g.,
+ * physical address change).
+ */
+ if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
+ ASSERT(connp != NULL);
+ q = CONNP_TO_WQ(connp);
+ } else {
+ ASSERT(connp == NULL);
+ q = ill->ill_rq;
+ }
+ if (err == 0) {
+ if (ipif->ipif_isv6) {
+ if ((err = ipif_up_done_v6(ipif)) != 0)
+ ip0dbg(("arp_bringup_done: init failed\n"));
+ } else {
+ err = ipif_arp_up_done_tail(ipif, Res_act_initial);
+ if (err != 0 || (err = ipif_up_done(ipif)) != 0)
+ ip0dbg(("arp_bringup_done: init failed\n"));
+ }
+ } else {
+ ip0dbg(("arp_bringup_done: DL_BIND_REQ failed\n"));
+ }
+
+ if ((err == 0) && (ill->ill_up_ipifs)) {
+ err = ill_up_ipifs(ill, q, mp1);
+ if (err == EINPROGRESS)
+ return;
+ }
+
+ /*
+ * If we have a moved ipif to bring up, and everything has succeeded
+ * to this point, bring it up on the IPMP ill. Otherwise, leave it
+ * down -- the admin can try to bring it up by hand if need be.
+ */
+ if (ill->ill_move_ipif != NULL) {
+ ipif = ill->ill_move_ipif;
+ ip1dbg(("bringing up ipif %p on ill %s\n", (void *)ipif,
+ ipif->ipif_ill->ill_name));
+ ill->ill_move_ipif = NULL;
+ if (err == 0) {
+ err = ipif_up(ipif, q, mp1);
+ if (err == EINPROGRESS)
+ return;
+ }
+ }
+
+ /*
+ * The operation must complete without EINPROGRESS since
+ * ipsq_pending_mp_get() has removed the mblk from ipsq_pending_mp.
+ * Otherwise, the operation will be stuck forever in the ipsq.
+ */
+ ASSERT(err != EINPROGRESS);
+ if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
+ DTRACE_PROBE4(ipif__ioctl, char *, "arp_bringup_done finish",
+ int, ipsq->ipsq_xop->ipx_current_ioctl,
+ ill_t *, ill, ipif_t *, ipif);
+ ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq);
+ } else {
+ ipsq_current_finish(ipsq);
+ }
+}
+
+/*
+ * Finish processing of arp replumb after all the DLPI message
+ * exchanges have completed between arp and the driver.
+ */
+void
+arp_replumb_done(ill_t *ill, int err)
+{
+ mblk_t *mp1;
+ ipif_t *ipif;
+ conn_t *connp = NULL;
+ ipsq_t *ipsq;
+ queue_t *q;
+
+ ASSERT(IAM_WRITER_ILL(ill));
+
+ ipsq = ill->ill_phyint->phyint_ipsq;
+ ipif = ipsq->ipsq_xop->ipx_pending_ipif;
+ mp1 = ipsq_pending_mp_get(ipsq, &connp);
+ ASSERT(!((mp1 != NULL) ^ (ipif != NULL)));
+ if (mp1 == NULL) {
+ ip0dbg(("arp_replumb_done: bringup aborted ioctl %x\n",
+ ipsq->ipsq_xop->ipx_current_ioctl));
+ /* bringup was aborted by the user */
+ return;
+ }
+ /*
+ * If an IOCTL is waiting on this (ipsq_current_ioctl != 0), then we
+ * must have an associated conn_t. Otherwise, we're bringing this
+ * interface back up as part of handling an asynchronous event (e.g.,
+ * physical address change).
+ */
+ if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
+ ASSERT(connp != NULL);
+ q = CONNP_TO_WQ(connp);
+ } else {
+ ASSERT(connp == NULL);
+ q = ill->ill_rq;
+ }
+ if ((err == 0) && (ill->ill_up_ipifs)) {
+ err = ill_up_ipifs(ill, q, mp1);
+ if (err == EINPROGRESS)
+ return;
+ }
+ /*
+ * The operation must complete without EINPROGRESS since
+ * ipsq_pending_mp_get() has removed the mblk from ipsq_pending_mp.
+ * Otherwise, the operation will be stuck forever in the ipsq.
+ */
+ ASSERT(err != EINPROGRESS);
+ if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
+ DTRACE_PROBE4(ipif__ioctl, char *,
+ "arp_replumb_done finish",
+ int, ipsq->ipsq_xop->ipx_current_ioctl,
+ ill_t *, ill, ipif_t *, ipif);
+ ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq);
+ } else {
+ ipsq_current_finish(ipsq);
+ }
+}
+
void
ipif_up_notify(ipif_t *ipif)
{
@@ -20610,3 +18167,48 @@ ip_sioctl_ilb_cmd(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
done:
return (ret);
}
+
+/* Remove all cache entries for this logical interface */
+void
+ipif_nce_down(ipif_t *ipif)
+{
+ ill_t *ill = ipif->ipif_ill;
+ nce_t *nce;
+
+ DTRACE_PROBE3(ipif__downup, char *, "ipif_nce_down",
+ ill_t *, ill, ipif_t *, ipif);
+ if (ipif->ipif_added_nce) {
+ if (ipif->ipif_isv6)
+ nce = nce_lookup_v6(ill, &ipif->ipif_v6lcl_addr);
+ else
+ nce = nce_lookup_v4(ill, &ipif->ipif_lcl_addr);
+ if (nce != NULL) {
+ if (--nce->nce_ipif_cnt == 0)
+ ncec_delete(nce->nce_common);
+ ipif->ipif_added_nce = 0;
+ nce_refrele(nce);
+ } else {
+ /*
+ * nce may already be NULL because it was already
+ * flushed, e.g., due to a call to nce_flush
+ */
+ ipif->ipif_added_nce = 0;
+ }
+ }
+ /*
+ * Make IPMP aware of the deleted data address.
+ */
+ if (IS_IPMP(ill))
+ ipmp_illgrp_del_ipif(ill->ill_grp, ipif);
+
+ /*
+ * Remove all other nces dependent on this ill when the last ipif
+ * is going away.
+ */
+ if (ill->ill_ipif_up_count == 0) {
+ ncec_walk(ill, (pfi_t)ncec_delete_per_ill,
+ (uchar_t *)ill, ill->ill_ipst);
+ if (IS_UNDER_IPMP(ill))
+ nce_flush(ill, B_TRUE);
+ }
+}
diff --git a/usr/src/uts/common/inet/ip/ip_input.c b/usr/src/uts/common/inet/ip/ip_input.c
new file mode 100644
index 0000000000..d47670f85d
--- /dev/null
+++ b/usr/src/uts/common/inet/ip/ip_input.c
@@ -0,0 +1,3095 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/* Copyright (c) 1990 Mentat Inc. */
+
+#include <sys/types.h>
+#include <sys/stream.h>
+#include <sys/dlpi.h>
+#include <sys/stropts.h>
+#include <sys/sysmacros.h>
+#include <sys/strsubr.h>
+#include <sys/strlog.h>
+#include <sys/strsun.h>
+#include <sys/zone.h>
+#define _SUN_TPI_VERSION 2
+#include <sys/tihdr.h>
+#include <sys/xti_inet.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/cmn_err.h>
+#include <sys/debug.h>
+#include <sys/kobj.h>
+#include <sys/modctl.h>
+#include <sys/atomic.h>
+#include <sys/policy.h>
+#include <sys/priv.h>
+
+#include <sys/systm.h>
+#include <sys/param.h>
+#include <sys/kmem.h>
+#include <sys/sdt.h>
+#include <sys/socket.h>
+#include <sys/vtrace.h>
+#include <sys/isa_defs.h>
+#include <sys/mac.h>
+#include <net/if.h>
+#include <net/if_arp.h>
+#include <net/route.h>
+#include <sys/sockio.h>
+#include <netinet/in.h>
+#include <net/if_dl.h>
+
+#include <inet/common.h>
+#include <inet/mi.h>
+#include <inet/mib2.h>
+#include <inet/nd.h>
+#include <inet/arp.h>
+#include <inet/snmpcom.h>
+#include <inet/kstatcom.h>
+
+#include <netinet/igmp_var.h>
+#include <netinet/ip6.h>
+#include <netinet/icmp6.h>
+#include <netinet/sctp.h>
+
+#include <inet/ip.h>
+#include <inet/ip_impl.h>
+#include <inet/ip6.h>
+#include <inet/ip6_asp.h>
+#include <inet/optcom.h>
+#include <inet/tcp.h>
+#include <inet/tcp_impl.h>
+#include <inet/ip_multi.h>
+#include <inet/ip_if.h>
+#include <inet/ip_ire.h>
+#include <inet/ip_ftable.h>
+#include <inet/ip_rts.h>
+#include <inet/ip_ndp.h>
+#include <inet/ip_listutils.h>
+#include <netinet/igmp.h>
+#include <netinet/ip_mroute.h>
+#include <inet/ipp_common.h>
+
+#include <net/pfkeyv2.h>
+#include <inet/sadb.h>
+#include <inet/ipsec_impl.h>
+#include <inet/ipdrop.h>
+#include <inet/ip_netinfo.h>
+#include <inet/ilb_ip.h>
+#include <sys/squeue_impl.h>
+#include <sys/squeue.h>
+
+#include <sys/ethernet.h>
+#include <net/if_types.h>
+#include <sys/cpuvar.h>
+
+#include <ipp/ipp.h>
+#include <ipp/ipp_impl.h>
+#include <ipp/ipgpc/ipgpc.h>
+
+#include <sys/pattr.h>
+#include <inet/ipclassifier.h>
+#include <inet/sctp_ip.h>
+#include <inet/sctp/sctp_impl.h>
+#include <inet/udp_impl.h>
+#include <sys/sunddi.h>
+
+#include <sys/tsol/label.h>
+#include <sys/tsol/tnet.h>
+
+#include <rpc/pmap_prot.h>
+
+#ifdef DEBUG
+extern boolean_t skip_sctp_cksum;
+#endif
+
+static void ip_input_local_v4(ire_t *, mblk_t *, ipha_t *,
+ ip_recv_attr_t *);
+
+static void ip_input_broadcast_v4(ire_t *, mblk_t *, ipha_t *,
+ ip_recv_attr_t *);
+static void ip_input_multicast_v4(ire_t *, mblk_t *, ipha_t *,
+ ip_recv_attr_t *);
+
+#pragma inline(ip_input_common_v4, ip_input_local_v4, ip_forward_xmit_v4)
+
+/*
+ * Direct read side procedure capable of dealing with chains. GLDv3 based
+ * drivers call this function directly with mblk chains while STREAMS
+ * read side procedure ip_rput() calls this for single packet with ip_ring
+ * set to NULL to process one packet at a time.
+ *
+ * The ill will always be valid if this function is called directly from
+ * the driver.
+ *
+ * If ip_input() is called from GLDv3:
+ *
+ * - This must be a non-VLAN IP stream.
+ * - 'mp' is either an untagged or a special priority-tagged packet.
+ * - Any VLAN tag that was in the MAC header has been stripped.
+ *
+ * If the IP header in packet is not 32-bit aligned, every message in the
+ * chain will be aligned before further operations. This is required on SPARC
+ * platform.
+ */
+void
+ip_input(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain,
+ struct mac_header_info_s *mhip)
+{
+ (void) ip_input_common_v4(ill, ip_ring, mp_chain, mhip, NULL, NULL,
+ NULL);
+}
+
+/*
+ * ip_accept_tcp() - This function is called by the squeue when it retrieves
+ * a chain of packets in the poll mode. The packets have gone through the
+ * data link processing but not IP processing. For performance and latency
+ * reasons, the squeue wants to process the chain in line instead of feeding
+ * it back via ip_input path.
+ *
+ * We set up the ip_recv_attr_t with IRAF_TARGET_SQP to that ip_fanout_v4
+ * will pass back any TCP packets matching the target sqp to
+ * ip_input_common_v4 using ira_target_sqp_mp. Other packets are handled by
+ * ip_input_v4 and ip_fanout_v4 as normal.
+ * The TCP packets that match the target squeue are returned to the caller
+ * as a b_next chain after each packet has been prepend with an mblk
+ * from ip_recv_attr_to_mblk.
+ */
+mblk_t *
+ip_accept_tcp(ill_t *ill, ill_rx_ring_t *ip_ring, squeue_t *target_sqp,
+ mblk_t *mp_chain, mblk_t **last, uint_t *cnt)
+{
+ return (ip_input_common_v4(ill, ip_ring, mp_chain, NULL, target_sqp,
+ last, cnt));
+}
+
+/*
+ * Used by ip_input and ip_accept_tcp
+ * The last three arguments are only used by ip_accept_tcp, and mhip is
+ * only used by ip_input.
+ */
+mblk_t *
+ip_input_common_v4(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain,
+ struct mac_header_info_s *mhip, squeue_t *target_sqp,
+ mblk_t **last, uint_t *cnt)
+{
+ mblk_t *mp;
+ ipha_t *ipha;
+ ip_recv_attr_t iras; /* Receive attributes */
+ rtc_t rtc;
+ iaflags_t chain_flags = 0; /* Fixed for chain */
+ mblk_t *ahead = NULL; /* Accepted head */
+ mblk_t *atail = NULL; /* Accepted tail */
+ uint_t acnt = 0; /* Accepted count */
+
+ ASSERT(mp_chain != NULL);
+ ASSERT(ill != NULL);
+
+ /* These ones do not change as we loop over packets */
+ iras.ira_ill = iras.ira_rill = ill;
+ iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
+ iras.ira_rifindex = iras.ira_ruifindex;
+ iras.ira_sqp = NULL;
+ iras.ira_ring = ip_ring;
+ /* For ECMP and outbound transmit ring selection */
+ iras.ira_xmit_hint = ILL_RING_TO_XMIT_HINT(ip_ring);
+
+ iras.ira_target_sqp = target_sqp;
+ iras.ira_target_sqp_mp = NULL;
+ if (target_sqp != NULL)
+ chain_flags |= IRAF_TARGET_SQP;
+
+ /*
+ * We try to have a mhip pointer when possible, but
+ * it might be NULL in some cases. In those cases we
+ * have to assume unicast.
+ */
+ iras.ira_mhip = mhip;
+ iras.ira_flags = 0;
+ if (mhip != NULL) {
+ switch (mhip->mhi_dsttype) {
+ case MAC_ADDRTYPE_MULTICAST :
+ chain_flags |= IRAF_L2DST_MULTICAST;
+ break;
+ case MAC_ADDRTYPE_BROADCAST :
+ chain_flags |= IRAF_L2DST_BROADCAST;
+ break;
+ }
+ }
+
+ /*
+ * Initialize the one-element route cache.
+ *
+ * We do ire caching from one iteration to
+ * another. In the event the packet chain contains
+ * all packets from the same dst, this caching saves
+ * an ire_route_recursive for each of the succeeding
+ * packets in a packet chain.
+ */
+ rtc.rtc_ire = NULL;
+ rtc.rtc_ipaddr = INADDR_ANY;
+
+ /* Loop over b_next */
+ for (mp = mp_chain; mp != NULL; mp = mp_chain) {
+ mp_chain = mp->b_next;
+ mp->b_next = NULL;
+
+ ASSERT(DB_TYPE(mp) == M_DATA);
+
+
+ /*
+ * if db_ref > 1 then copymsg and free original. Packet
+ * may be changed and we do not want the other entity
+ * who has a reference to this message to trip over the
+ * changes. This is a blind change because trying to
+ * catch all places that might change the packet is too
+ * difficult.
+ *
+ * This corresponds to the fast path case, where we have
+ * a chain of M_DATA mblks. We check the db_ref count
+ * of only the 1st data block in the mblk chain. There
+ * doesn't seem to be a reason why a device driver would
+ * send up data with varying db_ref counts in the mblk
+ * chain. In any case the Fast path is a private
+ * interface, and our drivers don't do such a thing.
+ * Given the above assumption, there is no need to walk
+ * down the entire mblk chain (which could have a
+ * potential performance problem)
+ *
+ * The "(DB_REF(mp) > 1)" check was moved from ip_rput()
+ * to here because of exclusive ip stacks and vnics.
+ * Packets transmitted from exclusive stack over vnic
+ * can have db_ref > 1 and when it gets looped back to
+ * another vnic in a different zone, you have ip_input()
+ * getting dblks with db_ref > 1. So if someone
+ * complains of TCP performance under this scenario,
+ * take a serious look here on the impact of copymsg().
+ */
+ if (DB_REF(mp) > 1) {
+ if ((mp = ip_fix_dbref(mp, &iras)) == NULL) {
+ /* mhip might point into 1st packet in chain */
+ iras.ira_mhip = NULL;
+ continue;
+ }
+ }
+
+ /*
+ * IP header ptr not aligned?
+ * OR IP header not complete in first mblk
+ */
+ ipha = (ipha_t *)mp->b_rptr;
+ if (!OK_32PTR(ipha) || MBLKL(mp) < IP_SIMPLE_HDR_LENGTH) {
+ mp = ip_check_and_align_header(mp, IP_SIMPLE_HDR_LENGTH,
+ &iras);
+ if (mp == NULL) {
+ /* mhip might point into 1st packet in chain */
+ iras.ira_mhip = NULL;
+ continue;
+ }
+ ipha = (ipha_t *)mp->b_rptr;
+ }
+
+ /* Protect against a mix of Ethertypes and IP versions */
+ if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
+ ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
+ freemsg(mp);
+ /* mhip might point into 1st packet in the chain. */
+ iras.ira_mhip = NULL;
+ continue;
+ }
+
+ /*
+ * Check for Martian addrs; we have to explicitly
+ * test for for zero dst since this is also used as
+ * an indication that the rtc is not used.
+ */
+ if (ipha->ipha_dst == INADDR_ANY) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors);
+ ip_drop_input("ipIfStatsInAddrErrors", mp, ill);
+ freemsg(mp);
+ /* mhip might point into 1st packet in the chain. */
+ iras.ira_mhip = NULL;
+ continue;
+ }
+
+ /*
+ * Keep L2SRC from a previous packet in chain since mhip
+ * might point into an earlier packet in the chain.
+ * Keep IRAF_VERIFIED_SRC to avoid redoing broadcast
+ * source check in forwarding path.
+ */
+ chain_flags |= (iras.ira_flags &
+ (IRAF_L2SRC_SET|IRAF_VERIFIED_SRC));
+
+ iras.ira_flags = IRAF_IS_IPV4 | IRAF_VERIFY_IP_CKSUM |
+ IRAF_VERIFY_ULP_CKSUM | chain_flags;
+ iras.ira_free_flags = 0;
+ iras.ira_cred = NULL;
+ iras.ira_cpid = NOPID;
+ iras.ira_tsl = NULL;
+ iras.ira_zoneid = ALL_ZONES; /* Default for forwarding */
+
+ /*
+ * We must count all incoming packets, even if they end
+ * up being dropped later on. Defer counting bytes until
+ * we have the whole IP header in first mblk.
+ */
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives);
+
+ iras.ira_pktlen = ntohs(ipha->ipha_length);
+ UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets,
+ iras.ira_pktlen);
+
+ /*
+ * Call one of:
+ * ill_input_full_v4
+ * ill_input_short_v4
+ * The former is used in unusual cases. See ill_set_inputfn().
+ */
+ (*ill->ill_inputfn)(mp, ipha, &ipha->ipha_dst, &iras, &rtc);
+
+ /* Any references to clean up? No hold on ira_ill */
+ if (iras.ira_flags & (IRAF_IPSEC_SECURE|IRAF_SYSTEM_LABELED))
+ ira_cleanup(&iras, B_FALSE);
+
+ if (iras.ira_target_sqp_mp != NULL) {
+ /* Better be called from ip_accept_tcp */
+ ASSERT(target_sqp != NULL);
+
+ /* Found one packet to accept */
+ mp = iras.ira_target_sqp_mp;
+ iras.ira_target_sqp_mp = NULL;
+ ASSERT(ip_recv_attr_is_mblk(mp));
+
+ if (atail != NULL)
+ atail->b_next = mp;
+ else
+ ahead = mp;
+ atail = mp;
+ acnt++;
+ mp = NULL;
+ }
+ /* mhip might point into 1st packet in the chain. */
+ iras.ira_mhip = NULL;
+ }
+ /* Any remaining references to the route cache? */
+ if (rtc.rtc_ire != NULL) {
+ ASSERT(rtc.rtc_ipaddr != INADDR_ANY);
+ ire_refrele(rtc.rtc_ire);
+ }
+
+ if (ahead != NULL) {
+ /* Better be called from ip_accept_tcp */
+ ASSERT(target_sqp != NULL);
+ *last = atail;
+ *cnt = acnt;
+ return (ahead);
+ }
+
+ return (NULL);
+}
+
+/*
+ * This input function is used when
+ * - is_system_labeled()
+ * - CGTP filtering
+ * - DHCP unicast before we have an IP address configured
+ * - there is an listener for IPPROTO_RSVP
+ */
+void
+ill_input_full_v4(mblk_t *mp, void *iph_arg, void *nexthop_arg,
+ ip_recv_attr_t *ira, rtc_t *rtc)
+{
+ ipha_t *ipha = (ipha_t *)iph_arg;
+ ipaddr_t nexthop = *(ipaddr_t *)nexthop_arg;
+ ill_t *ill = ira->ira_ill;
+ ip_stack_t *ipst = ill->ill_ipst;
+ int cgtp_flt_pkt;
+
+ ASSERT(ira->ira_tsl == NULL);
+
+ /*
+ * Attach any necessary label information to
+ * this packet
+ */
+ if (is_system_labeled()) {
+ ira->ira_flags |= IRAF_SYSTEM_LABELED;
+
+ /*
+ * This updates ira_cred, ira_tsl and ira_free_flags based
+ * on the label.
+ */
+ if (!tsol_get_pkt_label(mp, IPV4_VERSION, ira)) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards", mp, ill);
+ freemsg(mp);
+ return;
+ }
+ /* Note that ira_tsl can be NULL here. */
+
+ /* tsol_get_pkt_label sometimes does pullupmsg */
+ ipha = (ipha_t *)mp->b_rptr;
+ }
+
+ /*
+ * Invoke the CGTP (multirouting) filtering module to process
+ * the incoming packet. Packets identified as duplicates
+ * must be discarded. Filtering is active only if the
+ * the ip_cgtp_filter ndd variable is non-zero.
+ */
+ cgtp_flt_pkt = CGTP_IP_PKT_NOT_CGTP;
+ if (ipst->ips_ip_cgtp_filter &&
+ ipst->ips_ip_cgtp_filter_ops != NULL) {
+ netstackid_t stackid;
+
+ stackid = ipst->ips_netstack->netstack_stackid;
+ /*
+ * CGTP and IPMP are mutually exclusive so
+ * phyint_ifindex is fine here.
+ */
+ cgtp_flt_pkt =
+ ipst->ips_ip_cgtp_filter_ops->cfo_filter(stackid,
+ ill->ill_phyint->phyint_ifindex, mp);
+ if (cgtp_flt_pkt == CGTP_IP_PKT_DUPLICATE) {
+ ip_drop_input("CGTP_IP_PKT_DUPLICATE", mp, ill);
+ freemsg(mp);
+ return;
+ }
+ }
+
+ /*
+ * Brutal hack for DHCPv4 unicast: RFC2131 allows a DHCP
+ * server to unicast DHCP packets to a DHCP client using the
+ * IP address it is offering to the client. This can be
+ * disabled through the "broadcast bit", but not all DHCP
+ * servers honor that bit. Therefore, to interoperate with as
+ * many DHCP servers as possible, the DHCP client allows the
+ * server to unicast, but we treat those packets as broadcast
+ * here. Note that we don't rewrite the packet itself since
+ * (a) that would mess up the checksums and (b) the DHCP
+ * client conn is bound to INADDR_ANY so ip_fanout_udp() will
+ * hand it the packet regardless.
+ */
+ if (ill->ill_dhcpinit != 0 &&
+ ipha->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION &&
+ ipha->ipha_protocol == IPPROTO_UDP) {
+ udpha_t *udpha;
+
+ ipha = ip_pullup(mp, sizeof (ipha_t) + sizeof (udpha_t), ira);
+ if (ipha == NULL) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards - dhcp", mp, ill);
+ freemsg(mp);
+ return;
+ }
+ /* Reload since pullupmsg() can change b_rptr. */
+ udpha = (udpha_t *)&ipha[1];
+
+ if (ntohs(udpha->uha_dst_port) == IPPORT_BOOTPC) {
+ DTRACE_PROBE2(ip4__dhcpinit__pkt, ill_t *, ill,
+ mblk_t *, mp);
+ /*
+ * This assumes that we deliver to all conns for
+ * multicast and broadcast packets.
+ */
+ nexthop = INADDR_BROADCAST;
+ ira->ira_flags |= IRAF_DHCP_UNICAST;
+ }
+ }
+
+ /*
+ * If rsvpd is running, let RSVP daemon handle its processing
+ * and forwarding of RSVP multicast/unicast packets.
+ * If rsvpd is not running but mrouted is running, RSVP
+ * multicast packets are forwarded as multicast traffic
+ * and RSVP unicast packets are forwarded by unicast router.
+ * If neither rsvpd nor mrouted is running, RSVP multicast
+ * packets are not forwarded, but the unicast packets are
+ * forwarded like unicast traffic.
+ */
+ if (ipha->ipha_protocol == IPPROTO_RSVP &&
+ ipst->ips_ipcl_proto_fanout_v4[IPPROTO_RSVP].connf_head != NULL) {
+ /* RSVP packet and rsvpd running. Treat as ours */
+ ip2dbg(("ip_input: RSVP for us: 0x%x\n", ntohl(nexthop)));
+ /*
+ * We use a multicast address to get the packet to
+ * ire_recv_multicast_v4. There will not be a membership
+ * check since we set IRAF_RSVP
+ */
+ nexthop = htonl(INADDR_UNSPEC_GROUP);
+ ira->ira_flags |= IRAF_RSVP;
+ }
+
+ ill_input_short_v4(mp, ipha, &nexthop, ira, rtc);
+}
+
+/*
+ * This is the tail-end of the full receive side packet handling.
+ * It can be used directly when the configuration is simple.
+ */
+void
+ill_input_short_v4(mblk_t *mp, void *iph_arg, void *nexthop_arg,
+ ip_recv_attr_t *ira, rtc_t *rtc)
+{
+ ire_t *ire;
+ uint_t opt_len;
+ ill_t *ill = ira->ira_ill;
+ ip_stack_t *ipst = ill->ill_ipst;
+ uint_t pkt_len;
+ ssize_t len;
+ ipha_t *ipha = (ipha_t *)iph_arg;
+ ipaddr_t nexthop = *(ipaddr_t *)nexthop_arg;
+ ilb_stack_t *ilbs = ipst->ips_netstack->netstack_ilb;
+#define rptr ((uchar_t *)ipha)
+
+ ASSERT(DB_TYPE(mp) == M_DATA);
+
+ /*
+ * The following test for loopback is faster than
+ * IP_LOOPBACK_ADDR(), because it avoids any bitwise
+ * operations.
+ * Note that these addresses are always in network byte order
+ */
+ if (((*(uchar_t *)&ipha->ipha_dst) == 127) ||
+ ((*(uchar_t *)&ipha->ipha_src) == 127)) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors);
+ ip_drop_input("ipIfStatsInAddrErrors", mp, ill);
+ freemsg(mp);
+ return;
+ }
+
+ len = mp->b_wptr - rptr;
+ pkt_len = ira->ira_pktlen;
+
+ /* multiple mblk or too short */
+ len -= pkt_len;
+ if (len != 0) {
+ mp = ip_check_length(mp, rptr, len, pkt_len,
+ IP_SIMPLE_HDR_LENGTH, ira);
+ if (mp == NULL)
+ return;
+ ipha = (ipha_t *)mp->b_rptr;
+ }
+
+ DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
+ ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL,
+ int, 0);
+
+ /*
+ * The event for packets being received from a 'physical'
+ * interface is placed after validation of the source and/or
+ * destination address as being local so that packets can be
+ * redirected to loopback addresses using ipnat.
+ */
+ DTRACE_PROBE4(ip4__physical__in__start,
+ ill_t *, ill, ill_t *, NULL,
+ ipha_t *, ipha, mblk_t *, mp);
+
+ if (HOOKS4_INTERESTED_PHYSICAL_IN(ipst)) {
+ int ll_multicast = 0;
+ int error;
+ ipaddr_t orig_dst = ipha->ipha_dst;
+
+ if (ira->ira_flags & IRAF_L2DST_MULTICAST)
+ ll_multicast = HPE_MULTICAST;
+ else if (ira->ira_flags & IRAF_L2DST_BROADCAST)
+ ll_multicast = HPE_BROADCAST;
+
+ FW_HOOKS(ipst->ips_ip4_physical_in_event,
+ ipst->ips_ipv4firewall_physical_in,
+ ill, NULL, ipha, mp, mp, ll_multicast, ipst, error);
+
+ DTRACE_PROBE1(ip4__physical__in__end, mblk_t *, mp);
+
+ if (mp == NULL)
+ return;
+ /* The length could have changed */
+ ipha = (ipha_t *)mp->b_rptr;
+ ira->ira_pktlen = ntohs(ipha->ipha_length);
+ pkt_len = ira->ira_pktlen;
+
+ /*
+ * In case the destination changed we override any previous
+ * change to nexthop.
+ */
+ if (orig_dst != ipha->ipha_dst)
+ nexthop = ipha->ipha_dst;
+ if (nexthop == INADDR_ANY) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors);
+ ip_drop_input("ipIfStatsInAddrErrors", mp, ill);
+ freemsg(mp);
+ return;
+ }
+ }
+
+ if (ipst->ips_ip4_observe.he_interested) {
+ zoneid_t dzone;
+
+ /*
+ * On the inbound path the src zone will be unknown as
+ * this packet has come from the wire.
+ */
+ dzone = ip_get_zoneid_v4(nexthop, mp, ira, ALL_ZONES);
+ ipobs_hook(mp, IPOBS_HOOK_INBOUND, ALL_ZONES, dzone, ill, ipst);
+ }
+
+ /*
+ * If there is a good HW IP header checksum we clear the need
+ * look at the IP header checksum.
+ */
+ if ((DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM) &&
+ ILL_HCKSUM_CAPABLE(ill) && dohwcksum) {
+ /* Header checksum was ok. Clear the flag */
+ DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM;
+ ira->ira_flags &= ~IRAF_VERIFY_IP_CKSUM;
+ }
+
+ /*
+ * Here we check to see if we machine is setup as
+ * L3 loadbalancer and if the incoming packet is for a VIP
+ *
+ * Check the following:
+ * - there is at least a rule
+ * - protocol of the packet is supported
+ */
+ if (ilb_has_rules(ilbs) && ILB_SUPP_L4(ipha->ipha_protocol)) {
+ ipaddr_t lb_dst;
+ int lb_ret;
+
+ /* For convenience, we pull up the mblk. */
+ if (mp->b_cont != NULL) {
+ if (pullupmsg(mp, -1) == 0) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards - pullupmsg",
+ mp, ill);
+ freemsg(mp);
+ return;
+ }
+ ipha = (ipha_t *)mp->b_rptr;
+ }
+
+ /*
+ * We just drop all fragments going to any VIP, at
+ * least for now....
+ */
+ if (ntohs(ipha->ipha_fragment_offset_and_flags) &
+ (IPH_MF | IPH_OFFSET)) {
+ if (!ilb_rule_match_vip_v4(ilbs, nexthop, NULL)) {
+ goto after_ilb;
+ }
+
+ ILB_KSTAT_UPDATE(ilbs, ip_frag_in, 1);
+ ILB_KSTAT_UPDATE(ilbs, ip_frag_dropped, 1);
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ILB fragment", mp, ill);
+ freemsg(mp);
+ return;
+ }
+ lb_ret = ilb_check_v4(ilbs, ill, mp, ipha, ipha->ipha_protocol,
+ (uint8_t *)ipha + IPH_HDR_LENGTH(ipha), &lb_dst);
+
+ if (lb_ret == ILB_DROPPED) {
+ /* Is this the right counter to increase? */
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ILB_DROPPED", mp, ill);
+ freemsg(mp);
+ return;
+ }
+ if (lb_ret == ILB_BALANCED) {
+ /* Set the dst to that of the chosen server */
+ nexthop = lb_dst;
+ DB_CKSUMFLAGS(mp) = 0;
+ }
+ }
+
+after_ilb:
+ opt_len = ipha->ipha_version_and_hdr_length - IP_SIMPLE_HDR_VERSION;
+ ira->ira_ip_hdr_length = IP_SIMPLE_HDR_LENGTH;
+ if (opt_len != 0) {
+ int error = 0;
+
+ ira->ira_ip_hdr_length += (opt_len << 2);
+ ira->ira_flags |= IRAF_IPV4_OPTIONS;
+
+ /* IP Options present! Validate the length. */
+ mp = ip_check_optlen(mp, ipha, opt_len, pkt_len, ira);
+ if (mp == NULL)
+ return;
+
+ /* Might have changed */
+ ipha = (ipha_t *)mp->b_rptr;
+
+ /* Verify IP header checksum before parsing the options */
+ if ((ira->ira_flags & IRAF_VERIFY_IP_CKSUM) &&
+ ip_csum_hdr(ipha)) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs);
+ ip_drop_input("ipIfStatsInCksumErrs", mp, ill);
+ freemsg(mp);
+ return;
+ }
+ ira->ira_flags &= ~IRAF_VERIFY_IP_CKSUM;
+
+ /*
+ * Go off to ip_input_options which returns the next hop
+ * destination address, which may have been affected
+ * by source routing.
+ */
+ IP_STAT(ipst, ip_opt);
+
+ nexthop = ip_input_options(ipha, nexthop, mp, ira, &error);
+ if (error != 0) {
+ /*
+ * An ICMP error has been sent and the packet has
+ * been dropped.
+ */
+ return;
+ }
+ }
+ /* Can not use route cache with TX since the labels can differ */
+ if (ira->ira_flags & IRAF_SYSTEM_LABELED) {
+ if (CLASSD(nexthop)) {
+ ire = ire_multicast(ill);
+ } else {
+ /* Match destination and label */
+ ire = ire_route_recursive_v4(nexthop, 0, NULL,
+ ALL_ZONES, ira->ira_tsl, MATCH_IRE_SECATTR,
+ (ill->ill_flags & ILLF_ROUTER),
+ ira->ira_xmit_hint, ipst, NULL, NULL, NULL);
+ }
+ /* Update the route cache so we do the ire_refrele */
+ ASSERT(ire != NULL);
+ if (rtc->rtc_ire != NULL)
+ ire_refrele(rtc->rtc_ire);
+ rtc->rtc_ire = ire;
+ rtc->rtc_ipaddr = nexthop;
+ } else if (nexthop == rtc->rtc_ipaddr) {
+ /* Use the route cache */
+ ASSERT(rtc->rtc_ire != NULL);
+ ire = rtc->rtc_ire;
+ } else {
+ /* Update the route cache */
+ if (CLASSD(nexthop)) {
+ ire = ire_multicast(ill);
+ } else {
+ /* Just match the destination */
+ ire = ire_route_recursive_dstonly_v4(nexthop,
+ (ill->ill_flags & ILLF_ROUTER), ira->ira_xmit_hint,
+ ipst);
+ }
+ ASSERT(ire != NULL);
+ if (rtc->rtc_ire != NULL)
+ ire_refrele(rtc->rtc_ire);
+ rtc->rtc_ire = ire;
+ rtc->rtc_ipaddr = nexthop;
+ }
+
+ ire->ire_ib_pkt_count++;
+
+ /*
+ * Based on ire_type and ire_flags call one of:
+ * ire_recv_local_v4 - for IRE_LOCAL
+ * ire_recv_loopback_v4 - for IRE_LOOPBACK
+ * ire_recv_multirt_v4 - if RTF_MULTIRT
+ * ire_recv_noroute_v4 - if RTF_REJECT or RTF_BLACHOLE
+ * ire_recv_multicast_v4 - for IRE_MULTICAST
+ * ire_recv_broadcast_v4 - for IRE_BROADCAST
+ * ire_recv_noaccept_v4 - for ire_noaccept ones
+ * ire_recv_forward_v4 - for the rest.
+ */
+ (*ire->ire_recvfn)(ire, mp, ipha, ira);
+}
+#undef rptr
+
+/*
+ * ire_recvfn for IREs that need forwarding
+ *
+ * Called for packets whose destination is not local to this node.
+ * Checks that forwarding is permitted, resolves the outbound nce
+ * (link-layer state) from the IRE, runs IPPF and firewall hooks,
+ * performs martian source filtering and Trusted Extensions (CIPSO)
+ * processing, and finally hands the packet to ip_forward_xmit_v4().
+ * The mblk is consumed on every path (freed, sent as ICMP, or
+ * transmitted).
+ */
+void
+ire_recv_forward_v4(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira)
+{
+	ipha_t		*ipha = (ipha_t *)iph_arg;
+	ill_t		*ill = ira->ira_ill;
+	ip_stack_t	*ipst = ill->ill_ipst;
+	ill_t		*dst_ill;
+	nce_t		*nce;
+	ipaddr_t	src = ipha->ipha_src;
+	uint32_t	added_tx_len;
+	uint32_t	mtu, iremtu;
+
+	/* Never forward packets that arrived as link-layer multi/broadcast */
+	if (ira->ira_flags & (IRAF_L2DST_MULTICAST|IRAF_L2DST_BROADCAST)) {
+		BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
+		ip_drop_input("l2 multicast not forwarded", mp, ill);
+		freemsg(mp);
+		return;
+	}
+
+	/* Inbound ill must be a router, except for source-routed packets */
+	if (!(ill->ill_flags & ILLF_ROUTER) && !ip_source_routed(ipha, ipst)) {
+		BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
+		ip_drop_input("ipIfStatsForwProhibits", mp, ill);
+		freemsg(mp);
+		return;
+	}
+
+	/*
+	 * Either ire_nce_capable or ire_dep_parent would be set for the IRE
+	 * when it is found by ire_route_recursive, but that some other thread
+	 * could have changed the routes with the effect of clearing
+	 * ire_dep_parent. In that case we'd end up dropping the packet, or
+	 * finding a new nce below.
+	 * Get, allocate, or update the nce.
+	 * We get a refhold on ire_nce_cache as a result of this to avoid races
+	 * where ire_nce_cache is deleted.
+	 *
+	 * This ensures that we don't forward if the interface is down since
+	 * ipif_down removes all the nces.
+	 */
+	mutex_enter(&ire->ire_lock);
+	nce = ire->ire_nce_cache;
+	if (nce == NULL) {
+		/* Not yet set up - try to set one up */
+		mutex_exit(&ire->ire_lock);
+		(void) ire_revalidate_nce(ire);
+		mutex_enter(&ire->ire_lock);
+		nce = ire->ire_nce_cache;
+		if (nce == NULL) {
+			mutex_exit(&ire->ire_lock);
+			/* The ire_dep_parent chain went bad, or no memory */
+			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+			ip_drop_input("No ire_dep_parent", mp, ill);
+			freemsg(mp);
+			return;
+		}
+	}
+	/* From here on every exit path must nce_refrele(nce) */
+	nce_refhold(nce);
+	mutex_exit(&ire->ire_lock);
+
+	if (nce->nce_is_condemned) {
+		nce_t *nce1;
+
+		nce1 = ire_handle_condemned_nce(nce, ire, ipha, NULL, B_FALSE);
+		nce_refrele(nce);
+		if (nce1 == NULL) {
+			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+			ip_drop_input("No nce", mp, ill);
+			freemsg(mp);
+			return;
+		}
+		nce = nce1;
+	}
+	dst_ill = nce->nce_ill;
+
+	/*
+	 * Unless we are forwarding, drop the packet.
+	 * We have to let source routed packets through if they go out
+	 * the same interface i.e., they are 'ping -l' packets.
+	 */
+	if (!(dst_ill->ill_flags & ILLF_ROUTER) &&
+	    !(ip_source_routed(ipha, ipst) && dst_ill == ill)) {
+		if (ip_source_routed(ipha, ipst)) {
+			ip_drop_input("ICMP_SOURCE_ROUTE_FAILED", mp, ill);
+			icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, ira);
+			nce_refrele(nce);
+			return;
+		}
+		BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
+		ip_drop_input("ipIfStatsForwProhibits", mp, ill);
+		freemsg(mp);
+		nce_refrele(nce);
+		return;
+	}
+
+	/*
+	 * Zone-local IRE: re-resolve from the global zone's view and
+	 * re-dispatch through the new IRE's receive function.
+	 */
+	if (ire->ire_zoneid != GLOBAL_ZONEID && ire->ire_zoneid != ALL_ZONES) {
+		ipaddr_t	dst = ipha->ipha_dst;
+
+		ire->ire_ib_pkt_count--;
+		/*
+		 * Should only use IREs that are visible from the
+		 * global zone for forwarding.
+		 * Take a source route into account the same way as ip_input
+		 * did.
+		 */
+		if (ira->ira_flags & IRAF_IPV4_OPTIONS) {
+			int		error = 0;
+
+			dst = ip_input_options(ipha, dst, mp, ira, &error);
+			ASSERT(error == 0);	/* ip_input checked */
+		}
+		ire = ire_route_recursive_v4(dst, 0, NULL, GLOBAL_ZONEID,
+		    ira->ira_tsl, MATCH_IRE_SECATTR,
+		    (ill->ill_flags & ILLF_ROUTER), ira->ira_xmit_hint, ipst,
+		    NULL, NULL, NULL);
+		ire->ire_ib_pkt_count++;
+		(*ire->ire_recvfn)(ire, mp, ipha, ira);
+		ire_refrele(ire);
+		nce_refrele(nce);
+		return;
+	}
+
+	/*
+	 * ipIfStatsHCInForwDatagrams should only be incremented if there
+	 * will be an attempt to forward the packet, which is why we
+	 * increment after the above condition has been checked.
+	 */
+	BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams);
+
+	/* Initiate Read side IPPF processing */
+	if (IPP_ENABLED(IPP_FWD_IN, ipst)) {
+		/* ip_process translates an IS_UNDER_IPMP */
+		mp = ip_process(IPP_FWD_IN, mp, ill, ill);
+		if (mp == NULL) {
+			/* ip_drop_packet and MIB done */
+			ip2dbg(("ire_recv_forward_v4: pkt dropped/deferred "
+			    "during IPPF processing\n"));
+			nce_refrele(nce);
+			return;
+		}
+	}
+
+	DTRACE_PROBE4(ip4__forwarding__start,
+	    ill_t *, ill, ill_t *, dst_ill, ipha_t *, ipha, mblk_t *, mp);
+
+	if (HOOKS4_INTERESTED_FORWARDING(ipst)) {
+		int	error;
+
+		/* Firewall hooks may modify, replace, or consume mp */
+		FW_HOOKS(ipst->ips_ip4_forwarding_event,
+		    ipst->ips_ipv4firewall_forwarding,
+		    ill, dst_ill, ipha, mp, mp, 0, ipst, error);
+
+		DTRACE_PROBE1(ip4__forwarding__end, mblk_t *, mp);
+
+		if (mp == NULL) {
+			nce_refrele(nce);
+			return;
+		}
+		/*
+		 * Even if the destination was changed by the filter we use the
+		 * forwarding decision that was made based on the address
+		 * in ip_input.
+		 */
+
+		/* Might have changed */
+		ipha = (ipha_t *)mp->b_rptr;
+		ira->ira_pktlen = ntohs(ipha->ipha_length);
+	}
+
+	/* Packet is being forwarded. Turning off hwcksum flag. */
+	DB_CKSUMFLAGS(mp) = 0;
+
+	/*
+	 * Martian Address Filtering [RFC 1812, Section 5.3.7]
+	 * The loopback address check for both src and dst has already
+	 * been checked in ip_input
+	 * In the future one can envision adding RPF checks using number 3.
+	 * If we already checked the same source address we can skip this.
+	 */
+	if (!(ira->ira_flags & IRAF_VERIFIED_SRC) ||
+	    src != ira->ira_verified_src) {
+		switch (ipst->ips_src_check) {
+		case 0:
+			break;
+		case 2:
+			if (ip_type_v4(src, ipst) == IRE_BROADCAST) {
+				BUMP_MIB(ill->ill_ip_mib,
+				    ipIfStatsForwProhibits);
+				BUMP_MIB(ill->ill_ip_mib,
+				    ipIfStatsInAddrErrors);
+				ip_drop_input("ipIfStatsInAddrErrors", mp, ill);
+				freemsg(mp);
+				nce_refrele(nce);
+				return;
+			}
+			/* FALLTHRU */
+
+		case 1:
+			if (CLASSD(src)) {
+				BUMP_MIB(ill->ill_ip_mib,
+				    ipIfStatsForwProhibits);
+				BUMP_MIB(ill->ill_ip_mib,
+				    ipIfStatsInAddrErrors);
+				ip_drop_input("ipIfStatsInAddrErrors", mp, ill);
+				freemsg(mp);
+				nce_refrele(nce);
+				return;
+			}
+			break;
+		}
+		/* Remember for next packet */
+		ira->ira_flags |= IRAF_VERIFIED_SRC;
+		ira->ira_verified_src = src;
+	}
+
+	/*
+	 * Check if packet is going out the same link on which it arrived.
+	 * Means we might need to send a redirect.
+	 */
+	if (IS_ON_SAME_LAN(dst_ill, ill) && ipst->ips_ip_g_send_redirects) {
+		ip_send_potential_redirect_v4(mp, ipha, ire, ira);
+	}
+
+	added_tx_len = 0;
+	if (ira->ira_flags & IRAF_SYSTEM_LABELED) {
+		mblk_t		*mp1;
+		uint32_t	old_pkt_len = ira->ira_pktlen;
+
+		/*
+		 * Check if it can be forwarded and add/remove
+		 * CIPSO options as needed.
+		 */
+		if ((mp1 = tsol_ip_forward(ire, mp, ira)) == NULL) {
+			BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
+			ip_drop_input("tsol_ip_forward", mp, ill);
+			freemsg(mp);
+			nce_refrele(nce);
+			return;
+		}
+		/*
+		 * Size may have changed. Remember amount added in case
+		 * IP needs to send an ICMP too big.
+		 */
+		mp = mp1;
+		ipha = (ipha_t *)mp->b_rptr;
+		ira->ira_pktlen = ntohs(ipha->ipha_length);
+		ira->ira_ip_hdr_length = IPH_HDR_LENGTH(ipha);
+		if (ira->ira_pktlen > old_pkt_len)
+			added_tx_len = ira->ira_pktlen - old_pkt_len;
+
+		/* Options can have been added or removed */
+		if (ira->ira_ip_hdr_length != IP_SIMPLE_HDR_LENGTH)
+			ira->ira_flags |= IRAF_IPV4_OPTIONS;
+		else
+			ira->ira_flags &= ~IRAF_IPV4_OPTIONS;
+	}
+
+	/* Effective path MTU: the smaller of interface MTU and route MTU */
+	mtu = dst_ill->ill_mtu;
+	if ((iremtu = ire->ire_metrics.iulp_mtu) != 0 && iremtu < mtu)
+		mtu = iremtu;
+	ip_forward_xmit_v4(nce, ill, mp, ipha, ira, mtu, added_tx_len);
+	nce_refrele(nce);
+}
+
+/*
+ * Used for sending out unicast and multicast packets that are
+ * forwarded.
+ *
+ * nce		- neighbor cache entry for the next hop; nce->nce_ill is the
+ *		  outbound interface.
+ * ill		- interface the packet arrived on (used for input-side MIBs).
+ * mtu		- path MTU to enforce (interface/route minimum).
+ * added_tx_len	- bytes added by tsol_ip_forward (CIPSO options); used to
+ *		  report a smaller MTU in ICMP "fragmentation needed".
+ *
+ * Decrements the TTL (sending ICMP time exceeded at TTL <= 1), updates
+ * options and checksum as needed, and either fragments, loops back a
+ * copy, or transmits. Consumes mp on all paths.
+ */
+void
+ip_forward_xmit_v4(nce_t *nce, ill_t *ill, mblk_t *mp, ipha_t *ipha,
+    ip_recv_attr_t *ira, uint32_t mtu, uint32_t added_tx_len)
+{
+	ill_t		*dst_ill = nce->nce_ill;
+	uint32_t	pkt_len;
+	uint32_t	sum;
+	iaflags_t	iraflags = ira->ira_flags;
+	ip_stack_t	*ipst = ill->ill_ipst;
+	iaflags_t	ixaflags;
+
+	if (ipha->ipha_ttl <= 1) {
+		/* Perhaps the checksum was bad */
+		if ((iraflags & IRAF_VERIFY_IP_CKSUM) && ip_csum_hdr(ipha)) {
+			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs);
+			ip_drop_input("ipIfStatsInCksumErrs", mp, ill);
+			freemsg(mp);
+			return;
+		}
+		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+		ip_drop_input("ICMP_TTL_EXCEEDED", mp, ill);
+		icmp_time_exceeded(mp, ICMP_TTL_EXCEEDED, ira);
+		return;
+	}
+	ipha->ipha_ttl--;
+	/* Adjust the checksum to reflect the ttl decrement. */
+	sum = (int)ipha->ipha_hdr_checksum + IP_HDR_CSUM_TTL_ADJUST;
+	ipha->ipha_hdr_checksum = (uint16_t)(sum + (sum >> 16));
+
+	/* Check if there are options to update */
+	if (iraflags & IRAF_IPV4_OPTIONS) {
+		ASSERT(ipha->ipha_version_and_hdr_length !=
+		    IP_SIMPLE_HDR_VERSION);
+		ASSERT(!(iraflags & IRAF_VERIFY_IP_CKSUM));
+
+		if (!ip_forward_options(mp, ipha, dst_ill, ira)) {
+			/* ipIfStatsForwProhibits and ip_drop_input done */
+			return;
+		}
+
+		/* Options may have changed; recompute the full checksum */
+		ipha->ipha_hdr_checksum = 0;
+		ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
+	}
+
+	/* Initiate Write side IPPF processing before any fragmentation */
+	if (IPP_ENABLED(IPP_FWD_OUT, ipst)) {
+		/* ip_process translates an IS_UNDER_IPMP */
+		mp = ip_process(IPP_FWD_OUT, mp, dst_ill, dst_ill);
+		if (mp == NULL) {
+			/* ip_drop_packet and MIB done */
+			ip2dbg(("ire_recv_forward_v4: pkt dropped/deferred" \
+			    " during IPPF processing\n"));
+			return;
+		}
+	}
+
+	pkt_len = ira->ira_pktlen;
+
+	BUMP_MIB(dst_ill->ill_ip_mib, ipIfStatsHCOutForwDatagrams);
+
+	ixaflags = IXAF_IS_IPV4 | IXAF_NO_DEV_FLOW_CTL;
+
+	if (pkt_len > mtu) {
+		/*
+		 * It needs fragging on its way out. If we haven't
+		 * verified the header checksum yet we do it now since
+		 * are going to put a surely good checksum in the
+		 * outgoing header, we have to make sure that it
+		 * was good coming in.
+		 */
+		if ((iraflags & IRAF_VERIFY_IP_CKSUM) && ip_csum_hdr(ipha)) {
+			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs);
+			ip_drop_input("ipIfStatsInCksumErrs", mp, ill);
+			freemsg(mp);
+			return;
+		}
+		/* DF set: can't fragment, report path MTU via ICMP instead */
+		if (ipha->ipha_fragment_offset_and_flags & IPH_DF_HTONS) {
+			BUMP_MIB(dst_ill->ill_ip_mib, ipIfStatsOutFragFails);
+			ip_drop_output("ipIfStatsOutFragFails", mp, dst_ill);
+			if (iraflags & IRAF_SYSTEM_LABELED) {
+				/*
+				 * Remove any CIPSO option added by
+				 * tsol_ip_forward, and make sure we report
+				 * a path MTU so that there
+				 * is room to add such a CIPSO option for future
+				 * packets.
+				 */
+				mtu = tsol_pmtu_adjust(mp, mtu, added_tx_len,
+				    AF_INET);
+			}
+
+			icmp_frag_needed(mp, mtu, ira);
+			return;
+		}
+
+		(void) ip_fragment_v4(mp, nce, ixaflags, pkt_len, mtu,
+		    ira->ira_xmit_hint, GLOBAL_ZONEID, 0, ip_xmit, NULL);
+		return;
+	}
+
+	ASSERT(pkt_len == ntohs(((ipha_t *)mp->b_rptr)->ipha_length));
+	if (iraflags & IRAF_LOOPBACK_COPY) {
+		/*
+		 * IXAF_NO_LOOP_ZONEID is not set hence 7th arg
+		 * is don't care
+		 */
+		(void) ip_postfrag_loopcheck(mp, nce,
+		    ixaflags | IXAF_LOOPBACK_COPY,
+		    pkt_len, ira->ira_xmit_hint, GLOBAL_ZONEID, 0, NULL);
+	} else {
+		(void) ip_xmit(mp, nce, ixaflags, pkt_len, ira->ira_xmit_hint,
+		    GLOBAL_ZONEID, 0, NULL);
+	}
+}
+
+/*
+ * ire_recvfn for RTF_REJECT and RTF_BLACKHOLE routes, including IRE_NOROUTE,
+ * which is what ire_route_recursive returns when there is no matching ire.
+ * Send ICMP unreachable unless blackhole.
+ *
+ * Also notifies routing sockets of the miss (RTM_MISS) and updates the
+ * forwarding MIB counters. Consumes mp on all paths.
+ */
+void
+ire_recv_noroute_v4(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira)
+{
+	ipha_t		*ipha = (ipha_t *)iph_arg;
+	ill_t		*ill = ira->ira_ill;
+	ip_stack_t	*ipst = ill->ill_ipst;
+
+	/* Would we have forwarded this packet if we had a route? */
+	if (ira->ira_flags & (IRAF_L2DST_MULTICAST|IRAF_L2DST_BROADCAST)) {
+		BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
+		ip_drop_input("l2 multicast not forwarded", mp, ill);
+		freemsg(mp);
+		return;
+	}
+
+	if (!(ill->ill_flags & ILLF_ROUTER)) {
+		BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
+		ip_drop_input("ipIfStatsForwProhibits", mp, ill);
+		freemsg(mp);
+		return;
+	}
+	/*
+	 * If we had a route this could have been forwarded. Count as such.
+	 *
+	 * ipIfStatsHCInForwDatagrams should only be incremented if there
+	 * will be an attempt to forward the packet, which is why we
+	 * increment after the above condition has been checked.
+	 */
+	BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams);
+
+	BUMP_MIB(ill->ill_ip_mib, ipIfStatsInNoRoutes);
+
+	/* Tell routing sockets (e.g. routed) about the lookup failure */
+	ip_rts_change(RTM_MISS, ipha->ipha_dst, 0, 0, 0, 0, 0, 0, RTA_DST,
+	    ipst);
+
+	if (ire->ire_flags & RTF_BLACKHOLE) {
+		ip_drop_input("ipIfStatsInNoRoutes RTF_BLACKHOLE", mp, ill);
+		freemsg(mp);
+	} else {
+		ip_drop_input("ipIfStatsInNoRoutes RTF_REJECT", mp, ill);
+
+		if (ip_source_routed(ipha, ipst)) {
+			icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, ira);
+		} else {
+			icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, ira);
+		}
+	}
+}
+
+/*
+ * ire_recvfn for IRE_LOCALs marked with ire_noaccept, which VRRP uses when
+ * running in noaccept mode.
+ * The packet is dropped silently (with a discard MIB bump); ARP continues
+ * to service such addresses even when noaccept is set.
+ */
+/* ARGSUSED */
+void
+ire_recv_noaccept_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
+    ip_recv_attr_t *ira)
+{
+	ill_t	*in_ill = ira->ira_ill;
+
+	BUMP_MIB(in_ill->ill_ip_mib, ipIfStatsInDiscards);
+	ip_drop_input("ipIfStatsInDiscards - noaccept", mp, in_ill);
+	freemsg(mp);
+}
+
+/*
+ * ire_recvfn for IRE_BROADCAST.
+ *
+ * Handles local broadcast delivery (including IPMP nomination and the
+ * CGTP/multirt special case) and, when ire->ire_ill differs from the
+ * arrival ill, directed broadcast forwarding. Consumes mp on all paths.
+ */
+void
+ire_recv_broadcast_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
+    ip_recv_attr_t *ira)
+{
+	ipha_t		*ipha = (ipha_t *)iph_arg;
+	ill_t		*ill = ira->ira_ill;
+	ill_t		*dst_ill = ire->ire_ill;
+	ip_stack_t	*ipst = ill->ill_ipst;
+	ire_t		*alt_ire;
+	nce_t		*nce;
+	ipaddr_t	ipha_dst;
+
+	BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInBcastPkts);
+
+	/* Tag for higher-level protocols */
+	ira->ira_flags |= IRAF_BROADCAST;
+
+	/*
+	 * Whether local or directed broadcast forwarding: don't allow
+	 * for TCP.
+	 */
+	if (ipha->ipha_protocol == IPPROTO_TCP) {
+		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+		ip_drop_input("ipIfStatsInDiscards", mp, ill);
+		freemsg(mp);
+		return;
+	}
+
+	/*
+	 * So that we don't end up with dups, only one ill in an IPMP group is
+	 * nominated to receive broadcast traffic.
+	 * If we have no cast_ill we are liberal and accept everything.
+	 */
+	if (IS_UNDER_IPMP(ill)) {
+		/* For an under ill_grp can change under lock */
+		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
+		if (!ill->ill_nom_cast && ill->ill_grp != NULL &&
+		    ill->ill_grp->ig_cast_ill != NULL) {
+			rw_exit(&ipst->ips_ill_g_lock);
+			/* No MIB since this is normal operation */
+			ip_drop_input("not nom_cast", mp, ill);
+			freemsg(mp);
+			return;
+		}
+		rw_exit(&ipst->ips_ill_g_lock);
+
+		ira->ira_ruifindex = ill_get_upper_ifindex(ill);
+	}
+
+	/*
+	 * After reassembly and IPsec we will need to duplicate the
+	 * broadcast packet for all matching zones on the ill.
+	 */
+	ira->ira_zoneid = ALL_ZONES;
+
+	/*
+	 * Check for directed broadcast i.e. ire->ire_ill is different than
+	 * the incoming ill.
+	 * The same broadcast address can be assigned to multiple interfaces
+	 * so have to check explicitly for that case by looking up the alt_ire
+	 */
+	if (dst_ill == ill && !(ire->ire_flags & RTF_MULTIRT)) {
+		/* Reassemble on the ill on which the packet arrived */
+		ip_input_local_v4(ire, mp, ipha, ira);
+		/* Restore */
+		ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex;
+		return;
+	}
+
+	/* Is there an IRE_BROADCAST on the incoming ill? */
+	ipha_dst = ((ira->ira_flags & IRAF_DHCP_UNICAST) ? INADDR_BROADCAST :
+	    ipha->ipha_dst);
+	alt_ire = ire_ftable_lookup_v4(ipha_dst, 0, 0, IRE_BROADCAST, ill,
+	    ALL_ZONES, ira->ira_tsl,
+	    MATCH_IRE_TYPE|MATCH_IRE_ILL|MATCH_IRE_SECATTR, 0, ipst, NULL);
+	if (alt_ire != NULL) {
+		/* Not a directed broadcast */
+		/*
+		 * In the special case of multirouted broadcast
+		 * packets, we unconditionally need to "gateway"
+		 * them to the appropriate interface here so that reassembly
+		 * works. We know that the IRE_BROADCAST on cgtp0 doesn't
+		 * have RTF_MULTIRT set so we look for such an IRE in the
+		 * bucket.
+		 */
+		if (alt_ire->ire_flags & RTF_MULTIRT) {
+			irb_t		*irb;
+			ire_t		*ire1;
+
+			irb = ire->ire_bucket;
+			irb_refhold(irb);
+			for (ire1 = irb->irb_ire; ire1 != NULL;
+			    ire1 = ire1->ire_next) {
+				if (IRE_IS_CONDEMNED(ire1))
+					continue;
+				if (!(ire1->ire_type & IRE_BROADCAST) ||
+				    (ire1->ire_flags & RTF_MULTIRT))
+					continue;
+				ill = ire1->ire_ill;
+				ill_refhold(ill);
+				break;
+			}
+			irb_refrele(irb);
+			if (ire1 != NULL) {
+				ill_t *orig_ill = ira->ira_ill;
+
+				ire_refrele(alt_ire);
+				/* Reassemble on the new ill */
+				ira->ira_ill = ill;
+				ip_input_local_v4(ire, mp, ipha, ira);
+				ill_refrele(ill);
+				/* Restore */
+				ira->ira_ill = orig_ill;
+				ira->ira_ruifindex =
+				    orig_ill->ill_phyint->phyint_ifindex;
+				return;
+			}
+		}
+		ire_refrele(alt_ire);
+		/* Reassemble on the ill on which the packet arrived */
+		ip_input_local_v4(ire, mp, ipha, ira);
+		goto done;
+	}
+
+	/*
+	 * This is a directed broadcast
+	 *
+	 * If directed broadcast is allowed, then forward the packet out
+	 * the destination interface with IXAF_LOOPBACK_COPY set. That will
+	 * result in ip_input() receiving a copy of the packet on the
+	 * appropriate ill. (We could optimize this to avoid the extra trip
+	 * via ip_input(), but since directed broadcasts are normally disabled
+	 * it doesn't make sense to optimize it.)
+	 */
+	if (!ipst->ips_ip_g_forward_directed_bcast ||
+	    (ira->ira_flags & (IRAF_L2DST_MULTICAST|IRAF_L2DST_BROADCAST))) {
+		ip_drop_input("directed broadcast not allowed", mp, ill);
+		freemsg(mp);
+		goto done;
+	}
+	if ((ira->ira_flags & IRAF_VERIFY_IP_CKSUM) && ip_csum_hdr(ipha)) {
+		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs);
+		ip_drop_input("ipIfStatsInCksumErrs", mp, ill);
+		freemsg(mp);
+		goto done;
+	}
+
+	/*
+	 * Clear the indication that this may have hardware
+	 * checksum as we are not using it for forwarding.
+	 */
+	DB_CKSUMFLAGS(mp) = 0;
+
+	/*
+	 * Adjust ttl to 2 (1+1 - the forward engine will decrement it by one.
+	 */
+	ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl + 1;
+	ipha->ipha_hdr_checksum = 0;
+	ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
+
+	/*
+	 * We use ip_forward_xmit to do any fragmentation.
+	 * and loopback copy on the outbound interface.
+	 *
+	 * Make it so that IXAF_LOOPBACK_COPY to be set on transmit side.
+	 */
+	ira->ira_flags |= IRAF_LOOPBACK_COPY;
+
+	nce = arp_nce_init(dst_ill, ipha->ipha_dst, IRE_BROADCAST);
+	if (nce == NULL) {
+		BUMP_MIB(dst_ill->ill_ip_mib, ipIfStatsOutDiscards);
+		ip_drop_output("No nce", mp, dst_ill);
+		freemsg(mp);
+		goto done;
+	}
+
+	ip_forward_xmit_v4(nce, ill, mp, ipha, ira, dst_ill->ill_mtu, 0);
+	nce_refrele(nce);
+done:
+	/* Restore */
+	ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex;
+}
+
+/*
+ * ire_recvfn for IRE_MULTICAST.
+ *
+ * Updates multicast MIB counters, enforces IPMP cast-ill nomination
+ * (switching to the upper ill when needed), gives the multicast routing
+ * daemon (ip_mforward) a look at the packet, and delivers locally via
+ * ip_input_local_v4() when there are members. Consumes mp on all paths.
+ */
+void
+ire_recv_multicast_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
+    ip_recv_attr_t *ira)
+{
+	ipha_t		*ipha = (ipha_t *)iph_arg;
+	ill_t		*ill = ira->ira_ill;
+	ip_stack_t	*ipst = ill->ill_ipst;
+
+	ASSERT(ire->ire_ill == ira->ira_ill);
+
+	BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInMcastPkts);
+	UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInMcastOctets, ira->ira_pktlen);
+
+	/* RSVP hook */
+	if (ira->ira_flags & IRAF_RSVP)
+		goto forus;
+
+	/* Tag for higher-level protocols */
+	ira->ira_flags |= IRAF_MULTICAST;
+
+	/*
+	 * So that we don't end up with dups, only one ill in an IPMP group is
+	 * nominated to receive multicast traffic.
+	 * If we have no cast_ill we are liberal and accept everything.
+	 */
+	if (IS_UNDER_IPMP(ill)) {
+		/*
+		 * Note: ipst was initialized from this same ill above, so
+		 * there is no need for a local ill_ipst lookup here.
+		 * For an under ill_grp can change under lock.
+		 */
+		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
+		if (!ill->ill_nom_cast && ill->ill_grp != NULL &&
+		    ill->ill_grp->ig_cast_ill != NULL) {
+			rw_exit(&ipst->ips_ill_g_lock);
+			ip_drop_input("not on cast ill", mp, ill);
+			freemsg(mp);
+			return;
+		}
+		rw_exit(&ipst->ips_ill_g_lock);
+		/*
+		 * We switch to the upper ill so that mrouter and hasmembers
+		 * can operate on upper here and in ip_input_multicast.
+		 */
+		ill = ipmp_ill_hold_ipmp_ill(ill);
+		if (ill != NULL) {
+			ASSERT(ill != ira->ira_ill);
+			ASSERT(ire->ire_ill == ira->ira_ill);
+			ira->ira_ill = ill;
+			ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex;
+		} else {
+			ill = ira->ira_ill;
+		}
+	}
+
+	/*
+	 * Check if we are a multicast router - send ip_mforward a copy of
+	 * the packet.
+	 * Due to mroute_decap tunnels we consider forwarding packets even if
+	 * mrouted has not joined the allmulti group on this interface.
+	 */
+	if (ipst->ips_ip_g_mrouter) {
+		int retval;
+
+		/*
+		 * Clear the indication that this may have hardware
+		 * checksum as we are not using it for forwarding.
+		 */
+		DB_CKSUMFLAGS(mp) = 0;
+
+		/*
+		 * ip_mforward helps us make these distinctions: If received
+		 * on tunnel and not IGMP, then drop.
+		 * If IGMP packet, then don't check membership
+		 * If received on a phyint and IGMP or PIM, then
+		 * don't check membership
+		 */
+		retval = ip_mforward(mp, ira);
+		/* ip_mforward updates mib variables if needed */
+
+		switch (retval) {
+		case 0:
+			/*
+			 * pkt is okay and arrived on phyint.
+			 *
+			 * If we are running as a multicast router
+			 * we need to see all IGMP and/or PIM packets.
+			 */
+			if ((ipha->ipha_protocol == IPPROTO_IGMP) ||
+			    (ipha->ipha_protocol == IPPROTO_PIM)) {
+				goto forus;
+			}
+			break;
+		case -1:
+			/* pkt is mal-formed, toss it */
+			freemsg(mp);
+			goto done;
+		case 1:
+			/*
+			 * pkt is okay and arrived on a tunnel
+			 *
+			 * If we are running a multicast router
+			 * we need to see all igmp packets.
+			 */
+			if (ipha->ipha_protocol == IPPROTO_IGMP) {
+				goto forus;
+			}
+			ip_drop_input("Multicast on tunnel ignored", mp, ill);
+			freemsg(mp);
+			goto done;
+		}
+	}
+
+	/*
+	 * Check if we have members on this ill. This is not necessary for
+	 * correctness because even if the NIC/GLD had a leaky filter, we
+	 * filter before passing to each conn_t.
+	 */
+	if (!ill_hasmembers_v4(ill, ipha->ipha_dst)) {
+		/*
+		 * Nobody interested
+		 *
+		 * This might just be caused by the fact that
+		 * multiple IP Multicast addresses map to the same
+		 * link layer multicast - no need to increment counter!
+		 */
+		ip_drop_input("Multicast with no members", mp, ill);
+		freemsg(mp);
+		goto done;
+	}
+forus:
+	ip2dbg(("ire_recv_multicast_v4: multicast for us: 0x%x\n",
+	    ntohl(ipha->ipha_dst)));
+
+	/*
+	 * After reassembly and IPsec we will need to duplicate the
+	 * multicast packet for all matching zones on the ill.
+	 */
+	ira->ira_zoneid = ALL_ZONES;
+
+	/* Reassemble on the ill on which the packet arrived */
+	ip_input_local_v4(ire, mp, ipha, ira);
+done:
+	/* If we switched to the IPMP upper ill, drop its hold and restore */
+	if (ill != ire->ire_ill) {
+		ill_refrele(ill);
+		ira->ira_ill = ire->ire_ill;
+		ira->ira_ruifindex = ira->ira_ill->ill_phyint->phyint_ifindex;
+	}
+}
+
+/*
+ * ire_recvfn for IRE_OFFLINK IREs carrying RTF_MULTIRT.
+ * Packets are never forwarded out multirt routes, so count the packet as
+ * having no route and drop it.
+ */
+/* ARGSUSED */
+void
+ire_recv_multirt_v4(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira)
+{
+	ill_t	*in_ill = ira->ira_ill;
+
+	BUMP_MIB(in_ill->ill_ip_mib, ipIfStatsInNoRoutes);
+	ip_drop_input("Not forwarding out MULTIRT", mp, in_ill);
+	freemsg(mp);
+}
+
+/*
+ * ire_recvfn for IRE_LOOPBACK. This is only used when a FW_HOOK
+ * has rewritten the packet to have a loopback destination address (packets
+ * with a loopback destination arriving over the wire are filtered out
+ * earlier). Since the zone cannot be determined here, GLOBAL_ZONEID is
+ * always used.
+ */
+void
+ire_recv_loopback_v4(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira)
+{
+	ipha_t	*ipha = (ipha_t *)iph_arg;
+	ill_t	*orig_ill = ira->ira_ill;
+	ill_t	*lo_ill = ire->ire_ill;
+
+	ira->ira_zoneid = GLOBAL_ZONEID;
+
+	if (lo_ill == orig_ill) {
+		/* Already on the ill hosting the loopback address */
+		ip_input_local_v4(ire, mp, ipha, ira);
+		return;
+	}
+
+	/*
+	 * Switch ira_ill to the ILL on which the IP address is hosted.
+	 * No ill refhold is needed since we hold the ire.
+	 */
+	ASSERT(ira->ira_ill == ira->ira_rill);
+	ira->ira_ill = lo_ill;
+
+	ip_input_local_v4(ire, mp, ipha, ira);
+
+	/* Restore the arrival ill for the caller */
+	ASSERT(ira->ira_ill == lo_ill);
+	ira->ira_ill = orig_ill;
+}
+
+/*
+ * ire_recvfn for IRE_LOCAL.
+ *
+ * Delivers a packet addressed to one of this node's addresses. If it
+ * arrived on an ill other than the one hosting the address, verifies the
+ * multihoming rules via ip_check_multihome() before delivering on the
+ * hosting ill. Consumes mp on all paths.
+ */
+void
+ire_recv_local_v4(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira)
+{
+	ipha_t		*ipha = (ipha_t *)iph_arg;
+	ill_t		*ill = ira->ira_ill;
+	ill_t		*ire_ill = ire->ire_ill;
+
+	/* Make a note for DAD that this address is in use */
+	ire->ire_last_used_time = lbolt;
+
+	/* Only target the IRE_LOCAL with the right zoneid. */
+	ira->ira_zoneid = ire->ire_zoneid;
+
+	/*
+	 * If the packet arrived on the wrong ill, we check that
+	 * this is ok.
+	 * If it is, then we ensure that we do the reassembly on
+	 * the ill on which the address is hosted. We keep ira_rill as
+	 * the one on which the packet arrived, so that IP_PKTINFO and
+	 * friends can report this.
+	 */
+	if (ire_ill != ill) {
+		ire_t *new_ire;
+
+		new_ire = ip_check_multihome(&ipha->ipha_dst, ire, ill);
+		if (new_ire == NULL) {
+			/* Drop packet */
+			BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
+			ip_drop_input("ipIfStatsInForwProhibits", mp, ill);
+			freemsg(mp);
+			return;
+		}
+		/*
+		 * Update ira_ill to be the ILL on which the IP address
+		 * is hosted. No need to hold the ill since we have a
+		 * hold on the ire. Note that we do the switch even if
+		 * new_ire == ire (for IPMP, ire would be the one corresponding
+		 * to the IPMP ill).
+		 */
+		ASSERT(ira->ira_ill == ira->ira_rill);
+		ira->ira_ill = new_ire->ire_ill;
+
+		/* ira_ruifindex tracks the upper for ira_rill */
+		if (IS_UNDER_IPMP(ill))
+			ira->ira_ruifindex = ill_get_upper_ifindex(ill);
+
+		ip_input_local_v4(new_ire, mp, ipha, ira);
+
+		/* Restore */
+		ASSERT(ira->ira_ill == new_ire->ire_ill);
+		ira->ira_ill = ill;
+		ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex;
+
+		if (new_ire != ire)
+			ire_refrele(new_ire);
+		return;
+	}
+
+	ip_input_local_v4(ire, mp, ipha, ira);
+}
+
+/*
+ * Common function for packets arriving for the host. Handles
+ * checksum verification, reassembly checks, etc.
+ *
+ * Verifies the IP header checksum (unless already done by hardware or
+ * IPsec), processes local delivery options, reassembles fragments, then
+ * fans out to the transports - via the broadcast/multicast zone-replication
+ * helpers when IRAF_MULTIBROADCAST is set. Consumes mp on all paths.
+ */
+static void
+ip_input_local_v4(ire_t *ire, mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
+{
+	ill_t		*ill = ira->ira_ill;
+	iaflags_t	iraflags = ira->ira_flags;
+
+	/*
+	 * Verify IP header checksum. If the packet was AH or ESP then
+	 * this flag has already been cleared. Likewise if the packet
+	 * had a hardware checksum.
+	 */
+	if ((iraflags & IRAF_VERIFY_IP_CKSUM) && ip_csum_hdr(ipha)) {
+		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs);
+		ip_drop_input("ipIfStatsInCksumErrs", mp, ill);
+		freemsg(mp);
+		return;
+	}
+
+	if (iraflags & IRAF_IPV4_OPTIONS) {
+		if (!ip_input_local_options(mp, ipha, ira)) {
+			/* Error has been sent and mp consumed */
+			return;
+		}
+	}
+
+	/*
+	 * Is packet part of fragmented IP packet?
+	 * We compare against defined values in network byte order
+	 */
+	if (ipha->ipha_fragment_offset_and_flags &
+	    (IPH_MF_HTONS | IPH_OFFSET_HTONS)) {
+		/*
+		 * Make sure we have ira_l2src before we lose the original
+		 * mblk
+		 */
+		if (!(ira->ira_flags & IRAF_L2SRC_SET))
+			ip_setl2src(mp, ira, ira->ira_rill);
+
+		mp = ip_input_fragment(mp, ipha, ira);
+		if (mp == NULL)
+			return;
+		/* Completed reassembly */
+		ipha = (ipha_t *)mp->b_rptr;
+	}
+
+	/*
+	 * For broadcast and multicast we need some extra work before
+	 * we call ip_fanout_v4(), since in the case of shared-IP zones
+	 * we need to pretend that a packet arrived for each zoneid.
+	 */
+	if (iraflags & IRAF_MULTIBROADCAST) {
+		if (iraflags & IRAF_BROADCAST)
+			ip_input_broadcast_v4(ire, mp, ipha, ira);
+		else
+			ip_input_multicast_v4(ire, mp, ipha, ira);
+		return;
+	}
+	ip_fanout_v4(mp, ipha, ira);
+}
+
+
+/*
+ * Handle multiple zones which match the same broadcast address
+ * and ill by delivering a packet to each of them.
+ * Walk the bucket and look for different ire_zoneid but otherwise
+ * the same IRE (same ill/addr/mask/type).
+ * Note that ire_add() tracks IREs that are identical in all
+ * fields (addr/mask/type/gw/ill/zoneid) within a single IRE by
+ * increasing ire_identical_cnt. Thus we don't need to be concerned
+ * about those.
+ *
+ * A copy of the packet (copymsg) is fanned out for each additional zone;
+ * the original mp is delivered last for the passed-in ire's zone.
+ */
+static void
+ip_input_broadcast_v4(ire_t *ire, mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
+{
+	ill_t		*ill = ira->ira_ill;
+	ip_stack_t	*ipst = ill->ill_ipst;
+	netstack_t	*ns = ipst->ips_netstack;
+	irb_t		*irb;
+	ire_t		*ire1;
+	mblk_t		*mp1;
+	ipha_t		*ipha1;
+
+	irb = ire->ire_bucket;
+
+	/*
+	 * If we don't have more than one shared-IP zone, or if
+	 * there can't be more than one IRE_BROADCAST for this
+	 * IP address, then just set the zoneid and proceed.
+	 */
+	if (ns->netstack_numzones == 1 || irb->irb_ire_cnt == 1) {
+		ira->ira_zoneid = ire->ire_zoneid;
+
+		ip_fanout_v4(mp, ipha, ira);
+		return;
+	}
+	irb_refhold(irb);
+	for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
+		/* We do the main IRE after the end of the loop */
+		if (ire1 == ire)
+			continue;
+
+		/*
+		 * Only IREs for the same IP address should be in the same
+		 * bucket.
+		 * But could have IRE_HOSTs in the case of CGTP.
+		 */
+		ASSERT(ire1->ire_addr == ire->ire_addr);
+		if (!(ire1->ire_type & IRE_BROADCAST))
+			continue;
+
+		if (IRE_IS_CONDEMNED(ire1))
+			continue;
+
+		mp1 = copymsg(mp);
+		if (mp1 == NULL) {
+			/* Failed to deliver to one zone */
+			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+			ip_drop_input("ipIfStatsInDiscards", mp, ill);
+			continue;
+		}
+		ira->ira_zoneid = ire1->ire_zoneid;
+		ipha1 = (ipha_t *)mp1->b_rptr;
+		ip_fanout_v4(mp1, ipha1, ira);
+	}
+	irb_refrele(irb);
+	/* Do the main ire */
+	ira->ira_zoneid = ire->ire_zoneid;
+	ip_fanout_v4(mp, ipha, ira);
+}
+
+/*
+ * Handle multiple zones which want to receive the same multicast packets
+ * on this ill by delivering a packet to each of them.
+ *
+ * Note that for packets delivered to transports we could instead do this
+ * as part of the fanout code, but since we need to handle icmp_inbound
+ * it is simpler to have multicast work the same as broadcast.
+ *
+ * The ip_fanout matching for multicast matches based on ilm independent of
+ * zoneid since the zoneid restriction is applied when joining a multicast
+ * group.
+ */
+/* ARGSUSED */
+static void
+ip_input_multicast_v4(ire_t *ire, mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
+{
+ ill_t *ill = ira->ira_ill;
+ iaflags_t iraflags = ira->ira_flags;
+ ip_stack_t *ipst = ill->ill_ipst;
+ netstack_t *ns = ipst->ips_netstack;
+ zoneid_t zoneid;
+ mblk_t *mp1;
+ ipha_t *ipha1;
+
+ /* ire_recv_multicast has switched to the upper ill for IPMP */
+ ASSERT(!IS_UNDER_IPMP(ill));
+
+ /*
+ * If we don't have more than one shared-IP zone, or if
+ * there are no members in anything but the global zone,
+ * then just set the zoneid and proceed.
+ */
+ if (ns->netstack_numzones == 1 ||
+ !ill_hasmembers_otherzones_v4(ill, ipha->ipha_dst,
+ GLOBAL_ZONEID)) {
+ ira->ira_zoneid = GLOBAL_ZONEID;
+
+ /* If sender didn't want this zone to receive it, drop */
+ if ((iraflags & IRAF_NO_LOOP_ZONEID_SET) &&
+ ira->ira_no_loop_zoneid == ira->ira_zoneid) {
+ ip_drop_input("Multicast but wrong zoneid", mp, ill);
+ freemsg(mp);
+ return;
+ }
+ ip_fanout_v4(mp, ipha, ira);
+ return;
+ }
+
+ /*
+ * Here we loop over all zoneids that have members in the group
+ * and deliver a packet to ip_fanout for each zoneid.
+ *
+ * First find any members in the lowest numeric zoneid by looking for
+ * first zoneid larger than -1 (ALL_ZONES).
+ * We terminate the loop when we receive -1 (ALL_ZONES).
+ */
+ zoneid = ill_hasmembers_nextzone_v4(ill, ipha->ipha_dst, ALL_ZONES);
+ for (; zoneid != ALL_ZONES;
+ zoneid = ill_hasmembers_nextzone_v4(ill, ipha->ipha_dst, zoneid)) {
+ /*
+ * Avoid an extra copymsg/freemsg by skipping global zone here
+ * and doing that at the end.
+ */
+ if (zoneid == GLOBAL_ZONEID)
+ continue;
+
+ ira->ira_zoneid = zoneid;
+
+ /* If sender didn't want this zone to receive it, skip */
+ if ((iraflags & IRAF_NO_LOOP_ZONEID_SET) &&
+ ira->ira_no_loop_zoneid == ira->ira_zoneid)
+ continue;
+
+ mp1 = copymsg(mp);
+ if (mp1 == NULL) {
+ /* Failed to deliver to one zone */
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards", mp, ill);
+ /* Original mp is still delivered to GLOBAL_ZONEID below */
+ continue;
+ }
+ ipha1 = (ipha_t *)mp1->b_rptr;
+ ip_fanout_v4(mp1, ipha1, ira);
+ }
+
+ /* Do the main ire */
+ ira->ira_zoneid = GLOBAL_ZONEID;
+ /* If sender didn't want this zone to receive it, drop */
+ if ((iraflags & IRAF_NO_LOOP_ZONEID_SET) &&
+ ira->ira_no_loop_zoneid == ira->ira_zoneid) {
+ ip_drop_input("Multicast but wrong zoneid", mp, ill);
+ freemsg(mp);
+ } else {
+ ip_fanout_v4(mp, ipha, ira);
+ }
+}
+
+
+/*
+ * Determine the zoneid and IRAF_TX_* flags if trusted extensions
+ * is in use. Updates ira_zoneid and ira_flags as a result.
+ */
+static void
+ip_fanout_tx_v4(mblk_t *mp, ipha_t *ipha, uint8_t protocol,
+ uint_t ip_hdr_length, ip_recv_attr_t *ira)
+{
+ uint16_t *up;
+ uint16_t lport;
+ zoneid_t zoneid;
+
+ ASSERT(ira->ira_flags & IRAF_SYSTEM_LABELED);
+
+ /*
+ * If the packet is unlabeled we might allow read-down
+ * for MAC_EXEMPT. Below we clear this if it is a multi-level
+ * port (MLP).
+ * Note that ira_tsl can be NULL here.
+ */
+ if (ira->ira_tsl != NULL && ira->ira_tsl->tsl_flags & TSLF_UNLABELED)
+ ira->ira_flags |= IRAF_TX_MAC_EXEMPTABLE;
+
+ /* A zoneid already resolved means this is not a shared address */
+ if (ira->ira_zoneid != ALL_ZONES)
+ return;
+
+ ira->ira_flags |= IRAF_TX_SHARED_ADDR;
+
+ /* The ULP header (and its ports, if any) follows the IP header */
+ up = (uint16_t *)((uchar_t *)ipha + ip_hdr_length);
+ switch (protocol) {
+ case IPPROTO_TCP:
+ case IPPROTO_SCTP:
+ case IPPROTO_UDP:
+ /* Caller ensures this */
+ ASSERT(((uchar_t *)ipha) + ip_hdr_length +4 <= mp->b_wptr);
+
+ /*
+ * Only these transports support MLP.
+ * We know their destination port numbers is in
+ * the same place in the header.
+ */
+ lport = up[1];
+
+ /*
+ * No need to handle exclusive-stack zones
+ * since ALL_ZONES only applies to the shared IP instance.
+ */
+ zoneid = tsol_mlp_findzone(protocol, lport);
+ /*
+ * If no shared MLP is found, tsol_mlp_findzone returns
+ * ALL_ZONES. In that case, we assume it's SLP, and
+ * search for the zone based on the packet label.
+ *
+ * If there is such a zone, we prefer to find a
+ * connection in it. Otherwise, we look for a
+ * MAC-exempt connection in any zone whose label
+ * dominates the default label on the packet.
+ */
+ if (zoneid == ALL_ZONES)
+ zoneid = tsol_attr_to_zoneid(ira);
+ else
+ ira->ira_flags &= ~IRAF_TX_MAC_EXEMPTABLE;
+ break;
+ default:
+ /* Handle shared address for other protocols */
+ zoneid = tsol_attr_to_zoneid(ira);
+ break;
+ }
+ ira->ira_zoneid = zoneid;
+}
+
+/*
+ * Increment checksum failure statistics
+ */
+static void
+ip_input_cksum_err_v4(uint8_t protocol, uint16_t hck_flags, ill_t *ill)
+{
+ ip_stack_t *ipst = ill->ill_ipst;
+
+ /* Attribute the failure to hardware full/partial or software cksum */
+ switch (protocol) {
+ case IPPROTO_TCP:
+ BUMP_MIB(ill->ill_ip_mib, tcpIfStatsInErrs);
+
+ if (hck_flags & HCK_FULLCKSUM)
+ IP_STAT(ipst, ip_tcp_in_full_hw_cksum_err);
+ else if (hck_flags & HCK_PARTIALCKSUM)
+ IP_STAT(ipst, ip_tcp_in_part_hw_cksum_err);
+ else
+ IP_STAT(ipst, ip_tcp_in_sw_cksum_err);
+ break;
+ case IPPROTO_UDP:
+ BUMP_MIB(ill->ill_ip_mib, udpIfStatsInCksumErrs);
+ if (hck_flags & HCK_FULLCKSUM)
+ IP_STAT(ipst, ip_udp_in_full_hw_cksum_err);
+ else if (hck_flags & HCK_PARTIALCKSUM)
+ IP_STAT(ipst, ip_udp_in_part_hw_cksum_err);
+ else
+ IP_STAT(ipst, ip_udp_in_sw_cksum_err);
+ break;
+ case IPPROTO_ICMP:
+ BUMP_MIB(&ipst->ips_icmp_mib, icmpInCksumErrs);
+ break;
+ default:
+ /* Callers only pass protocols with a verified ULP checksum */
+ ASSERT(0);
+ break;
+ }
+}
+
+/* Calculate the IPv4 pseudo-header checksum */
+uint32_t
+ip_input_cksum_pseudo_v4(ipha_t *ipha, ip_recv_attr_t *ira)
+{
+ uint_t ulp_len;
+ uint32_t cksum;
+ uint8_t protocol = ira->ira_protocol;
+ uint16_t ip_hdr_length = ira->ira_ip_hdr_length;
+
+/* View the IP header as 16-bit words; iphs[6..9] are src/dst addresses */
+#define iphs ((uint16_t *)ipha)
+
+ switch (protocol) {
+ case IPPROTO_TCP:
+ /* TCP has no length field of its own; derive it from the IP length */
+ ulp_len = ira->ira_pktlen - ip_hdr_length;
+
+ /* Protocol and length */
+ cksum = htons(ulp_len) + IP_TCP_CSUM_COMP;
+ /* IP addresses */
+ cksum += iphs[6] + iphs[7] + iphs[8] + iphs[9];
+ break;
+
+ case IPPROTO_UDP: {
+ udpha_t *udpha;
+
+ udpha = (udpha_t *)((uchar_t *)ipha + ip_hdr_length);
+
+ /* Protocol and length */
+ cksum = udpha->uha_length + IP_UDP_CSUM_COMP;
+ /* IP addresses */
+ cksum += iphs[6] + iphs[7] + iphs[8] + iphs[9];
+ break;
+ }
+
+ default:
+ /* No pseudo-header checksum defined for other protocols */
+ cksum = 0;
+ break;
+ }
+#undef iphs
+ return (cksum);
+}
+
+
+/*
+ * Software verification of the ULP checksums.
+ * Returns B_TRUE if ok.
+ * Increments statistics if it failed.
+ */
+static boolean_t
+ip_input_sw_cksum_v4(mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
+{
+ ip_stack_t *ipst = ira->ira_ill->ill_ipst;
+ uint32_t cksum;
+ uint8_t protocol = ira->ira_protocol;
+ uint16_t ip_hdr_length = ira->ira_ip_hdr_length;
+
+ IP_STAT(ipst, ip_in_sw_cksum);
+
+ /* Only TCP and UDP have a pseudo-header based one's-complement sum */
+ ASSERT(protocol == IPPROTO_TCP || protocol == IPPROTO_UDP);
+
+ /* Fold pseudo-header sum with the payload; a valid packet sums to 0 */
+ cksum = ip_input_cksum_pseudo_v4(ipha, ira);
+ cksum = IP_CSUM(mp, ip_hdr_length, cksum);
+ if (cksum == 0)
+ return (B_TRUE);
+
+ ip_input_cksum_err_v4(protocol, 0, ira->ira_ill);
+ return (B_FALSE);
+}
+
+/*
+ * There are drivers that can't do partial checksum with IP options.
+ * When set (the default), partial hardware checksums are only trusted
+ * for packets with a simple (option-less) IP header; see
+ * ip_input_cksum_v4().
+ */
+int eri_cksum_workaround = 1;
+
+/*
+ * Verify the ULP checksums.
+ * Returns B_TRUE if ok, or if the ULP doesn't have a well-defined checksum
+ * algorithm.
+ * Increments statistics if failed.
+ */
+static boolean_t
+ip_input_cksum_v4(iaflags_t iraflags, mblk_t *mp, ipha_t *ipha,
+ ip_recv_attr_t *ira)
+{
+ /* Use the ill the packet actually arrived on for HW cksum checks */
+ ill_t *ill = ira->ira_rill;
+ uint16_t hck_flags;
+ uint32_t cksum;
+ mblk_t *mp1;
+ int32_t len;
+ uint8_t protocol = ira->ira_protocol;
+ uint16_t ip_hdr_length = ira->ira_ip_hdr_length;
+
+
+ switch (protocol) {
+ case IPPROTO_TCP:
+ break;
+
+ case IPPROTO_UDP: {
+ udpha_t *udpha;
+
+ udpha = (udpha_t *)((uchar_t *)ipha + ip_hdr_length);
+ if (udpha->uha_checksum == 0) {
+ /* Packet doesn't have a UDP checksum */
+ return (B_TRUE);
+ }
+ break;
+ }
+ case IPPROTO_SCTP: {
+ sctp_hdr_t *sctph;
+ uint32_t pktsum;
+
+ sctph = (sctp_hdr_t *)((uchar_t *)ipha + ip_hdr_length);
+#ifdef DEBUG
+ if (skip_sctp_cksum)
+ return (B_TRUE);
+#endif
+ /* Verify in place: zero the field, sum, then restore it */
+ pktsum = sctph->sh_chksum;
+ sctph->sh_chksum = 0;
+ cksum = sctp_cksum(mp, ip_hdr_length);
+ sctph->sh_chksum = pktsum;
+ if (cksum == pktsum)
+ return (B_TRUE);
+
+ /*
+ * Defer until later whether a bad checksum is ok
+ * in order to allow RAW sockets to use Adler checksum
+ * with SCTP.
+ */
+ ira->ira_flags |= IRAF_SCTP_CSUM_ERR;
+ return (B_TRUE);
+ }
+
+ default:
+ /* No ULP checksum to verify. */
+ return (B_TRUE);
+ }
+ /*
+ * Revert to software checksum calculation if the interface
+ * isn't capable of checksum offload.
+ * We clear DB_CKSUMFLAGS when going through IPsec in ip_fanout.
+ * Note: IRAF_NO_HW_CKSUM is not currently used.
+ */
+ ASSERT(!IS_IPMP(ill));
+ if ((iraflags & IRAF_NO_HW_CKSUM) || !ILL_HCKSUM_CAPABLE(ill) ||
+ !dohwcksum) {
+ return (ip_input_sw_cksum_v4(mp, ipha, ira));
+ }
+
+ /*
+ * We apply this for all ULP protocols. Does the HW know to
+ * not set the flags for SCTP and other protocols?
+ */
+
+ hck_flags = DB_CKSUMFLAGS(mp);
+
+ if (hck_flags & HCK_FULLCKSUM) {
+ /*
+ * Full checksum has been computed by the hardware
+ * and has been attached. If the driver wants us to
+ * verify the correctness of the attached value, in
+ * order to protect against faulty hardware, compare
+ * it against -0 (0xFFFF) to see if it's valid.
+ */
+ if (hck_flags & HCK_FULLCKSUM_OK)
+ return (B_TRUE);
+
+ cksum = DB_CKSUM16(mp);
+ if (cksum == 0xFFFF)
+ return (B_TRUE);
+ ip_input_cksum_err_v4(protocol, hck_flags, ira->ira_ill);
+ return (B_FALSE);
+ }
+
+ /*
+ * eri_cksum_workaround: only trust a partial HW checksum when the
+ * IP header has no options (see the tunable's comment above).
+ */
+ mp1 = mp->b_cont;
+ if ((hck_flags & HCK_PARTIALCKSUM) &&
+ (mp1 == NULL || mp1->b_cont == NULL) &&
+ ip_hdr_length >= DB_CKSUMSTART(mp) &&
+ (!eri_cksum_workaround || ip_hdr_length == IP_SIMPLE_HDR_LENGTH) &&
+ ((len = ip_hdr_length - DB_CKSUMSTART(mp)) & 1) == 0) {
+ uint32_t adj;
+ uchar_t *cksum_start;
+
+ cksum = ip_input_cksum_pseudo_v4(ipha, ira);
+
+ cksum_start = ((uchar_t *)ipha + DB_CKSUMSTART(mp));
+
+ /*
+ * Partial checksum has been calculated by hardware
+ * and attached to the packet; in addition, any
+ * prepended extraneous data is even byte aligned,
+ * and there are at most two mblks associated with
+ * the packet. If any such data exists, we adjust
+ * the checksum; also take care any postpended data.
+ */
+ IP_ADJCKSUM_PARTIAL(cksum_start, mp, mp1, len, adj);
+ /*
+ * One's complement subtract extraneous checksum
+ */
+ cksum += DB_CKSUM16(mp);
+ if (adj >= cksum)
+ cksum = ~(adj - cksum) & 0xFFFF;
+ else
+ cksum -= adj;
+ /* Fold carries twice; a valid sum ends up all-ones */
+ cksum = (cksum & 0xFFFF) + ((int)cksum >> 16);
+ cksum = (cksum & 0xFFFF) + ((int)cksum >> 16);
+ if (!(~cksum & 0xFFFF))
+ return (B_TRUE);
+
+ ip_input_cksum_err_v4(protocol, hck_flags, ira->ira_ill);
+ return (B_FALSE);
+ }
+ return (ip_input_sw_cksum_v4(mp, ipha, ira));
+}
+
+
+/*
+ * Handle fanout of received packets.
+ * Unicast packets that are looped back (from ire_send_local_v4) and packets
+ * from the wire are differentiated by checking IRAF_VERIFY_ULP_CKSUM.
+ *
+ * IPQoS Notes
+ * Before sending it to the client, invoke IPPF processing. Policy processing
+ * takes place only if the callout_position, IPP_LOCAL_IN, is enabled.
+ */
+void
+ip_fanout_v4(mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
+{
+ ill_t *ill = ira->ira_ill;
+ iaflags_t iraflags = ira->ira_flags;
+ ip_stack_t *ipst = ill->ill_ipst;
+ uint8_t protocol = ipha->ipha_protocol;
+ conn_t *connp;
+#define rptr ((uchar_t *)ipha)
+ uint_t ip_hdr_length;
+ uint_t min_ulp_header_length;
+ int offset;
+ ssize_t len;
+ netstack_t *ns = ipst->ips_netstack;
+ ipsec_stack_t *ipss = ns->netstack_ipsec;
+ ill_t *rill = ira->ira_rill;
+
+ ASSERT(ira->ira_pktlen == ntohs(ipha->ipha_length));
+
+ ip_hdr_length = ira->ira_ip_hdr_length;
+ ira->ira_protocol = protocol;
+
+ /*
+ * Time for IPP once we've done reassembly and IPsec.
+ * We skip this for loopback packets since we don't do IPQoS
+ * on loopback.
+ *
+ * NOTE(review): "(protocol != IPPROTO_ESP || protocol !=
+ * IPPROTO_AH)" is a tautology (always true), so AH/ESP packets
+ * are NOT excluded here; the intent was presumably "&&" --
+ * confirm before changing.
+ */
+ if (IPP_ENABLED(IPP_LOCAL_IN, ipst) &&
+ !(iraflags & IRAF_LOOPBACK) &&
+ (protocol != IPPROTO_ESP || protocol != IPPROTO_AH)) {
+ /*
+ * Use the interface on which the packet arrived - not where
+ * the IP address is hosted.
+ */
+ /* ip_process translates an IS_UNDER_IPMP */
+ mp = ip_process(IPP_LOCAL_IN, mp, rill, ill);
+ if (mp == NULL) {
+ /* ip_drop_packet and MIB done */
+ return;
+ }
+ }
+
+ /* Determine the minimum required size of the upper-layer header */
+ /* Need to do this for at least the set of ULPs that TX handles. */
+ switch (protocol) {
+ case IPPROTO_TCP:
+ min_ulp_header_length = TCP_MIN_HEADER_LENGTH;
+ break;
+ case IPPROTO_SCTP:
+ min_ulp_header_length = SCTP_COMMON_HDR_LENGTH;
+ break;
+ case IPPROTO_UDP:
+ min_ulp_header_length = UDPH_SIZE;
+ break;
+ case IPPROTO_ICMP:
+ min_ulp_header_length = ICMPH_SIZE;
+ break;
+ default:
+ min_ulp_header_length = 0;
+ break;
+ }
+ /* Make sure we have the min ULP header length */
+ len = mp->b_wptr - rptr;
+ if (len < ip_hdr_length + min_ulp_header_length) {
+ if (ira->ira_pktlen < ip_hdr_length + min_ulp_header_length) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
+ ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
+ freemsg(mp);
+ return;
+ }
+ IP_STAT(ipst, ip_recv_pullup);
+ ipha = ip_pullup(mp, ip_hdr_length + min_ulp_header_length,
+ ira);
+ if (ipha == NULL)
+ goto discard;
+ len = mp->b_wptr - rptr;
+ }
+
+ /*
+ * If trusted extensions then determine the zoneid and TX specific
+ * ira_flags.
+ */
+ if (iraflags & IRAF_SYSTEM_LABELED) {
+ /* This can update ira->ira_flags and ira->ira_zoneid */
+ ip_fanout_tx_v4(mp, ipha, protocol, ip_hdr_length, ira);
+ iraflags = ira->ira_flags;
+ }
+
+
+ /* Verify ULP checksum. Handles TCP, UDP, and SCTP */
+ if (iraflags & IRAF_VERIFY_ULP_CKSUM) {
+ if (!ip_input_cksum_v4(iraflags, mp, ipha, ira)) {
+ /* Bad checksum. Stats are already incremented */
+ ip_drop_input("Bad ULP checksum", mp, ill);
+ freemsg(mp);
+ return;
+ }
+ /* IRAF_SCTP_CSUM_ERR could have been set */
+ iraflags = ira->ira_flags;
+ }
+ switch (protocol) {
+ case IPPROTO_TCP:
+ /* For TCP, discard broadcast and multicast packets. */
+ if (iraflags & IRAF_MULTIBROADCAST)
+ goto discard;
+
+ /* First mblk contains IP+TCP headers per above check */
+ ASSERT(len >= ip_hdr_length + TCP_MIN_HEADER_LENGTH);
+
+ /* TCP options present? */
+ offset = ((uchar_t *)ipha)[ip_hdr_length + 12] >> 4;
+ if (offset != 5) {
+ if (offset < 5)
+ goto discard;
+
+ /*
+ * There must be TCP options.
+ * Make sure we can grab them.
+ */
+ offset <<= 2;
+ offset += ip_hdr_length;
+ if (len < offset) {
+ if (ira->ira_pktlen < offset) {
+ BUMP_MIB(ill->ill_ip_mib,
+ ipIfStatsInTruncatedPkts);
+ ip_drop_input(
+ "ipIfStatsInTruncatedPkts",
+ mp, ill);
+ freemsg(mp);
+ return;
+ }
+ IP_STAT(ipst, ip_recv_pullup);
+ ipha = ip_pullup(mp, offset, ira);
+ if (ipha == NULL)
+ goto discard;
+ len = mp->b_wptr - rptr;
+ }
+ }
+
+ /*
+ * Pass up a squeue hint to tcp.
+ * If ira_sqp is already set (this is loopback) we leave it
+ * alone.
+ */
+ if (ira->ira_sqp == NULL) {
+ ira->ira_sqp = ip_squeue_get(ira->ira_ring);
+ }
+
+ /* Look for AF_INET or AF_INET6 that matches */
+ connp = ipcl_classify_v4(mp, IPPROTO_TCP, ip_hdr_length,
+ ira, ipst);
+ if (connp == NULL) {
+ /* Send the TH_RST */
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
+ tcp_xmit_listeners_reset(mp, ira, ipst, NULL);
+ return;
+ }
+ if (connp->conn_incoming_ifindex != 0 &&
+ connp->conn_incoming_ifindex != ira->ira_ruifindex) {
+ CONN_DEC_REF(connp);
+
+ /* Send the TH_RST */
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
+ tcp_xmit_listeners_reset(mp, ira, ipst, NULL);
+ return;
+ }
+ if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) ||
+ (iraflags & IRAF_IPSEC_SECURE)) {
+ mp = ipsec_check_inbound_policy(mp, connp,
+ ipha, NULL, ira);
+ if (mp == NULL) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ /* Note that mp is NULL */
+ ip_drop_input("ipIfStatsInDiscards", mp, ill);
+ CONN_DEC_REF(connp);
+ return;
+ }
+ }
+ /* Found a client; up it goes */
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
+ /* Clear ill pointers while the ULP owns the packet; restored below */
+ ira->ira_ill = ira->ira_rill = NULL;
+ if (!IPCL_IS_TCP(connp)) {
+ /* Not TCP; must be SOCK_RAW, IPPROTO_TCP */
+ (connp->conn_recv)(connp, mp, NULL, ira);
+ CONN_DEC_REF(connp);
+ ira->ira_ill = ill;
+ ira->ira_rill = rill;
+ return;
+ }
+
+ /*
+ * We do different processing whether called from
+ * ip_accept_tcp and we match the target, don't match
+ * the target, and when we are called by ip_input.
+ */
+ if (iraflags & IRAF_TARGET_SQP) {
+ if (ira->ira_target_sqp == connp->conn_sqp) {
+ mblk_t *attrmp;
+
+ attrmp = ip_recv_attr_to_mblk(ira);
+ if (attrmp == NULL) {
+ BUMP_MIB(ill->ill_ip_mib,
+ ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards",
+ mp, ill);
+ freemsg(mp);
+ CONN_DEC_REF(connp);
+ } else {
+ SET_SQUEUE(attrmp, connp->conn_recv,
+ connp);
+ attrmp->b_cont = mp;
+ ASSERT(ira->ira_target_sqp_mp == NULL);
+ ira->ira_target_sqp_mp = attrmp;
+ /*
+ * Conn ref release when drained from
+ * the squeue.
+ */
+ }
+ } else {
+ SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
+ connp->conn_recv, connp, ira, SQ_FILL,
+ SQTAG_IP_TCP_INPUT);
+ }
+ } else {
+ SQUEUE_ENTER_ONE(connp->conn_sqp, mp, connp->conn_recv,
+ connp, ira, ip_squeue_flag, SQTAG_IP_TCP_INPUT);
+ }
+ ira->ira_ill = ill;
+ ira->ira_rill = rill;
+ return;
+
+ case IPPROTO_SCTP: {
+ sctp_hdr_t *sctph;
+ in6_addr_t map_src, map_dst;
+ uint32_t ports; /* Source and destination ports */
+ sctp_stack_t *sctps = ipst->ips_netstack->netstack_sctp;
+
+ /* For SCTP, discard broadcast and multicast packets. */
+ if (iraflags & IRAF_MULTIBROADCAST)
+ goto discard;
+
+ /*
+ * Since there is no SCTP h/w cksum support yet, just
+ * clear the flag.
+ */
+ DB_CKSUMFLAGS(mp) = 0;
+
+ /* Length ensured above */
+ ASSERT(MBLKL(mp) >= ip_hdr_length + SCTP_COMMON_HDR_LENGTH);
+ sctph = (sctp_hdr_t *)(rptr + ip_hdr_length);
+
+ /* get the ports */
+ ports = *(uint32_t *)&sctph->sh_sport;
+
+ /* SCTP fanout works on IPv4-mapped IPv6 addresses */
+ IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &map_dst);
+ IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &map_src);
+ if (iraflags & IRAF_SCTP_CSUM_ERR) {
+ /*
+ * No potential sctp checksum errors go to the Sun
+ * sctp stack however they might be Adler-32 summed
+ * packets a userland stack bound to a raw IP socket
+ * could reasonably use. Note though that Adler-32 is
+ * a long deprecated algorithm and customer sctp
+ * networks should eventually migrate to CRC-32 at
+ * which time this facility should be removed.
+ */
+ ip_fanout_sctp_raw(mp, ipha, NULL, ports, ira);
+ return;
+ }
+ connp = sctp_fanout(&map_src, &map_dst, ports, ira, mp, sctps);
+ if (connp == NULL) {
+ /* Check for raw socket or OOTB handling */
+ ip_fanout_sctp_raw(mp, ipha, NULL, ports, ira);
+ return;
+ }
+ if (connp->conn_incoming_ifindex != 0 &&
+ connp->conn_incoming_ifindex != ira->ira_ruifindex) {
+ CONN_DEC_REF(connp);
+ /* Check for raw socket or OOTB handling */
+ ip_fanout_sctp_raw(mp, ipha, NULL, ports, ira);
+ return;
+ }
+
+ /* Found a client; up it goes */
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
+ sctp_input(connp, ipha, NULL, mp, ira);
+ /* sctp_input does a rele of the sctp_t */
+ return;
+ }
+
+ case IPPROTO_UDP:
+ /* First mblk contains IP+UDP headers as checked above */
+ ASSERT(MBLKL(mp) >= ip_hdr_length + UDPH_SIZE);
+
+ if (iraflags & IRAF_MULTIBROADCAST) {
+ uint16_t *up; /* Pointer to ports in ULP header */
+
+ up = (uint16_t *)((uchar_t *)ipha + ip_hdr_length);
+ ip_fanout_udp_multi_v4(mp, ipha, up[1], up[0], ira);
+ return;
+ }
+
+ /* Look for AF_INET or AF_INET6 that matches */
+ connp = ipcl_classify_v4(mp, IPPROTO_UDP, ip_hdr_length,
+ ira, ipst);
+ if (connp == NULL) {
+ no_udp_match:
+ /* Raw-socket listeners get a shot before ICMP unreachable */
+ if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_UDP].
+ connf_head != NULL) {
+ ASSERT(ira->ira_protocol == IPPROTO_UDP);
+ ip_fanout_proto_v4(mp, ipha, ira);
+ } else {
+ ip_fanout_send_icmp_v4(mp,
+ ICMP_DEST_UNREACHABLE,
+ ICMP_PORT_UNREACHABLE, ira);
+ }
+ return;
+
+ }
+ if (connp->conn_incoming_ifindex != 0 &&
+ connp->conn_incoming_ifindex != ira->ira_ruifindex) {
+ CONN_DEC_REF(connp);
+ goto no_udp_match;
+ }
+ if (IPCL_IS_NONSTR(connp) ? connp->conn_flow_cntrld :
+ !canputnext(connp->conn_rq)) {
+ CONN_DEC_REF(connp);
+ BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows);
+ ip_drop_input("udpIfStatsInOverflows", mp, ill);
+ freemsg(mp);
+ return;
+ }
+ if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) ||
+ (iraflags & IRAF_IPSEC_SECURE)) {
+ mp = ipsec_check_inbound_policy(mp, connp,
+ ipha, NULL, ira);
+ if (mp == NULL) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ /* Note that mp is NULL */
+ ip_drop_input("ipIfStatsInDiscards", mp, ill);
+ CONN_DEC_REF(connp);
+ return;
+ }
+ }
+ /*
+ * Remove 0-spi if it's 0, or move everything behind
+ * the UDP header over it and forward to ESP via
+ * ip_fanout_v4().
+ */
+ if (connp->conn_udp->udp_nat_t_endpoint) {
+ if (iraflags & IRAF_IPSEC_SECURE) {
+ ip_drop_packet(mp, B_TRUE, ira->ira_ill,
+ DROPPER(ipss, ipds_esp_nat_t_ipsec),
+ &ipss->ipsec_dropper);
+ CONN_DEC_REF(connp);
+ return;
+ }
+
+ mp = zero_spi_check(mp, ira);
+ if (mp == NULL) {
+ /*
+ * Packet was consumed - probably sent to
+ * ip_fanout_v4.
+ */
+ CONN_DEC_REF(connp);
+ return;
+ }
+ /* Else continue like a normal UDP packet. */
+ ipha = (ipha_t *)mp->b_rptr;
+ protocol = ipha->ipha_protocol;
+ ira->ira_protocol = protocol;
+ }
+ /* Found a client; up it goes */
+ IP_STAT(ipst, ip_udp_fannorm);
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
+ /* Clear ill pointers while the ULP owns the packet; restored below */
+ ira->ira_ill = ira->ira_rill = NULL;
+ (connp->conn_recv)(connp, mp, NULL, ira);
+ CONN_DEC_REF(connp);
+ ira->ira_ill = ill;
+ ira->ira_rill = rill;
+ return;
+ default:
+ break;
+ }
+
+ /*
+ * Clear hardware checksumming flag as it is currently only
+ * used by TCP and UDP.
+ */
+ DB_CKSUMFLAGS(mp) = 0;
+
+ switch (protocol) {
+ case IPPROTO_ICMP:
+ /*
+ * We need to accommodate icmp messages coming in clear
+ * until we get everything secure from the wire. If
+ * icmp_accept_clear_messages is zero we check with
+ * the global policy and act accordingly. If it is
+ * non-zero, we accept the message without any checks.
+ * But *this does not mean* that this will be delivered
+ * to RAW socket clients. By accepting we might send
+ * replies back, change our MTU value etc.,
+ * but delivery to the ULP/clients depends on their
+ * policy dispositions.
+ */
+ if (ipst->ips_icmp_accept_clear_messages == 0) {
+ mp = ipsec_check_global_policy(mp, NULL,
+ ipha, NULL, ira, ns);
+ if (mp == NULL)
+ return;
+ }
+
+ /*
+ * On a labeled system, we have to check whether the zone
+ * itself is permitted to receive raw traffic.
+ */
+ if (ira->ira_flags & IRAF_SYSTEM_LABELED) {
+ if (!tsol_can_accept_raw(mp, ira, B_FALSE)) {
+ BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors);
+ ip_drop_input("tsol_can_accept_raw", mp, ill);
+ freemsg(mp);
+ return;
+ }
+ }
+
+ /*
+ * ICMP header checksum, including checksum field,
+ * should be zero.
+ */
+ if (IP_CSUM(mp, ip_hdr_length, 0)) {
+ BUMP_MIB(&ipst->ips_icmp_mib, icmpInCksumErrs);
+ ip_drop_input("icmpInCksumErrs", mp, ill);
+ freemsg(mp);
+ return;
+ }
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
+ mp = icmp_inbound_v4(mp, ira);
+ if (mp == NULL) {
+ /* No need to pass to RAW sockets */
+ return;
+ }
+ break;
+
+ case IPPROTO_IGMP:
+ /*
+ * If we are not willing to accept IGMP packets in clear,
+ * then check with global policy.
+ */
+ if (ipst->ips_igmp_accept_clear_messages == 0) {
+ mp = ipsec_check_global_policy(mp, NULL,
+ ipha, NULL, ira, ns);
+ if (mp == NULL)
+ return;
+ }
+ if ((ira->ira_flags & IRAF_SYSTEM_LABELED) &&
+ !tsol_can_accept_raw(mp, ira, B_TRUE)) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards", mp, ill);
+ freemsg(mp);
+ return;
+ }
+ /*
+ * Validate checksum
+ */
+ if (IP_CSUM(mp, ip_hdr_length, 0)) {
+ ++ipst->ips_igmpstat.igps_rcv_badsum;
+ ip_drop_input("igps_rcv_badsum", mp, ill);
+ freemsg(mp);
+ return;
+ }
+
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
+ mp = igmp_input(mp, ira);
+ if (mp == NULL) {
+ /* Bad packet - discarded by igmp_input */
+ return;
+ }
+ break;
+ case IPPROTO_PIM:
+ /*
+ * If we are not willing to accept PIM packets in clear,
+ * then check with global policy.
+ */
+ if (ipst->ips_pim_accept_clear_messages == 0) {
+ mp = ipsec_check_global_policy(mp, NULL,
+ ipha, NULL, ira, ns);
+ if (mp == NULL)
+ return;
+ }
+ if ((ira->ira_flags & IRAF_SYSTEM_LABELED) &&
+ !tsol_can_accept_raw(mp, ira, B_TRUE)) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards", mp, ill);
+ freemsg(mp);
+ return;
+ }
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
+
+ /* Checksum is verified in pim_input */
+ mp = pim_input(mp, ira);
+ if (mp == NULL) {
+ /* Bad packet - discarded by pim_input */
+ return;
+ }
+ break;
+ case IPPROTO_AH:
+ case IPPROTO_ESP: {
+ /*
+ * Fast path for AH/ESP.
+ */
+ netstack_t *ns = ipst->ips_netstack;
+ ipsec_stack_t *ipss = ns->netstack_ipsec;
+
+ IP_STAT(ipst, ipsec_proto_ahesp);
+
+ if (!ipsec_loaded(ipss)) {
+ ip_proto_not_sup(mp, ira);
+ return;
+ }
+
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
+ /* select inbound SA and have IPsec process the pkt */
+ if (protocol == IPPROTO_ESP) {
+ esph_t *esph;
+ boolean_t esp_in_udp_sa;
+ boolean_t esp_in_udp_packet;
+
+ mp = ipsec_inbound_esp_sa(mp, ira, &esph);
+ if (mp == NULL)
+ return;
+
+ ASSERT(esph != NULL);
+ ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
+ ASSERT(ira->ira_ipsec_esp_sa != NULL);
+ ASSERT(ira->ira_ipsec_esp_sa->ipsa_input_func != NULL);
+
+ esp_in_udp_sa = ((ira->ira_ipsec_esp_sa->ipsa_flags &
+ IPSA_F_NATT) != 0);
+ esp_in_udp_packet =
+ (ira->ira_flags & IRAF_ESP_UDP_PORTS) != 0;
+
+ /*
+ * The following is a fancy, but quick, way of saying:
+ * ESP-in-UDP SA and Raw ESP packet --> drop
+ * OR
+ * ESP SA and ESP-in-UDP packet --> drop
+ */
+ if (esp_in_udp_sa != esp_in_udp_packet) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_packet(mp, B_TRUE, ira->ira_ill,
+ DROPPER(ipss, ipds_esp_no_sa),
+ &ipss->ipsec_dropper);
+ return;
+ }
+ mp = ira->ira_ipsec_esp_sa->ipsa_input_func(mp, esph,
+ ira);
+ } else {
+ ah_t *ah;
+
+ mp = ipsec_inbound_ah_sa(mp, ira, &ah);
+ if (mp == NULL)
+ return;
+
+ ASSERT(ah != NULL);
+ ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
+ ASSERT(ira->ira_ipsec_ah_sa != NULL);
+ ASSERT(ira->ira_ipsec_ah_sa->ipsa_input_func != NULL);
+ mp = ira->ira_ipsec_ah_sa->ipsa_input_func(mp, ah,
+ ira);
+ }
+
+ if (mp == NULL) {
+ /*
+ * Either it failed or is pending. In the former case
+ * ipIfStatsInDiscards was increased.
+ */
+ return;
+ }
+ /* we're done with IPsec processing, send it up */
+ ip_input_post_ipsec(mp, ira);
+ return;
+ }
+ case IPPROTO_ENCAP: {
+ ipha_t *inner_ipha;
+
+ /*
+ * Handle self-encapsulated packets (IP-in-IP where
+ * the inner addresses == the outer addresses).
+ */
+ if ((uchar_t *)ipha + ip_hdr_length + sizeof (ipha_t) >
+ mp->b_wptr) {
+ if (ira->ira_pktlen <
+ ip_hdr_length + sizeof (ipha_t)) {
+ BUMP_MIB(ill->ill_ip_mib,
+ ipIfStatsInTruncatedPkts);
+ ip_drop_input("ipIfStatsInTruncatedPkts",
+ mp, ill);
+ freemsg(mp);
+ return;
+ }
+ ipha = ip_pullup(mp, (uchar_t *)ipha + ip_hdr_length +
+ sizeof (ipha_t) - mp->b_rptr, ira);
+ if (ipha == NULL) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards", mp, ill);
+ freemsg(mp);
+ return;
+ }
+ }
+ inner_ipha = (ipha_t *)((uchar_t *)ipha + ip_hdr_length);
+ /*
+ * Check the sanity of the inner IP header.
+ */
+ if ((IPH_HDR_VERSION(inner_ipha) != IPV4_VERSION)) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards", mp, ill);
+ freemsg(mp);
+ return;
+ }
+ if (IPH_HDR_LENGTH(inner_ipha) < sizeof (ipha_t)) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards", mp, ill);
+ freemsg(mp);
+ return;
+ }
+ if (inner_ipha->ipha_src != ipha->ipha_src ||
+ inner_ipha->ipha_dst != ipha->ipha_dst) {
+ /* We fallthru to iptun fanout below */
+ goto iptun;
+ }
+
+ /*
+ * Self-encapsulated tunnel packet. Remove
+ * the outer IP header and fanout again.
+ * We also need to make sure that the inner
+ * header is pulled up until options.
+ */
+ mp->b_rptr = (uchar_t *)inner_ipha;
+ ipha = inner_ipha;
+ ip_hdr_length = IPH_HDR_LENGTH(ipha);
+ if ((uchar_t *)ipha + ip_hdr_length > mp->b_wptr) {
+ if (ira->ira_pktlen <
+ (uchar_t *)ipha + ip_hdr_length - mp->b_rptr) {
+ BUMP_MIB(ill->ill_ip_mib,
+ ipIfStatsInTruncatedPkts);
+ ip_drop_input("ipIfStatsInTruncatedPkts",
+ mp, ill);
+ freemsg(mp);
+ return;
+ }
+ ipha = ip_pullup(mp,
+ (uchar_t *)ipha + ip_hdr_length - mp->b_rptr, ira);
+ if (ipha == NULL) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards", mp, ill);
+ freemsg(mp);
+ return;
+ }
+ }
+ if (ip_hdr_length > sizeof (ipha_t)) {
+ /* We got options on the inner packet. */
+ ipaddr_t dst = ipha->ipha_dst;
+ int error = 0;
+
+ dst = ip_input_options(ipha, dst, mp, ira, &error);
+ if (error != 0) {
+ /*
+ * An ICMP error has been sent and the packet
+ * has been dropped.
+ */
+ return;
+ }
+ if (dst != ipha->ipha_dst) {
+ /*
+ * Someone put a source-route in
+ * the inside header of a self-
+ * encapsulated packet. Drop it
+ * with extreme prejudice and let
+ * the sender know.
+ */
+ ip_drop_input("ICMP_SOURCE_ROUTE_FAILED",
+ mp, ill);
+ icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED,
+ ira);
+ return;
+ }
+ }
+ if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) {
+ /*
+ * This means that somebody is sending
+ * Self-encapsulated packets without AH/ESP.
+ *
+ * Send this packet to find a tunnel endpoint.
+ * if I can't find one, an ICMP
+ * PROTOCOL_UNREACHABLE will get sent.
+ */
+ protocol = ipha->ipha_protocol;
+ ira->ira_protocol = protocol;
+ goto iptun;
+ }
+
+ /* Update based on removed IP header */
+ ira->ira_ip_hdr_length = ip_hdr_length;
+ ira->ira_pktlen = ntohs(ipha->ipha_length);
+
+ if (ira->ira_flags & IRAF_IPSEC_DECAPS) {
+ /*
+ * This packet is self-encapsulated multiple
+ * times. We don't want to recurse infinitely.
+ * To keep it simple, drop the packet.
+ */
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards", mp, ill);
+ freemsg(mp);
+ return;
+ }
+ ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
+ ira->ira_flags |= IRAF_IPSEC_DECAPS;
+
+ ip_input_post_ipsec(mp, ira);
+ return;
+ }
+
+ iptun: /* IPPROTO_ENCAP that is not self-encapsulated */
+ case IPPROTO_IPV6:
+ /* iptun will verify trusted label */
+ connp = ipcl_classify_v4(mp, protocol, ip_hdr_length,
+ ira, ipst);
+ if (connp != NULL) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
+ /* Clear ill pointers while the ULP owns the packet */
+ ira->ira_ill = ira->ira_rill = NULL;
+ (connp->conn_recv)(connp, mp, NULL, ira);
+ CONN_DEC_REF(connp);
+ ira->ira_ill = ill;
+ ira->ira_rill = rill;
+ return;
+ }
+ /* FALLTHRU */
+ default:
+ /*
+ * On a labeled system, we have to check whether the zone
+ * itself is permitted to receive raw traffic.
+ */
+ if (ira->ira_flags & IRAF_SYSTEM_LABELED) {
+ if (!tsol_can_accept_raw(mp, ira, B_FALSE)) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards", mp, ill);
+ freemsg(mp);
+ return;
+ }
+ }
+ break;
+ }
+
+ /*
+ * The above input functions may have returned the pulled up message.
+ * So ipha need to be reinitialized.
+ */
+ ipha = (ipha_t *)mp->b_rptr;
+ ira->ira_protocol = protocol = ipha->ipha_protocol;
+ if (ipst->ips_ipcl_proto_fanout_v4[protocol].connf_head == NULL) {
+ /*
+ * No user-level listener for these packets.
+ * Check for IPPROTO_ENCAP...
+ */
+ if (protocol == IPPROTO_ENCAP && ipst->ips_ip_g_mrouter) {
+ /*
+ * Check policy here,
+ * THEN ship off to ip_mroute_decap().
+ *
+ * BTW, If I match a configured IP-in-IP
+ * tunnel above, this path will not be reached, and
+ * ip_mroute_decap will never be called.
+ */
+ mp = ipsec_check_global_policy(mp, connp,
+ ipha, NULL, ira, ns);
+ if (mp != NULL) {
+ ip_mroute_decap(mp, ira);
+ } /* Else we already freed everything! */
+ } else {
+ ip_proto_not_sup(mp, ira);
+ }
+ return;
+ }
+
+ /*
+ * Handle fanout to raw sockets. There
+ * can be more than one stream bound to a particular
+ * protocol. When this is the case, each one gets a copy
+ * of any incoming packets.
+ */
+ ASSERT(ira->ira_protocol == ipha->ipha_protocol);
+ ip_fanout_proto_v4(mp, ipha, ira);
+ return;
+
+discard:
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards", mp, ill);
+ freemsg(mp);
+#undef rptr
+}
diff --git a/usr/src/uts/common/inet/ip/ip_ire.c b/usr/src/uts/common/inet/ip/ip_ire.c
index 63a6863844..be0017cb62 100644
--- a/usr/src/uts/common/inet/ip/ip_ire.c
+++ b/usr/src/uts/common/inet/ip/ip_ire.c
@@ -60,9 +60,6 @@
#include <inet/ip_rts.h>
#include <inet/nd.h>
-#include <net/pfkeyv2.h>
-#include <inet/ipsec_info.h>
-#include <inet/sadb.h>
#include <inet/tcp.h>
#include <inet/ipclassifier.h>
#include <sys/zone.h>
@@ -73,6 +70,11 @@
struct kmem_cache *rt_entry_cache;
+typedef struct nce_clookup_s {
+ ipaddr_t ncecl_addr;
+ boolean_t ncecl_found;
+} nce_clookup_t;
+
/*
* Synchronization notes:
*
@@ -80,17 +82,17 @@ struct kmem_cache *rt_entry_cache;
*
* ire_next/ire_ptpn
*
- * - bucket lock of the respective tables (cache or forwarding tables).
+ * - bucket lock of the forwarding table in which the ire is stored.
*
- * ire_mp, ire_rfq, ire_stq, ire_u *except* ire_gateway_addr[v6], ire_mask,
- * ire_type, ire_create_time, ire_masklen, ire_ipversion, ire_flags, ire_ipif,
- * ire_ihandle, ire_phandle, ire_nce, ire_bucket, ire_in_ill, ire_in_src_addr
+ * ire_ill, ire_u *except* ire_gateway_addr[v6], ire_mask,
+ * ire_type, ire_create_time, ire_masklen, ire_ipversion, ire_flags,
+ * ire_bucket
*
* - Set in ire_create_v4/v6 and never changes after that. Thus,
* we don't need a lock whenever these fields are accessed.
*
* - ire_bucket and ire_masklen (also set in ire_create) is set in
- * ire_add_v4/ire_add_v6 before inserting in the bucket and never
+ * ire_add before inserting in the bucket and never
* changes after that. Thus we don't need a lock whenever these
* fields are accessed.
*
@@ -102,7 +104,7 @@ struct kmem_cache *rt_entry_cache;
* does not use any locks. ire_gateway_addr_v6 updates are not atomic
* and hence any access to it uses ire_lock to get/set the right value.
*
- * ire_ident, ire_refcnt
+ * ire_refcnt, ire_identical_ref
*
* - Updated atomically using atomic_add_32
*
@@ -111,40 +113,33 @@ struct kmem_cache *rt_entry_cache;
* - Assumes that 32 bit writes are atomic. No locks. ire_lock is
* used to serialize updates to ire_ssthresh, ire_rtt_sd, ire_rtt.
*
- * ire_max_frag, ire_frag_flag
- *
- * - ire_lock is used to set/read both of them together.
- *
- * ire_tire_mark
+ * ire_generation
+ * - Under ire_lock
*
- * - Set in ire_create and updated in ire_expire, which is called
- * by only one function namely ip_trash_timer_expire. Thus only
- * one function updates and examines the value.
+ * ire_nce_cache
+ * - Under ire_lock
*
- * ire_marks
- * - bucket lock protects this.
+ * ire_dep_parent (To next IRE in recursive lookup chain)
+ * - Under ips_ire_dep_lock. Write held when modifying. Read held when
+ * walking. We also hold ire_lock when modifying to allow the data path
+ * to only acquire ire_lock.
*
- * ire_ll_hdr_length
+ * ire_dep_parent_generation (Generation number from ire_dep_parent)
+ * - Under ips_ire_dep_lock and/or ire_lock. (A read claim on the dep_lock
+ * and ire_lock held when modifying)
*
- * - Place holder for returning the information to the upper layers
- * when IRE_DB_REQ comes down.
- *
- *
- * ipv6_ire_default_count is protected by the bucket lock of
- * ip_forwarding_table_v6[0][0].
- *
- * ipv6_ire_default_index is not protected as it is just a hint
- * at which default gateway to use. There is nothing
- * wrong in using the same gateway for two different connections.
+ * ire_dep_children (From parent to first child)
+ * ire_dep_sib_next (linked list of siblings)
+ * ire_dep_sib_ptpn (linked list of siblings)
+ * - Under ips_ire_dep_lock. Write held when modifying. Read held when
+ * walking.
*
* As we always hold the bucket locks in all the places while accessing
* the above values, it is natural to use them for protecting them.
*
- * We have a separate cache table and forwarding table for IPv4 and IPv6.
- * Cache table (ip_cache_table/ip_cache_table_v6) is a pointer to an
- * array of irb_t structures. The IPv6 forwarding table
+ * We have a forwarding table for IPv4 and IPv6. The IPv6 forwarding table
* (ip_forwarding_table_v6) is an array of pointers to arrays of irb_t
- * structure. ip_forwarding_table_v6 is allocated dynamically in
+ * structures. ip_forwarding_table_v6 is allocated dynamically in
* ire_add_v6. ire_ft_init_lock is used to serialize multiple threads
* initializing the same bucket. Once a bucket is initialized, it is never
* de-alloacted. This assumption enables us to access
@@ -158,39 +153,37 @@ struct kmem_cache *rt_entry_cache;
* a bucket and the ires residing in the bucket have a back pointer to
* the bucket structure. It also has a reference count for the number
* of threads walking the bucket - irb_refcnt which is bumped up
- * using the macro IRB_REFHOLD macro. The flags irb_flags can be
- * set to IRE_MARK_CONDEMNED indicating that there are some ires
- * in this bucket that are marked with IRE_MARK_CONDEMNED and the
+ * using the irb_refhold function. The flags irb_marks can be
+ * set to IRB_MARK_CONDEMNED indicating that there are some ires
+ * in this bucket that are IRE_IS_CONDEMNED and the
* last thread to leave the bucket should delete the ires. Usually
- * this is done by the IRB_REFRELE macro which is used to decrement
+ * this is done by the irb_refrele function which is used to decrement
* the reference count on a bucket. See comments above irb_t structure
* definition in ip.h for further details.
*
- * IRE_REFHOLD/IRE_REFRELE macros operate on the ire which increments/
+ * The ire_refhold/ire_refrele functions operate on the ire which increments/
* decrements the reference count, ire_refcnt, atomically on the ire.
- * ire_refcnt is modified only using this macro. Operations on the IRE
+ * ire_refcnt is modified only using those functions. Operations on the IRE
* could be described as follows :
*
* CREATE an ire with reference count initialized to 1.
*
* ADDITION of an ire holds the bucket lock, checks for duplicates
- * and then adds the ire. ire_add_v4/ire_add_v6 returns the ire after
+ * and then adds the ire. ire_add returns the ire after
* bumping up once more i.e the reference count is 2. This is to avoid
* an extra lookup in the functions calling ire_add which wants to
* work with the ire after adding.
*
- * LOOKUP of an ire bumps up the reference count using IRE_REFHOLD
- * macro. It is valid to bump up the referece count of the IRE,
+ * LOOKUP of an ire bumps up the reference count using ire_refhold
+ * function. It is valid to bump up the reference count of the IRE,
* after the lookup has returned an ire. Following are the lookup
* functions that return an HELD ire :
*
- * ire_lookup_local[_v6], ire_ctable_lookup[_v6], ire_ftable_lookup[_v6],
- * ire_cache_lookup[_v6], ire_lookup_multi[_v6], ire_route_lookup[_v6],
- * ipif_to_ire[_v6].
+ * ire_ftable_lookup[_v6], ire_lookup_multi_ill[_v6]
*
* DELETION of an ire holds the bucket lock, removes it from the list
* and then decrements the reference count for having removed from the list
- * by using the IRE_REFRELE macro. If some other thread has looked up
+ * by using the ire_refrele function. If some other thread has looked up
* the ire, the reference count would have been bumped up and hence
* this ire will not be freed once deleted. It will be freed once the
* reference count drops to zero.
@@ -198,27 +191,12 @@ struct kmem_cache *rt_entry_cache;
* Add and Delete acquires the bucket lock as RW_WRITER, while all the
* lookups acquire the bucket lock as RW_READER.
*
- * NOTE : The only functions that does the IRE_REFRELE when an ire is
- * passed as an argument are :
- *
- * 1) ip_wput_ire : This is because it IRE_REFHOLD/RELEs the
- * broadcast ires it looks up internally within
- * the function. Currently, for simplicity it does
- * not differentiate the one that is passed in and
- * the ones it looks up internally. It always
- * IRE_REFRELEs.
- * 2) ire_send
- * ire_send_v6 : As ire_send calls ip_wput_ire and other functions
- * that take ire as an argument, it has to selectively
- * IRE_REFRELE the ire. To maintain symmetry,
- * ire_send_v6 does the same.
- *
- * Otherwise, the general rule is to do the IRE_REFRELE in the function
+ * The general rule is to do the ire_refrele in the function
* that is passing the ire as an argument.
*
* In trying to locate ires the following points are to be noted.
*
- * IRE_MARK_CONDEMNED signifies that the ire has been logically deleted and is
+ * IRE_IS_CONDEMNED signifies that the ire has been logically deleted and is
* to be ignored when walking the ires using ire_next.
*
* Zones note:
@@ -230,14 +208,6 @@ struct kmem_cache *rt_entry_cache;
*/
/*
- * The minimum size of IRE cache table. It will be recalcuated in
- * ip_ire_init().
- * Setable in /etc/system
- */
-uint32_t ip_cache_table_size = IP_CACHE_TABLE_SIZE;
-uint32_t ip6_cache_table_size = IP6_CACHE_TABLE_SIZE;
-
-/*
* The size of the forwarding table. We will make sure that it is a
* power of 2 in ip_ire_init().
* Setable in /etc/system
@@ -245,313 +215,213 @@ uint32_t ip6_cache_table_size = IP6_CACHE_TABLE_SIZE;
uint32_t ip6_ftable_hash_size = IP6_FTABLE_HASH_SIZE;
struct kmem_cache *ire_cache;
-static ire_t ire_null;
-
-/*
- * The threshold number of IRE in a bucket when the IREs are
- * cleaned up. This threshold is calculated later in ip_open()
- * based on the speed of CPU and available memory. This default
- * value is the maximum.
- *
- * We have two kinds of cached IRE, temporary and
- * non-temporary. Temporary IREs are marked with
- * IRE_MARK_TEMPORARY. They are IREs created for non
- * TCP traffic and for forwarding purposes. All others
- * are non-temporary IREs. We don't mark IRE created for
- * TCP as temporary because TCP is stateful and there are
- * info stored in the IRE which can be shared by other TCP
- * connections to the same destination. For connected
- * endpoint, we also don't want to mark the IRE used as
- * temporary because the same IRE will be used frequently,
- * otherwise, the app should not do a connect(). We change
- * the marking at ip_bind_connected_*() if necessary.
- *
- * We want to keep the cache IRE hash bucket length reasonably
- * short, otherwise IRE lookup functions will take "forever."
- * We use the "crude" function that the IRE bucket
- * length should be based on the CPU speed, which is 1 entry
- * per x MHz, depending on the shift factor ip_ire_cpu_ratio
- * (n). This means that with a 750MHz CPU, the max bucket
- * length can be (750 >> n) entries.
- *
- * Note that this threshold is separate for temp and non-temp
- * IREs. This means that the actual bucket length can be
- * twice as that. And while we try to keep temporary IRE
- * length at most at the threshold value, we do not attempt to
- * make the length for non-temporary IREs fixed, for the
- * reason stated above. Instead, we start trying to find
- * "unused" non-temporary IREs when the bucket length reaches
- * this threshold and clean them up.
- *
- * We also want to limit the amount of memory used by
- * IREs. So if we are allowed to use ~3% of memory (M)
- * for those IREs, each bucket should not have more than
- *
- * M / num of cache bucket / sizeof (ire_t)
- *
- * Again the above memory uses are separate for temp and
- * non-temp cached IREs.
- *
- * We may also want the limit to be a function of the number
- * of interfaces and number of CPUs. Doing the initialization
- * in ip_open() means that every time an interface is plumbed,
- * the max is re-calculated. Right now, we don't do anything
- * different. In future, when we have more experience, we
- * may want to change this behavior.
- */
-uint32_t ip_ire_max_bucket_cnt = 10; /* Setable in /etc/system */
-uint32_t ip6_ire_max_bucket_cnt = 10;
-uint32_t ip_ire_cleanup_cnt = 2;
-
-/*
- * The minimum of the temporary IRE bucket count. We do not want
- * the length of each bucket to be too short. This may hurt
- * performance of some apps as the temporary IREs are removed too
- * often.
- */
-uint32_t ip_ire_min_bucket_cnt = 3; /* /etc/system - not used */
-uint32_t ip6_ire_min_bucket_cnt = 3;
-
-/*
- * The ratio of memory consumed by IRE used for temporary to available
- * memory. This is a shift factor, so 6 means the ratio 1 to 64. This
- * value can be changed in /etc/system. 6 is a reasonable number.
- */
-uint32_t ip_ire_mem_ratio = 6; /* /etc/system */
-/* The shift factor for CPU speed to calculate the max IRE bucket length. */
-uint32_t ip_ire_cpu_ratio = 7; /* /etc/system */
-
-typedef struct nce_clookup_s {
- ipaddr_t ncecl_addr;
- boolean_t ncecl_found;
-} nce_clookup_t;
-
-/*
- * The maximum number of buckets in IRE cache table. In future, we may
- * want to make it a dynamic hash table. For the moment, we fix the
- * size and allocate the table in ip_ire_init() when IP is first loaded.
- * We take into account the amount of memory a system has.
- */
-#define IP_MAX_CACHE_TABLE_SIZE 4096
-
-/* Setable in /etc/system */
-static uint32_t ip_max_cache_table_size = IP_MAX_CACHE_TABLE_SIZE;
-static uint32_t ip6_max_cache_table_size = IP_MAX_CACHE_TABLE_SIZE;
+struct kmem_cache *ncec_cache;
+struct kmem_cache *nce_cache;
-/* Zero iulp_t for initialization. */
-const iulp_t ire_uinfo_null = { 0 };
+static ire_t ire_null;
-static int ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp,
- ipsq_func_t func, boolean_t);
+static ire_t *ire_add_v4(ire_t *ire);
static void ire_delete_v4(ire_t *ire);
+static void ire_dep_invalidate_children(ire_t *child);
static void ire_walk_ipvers(pfv_t func, void *arg, uchar_t vers,
zoneid_t zoneid, ip_stack_t *);
static void ire_walk_ill_ipvers(uint_t match_flags, uint_t ire_type,
pfv_t func, void *arg, uchar_t vers, ill_t *ill);
-static void ire_cache_cleanup(irb_t *irb, uint32_t threshold,
- ire_t *ref_ire);
-static void ip_nce_clookup_and_delete(nce_t *nce, void *arg);
-static ire_t *ip4_ctable_lookup_impl(ire_ctable_args_t *margs);
#ifdef DEBUG
static void ire_trace_cleanup(const ire_t *);
#endif
/*
- * To avoid bloating the code, we call this function instead of
- * using the macro IRE_REFRELE. Use macro only in performance
- * critical paths.
- *
- * Must not be called while holding any locks. Otherwise if this is
- * the last reference to be released there is a chance of recursive mutex
- * panic due to ire_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying
- * to restart an ioctl. The one exception is when the caller is sure that
- * this is not the last reference to be released. Eg. if the caller is
- * sure that the ire has not been deleted and won't be deleted.
+ * Following are the functions to increment/decrement the reference
+ * count of the IREs and IRBs (ire bucket).
+ *
+ * 1) We bump up the reference count of an IRE to make sure that
+ * it does not get deleted and freed while we are using it.
+ * Typically all the lookup functions hold the bucket lock,
+ * and look for the IRE. If it finds an IRE, it bumps up the
+ * reference count before dropping the lock. Sometimes we *may* want
+ * to bump up the reference count after we *looked* up i.e without
+ * holding the bucket lock. So, the ire_refhold function does not assert
+ * on the bucket lock being held. Any thread trying to delete from
+ * the hash bucket can still do so but cannot free the IRE if
+ * ire_refcnt is not 0.
+ *
+ * 2) We bump up the reference count on the bucket where the IRE resides
+ * (IRB), when we want to prevent the IREs getting deleted from a given
+ * hash bucket. This makes life easier for ire_walk type functions which
+ * wants to walk the IRE list, call a function, but needs to drop
+ * the bucket lock to prevent recursive rw_enters. While the
+ * lock is dropped, the list could be changed by other threads or
+ * the same thread could end up deleting the ire or the ire pointed by
+ * ire_next. ire_refholding the ire or ire_next is not sufficient as
+ * a delete will still remove the ire from the bucket while we have
+ * dropped the lock and hence the ire_next would be NULL. Thus, we
+ * need a mechanism to prevent deletions from a given bucket.
+ *
+ * To prevent deletions, we bump up the reference count on the
+ * bucket. If the bucket is held, ire_delete just marks both
+ * the ire and irb as CONDEMNED. When the
+ * reference count on the bucket drops to zero, all the CONDEMNED ires
+ * are deleted. We don't have to bump up the reference count on the
+ * bucket if we are walking the bucket and never have to drop the bucket
+ * lock. Note that irb_refhold does not prevent addition of new ires
+ * in the list. It is okay because addition of new ires will not cause
+ * ire_next to point to freed memory. We do irb_refhold only when
+ * all of the 3 conditions are true :
+ *
+ * 1) The code needs to walk the IRE bucket from start to end.
+ * 2) It may have to drop the bucket lock sometimes while doing (1)
+ * 3) It does not want any ires to be deleted meanwhile.
+ */
+
+/*
+ * Bump up the reference count on the hash bucket - IRB to
+ * prevent ires from being deleted in this bucket.
*/
void
-ire_refrele(ire_t *ire)
+irb_refhold(irb_t *irb)
{
- IRE_REFRELE(ire);
+ rw_enter(&irb->irb_lock, RW_WRITER);
+ irb->irb_refcnt++;
+ ASSERT(irb->irb_refcnt != 0);
+ rw_exit(&irb->irb_lock);
}
void
-ire_refrele_notr(ire_t *ire)
+irb_refhold_locked(irb_t *irb)
{
- IRE_REFRELE_NOTR(ire);
+ ASSERT(RW_WRITE_HELD(&irb->irb_lock));
+ irb->irb_refcnt++;
+ ASSERT(irb->irb_refcnt != 0);
}
/*
- * kmem_cache_alloc constructor for IRE in kma space.
- * Note that when ire_mp is set the IRE is stored in that mblk and
- * not in this cache.
+ * Note: when IRB_MARK_DYNAMIC is not set the irb_t
+ * is statically allocated, so that when the irb_refcnt goes to 0,
+ * we simply clean up the ire list and continue.
*/
-/* ARGSUSED */
-static int
-ip_ire_constructor(void *buf, void *cdrarg, int kmflags)
+void
+irb_refrele(irb_t *irb)
{
- ire_t *ire = buf;
+ if (irb->irb_marks & IRB_MARK_DYNAMIC) {
+ irb_refrele_ftable(irb);
+ } else {
+ rw_enter(&irb->irb_lock, RW_WRITER);
+ ASSERT(irb->irb_refcnt != 0);
+ if (--irb->irb_refcnt == 0 &&
+ (irb->irb_marks & IRB_MARK_CONDEMNED)) {
+ ire_t *ire_list;
+
+ ire_list = ire_unlink(irb);
+ rw_exit(&irb->irb_lock);
+ ASSERT(ire_list != NULL);
+ ire_cleanup(ire_list);
+ } else {
+ rw_exit(&irb->irb_lock);
+ }
+ }
+}
- ire->ire_nce = NULL;
- return (0);
+/*
+ * Bump up the reference count on the IRE. We cannot assert that the
+ * bucket lock is being held as it is legal to bump up the reference
+ * count after the first lookup has returned the IRE without
+ * holding the lock.
+ */
+void
+ire_refhold(ire_t *ire)
+{
+ atomic_add_32(&(ire)->ire_refcnt, 1);
+ ASSERT((ire)->ire_refcnt != 0);
+#ifdef DEBUG
+ ire_trace_ref(ire);
+#endif
}
-/* ARGSUSED1 */
-static void
-ip_ire_destructor(void *buf, void *cdrarg)
+void
+ire_refhold_notr(ire_t *ire)
{
- ire_t *ire = buf;
+ atomic_add_32(&(ire)->ire_refcnt, 1);
+ ASSERT((ire)->ire_refcnt != 0);
+}
- ASSERT(ire->ire_nce == NULL);
+void
+ire_refhold_locked(ire_t *ire)
+{
+#ifdef DEBUG
+ ire_trace_ref(ire);
+#endif
+ ire->ire_refcnt++;
}
/*
- * This function is associated with the IP_IOC_IRE_ADVISE_NO_REPLY
- * IOCTL. It is used by TCP (or other ULPs) to supply revised information
- * for an existing CACHED IRE.
+ * Release a ref on an IRE.
+ *
+ * Must not be called while holding any locks. Otherwise if this is
+ * the last reference to be released there is a chance of recursive mutex
+ * panic due to ire_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying
+ * to restart an ioctl. The one exception is when the caller is sure that
+ * this is not the last reference to be released. Eg. if the caller is
+ * sure that the ire has not been deleted and won't be deleted.
+ *
+ * In architectures e.g sun4u, where atomic_add_32_nv is just
+ * a cas, we need to maintain the right memory barrier semantics
+ * as that of mutex_exit i.e all the loads and stores should complete
+ * before the cas is executed. membar_exit() does that here.
*/
-/* ARGSUSED */
-int
-ip_ire_advise(queue_t *q, mblk_t *mp, cred_t *ioc_cr)
+void
+ire_refrele(ire_t *ire)
{
- uchar_t *addr_ucp;
- ipic_t *ipic;
- ire_t *ire;
- ipaddr_t addr;
- in6_addr_t v6addr;
- irb_t *irb;
- zoneid_t zoneid;
- ip_stack_t *ipst = CONNQ_TO_IPST(q);
-
- ASSERT(q->q_next == NULL);
- zoneid = Q_TO_CONN(q)->conn_zoneid;
-
- /*
- * Check privilege using the ioctl credential; if it is NULL
- * then this is a kernel message and therefor privileged.
- */
- if (ioc_cr != NULL && secpolicy_ip_config(ioc_cr, B_FALSE) != 0)
- return (EPERM);
-
- ipic = (ipic_t *)mp->b_rptr;
- if (!(addr_ucp = mi_offset_param(mp, ipic->ipic_addr_offset,
- ipic->ipic_addr_length))) {
- return (EINVAL);
- }
- if (!OK_32PTR(addr_ucp))
- return (EINVAL);
- switch (ipic->ipic_addr_length) {
- case IP_ADDR_LEN: {
- /* Extract the destination address. */
- addr = *(ipaddr_t *)addr_ucp;
- /* Find the corresponding IRE. */
- ire = ire_cache_lookup(addr, zoneid, NULL, ipst);
- break;
- }
- case IPV6_ADDR_LEN: {
- /* Extract the destination address. */
- v6addr = *(in6_addr_t *)addr_ucp;
- /* Find the corresponding IRE. */
- ire = ire_cache_lookup_v6(&v6addr, zoneid, NULL, ipst);
- break;
- }
- default:
- return (EINVAL);
- }
-
- if (ire == NULL)
- return (ENOENT);
- /*
- * Update the round trip time estimate and/or the max frag size
- * and/or the slow start threshold.
- *
- * We serialize multiple advises using ire_lock.
- */
- mutex_enter(&ire->ire_lock);
- if (ipic->ipic_rtt) {
- /*
- * If there is no old cached values, initialize them
- * conservatively. Set them to be (1.5 * new value).
- */
- if (ire->ire_uinfo.iulp_rtt != 0) {
- ire->ire_uinfo.iulp_rtt = (ire->ire_uinfo.iulp_rtt +
- ipic->ipic_rtt) >> 1;
- } else {
- ire->ire_uinfo.iulp_rtt = ipic->ipic_rtt +
- (ipic->ipic_rtt >> 1);
- }
- if (ire->ire_uinfo.iulp_rtt_sd != 0) {
- ire->ire_uinfo.iulp_rtt_sd =
- (ire->ire_uinfo.iulp_rtt_sd +
- ipic->ipic_rtt_sd) >> 1;
- } else {
- ire->ire_uinfo.iulp_rtt_sd = ipic->ipic_rtt_sd +
- (ipic->ipic_rtt_sd >> 1);
- }
- }
- if (ipic->ipic_max_frag)
- ire->ire_max_frag = MIN(ipic->ipic_max_frag, IP_MAXPACKET);
- if (ipic->ipic_ssthresh != 0) {
- if (ire->ire_uinfo.iulp_ssthresh != 0)
- ire->ire_uinfo.iulp_ssthresh =
- (ipic->ipic_ssthresh +
- ire->ire_uinfo.iulp_ssthresh) >> 1;
- else
- ire->ire_uinfo.iulp_ssthresh = ipic->ipic_ssthresh;
- }
- /*
- * Don't need the ire_lock below this. ire_type does not change
- * after initialization. ire_marks is protected by irb_lock.
- */
- mutex_exit(&ire->ire_lock);
-
- if (ipic->ipic_ire_marks != 0 && ire->ire_type == IRE_CACHE) {
- /*
- * Only increment the temporary IRE count if the original
- * IRE is not already marked temporary.
- */
- irb = ire->ire_bucket;
- rw_enter(&irb->irb_lock, RW_WRITER);
- if ((ipic->ipic_ire_marks & IRE_MARK_TEMPORARY) &&
- !(ire->ire_marks & IRE_MARK_TEMPORARY)) {
- irb->irb_tmp_ire_cnt++;
- }
- ire->ire_marks |= ipic->ipic_ire_marks;
- rw_exit(&irb->irb_lock);
- }
+#ifdef DEBUG
+ ire_untrace_ref(ire);
+#endif
+ ASSERT((ire)->ire_refcnt != 0);
+ membar_exit();
+ if (atomic_add_32_nv(&(ire)->ire_refcnt, -1) == 0)
+ ire_inactive(ire);
+}
- ire_refrele(ire);
- return (0);
+void
+ire_refrele_notr(ire_t *ire)
+{
+ ASSERT((ire)->ire_refcnt != 0);
+ membar_exit();
+ if (atomic_add_32_nv(&(ire)->ire_refcnt, -1) == 0)
+ ire_inactive(ire);
}
/*
* This function is associated with the IP_IOC_IRE_DELETE[_NO_REPLY]
- * IOCTL[s]. The NO_REPLY form is used by TCP to delete a route IRE
- * for a host that is not responding. This will force an attempt to
- * establish a new route, if available, and flush out the ARP entry so
- * it will re-resolve. Management processes may want to use the
- * version that generates a reply.
- *
- * This function does not support IPv6 since Neighbor Unreachability Detection
- * means that negative advise like this is useless.
+ * IOCTL[s]. The NO_REPLY form is used by TCP to tell IP that it is
+ * having problems reaching a particular destination.
+ * This will make IP consider alternate routes (e.g., when there are
+ * multiple default routes), and it will also make IP discard any (potentially)
+ * stale redirect.
+ * Management processes may want to use the version that generates a reply.
+ *
+ * With the use of NUD like behavior for IPv4/ARP in addition to IPv6
+ * this function shouldn't be necessary for IP to recover from a bad redirect,
+ * a bad default router (when there are multiple default routers), or
+ * a stale ND/ARP entry. But we retain it in any case.
+ * For instance, this is helpful when TCP suspects a failure before NUD does.
*/
-/* ARGSUSED */
int
ip_ire_delete(queue_t *q, mblk_t *mp, cred_t *ioc_cr)
{
uchar_t *addr_ucp;
- ipaddr_t addr;
+ uint_t ipversion;
+ sin_t *sin;
+ sin6_t *sin6;
+ ipaddr_t v4addr;
+ in6_addr_t v6addr;
ire_t *ire;
ipid_t *ipid;
- boolean_t routing_sock_info = B_FALSE; /* Sent info? */
zoneid_t zoneid;
- ire_t *gire = NULL;
- ill_t *ill;
- mblk_t *arp_mp;
ip_stack_t *ipst;
ASSERT(q->q_next == NULL);
- zoneid = Q_TO_CONN(q)->conn_zoneid;
+ zoneid = IPCL_ZONEID(Q_TO_CONN(q));
ipst = CONNQ_TO_IPST(q);
/*
@@ -563,948 +433,192 @@ ip_ire_delete(queue_t *q, mblk_t *mp, cred_t *ioc_cr)
ipid = (ipid_t *)mp->b_rptr;
- /* Only actions on IRE_CACHEs are acceptable at present. */
- if (ipid->ipid_ire_type != IRE_CACHE)
- return (EINVAL);
-
addr_ucp = mi_offset_param(mp, ipid->ipid_addr_offset,
ipid->ipid_addr_length);
if (addr_ucp == NULL || !OK_32PTR(addr_ucp))
return (EINVAL);
switch (ipid->ipid_addr_length) {
- case IP_ADDR_LEN:
- /* addr_ucp points at IP addr */
- break;
- case sizeof (sin_t): {
- sin_t *sin;
+ case sizeof (sin_t):
/*
* got complete (sockaddr) address - increment addr_ucp to point
* at the ip_addr field.
*/
sin = (sin_t *)addr_ucp;
addr_ucp = (uchar_t *)&sin->sin_addr.s_addr;
+ ipversion = IPV4_VERSION;
+ break;
+ case sizeof (sin6_t):
+ /*
+ * got complete (sockaddr) address - increment addr_ucp to point
+ * at the ip_addr field.
+ */
+ sin6 = (sin6_t *)addr_ucp;
+ addr_ucp = (uchar_t *)&sin6->sin6_addr;
+ ipversion = IPV6_VERSION;
break;
- }
default:
return (EINVAL);
}
- /* Extract the destination address. */
- bcopy(addr_ucp, &addr, IP_ADDR_LEN);
-
- /* Try to find the CACHED IRE. */
- ire = ire_cache_lookup(addr, zoneid, NULL, ipst);
-
- /* Nail it. */
- if (ire) {
- /* Allow delete only on CACHE entries */
- if (ire->ire_type != IRE_CACHE) {
- ire_refrele(ire);
- return (EINVAL);
- }
-
- /*
- * Verify that the IRE has been around for a while.
- * This is to protect against transport protocols
- * that are too eager in sending delete messages.
- */
- if (gethrestime_sec() <
- ire->ire_create_time + ipst->ips_ip_ignore_delete_time) {
- ire_refrele(ire);
- return (EINVAL);
- }
- /*
- * Now we have a potentially dead cache entry. We need
- * to remove it.
- * If this cache entry is generated from a
- * default route (i.e., ire_cmask == 0),
- * search the default list and mark it dead and some
- * background process will try to activate it.
- */
- if ((ire->ire_gateway_addr != 0) && (ire->ire_cmask == 0)) {
- /*
- * Make sure that we pick a different
- * IRE_DEFAULT next time.
- */
- ire_t *gw_ire;
- irb_t *irb = NULL;
- uint_t match_flags;
-
- match_flags = (MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE);
-
- gire = ire_ftable_lookup(ire->ire_addr,
- ire->ire_cmask, 0, 0,
- ire->ire_ipif, NULL, zoneid, 0, NULL, match_flags,
- ipst);
-
- ip3dbg(("ire_ftable_lookup() returned gire %p\n",
- (void *)gire));
-
- if (gire != NULL) {
- irb = gire->ire_bucket;
-
- /*
- * We grab it as writer just to serialize
- * multiple threads trying to bump up
- * irb_rr_origin
- */
- rw_enter(&irb->irb_lock, RW_WRITER);
- if ((gw_ire = irb->irb_rr_origin) == NULL) {
- rw_exit(&irb->irb_lock);
- goto done;
- }
-
- DTRACE_PROBE1(ip__ire__del__origin,
- (ire_t *), gw_ire);
-
- /* Skip past the potentially bad gateway */
- if (ire->ire_gateway_addr ==
- gw_ire->ire_gateway_addr) {
- ire_t *next = gw_ire->ire_next;
-
- DTRACE_PROBE2(ip__ire__del,
- (ire_t *), gw_ire, (irb_t *), irb);
- IRE_FIND_NEXT_ORIGIN(next);
- irb->irb_rr_origin = next;
- }
- rw_exit(&irb->irb_lock);
- }
- }
-done:
- if (gire != NULL)
- IRE_REFRELE(gire);
- /* report the bad route to routing sockets */
- ip_rts_change(RTM_LOSING, ire->ire_addr, ire->ire_gateway_addr,
- ire->ire_mask, ire->ire_src_addr, 0, 0, 0,
- (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_IFA), ipst);
- routing_sock_info = B_TRUE;
+ if (ipversion == IPV4_VERSION) {
+ /* Extract the destination address. */
+ bcopy(addr_ucp, &v4addr, IP_ADDR_LEN);
- /*
- * TCP is really telling us to start over completely, and it
- * expects that we'll resend the ARP query. Tell ARP to
- * discard the entry, if this is a local destination.
- *
- * But, if the ARP entry is permanent then it shouldn't be
- * deleted, so we set ARED_F_PRESERVE_PERM.
- */
- ill = ire->ire_stq->q_ptr;
- if (ire->ire_gateway_addr == 0 &&
- (arp_mp = ill_ared_alloc(ill, addr)) != NULL) {
- ared_t *ared = (ared_t *)arp_mp->b_rptr;
-
- ASSERT(ared->ared_cmd == AR_ENTRY_DELETE);
- ared->ared_flags |= ARED_F_PRESERVE_PERM;
- putnext(ill->ill_rq, arp_mp);
- }
+ ire = ire_ftable_lookup_v4(v4addr, 0, 0, 0, NULL,
+ zoneid, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL);
+ } else {
+ /* Extract the destination address. */
+ bcopy(addr_ucp, &v6addr, IPV6_ADDR_LEN);
- ire_delete(ire);
- ire_refrele(ire);
+ ire = ire_ftable_lookup_v6(&v6addr, NULL, NULL, 0, NULL,
+ zoneid, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL);
}
- /*
- * Also look for an IRE_HOST type redirect ire and
- * remove it if present.
- */
- ire = ire_route_lookup(addr, 0, 0, IRE_HOST, NULL, NULL,
- ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
-
- /* Nail it. */
if (ire != NULL) {
- if (ire->ire_flags & RTF_DYNAMIC) {
- if (!routing_sock_info) {
- ip_rts_change(RTM_LOSING, ire->ire_addr,
- ire->ire_gateway_addr, ire->ire_mask,
- ire->ire_src_addr, 0, 0, 0,
- (RTA_DST | RTA_GATEWAY |
- RTA_NETMASK | RTA_IFA),
- ipst);
- }
- ire_delete(ire);
- }
+ if (ipversion == IPV4_VERSION) {
+ ip_rts_change(RTM_LOSING, ire->ire_addr,
+ ire->ire_gateway_addr, ire->ire_mask,
+ (Q_TO_CONN(q))->conn_laddr_v4, 0, 0, 0,
+ (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_IFA),
+ ire->ire_ipst);
+ }
+ (void) ire_no_good(ire);
ire_refrele(ire);
}
return (0);
}
/*
- * ip_ire_req is called by ip_wput when an IRE_DB_REQ_TYPE message is handed
- * down from the Upper Level Protocol to request a copy of the IRE (to check
- * its type or to extract information like round-trip time estimates or the
- * MTU.)
- * The address is assumed to be in the ire_addr field. If no IRE is found
- * an IRE is returned with ire_type being zero.
- * Note that the upper lavel protocol has to check for broadcast
- * (IRE_BROADCAST) and multicast (CLASSD(addr)).
- * If there is a b_cont the resulting IRE_DB_TYPE mblk is placed at the
- * end of the returned message.
- *
- * TCP sends down a message of this type with a connection request packet
- * chained on. UDP and ICMP send it down to verify that a route exists for
- * the destination address when they get connected.
- */
-void
-ip_ire_req(queue_t *q, mblk_t *mp)
-{
- ire_t *inire;
- ire_t *ire;
- mblk_t *mp1;
- ire_t *sire = NULL;
- zoneid_t zoneid = Q_TO_CONN(q)->conn_zoneid;
- ip_stack_t *ipst = CONNQ_TO_IPST(q);
-
- ASSERT(q->q_next == NULL);
-
- if ((mp->b_wptr - mp->b_rptr) < sizeof (ire_t) ||
- !OK_32PTR(mp->b_rptr)) {
- freemsg(mp);
- return;
- }
- inire = (ire_t *)mp->b_rptr;
- /*
- * Got it, now take our best shot at an IRE.
- */
- if (inire->ire_ipversion == IPV6_VERSION) {
- ire = ire_route_lookup_v6(&inire->ire_addr_v6, 0, 0, 0,
- NULL, &sire, zoneid, NULL,
- (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT), ipst);
- } else {
- ASSERT(inire->ire_ipversion == IPV4_VERSION);
- ire = ire_route_lookup(inire->ire_addr, 0, 0, 0,
- NULL, &sire, zoneid, NULL,
- (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT), ipst);
- }
-
- /*
- * We prevent returning IRES with source address INADDR_ANY
- * as these were temporarily created for sending packets
- * from endpoints that have conn_unspec_src set.
- */
- if (ire == NULL ||
- (ire->ire_ipversion == IPV4_VERSION &&
- ire->ire_src_addr == INADDR_ANY) ||
- (ire->ire_ipversion == IPV6_VERSION &&
- IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6))) {
- inire->ire_type = 0;
- } else {
- bcopy(ire, inire, sizeof (ire_t));
- /* Copy the route metrics from the parent. */
- if (sire != NULL) {
- bcopy(&(sire->ire_uinfo), &(inire->ire_uinfo),
- sizeof (iulp_t));
- }
-
- /* Pass the latest setting of the ip_path_mtu_discovery */
- inire->ire_frag_flag |=
- (ipst->ips_ip_path_mtu_discovery) ? IPH_DF : 0;
- }
- if (ire != NULL)
- ire_refrele(ire);
- if (sire != NULL)
- ire_refrele(sire);
- mp->b_wptr = &mp->b_rptr[sizeof (ire_t)];
- mp->b_datap->db_type = IRE_DB_TYPE;
-
- /* Put the IRE_DB_TYPE mblk last in the chain */
- mp1 = mp->b_cont;
- if (mp1 != NULL) {
- mp->b_cont = NULL;
- linkb(mp1, mp);
- mp = mp1;
- }
- qreply(q, mp);
-}
-
-/*
- * Send a packet using the specified IRE.
- * If ire_src_addr_v6 is all zero then discard the IRE after
- * the packet has been sent.
- */
-static void
-ire_send(queue_t *q, mblk_t *pkt, ire_t *ire)
-{
- mblk_t *ipsec_mp;
- boolean_t is_secure;
- uint_t ifindex;
- ill_t *ill;
- zoneid_t zoneid = ire->ire_zoneid;
- ip_stack_t *ipst = ire->ire_ipst;
-
- ASSERT(ire->ire_ipversion == IPV4_VERSION);
- ASSERT(!(ire->ire_type & IRE_LOCAL)); /* Has different ire_zoneid */
- ipsec_mp = pkt;
- is_secure = (pkt->b_datap->db_type == M_CTL);
- if (is_secure) {
- ipsec_out_t *io;
-
- pkt = pkt->b_cont;
- io = (ipsec_out_t *)ipsec_mp->b_rptr;
- if (io->ipsec_out_type == IPSEC_OUT)
- zoneid = io->ipsec_out_zoneid;
- }
-
- /* If the packet originated externally then */
- if (pkt->b_prev) {
- ire_refrele(ire);
- /*
- * Extract the ifindex from b_prev (set in ip_rput_noire).
- * Look up interface to see if it still exists (it could have
- * been unplumbed by the time the reply came back from ARP)
- */
- ifindex = (uint_t)(uintptr_t)pkt->b_prev;
- ill = ill_lookup_on_ifindex(ifindex, B_FALSE,
- NULL, NULL, NULL, NULL, ipst);
- if (ill == NULL) {
- pkt->b_prev = NULL;
- pkt->b_next = NULL;
- freemsg(ipsec_mp);
- return;
- }
- q = ill->ill_rq;
- pkt->b_prev = NULL;
- /*
- * This packet has not gone through IPSEC processing
- * and hence we should not have any IPSEC message
- * prepended.
- */
- ASSERT(ipsec_mp == pkt);
- put(q, pkt);
- ill_refrele(ill);
- } else if (pkt->b_next) {
- /* Packets from multicast router */
- pkt->b_next = NULL;
- /*
- * We never get the IPSEC_OUT while forwarding the
- * packet for multicast router.
- */
- ASSERT(ipsec_mp == pkt);
- ip_rput_forward(ire, (ipha_t *)pkt->b_rptr, ipsec_mp, NULL);
- ire_refrele(ire);
- } else {
- /* Locally originated packets */
- boolean_t delete_ire = B_FALSE;
- ipha_t *ipha = (ipha_t *)pkt->b_rptr;
-
- /*
- * If this IRE shouldn't be kept in the table (because its
- * source address is unspecified), hold a reference to it so
- * we can delete it even after e.g. ip_wput_ire() has dropped
- * its reference.
- */
- if (!(ire->ire_marks & IRE_MARK_NOADD) &&
- ire->ire_src_addr == INADDR_ANY) {
- delete_ire = B_TRUE;
- IRE_REFHOLD(ire);
- }
-
- /*
- * If we were resolving a router we can not use the
- * routers IRE for sending the packet (since it would
- * violate the uniqness of the IP idents) thus we
- * make another pass through ip_wput to create the IRE_CACHE
- * for the destination.
- * When IRE_MARK_NOADD is set, ire_add() is not called.
- * Thus ip_wput() will never find a ire and result in an
- * infinite loop. Thus we check whether IRE_MARK_NOADD is
- * is set. This also implies that IRE_MARK_NOADD can only be
- * used to send packets to directly connected hosts.
- */
- if (ipha->ipha_dst != ire->ire_addr &&
- !(ire->ire_marks & IRE_MARK_NOADD)) {
- ire_refrele(ire); /* Held in ire_add */
- if (CONN_Q(q)) {
- (void) ip_output(Q_TO_CONN(q), ipsec_mp, q,
- IRE_SEND);
- } else {
- (void) ip_output((void *)(uintptr_t)zoneid,
- ipsec_mp, q, IRE_SEND);
- }
- } else {
- if (is_secure) {
- ipsec_out_t *oi;
- ipha_t *ipha;
-
- oi = (ipsec_out_t *)ipsec_mp->b_rptr;
- ipha = (ipha_t *)ipsec_mp->b_cont->b_rptr;
- if (oi->ipsec_out_proc_begin) {
- /*
- * This is the case where
- * ip_wput_ipsec_out could not find
- * the IRE and recreated a new one.
- * As ip_wput_ipsec_out does ire
- * lookups, ire_refrele for the extra
- * bump in ire_add.
- */
- ire_refrele(ire);
- ip_wput_ipsec_out(q, ipsec_mp, ipha,
- NULL, NULL);
- } else {
- /*
- * IRE_REFRELE will be done in
- * ip_wput_ire.
- */
- ip_wput_ire(q, ipsec_mp, ire, NULL,
- IRE_SEND, zoneid);
- }
- } else {
- /*
- * IRE_REFRELE will be done in ip_wput_ire.
- */
- ip_wput_ire(q, ipsec_mp, ire, NULL,
- IRE_SEND, zoneid);
- }
- }
- /*
- * Special code to support sending a single packet with
- * conn_unspec_src using an IRE which has no source address.
- * The IRE is deleted here after sending the packet to avoid
- * having other code trip on it. But before we delete the
- * ire, somebody could have looked up this ire.
- * We prevent returning/using this IRE by the upper layers
- * by making checks to NULL source address in other places
- * like e.g ip_ire_append, ip_ire_req and ip_bind_connected.
- * Though this does not completely prevent other threads
- * from using this ire, this should not cause any problems.
- */
- if (delete_ire) {
- ip1dbg(("ire_send: delete IRE\n"));
- ire_delete(ire);
- ire_refrele(ire); /* Held above */
- }
- }
-}
-
-/*
- * Send a packet using the specified IRE.
- * If ire_src_addr_v6 is all zero then discard the IRE after
- * the packet has been sent.
- */
-static void
-ire_send_v6(queue_t *q, mblk_t *pkt, ire_t *ire)
-{
- mblk_t *ipsec_mp;
- boolean_t secure;
- uint_t ifindex;
- zoneid_t zoneid = ire->ire_zoneid;
- ip_stack_t *ipst = ire->ire_ipst;
-
- ASSERT(ire->ire_ipversion == IPV6_VERSION);
- ASSERT(!(ire->ire_type & IRE_LOCAL)); /* Has different ire_zoneid */
- if (pkt->b_datap->db_type == M_CTL) {
- ipsec_out_t *io;
-
- ipsec_mp = pkt;
- pkt = pkt->b_cont;
- secure = B_TRUE;
- io = (ipsec_out_t *)ipsec_mp->b_rptr;
- if (io->ipsec_out_type == IPSEC_OUT)
- zoneid = io->ipsec_out_zoneid;
- } else {
- ipsec_mp = pkt;
- secure = B_FALSE;
- }
-
- /* If the packet originated externally then */
- if (pkt->b_prev) {
- ill_t *ill;
- /*
- * Extract the ifindex from b_prev (set in ip_rput_data_v6).
- * Look up interface to see if it still exists (it could have
- * been unplumbed by the time the reply came back from the
- * resolver).
- */
- ifindex = (uint_t)(uintptr_t)pkt->b_prev;
- ill = ill_lookup_on_ifindex(ifindex, B_TRUE,
- NULL, NULL, NULL, NULL, ipst);
- if (ill == NULL) {
- pkt->b_prev = NULL;
- pkt->b_next = NULL;
- freemsg(ipsec_mp);
- ire_refrele(ire); /* Held in ire_add */
- return;
- }
- q = ill->ill_rq;
- pkt->b_prev = NULL;
- /*
- * This packet has not gone through IPSEC processing
- * and hence we should not have any IPSEC message
- * prepended.
- */
- ASSERT(ipsec_mp == pkt);
- put(q, pkt);
- ill_refrele(ill);
- } else if (pkt->b_next) {
- /* Packets from multicast router */
- pkt->b_next = NULL;
- /*
- * We never get the IPSEC_OUT while forwarding the
- * packet for multicast router.
- */
- ASSERT(ipsec_mp == pkt);
- /*
- * XXX TODO IPv6.
- */
- freemsg(pkt);
-#ifdef XXX
- ip_rput_forward(ire, (ipha_t *)pkt->b_rptr, pkt, NULL);
-#endif
- } else {
- if (secure) {
- ipsec_out_t *oi;
- ip6_t *ip6h;
-
- oi = (ipsec_out_t *)ipsec_mp->b_rptr;
- ip6h = (ip6_t *)ipsec_mp->b_cont->b_rptr;
- if (oi->ipsec_out_proc_begin) {
- /*
- * This is the case where
- * ip_wput_ipsec_out could not find
- * the IRE and recreated a new one.
- */
- ip_wput_ipsec_out_v6(q, ipsec_mp, ip6h,
- NULL, NULL);
- } else {
- if (CONN_Q(q)) {
- (void) ip_output_v6(Q_TO_CONN(q),
- ipsec_mp, q, IRE_SEND);
- } else {
- (void) ip_output_v6(
- (void *)(uintptr_t)zoneid,
- ipsec_mp, q, IRE_SEND);
- }
- }
- } else {
- /*
- * Send packets through ip_output_v6 so that any
- * ip6_info header can be processed again.
- */
- if (CONN_Q(q)) {
- (void) ip_output_v6(Q_TO_CONN(q), ipsec_mp, q,
- IRE_SEND);
- } else {
- (void) ip_output_v6((void *)(uintptr_t)zoneid,
- ipsec_mp, q, IRE_SEND);
- }
- }
- /*
- * Special code to support sending a single packet with
- * conn_unspec_src using an IRE which has no source address.
- * The IRE is deleted here after sending the packet to avoid
- * having other code trip on it. But before we delete the
- * ire, somebody could have looked up this ire.
- * We prevent returning/using this IRE by the upper layers
- * by making checks to NULL source address in other places
- * like e.g ip_ire_append_v6, ip_ire_req and
- * ip_bind_connected_v6. Though, this does not completely
- * prevent other threads from using this ire, this should
- * not cause any problems.
- */
- if (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6)) {
- ip1dbg(("ire_send_v6: delete IRE\n"));
- ire_delete(ire);
- }
- }
- ire_refrele(ire); /* Held in ire_add */
-}
-
-/*
- * Make sure that IRE bucket does not get too long.
- * This can cause lock up because ire_cache_lookup()
- * may take "forever" to finish.
- *
- * We only remove a maximum of cnt IREs each time. This
- * should keep the bucket length approximately constant,
- * depending on cnt. This should be enough to defend
- * against DoS attack based on creating temporary IREs
- * (for forwarding and non-TCP traffic).
- *
- * We also pass in the address of the newly created IRE
- * as we do not want to remove this straight after adding
- * it. New IREs are normally added at the tail of the
- * bucket. This means that we are removing the "oldest"
- * temporary IREs added. Only if there are IREs with
- * the same ire_addr, do we not add it at the tail. Refer
- * to ire_add_v*(). It should be OK for our purpose.
- *
- * For non-temporary cached IREs, we make sure that they
- * have not been used for some time (defined below), they
- * are non-local destinations, and there is no one using
- * them at the moment (refcnt == 1).
- *
- * The above means that the IRE bucket length may become
- * very long, consisting of mostly non-temporary IREs.
- * This can happen when the hash function does a bad job
- * so that most TCP connections cluster to a specific bucket.
- * This "hopefully" should never happen. It can also
- * happen if most TCP connections have very long lives.
- * Even with the minimal hash table size of 256, there
- * has to be a lot of such connections to make the bucket
- * length unreasonably long. This should probably not
- * happen either. The third can when this can happen is
- * when the machine is under attack, such as SYN flooding.
- * TCP should already have the proper mechanism to protect
- * that. So we should be safe.
- *
- * This function is called by ire_add_then_send() after
- * a new IRE is added and the packet is sent.
- *
- * The idle cutoff interval is set to 60s. It can be
- * changed using /etc/system.
- */
-uint32_t ire_idle_cutoff_interval = 60000;
-
-static void
-ire_cache_cleanup(irb_t *irb, uint32_t threshold, ire_t *ref_ire)
-{
- ire_t *ire;
- clock_t cut_off = drv_usectohz(ire_idle_cutoff_interval * 1000);
- int cnt = ip_ire_cleanup_cnt;
-
- /*
- * Try to remove cnt temporary IREs first.
- */
- for (ire = irb->irb_ire; cnt > 0 && ire != NULL; ire = ire->ire_next) {
- if (ire == ref_ire)
- continue;
- if (ire->ire_marks & IRE_MARK_CONDEMNED)
- continue;
- if (ire->ire_marks & IRE_MARK_TEMPORARY) {
- ASSERT(ire->ire_type == IRE_CACHE);
- ire_delete(ire);
- cnt--;
- }
- }
- if (cnt == 0)
- return;
-
- /*
- * If we didn't satisfy our removal target from temporary IREs
- * we see how many non-temporary IREs are currently in the bucket.
- * If this quantity is above the threshold then we see if there are any
- * candidates for removal. We are still limited to removing a maximum
- * of cnt IREs.
- */
- if ((irb->irb_ire_cnt - irb->irb_tmp_ire_cnt) > threshold) {
- for (ire = irb->irb_ire; cnt > 0 && ire != NULL;
- ire = ire->ire_next) {
- if (ire == ref_ire)
- continue;
- if (ire->ire_type != IRE_CACHE)
- continue;
- if (ire->ire_marks & IRE_MARK_CONDEMNED)
- continue;
- if ((ire->ire_refcnt == 1) &&
- (lbolt - ire->ire_last_used_time > cut_off)) {
- ire_delete(ire);
- cnt--;
- }
- }
- }
-}
-
-/*
- * ire_add_then_send is called when a new IRE has been created in order to
- * route an outgoing packet. Typically, it is called from ip_wput when
- * a response comes back down from a resolver. We add the IRE, and then
- * possibly run the packet through ip_wput or ip_rput, as appropriate.
- * However, we do not add the newly created IRE in the cache when
- * IRE_MARK_NOADD is set in the IRE. IRE_MARK_NOADD is set at
- * ip_newroute_ipif(). The ires with IRE_MARK_NOADD are ire_refrele'd by
- * ip_wput_ire() and get deleted.
- * Multirouting support: the packet is silently discarded when the new IRE
- * holds the RTF_MULTIRT flag, but is not the first IRE to be added with the
- * RTF_MULTIRT flag for the same destination address.
- * In this case, we just want to register this additional ire without
- * sending the packet, as it has already been replicated through
- * existing multirt routes in ip_wput().
- */
-void
-ire_add_then_send(queue_t *q, ire_t *ire, mblk_t *mp)
-{
- irb_t *irb;
- boolean_t drop = B_FALSE;
- boolean_t mctl_present;
- mblk_t *first_mp = NULL;
- mblk_t *data_mp = NULL;
- ire_t *dst_ire;
- ipha_t *ipha;
- ip6_t *ip6h;
- ip_stack_t *ipst = ire->ire_ipst;
- int ire_limit;
-
- if (mp != NULL) {
- /*
- * We first have to retrieve the destination address carried
- * by the packet.
- * We can't rely on ire as it can be related to a gateway.
- * The destination address will help in determining if
- * other RTF_MULTIRT ires are already registered.
- *
- * We first need to know where we are going : v4 or V6.
- * the ire version is enough, as there is no risk that
- * we resolve an IPv6 address with an IPv4 ire
- * or vice versa.
- */
- EXTRACT_PKT_MP(mp, first_mp, mctl_present);
- data_mp = mp;
- mp = first_mp;
- if (ire->ire_ipversion == IPV4_VERSION) {
- ipha = (ipha_t *)data_mp->b_rptr;
- dst_ire = ire_cache_lookup(ipha->ipha_dst,
- ire->ire_zoneid, msg_getlabel(mp), ipst);
- } else {
- ASSERT(ire->ire_ipversion == IPV6_VERSION);
- ip6h = (ip6_t *)data_mp->b_rptr;
- dst_ire = ire_cache_lookup_v6(&ip6h->ip6_dst,
- ire->ire_zoneid, msg_getlabel(mp), ipst);
- }
- if (dst_ire != NULL) {
- if (dst_ire->ire_flags & RTF_MULTIRT) {
- /*
- * At least one resolved multirt route
- * already exists for the destination,
- * don't sent this packet: either drop it
- * or complete the pending resolution,
- * depending on the ire.
- */
- drop = B_TRUE;
- }
- ip1dbg(("ire_add_then_send: dst_ire %p "
- "[dst %08x, gw %08x], drop %d\n",
- (void *)dst_ire,
- (dst_ire->ire_ipversion == IPV4_VERSION) ? \
- ntohl(dst_ire->ire_addr) : \
- ntohl(V4_PART_OF_V6(dst_ire->ire_addr_v6)),
- (dst_ire->ire_ipversion == IPV4_VERSION) ? \
- ntohl(dst_ire->ire_gateway_addr) : \
- ntohl(V4_PART_OF_V6(
- dst_ire->ire_gateway_addr_v6)),
- drop));
- ire_refrele(dst_ire);
- }
- }
-
- if (!(ire->ire_marks & IRE_MARK_NOADD)) {
- /* Regular packets with cache bound ires are here. */
- (void) ire_add(&ire, NULL, NULL, NULL, B_FALSE);
-
- if (ire == NULL) {
- mp->b_prev = NULL;
- mp->b_next = NULL;
- MULTIRT_DEBUG_UNTAG(mp);
- freemsg(mp);
- return;
- }
- if (mp == NULL) {
- ire_refrele(ire); /* Held in ire_add_v4/v6 */
- return;
- }
- }
- if (drop) {
- /*
- * If we're adding an RTF_MULTIRT ire, the resolution
- * is over: we just drop the packet.
- */
- if (ire->ire_flags & RTF_MULTIRT) {
- data_mp->b_prev = NULL;
- data_mp->b_next = NULL;
- MULTIRT_DEBUG_UNTAG(mp);
- freemsg(mp);
- } else {
- /*
- * Otherwise, we're adding the ire to a gateway
- * for a multirt route.
- * Invoke ip_newroute() to complete the resolution
- * of the route. We will then come back here and
- * finally drop this packet in the above code.
- */
- if (ire->ire_ipversion == IPV4_VERSION) {
- /*
- * TODO: in order for CGTP to work in non-global
- * zones, ip_newroute() must create the IRE
- * cache in the zone indicated by
- * ire->ire_zoneid.
- */
- ip_newroute(q, mp, ipha->ipha_dst,
- (CONN_Q(q) ? Q_TO_CONN(q) : NULL),
- ire->ire_zoneid, ipst);
- } else {
- int minlen = sizeof (ip6i_t) + IPV6_HDR_LEN;
-
- ASSERT(ire->ire_ipversion == IPV6_VERSION);
-
- /*
- * If necessary, skip over the ip6i_t to find
- * the header with the actual source address.
- */
- if (ip6h->ip6_nxt == IPPROTO_RAW) {
- if (MBLKL(data_mp) < minlen &&
- pullupmsg(data_mp, -1) == 0) {
- ip1dbg(("ire_add_then_send: "
- "cannot pullupmsg ip6i\n"));
- if (mctl_present)
- freeb(first_mp);
- ire_refrele(ire);
- return;
- }
- ASSERT(MBLKL(data_mp) >= IPV6_HDR_LEN);
- ip6h = (ip6_t *)(data_mp->b_rptr +
- sizeof (ip6i_t));
- }
- ip_newroute_v6(q, mp, &ip6h->ip6_dst,
- &ip6h->ip6_src, NULL, ire->ire_zoneid,
- ipst);
- }
- }
-
- ire_refrele(ire); /* As done by ire_send(). */
- return;
- }
- /*
- * Need to remember ire_bucket here as ire_send*() may delete
- * the ire so we cannot reference it after that.
- */
- irb = ire->ire_bucket;
- if (ire->ire_ipversion == IPV4_VERSION) {
- ire_send(q, mp, ire);
- ire_limit = ip_ire_max_bucket_cnt;
- } else {
- ire_send_v6(q, mp, ire);
- ire_limit = ip6_ire_max_bucket_cnt;
- }
-
- /*
- * irb is NULL if the IRE was not added to the hash. This happens
- * when IRE_MARK_NOADD is set and when IREs are returned from
- * ire_update_srcif_v4().
- */
- if (irb != NULL) {
- IRB_REFHOLD(irb);
- if (irb->irb_ire_cnt > ire_limit)
- ire_cache_cleanup(irb, ire_limit, ire);
- IRB_REFRELE(irb);
- }
-}
-
-/*
* Initialize the ire that is specific to IPv4 part and call
* ire_init_common to finish it.
+ * Returns zero or errno.
*/
-ire_t *
-ire_init(ire_t *ire, uchar_t *addr, uchar_t *mask, uchar_t *src_addr,
- uchar_t *gateway, uint_t *max_fragp, nce_t *src_nce, queue_t *rfq,
- queue_t *stq, ushort_t type, ipif_t *ipif, ipaddr_t cmask, uint32_t phandle,
- uint32_t ihandle, uint32_t flags, const iulp_t *ulp_info, tsol_gc_t *gc,
- tsol_gcgrp_t *gcgrp, ip_stack_t *ipst)
+int
+ire_init_v4(ire_t *ire, uchar_t *addr, uchar_t *mask, uchar_t *gateway,
+ ushort_t type, ill_t *ill, zoneid_t zoneid, uint_t flags,
+ tsol_gc_t *gc, ip_stack_t *ipst)
{
- ASSERT(type != IRE_CACHE || stq != NULL);
+ int error;
+
/*
* Reject IRE security attribute creation/initialization
* if system is not running in Trusted mode.
*/
- if ((gc != NULL || gcgrp != NULL) && !is_system_labeled())
- return (NULL);
+ if (gc != NULL && !is_system_labeled())
+ return (EINVAL);
BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_alloced);
if (addr != NULL)
bcopy(addr, &ire->ire_addr, IP_ADDR_LEN);
- if (src_addr != NULL)
- bcopy(src_addr, &ire->ire_src_addr, IP_ADDR_LEN);
- if (mask != NULL) {
- bcopy(mask, &ire->ire_mask, IP_ADDR_LEN);
- ire->ire_masklen = ip_mask_to_plen(ire->ire_mask);
- }
- if (gateway != NULL) {
+ if (gateway != NULL)
bcopy(gateway, &ire->ire_gateway_addr, IP_ADDR_LEN);
+
+ /* Make sure we don't have stray values in some fields */
+ switch (type) {
+ case IRE_LOOPBACK:
+ bcopy(&ire->ire_addr, &ire->ire_gateway_addr, IP_ADDR_LEN);
+ /* FALLTHRU */
+ case IRE_HOST:
+ case IRE_BROADCAST:
+ case IRE_LOCAL:
+ case IRE_IF_CLONE:
+ ire->ire_mask = IP_HOST_MASK;
+ ire->ire_masklen = IPV4_ABITS;
+ break;
+ case IRE_PREFIX:
+ case IRE_DEFAULT:
+ case IRE_IF_RESOLVER:
+ case IRE_IF_NORESOLVER:
+ if (mask != NULL) {
+ bcopy(mask, &ire->ire_mask, IP_ADDR_LEN);
+ ire->ire_masklen = ip_mask_to_plen(ire->ire_mask);
+ }
+ break;
+ case IRE_MULTICAST:
+ case IRE_NOROUTE:
+ ASSERT(mask == NULL);
+ break;
+ default:
+ ASSERT(0);
+ return (EINVAL);
}
- if (type == IRE_CACHE)
- ire->ire_cmask = cmask;
+ error = ire_init_common(ire, type, ill, zoneid, flags, IPV4_VERSION,
+ gc, ipst);
+ if (error != NULL)
+ return (error);
- /* ire_init_common will free the mblks upon encountering any failure */
- if (!ire_init_common(ire, max_fragp, src_nce, rfq, stq, type, ipif,
- phandle, ihandle, flags, IPV4_VERSION, ulp_info, gc, gcgrp, ipst))
- return (NULL);
+ /* Determine which function pointers to use */
+ ire->ire_postfragfn = ip_xmit; /* Common case */
- return (ire);
+ switch (ire->ire_type) {
+ case IRE_LOCAL:
+ ire->ire_sendfn = ire_send_local_v4;
+ ire->ire_recvfn = ire_recv_local_v4;
+#ifdef SO_VRRP
+ ASSERT(ire->ire_ill != NULL);
+ if (ire->ire_ill->ill_flags & ILLF_NOACCEPT) {
+ ire->ire_noaccept = B_TRUE;
+ ire->ire_recvfn = ire_recv_noaccept_v6;
+ }
+#endif
+ break;
+ case IRE_LOOPBACK:
+ ire->ire_sendfn = ire_send_local_v4;
+ ire->ire_recvfn = ire_recv_loopback_v4;
+ break;
+ case IRE_BROADCAST:
+ ire->ire_postfragfn = ip_postfrag_loopcheck;
+ ire->ire_sendfn = ire_send_broadcast_v4;
+ ire->ire_recvfn = ire_recv_broadcast_v4;
+ break;
+ case IRE_MULTICAST:
+ ire->ire_postfragfn = ip_postfrag_loopcheck;
+ ire->ire_sendfn = ire_send_multicast_v4;
+ ire->ire_recvfn = ire_recv_multicast_v4;
+ break;
+ default:
+ /*
+ * For IRE_IF_ALL and IRE_OFFLINK we forward received
+ * packets by default.
+ */
+ ire->ire_sendfn = ire_send_wire_v4;
+ ire->ire_recvfn = ire_recv_forward_v4;
+ break;
+ }
+ if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
+ ire->ire_sendfn = ire_send_noroute_v4;
+ ire->ire_recvfn = ire_recv_noroute_v4;
+ } else if (ire->ire_flags & RTF_MULTIRT) {
+ ire->ire_postfragfn = ip_postfrag_multirt_v4;
+ ire->ire_sendfn = ire_send_multirt_v4;
+ /* Multirt receive of broadcast uses ire_recv_broadcast_v4 */
+ if (ire->ire_type != IRE_BROADCAST)
+ ire->ire_recvfn = ire_recv_multirt_v4;
+ }
+ ire->ire_nce_capable = ire_determine_nce_capable(ire);
+ return (0);
}
/*
- * Similar to ire_create except that it is called only when
- * we want to allocate ire as an mblk e.g. we have an external
- * resolver ARP.
+ * Determine ire_nce_capable
*/
-ire_t *
-ire_create_mp(uchar_t *addr, uchar_t *mask, uchar_t *src_addr, uchar_t *gateway,
- uint_t max_frag, nce_t *src_nce, queue_t *rfq, queue_t *stq, ushort_t type,
- ipif_t *ipif, ipaddr_t cmask, uint32_t phandle, uint32_t ihandle,
- uint32_t flags, const iulp_t *ulp_info, tsol_gc_t *gc, tsol_gcgrp_t *gcgrp,
- ip_stack_t *ipst)
+boolean_t
+ire_determine_nce_capable(ire_t *ire)
{
- ire_t *ire, *buf;
- ire_t *ret_ire;
- mblk_t *mp;
- size_t bufsize;
- frtn_t *frtnp;
- ill_t *ill;
+ int max_masklen;
- bufsize = sizeof (ire_t) + sizeof (frtn_t);
- buf = kmem_alloc(bufsize, KM_NOSLEEP);
- if (buf == NULL) {
- ip1dbg(("ire_create_mp: alloc failed\n"));
- return (NULL);
- }
- frtnp = (frtn_t *)(buf + 1);
- frtnp->free_arg = (caddr_t)buf;
- frtnp->free_func = ire_freemblk;
+ if ((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
+ (ire->ire_type & IRE_MULTICAST))
+ return (B_TRUE);
- /*
- * Allocate the new IRE. The ire created will hold a ref on
- * an nce_t after ire_nce_init, and this ref must either be
- * (a) transferred to the ire_cache entry created when ire_add_v4
- * is called after successful arp resolution, or,
- * (b) released, when arp resolution fails
- * Case (b) is handled in ire_freemblk() which will be called
- * when mp is freed as a result of failed arp.
- */
- mp = esballoc((unsigned char *)buf, bufsize, BPRI_MED, frtnp);
- if (mp == NULL) {
- ip1dbg(("ire_create_mp: alloc failed\n"));
- kmem_free(buf, bufsize);
- return (NULL);
- }
- ire = (ire_t *)mp->b_rptr;
- mp->b_wptr = (uchar_t *)&ire[1];
+ if (ire->ire_ipversion == IPV4_VERSION)
+ max_masklen = IPV4_ABITS;
+ else
+ max_masklen = IPV6_ABITS;
- /* Start clean. */
- *ire = ire_null;
- ire->ire_mp = mp;
- mp->b_datap->db_type = IRE_DB_TYPE;
- ire->ire_marks |= IRE_MARK_UNCACHED;
-
- ret_ire = ire_init(ire, addr, mask, src_addr, gateway, NULL, src_nce,
- rfq, stq, type, ipif, cmask, phandle, ihandle, flags, ulp_info, gc,
- gcgrp, ipst);
-
- ill = (ill_t *)(stq->q_ptr);
- if (ret_ire == NULL) {
- /* ire_freemblk needs these set */
- ire->ire_stq_ifindex = ill->ill_phyint->phyint_ifindex;
- ire->ire_stackid = ipst->ips_netstack->netstack_stackid;
- ire->ire_ipst = ipst;
- freeb(ire->ire_mp);
- return (NULL);
- }
- ret_ire->ire_stq_ifindex = ill->ill_phyint->phyint_ifindex;
- ret_ire->ire_stackid = ipst->ips_netstack->netstack_stackid;
- ASSERT(ret_ire == ire);
- ASSERT(ret_ire->ire_ipst == ipst);
- /*
- * ire_max_frag is normally zero here and is atomically set
- * under the irebucket lock in ire_add_v[46] except for the
- * case of IRE_MARK_NOADD. In that event the the ire_max_frag
- * is non-zero here.
- */
- ire->ire_max_frag = max_frag;
- return (ire);
+ if ((ire->ire_type & IRE_ONLINK) && ire->ire_masklen == max_masklen)
+ return (B_TRUE);
+ return (B_FALSE);
}
/*
@@ -1514,49 +628,43 @@ ire_create_mp(uchar_t *addr, uchar_t *mask, uchar_t *src_addr, uchar_t *gateway,
* by this function.
*/
ire_t *
-ire_create(uchar_t *addr, uchar_t *mask, uchar_t *src_addr, uchar_t *gateway,
- uint_t *max_fragp, nce_t *src_nce, queue_t *rfq, queue_t *stq,
- ushort_t type, ipif_t *ipif, ipaddr_t cmask, uint32_t phandle,
- uint32_t ihandle, uint32_t flags, const iulp_t *ulp_info, tsol_gc_t *gc,
- tsol_gcgrp_t *gcgrp, ip_stack_t *ipst)
+ire_create(uchar_t *addr, uchar_t *mask, uchar_t *gateway,
+ ushort_t type, ill_t *ill, zoneid_t zoneid, uint_t flags, tsol_gc_t *gc,
+ ip_stack_t *ipst)
{
ire_t *ire;
- ire_t *ret_ire;
+ int error;
ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP);
if (ire == NULL) {
- ip1dbg(("ire_create: alloc failed\n"));
+ DTRACE_PROBE(kmem__cache__alloc);
return (NULL);
}
*ire = ire_null;
- ret_ire = ire_init(ire, addr, mask, src_addr, gateway, max_fragp,
- src_nce, rfq, stq, type, ipif, cmask, phandle, ihandle, flags,
- ulp_info, gc, gcgrp, ipst);
-
- if (ret_ire == NULL) {
+ error = ire_init_v4(ire, addr, mask, gateway, type, ill, zoneid, flags,
+ gc, ipst);
+ if (error != 0) {
+ DTRACE_PROBE2(ire__init, ire_t *, ire, int, error);
kmem_cache_free(ire_cache, ire);
return (NULL);
}
- ASSERT(ret_ire == ire);
return (ire);
}
/*
* Common to IPv4 and IPv6
+ * Returns zero or errno.
*/
-boolean_t
-ire_init_common(ire_t *ire, uint_t *max_fragp, nce_t *src_nce, queue_t *rfq,
- queue_t *stq, ushort_t type, ipif_t *ipif, uint32_t phandle,
- uint32_t ihandle, uint32_t flags, uchar_t ipversion, const iulp_t *ulp_info,
- tsol_gc_t *gc, tsol_gcgrp_t *gcgrp, ip_stack_t *ipst)
+int
+ire_init_common(ire_t *ire, ushort_t type, ill_t *ill, zoneid_t zoneid,
+ uint_t flags, uchar_t ipversion, tsol_gc_t *gc, ip_stack_t *ipst)
{
- ire->ire_max_fragp = max_fragp;
- ire->ire_frag_flag |= (ipst->ips_ip_path_mtu_discovery) ? IPH_DF : 0;
+ int error;
#ifdef DEBUG
- if (ipif != NULL) {
- if (ipif->ipif_isv6)
+ if (ill != NULL) {
+ if (ill->ill_isv6)
ASSERT(ipversion == IPV6_VERSION);
else
ASSERT(ipversion == IPV4_VERSION);
@@ -1565,223 +673,73 @@ ire_init_common(ire_t *ire, uint_t *max_fragp, nce_t *src_nce, queue_t *rfq,
/*
* Create/initialize IRE security attribute only in Trusted mode;
- * if the passed in gc/gcgrp is non-NULL, we expect that the caller
+ * if the passed in gc is non-NULL, we expect that the caller
* has held a reference to it and will release it when this routine
* returns a failure, otherwise we own the reference. We do this
* prior to initializing the rest IRE fields.
- *
- * Don't allocate ire_gw_secattr for the resolver case to prevent
- * memory leak (in case of external resolution failure). We'll
- * allocate it after a successful external resolution, in ire_add().
- * Note that ire->ire_mp != NULL here means this ire is headed
- * to an external resolver.
*/
if (is_system_labeled()) {
if ((type & (IRE_LOCAL | IRE_LOOPBACK | IRE_BROADCAST |
- IRE_INTERFACE)) != 0) {
+ IRE_IF_ALL | IRE_MULTICAST | IRE_NOROUTE)) != 0) {
/* release references on behalf of caller */
if (gc != NULL)
GC_REFRELE(gc);
- if (gcgrp != NULL)
- GCGRP_REFRELE(gcgrp);
- } else if ((ire->ire_mp == NULL) &&
- tsol_ire_init_gwattr(ire, ipversion, gc, gcgrp) != 0) {
- return (B_FALSE);
+ } else {
+ error = tsol_ire_init_gwattr(ire, ipversion, gc);
+ if (error != 0)
+ return (error);
}
}
- ire->ire_stq = stq;
- ire->ire_rfq = rfq;
ire->ire_type = type;
ire->ire_flags = RTF_UP | flags;
- ire->ire_ident = TICK_TO_MSEC(lbolt);
- bcopy(ulp_info, &ire->ire_uinfo, sizeof (iulp_t));
-
- ire->ire_tire_mark = ire->ire_ob_pkt_count + ire->ire_ib_pkt_count;
- ire->ire_last_used_time = lbolt;
ire->ire_create_time = (uint32_t)gethrestime_sec();
+ ire->ire_generation = IRE_GENERATION_INITIAL;
/*
- * If this IRE is an IRE_CACHE, inherit the handles from the
- * parent IREs. For others in the forwarding table, assign appropriate
- * new ones.
+ * The ill_ire_cnt isn't increased until
+ * the IRE is added to ensure that a walker will find
+ * all IREs that hold a reference on an ill.
*
- * The mutex protecting ire_handle is because ire_create is not always
- * called as a writer.
+ * Note that ill_ire_multicast doesn't hold a ref on the ill since
+ * ire_add() is not called for the IRE_MULTICAST.
*/
- if (ire->ire_type & IRE_OFFSUBNET) {
- mutex_enter(&ipst->ips_ire_handle_lock);
- ire->ire_phandle = (uint32_t)ipst->ips_ire_handle++;
- mutex_exit(&ipst->ips_ire_handle_lock);
- } else if (ire->ire_type & IRE_INTERFACE) {
- mutex_enter(&ipst->ips_ire_handle_lock);
- ire->ire_ihandle = (uint32_t)ipst->ips_ire_handle++;
- mutex_exit(&ipst->ips_ire_handle_lock);
- } else if (ire->ire_type == IRE_CACHE) {
- ire->ire_phandle = phandle;
- ire->ire_ihandle = ihandle;
- }
- ire->ire_ipif = ipif;
- if (ipif != NULL) {
- ire->ire_ipif_seqid = ipif->ipif_seqid;
- ire->ire_ipif_ifindex =
- ipif->ipif_ill->ill_phyint->phyint_ifindex;
- ire->ire_zoneid = ipif->ipif_zoneid;
- } else {
- ire->ire_zoneid = GLOBAL_ZONEID;
- }
+ ire->ire_ill = ill;
+ ire->ire_zoneid = zoneid;
ire->ire_ipversion = ipversion;
+
mutex_init(&ire->ire_lock, NULL, MUTEX_DEFAULT, NULL);
- if (ipversion == IPV4_VERSION) {
- /*
- * IPv6 initializes the ire_nce in ire_add_v6, which expects
- * to find the ire_nce to be null when it is called.
- */
- if (ire_nce_init(ire, src_nce) != 0) {
- /* some failure occurred. propagate error back */
- return (B_FALSE);
- }
- }
ire->ire_refcnt = 1;
+ ire->ire_identical_ref = 1; /* Number of ire_delete's needed */
ire->ire_ipst = ipst; /* No netstack_hold */
ire->ire_trace_disable = B_FALSE;
- return (B_TRUE);
+ return (0);
}
/*
- * This routine is called repeatedly by ipif_up to create broadcast IREs.
- * It is passed a pointer to a slot in an IRE pointer array into which to
- * place the pointer to the new IRE, if indeed we create one. If the
- * IRE corresponding to the address passed in would be a duplicate of an
- * existing one, we don't create the new one. irep is incremented before
- * return only if we do create a new IRE. (Always called as writer.)
+ * This creates an IRE_BROADCAST based on the arguments.
+ * A mirror is ire_lookup_bcast().
*
- * Note that with the "match_flags" parameter, we can match on either
- * a particular logical interface (MATCH_IRE_IPIF) or for all logical
- * interfaces for a given physical interface (MATCH_IRE_ILL). Currently,
- * we only create broadcast ire's on a per physical interface basis. If
- * someone is going to be mucking with logical interfaces, it is important
- * to call "ipif_check_bcast_ires()" to make sure that any change to a
- * logical interface will not cause critical broadcast IRE's to be deleted.
- */
-ire_t **
-ire_check_and_create_bcast(ipif_t *ipif, ipaddr_t addr, ire_t **irep,
- int match_flags)
-{
- ire_t *ire;
- uint64_t check_flags = IPIF_DEPRECATED | IPIF_NOLOCAL | IPIF_ANYCAST;
- boolean_t prefer;
- ill_t *ill = ipif->ipif_ill;
- ip_stack_t *ipst = ill->ill_ipst;
-
- /*
- * No broadcast IREs for the LOOPBACK interface
- * or others such as point to point and IPIF_NOXMIT.
- */
- if (!(ipif->ipif_flags & IPIF_BROADCAST) ||
- (ipif->ipif_flags & IPIF_NOXMIT))
- return (irep);
-
- /*
- * If this new IRE would be a duplicate, only prefer it if one of
- * the following is true:
- *
- * 1. The existing one has IPIF_DEPRECATED|IPIF_LOCAL|IPIF_ANYCAST
- * set and the new one has all of those clear.
- *
- * 2. The existing one corresponds to an underlying ILL in an IPMP
- * group and the new one corresponds to an IPMP group interface.
- */
- if ((ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ipif,
- ipif->ipif_zoneid, NULL, match_flags, ipst)) != NULL) {
- prefer = ((ire->ire_ipif->ipif_flags & check_flags) &&
- !(ipif->ipif_flags & check_flags)) ||
- (IS_UNDER_IPMP(ire->ire_ipif->ipif_ill) && IS_IPMP(ill));
- if (!prefer) {
- ire_refrele(ire);
- return (irep);
- }
-
- /*
- * Bcast ires exist in pairs. Both have to be deleted,
- * Since we are exclusive we can make the above assertion.
- * The 1st has to be refrele'd since it was ctable_lookup'd.
- */
- ASSERT(IAM_WRITER_IPIF(ipif));
- ASSERT(ire->ire_next->ire_addr == ire->ire_addr);
- ire_delete(ire->ire_next);
- ire_delete(ire);
- ire_refrele(ire);
- }
- return (ire_create_bcast(ipif, addr, irep));
-}
-
-uint_t ip_loopback_mtu = IP_LOOPBACK_MTU;
-
-/*
- * This routine is called from ipif_check_bcast_ires and ire_check_bcast.
- * It leaves all the verifying and deleting to those routines. So it always
- * creates 2 bcast ires and chains them into the ire array passed in.
+ * Any suppression of unneeded ones is done in ire_add_v4.
+ * We add one IRE_BROADCAST per address. ire_send_broadcast_v4()
+ * takes care of generating a loopback copy of the packet.
*/
ire_t **
-ire_create_bcast(ipif_t *ipif, ipaddr_t addr, ire_t **irep)
+ire_create_bcast(ill_t *ill, ipaddr_t addr, zoneid_t zoneid, ire_t **irep)
{
- ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
- ill_t *ill = ipif->ipif_ill;
-
- ASSERT(IAM_WRITER_IPIF(ipif));
+ ip_stack_t *ipst = ill->ill_ipst;
- if (IS_IPMP(ill)) {
- /*
- * Broadcast IREs for the IPMP meta-interface use the
- * nominated broadcast interface to send and receive packets.
- * If there's no nominated interface, send the packets down to
- * the IPMP stub driver, which will discard them. If the
- * nominated broadcast interface changes, ill_refresh_bcast()
- * will refresh the broadcast IREs.
- */
- if ((ill = ipmp_illgrp_cast_ill(ill->ill_grp)) == NULL)
- ill = ipif->ipif_ill;
- }
+ ASSERT(IAM_WRITER_ILL(ill));
*irep++ = ire_create(
(uchar_t *)&addr, /* dest addr */
(uchar_t *)&ip_g_all_ones, /* mask */
- (uchar_t *)&ipif->ipif_src_addr, /* source addr */
NULL, /* no gateway */
- &ipif->ipif_mtu, /* max frag */
- NULL, /* no src nce */
- ill->ill_rq, /* recv-from queue */
- ill->ill_wq, /* send-to queue */
IRE_BROADCAST,
- ipif,
- 0,
- 0,
- 0,
- 0,
- &ire_uinfo_null,
- NULL,
- NULL,
- ipst);
-
- *irep++ = ire_create(
- (uchar_t *)&addr, /* dest address */
- (uchar_t *)&ip_g_all_ones, /* mask */
- (uchar_t *)&ipif->ipif_src_addr, /* source address */
- NULL, /* no gateway */
- &ip_loopback_mtu, /* max frag size */
- NULL, /* no src_nce */
- ill->ill_rq, /* recv-from queue */
- NULL, /* no send-to queue */
- IRE_BROADCAST, /* Needed for fanout in wput */
- ipif,
- 0,
- 0,
- 0,
- 0,
- &ire_uinfo_null,
- NULL,
+ ill,
+ zoneid,
+ RTF_KERNEL,
NULL,
ipst);
@@ -1789,174 +747,34 @@ ire_create_bcast(ipif_t *ipif, ipaddr_t addr, ire_t **irep)
}
/*
- * ire_walk routine to delete or update any IRE_CACHE that might contain
- * stale information.
- * The flags state which entries to delete or update.
- * Garbage collection is done separately using kmem alloc callbacks to
- * ip_trash_ire_reclaim.
- * Used for both IPv4 and IPv6. However, IPv6 only uses FLUSH_MTU_TIME
- * since other stale information is cleaned up using NUD.
- */
-void
-ire_expire(ire_t *ire, char *arg)
-{
- ire_expire_arg_t *ieap = (ire_expire_arg_t *)(uintptr_t)arg;
- ill_t *stq_ill;
- int flush_flags = ieap->iea_flush_flag;
- ip_stack_t *ipst = ieap->iea_ipst;
-
- if ((flush_flags & FLUSH_REDIRECT_TIME) &&
- (ire->ire_flags & RTF_DYNAMIC)) {
- /* Make sure we delete the corresponding IRE_CACHE */
- ip1dbg(("ire_expire: all redirects\n"));
- ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst);
- ire_delete(ire);
- atomic_dec_32(&ipst->ips_ip_redirect_cnt);
- return;
- }
- if (ire->ire_type != IRE_CACHE)
- return;
-
- if (flush_flags & FLUSH_ARP_TIME) {
- /*
- * Remove all IRE_CACHE except IPv4 multicast ires. These
- * ires will be deleted by ip_trash_ire_reclaim_stack()
- * when system runs low in memory.
- * Verify that create time is more than ip_ire_arp_interval
- * milliseconds ago.
- */
-
- if (!(ire->ire_ipversion == IPV4_VERSION &&
- CLASSD(ire->ire_addr)) && NCE_EXPIRED(ire->ire_nce, ipst)) {
- ire_delete(ire);
- return;
- }
- }
-
- if (ipst->ips_ip_path_mtu_discovery && (flush_flags & FLUSH_MTU_TIME) &&
- (ire->ire_ipif != NULL)) {
- /* Increase pmtu if it is less than the interface mtu */
- mutex_enter(&ire->ire_lock);
- /*
- * If the ipif is a vni (whose mtu is 0, since it's virtual)
- * get the mtu from the sending interfaces' ipif
- */
- if (IS_VNI(ire->ire_ipif->ipif_ill)) {
- stq_ill = ire->ire_stq->q_ptr;
- ire->ire_max_frag = MIN(stq_ill->ill_ipif->ipif_mtu,
- IP_MAXPACKET);
- } else {
- ire->ire_max_frag = MIN(ire->ire_ipif->ipif_mtu,
- IP_MAXPACKET);
- }
- ire->ire_marks &= ~IRE_MARK_PMTU;
- ire->ire_frag_flag |= IPH_DF;
- mutex_exit(&ire->ire_lock);
- }
-}
-
-/*
- * Return any local address. We use this to target ourselves
- * when the src address was specified as 'default'.
- * Preference for IRE_LOCAL entries.
+ * This looks up an IRE_BROADCAST based on the arguments.
+ * Mirrors ire_create_bcast().
*/
ire_t *
-ire_lookup_local(zoneid_t zoneid, ip_stack_t *ipst)
+ire_lookup_bcast(ill_t *ill, ipaddr_t addr, zoneid_t zoneid)
{
- ire_t *ire;
- irb_t *irb;
- ire_t *maybe = NULL;
- int i;
+ ire_t *ire;
+ int match_args;
- for (i = 0; i < ipst->ips_ip_cache_table_size; i++) {
- irb = &ipst->ips_ip_cache_table[i];
- if (irb->irb_ire == NULL)
- continue;
- rw_enter(&irb->irb_lock, RW_READER);
- for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
- if ((ire->ire_marks & IRE_MARK_CONDEMNED) ||
- (ire->ire_zoneid != zoneid &&
- ire->ire_zoneid != ALL_ZONES))
- continue;
- switch (ire->ire_type) {
- case IRE_LOOPBACK:
- if (maybe == NULL) {
- IRE_REFHOLD(ire);
- maybe = ire;
- }
- break;
- case IRE_LOCAL:
- if (maybe != NULL) {
- ire_refrele(maybe);
- }
- IRE_REFHOLD(ire);
- rw_exit(&irb->irb_lock);
- return (ire);
- }
- }
- rw_exit(&irb->irb_lock);
- }
- return (maybe);
-}
+ match_args = MATCH_IRE_TYPE | MATCH_IRE_ILL | MATCH_IRE_GW |
+ MATCH_IRE_MASK | MATCH_IRE_ZONEONLY;
-/*
- * If the specified IRE is associated with a particular ILL, return
- * that ILL pointer (May be called as writer.).
- *
- * NOTE : This is not a generic function that can be used always.
- * This function always returns the ill of the outgoing packets
- * if this ire is used.
- */
-ill_t *
-ire_to_ill(const ire_t *ire)
-{
- ill_t *ill = NULL;
+ if (IS_UNDER_IPMP(ill))
+ match_args |= MATCH_IRE_TESTHIDDEN;
- /*
- * 1) For an IRE_CACHE, ire_ipif is the one where it obtained
- * the source address from. ire_stq is the one where the
- * packets will be sent out on. We return that here.
- *
- * 2) IRE_BROADCAST normally has a loopback and a non-loopback
- * copy and they always exist next to each other with loopback
- * copy being the first one. If we are called on the non-loopback
- * copy, return the one pointed by ire_stq. If it was called on
- * a loopback copy, we still return the one pointed by the next
- * ire's ire_stq pointer i.e the one pointed by the non-loopback
- * copy. We don't want use ire_ipif as it might represent the
- * source address (if we borrow source addresses for
- * IRE_BROADCASTS in the future).
- * However if an interface is currently coming up, the above
- * condition may not hold during that period since the ires
- * are added one at a time. Thus one of the pair could have been
- * added and the other not yet added.
- * 3) For many other IREs (e.g., IRE_LOCAL), ire_rfq indicates the ill.
- * 4) For all others return the ones pointed by ire_ipif->ipif_ill.
- * That handles IRE_LOOPBACK.
- */
-
- if (ire->ire_type == IRE_CACHE) {
- ill = (ill_t *)ire->ire_stq->q_ptr;
- } else if (ire->ire_type == IRE_BROADCAST) {
- if (ire->ire_stq != NULL) {
- ill = (ill_t *)ire->ire_stq->q_ptr;
- } else {
- ire_t *ire_next;
-
- ire_next = ire->ire_next;
- if (ire_next != NULL &&
- ire_next->ire_type == IRE_BROADCAST &&
- ire_next->ire_addr == ire->ire_addr &&
- ire_next->ire_ipif == ire->ire_ipif) {
- ill = (ill_t *)ire_next->ire_stq->q_ptr;
- }
- }
- } else if (ire->ire_rfq != NULL) {
- ill = ire->ire_rfq->q_ptr;
- } else if (ire->ire_ipif != NULL) {
- ill = ire->ire_ipif->ipif_ill;
- }
- return (ill);
+ ire = ire_ftable_lookup_v4(
+ addr, /* dest addr */
+ ip_g_all_ones, /* mask */
+ 0, /* no gateway */
+ IRE_BROADCAST,
+ ill,
+ zoneid,
+ NULL,
+ match_args,
+ 0,
+ ill->ill_ipst,
+ NULL);
+ return (ire);
}
/* Arrange to call the specified function for every IRE in the world. */
@@ -1992,15 +810,13 @@ ire_walk_ipvers(pfv_t func, void *arg, uchar_t vers, zoneid_t zoneid,
*/
ire_walk_ill_tables(0, 0, func, arg, IP_MASK_TABLE_SIZE,
0, NULL,
- ipst->ips_ip_cache_table_size, ipst->ips_ip_cache_table,
NULL, zoneid, ipst);
}
if (vers != IPV4_VERSION) {
ire_walk_ill_tables(0, 0, func, arg, IP6_MASK_TABLE_SIZE,
ipst->ips_ip6_ftable_hash_size,
ipst->ips_ip_forwarding_table_v6,
- ipst->ips_ip6_cache_table_size,
- ipst->ips_ip_cache_table_v6, NULL, zoneid, ipst);
+ NULL, zoneid, ipst);
}
}
@@ -2016,22 +832,6 @@ ire_walk_ill(uint_t match_flags, uint_t ire_type, pfv_t func, void *arg,
ire_walk_ill_ipvers(match_flags, ire_type, func, arg, vers, ill);
}
-void
-ire_walk_ill_v4(uint_t match_flags, uint_t ire_type, pfv_t func, void *arg,
- ill_t *ill)
-{
- ire_walk_ill_ipvers(match_flags, ire_type, func, arg, IPV4_VERSION,
- ill);
-}
-
-void
-ire_walk_ill_v6(uint_t match_flags, uint_t ire_type, pfv_t func, void *arg,
- ill_t *ill)
-{
- ire_walk_ill_ipvers(match_flags, ire_type, func, arg, IPV6_VERSION,
- ill);
-}
-
/*
* Walk a particular ill and version.
*/
@@ -2043,137 +843,121 @@ ire_walk_ill_ipvers(uint_t match_flags, uint_t ire_type, pfv_t func,
if (vers == IPV4_VERSION) {
ire_walk_ill_tables(match_flags, ire_type, func, arg,
- IP_MASK_TABLE_SIZE, 0,
- NULL, ipst->ips_ip_cache_table_size,
- ipst->ips_ip_cache_table, ill, ALL_ZONES, ipst);
- } else if (vers == IPV6_VERSION) {
+ IP_MASK_TABLE_SIZE,
+ 0, NULL,
+ ill, ALL_ZONES, ipst);
+ }
+ if (vers != IPV4_VERSION) {
ire_walk_ill_tables(match_flags, ire_type, func, arg,
IP6_MASK_TABLE_SIZE, ipst->ips_ip6_ftable_hash_size,
ipst->ips_ip_forwarding_table_v6,
- ipst->ips_ip6_cache_table_size,
- ipst->ips_ip_cache_table_v6, ill, ALL_ZONES, ipst);
+ ill, ALL_ZONES, ipst);
}
}
+/*
+ * Do the specific matching of IREs to shared-IP zones.
+ *
+ * We have the same logic as in ire_match_args but implemented slightly
+ * differently.
+ */
boolean_t
ire_walk_ill_match(uint_t match_flags, uint_t ire_type, ire_t *ire,
ill_t *ill, zoneid_t zoneid, ip_stack_t *ipst)
{
- ill_t *ire_stq_ill = NULL;
- ill_t *ire_ipif_ill = NULL;
+ ill_t *dst_ill = NULL;
ASSERT(match_flags != 0 || zoneid != ALL_ZONES);
- /*
- * MATCH_IRE_ILL: We match both on ill pointed by ire_stq and
- * ire_ipif. Only in the case of IRE_CACHEs can ire_stq and
- * ire_ipif be pointing to different ills. But we want to keep
- * this function generic enough for future use. So, we always
- * try to match on both. The only caller of this function
- * ire_walk_ill_tables, will call "func" after we return from
- * this function. We expect "func" to do the right filtering
- * of ires in this case.
- */
if (match_flags & MATCH_IRE_ILL) {
- if (ire->ire_stq != NULL)
- ire_stq_ill = ire->ire_stq->q_ptr;
- if (ire->ire_ipif != NULL)
- ire_ipif_ill = ire->ire_ipif->ipif_ill;
+ dst_ill = ire->ire_ill;
}
- if (zoneid != ALL_ZONES) {
+ if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid &&
+ ire->ire_zoneid != ALL_ZONES) {
/*
* We're walking the IREs for a specific zone. The only relevant
* IREs are:
* - all IREs with a matching ire_zoneid
- * - all IRE_OFFSUBNETs as they're shared across all zones
- * - IRE_INTERFACE IREs for interfaces with a usable source addr
+ * - IRE_IF_ALL IREs for interfaces with a usable source addr
* with a matching zone
- * - IRE_DEFAULTs with a gateway reachable from the zone
- * We should really match on IRE_OFFSUBNETs and IRE_DEFAULTs
- * using the same rule; but the above rules are consistent with
- * the behavior of ire_ftable_lookup[_v6]() so that all the
- * routes that can be matched during lookup are also matched
- * here.
+ * - IRE_OFFLINK with a gateway reachable from the zone
+ * Note that earlier we only did the IRE_OFFLINK check for
+ * IRE_DEFAULT (and only when we had multiple IRE_DEFAULTs).
*/
- if (zoneid != ire->ire_zoneid && ire->ire_zoneid != ALL_ZONES) {
+ dst_ill = ire->ire_ill;
+
+ if (ire->ire_type & IRE_ONLINK) {
+ uint_t ifindex;
+
/*
- * Note, IRE_INTERFACE can have the stq as NULL. For
- * example, if the default multicast route is tied to
- * the loopback address.
+ * Note there is no IRE_INTERFACE on vniN thus
+ * can't do an IRE lookup for a matching route.
*/
- if ((ire->ire_type & IRE_INTERFACE) &&
- (ire->ire_stq != NULL)) {
- ire_stq_ill = (ill_t *)ire->ire_stq->q_ptr;
- if (ire->ire_ipversion == IPV4_VERSION) {
- if (!ipif_usesrc_avail(ire_stq_ill,
- zoneid))
- /* No usable src addr in zone */
- return (B_FALSE);
- } else if (ire_stq_ill->ill_usesrc_ifindex
- != 0) {
- /*
- * For IPv6 use ipif_select_source_v6()
- * so the right scope selection is done
- */
- ipif_t *src_ipif;
- src_ipif =
- ipif_select_source_v6(ire_stq_ill,
- &ire->ire_addr_v6, B_FALSE,
- IPV6_PREFER_SRC_DEFAULT,
- zoneid);
- if (src_ipif != NULL) {
- ipif_refrele(src_ipif);
- } else {
- return (B_FALSE);
- }
- } else {
- return (B_FALSE);
- }
+ ifindex = dst_ill->ill_usesrc_ifindex;
+ if (ifindex == 0)
+ return (B_FALSE);
- } else if (!(ire->ire_type & IRE_OFFSUBNET)) {
+ /*
+ * If there is a usable source address in the
+ * zone, then it's ok to return an
+ * IRE_INTERFACE
+ */
+ if (!ipif_zone_avail(ifindex, dst_ill->ill_isv6,
+ zoneid, ipst)) {
+ return (B_FALSE);
+ }
+ }
+
+ if (dst_ill != NULL && (ire->ire_type & IRE_OFFLINK)) {
+ ipif_t *tipif;
+
+ mutex_enter(&dst_ill->ill_lock);
+ for (tipif = dst_ill->ill_ipif;
+ tipif != NULL; tipif = tipif->ipif_next) {
+ if (!IPIF_IS_CONDEMNED(tipif) &&
+ (tipif->ipif_flags & IPIF_UP) &&
+ (tipif->ipif_zoneid == zoneid ||
+ tipif->ipif_zoneid == ALL_ZONES))
+ break;
+ }
+ mutex_exit(&dst_ill->ill_lock);
+ if (tipif == NULL) {
return (B_FALSE);
}
}
/*
- * Match all default routes from the global zone, irrespective
+ * Match all offlink routes from the global zone, irrespective
* of reachability. For a non-global zone only match those
- * where ire_gateway_addr has a IRE_INTERFACE for the zoneid.
+ * where ire_gateway_addr has an IRE_INTERFACE for the zoneid.
*/
- if (ire->ire_type == IRE_DEFAULT && zoneid != GLOBAL_ZONEID) {
- int ire_match_flags = 0;
+ if ((ire->ire_type & IRE_OFFLINK) && zoneid != GLOBAL_ZONEID &&
+ zoneid != ALL_ZONES) {
in6_addr_t gw_addr_v6;
- ire_t *rire;
-
- ire_match_flags |= MATCH_IRE_TYPE;
- if (ire->ire_ipif != NULL)
- ire_match_flags |= MATCH_IRE_ILL;
if (ire->ire_ipversion == IPV4_VERSION) {
- rire = ire_route_lookup(ire->ire_gateway_addr,
- 0, 0, IRE_INTERFACE, ire->ire_ipif, NULL,
- zoneid, NULL, ire_match_flags, ipst);
+ if (!ire_gateway_ok_zone_v4(
+ ire->ire_gateway_addr, zoneid,
+ dst_ill, NULL, ipst, B_FALSE))
+ return (B_FALSE);
} else {
ASSERT(ire->ire_ipversion == IPV6_VERSION);
mutex_enter(&ire->ire_lock);
gw_addr_v6 = ire->ire_gateway_addr_v6;
mutex_exit(&ire->ire_lock);
- rire = ire_route_lookup_v6(&gw_addr_v6,
- NULL, NULL, IRE_INTERFACE, ire->ire_ipif,
- NULL, zoneid, NULL, ire_match_flags, ipst);
- }
- if (rire == NULL) {
- return (B_FALSE);
+
+ if (!ire_gateway_ok_zone_v6(&gw_addr_v6, zoneid,
+ dst_ill, NULL, ipst, B_FALSE))
+ return (B_FALSE);
}
- ire_refrele(rire);
}
}
if (((!(match_flags & MATCH_IRE_TYPE)) ||
(ire->ire_type & ire_type)) &&
((!(match_flags & MATCH_IRE_ILL)) ||
- (ire_stq_ill == ill || ire_ipif_ill == ill ||
- ire_ipif_ill != NULL && IS_IN_SAME_ILLGRP(ire_ipif_ill, ill)))) {
+ (dst_ill == ill ||
+ dst_ill != NULL && IS_IN_SAME_ILLGRP(dst_ill, ill)))) {
return (B_TRUE);
}
return (B_FALSE);
@@ -2197,8 +981,9 @@ rtfunc(struct radix_node *rn, void *arg)
ret = ire_walk_ill_match(rtf->rt_match_flags,
rtf->rt_ire_type, ire,
rtf->rt_ill, rtf->rt_zoneid, rtf->rt_ipst);
- } else
+ } else {
ret = B_TRUE;
+ }
if (ret)
(*rtf->rt_func)(ire, rtf->rt_arg);
}
@@ -2206,12 +991,12 @@ rtfunc(struct radix_node *rn, void *arg)
}
/*
- * Walk the ftable and the ctable entries that match the ill.
+ * Walk the ftable entries that match the ill.
*/
void
ire_walk_ill_tables(uint_t match_flags, uint_t ire_type, pfv_t func,
void *arg, size_t ftbl_sz, size_t htbl_sz, irb_t **ipftbl,
- size_t ctbl_sz, irb_t *ipctbl, ill_t *ill, zoneid_t zoneid,
+ ill_t *ill, zoneid_t zoneid,
ip_stack_t *ipst)
{
irb_t *irb_ptr;
@@ -2223,85 +1008,50 @@ ire_walk_ill_tables(uint_t match_flags, uint_t ire_type, pfv_t func,
ASSERT((!(match_flags & MATCH_IRE_ILL)) || (ill != NULL));
ASSERT(!(match_flags & MATCH_IRE_TYPE) || (ire_type != 0));
- /*
- * Optimize by not looking at the forwarding table if there
- * is a MATCH_IRE_TYPE specified with no IRE_FORWARDTABLE
- * specified in ire_type.
- */
- if (!(match_flags & MATCH_IRE_TYPE) ||
- ((ire_type & IRE_FORWARDTABLE) != 0)) {
- /* knobs such that routine is called only for v6 case */
- if (ipftbl == ipst->ips_ip_forwarding_table_v6) {
- for (i = (ftbl_sz - 1); i >= 0; i--) {
- if ((irb_ptr = ipftbl[i]) == NULL)
+
+ /* knobs such that routine is called only for v6 case */
+ if (ipftbl == ipst->ips_ip_forwarding_table_v6) {
+ for (i = (ftbl_sz - 1); i >= 0; i--) {
+ if ((irb_ptr = ipftbl[i]) == NULL)
+ continue;
+ for (j = 0; j < htbl_sz; j++) {
+ irb = &irb_ptr[j];
+ if (irb->irb_ire == NULL)
continue;
- for (j = 0; j < htbl_sz; j++) {
- irb = &irb_ptr[j];
- if (irb->irb_ire == NULL)
- continue;
-
- IRB_REFHOLD(irb);
- for (ire = irb->irb_ire; ire != NULL;
- ire = ire->ire_next) {
- if (match_flags == 0 &&
- zoneid == ALL_ZONES) {
- ret = B_TRUE;
- } else {
- ret =
- ire_walk_ill_match(
- match_flags,
- ire_type, ire, ill,
- zoneid, ipst);
- }
- if (ret)
- (*func)(ire, arg);
+
+ irb_refhold(irb);
+ for (ire = irb->irb_ire; ire != NULL;
+ ire = ire->ire_next) {
+ if (match_flags == 0 &&
+ zoneid == ALL_ZONES) {
+ ret = B_TRUE;
+ } else {
+ ret =
+ ire_walk_ill_match(
+ match_flags,
+ ire_type, ire, ill,
+ zoneid, ipst);
}
- IRB_REFRELE(irb);
+ if (ret)
+ (*func)(ire, arg);
}
+ irb_refrele(irb);
}
- } else {
- (void) memset(&rtfarg, 0, sizeof (rtfarg));
- rtfarg.rt_func = func;
- rtfarg.rt_arg = arg;
- if (match_flags != 0) {
- rtfarg.rt_match_flags = match_flags;
- }
- rtfarg.rt_ire_type = ire_type;
- rtfarg.rt_ill = ill;
- rtfarg.rt_zoneid = zoneid;
- rtfarg.rt_ipst = ipst; /* No netstack_hold */
- (void) ipst->ips_ip_ftable->rnh_walktree_mt(
- ipst->ips_ip_ftable,
- rtfunc, &rtfarg, irb_refhold_rn, irb_refrele_rn);
}
- }
-
- /*
- * Optimize by not looking at the cache table if there
- * is a MATCH_IRE_TYPE specified with no IRE_CACHETABLE
- * specified in ire_type.
- */
- if (!(match_flags & MATCH_IRE_TYPE) ||
- ((ire_type & IRE_CACHETABLE) != 0)) {
- for (i = 0; i < ctbl_sz; i++) {
- irb = &ipctbl[i];
- if (irb->irb_ire == NULL)
- continue;
- IRB_REFHOLD(irb);
- for (ire = irb->irb_ire; ire != NULL;
- ire = ire->ire_next) {
- if (match_flags == 0 && zoneid == ALL_ZONES) {
- ret = B_TRUE;
- } else {
- ret = ire_walk_ill_match(
- match_flags, ire_type,
- ire, ill, zoneid, ipst);
- }
- if (ret)
- (*func)(ire, arg);
- }
- IRB_REFRELE(irb);
+ } else {
+ (void) memset(&rtfarg, 0, sizeof (rtfarg));
+ rtfarg.rt_func = func;
+ rtfarg.rt_arg = arg;
+ if (match_flags != 0) {
+ rtfarg.rt_match_flags = match_flags;
}
+ rtfarg.rt_ire_type = ire_type;
+ rtfarg.rt_ill = ill;
+ rtfarg.rt_zoneid = zoneid;
+ rtfarg.rt_ipst = ipst; /* No netstack_hold */
+ (void) ipst->ips_ip_ftable->rnh_walktree_mt(
+ ipst->ips_ip_ftable,
+ rtfunc, &rtfarg, irb_refhold_rn, irb_refrele_rn);
}
}
@@ -2323,557 +1073,178 @@ ip_mask_to_plen(ipaddr_t mask)
ipaddr_t
ip_plen_to_mask(uint_t masklen)
{
+ if (masklen == 0)
+ return (0);
+
return (htonl(IP_HOST_MASK << (IP_ABITS - masklen)));
}
void
ire_atomic_end(irb_t *irb_ptr, ire_t *ire)
{
- ill_t *stq_ill, *ipif_ill;
- ip_stack_t *ipst = ire->ire_ipst;
+ ill_t *ill;
- stq_ill = ire->ire_stq != NULL ? ire->ire_stq->q_ptr : NULL;
- ipif_ill = ire->ire_ipif != NULL ? ire->ire_ipif->ipif_ill : NULL;
- RELEASE_ILL_LOCKS(ipif_ill, stq_ill);
+ ill = ire->ire_ill;
+ if (ill != NULL)
+ mutex_exit(&ill->ill_lock);
rw_exit(&irb_ptr->irb_lock);
- rw_exit(&ipst->ips_ill_g_usesrc_lock);
}
/*
- * ire_add_v[46] atomically make sure that the ipif or ill associated
- * with the new ire being added is stable and not IPIF_CHANGING or ILL_CHANGING
- * before adding the ire to the table. This ensures that we don't create
- * new IRE_CACHEs with stale values for parameters that are passed to
- * ire_create such as ire_max_frag. Note that ire_create() is passed a pointer
- * to the ipif_mtu, and not the value. The actual value is derived from the
- * parent ire or ipif under the bucket lock.
+ * ire_add_v[46] atomically make sure that the ill associated
+ * with the new ire is not going away i.e., we check ILL_CONDEMNED.
*/
int
-ire_atomic_start(irb_t *irb_ptr, ire_t *ire, queue_t *q, mblk_t *mp,
- ipsq_func_t func)
+ire_atomic_start(irb_t *irb_ptr, ire_t *ire)
{
- ill_t *stq_ill;
- ill_t *ipif_ill;
- int error = 0;
- ill_t *ill = NULL;
- ip_stack_t *ipst = ire->ire_ipst;
+ ill_t *ill;
- stq_ill = ire->ire_stq != NULL ? ire->ire_stq->q_ptr : NULL;
- ipif_ill = ire->ire_ipif != NULL ? ire->ire_ipif->ipif_ill : NULL;
+ ill = ire->ire_ill;
- ASSERT((q != NULL && mp != NULL && func != NULL) ||
- (q == NULL && mp == NULL && func == NULL));
- rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER);
- GRAB_CONN_LOCK(q);
rw_enter(&irb_ptr->irb_lock, RW_WRITER);
- GRAB_ILL_LOCKS(ipif_ill, stq_ill);
+ if (ill != NULL) {
+ mutex_enter(&ill->ill_lock);
- /*
- * While the IRE is in the process of being added, a user may have
- * invoked the ifconfig usesrc option on the stq_ill to make it a
- * usesrc client ILL. Check for this possibility here, if it is true
- * then we fail adding the IRE_CACHE. Another check is to make sure
- * that an ipif_ill of an IRE_CACHE being added is not part of a usesrc
- * group. The ill_g_usesrc_lock is released in ire_atomic_end
- */
- if ((ire->ire_type & IRE_CACHE) &&
- (ire->ire_marks & IRE_MARK_USESRC_CHECK)) {
- if (stq_ill->ill_usesrc_ifindex != 0) {
- ASSERT(stq_ill->ill_usesrc_grp_next != NULL);
- if ((ipif_ill->ill_phyint->phyint_ifindex !=
- stq_ill->ill_usesrc_ifindex) ||
- (ipif_ill->ill_usesrc_grp_next == NULL) ||
- (ipif_ill->ill_usesrc_ifindex != 0)) {
- error = EINVAL;
- goto done;
- }
- } else if (ipif_ill->ill_usesrc_grp_next != NULL) {
- error = EINVAL;
- goto done;
+ /*
+ * Don't allow IRE's to be created on dying ills.
+ */
+ if (ill->ill_state_flags & ILL_CONDEMNED) {
+ ire_atomic_end(irb_ptr, ire);
+ return (ENXIO);
}
- }
- /*
- * Don't allow IRE's to be created on changing ill's. Also, since
- * IPMP flags can be set on an ill without quiescing it, if we're not
- * a writer on stq_ill, check that the flags still allow IRE creation.
- */
- if ((stq_ill != NULL) && !IAM_WRITER_ILL(stq_ill)) {
- if (stq_ill->ill_state_flags & ILL_CHANGING) {
- ill = stq_ill;
- error = EAGAIN;
- } else if (IS_UNDER_IPMP(stq_ill)) {
- mutex_enter(&stq_ill->ill_phyint->phyint_lock);
- if (!ipmp_ill_is_active(stq_ill) &&
- !(ire->ire_marks & IRE_MARK_TESTHIDDEN)) {
+ if (IS_UNDER_IPMP(ill)) {
+ int error = 0;
+ mutex_enter(&ill->ill_phyint->phyint_lock);
+ if (!ipmp_ill_is_active(ill) &&
+ IRE_HIDDEN_TYPE(ire->ire_type) &&
+ !ire->ire_testhidden) {
error = EINVAL;
}
- mutex_exit(&stq_ill->ill_phyint->phyint_lock);
+ mutex_exit(&ill->ill_phyint->phyint_lock);
+ if (error != 0) {
+ ire_atomic_end(irb_ptr, ire);
+ return (error);
+ }
}
- if (error != 0)
- goto done;
- }
- if ((ipif_ill != NULL) && !IAM_WRITER_ILL(ipif_ill) &&
- (ipif_ill->ill_state_flags & ILL_CHANGING)) {
- ill = ipif_ill;
- error = EAGAIN;
- goto done;
}
-
- if ((ire->ire_ipif != NULL) && !IAM_WRITER_IPIF(ire->ire_ipif) &&
- (ire->ire_ipif->ipif_state_flags & IPIF_CHANGING)) {
- ill = ire->ire_ipif->ipif_ill;
- ASSERT(ill != NULL);
- error = EAGAIN;
- goto done;
- }
-
-done:
- if (error == EAGAIN && ILL_CAN_WAIT(ill, q)) {
- ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq;
- mutex_enter(&ipsq->ipsq_lock);
- mutex_enter(&ipsq->ipsq_xop->ipx_lock);
- ire_atomic_end(irb_ptr, ire);
- ipsq_enq(ipsq, q, mp, func, NEW_OP, ill);
- mutex_exit(&ipsq->ipsq_xop->ipx_lock);
- mutex_exit(&ipsq->ipsq_lock);
- error = EINPROGRESS;
- } else if (error != 0) {
- ire_atomic_end(irb_ptr, ire);
- }
-
- RELEASE_CONN_LOCK(q);
- return (error);
+ return (0);
}
/*
- * Add a fully initialized IRE to an appropriate table based on
- * ire_type.
- *
- * allow_unresolved == B_FALSE indicates a legacy code-path call
- * that has prohibited the addition of incomplete ire's. If this
- * parameter is set, and we find an nce that is in a state other
- * than ND_REACHABLE, we fail the add. Note that nce_state could be
- * something other than ND_REACHABLE if the nce had just expired and
- * the ire_create preceding the ire_add added a new ND_INITIAL nce.
+ * Add a fully initialized IRE to the forwarding table.
+ * This returns NULL on failure, or a held IRE on success.
+ * Normally the returned IRE is the same as the argument. But a different
+ * IRE will be returned if the added IRE is deemed identical to an existing
+ * one. In that case ire_identical_ref will be increased.
+ * The caller always needs to do an ire_refrele() on the returned IRE.
*/
-int
-ire_add(ire_t **irep, queue_t *q, mblk_t *mp, ipsq_func_t func,
- boolean_t allow_unresolved)
+ire_t *
+ire_add(ire_t *ire)
{
- ire_t *ire1;
- ill_t *stq_ill = NULL;
- ill_t *ill;
- ipif_t *ipif = NULL;
- ill_walk_context_t ctx;
- ire_t *ire = *irep;
- int error;
- boolean_t ire_is_mblk = B_FALSE;
- tsol_gcgrp_t *gcgrp = NULL;
- tsol_gcgrp_addr_t ga;
- ip_stack_t *ipst = ire->ire_ipst;
-
- /* get ready for the day when original ire is not created as mblk */
- if (ire->ire_mp != NULL) {
- ire_is_mblk = B_TRUE;
- /* Copy the ire to a kmem_alloc'ed area */
- ire1 = kmem_cache_alloc(ire_cache, KM_NOSLEEP);
- if (ire1 == NULL) {
- ip1dbg(("ire_add: alloc failed\n"));
- ire_delete(ire);
- *irep = NULL;
- return (ENOMEM);
- }
- ire->ire_marks &= ~IRE_MARK_UNCACHED;
- *ire1 = *ire;
- ire1->ire_mp = NULL;
- ire1->ire_stq_ifindex = 0;
- freeb(ire->ire_mp);
- ire = ire1;
- }
- if (ire->ire_stq != NULL)
- stq_ill = ire->ire_stq->q_ptr;
-
- if (stq_ill != NULL && ire->ire_type == IRE_CACHE &&
- stq_ill->ill_net_type == IRE_IF_RESOLVER) {
- rw_enter(&ipst->ips_ill_g_lock, RW_READER);
- ill = ILL_START_WALK_ALL(&ctx, ipst);
- for (; ill != NULL; ill = ill_next(&ctx, ill)) {
- mutex_enter(&ill->ill_lock);
- if (ill->ill_state_flags & ILL_CONDEMNED) {
- mutex_exit(&ill->ill_lock);
- continue;
- }
- /*
- * We need to make sure that the ipif is a valid one
- * before adding the IRE_CACHE. This happens only
- * with IRE_CACHE when there is an external resolver.
- *
- * We can unplumb a logical interface while the
- * packet is waiting in ARP with the IRE. Then,
- * later on when we feed the IRE back, the ipif
- * has to be re-checked. This can't happen with
- * NDP currently, as we never queue the IRE with
- * the packet. We always try to recreate the IRE
- * when the resolution is completed. But, we do
- * it for IPv6 also here so that in future if
- * we have external resolvers, it will work without
- * any change.
- */
- ipif = ipif_lookup_seqid(ill, ire->ire_ipif_seqid);
- if (ipif != NULL) {
- ipif_refhold_locked(ipif);
- mutex_exit(&ill->ill_lock);
- break;
- }
- mutex_exit(&ill->ill_lock);
- }
- rw_exit(&ipst->ips_ill_g_lock);
- if (ipif == NULL ||
- (ipif->ipif_isv6 &&
- !IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6) &&
- !IN6_ARE_ADDR_EQUAL(&ire->ire_src_addr_v6,
- &ipif->ipif_v6src_addr)) ||
- (!ipif->ipif_isv6 &&
- ire->ire_src_addr != ipif->ipif_src_addr) ||
- ire->ire_zoneid != ipif->ipif_zoneid) {
- if (ipif != NULL)
- ipif_refrele(ipif);
- ire->ire_ipif = NULL;
- ire_delete(ire);
- *irep = NULL;
- return (EINVAL);
- }
-
- ASSERT(ill != NULL);
-
+ if (IRE_HIDDEN_TYPE(ire->ire_type) &&
+ ire->ire_ill != NULL && IS_UNDER_IPMP(ire->ire_ill)) {
/*
- * Since we didn't attach label security attributes to the
- * ire for the resolver case, we need to add it now. (only
- * for v4 resolver and v6 xresolv case).
+ * IREs hosted on interfaces that are under IPMP
+ * should be hidden so that applications don't
+ * accidentally end up sending packets with test
+ * addresses as their source addresses, or
+ * sending out interfaces that are e.g. IFF_INACTIVE.
+ * Hide them here.
*/
- if (is_system_labeled() && ire_is_mblk) {
- if (ire->ire_ipversion == IPV4_VERSION) {
- ga.ga_af = AF_INET;
- IN6_IPADDR_TO_V4MAPPED(ire->ire_gateway_addr !=
- INADDR_ANY ? ire->ire_gateway_addr :
- ire->ire_addr, &ga.ga_addr);
- } else {
- ga.ga_af = AF_INET6;
- ga.ga_addr = IN6_IS_ADDR_UNSPECIFIED(
- &ire->ire_gateway_addr_v6) ?
- ire->ire_addr_v6 :
- ire->ire_gateway_addr_v6;
- }
- gcgrp = gcgrp_lookup(&ga, B_FALSE);
- error = tsol_ire_init_gwattr(ire, ire->ire_ipversion,
- NULL, gcgrp);
- if (error != 0) {
- if (gcgrp != NULL) {
- GCGRP_REFRELE(gcgrp);
- gcgrp = NULL;
- }
- ipif_refrele(ipif);
- ire->ire_ipif = NULL;
- ire_delete(ire);
- *irep = NULL;
- return (error);
- }
- }
+ ire->ire_testhidden = B_TRUE;
}
- /*
- * In case ire was changed
- */
- *irep = ire;
if (ire->ire_ipversion == IPV6_VERSION)
- error = ire_add_v6(irep, q, mp, func);
+ return (ire_add_v6(ire));
else
- error = ire_add_v4(irep, q, mp, func, allow_unresolved);
- if (ipif != NULL)
- ipif_refrele(ipif);
- return (error);
+ return (ire_add_v4(ire));
}
/*
- * Add an initialized IRE to an appropriate table based on ire_type.
- *
- * The forward table contains IRE_PREFIX/IRE_HOST and
- * IRE_IF_RESOLVER/IRE_IF_NORESOLVER and IRE_DEFAULT.
- *
- * The cache table contains IRE_BROADCAST/IRE_LOCAL/IRE_LOOPBACK
- * and IRE_CACHE.
- *
- * NOTE : This function is called as writer though not required
- * by this function.
+ * Add a fully initialized IPv4 IRE to the forwarding table.
+ * This returns NULL on failure, or a held IRE on success.
+ * Normally the returned IRE is the same as the argument. But a different
+ * IRE will be returned if the added IRE is deemed identical to an existing
+ * one. In that case ire_identical_ref will be increased.
+ * The caller always needs to do an ire_refrele() on the returned IRE.
*/
-static int
-ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func,
- boolean_t allow_unresolved)
+static ire_t *
+ire_add_v4(ire_t *ire)
{
ire_t *ire1;
irb_t *irb_ptr;
ire_t **irep;
- int flags;
- ire_t *pire = NULL;
- ill_t *stq_ill;
- ire_t *ire = *ire_p;
+ int match_flags;
int error;
- boolean_t need_refrele = B_FALSE;
- nce_t *nce;
ip_stack_t *ipst = ire->ire_ipst;
- uint_t marks = 0;
- /*
- * IREs with source addresses hosted on interfaces that are under IPMP
- * should be hidden so that applications don't accidentally end up
- * sending packets with test addresses as their source addresses, or
- * sending out interfaces that are e.g. IFF_INACTIVE. Hide them here.
- */
- if (ire->ire_ipif != NULL && IS_UNDER_IPMP(ire->ire_ipif->ipif_ill))
- marks |= IRE_MARK_TESTHIDDEN;
-
- if (ire->ire_ipif != NULL)
- ASSERT(!MUTEX_HELD(&ire->ire_ipif->ipif_ill->ill_lock));
- if (ire->ire_stq != NULL)
- ASSERT(!MUTEX_HELD(
- &((ill_t *)(ire->ire_stq->q_ptr))->ill_lock));
+ if (ire->ire_ill != NULL)
+ ASSERT(!MUTEX_HELD(&ire->ire_ill->ill_lock));
ASSERT(ire->ire_ipversion == IPV4_VERSION);
- ASSERT(ire->ire_mp == NULL); /* Calls should go through ire_add */
-
- /* Find the appropriate list head. */
- switch (ire->ire_type) {
- case IRE_HOST:
- ire->ire_mask = IP_HOST_MASK;
- ire->ire_masklen = IP_ABITS;
- ire->ire_marks |= marks;
- if ((ire->ire_flags & RTF_SETSRC) == 0)
- ire->ire_src_addr = 0;
- break;
- case IRE_CACHE:
- ire->ire_mask = IP_HOST_MASK;
- ire->ire_masklen = IP_ABITS;
- ire->ire_marks |= marks;
- break;
- case IRE_BROADCAST:
- case IRE_LOCAL:
- case IRE_LOOPBACK:
- ire->ire_mask = IP_HOST_MASK;
- ire->ire_masklen = IP_ABITS;
- break;
- case IRE_PREFIX:
- case IRE_DEFAULT:
- ire->ire_marks |= marks;
- if ((ire->ire_flags & RTF_SETSRC) == 0)
- ire->ire_src_addr = 0;
- break;
- case IRE_IF_RESOLVER:
- case IRE_IF_NORESOLVER:
- ire->ire_marks |= marks;
- break;
- default:
- ip0dbg(("ire_add_v4: ire %p has unrecognized IRE type (%d)\n",
- (void *)ire, ire->ire_type));
- ire_delete(ire);
- *ire_p = NULL;
- return (EINVAL);
- }
/* Make sure the address is properly masked. */
ire->ire_addr &= ire->ire_mask;
- /*
- * ip_newroute/ip_newroute_multi are unable to prevent the deletion
- * of the interface route while adding an IRE_CACHE for an on-link
- * destination in the IRE_IF_RESOLVER case, since the ire has to
- * go to ARP and return. We can't do a REFHOLD on the
- * associated interface ire for fear of ARP freeing the message.
- * Here we look up the interface ire in the forwarding table and
- * make sure that the interface route has not been deleted.
- */
- if (ire->ire_type == IRE_CACHE && ire->ire_gateway_addr == 0 &&
- ((ill_t *)ire->ire_stq->q_ptr)->ill_net_type == IRE_IF_RESOLVER) {
-
- ASSERT(ire->ire_max_fragp == NULL);
- if (CLASSD(ire->ire_addr) && !(ire->ire_flags & RTF_SETSRC)) {
- /*
- * The ihandle that we used in ip_newroute_multi
- * comes from the interface route corresponding
- * to ire_ipif. Lookup here to see if it exists
- * still.
- * If the ire has a source address assigned using
- * RTF_SETSRC, ire_ipif is the logical interface holding
- * this source address, so we can't use it to check for
- * the existence of the interface route. Instead we rely
- * on the brute force ihandle search in
- * ire_ihandle_lookup_onlink() below.
- */
- pire = ipif_to_ire(ire->ire_ipif);
- if (pire == NULL) {
- ire_delete(ire);
- *ire_p = NULL;
- return (EINVAL);
- } else if (pire->ire_ihandle != ire->ire_ihandle) {
- ire_refrele(pire);
- ire_delete(ire);
- *ire_p = NULL;
- return (EINVAL);
- }
- } else {
- pire = ire_ihandle_lookup_onlink(ire);
- if (pire == NULL) {
- ire_delete(ire);
- *ire_p = NULL;
- return (EINVAL);
- }
- }
- /* Prevent pire from getting deleted */
- IRB_REFHOLD(pire->ire_bucket);
- /* Has it been removed already ? */
- if (pire->ire_marks & IRE_MARK_CONDEMNED) {
- IRB_REFRELE(pire->ire_bucket);
- ire_refrele(pire);
- ire_delete(ire);
- *ire_p = NULL;
- return (EINVAL);
- }
- } else {
- ASSERT(ire->ire_max_fragp != NULL);
- }
- flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW);
+ match_flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW);
- if (ire->ire_ipif != NULL) {
- /*
- * We use MATCH_IRE_IPIF while adding IRE_CACHES only
- * for historic reasons and to maintain symmetry with
- * IPv6 code path. Historically this was used by
- * multicast code to create multiple IRE_CACHES on
- * a single ill with different ipifs. This was used
- * so that multicast packets leaving the node had the
- * right source address. This is no longer needed as
- * ip_wput initializes the address correctly.
- */
- flags |= MATCH_IRE_IPIF;
- /*
- * If we are creating a hidden IRE, make sure we search for
- * hidden IREs when searching for duplicates below.
- * Otherwise, we might find an IRE on some other interface
- * that's not marked hidden.
- */
- if (ire->ire_marks & IRE_MARK_TESTHIDDEN)
- flags |= MATCH_IRE_MARK_TESTHIDDEN;
+ if (ire->ire_ill != NULL) {
+ match_flags |= MATCH_IRE_ILL;
}
- if ((ire->ire_type & IRE_CACHETABLE) == 0) {
- irb_ptr = ire_get_bucket(ire);
- need_refrele = B_TRUE;
- if (irb_ptr == NULL) {
- /*
- * This assumes that the ire has not added
- * a reference to the ipif.
- */
- ire->ire_ipif = NULL;
- ire_delete(ire);
- if (pire != NULL) {
- IRB_REFRELE(pire->ire_bucket);
- ire_refrele(pire);
- }
- *ire_p = NULL;
- return (EINVAL);
- }
- } else {
- irb_ptr = &(ipst->ips_ip_cache_table[IRE_ADDR_HASH(
- ire->ire_addr, ipst->ips_ip_cache_table_size)]);
+ irb_ptr = ire_get_bucket(ire);
+ if (irb_ptr == NULL) {
+ printf("no bucket for %p\n", (void *)ire);
+ ire_delete(ire);
+ return (NULL);
}
/*
- * Start the atomic add of the ire. Grab the ill locks,
- * ill_g_usesrc_lock and the bucket lock. Check for condemned
- *
- * If ipif or ill is changing ire_atomic_start() may queue the
- * request and return EINPROGRESS.
- * To avoid lock order problems, get the ndp4->ndp_g_lock.
+ * Start the atomic add of the ire. Grab the ill lock,
+ * the bucket lock. Check for condemned.
*/
- mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
- error = ire_atomic_start(irb_ptr, ire, q, mp, func);
+ error = ire_atomic_start(irb_ptr, ire);
if (error != 0) {
- mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
- /*
- * We don't know whether it is a valid ipif or not.
- * So, set it to NULL. This assumes that the ire has not added
- * a reference to the ipif.
- */
- ire->ire_ipif = NULL;
+ printf("no ire_atomic_start for %p\n", (void *)ire);
ire_delete(ire);
- if (pire != NULL) {
- IRB_REFRELE(pire->ire_bucket);
- ire_refrele(pire);
- }
- *ire_p = NULL;
- if (need_refrele)
- IRB_REFRELE(irb_ptr);
- return (error);
+ irb_refrele(irb_ptr);
+ return (NULL);
}
/*
- * To avoid creating ires having stale values for the ire_max_frag
- * we get the latest value atomically here. For more details
- * see the block comment in ip_sioctl_mtu and in DL_NOTE_SDU_CHANGE
- * in ip_rput_dlpi_writer
+ * If we are creating a hidden IRE, make sure we search for
+ * hidden IREs when searching for duplicates below.
+ * Otherwise, we might find an IRE on some other interface
+ * that's not marked hidden.
*/
- if (ire->ire_max_fragp == NULL) {
- if (CLASSD(ire->ire_addr))
- ire->ire_max_frag = ire->ire_ipif->ipif_mtu;
- else
- ire->ire_max_frag = pire->ire_max_frag;
- } else {
- uint_t max_frag;
+ if (ire->ire_testhidden)
+ match_flags |= MATCH_IRE_TESTHIDDEN;
- max_frag = *ire->ire_max_fragp;
- ire->ire_max_fragp = NULL;
- ire->ire_max_frag = max_frag;
- }
/*
* Atomically check for duplicate and insert in the table.
*/
for (ire1 = irb_ptr->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
- if (ire1->ire_marks & IRE_MARK_CONDEMNED)
+ if (IRE_IS_CONDEMNED(ire1))
continue;
- if (ire->ire_ipif != NULL) {
- /*
- * We do MATCH_IRE_ILL implicitly here for IREs
- * with a non-null ire_ipif, including IRE_CACHEs.
- * As ire_ipif and ire_stq could point to two
- * different ills, we can't pass just ire_ipif to
- * ire_match_args and get a match on both ills.
- * This is just needed for duplicate checks here and
- * so we don't add an extra argument to
- * ire_match_args for this. Do it locally.
- *
- * NOTE : Currently there is no part of the code
- * that asks for both MATH_IRE_IPIF and MATCH_IRE_ILL
- * match for IRE_CACHEs. Thus we don't want to
- * extend the arguments to ire_match_args.
- */
- if (ire1->ire_stq != ire->ire_stq)
- continue;
- /*
- * Multiroute IRE_CACHEs for a given destination can
- * have the same ire_ipif, typically if their source
- * address is forced using RTF_SETSRC, and the same
- * send-to queue. We differentiate them using the parent
- * handle.
- */
- if (ire->ire_type == IRE_CACHE &&
- (ire1->ire_flags & RTF_MULTIRT) &&
- (ire->ire_flags & RTF_MULTIRT) &&
- (ire1->ire_phandle != ire->ire_phandle))
- continue;
- }
+ /*
+ * Here we need an exact match on zoneid, i.e.,
+ * ire_match_args doesn't fit.
+ */
if (ire1->ire_zoneid != ire->ire_zoneid)
continue;
+
+ if (ire1->ire_type != ire->ire_type)
+ continue;
+
+ /*
+ * Note: We do not allow multiple routes that differ only
+ * in the gateway security attributes; such routes are
+ * considered duplicates.
+ * To change that we explicitly have to treat them as
+ * different here.
+ */
if (ire_match_args(ire1, ire->ire_addr, ire->ire_mask,
- ire->ire_gateway_addr, ire->ire_type, ire->ire_ipif,
- ire->ire_zoneid, 0, NULL, flags, NULL)) {
+ ire->ire_gateway_addr, ire->ire_type, ire->ire_ill,
+ ire->ire_zoneid, NULL, match_flags)) {
/*
* Return the old ire after doing a REFHOLD.
* As most of the callers continue to use the IRE
@@ -2881,149 +1252,36 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func,
* avoid a lookup in the caller again. If the callers
* don't want to use it, they need to do a REFRELE.
*/
- ip1dbg(("found dup ire existing %p new %p\n",
- (void *)ire1, (void *)ire));
- IRE_REFHOLD(ire1);
+ atomic_add_32(&ire1->ire_identical_ref, 1);
+ DTRACE_PROBE2(ire__add__exist, ire_t *, ire1,
+ ire_t *, ire);
+ ire_refhold(ire1);
ire_atomic_end(irb_ptr, ire);
- mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
ire_delete(ire);
- if (pire != NULL) {
- /*
- * Assert that it is not removed from the
- * list yet.
- */
- ASSERT(pire->ire_ptpn != NULL);
- IRB_REFRELE(pire->ire_bucket);
- ire_refrele(pire);
- }
- *ire_p = ire1;
- if (need_refrele)
- IRB_REFRELE(irb_ptr);
- return (0);
+ irb_refrele(irb_ptr);
+ return (ire1);
}
}
- if (ire->ire_type & IRE_CACHE) {
- ASSERT(ire->ire_stq != NULL);
- nce = ndp_lookup_v4(ire_to_ill(ire),
- ((ire->ire_gateway_addr != INADDR_ANY) ?
- &ire->ire_gateway_addr : &ire->ire_addr),
- B_TRUE);
- if (nce != NULL)
- mutex_enter(&nce->nce_lock);
- /*
- * if the nce is NCE_F_CONDEMNED, or if it is not ND_REACHABLE
- * and the caller has prohibited the addition of incomplete
- * ire's, we fail the add. Note that nce_state could be
- * something other than ND_REACHABLE if the nce had
- * just expired and the ire_create preceding the
- * ire_add added a new ND_INITIAL nce.
- */
- if ((nce == NULL) ||
- (nce->nce_flags & NCE_F_CONDEMNED) ||
- (!allow_unresolved &&
- (nce->nce_state != ND_REACHABLE))) {
- if (nce != NULL) {
- DTRACE_PROBE1(ire__bad__nce, nce_t *, nce);
- mutex_exit(&nce->nce_lock);
- }
- ire_atomic_end(irb_ptr, ire);
- mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
- if (nce != NULL)
- NCE_REFRELE(nce);
- DTRACE_PROBE1(ire__no__nce, ire_t *, ire);
- ire_delete(ire);
- if (pire != NULL) {
- IRB_REFRELE(pire->ire_bucket);
- ire_refrele(pire);
- }
- *ire_p = NULL;
- if (need_refrele)
- IRB_REFRELE(irb_ptr);
- return (EINVAL);
- } else {
- ire->ire_nce = nce;
- mutex_exit(&nce->nce_lock);
- /*
- * We are associating this nce to the ire, so
- * change the nce ref taken in ndp_lookup_v4() from
- * NCE_REFHOLD to NCE_REFHOLD_NOTR
- */
- NCE_REFHOLD_TO_REFHOLD_NOTR(ire->ire_nce);
- }
- }
/*
- * Make it easy for ip_wput_ire() to hit multiple broadcast ires by
- * grouping identical addresses together on the hash chain. We do
- * this only for IRE_BROADCASTs as ip_wput_ire is currently interested
- * in such groupings only for broadcasts.
- *
- * Find the first entry that matches ire_addr. *irep will be null
- * if no match.
- *
- * Note: the loopback and non-loopback broadcast entries for an
- * interface MUST be added before any MULTIRT entries.
+ * Normally we do head insertion since most things do not care about
+ * the order of the IREs in the bucket. Note that ip_cgtp_bcast_add
+ * assumes we at least do head insertion so that its IRE_BROADCAST
+ * arrive ahead of existing IRE_HOST for the same address.
+ * However, due to shared-IP zones (and restrict_interzone_loopback)
+ * we can have an IRE_LOCAL as well as IRE_IF_CLONE for the same
+ * address. For that reason we do tail insertion for IRE_IF_CLONE.
+ * Due to the IRE_BROADCAST on cgtp0, which must be last in the bucket,
+ * we do tail insertion of IRE_BROADCASTs that do not have RTF_MULTIRT
+ * set.
*/
irep = (ire_t **)irb_ptr;
- while ((ire1 = *irep) != NULL && ire->ire_addr != ire1->ire_addr)
- irep = &ire1->ire_next;
- if (ire->ire_type == IRE_BROADCAST && *irep != NULL) {
- /*
- * We found some ire (i.e *irep) with a matching addr. We
- * want to group ires with same addr.
- */
- for (;;) {
- ire1 = *irep;
- if ((ire1->ire_next == NULL) ||
- (ire1->ire_next->ire_addr != ire->ire_addr) ||
- (ire1->ire_type != IRE_BROADCAST) ||
- (ire1->ire_flags & RTF_MULTIRT) ||
- (ire1->ire_ipif->ipif_ill->ill_grp ==
- ire->ire_ipif->ipif_ill->ill_grp))
- break;
- irep = &ire1->ire_next;
- }
- ASSERT(*irep != NULL);
- /*
- * The ire will be added before *irep, so
- * if irep is a MULTIRT ire, just break to
- * ire insertion code.
- */
- if (((*irep)->ire_flags & RTF_MULTIRT) != 0)
- goto insert_ire;
-
- irep = &((*irep)->ire_next);
-
- /*
- * Either we have hit the end of the list or the address
- * did not match.
- */
- while (*irep != NULL) {
- ire1 = *irep;
- if ((ire1->ire_addr != ire->ire_addr) ||
- (ire1->ire_type != IRE_BROADCAST))
- break;
- if (ire1->ire_ipif == ire->ire_ipif) {
- irep = &ire1->ire_next;
- break;
- }
- irep = &ire1->ire_next;
- }
- } else if (*irep != NULL) {
- /*
- * Find the last ire which matches ire_addr.
- * Needed to do tail insertion among entries with the same
- * ire_addr.
- */
- while (ire->ire_addr == ire1->ire_addr) {
+ if ((ire->ire_type & IRE_IF_CLONE) ||
+ ((ire->ire_type & IRE_BROADCAST) &&
+ !(ire->ire_flags & RTF_MULTIRT))) {
+ while ((ire1 = *irep) != NULL)
irep = &ire1->ire_next;
- ire1 = *irep;
- if (ire1 == NULL)
- break;
- }
}
-
-insert_ire:
/* Insert at *irep */
ire1 = *irep;
if (ire1 != NULL)
@@ -3058,82 +1316,31 @@ insert_ire:
* in the list for the first time and no one else can bump
* up the reference count on this yet.
*/
- IRE_REFHOLD_LOCKED(ire);
+ ire_refhold_locked(ire);
BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_inserted);
irb_ptr->irb_ire_cnt++;
- if (irb_ptr->irb_marks & IRB_MARK_FTABLE)
+ if (irb_ptr->irb_marks & IRB_MARK_DYNAMIC)
irb_ptr->irb_nire++;
- if (ire->ire_marks & IRE_MARK_TEMPORARY)
- irb_ptr->irb_tmp_ire_cnt++;
-
- if (ire->ire_ipif != NULL) {
- DTRACE_PROBE3(ipif__incr__cnt, (ipif_t *), ire->ire_ipif,
- (char *), "ire", (void *), ire);
- ire->ire_ipif->ipif_ire_cnt++;
- if (ire->ire_stq != NULL) {
- stq_ill = (ill_t *)ire->ire_stq->q_ptr;
- DTRACE_PROBE3(ill__incr__cnt, (ill_t *), stq_ill,
- (char *), "ire", (void *), ire);
- stq_ill->ill_ire_cnt++;
- }
- } else {
- ASSERT(ire->ire_stq == NULL);
+ if (ire->ire_ill != NULL) {
+ ire->ire_ill->ill_ire_cnt++;
+ ASSERT(ire->ire_ill->ill_ire_cnt != 0); /* Wraparound */
}
ire_atomic_end(irb_ptr, ire);
- mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
- if (pire != NULL) {
- /* Assert that it is not removed from the list yet */
- ASSERT(pire->ire_ptpn != NULL);
- IRB_REFRELE(pire->ire_bucket);
- ire_refrele(pire);
- }
+ /* Make any caching of the IREs be notified or updated */
+ ire_flush_cache_v4(ire, IRE_FLUSH_ADD);
- if (ire->ire_type != IRE_CACHE) {
- /*
- * For ire's with host mask see if there is an entry
- * in the cache. If there is one flush the whole cache as
- * there might be multiple entries due to RTF_MULTIRT (CGTP).
- * If no entry is found than there is no need to flush the
- * cache.
- */
- if (ire->ire_mask == IP_HOST_MASK) {
- ire_t *lire;
- lire = ire_ctable_lookup(ire->ire_addr, NULL, IRE_CACHE,
- NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
- if (lire != NULL) {
- ire_refrele(lire);
- ire_flush_cache_v4(ire, IRE_FLUSH_ADD);
- }
- } else {
- ire_flush_cache_v4(ire, IRE_FLUSH_ADD);
- }
- }
- /*
- * We had to delay the fast path probe until the ire is inserted
- * in the list. Otherwise the fast path ack won't find the ire in
- * the table.
- */
- if (ire->ire_type == IRE_CACHE ||
- (ire->ire_type == IRE_BROADCAST && ire->ire_stq != NULL)) {
- ASSERT(ire->ire_nce != NULL);
- if (ire->ire_nce->nce_state == ND_REACHABLE)
- nce_fastpath(ire->ire_nce);
- }
- if (ire->ire_ipif != NULL)
- ASSERT(!MUTEX_HELD(&ire->ire_ipif->ipif_ill->ill_lock));
- *ire_p = ire;
- if (need_refrele) {
- IRB_REFRELE(irb_ptr);
- }
- return (0);
+ if (ire->ire_ill != NULL)
+ ASSERT(!MUTEX_HELD(&ire->ire_ill->ill_lock));
+ irb_refrele(irb_ptr);
+ return (ire);
}
/*
- * IRB_REFRELE is the only caller of the function. ire_unlink calls to
+ * irb_refrele is the only caller of the function. ire_unlink calls to
* do the final cleanup for this ire.
*/
void
@@ -3162,13 +1369,13 @@ ire_cleanup(ire_t *ire)
* so.
*/
ire->ire_next = NULL;
- IRE_REFRELE_NOTR(ire);
+ ire_refrele_notr(ire);
ire = ire_next;
}
}
/*
- * IRB_REFRELE is the only caller of the function. It calls to unlink
+ * irb_refrele is the only caller of the function. It calls to unlink
* all the CONDEMNED ires from this bucket.
*/
ire_t *
@@ -3180,16 +1387,14 @@ ire_unlink(irb_t *irb)
ire_t *ire_list = NULL;
ASSERT(RW_WRITE_HELD(&irb->irb_lock));
- ASSERT(((irb->irb_marks & IRB_MARK_FTABLE) && irb->irb_refcnt == 1) ||
+ ASSERT(((irb->irb_marks & IRB_MARK_DYNAMIC) && irb->irb_refcnt == 1) ||
(irb->irb_refcnt == 0));
ASSERT(irb->irb_marks & IRB_MARK_CONDEMNED);
ASSERT(irb->irb_ire != NULL);
for (ire = irb->irb_ire; ire != NULL; ire = ire1) {
- ip_stack_t *ipst = ire->ire_ipst;
-
ire1 = ire->ire_next;
- if (ire->ire_marks & IRE_MARK_CONDEMNED) {
+ if (IRE_IS_CONDEMNED(ire)) {
ptpn = ire->ire_ptpn;
ire1 = ire->ire_next;
if (ire1)
@@ -3197,22 +1402,10 @@ ire_unlink(irb_t *irb)
*ptpn = ire1;
ire->ire_ptpn = NULL;
ire->ire_next = NULL;
- if (ire->ire_type == IRE_DEFAULT) {
- /*
- * IRE is out of the list. We need to adjust
- * the accounting before the caller drops
- * the lock.
- */
- if (ire->ire_ipversion == IPV6_VERSION) {
- ASSERT(ipst->
- ips_ipv6_ire_default_count !=
- 0);
- ipst->ips_ipv6_ire_default_count--;
- }
- }
+
/*
- * We need to call ire_delete_v4 or ire_delete_v6
- * to clean up the cache or the redirects pointing at
+ * We need to call ire_delete_v4 or ire_delete_v6 to
+ * clean up dependents and the redirects pointing at
* the default gateway. We need to drop the lock
* as ire_flush_cache/ire_delete_host_redircts require
* so. But we can't drop the lock, as ire_unlink needs
@@ -3230,76 +1423,7 @@ ire_unlink(irb_t *irb)
}
/*
- * Delete all the cache entries with this 'addr'. When IP gets a gratuitous
- * ARP message on any of its interface queue, it scans the nce table and
- * deletes and calls ndp_delete() for the appropriate nce. This action
- * also deletes all the neighbor/ire cache entries for that address.
- * This function is called from ip_arp_news in ip.c and also for
- * ARP ioctl processing in ip_if.c. ip_ire_clookup_and_delete returns
- * true if it finds a nce entry which is used by ip_arp_news to determine if
- * it needs to do an ire_walk_v4. The return value is also used for the
- * same purpose by ARP IOCTL processing * in ip_if.c when deleting
- * ARP entries. For SIOC*IFARP ioctls in addition to the address,
- * ip_if->ipif_ill also needs to be matched.
- */
-boolean_t
-ip_ire_clookup_and_delete(ipaddr_t addr, ipif_t *ipif, ip_stack_t *ipst)
-{
- ill_t *ill;
- nce_t *nce;
-
- ill = (ipif ? ipif->ipif_ill : NULL);
-
- if (ill != NULL) {
- /*
- * clean up the nce (and any relevant ire's) that matches
- * on addr and ill.
- */
- nce = ndp_lookup_v4(ill, &addr, B_FALSE);
- if (nce != NULL) {
- ndp_delete(nce);
- return (B_TRUE);
- }
- } else {
- /*
- * ill is wildcard. clean up all nce's and
- * ire's that match on addr
- */
- nce_clookup_t cl;
-
- cl.ncecl_addr = addr;
- cl.ncecl_found = B_FALSE;
-
- ndp_walk_common(ipst->ips_ndp4, NULL,
- (pfi_t)ip_nce_clookup_and_delete, (uchar_t *)&cl, B_TRUE);
-
- /*
- * ncecl_found would be set by ip_nce_clookup_and_delete if
- * we found a matching nce.
- */
- return (cl.ncecl_found);
- }
- return (B_FALSE);
-
-}
-
-/* Delete the supplied nce if its nce_addr matches the supplied address */
-static void
-ip_nce_clookup_and_delete(nce_t *nce, void *arg)
-{
- nce_clookup_t *cl = (nce_clookup_t *)arg;
- ipaddr_t nce_addr;
-
- IN6_V4MAPPED_TO_IPADDR(&nce->nce_addr, nce_addr);
- if (nce_addr == cl->ncecl_addr) {
- cl->ncecl_found = B_TRUE;
- /* clean up the nce (and any relevant ire's) */
- ndp_delete(nce);
- }
-}
-
-/*
- * Clean up the radix node for this ire. Must be called by IRB_REFRELE
+ * Clean up the radix node for this ire. Must be called by irb_refrele
* when there are no ire's left in the bucket. Returns TRUE if the bucket
* is deleted and freed.
*/
@@ -3335,40 +1459,55 @@ irb_inactive(irb_t *irb)
/*
* Delete the specified IRE.
+ * We assume that if ire_bucket is not set then ire_ill->ill_ire_cnt was
+ * not incremented i.e., that the insertion in the bucket and the increment
+ * of that counter is done atomically.
*/
void
ire_delete(ire_t *ire)
{
ire_t *ire1;
ire_t **ptpn;
- irb_t *irb;
+ irb_t *irb;
+ nce_t *nce;
ip_stack_t *ipst = ire->ire_ipst;
+ /* We can clear ire_nce_cache under ire_lock even if the IRE is used */
+ mutex_enter(&ire->ire_lock);
+ nce = ire->ire_nce_cache;
+ ire->ire_nce_cache = NULL;
+ mutex_exit(&ire->ire_lock);
+ if (nce != NULL)
+ nce_refrele(nce);
+
if ((irb = ire->ire_bucket) == NULL) {
/*
* It was never inserted in the list. Should call REFRELE
* to free this IRE.
*/
- IRE_REFRELE_NOTR(ire);
+ ire_refrele_notr(ire);
return;
}
- rw_enter(&irb->irb_lock, RW_WRITER);
-
- if (irb->irb_rr_origin == ire) {
- irb->irb_rr_origin = NULL;
- }
-
/*
- * In case of V4 we might still be waiting for fastpath ack.
+ * Move the use counts from an IRE_IF_CLONE to its parent
+ * IRE_INTERFACE.
+ * We need to do this before acquiring irb_lock.
*/
- if (ire->ire_ipversion == IPV4_VERSION &&
- (ire->ire_type == IRE_CACHE ||
- (ire->ire_type == IRE_BROADCAST && ire->ire_stq != NULL))) {
- ASSERT(ire->ire_nce != NULL);
- nce_fastpath_list_delete(ire->ire_nce);
+ if (ire->ire_type & IRE_IF_CLONE) {
+ ire_t *parent;
+
+ rw_enter(&ipst->ips_ire_dep_lock, RW_READER);
+ if ((parent = ire->ire_dep_parent) != NULL) {
+ parent->ire_ob_pkt_count += ire->ire_ob_pkt_count;
+ parent->ire_ib_pkt_count += ire->ire_ib_pkt_count;
+ ire->ire_ob_pkt_count = 0;
+ ire->ire_ib_pkt_count = 0;
+ }
+ rw_exit(&ipst->ips_ire_dep_lock);
}
+ rw_enter(&irb->irb_lock, RW_WRITER);
if (ire->ire_ptpn == NULL) {
/*
* Some other thread has removed us from the list.
@@ -3378,13 +1517,17 @@ ire_delete(ire_t *ire)
return;
}
- if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) {
- irb->irb_ire_cnt--;
- ire->ire_marks |= IRE_MARK_CONDEMNED;
- if (ire->ire_marks & IRE_MARK_TEMPORARY) {
- irb->irb_tmp_ire_cnt--;
- ire->ire_marks &= ~IRE_MARK_TEMPORARY;
+ if (!IRE_IS_CONDEMNED(ire)) {
+ /* Is this an IRE representing multiple duplicate entries? */
+ ASSERT(ire->ire_identical_ref >= 1);
+ if (atomic_add_32_nv(&ire->ire_identical_ref, -1) != 0) {
+ /* Removed one of the identical parties */
+ rw_exit(&irb->irb_lock);
+ return;
}
+
+ irb->irb_ire_cnt--;
+ ire_make_condemned(ire);
}
if (irb->irb_refcnt != 0) {
@@ -3419,22 +1562,9 @@ ire_delete(ire_t *ire)
} else {
BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_deleted);
}
- /*
- * ip_wput/ip_wput_v6 checks this flag to see whether
- * it should still use the cached ire or not.
- */
- if (ire->ire_type == IRE_DEFAULT) {
- /*
- * IRE is out of the list. We need to adjust the
- * accounting before we drop the lock.
- */
- if (ire->ire_ipversion == IPV6_VERSION) {
- ASSERT(ipst->ips_ipv6_ire_default_count != 0);
- ipst->ips_ipv6_ire_default_count--;
- }
- }
rw_exit(&irb->irb_lock);
+ /* Cleanup dependents and related stuff */
if (ire->ire_ipversion == IPV6_VERSION) {
ire_delete_v6(ire);
} else {
@@ -3444,7 +1574,7 @@ ire_delete(ire_t *ire)
* We removed it from the list. Decrement the
* reference count.
*/
- IRE_REFRELE_NOTR(ire);
+ ire_refrele_notr(ire);
}
/*
@@ -3463,8 +1593,7 @@ ire_delete_v4(ire_t *ire)
ASSERT(ire->ire_refcnt >= 1);
ASSERT(ire->ire_ipversion == IPV4_VERSION);
- if (ire->ire_type != IRE_CACHE)
- ire_flush_cache_v4(ire, IRE_FLUSH_DELETE);
+ ire_flush_cache_v4(ire, IRE_FLUSH_DELETE);
if (ire->ire_type == IRE_DEFAULT) {
/*
* when a default gateway is going away
@@ -3473,20 +1602,33 @@ ire_delete_v4(ire_t *ire)
*/
ire_delete_host_redirects(ire->ire_gateway_addr, ipst);
}
+
+ /*
+ * If we are deleting an IRE_INTERFACE then we make sure we also
+ * delete any IRE_IF_CLONE that has been created from it.
+ * Those are always in ire_dep_children.
+ */
+ if ((ire->ire_type & IRE_INTERFACE) && ire->ire_dep_children != NULL)
+ ire_dep_delete_if_clone(ire);
+
+ /* Remove from parent dependencies and child */
+ rw_enter(&ipst->ips_ire_dep_lock, RW_WRITER);
+ if (ire->ire_dep_parent != NULL)
+ ire_dep_remove(ire);
+
+ while (ire->ire_dep_children != NULL)
+ ire_dep_remove(ire->ire_dep_children);
+ rw_exit(&ipst->ips_ire_dep_lock);
}
/*
- * IRE_REFRELE/ire_refrele are the only caller of the function. It calls
+ * ire_refrele is the only caller of the function. It calls
* to free the ire when the reference count goes to zero.
*/
void
ire_inactive(ire_t *ire)
{
- nce_t *nce;
- ill_t *ill = NULL;
- ill_t *stq_ill = NULL;
- ipif_t *ipif;
- boolean_t need_wakeup = B_FALSE;
+ ill_t *ill;
irb_t *irb;
ip_stack_t *ipst = ire->ire_ipst;
@@ -3494,128 +1636,71 @@ ire_inactive(ire_t *ire)
ASSERT(ire->ire_ptpn == NULL);
ASSERT(ire->ire_next == NULL);
+ /* Count how many condemned ires for kmem_cache callback */
+ if (IRE_IS_CONDEMNED(ire))
+ atomic_add_32(&ipst->ips_num_ire_condemned, -1);
+
if (ire->ire_gw_secattr != NULL) {
ire_gw_secattr_free(ire->ire_gw_secattr);
ire->ire_gw_secattr = NULL;
}
- if (ire->ire_mp != NULL) {
- ASSERT(ire->ire_bucket == NULL);
- mutex_destroy(&ire->ire_lock);
- BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_freed);
- if (ire->ire_nce != NULL)
- NCE_REFRELE_NOTR(ire->ire_nce);
- freeb(ire->ire_mp);
- return;
- }
-
- if ((nce = ire->ire_nce) != NULL) {
- NCE_REFRELE_NOTR(nce);
- ire->ire_nce = NULL;
- }
-
- if (ire->ire_ipif == NULL)
- goto end;
-
- ipif = ire->ire_ipif;
- ill = ipif->ipif_ill;
+ /*
+ * ire_nce_cache is cleared in ire_delete, and we make sure we don't
+ * set it once the ire is marked condemned.
+ */
+ ASSERT(ire->ire_nce_cache == NULL);
- if (ire->ire_bucket == NULL) {
- /* The ire was never inserted in the table. */
- goto end;
- }
+ /*
+ * Since any parent would have a refhold on us they would already
+ * have been removed.
+ */
+ ASSERT(ire->ire_dep_parent == NULL);
+ ASSERT(ire->ire_dep_sib_next == NULL);
+ ASSERT(ire->ire_dep_sib_ptpn == NULL);
/*
- * ipif_ire_cnt on this ipif goes down by 1. If the ire_stq is
- * non-null ill_ire_count also goes down by 1.
- *
- * The ipif that is associated with an ire is ire->ire_ipif and
- * hence when the ire->ire_ipif->ipif_ire_cnt drops to zero we call
- * ipif_ill_refrele_tail. Usually stq_ill is null or the same as
- * ire->ire_ipif->ipif_ill. So nothing more needs to be done.
- * However, for VNI or IPMP IRE entries, stq_ill can be different.
- * If this is different from ire->ire_ipif->ipif_ill and if the
- * ill_ire_cnt on the stq_ill also has dropped to zero, we call
- * ipif_ill_refrele_tail on the stq_ill.
+ * Since any children would have a refhold on us they should have
+ * already been removed.
*/
- if (ire->ire_stq != NULL)
- stq_ill = ire->ire_stq->q_ptr;
+ ASSERT(ire->ire_dep_children == NULL);
- if (stq_ill == NULL || stq_ill == ill) {
- /* Optimize the most common case */
+ /*
+ * ill_ire_ref is increased when the IRE is inserted in the
+ * bucket - not when the IRE is created.
+ */
+ irb = ire->ire_bucket;
+ ill = ire->ire_ill;
+ if (irb != NULL && ill != NULL) {
mutex_enter(&ill->ill_lock);
- ASSERT(ipif->ipif_ire_cnt != 0);
- DTRACE_PROBE3(ipif__decr__cnt, (ipif_t *), ipif,
+ ASSERT(ill->ill_ire_cnt != 0);
+ DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
(char *), "ire", (void *), ire);
- ipif->ipif_ire_cnt--;
- if (IPIF_DOWN_OK(ipif))
- need_wakeup = B_TRUE;
- if (stq_ill != NULL) {
- ASSERT(stq_ill->ill_ire_cnt != 0);
- DTRACE_PROBE3(ill__decr__cnt, (ill_t *), stq_ill,
- (char *), "ire", (void *), ire);
- stq_ill->ill_ire_cnt--;
- if (ILL_DOWN_OK(stq_ill))
- need_wakeup = B_TRUE;
- }
- if (need_wakeup) {
+ ill->ill_ire_cnt--;
+ if (ILL_DOWN_OK(ill)) {
/* Drops the ill lock */
ipif_ill_refrele_tail(ill);
} else {
mutex_exit(&ill->ill_lock);
}
- } else {
- /*
- * We can't grab all the ill locks at the same time.
- * It can lead to recursive lock enter in the call to
- * ipif_ill_refrele_tail and later. Instead do it 1 at
- * a time.
- */
- mutex_enter(&ill->ill_lock);
- ASSERT(ipif->ipif_ire_cnt != 0);
- DTRACE_PROBE3(ipif__decr__cnt, (ipif_t *), ipif,
- (char *), "ire", (void *), ire);
- ipif->ipif_ire_cnt--;
- if (IPIF_DOWN_OK(ipif)) {
- /* Drops the lock */
- ipif_ill_refrele_tail(ill);
- } else {
- mutex_exit(&ill->ill_lock);
- }
- if (stq_ill != NULL) {
- mutex_enter(&stq_ill->ill_lock);
- ASSERT(stq_ill->ill_ire_cnt != 0);
- DTRACE_PROBE3(ill__decr__cnt, (ill_t *), stq_ill,
- (char *), "ire", (void *), ire);
- stq_ill->ill_ire_cnt--;
- if (ILL_DOWN_OK(stq_ill)) {
- /* Drops the ill lock */
- ipif_ill_refrele_tail(stq_ill);
- } else {
- mutex_exit(&stq_ill->ill_lock);
- }
- }
}
-end:
- /* This should be true for both V4 and V6 */
+ ire->ire_ill = NULL;
- if ((ire->ire_type & IRE_FORWARDTABLE) &&
- (ire->ire_ipversion == IPV4_VERSION) &&
- ((irb = ire->ire_bucket) != NULL)) {
+ /* This should be true for both V4 and V6 */
+ if (irb != NULL && (irb->irb_marks & IRB_MARK_DYNAMIC)) {
rw_enter(&irb->irb_lock, RW_WRITER);
irb->irb_nire--;
/*
* Instead of examining the conditions for freeing
* the radix node here, we do it by calling
- * IRB_REFRELE which is a single point in the code
+ * irb_refrele which is a single point in the code
* that embeds that logic. Bump up the refcnt to
- * be able to call IRB_REFRELE
+ * be able to call irb_refrele
*/
- IRB_REFHOLD_LOCKED(irb);
+ irb_refhold_locked(irb);
rw_exit(&irb->irb_lock);
- IRB_REFRELE(irb);
+ irb_refrele(irb);
}
- ire->ire_ipif = NULL;
#ifdef DEBUG
ire_trace_cleanup(ire);
@@ -3626,333 +1711,276 @@ end:
} else {
BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_freed);
}
- ASSERT(ire->ire_mp == NULL);
- /* Has been allocated out of the cache */
kmem_cache_free(ire_cache, ire);
}
/*
- * ire_walk routine to delete all IRE_CACHE/IRE_HOST types redirect
- * entries that have a given gateway address.
+ * ire_update_generation is the callback function provided by
+ * ire_get_bucket() to update the generation number of any
+ * matching shorter route when a new route is added.
+ *
+ * This fucntion always returns a failure return (B_FALSE)
+ * to force the caller (rn_matchaddr_args)
+ * to back-track up the tree looking for shorter matches.
+ */
+/* ARGSUSED */
+static boolean_t
+ire_update_generation(struct radix_node *rn, void *arg)
+{
+ struct rt_entry *rt = (struct rt_entry *)rn;
+
+ /* We need to handle all in the same bucket */
+ irb_increment_generation(&rt->rt_irb);
+ return (B_FALSE);
+}
+
+/*
+ * Take care of all the generation numbers in the bucket.
*/
void
-ire_delete_cache_gw(ire_t *ire, char *cp)
+irb_increment_generation(irb_t *irb)
{
- ipaddr_t gw_addr;
+ ire_t *ire;
- if (!(ire->ire_type & IRE_CACHE) &&
- !(ire->ire_flags & RTF_DYNAMIC))
+ if (irb == NULL || irb->irb_ire_cnt == 0)
return;
- bcopy(cp, &gw_addr, sizeof (gw_addr));
- if (ire->ire_gateway_addr == gw_addr) {
- ip1dbg(("ire_delete_cache_gw: deleted 0x%x type %d to 0x%x\n",
- (int)ntohl(ire->ire_addr), ire->ire_type,
- (int)ntohl(ire->ire_gateway_addr)));
- ire_delete(ire);
+ irb_refhold(irb);
+ for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
+ if (!IRE_IS_CONDEMNED(ire))
+ ire_increment_generation(ire); /* Ourselves */
+ ire_dep_incr_generation(ire); /* Dependants */
}
+ irb_refrele(irb);
}
/*
- * Remove all IRE_CACHE entries that match the ire specified.
+ * When an IRE is added or deleted this routine is called to make sure
+ * any caching of IRE information is notified or updated.
*
* The flag argument indicates if the flush request is due to addition
- * of new route (IRE_FLUSH_ADD) or deletion of old route (IRE_FLUSH_DELETE).
- *
- * This routine takes only the IREs from the forwarding table and flushes
- * the corresponding entries from the cache table.
- *
- * When flushing due to the deletion of an old route, it
- * just checks the cache handles (ire_phandle and ire_ihandle) and
- * deletes the ones that match.
- *
- * When flushing due to the creation of a new route, it checks
- * if a cache entry's address matches the one in the IRE and
- * that the cache entry's parent has a less specific mask than the
- * one in IRE. The destination of such a cache entry could be the
- * gateway for other cache entries, so we need to flush those as
- * well by looking for gateway addresses matching the IRE's address.
+ * of new route (IRE_FLUSH_ADD), deletion of old route (IRE_FLUSH_DELETE),
+ * or a change to ire_gateway_addr (IRE_FLUSH_GWCHANGE).
*/
void
ire_flush_cache_v4(ire_t *ire, int flag)
{
- int i;
- ire_t *cire;
- irb_t *irb;
- ip_stack_t *ipst = ire->ire_ipst;
+ irb_t *irb = ire->ire_bucket;
+ struct rt_entry *rt = IRB2RT(irb);
+ ip_stack_t *ipst = ire->ire_ipst;
- if (ire->ire_type & IRE_CACHE)
+ /*
+ * IRE_IF_CLONE ire's don't provide any new information
+ * than the parent from which they are cloned, so don't
+ * perturb the generation numbers.
+ */
+ if (ire->ire_type & IRE_IF_CLONE)
return;
/*
- * If a default is just created, there is no point
- * in going through the cache, as there will not be any
- * cached ires.
+ * Ensure that an ire_add during a lookup serializes the updates of the
+ * generation numbers under the radix head lock so that the lookup gets
+ * either the old ire and old generation number, or a new ire and new
+ * generation number.
+ */
+ RADIX_NODE_HEAD_WLOCK(ipst->ips_ip_ftable);
+
+ /*
+ * If a route was just added, we need to notify everybody that
+ * has cached an IRE_NOROUTE since there might now be a better
+ * route for them.
*/
- if (ire->ire_type == IRE_DEFAULT && flag == IRE_FLUSH_ADD)
- return;
if (flag == IRE_FLUSH_ADD) {
+ ire_increment_generation(ipst->ips_ire_reject_v4);
+ ire_increment_generation(ipst->ips_ire_blackhole_v4);
+ }
+
+ /* Adding a default can't otherwise provide a better route */
+ if (ire->ire_type == IRE_DEFAULT && flag == IRE_FLUSH_ADD) {
+ RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
+ return;
+ }
+
+ switch (flag) {
+ case IRE_FLUSH_DELETE:
+ case IRE_FLUSH_GWCHANGE:
/*
- * This selective flush is due to the addition of
- * new IRE.
+ * Update ire_generation for all ire_dep_children chains
+ * starting with this IRE
*/
- for (i = 0; i < ipst->ips_ip_cache_table_size; i++) {
- irb = &ipst->ips_ip_cache_table[i];
- if ((cire = irb->irb_ire) == NULL)
- continue;
- IRB_REFHOLD(irb);
- for (cire = irb->irb_ire; cire != NULL;
- cire = cire->ire_next) {
- if (cire->ire_type != IRE_CACHE)
- continue;
- /*
- * If 'cire' belongs to the same subnet
- * as the new ire being added, and 'cire'
- * is derived from a prefix that is less
- * specific than the new ire being added,
- * we need to flush 'cire'; for instance,
- * when a new interface comes up.
- */
- if (((cire->ire_addr & ire->ire_mask) ==
- (ire->ire_addr & ire->ire_mask)) &&
- (ip_mask_to_plen(cire->ire_cmask) <=
- ire->ire_masklen)) {
- ire_delete(cire);
- continue;
- }
- /*
- * This is the case when the ire_gateway_addr
- * of 'cire' belongs to the same subnet as
- * the new ire being added.
- * Flushing such ires is sometimes required to
- * avoid misrouting: say we have a machine with
- * two interfaces (I1 and I2), a default router
- * R on the I1 subnet, and a host route to an
- * off-link destination D with a gateway G on
- * the I2 subnet.
- * Under normal operation, we will have an
- * on-link cache entry for G and an off-link
- * cache entry for D with G as ire_gateway_addr,
- * traffic to D will reach its destination
- * through gateway G.
- * If the administrator does 'ifconfig I2 down',
- * the cache entries for D and G will be
- * flushed. However, G will now be resolved as
- * an off-link destination using R (the default
- * router) as gateway. Then D will also be
- * resolved as an off-link destination using G
- * as gateway - this behavior is due to
- * compatibility reasons, see comment in
- * ire_ihandle_lookup_offlink(). Traffic to D
- * will go to the router R and probably won't
- * reach the destination.
- * The administrator then does 'ifconfig I2 up'.
- * Since G is on the I2 subnet, this routine
- * will flush its cache entry. It must also
- * flush the cache entry for D, otherwise
- * traffic will stay misrouted until the IRE
- * times out.
- */
- if ((cire->ire_gateway_addr & ire->ire_mask) ==
- (ire->ire_addr & ire->ire_mask)) {
- ire_delete(cire);
- continue;
- }
- }
- IRB_REFRELE(irb);
- }
- } else {
+ ire_dep_incr_generation(ire);
+ break;
+ case IRE_FLUSH_ADD:
/*
- * delete the cache entries based on
- * handle in the IRE as this IRE is
- * being deleted/changed.
+ * Update the generation numbers of all shorter matching routes.
+ * ire_update_generation takes care of the dependants by
+ * using ire_dep_incr_generation.
*/
- for (i = 0; i < ipst->ips_ip_cache_table_size; i++) {
- irb = &ipst->ips_ip_cache_table[i];
- if ((cire = irb->irb_ire) == NULL)
- continue;
- IRB_REFHOLD(irb);
- for (cire = irb->irb_ire; cire != NULL;
- cire = cire->ire_next) {
- if (cire->ire_type != IRE_CACHE)
- continue;
- if ((cire->ire_phandle == 0 ||
- cire->ire_phandle != ire->ire_phandle) &&
- (cire->ire_ihandle == 0 ||
- cire->ire_ihandle != ire->ire_ihandle))
- continue;
- ire_delete(cire);
- }
- IRB_REFRELE(irb);
- }
+ (void) ipst->ips_ip_ftable->rnh_matchaddr_args(&rt->rt_dst,
+ ipst->ips_ip_ftable, ire_update_generation, NULL);
+ break;
}
+ RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
}
/*
* Matches the arguments passed with the values in the ire.
*
- * Note: for match types that match using "ipif" passed in, ipif
+ * Note: for match types that match using "ill" passed in, ill
* must be checked for non-NULL before calling this routine.
*/
boolean_t
ire_match_args(ire_t *ire, ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway,
- int type, const ipif_t *ipif, zoneid_t zoneid, uint32_t ihandle,
- const ts_label_t *tsl, int match_flags, queue_t *wq)
+ int type, const ill_t *ill, zoneid_t zoneid,
+ const ts_label_t *tsl, int match_flags)
{
ill_t *ire_ill = NULL, *dst_ill;
- ill_t *ipif_ill = NULL;
+ ip_stack_t *ipst = ire->ire_ipst;
ASSERT(ire->ire_ipversion == IPV4_VERSION);
ASSERT((ire->ire_addr & ~ire->ire_mask) == 0);
ASSERT((!(match_flags & MATCH_IRE_ILL)) ||
- (ipif != NULL && !ipif->ipif_isv6));
- ASSERT(!(match_flags & MATCH_IRE_WQ) || wq != NULL);
+ (ill != NULL && !ill->ill_isv6));
/*
- * If MATCH_IRE_MARK_TESTHIDDEN is set, then only return the IRE if it
- * is in fact hidden, to ensure the caller gets the right one. One
- * exception: if the caller passed MATCH_IRE_IHANDLE, then they
- * already know the identity of the given IRE_INTERFACE entry and
- * there's no point trying to hide it from them.
+ * If MATCH_IRE_TESTHIDDEN is set, then only return the IRE if it is
+ * in fact hidden, to ensure the caller gets the right one.
*/
- if (ire->ire_marks & IRE_MARK_TESTHIDDEN) {
- if (match_flags & MATCH_IRE_IHANDLE)
- match_flags |= MATCH_IRE_MARK_TESTHIDDEN;
-
- if (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN))
+ if (ire->ire_testhidden) {
+ if (!(match_flags & MATCH_IRE_TESTHIDDEN))
return (B_FALSE);
}
- /*
- * MATCH_IRE_MARK_PRIVATE_ADDR is set when IP_NEXTHOP option
- * is used. In that case the routing table is bypassed and the
- * packets are sent directly to the specified nexthop. The
- * IRE_CACHE entry representing this route should be marked
- * with IRE_MARK_PRIVATE_ADDR.
- */
-
- if (!(match_flags & MATCH_IRE_MARK_PRIVATE_ADDR) &&
- (ire->ire_marks & IRE_MARK_PRIVATE_ADDR))
- return (B_FALSE);
-
if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid &&
ire->ire_zoneid != ALL_ZONES) {
/*
- * If MATCH_IRE_ZONEONLY has been set and the supplied zoneid is
- * valid and does not match that of ire_zoneid, a failure to
+ * If MATCH_IRE_ZONEONLY has been set and the supplied zoneid
+ * does not match that of ire_zoneid, a failure to
* match is reported at this point. Otherwise, since some IREs
* that are available in the global zone can be used in local
* zones, additional checks need to be performed:
*
- * IRE_BROADCAST, IRE_CACHE and IRE_LOOPBACK
+ * IRE_LOOPBACK
* entries should never be matched in this situation.
+ * Each zone has its own IRE_LOOPBACK.
+ *
+ * IRE_LOCAL
+ * We allow them for any zoneid. ire_route_recursive
+ * does additional checks when
+ * ip_restrict_interzone_loopback is set.
*
- * IRE entries that have an interface associated with them
- * should in general not match unless they are an IRE_LOCAL
- * or in the case when MATCH_IRE_DEFAULT has been set in
- * the caller. In the case of the former, checking of the
- * other fields supplied should take place.
+ * If ill_usesrc_ifindex is set
+ * Then we check if the zone has a valid source address
+ * on the usesrc ill.
*
- * In the case where MATCH_IRE_DEFAULT has been set,
- * all of the ipif's associated with the IRE's ill are
- * checked to see if there is a matching zoneid. If any
- * one ipif has a matching zoneid, this IRE is a
- * potential candidate so checking of the other fields
- * takes place.
+ * If ire_ill is set, then check that the zone has an ipif
+ * on that ill.
*
- * In the case where the IRE_INTERFACE has a usable source
- * address (indicated by ill_usesrc_ifindex) in the
- * correct zone then it's permitted to return this IRE
+ * Outside of this function (in ire_round_robin) we check
+ * that any IRE_OFFLINK has a gateway that reachable from the
+ * zone when we have multiple choices (ECMP).
*/
if (match_flags & MATCH_IRE_ZONEONLY)
return (B_FALSE);
- if (ire->ire_type & (IRE_BROADCAST | IRE_CACHE | IRE_LOOPBACK))
+ if (ire->ire_type & IRE_LOOPBACK)
return (B_FALSE);
+
+ if (ire->ire_type & IRE_LOCAL)
+ goto matchit;
+
/*
- * Note, IRE_INTERFACE can have the stq as NULL. For
- * example, if the default multicast route is tied to
- * the loopback address.
+ * The normal case of IRE_ONLINK has a matching zoneid.
+ * Here we handle the case when shared-IP zones have been
+ * configured with IP addresses on vniN. In that case it
+ * is ok for traffic from a zone to use IRE_ONLINK routes
+ * if the ill has a usesrc pointing at vniN
*/
- if ((ire->ire_type & IRE_INTERFACE) &&
- (ire->ire_stq != NULL)) {
- dst_ill = (ill_t *)ire->ire_stq->q_ptr;
+ dst_ill = ire->ire_ill;
+ if (ire->ire_type & IRE_ONLINK) {
+ uint_t ifindex;
+
+ /*
+ * Note there is no IRE_INTERFACE on vniN thus
+ * can't do an IRE lookup for a matching route.
+ */
+ ifindex = dst_ill->ill_usesrc_ifindex;
+ if (ifindex == 0)
+ return (B_FALSE);
+
/*
* If there is a usable source address in the
- * zone, then it's ok to return an
- * IRE_INTERFACE
+ * zone, then it's ok to return this IRE_INTERFACE
*/
- if (ipif_usesrc_avail(dst_ill, zoneid)) {
- ip3dbg(("ire_match_args: dst_ill %p match %d\n",
- (void *)dst_ill,
- (ire->ire_addr == (addr & mask))));
- } else {
- ip3dbg(("ire_match_args: src_ipif NULL"
+ if (!ipif_zone_avail(ifindex, dst_ill->ill_isv6,
+ zoneid, ipst)) {
+ ip3dbg(("ire_match_args: no usrsrc for zone"
" dst_ill %p\n", (void *)dst_ill));
return (B_FALSE);
}
}
- if (ire->ire_ipif != NULL && ire->ire_type != IRE_LOCAL &&
- !(ire->ire_type & IRE_INTERFACE)) {
+ /*
+ * For exampe, with
+ * route add 11.0.0.0 gw1 -ifp bge0
+ * route add 11.0.0.0 gw2 -ifp bge1
+ * this code would differentiate based on
+ * where the sending zone has addresses.
+ * Only if the zone has an address on bge0 can it use the first
+ * route. It isn't clear if this behavior is documented
+ * anywhere.
+ */
+ if (dst_ill != NULL && (ire->ire_type & IRE_OFFLINK)) {
ipif_t *tipif;
- if ((match_flags & MATCH_IRE_DEFAULT) == 0) {
- return (B_FALSE);
- }
- mutex_enter(&ire->ire_ipif->ipif_ill->ill_lock);
- for (tipif = ire->ire_ipif->ipif_ill->ill_ipif;
+ mutex_enter(&dst_ill->ill_lock);
+ for (tipif = dst_ill->ill_ipif;
tipif != NULL; tipif = tipif->ipif_next) {
- if (IPIF_CAN_LOOKUP(tipif) &&
+ if (!IPIF_IS_CONDEMNED(tipif) &&
(tipif->ipif_flags & IPIF_UP) &&
(tipif->ipif_zoneid == zoneid ||
tipif->ipif_zoneid == ALL_ZONES))
break;
}
- mutex_exit(&ire->ire_ipif->ipif_ill->ill_lock);
+ mutex_exit(&dst_ill->ill_lock);
if (tipif == NULL) {
return (B_FALSE);
}
}
}
- /*
- * For IRE_CACHE entries, MATCH_IRE_ILL means that somebody wants to
- * send out ire_stq (ire_ipif for IRE_CACHE entries is just the means
- * of getting a source address -- i.e., ire_src_addr ==
- * ire->ire_ipif->ipif_src_addr). ire_to_ill() handles this.
- *
- * NOTE: For IPMP, MATCH_IRE_ILL usually matches any ill in the group.
- * However, if MATCH_IRE_MARK_TESTHIDDEN is set (i.e., the IRE is for
- * IPMP test traffic), then the ill must match exactly.
- */
+matchit:
if (match_flags & MATCH_IRE_ILL) {
- ire_ill = ire_to_ill(ire);
- ipif_ill = ipif->ipif_ill;
+ ire_ill = ire->ire_ill;
+
+ /*
+ * If asked to match an ill, we *must* match
+ * on the ire_ill for ipmp test addresses, or
+ * any of the ill in the group for data addresses.
+ * If we don't, we may as well fail.
+ * However, we need an exception for IRE_LOCALs to ensure
+ * we loopback packets even sent to test addresses on different
+ * interfaces in the group.
+ */
+ if ((match_flags & MATCH_IRE_TESTHIDDEN) &&
+ !(ire->ire_type & IRE_LOCAL)) {
+ if (ire->ire_ill != ill)
+ return (B_FALSE);
+ } else {
+ match_flags &= ~MATCH_IRE_TESTHIDDEN;
+ /*
+ * We know that ill is not NULL, but ire_ill could be
+ * NULL
+ */
+ if (ire_ill == NULL || !IS_ON_SAME_LAN(ill, ire_ill))
+ return (B_FALSE);
+ }
}
if ((ire->ire_addr == (addr & mask)) &&
((!(match_flags & MATCH_IRE_GW)) ||
(ire->ire_gateway_addr == gateway)) &&
- ((!(match_flags & MATCH_IRE_TYPE)) ||
- (ire->ire_type & type)) &&
- ((!(match_flags & MATCH_IRE_SRC)) ||
- (ire->ire_src_addr == ipif->ipif_src_addr)) &&
- ((!(match_flags & MATCH_IRE_IPIF)) ||
- (ire->ire_ipif == ipif)) &&
- ((!(match_flags & MATCH_IRE_MARK_TESTHIDDEN)) ||
- (ire->ire_marks & IRE_MARK_TESTHIDDEN)) &&
- ((!(match_flags & MATCH_IRE_MARK_PRIVATE_ADDR)) ||
- (ire->ire_type != IRE_CACHE ||
- ire->ire_marks & IRE_MARK_PRIVATE_ADDR)) &&
- ((!(match_flags & MATCH_IRE_WQ)) ||
- (ire->ire_stq == wq)) &&
- ((!(match_flags & MATCH_IRE_ILL)) ||
- (ire_ill == ipif_ill ||
- (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN) &&
- ire_ill != NULL && IS_IN_SAME_ILLGRP(ipif_ill, ire_ill)))) &&
- ((!(match_flags & MATCH_IRE_IHANDLE)) ||
- (ire->ire_ihandle == ihandle)) &&
- ((!(match_flags & MATCH_IRE_MASK)) ||
- (ire->ire_mask == mask)) &&
+ ((!(match_flags & MATCH_IRE_TYPE)) || (ire->ire_type & type)) &&
+ ((!(match_flags & MATCH_IRE_TESTHIDDEN)) || ire->ire_testhidden) &&
+ ((!(match_flags & MATCH_IRE_MASK)) || (ire->ire_mask == mask)) &&
((!(match_flags & MATCH_IRE_SECATTR)) ||
(!is_system_labeled()) ||
(tsol_ire_match_gwattr(ire, tsl) == 0))) {
@@ -3963,494 +1991,207 @@ ire_match_args(ire_t *ire, ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway,
}
/*
- * Lookup for a route in all the tables
+ * Check if the IRE_LOCAL uses the same ill as another route would use.
+ * If there is no alternate route, or the alternate is a REJECT or BLACKHOLE,
+ * then we don't allow this IRE_LOCAL to be used.
+ * We always return an IRE; will be RTF_REJECT if no route available.
*/
ire_t *
-ire_route_lookup(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway,
- int type, const ipif_t *ipif, ire_t **pire, zoneid_t zoneid,
- const ts_label_t *tsl, int flags, ip_stack_t *ipst)
+ire_alt_local(ire_t *ire, zoneid_t zoneid, const ts_label_t *tsl,
+ const ill_t *ill, uint_t *generationp)
{
- ire_t *ire = NULL;
+ ip_stack_t *ipst = ire->ire_ipst;
+ ire_t *alt_ire;
+ uint_t ire_type;
+ uint_t generation;
+ uint_t match_flags;
- /*
- * ire_match_args() will dereference ipif MATCH_IRE_SRC or
- * MATCH_IRE_ILL is set.
- */
- if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && (ipif == NULL))
- return (NULL);
+ ASSERT(ire->ire_type & IRE_LOCAL);
+ ASSERT(ire->ire_ill != NULL);
/*
- * might be asking for a cache lookup,
- * This is not best way to lookup cache,
- * user should call ire_cache_lookup directly.
- *
- * If MATCH_IRE_TYPE was set, first lookup in the cache table and then
- * in the forwarding table, if the applicable type flags were set.
+ * Need to match on everything but local.
+ * This might result in the creation of a IRE_IF_CLONE for the
+ * same address as the IRE_LOCAL when restrict_interzone_loopback is
+ * set. ire_add_*() ensures that the IRE_IF_CLONE are tail inserted
+ * to make sure the IRE_LOCAL is always found first.
*/
- if ((flags & MATCH_IRE_TYPE) == 0 || (type & IRE_CACHETABLE) != 0) {
- ire = ire_ctable_lookup(addr, gateway, type, ipif, zoneid,
- tsl, flags, ipst);
- if (ire != NULL)
- return (ire);
+ ire_type = (IRE_ONLINK | IRE_OFFLINK) & ~(IRE_LOCAL|IRE_LOOPBACK);
+ match_flags = MATCH_IRE_TYPE | MATCH_IRE_SECATTR;
+ if (ill != NULL)
+ match_flags |= MATCH_IRE_ILL;
+
+ if (ire->ire_ipversion == IPV4_VERSION) {
+ alt_ire = ire_route_recursive_v4(ire->ire_addr, ire_type,
+ ill, zoneid, tsl, match_flags, B_TRUE, 0, ipst, NULL, NULL,
+ &generation);
+ } else {
+ alt_ire = ire_route_recursive_v6(&ire->ire_addr_v6, ire_type,
+ ill, zoneid, tsl, match_flags, B_TRUE, 0, ipst, NULL, NULL,
+ &generation);
}
- if ((flags & MATCH_IRE_TYPE) == 0 || (type & IRE_FORWARDTABLE) != 0) {
- ire = ire_ftable_lookup(addr, mask, gateway, type, ipif, pire,
- zoneid, 0, tsl, flags, ipst);
+ ASSERT(alt_ire != NULL);
+
+ if (alt_ire->ire_ill == ire->ire_ill) {
+ /* Going out the same ILL - ok to send to IRE_LOCAL */
+ ire_refrele(alt_ire);
+ } else {
+ /* Different ill - ignore IRE_LOCAL */
+ ire_refrele(ire);
+ ire = alt_ire;
+ if (generationp != NULL)
+ *generationp = generation;
}
return (ire);
}
-/*
- * Delete the IRE cache for the gateway and all IRE caches whose
- * ire_gateway_addr points to this gateway, and allow them to
- * be created on demand by ip_newroute.
- */
-void
-ire_clookup_delete_cache_gw(ipaddr_t addr, zoneid_t zoneid, ip_stack_t *ipst)
+boolean_t
+ire_find_zoneid(struct radix_node *rn, void *arg)
{
+ struct rt_entry *rt = (struct rt_entry *)rn;
irb_t *irb;
ire_t *ire;
+ ire_ftable_args_t *margs = arg;
- irb = &ipst->ips_ip_cache_table[IRE_ADDR_HASH(addr,
- ipst->ips_ip_cache_table_size)];
- IRB_REFHOLD(irb);
- for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
- if (ire->ire_marks & IRE_MARK_CONDEMNED)
- continue;
-
- ASSERT(ire->ire_mask == IP_HOST_MASK);
- if (ire_match_args(ire, addr, ire->ire_mask, 0, IRE_CACHE,
- NULL, zoneid, 0, NULL, MATCH_IRE_TYPE, NULL)) {
- ire_delete(ire);
- }
- }
- IRB_REFRELE(irb);
+ ASSERT(rt != NULL);
- ire_walk_v4(ire_delete_cache_gw, &addr, zoneid, ipst);
-}
+ irb = &rt->rt_irb;
-/*
- * Looks up cache table for a route.
- * specific lookup can be indicated by
- * passing the MATCH_* flags and the
- * necessary parameters.
- */
-ire_t *
-ire_ctable_lookup(ipaddr_t addr, ipaddr_t gateway, int type, const ipif_t *ipif,
- zoneid_t zoneid, const ts_label_t *tsl, int flags, ip_stack_t *ipst)
-{
- ire_ctable_args_t margs;
-
- margs.ict_addr = &addr;
- margs.ict_gateway = &gateway;
- margs.ict_type = type;
- margs.ict_ipif = ipif;
- margs.ict_zoneid = zoneid;
- margs.ict_tsl = tsl;
- margs.ict_flags = flags;
- margs.ict_ipst = ipst;
- margs.ict_wq = NULL;
-
- return (ip4_ctable_lookup_impl(&margs));
-}
+ if (irb->irb_ire_cnt == 0)
+ return (B_FALSE);
-/*
- * Check whether the IRE_LOCAL and the IRE potentially used to transmit
- * (could be an IRE_CACHE, IRE_BROADCAST, or IRE_INTERFACE) are identical
- * or part of the same illgrp. (In the IPMP case, usually the two IREs
- * will both belong to the IPMP ill, but exceptions are possible -- e.g.
- * if IPMP test addresses are on their own subnet.)
- */
-boolean_t
-ire_local_same_lan(ire_t *ire_local, ire_t *xmit_ire)
-{
- ill_t *recv_ill, *xmit_ill;
+ rw_enter(&irb->irb_lock, RW_READER);
+ for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
+ if (IRE_IS_CONDEMNED(ire))
+ continue;
- ASSERT(ire_local->ire_type & (IRE_LOCAL|IRE_LOOPBACK));
- ASSERT(xmit_ire->ire_type & (IRE_CACHETABLE|IRE_INTERFACE));
+ if (ire->ire_zoneid != ALL_ZONES &&
+ ire->ire_zoneid != margs->ift_zoneid)
+ continue;
- recv_ill = ire_to_ill(ire_local);
- xmit_ill = ire_to_ill(xmit_ire);
+ if (margs->ift_ill != NULL && margs->ift_ill != ire->ire_ill)
+ continue;
- ASSERT(recv_ill != NULL);
- ASSERT(xmit_ill != NULL);
+ if (is_system_labeled() &&
+ tsol_ire_match_gwattr(ire, margs->ift_tsl) != 0)
+ continue;
- return (IS_ON_SAME_LAN(recv_ill, xmit_ill));
+ rw_exit(&irb->irb_lock);
+ return (B_TRUE);
+ }
+ rw_exit(&irb->irb_lock);
+ return (B_FALSE);
}
/*
- * Check if the IRE_LOCAL uses the same ill as another route would use.
- * If there is no alternate route, or the alternate is a REJECT or BLACKHOLE,
- * then we don't allow this IRE_LOCAL to be used.
+ * Check if the zoneid (not ALL_ZONES) has an IRE_INTERFACE for the specified
+ * gateway address. If ill is non-NULL we also match on it.
+ * The caller must hold a read lock on RADIX_NODE_HEAD if lock_held is set.
*/
boolean_t
-ire_local_ok_across_zones(ire_t *ire_local, zoneid_t zoneid, void *addr,
- const ts_label_t *tsl, ip_stack_t *ipst)
+ire_gateway_ok_zone_v4(ipaddr_t gateway, zoneid_t zoneid, ill_t *ill,
+ const ts_label_t *tsl, ip_stack_t *ipst, boolean_t lock_held)
{
- ire_t *alt_ire;
- boolean_t rval;
- int flags;
+ struct rt_sockaddr rdst;
+ struct rt_entry *rt;
+ ire_ftable_args_t margs;
- flags = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE;
+ ASSERT(ill == NULL || !ill->ill_isv6);
+ if (lock_held)
+ ASSERT(RW_READ_HELD(&ipst->ips_ip_ftable->rnh_lock));
+ else
+ RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
- if (ire_local->ire_ipversion == IPV4_VERSION) {
- alt_ire = ire_ftable_lookup(*((ipaddr_t *)addr), 0, 0, 0, NULL,
- NULL, zoneid, 0, tsl, flags, ipst);
- } else {
- alt_ire = ire_ftable_lookup_v6(addr, NULL, NULL, 0, NULL,
- NULL, zoneid, 0, tsl, flags, ipst);
- }
+ rdst.rt_sin_len = sizeof (rdst);
+ rdst.rt_sin_family = AF_INET;
+ rdst.rt_sin_addr.s_addr = gateway;
- if (alt_ire == NULL)
- return (B_FALSE);
+ /*
+ * We only use margs for ill, zoneid, and tsl matching in
+ * ire_find_zoneid
+ */
+ (void) memset(&margs, 0, sizeof (margs));
+ margs.ift_ill = ill;
+ margs.ift_zoneid = zoneid;
+ margs.ift_tsl = tsl;
+ rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst,
+ ipst->ips_ip_ftable, ire_find_zoneid, (void *)&margs);
- if (alt_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
- ire_refrele(alt_ire);
- return (B_FALSE);
- }
- rval = ire_local_same_lan(ire_local, alt_ire);
+ if (!lock_held)
+ RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
- ire_refrele(alt_ire);
- return (rval);
+ return (rt != NULL);
}
/*
- * Lookup cache
- *
- * In general the zoneid has to match (where ALL_ZONES match all of them).
- * But for IRE_LOCAL we also need to handle the case where L2 should
- * conceptually loop back the packet. This is necessary since neither
- * Ethernet drivers nor Ethernet hardware loops back packets sent to their
- * own MAC address. This loopback is needed when the normal
- * routes (ignoring IREs with different zoneids) would send out the packet on
- * the same ill as the ill with which this IRE_LOCAL is associated.
- *
- * Earlier versions of this code always matched an IRE_LOCAL independently of
- * the zoneid. We preserve that earlier behavior when
- * ip_restrict_interzone_loopback is turned off.
+ * ire_walk routine to delete a fraction of redirect IREs and IRE_CLONE_IF IREs.
+ * The fraction argument tells us what fraction of the IREs to delete.
+ * Common for IPv4 and IPv6.
+ * Used when memory backpressure.
*/
-ire_t *
-ire_cache_lookup(ipaddr_t addr, zoneid_t zoneid, const ts_label_t *tsl,
- ip_stack_t *ipst)
+static void
+ire_delete_reclaim(ire_t *ire, char *arg)
{
- irb_t *irb_ptr;
- ire_t *ire;
-
- irb_ptr = &ipst->ips_ip_cache_table[IRE_ADDR_HASH(addr,
- ipst->ips_ip_cache_table_size)];
- rw_enter(&irb_ptr->irb_lock, RW_READER);
- for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) {
- if (ire->ire_marks & (IRE_MARK_CONDEMNED |
- IRE_MARK_TESTHIDDEN | IRE_MARK_PRIVATE_ADDR)) {
- continue;
- }
- if (ire->ire_addr == addr) {
- /*
- * Finally, check if the security policy has any
- * restriction on using this route for the specified
- * message.
- */
- if (tsl != NULL &&
- ire->ire_gw_secattr != NULL &&
- tsol_ire_match_gwattr(ire, tsl) != 0) {
- continue;
- }
+ ip_stack_t *ipst = ire->ire_ipst;
+ uint_t fraction = *(uint_t *)arg;
+ uint_t rand;
- if (zoneid == ALL_ZONES || ire->ire_zoneid == zoneid ||
- ire->ire_zoneid == ALL_ZONES) {
- IRE_REFHOLD(ire);
- rw_exit(&irb_ptr->irb_lock);
- return (ire);
- }
+ if ((ire->ire_flags & RTF_DYNAMIC) ||
+ (ire->ire_type & IRE_IF_CLONE)) {
- if (ire->ire_type == IRE_LOCAL) {
- if (ipst->ips_ip_restrict_interzone_loopback &&
- !ire_local_ok_across_zones(ire, zoneid,
- &addr, tsl, ipst))
- continue;
+ /* Pick a random number */
+ rand = (uint_t)lbolt +
+ IRE_ADDR_HASH_V6(ire->ire_addr_v6, 256);
- IRE_REFHOLD(ire);
- rw_exit(&irb_ptr->irb_lock);
- return (ire);
- }
+ /* Use truncation */
+ if ((rand/fraction)*fraction == rand) {
+ IP_STAT(ipst, ip_ire_reclaim_deleted);
+ ire_delete(ire);
}
}
- rw_exit(&irb_ptr->irb_lock);
- return (NULL);
-}
-ire_t *
-ire_cache_lookup_simple(ipaddr_t dst, ip_stack_t *ipst)
-{
- irb_t *irb_ptr;
- ire_t *ire;
-
- /*
- * Look for an ire in the cachetable whose
- * ire_addr matches the destination.
- * Since we are being called by forwarding fastpath
- * no need to check for Trusted Solaris label.
- */
- irb_ptr = &ipst->ips_ip_cache_table[IRE_ADDR_HASH(
- dst, ipst->ips_ip_cache_table_size)];
- rw_enter(&irb_ptr->irb_lock, RW_READER);
- for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) {
- if (ire->ire_marks & (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN |
- IRE_MARK_PRIVATE_ADDR)) {
- continue;
- }
- if (ire->ire_addr == dst) {
- IRE_REFHOLD(ire);
- rw_exit(&irb_ptr->irb_lock);
- return (ire);
- }
- }
- rw_exit(&irb_ptr->irb_lock);
- return (NULL);
}
/*
- * Locate the interface ire that is tied to the cache ire 'cire' via
- * cire->ire_ihandle.
+ * kmem_cache callback to free up memory.
*
- * We are trying to create the cache ire for an offlink destn based
- * on the cache ire of the gateway in 'cire'. 'pire' is the prefix ire
- * as found by ip_newroute(). We are called from ip_newroute() in
- * the IRE_CACHE case.
+ * Free a fraction (ips_ip_ire_reclaim_fraction) of things IP added dynamically
+ * (RTF_DYNAMIC and IRE_IF_CLONE).
*/
-ire_t *
-ire_ihandle_lookup_offlink(ire_t *cire, ire_t *pire)
+static void
+ip_ire_reclaim_stack(ip_stack_t *ipst)
{
- ire_t *ire;
- int match_flags;
- ipaddr_t gw_addr;
- ipif_t *gw_ipif;
- ip_stack_t *ipst = cire->ire_ipst;
-
- ASSERT(cire != NULL && pire != NULL);
-
- /*
- * We don't need to specify the zoneid to ire_ftable_lookup() below
- * because the ihandle refers to an ipif which can be in only one zone.
- */
- match_flags = MATCH_IRE_TYPE | MATCH_IRE_IHANDLE | MATCH_IRE_MASK;
- if (pire->ire_ipif != NULL)
- match_flags |= MATCH_IRE_ILL;
- /*
- * We know that the mask of the interface ire equals cire->ire_cmask.
- * (When ip_newroute() created 'cire' for the gateway it set its
- * cmask from the interface ire's mask)
- */
- ire = ire_ftable_lookup(cire->ire_addr, cire->ire_cmask, 0,
- IRE_INTERFACE, pire->ire_ipif, NULL, ALL_ZONES, cire->ire_ihandle,
- NULL, match_flags, ipst);
- if (ire != NULL)
- return (ire);
- /*
- * If we didn't find an interface ire above, we can't declare failure.
- * For backwards compatibility, we need to support prefix routes
- * pointing to next hop gateways that are not on-link.
- *
- * Assume we are trying to ping some offlink destn, and we have the
- * routing table below.
- *
- * Eg. default - gw1 <--- pire (line 1)
- * gw1 - gw2 (line 2)
- * gw2 - hme0 (line 3)
- *
- * If we already have a cache ire for gw1 in 'cire', the
- * ire_ftable_lookup above would have failed, since there is no
- * interface ire to reach gw1. We will fallthru below.
- *
- * Here we duplicate the steps that ire_ftable_lookup() did in
- * getting 'cire' from 'pire', in the MATCH_IRE_RECURSIVE case.
- * The differences are the following
- * i. We want the interface ire only, so we call ire_ftable_lookup()
- * instead of ire_route_lookup()
- * ii. We look for only prefix routes in the 1st call below.
- * ii. We want to match on the ihandle in the 2nd call below.
- */
- match_flags = MATCH_IRE_TYPE;
- if (pire->ire_ipif != NULL)
- match_flags |= MATCH_IRE_ILL;
- ire = ire_ftable_lookup(pire->ire_gateway_addr, 0, 0, IRE_OFFSUBNET,
- pire->ire_ipif, NULL, ALL_ZONES, 0, NULL, match_flags, ipst);
- if (ire == NULL)
- return (NULL);
- /*
- * At this point 'ire' corresponds to the entry shown in line 2.
- * gw_addr is 'gw2' in the example above.
- */
- gw_addr = ire->ire_gateway_addr;
- gw_ipif = ire->ire_ipif;
- ire_refrele(ire);
+ uint_t fraction = ipst->ips_ip_ire_reclaim_fraction;
- match_flags |= MATCH_IRE_IHANDLE;
- ire = ire_ftable_lookup(gw_addr, 0, 0, IRE_INTERFACE,
- gw_ipif, NULL, ALL_ZONES, cire->ire_ihandle, NULL, match_flags,
- ipst);
- return (ire);
-}
+ IP_STAT(ipst, ip_ire_reclaim_calls);
-/*
- * Return the IRE_LOOPBACK, IRE_IF_RESOLVER or IRE_IF_NORESOLVER
- * ire associated with the specified ipif.
- *
- * This might occasionally be called when IPIF_UP is not set since
- * the IP_MULTICAST_IF as well as creating interface routes
- * allows specifying a down ipif (ipif_lookup* match ipifs that are down).
- *
- * Note that if IPIF_NOLOCAL, IPIF_NOXMIT, or IPIF_DEPRECATED is set on
- * the ipif, this routine might return NULL.
- */
-ire_t *
-ipif_to_ire(const ipif_t *ipif)
-{
- ire_t *ire;
- ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
- uint_t match_flags = MATCH_IRE_TYPE | MATCH_IRE_IPIF | MATCH_IRE_MASK;
+ ire_walk(ire_delete_reclaim, &fraction, ipst);
/*
- * IRE_INTERFACE entries for ills under IPMP are IRE_MARK_TESTHIDDEN
- * so that they aren't accidentally returned. However, if the
- * caller's ipif is on an ill under IPMP, there's no need to hide 'em.
+ * Walk all CONNs that can have a reference on an ire, nce or dce.
+ * Get them to update any stale references to drop any refholds they
+ * have.
*/
- if (IS_UNDER_IPMP(ipif->ipif_ill))
- match_flags |= MATCH_IRE_MARK_TESTHIDDEN;
-
- ASSERT(!ipif->ipif_isv6);
- if (ipif->ipif_ire_type == IRE_LOOPBACK) {
- ire = ire_ctable_lookup(ipif->ipif_lcl_addr, 0, IRE_LOOPBACK,
- ipif, ALL_ZONES, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF),
- ipst);
- } else if (ipif->ipif_flags & IPIF_POINTOPOINT) {
- /* In this case we need to lookup destination address. */
- ire = ire_ftable_lookup(ipif->ipif_pp_dst_addr, IP_HOST_MASK, 0,
- IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, NULL, match_flags,
- ipst);
- } else {
- ire = ire_ftable_lookup(ipif->ipif_subnet,
- ipif->ipif_net_mask, 0, IRE_INTERFACE, ipif, NULL,
- ALL_ZONES, 0, NULL, match_flags, ipst);
- }
- return (ire);
+ ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
}
/*
- * ire_walk function.
- * Count the number of IRE_CACHE entries in different categories.
- */
-void
-ire_cache_count(ire_t *ire, char *arg)
-{
- ire_cache_count_t *icc = (ire_cache_count_t *)arg;
-
- if (ire->ire_type != IRE_CACHE)
- return;
-
- icc->icc_total++;
-
- if (ire->ire_ipversion == IPV6_VERSION) {
- mutex_enter(&ire->ire_lock);
- if (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6)) {
- mutex_exit(&ire->ire_lock);
- icc->icc_onlink++;
- return;
- }
- mutex_exit(&ire->ire_lock);
- } else {
- if (ire->ire_gateway_addr == 0) {
- icc->icc_onlink++;
- return;
- }
- }
-
- ASSERT(ire->ire_ipif != NULL);
- if (ire->ire_max_frag < ire->ire_ipif->ipif_mtu)
- icc->icc_pmtu++;
- else if (ire->ire_tire_mark != ire->ire_ob_pkt_count +
- ire->ire_ib_pkt_count)
- icc->icc_offlink++;
- else
- icc->icc_unused++;
-}
-
-/*
- * ire_walk function called by ip_trash_ire_reclaim().
- * Free a fraction of the IRE_CACHE cache entries. The fractions are
- * different for different categories of IRE_CACHE entries.
- * A fraction of zero means to not free any in that category.
- * Use the hash bucket id plus lbolt as a random number. Thus if the fraction
- * is N then every Nth hash bucket chain will be freed.
+ * Called by the memory allocator subsystem directly, when the system
+ * is running low on memory.
*/
+/* ARGSUSED */
void
-ire_cache_reclaim(ire_t *ire, char *arg)
+ip_ire_reclaim(void *args)
{
- ire_cache_reclaim_t *icr = (ire_cache_reclaim_t *)arg;
- uint_t rand;
- ip_stack_t *ipst = icr->icr_ipst;
-
- if (ire->ire_type != IRE_CACHE)
- return;
+ netstack_handle_t nh;
+ netstack_t *ns;
- if (ire->ire_ipversion == IPV6_VERSION) {
- rand = (uint_t)lbolt +
- IRE_ADDR_HASH_V6(ire->ire_addr_v6,
- ipst->ips_ip6_cache_table_size);
- mutex_enter(&ire->ire_lock);
- if (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6)) {
- mutex_exit(&ire->ire_lock);
- if (icr->icr_onlink != 0 &&
- (rand/icr->icr_onlink)*icr->icr_onlink == rand) {
- ire_delete(ire);
- return;
- }
- goto done;
- }
- mutex_exit(&ire->ire_lock);
- } else {
- rand = (uint_t)lbolt +
- IRE_ADDR_HASH(ire->ire_addr, ipst->ips_ip_cache_table_size);
- if (ire->ire_gateway_addr == 0) {
- if (icr->icr_onlink != 0 &&
- (rand/icr->icr_onlink)*icr->icr_onlink == rand) {
- ire_delete(ire);
- return;
- }
- goto done;
- }
- }
- /* Not onlink IRE */
- ASSERT(ire->ire_ipif != NULL);
- if (ire->ire_max_frag < ire->ire_ipif->ipif_mtu) {
- /* Use ptmu fraction */
- if (icr->icr_pmtu != 0 &&
- (rand/icr->icr_pmtu)*icr->icr_pmtu == rand) {
- ire_delete(ire);
- return;
- }
- } else if (ire->ire_tire_mark != ire->ire_ob_pkt_count +
- ire->ire_ib_pkt_count) {
- /* Use offlink fraction */
- if (icr->icr_offlink != 0 &&
- (rand/icr->icr_offlink)*icr->icr_offlink == rand) {
- ire_delete(ire);
- return;
- }
- } else {
- /* Use unused fraction */
- if (icr->icr_unused != 0 &&
- (rand/icr->icr_unused)*icr->icr_unused == rand) {
- ire_delete(ire);
- return;
- }
+ netstack_next_init(&nh);
+ while ((ns = netstack_next(&nh)) != NULL) {
+ ip_ire_reclaim_stack(ns->netstack_ip);
+ netstack_rele(ns);
}
-done:
- /*
- * Update tire_mark so that those that haven't been used since this
- * reclaim will be considered unused next time we reclaim.
- */
- ire->ire_tire_mark = ire->ire_ob_pkt_count + ire->ire_ib_pkt_count;
+ netstack_next_fini(&nh);
}
static void
@@ -4470,14 +2211,21 @@ void
ip_ire_g_init()
{
/*
- * Create ire caches, ire_reclaim()
- * will give IRE_CACHE back to system when needed.
+ * Create kmem_caches. ip_ire_reclaim() and ip_nce_reclaim()
+ * will give disposable IREs back to system when needed.
* This needs to be done here before anything else, since
* ire_add() expects the cache to be created.
*/
ire_cache = kmem_cache_create("ire_cache",
- sizeof (ire_t), 0, ip_ire_constructor,
- ip_ire_destructor, ip_trash_ire_reclaim, NULL, NULL, 0);
+ sizeof (ire_t), 0, NULL, NULL,
+ ip_ire_reclaim, NULL, NULL, 0);
+
+ ncec_cache = kmem_cache_create("ncec_cache",
+ sizeof (ncec_t), 0, NULL, NULL,
+ ip_nce_reclaim, NULL, NULL, 0);
+ nce_cache = kmem_cache_create("nce_cache",
+ sizeof (nce_t), 0, NULL, NULL,
+ NULL, NULL, NULL, 0);
rt_entry_cache = kmem_cache_create("rt_entry",
sizeof (struct rt_entry), 0, NULL, NULL, NULL, NULL, NULL, 0);
@@ -4491,104 +2239,65 @@ ip_ire_g_init()
void
ip_ire_init(ip_stack_t *ipst)
{
- int i;
- uint32_t mem_cnt;
- uint32_t cpu_cnt;
- uint32_t min_cnt;
- pgcnt_t mem_avail;
-
- /*
- * ip_ire_max_bucket_cnt is sized below based on the memory
- * size and the cpu speed of the machine. This is upper
- * bounded by the compile time value of ip_ire_max_bucket_cnt
- * and is lower bounded by the compile time value of
- * ip_ire_min_bucket_cnt. Similar logic applies to
- * ip6_ire_max_bucket_cnt.
- *
- * We calculate this for each IP Instances in order to use
- * the kmem_avail and ip_ire_{min,max}_bucket_cnt that are
- * in effect when the zone is booted.
- */
- mem_avail = kmem_avail();
- mem_cnt = (mem_avail >> ip_ire_mem_ratio) /
- ip_cache_table_size / sizeof (ire_t);
- cpu_cnt = CPU->cpu_type_info.pi_clock >> ip_ire_cpu_ratio;
-
- min_cnt = MIN(cpu_cnt, mem_cnt);
- if (min_cnt < ip_ire_min_bucket_cnt)
- min_cnt = ip_ire_min_bucket_cnt;
- if (ip_ire_max_bucket_cnt > min_cnt) {
- ip_ire_max_bucket_cnt = min_cnt;
- }
-
- mem_cnt = (mem_avail >> ip_ire_mem_ratio) /
- ip6_cache_table_size / sizeof (ire_t);
- min_cnt = MIN(cpu_cnt, mem_cnt);
- if (min_cnt < ip6_ire_min_bucket_cnt)
- min_cnt = ip6_ire_min_bucket_cnt;
- if (ip6_ire_max_bucket_cnt > min_cnt) {
- ip6_ire_max_bucket_cnt = min_cnt;
- }
+ ire_t *ire;
+ int error;
mutex_init(&ipst->ips_ire_ft_init_lock, NULL, MUTEX_DEFAULT, 0);
- mutex_init(&ipst->ips_ire_handle_lock, NULL, MUTEX_DEFAULT, NULL);
(void) rn_inithead((void **)&ipst->ips_ip_ftable, 32);
- /* Calculate the IPv4 cache table size. */
- ipst->ips_ip_cache_table_size = MAX(ip_cache_table_size,
- ((mem_avail >> ip_ire_mem_ratio) / sizeof (ire_t) /
- ip_ire_max_bucket_cnt));
- if (ipst->ips_ip_cache_table_size > ip_max_cache_table_size)
- ipst->ips_ip_cache_table_size = ip_max_cache_table_size;
/*
- * Make sure that the table size is always a power of 2. The
- * hash macro IRE_ADDR_HASH() depends on that.
+ * Make sure that the forwarding table size is a power of 2.
+ * The IRE*_ADDR_HASH() macros depend on that.
*/
- power2_roundup(&ipst->ips_ip_cache_table_size);
-
- ipst->ips_ip_cache_table = kmem_zalloc(ipst->ips_ip_cache_table_size *
- sizeof (irb_t), KM_SLEEP);
-
- for (i = 0; i < ipst->ips_ip_cache_table_size; i++) {
- rw_init(&ipst->ips_ip_cache_table[i].irb_lock, NULL,
- RW_DEFAULT, NULL);
- }
+ ipst->ips_ip6_ftable_hash_size = ip6_ftable_hash_size;
+ power2_roundup(&ipst->ips_ip6_ftable_hash_size);
- /* Calculate the IPv6 cache table size. */
- ipst->ips_ip6_cache_table_size = MAX(ip6_cache_table_size,
- ((mem_avail >> ip_ire_mem_ratio) / sizeof (ire_t) /
- ip6_ire_max_bucket_cnt));
- if (ipst->ips_ip6_cache_table_size > ip6_max_cache_table_size)
- ipst->ips_ip6_cache_table_size = ip6_max_cache_table_size;
/*
- * Make sure that the table size is always a power of 2. The
- * hash macro IRE_ADDR_HASH_V6() depends on that.
+ * Allocate/initialize a pair of IRE_NOROUTEs for each of IPv4 and IPv6.
+ * The ire_reject_v* has RTF_REJECT set, and the ire_blackhole_v* has
+ * RTF_BLACKHOLE set. We use the latter for transient errors such
+ * as memory allocation failures and tripping on IRE_IS_CONDEMNED
+ * entries.
*/
- power2_roundup(&ipst->ips_ip6_cache_table_size);
+ ire = kmem_cache_alloc(ire_cache, KM_SLEEP);
+ *ire = ire_null;
+ error = ire_init_v4(ire, 0, 0, 0, IRE_NOROUTE, NULL, ALL_ZONES,
+ RTF_REJECT|RTF_UP, NULL, ipst);
+ ASSERT(error == 0);
+ ipst->ips_ire_reject_v4 = ire;
- ipst->ips_ip_cache_table_v6 = kmem_zalloc(
- ipst->ips_ip6_cache_table_size * sizeof (irb_t), KM_SLEEP);
+ ire = kmem_cache_alloc(ire_cache, KM_SLEEP);
+ *ire = ire_null;
+ error = ire_init_v6(ire, 0, 0, 0, IRE_NOROUTE, NULL, ALL_ZONES,
+ RTF_REJECT|RTF_UP, NULL, ipst);
+ ASSERT(error == 0);
+ ipst->ips_ire_reject_v6 = ire;
- for (i = 0; i < ipst->ips_ip6_cache_table_size; i++) {
- rw_init(&ipst->ips_ip_cache_table_v6[i].irb_lock, NULL,
- RW_DEFAULT, NULL);
- }
+ ire = kmem_cache_alloc(ire_cache, KM_SLEEP);
+ *ire = ire_null;
+ error = ire_init_v4(ire, 0, 0, 0, IRE_NOROUTE, NULL, ALL_ZONES,
+ RTF_BLACKHOLE|RTF_UP, NULL, ipst);
+ ASSERT(error == 0);
+ ipst->ips_ire_blackhole_v4 = ire;
- /*
- * Make sure that the forwarding table size is a power of 2.
- * The IRE*_ADDR_HASH() macroes depend on that.
- */
- ipst->ips_ip6_ftable_hash_size = ip6_ftable_hash_size;
- power2_roundup(&ipst->ips_ip6_ftable_hash_size);
+ ire = kmem_cache_alloc(ire_cache, KM_SLEEP);
+ *ire = ire_null;
+ error = ire_init_v6(ire, 0, 0, 0, IRE_NOROUTE, NULL, ALL_ZONES,
+ RTF_BLACKHOLE|RTF_UP, NULL, ipst);
+ ASSERT(error == 0);
+ ipst->ips_ire_blackhole_v6 = ire;
- ipst->ips_ire_handle = 1;
+ rw_init(&ipst->ips_ip6_ire_head_lock, NULL, RW_DEFAULT, NULL);
+ rw_init(&ipst->ips_ire_dep_lock, NULL, RW_DEFAULT, NULL);
}
void
ip_ire_g_fini(void)
{
kmem_cache_destroy(ire_cache);
+ kmem_cache_destroy(ncec_cache);
+ kmem_cache_destroy(nce_cache);
kmem_cache_destroy(rt_entry_cache);
rn_fini();
@@ -4599,9 +2308,21 @@ ip_ire_fini(ip_stack_t *ipst)
{
int i;
+ rw_destroy(&ipst->ips_ire_dep_lock);
+ rw_destroy(&ipst->ips_ip6_ire_head_lock);
+
+ ire_refrele_notr(ipst->ips_ire_reject_v6);
+ ipst->ips_ire_reject_v6 = NULL;
+ ire_refrele_notr(ipst->ips_ire_reject_v4);
+ ipst->ips_ire_reject_v4 = NULL;
+ ire_refrele_notr(ipst->ips_ire_blackhole_v6);
+ ipst->ips_ire_blackhole_v6 = NULL;
+ ire_refrele_notr(ipst->ips_ire_blackhole_v4);
+ ipst->ips_ire_blackhole_v4 = NULL;
+
/*
* Delete all IREs - assumes that the ill/ipifs have
- * been removed so what remains are just the ftable and IRE_CACHE.
+ * been removed so what remains are just the ftable to handle.
*/
ire_walk(ire_delete, NULL, ipst);
@@ -4609,23 +2330,6 @@ ip_ire_fini(ip_stack_t *ipst)
ipst->ips_ip_ftable = NULL;
mutex_destroy(&ipst->ips_ire_ft_init_lock);
- mutex_destroy(&ipst->ips_ire_handle_lock);
-
- for (i = 0; i < ipst->ips_ip_cache_table_size; i++) {
- ASSERT(ipst->ips_ip_cache_table[i].irb_ire == NULL);
- rw_destroy(&ipst->ips_ip_cache_table[i].irb_lock);
- }
- kmem_free(ipst->ips_ip_cache_table,
- ipst->ips_ip_cache_table_size * sizeof (irb_t));
- ipst->ips_ip_cache_table = NULL;
-
- for (i = 0; i < ipst->ips_ip6_cache_table_size; i++) {
- ASSERT(ipst->ips_ip_cache_table_v6[i].irb_ire == NULL);
- rw_destroy(&ipst->ips_ip_cache_table_v6[i].irb_lock);
- }
- kmem_free(ipst->ips_ip_cache_table_v6,
- ipst->ips_ip6_cache_table_size * sizeof (irb_t));
- ipst->ips_ip_cache_table_v6 = NULL;
for (i = 0; i < IP6_MASK_TABLE_SIZE; i++) {
irb_t *ptr;
@@ -4643,1116 +2347,1177 @@ ip_ire_fini(ip_stack_t *ipst)
}
}
+#ifdef DEBUG
+void
+ire_trace_ref(ire_t *ire)
+{
+ mutex_enter(&ire->ire_lock);
+ if (ire->ire_trace_disable) {
+ mutex_exit(&ire->ire_lock);
+ return;
+ }
+
+ if (th_trace_ref(ire, ire->ire_ipst)) {
+ mutex_exit(&ire->ire_lock);
+ } else {
+ ire->ire_trace_disable = B_TRUE;
+ mutex_exit(&ire->ire_lock);
+ ire_trace_cleanup(ire);
+ }
+}
+
+void
+ire_untrace_ref(ire_t *ire)
+{
+ mutex_enter(&ire->ire_lock);
+ if (!ire->ire_trace_disable)
+ th_trace_unref(ire);
+ mutex_exit(&ire->ire_lock);
+}
+
+static void
+ire_trace_cleanup(const ire_t *ire)
+{
+ th_trace_cleanup(ire, ire->ire_trace_disable);
+}
+#endif /* DEBUG */
+
/*
- * Check if another multirt route resolution is needed.
- * B_TRUE is returned is there remain a resolvable route,
- * or if no route for that dst is resolved yet.
- * B_FALSE is returned if all routes for that dst are resolved
- * or if the remaining unresolved routes are actually not
- * resolvable.
- * This only works in the global zone.
+ * Find, or create if needed, the nce_t pointer to the neighbor cache
+ * entry ncec_t for an IPv4 address. The nce_t will be created on the ill_t
+ * in the non-IPMP case, or on the cast-ill in the IPMP bcast/mcast case, or
+ * on the next available under-ill (selected by the IPMP rotor) in the
+ * unicast IPMP case.
+ *
+ * If a neighbor-cache entry has to be created (i.e., one does not already
+ * exist in the nce list) the ncec_lladdr and ncec_state of the neighbor cache
+ * entry are initialized in nce_add_v4(). The broadcast, multicast, and
+ * link-layer type determine the contents of {ncec_state, ncec_lladdr} of
+ * the ncec_t created. The ncec_lladdr is non-null for all link types with
+ * non-zero ill_phys_addr_length, though the contents may be zero in cases
+ * where the link-layer type is not known at the time of creation
+ * (e.g., IRE_IF_RESOLVER links)
+ *
+ * All IRE_BROADCAST entries have ncec_state = ND_REACHABLE, and the ncec_lladdr
+ * has the physical broadcast address of the outgoing interface.
+ * For unicast ire entries,
+ * - if the outgoing interface is of type IRE_IF_RESOLVER, a newly created
+ * ncec_t with 0 ncec_lladdr contents, and will be in the ND_INITIAL state.
+ * - if the outgoing interface is an IRE_IF_NORESOLVER interface, no link
+ * layer resolution is necessary, so that the ncec_t will be in the
+ * ND_REACHABLE state
+ *
+ * The link layer information needed for broadcast addresses, and for
+ * packets sent on IRE_IF_NORESOLVER interfaces is a constant mapping that
+ * never needs re-verification for the lifetime of the ncec_t. These are
+ * therefore marked NCE_F_NONUD.
+ *
+ * The nce returned will be created such that the nce_ill == ill that
+ * is passed in. Note that the nce itself may not have ncec_ill == ill
+ * where IPMP links are involved.
*/
-boolean_t
-ire_multirt_need_resolve(ipaddr_t dst, const ts_label_t *tsl, ip_stack_t *ipst)
+static nce_t *
+ire_nce_init(ill_t *ill, const void *addr, int ire_type)
{
- ire_t *first_fire;
- ire_t *first_cire;
- ire_t *fire;
- ire_t *cire;
- irb_t *firb;
- irb_t *cirb;
- int unres_cnt = 0;
- boolean_t resolvable = B_FALSE;
-
- /* Retrieve the first IRE_HOST that matches the destination */
- first_fire = ire_ftable_lookup(dst, IP_HOST_MASK, 0, IRE_HOST, NULL,
- NULL, ALL_ZONES, 0, tsl,
- MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_SECATTR, ipst);
-
- /* No route at all */
- if (first_fire == NULL) {
- return (B_TRUE);
+ int err;
+ nce_t *nce = NULL;
+ uint16_t ncec_flags;
+ uchar_t *hwaddr;
+ boolean_t need_refrele = B_FALSE;
+ ill_t *in_ill = ill;
+ boolean_t is_unicast;
+ uint_t hwaddr_len;
+
+ is_unicast = ((ire_type & (IRE_MULTICAST|IRE_BROADCAST)) == 0);
+ if (IS_IPMP(ill) ||
+ ((ire_type & IRE_BROADCAST) && IS_UNDER_IPMP(ill))) {
+ if ((ill = ipmp_ill_get_xmit_ill(ill, is_unicast)) == NULL)
+ return (NULL);
+ need_refrele = B_TRUE;
}
+ ncec_flags = (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0;
- firb = first_fire->ire_bucket;
- ASSERT(firb != NULL);
+ switch (ire_type) {
+ case IRE_BROADCAST:
+ ASSERT(!ill->ill_isv6);
+ ncec_flags |= (NCE_F_BCAST|NCE_F_NONUD);
+ break;
+ case IRE_MULTICAST:
+ ncec_flags |= (NCE_F_MCAST|NCE_F_NONUD);
+ break;
+ }
- /* Retrieve the first IRE_CACHE ire for that destination. */
- first_cire = ire_cache_lookup(dst, GLOBAL_ZONEID, tsl, ipst);
+ if (ill->ill_net_type == IRE_IF_NORESOLVER && is_unicast) {
+ hwaddr = ill->ill_dest_addr;
+ } else {
+ hwaddr = NULL;
+ }
+ hwaddr_len = ill->ill_phys_addr_length;
- /* No resolved route. */
- if (first_cire == NULL) {
- ire_refrele(first_fire);
- return (B_TRUE);
+retry:
+ /* nce_state will be computed by nce_add_common() */
+ if (!ill->ill_isv6) {
+ err = nce_lookup_then_add_v4(ill, hwaddr, hwaddr_len, addr,
+ ncec_flags, ND_UNCHANGED, &nce);
+ } else {
+ err = nce_lookup_then_add_v6(ill, hwaddr, hwaddr_len, addr,
+ ncec_flags, ND_UNCHANGED, &nce);
}
+ switch (err) {
+ case 0:
+ break;
+ case EEXIST:
+ /*
+ * When subnets change or partially overlap what was once
+ * a broadcast address could now be a unicast, or vice versa.
+ */
+ if (((ncec_flags ^ nce->nce_common->ncec_flags) &
+ NCE_F_BCAST) != 0) {
+ ASSERT(!ill->ill_isv6);
+ ncec_delete(nce->nce_common);
+ nce_refrele(nce);
+ goto retry;
+ }
+ break;
+ default:
+ DTRACE_PROBE2(nce__init__fail, ill_t *, ill, int, err);
+ if (need_refrele)
+ ill_refrele(ill);
+ return (NULL);
+ }
/*
- * At least one route is resolved. Here we look through the forward
- * and cache tables, to compare the number of declared routes
- * with the number of resolved routes. The search for a resolvable
- * route is performed only if at least one route remains
- * unresolved.
+ * If the ill was an under-ill of an IPMP group, we need to verify
+ * that it is still active so that we select an active interface in
+ * the group. However, since ipmp_ill_is_active ASSERTs for
+ * IS_UNDER_IPMP(), we first need to verify that the ill is an
+ * under-ill, and since this is being done in the data path, the
+ * only way to ascertain this is by holding the ill_g_lock.
*/
- cirb = first_cire->ire_bucket;
- ASSERT(cirb != NULL);
-
- /* Count the number of routes to that dest that are declared. */
- IRB_REFHOLD(firb);
- for (fire = first_fire; fire != NULL; fire = fire->ire_next) {
- if (!(fire->ire_flags & RTF_MULTIRT))
- continue;
- if (fire->ire_addr != dst)
- continue;
- unres_cnt++;
+ rw_enter(&ill->ill_ipst->ips_ill_g_lock, RW_READER);
+ mutex_enter(&ill->ill_lock);
+ mutex_enter(&ill->ill_phyint->phyint_lock);
+ if (need_refrele && IS_UNDER_IPMP(ill) && !ipmp_ill_is_active(ill)) {
+ /*
+ * need_refrele implies that the under ill was selected by
+ * ipmp_ill_get_xmit_ill() because either the in_ill was an
+ * ipmp_ill, or we are sending a non-unicast packet on
+ * an under_ill. However, when we get here, the ill selected by
+ * ipmp_ill_get_xmit_ill was pulled out of the active set
+ * (for unicast) or cast_ill nomination (for
+ * !unicast) after it was picked as the outgoing ill.
+ * We have to pick an active interface and/or cast_ill in the
+ * group.
+ */
+ mutex_exit(&ill->ill_phyint->phyint_lock);
+ nce_delete(nce);
+ mutex_exit(&ill->ill_lock);
+ rw_exit(&ill->ill_ipst->ips_ill_g_lock);
+ nce_refrele(nce);
+ ill_refrele(ill);
+ if ((ill = ipmp_ill_get_xmit_ill(in_ill, is_unicast)) == NULL)
+ return (NULL);
+ goto retry;
+ } else {
+ mutex_exit(&ill->ill_phyint->phyint_lock);
+ mutex_exit(&ill->ill_lock);
+ rw_exit(&ill->ill_ipst->ips_ill_g_lock);
}
- IRB_REFRELE(firb);
+done:
+ ASSERT(nce->nce_ill == ill);
+ if (need_refrele)
+ ill_refrele(ill);
+ return (nce);
+}
- /* Then subtract the number of routes to that dst that are resolved */
- IRB_REFHOLD(cirb);
- for (cire = first_cire; cire != NULL; cire = cire->ire_next) {
- if (!(cire->ire_flags & RTF_MULTIRT))
- continue;
- if (cire->ire_addr != dst)
- continue;
- if (cire->ire_marks & (IRE_MARK_CONDEMNED|IRE_MARK_TESTHIDDEN))
- continue;
- unres_cnt--;
- }
- IRB_REFRELE(cirb);
+nce_t *
+arp_nce_init(ill_t *ill, in_addr_t addr4, int ire_type)
+{
+ return (ire_nce_init(ill, &addr4, ire_type));
+}
- /* At least one route is unresolved; search for a resolvable route. */
- if (unres_cnt > 0)
- resolvable = ire_multirt_lookup(&first_cire, &first_fire,
- MULTIRT_USESTAMP | MULTIRT_CACHEGW, NULL, tsl, ipst);
+nce_t *
+ndp_nce_init(ill_t *ill, const in6_addr_t *addr6, int ire_type)
+{
+ ASSERT((ire_type & IRE_BROADCAST) == 0);
+ return (ire_nce_init(ill, addr6, ire_type));
+}
- if (first_fire != NULL)
- ire_refrele(first_fire);
+/*
+ * The caller should hold irb_lock as a writer if the ire is in a bucket.
+ */
+void
+ire_make_condemned(ire_t *ire)
+{
+ ip_stack_t *ipst = ire->ire_ipst;
+
+ mutex_enter(&ire->ire_lock);
+ ASSERT(ire->ire_bucket == NULL ||
+ RW_WRITE_HELD(&ire->ire_bucket->irb_lock));
+ ASSERT(!IRE_IS_CONDEMNED(ire));
+ ire->ire_generation = IRE_GENERATION_CONDEMNED;
+ /* Count how many condemned ires for kmem_cache callback */
+ atomic_add_32(&ipst->ips_num_ire_condemned, 1);
+ mutex_exit(&ire->ire_lock);
+}
- if (first_cire != NULL)
- ire_refrele(first_cire);
+/*
+ * Increment the generation avoiding the special condemned value
+ */
+void
+ire_increment_generation(ire_t *ire)
+{
+ uint_t generation;
- return (resolvable);
+ mutex_enter(&ire->ire_lock);
+ /*
+ * Even though the caller has a hold it can't prevent a concurrent
+ * ire_delete marking the IRE condemned
+ */
+ if (!IRE_IS_CONDEMNED(ire)) {
+ generation = ire->ire_generation + 1;
+ if (generation == IRE_GENERATION_CONDEMNED)
+ generation = IRE_GENERATION_INITIAL;
+ ASSERT(generation != IRE_GENERATION_VERIFY);
+ ire->ire_generation = generation;
+ }
+ mutex_exit(&ire->ire_lock);
}
/*
- * Explore a forward_table bucket, starting from fire_arg.
- * fire_arg MUST be an IRE_HOST entry.
- *
- * Return B_TRUE and update *ire_arg and *fire_arg
- * if at least one resolvable route is found. *ire_arg
- * is the IRE entry for *fire_arg's gateway.
- *
- * Return B_FALSE otherwise (all routes are resolved or
- * the remaining unresolved routes are all unresolvable).
- *
- * The IRE selection relies on a priority mechanism
- * driven by the flags passed in by the caller.
- * The caller, such as ip_newroute_ipif(), can get the most
- * relevant ire at each stage of a multiple route resolution.
- *
- * The rules are:
- *
- * - if MULTIRT_CACHEGW is specified in flags, IRE_CACHETABLE
- * ires are preferred for the gateway. This gives the highest
- * priority to routes that can be resolved without using
- * a resolver.
+ * Increment ire_generation on all the IRE_MULTICASTs
+ * Used when the default multicast interface (as determined by
+ * ill_lookup_multicast) might have changed.
*
- * - if MULTIRT_CACHEGW is not specified, or if MULTIRT_CACHEGW
- * is specified but no IRE_CACHETABLE ire entry for the gateway
- * is found, the following rules apply.
- *
- * - if MULTIRT_USESTAMP is specified in flags, IRE_INTERFACE
- * ires for the gateway, that have not been tried since
- * a configurable amount of time, are preferred.
- * This applies when a resolver must be invoked for
- * a missing route, but we don't want to use the resolver
- * upon each packet emission. If no such resolver is found,
- * B_FALSE is returned.
- * The MULTIRT_USESTAMP flag can be combined with
- * MULTIRT_CACHEGW.
- *
- * - if MULTIRT_USESTAMP is not specified in flags, the first
- * unresolved but resolvable route is selected.
- *
- * - Otherwise, there is no resolvable route, and
- * B_FALSE is returned.
- *
- * At last, MULTIRT_SETSTAMP can be specified in flags to
- * request the timestamp of unresolvable routes to
- * be refreshed. This prevents the useless exploration
- * of those routes for a while, when MULTIRT_USESTAMP is used.
- *
- * The argument already_resolved_count is an output variable to track number
- * of already resolved multirt routes.
- *
- * This only works in the global zone.
+ * That includes the zoneid, IFF_ flags, the IPv6 scope of the address, and
+ * ill unplumb.
*/
-boolean_t
-ire_multirt_lookup(ire_t **ire_arg, ire_t **fire_arg, uint32_t flags,
- int *already_resolved_count, const ts_label_t *tsl, ip_stack_t *ipst)
+void
+ire_increment_multicast_generation(ip_stack_t *ipst, boolean_t isv6)
{
- clock_t delta;
- ire_t *best_fire = NULL;
- ire_t *best_cire = NULL;
- ire_t *first_fire;
- ire_t *first_cire;
- ire_t *fire;
- ire_t *cire;
- irb_t *firb = NULL;
- irb_t *cirb = NULL;
- ire_t *gw_ire;
- boolean_t already_resolved;
- boolean_t res;
- ipaddr_t dst;
- ipaddr_t gw;
-
- ip2dbg(("ire_multirt_lookup: *ire_arg %p, *fire_arg %p, flags %04x\n",
- (void *)*ire_arg, (void *)*fire_arg, flags));
-
- ASSERT(ire_arg != NULL);
- ASSERT(fire_arg != NULL);
-
- /* Not an IRE_HOST ire; give up. */
- if ((*fire_arg == NULL) || ((*fire_arg)->ire_type != IRE_HOST)) {
- return (B_FALSE);
+ ill_t *ill;
+ ill_walk_context_t ctx;
+
+ rw_enter(&ipst->ips_ill_g_lock, RW_READER);
+ if (isv6)
+ ill = ILL_START_WALK_V6(&ctx, ipst);
+ else
+ ill = ILL_START_WALK_V4(&ctx, ipst);
+ for (; ill != NULL; ill = ill_next(&ctx, ill)) {
+ if (ILL_IS_CONDEMNED(ill))
+ continue;
+ if (ill->ill_ire_multicast != NULL)
+ ire_increment_generation(ill->ill_ire_multicast);
}
+ rw_exit(&ipst->ips_ill_g_lock);
+}
- /* This is the first IRE_HOST ire for that destination. */
- first_fire = *fire_arg;
- firb = first_fire->ire_bucket;
- ASSERT(firb != NULL);
+/*
+ * Return a held IRE_NOROUTE with RTF_REJECT set
+ */
+ire_t *
+ire_reject(ip_stack_t *ipst, boolean_t isv6)
+{
+ ire_t *ire;
- dst = first_fire->ire_addr;
+ if (isv6)
+ ire = ipst->ips_ire_reject_v6;
+ else
+ ire = ipst->ips_ire_reject_v4;
- ip2dbg(("ire_multirt_lookup: dst %08x\n", ntohl(dst)));
+ ASSERT(ire->ire_generation != IRE_GENERATION_CONDEMNED);
+ ire_refhold(ire);
+ return (ire);
+}
- /*
- * Retrieve the first IRE_CACHE ire for that destination;
- * if we don't find one, no route for that dest is
- * resolved yet.
- */
- first_cire = ire_cache_lookup(dst, GLOBAL_ZONEID, tsl, ipst);
- if (first_cire != NULL) {
- cirb = first_cire->ire_bucket;
- }
+/*
+ * Return a held IRE_NOROUTE with RTF_BLACKHOLE set
+ */
+ire_t *
+ire_blackhole(ip_stack_t *ipst, boolean_t isv6)
+{
+ ire_t *ire;
- ip2dbg(("ire_multirt_lookup: first_cire %p\n", (void *)first_cire));
+ if (isv6)
+ ire = ipst->ips_ire_blackhole_v6;
+ else
+ ire = ipst->ips_ire_blackhole_v4;
- /*
- * Search for a resolvable route, giving the top priority
- * to routes that can be resolved without any call to the resolver.
- */
- IRB_REFHOLD(firb);
+ ASSERT(ire->ire_generation != IRE_GENERATION_CONDEMNED);
+ ire_refhold(ire);
+ return (ire);
+}
+
+/*
+ * Return a held IRE_MULTICAST.
+ */
+ire_t *
+ire_multicast(ill_t *ill)
+{
+ ire_t *ire = ill->ill_ire_multicast;
+
+ ASSERT(ire == NULL || ire->ire_generation != IRE_GENERATION_CONDEMNED);
+ if (ire == NULL)
+ ire = ire_blackhole(ill->ill_ipst, ill->ill_isv6);
+ else
+ ire_refhold(ire);
+ return (ire);
+}
+
+/*
+ * Given an IRE return its nexthop IRE. The nexthop IRE is an IRE_ONLINK
+ * that is an exact match (i.e., a /32 for IPv4 and /128 for IPv6).
+ * This can return an RTF_REJECT|RTF_BLACKHOLE.
+ * The returned IRE is held.
+ * The assumption is that ip_select_route() has been called and returned the
+ * IRE (thus ip_select_route would have set up the ire_dep* information.)
+ * If some IRE is deleteted then ire_dep_remove() will have been called and
+ * we might not find a nexthop IRE, in which case we return NULL.
+ */
+ire_t *
+ire_nexthop(ire_t *ire)
+{
+ ip_stack_t *ipst = ire->ire_ipst;
- if (!CLASSD(dst)) {
+ /* Acquire lock to walk ire_dep_parent */
+ rw_enter(&ipst->ips_ire_dep_lock, RW_READER);
+ while (ire != NULL) {
+ if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
+ goto done;
+ }
/*
- * For all multiroute IRE_HOST ires for that destination,
- * check if the route via the IRE_HOST's gateway is
- * resolved yet.
+ * If we find an IRE_ONLINK we are done. This includes
+ * the case of IRE_MULTICAST.
+ * Note that in order to send packets we need a host-specific
+ * IRE_IF_ALL first in the ire_dep_parent chain. Normally this
+ * is done by inserting an IRE_IF_CLONE if the IRE_INTERFACE
+ * was not host specific.
+ * However, ip_rts_request doesn't want to send packets
+ * hence doesn't want to allocate an IRE_IF_CLONE. Yet
+ * it needs an IRE_IF_ALL to get to the ill. Thus
+ * we return IRE_IF_ALL that are not host specific here.
*/
- for (fire = first_fire; fire != NULL; fire = fire->ire_next) {
-
- if (!(fire->ire_flags & RTF_MULTIRT))
- continue;
- if (fire->ire_addr != dst)
- continue;
+ if (ire->ire_type & IRE_ONLINK)
+ goto done;
+ ire = ire->ire_dep_parent;
+ }
+ rw_exit(&ipst->ips_ire_dep_lock);
+ return (NULL);
- if (fire->ire_gw_secattr != NULL &&
- tsol_ire_match_gwattr(fire, tsl) != 0) {
- continue;
- }
+done:
+ ire_refhold(ire);
+ rw_exit(&ipst->ips_ire_dep_lock);
+ return (ire);
+}
- gw = fire->ire_gateway_addr;
-
- ip2dbg(("ire_multirt_lookup: fire %p, "
- "ire_addr %08x, ire_gateway_addr %08x\n",
- (void *)fire, ntohl(fire->ire_addr), ntohl(gw)));
-
- already_resolved = B_FALSE;
-
- if (first_cire != NULL) {
- ASSERT(cirb != NULL);
-
- IRB_REFHOLD(cirb);
- /*
- * For all IRE_CACHE ires for that
- * destination.
- */
- for (cire = first_cire;
- cire != NULL;
- cire = cire->ire_next) {
-
- if (!(cire->ire_flags & RTF_MULTIRT))
- continue;
- if (cire->ire_addr != dst)
- continue;
- if (cire->ire_marks &
- (IRE_MARK_CONDEMNED |
- IRE_MARK_TESTHIDDEN))
- continue;
-
- if (cire->ire_gw_secattr != NULL &&
- tsol_ire_match_gwattr(cire,
- tsl) != 0) {
- continue;
- }
+/*
+ * Find the ill used to send packets. This will be NULL in case
+ * of a reject or blackhole.
+ * The returned ill is held; caller needs to do ill_refrele when done.
+ */
+ill_t *
+ire_nexthop_ill(ire_t *ire)
+{
+ ill_t *ill;
- /*
- * Check if the IRE_CACHE's gateway
- * matches the IRE_HOST's gateway.
- */
- if (cire->ire_gateway_addr == gw) {
- already_resolved = B_TRUE;
- break;
- }
- }
- IRB_REFRELE(cirb);
- }
+ ire = ire_nexthop(ire);
+ if (ire == NULL)
+ return (NULL);
- /*
- * This route is already resolved;
- * proceed with next one.
- */
- if (already_resolved) {
- ip2dbg(("ire_multirt_lookup: found cire %p, "
- "already resolved\n", (void *)cire));
+ /* ire_ill can not change for an existing ire */
+ ill = ire->ire_ill;
+ if (ill != NULL)
+ ill_refhold(ill);
+ ire_refrele(ire);
+ return (ill);
+}
- if (already_resolved_count != NULL)
- (*already_resolved_count)++;
- continue;
- }
+#ifdef DEBUG
+static boolean_t
+parent_has_child(ire_t *parent, ire_t *child)
+{
+ ire_t *ire;
+ ire_t *prev;
- /*
- * The route is unresolved; is it actually
- * resolvable, i.e. is there a cache or a resolver
- * for the gateway?
- */
- gw_ire = ire_route_lookup(gw, 0, 0, 0, NULL, NULL,
- ALL_ZONES, tsl,
- MATCH_IRE_RECURSIVE | MATCH_IRE_SECATTR, ipst);
+ ire = parent->ire_dep_children;
+ prev = NULL;
+ while (ire != NULL) {
+ if (prev == NULL) {
+ ASSERT(ire->ire_dep_sib_ptpn ==
+ &(parent->ire_dep_children));
+ } else {
+ ASSERT(ire->ire_dep_sib_ptpn ==
+ &(prev->ire_dep_sib_next));
+ }
+ if (ire == child)
+ return (B_TRUE);
+ prev = ire;
+ ire = ire->ire_dep_sib_next;
+ }
+ return (B_FALSE);
+}
- ip2dbg(("ire_multirt_lookup: looked up gw_ire %p\n",
- (void *)gw_ire));
+static void
+ire_dep_verify(ire_t *ire)
+{
+ ire_t *parent = ire->ire_dep_parent;
+ ire_t *child = ire->ire_dep_children;
- /*
- * If gw_ire is typed IRE_CACHETABLE,
- * this route can be resolved without any call to the
- * resolver. If the MULTIRT_CACHEGW flag is set,
- * give the top priority to this ire and exit the
- * loop.
- * This is typically the case when an ARP reply
- * is processed through ip_wput_nondata().
- */
- if ((flags & MULTIRT_CACHEGW) &&
- (gw_ire != NULL) &&
- (gw_ire->ire_type & IRE_CACHETABLE)) {
- ASSERT(gw_ire->ire_nce == NULL ||
- gw_ire->ire_nce->nce_state == ND_REACHABLE);
- /*
- * Release the resolver associated to the
- * previous candidate best ire, if any.
- */
- if (best_cire != NULL) {
- ire_refrele(best_cire);
- ASSERT(best_fire != NULL);
- }
+ ASSERT(ire->ire_ipversion == IPV4_VERSION ||
+ ire->ire_ipversion == IPV6_VERSION);
+ if (parent != NULL) {
+ ASSERT(parent->ire_ipversion == IPV4_VERSION ||
+ parent->ire_ipversion == IPV6_VERSION);
+ ASSERT(parent->ire_refcnt >= 1);
+ ASSERT(parent_has_child(parent, ire));
+ }
+ if (child != NULL) {
+ ASSERT(child->ire_ipversion == IPV4_VERSION ||
+ child->ire_ipversion == IPV6_VERSION);
+ ASSERT(child->ire_dep_parent == ire);
+ ASSERT(child->ire_dep_sib_ptpn != NULL);
+ ASSERT(parent_has_child(ire, child));
+ }
+}
+#endif /* DEBUG */
- best_fire = fire;
- best_cire = gw_ire;
+/*
+ * Assumes ire_dep_parent is set. Remove this child from its parent's linkage.
+ */
+void
+ire_dep_remove(ire_t *ire)
+{
+ ip_stack_t *ipst = ire->ire_ipst;
+ ire_t *parent = ire->ire_dep_parent;
+ ire_t *next;
+ nce_t *nce;
- ip2dbg(("ire_multirt_lookup: found top prio "
- "best_fire %p, best_cire %p\n",
- (void *)best_fire, (void *)best_cire));
- break;
- }
+ ASSERT(RW_WRITE_HELD(&ipst->ips_ire_dep_lock));
+ ASSERT(ire->ire_dep_parent != NULL);
+ ASSERT(ire->ire_dep_sib_ptpn != NULL);
- /*
- * Compute the time elapsed since our preceding
- * attempt to resolve that route.
- * If the MULTIRT_USESTAMP flag is set, we take that
- * route into account only if this time interval
- * exceeds ip_multirt_resolution_interval;
- * this prevents us from attempting to resolve a
- * broken route upon each sending of a packet.
- */
- delta = lbolt - fire->ire_last_used_time;
- delta = TICK_TO_MSEC(delta);
-
- res = (boolean_t)((delta >
- ipst->ips_ip_multirt_resolution_interval) ||
- (!(flags & MULTIRT_USESTAMP)));
-
- ip2dbg(("ire_multirt_lookup: fire %p, delta %lu, "
- "res %d\n",
- (void *)fire, delta, res));
-
- if (res) {
- /*
- * We are here if MULTIRT_USESTAMP flag is set
- * and the resolver for fire's gateway
- * has not been tried since
- * ip_multirt_resolution_interval, or if
- * MULTIRT_USESTAMP is not set but gw_ire did
- * not fill the conditions for MULTIRT_CACHEGW,
- * or if neither MULTIRT_USESTAMP nor
- * MULTIRT_CACHEGW are set.
- */
- if (gw_ire != NULL) {
- if (best_fire == NULL) {
- ASSERT(best_cire == NULL);
-
- best_fire = fire;
- best_cire = gw_ire;
-
- ip2dbg(("ire_multirt_lookup:"
- "found candidate "
- "best_fire %p, "
- "best_cire %p\n",
- (void *)best_fire,
- (void *)best_cire));
-
- /*
- * If MULTIRT_CACHEGW is not
- * set, we ignore the top
- * priority ires that can
- * be resolved without any
- * call to the resolver;
- * In that case, there is
- * actually no need
- * to continue the loop.
- */
- if (!(flags &
- MULTIRT_CACHEGW)) {
- break;
- }
- continue;
- }
- } else {
- /*
- * No resolver for the gateway: the
- * route is not resolvable.
- * If the MULTIRT_SETSTAMP flag is
- * set, we stamp the IRE_HOST ire,
- * so we will not select it again
- * during this resolution interval.
- */
- if (flags & MULTIRT_SETSTAMP)
- fire->ire_last_used_time =
- lbolt;
- }
- }
+#ifdef DEBUG
+ ire_dep_verify(ire);
+ ire_dep_verify(parent);
+#endif
- if (gw_ire != NULL)
- ire_refrele(gw_ire);
- }
- } else { /* CLASSD(dst) */
+ next = ire->ire_dep_sib_next;
+ if (next != NULL)
+ next->ire_dep_sib_ptpn = ire->ire_dep_sib_ptpn;
- for (fire = first_fire;
- fire != NULL;
- fire = fire->ire_next) {
+ ASSERT(*(ire->ire_dep_sib_ptpn) == ire);
+ *(ire->ire_dep_sib_ptpn) = ire->ire_dep_sib_next;
- if (!(fire->ire_flags & RTF_MULTIRT))
- continue;
- if (fire->ire_addr != dst)
- continue;
+ ire->ire_dep_sib_ptpn = NULL;
+ ire->ire_dep_sib_next = NULL;
- if (fire->ire_gw_secattr != NULL &&
- tsol_ire_match_gwattr(fire, tsl) != 0) {
- continue;
- }
+ mutex_enter(&ire->ire_lock);
+ parent = ire->ire_dep_parent;
+ ire->ire_dep_parent = NULL;
+ mutex_exit(&ire->ire_lock);
- already_resolved = B_FALSE;
+ /*
+ * Make sure all our children, grandchildren, etc set
+ * ire_dep_parent_generation to IRE_GENERATION_VERIFY since
+ * we can no longer guarantee that the children have a current
+ * ire_nce_cache and ire_nexthop_ill().
+ */
+ if (ire->ire_dep_children != NULL)
+ ire_dep_invalidate_children(ire->ire_dep_children);
- gw = fire->ire_gateway_addr;
+ /*
+ * Since the parent is gone we make sure we clear ire_nce_cache.
+ * We can clear it under ire_lock even if the IRE is used
+ */
+ mutex_enter(&ire->ire_lock);
+ nce = ire->ire_nce_cache;
+ ire->ire_nce_cache = NULL;
+ mutex_exit(&ire->ire_lock);
+ if (nce != NULL)
+ nce_refrele(nce);
- gw_ire = ire_ftable_lookup(gw, 0, 0, IRE_INTERFACE,
- NULL, NULL, ALL_ZONES, 0, tsl,
- MATCH_IRE_RECURSIVE | MATCH_IRE_TYPE |
- MATCH_IRE_SECATTR, ipst);
+#ifdef DEBUG
+ ire_dep_verify(ire);
+ ire_dep_verify(parent);
+#endif
- /* No resolver for the gateway; we skip this ire. */
- if (gw_ire == NULL) {
- continue;
- }
- ASSERT(gw_ire->ire_nce == NULL ||
- gw_ire->ire_nce->nce_state == ND_REACHABLE);
-
- if (first_cire != NULL) {
-
- IRB_REFHOLD(cirb);
- /*
- * For all IRE_CACHE ires for that
- * destination.
- */
- for (cire = first_cire;
- cire != NULL;
- cire = cire->ire_next) {
-
- if (!(cire->ire_flags & RTF_MULTIRT))
- continue;
- if (cire->ire_addr != dst)
- continue;
- if (cire->ire_marks &
- (IRE_MARK_CONDEMNED |
- IRE_MARK_TESTHIDDEN))
- continue;
-
- if (cire->ire_gw_secattr != NULL &&
- tsol_ire_match_gwattr(cire,
- tsl) != 0) {
- continue;
- }
+ ire_refrele_notr(parent);
+ ire_refrele_notr(ire);
+}
- /*
- * Cache entries are linked to the
- * parent routes using the parent handle
- * (ire_phandle). If no cache entry has
- * the same handle as fire, fire is
- * still unresolved.
- */
- ASSERT(cire->ire_phandle != 0);
- if (cire->ire_phandle ==
- fire->ire_phandle) {
- already_resolved = B_TRUE;
- break;
- }
- }
- IRB_REFRELE(cirb);
- }
+/*
+ * Insert the child in the linkage of the parent
+ */
+static void
+ire_dep_parent_insert(ire_t *child, ire_t *parent)
+{
+ ip_stack_t *ipst = child->ire_ipst;
+ ire_t *next;
- /*
- * This route is already resolved; proceed with
- * next one.
- */
- if (already_resolved) {
- ire_refrele(gw_ire);
- if (already_resolved_count != NULL)
- (*already_resolved_count)++;
- continue;
- }
+ ASSERT(RW_WRITE_HELD(&ipst->ips_ire_dep_lock));
+ ASSERT(child->ire_dep_parent == NULL);
- /*
- * Compute the time elapsed since our preceding
- * attempt to resolve that route.
- * If the MULTIRT_USESTAMP flag is set, we take
- * that route into account only if this time
- * interval exceeds ip_multirt_resolution_interval;
- * this prevents us from attempting to resolve a
- * broken route upon each sending of a packet.
- */
- delta = lbolt - fire->ire_last_used_time;
- delta = TICK_TO_MSEC(delta);
-
- res = (boolean_t)((delta >
- ipst->ips_ip_multirt_resolution_interval) ||
- (!(flags & MULTIRT_USESTAMP)));
-
- ip3dbg(("ire_multirt_lookup: fire %p, delta %lx, "
- "flags %04x, res %d\n",
- (void *)fire, delta, flags, res));
-
- if (res) {
- if (best_cire != NULL) {
- /*
- * Release the resolver associated
- * to the preceding candidate best
- * ire, if any.
- */
- ire_refrele(best_cire);
- ASSERT(best_fire != NULL);
- }
- best_fire = fire;
- best_cire = gw_ire;
- continue;
- }
+#ifdef DEBUG
+ ire_dep_verify(child);
+ ire_dep_verify(parent);
+#endif
+ /* No parents => no siblings */
+ ASSERT(child->ire_dep_sib_ptpn == NULL);
+ ASSERT(child->ire_dep_sib_next == NULL);
- ire_refrele(gw_ire);
- }
- }
+ ire_refhold_notr(parent);
+ ire_refhold_notr(child);
- if (best_fire != NULL) {
- IRE_REFHOLD(best_fire);
+ /* Head insertion */
+ next = parent->ire_dep_children;
+ if (next != NULL) {
+ ASSERT(next->ire_dep_sib_ptpn == &(parent->ire_dep_children));
+ child->ire_dep_sib_next = next;
+ next->ire_dep_sib_ptpn = &(child->ire_dep_sib_next);
}
- IRB_REFRELE(firb);
+ parent->ire_dep_children = child;
+ child->ire_dep_sib_ptpn = &(parent->ire_dep_children);
- /* Release the first IRE_CACHE we initially looked up, if any. */
- if (first_cire != NULL)
- ire_refrele(first_cire);
+ mutex_enter(&child->ire_lock);
+ child->ire_dep_parent = parent;
+ mutex_exit(&child->ire_lock);
- /* Found a resolvable route. */
- if (best_fire != NULL) {
- ASSERT(best_cire != NULL);
-
- if (*fire_arg != NULL)
- ire_refrele(*fire_arg);
- if (*ire_arg != NULL)
- ire_refrele(*ire_arg);
+#ifdef DEBUG
+ ire_dep_verify(child);
+ ire_dep_verify(parent);
+#endif
+}
- /*
- * Update the passed-in arguments with the
- * resolvable multirt route we found.
- */
- *fire_arg = best_fire;
- *ire_arg = best_cire;
- ip2dbg(("ire_multirt_lookup: returning B_TRUE, "
- "*fire_arg %p, *ire_arg %p\n",
- (void *)best_fire, (void *)best_cire));
+/*
+ * Given count worth of ires and generations, build ire_dep_* relationships
+ * from ires[0] to ires[count-1]. Record generations[i+1] in
+ * ire_dep_parent_generation for ires[i].
+ * We graft onto an existing parent chain by making sure that we don't
+ * touch ire_dep_parent for ires[count-1].
+ *
+ * We check for any condemned ire_generation count and return B_FALSE in
+ * that case so that the caller can tear it apart.
+ *
+ * Note that generations[0] is not used. Caller handles that.
+ */
+boolean_t
+ire_dep_build(ire_t *ires[], uint_t generations[], uint_t count)
+{
+ ire_t *ire = ires[0];
+ ip_stack_t *ipst;
+ uint_t i;
+ ASSERT(count > 0);
+ if (count == 1) {
+ /* No work to do */
return (B_TRUE);
}
+ ipst = ire->ire_ipst;
+ rw_enter(&ipst->ips_ire_dep_lock, RW_WRITER);
+ /*
+ * Do not remove the linkage for any existing parent chain i.e.,
+ * ires[count-1] is left alone.
+ */
+ for (i = 0; i < count-1; i++) {
+ /* Remove existing parent if we need to change it */
+ if (ires[i]->ire_dep_parent != NULL &&
+ ires[i]->ire_dep_parent != ires[i+1])
+ ire_dep_remove(ires[i]);
+ }
- ASSERT(best_cire == NULL);
+ for (i = 0; i < count - 1; i++) {
+ ASSERT(ires[i]->ire_ipversion == IPV4_VERSION ||
+ ires[i]->ire_ipversion == IPV6_VERSION);
+ /* Does it need to change? */
+ if (ires[i]->ire_dep_parent != ires[i+1])
+ ire_dep_parent_insert(ires[i], ires[i+1]);
- ip2dbg(("ire_multirt_lookup: returning B_FALSE, *fire_arg %p, "
- "*ire_arg %p\n",
- (void *)*fire_arg, (void *)*ire_arg));
+ mutex_enter(&ires[i+1]->ire_lock);
+ if (IRE_IS_CONDEMNED(ires[i+1])) {
+ mutex_exit(&ires[i+1]->ire_lock);
+ rw_exit(&ipst->ips_ire_dep_lock);
+ return (B_FALSE);
+ }
+ mutex_exit(&ires[i+1]->ire_lock);
- /* No resolvable route. */
- return (B_FALSE);
+ mutex_enter(&ires[i]->ire_lock);
+ ires[i]->ire_dep_parent_generation = generations[i+1];
+ mutex_exit(&ires[i]->ire_lock);
+ }
+ rw_exit(&ipst->ips_ire_dep_lock);
+ return (B_TRUE);
}
/*
- * IRE iterator for inbound and loopback broadcast processing.
- * Given an IRE_BROADCAST ire, walk the ires with the same destination
- * address, but skip over the passed-in ire. Returns the next ire without
- * a hold - assumes that the caller holds a reference on the IRE bucket.
+ * Given count worth of ires, unbuild ire_dep_* relationships
+ * from ires[0] to ires[count-1].
*/
-ire_t *
-ire_get_next_bcast_ire(ire_t *curr, ire_t *ire)
+void
+ire_dep_unbuild(ire_t *ires[], uint_t count)
{
- ill_t *ill;
+ ip_stack_t *ipst;
+ uint_t i;
- if (curr == NULL) {
- for (curr = ire->ire_bucket->irb_ire; curr != NULL;
- curr = curr->ire_next) {
- if (curr->ire_addr == ire->ire_addr)
- break;
- }
- } else {
- curr = curr->ire_next;
+ if (count == 0) {
+ /* No work to do */
+ return;
}
- ill = ire_to_ill(ire);
- for (; curr != NULL; curr = curr->ire_next) {
- if (curr->ire_addr != ire->ire_addr) {
- /*
- * All the IREs to a given destination are contiguous;
- * break out once the address doesn't match.
- */
- break;
- }
- if (curr == ire) {
- /* skip over the passed-in ire */
- continue;
- }
- if ((curr->ire_stq != NULL && ire->ire_stq == NULL) ||
- (curr->ire_stq == NULL && ire->ire_stq != NULL)) {
+ ipst = ires[0]->ire_ipst;
+ rw_enter(&ipst->ips_ire_dep_lock, RW_WRITER);
+ for (i = 0; i < count; i++) {
+ ASSERT(ires[i]->ire_ipversion == IPV4_VERSION ||
+ ires[i]->ire_ipversion == IPV6_VERSION);
+ if (ires[i]->ire_dep_parent != NULL)
+ ire_dep_remove(ires[i]);
+ mutex_enter(&ires[i]->ire_lock);
+ ires[i]->ire_dep_parent_generation = IRE_GENERATION_VERIFY;
+ mutex_exit(&ires[i]->ire_lock);
+ }
+ rw_exit(&ipst->ips_ire_dep_lock);
+}
+
+/*
+ * Both the forwarding and the outbound code paths can trip on
+ * a condemned NCE, in which case we call this function.
+ * We have two different behaviors: if the NCE was UNREACHABLE
+ * it is an indication that something failed. In that case
+ * we see if we should look for a different IRE (for example,
+ * delete any matching redirect IRE, or try a different
+ * IRE_DEFAULT (ECMP)). We mark the ire as bad so a hopefully
+ * different IRE will be picked next time we send/forward.
+ *
+ * If we are called by the output path then fail_if_better is set
+ * and we return NULL if there could be a better IRE. This is because the
+ * output path retries the IRE lookup. (The input/forward path can not retry.)
+ *
+ * If the NCE was not unreachable then we pick/allocate a
+ * new (most likely ND_INITIAL) NCE and proceed with it.
+ *
+ * ipha/ip6h are needed for multicast packets; ipha needs to be
+ * set for IPv4 and ip6h needs to be set for IPv6 packets.
+ */
+nce_t *
+ire_handle_condemned_nce(nce_t *nce, ire_t *ire, ipha_t *ipha, ip6_t *ip6h,
+ boolean_t fail_if_better)
+{
+ if (nce->nce_common->ncec_state == ND_UNREACHABLE) {
+ if (ire_no_good(ire) && fail_if_better) {
/*
- * If the passed-in ire is loopback, skip over
- * non-loopback ires and vice versa.
+ * Did some changes, or ECMP likely to exist.
+ * Make ip_output look for a different IRE
*/
- continue;
+ return (NULL);
}
- if (ire_to_ill(curr) != ill) {
- /* skip over IREs going through a different interface */
- continue;
+ }
+ if (ire_revalidate_nce(ire) == ENETUNREACH) {
+ /* The ire_dep_parent chain went bad, or no memory? */
+ (void) ire_no_good(ire);
+ return (NULL);
+ }
+ if (ire->ire_ipversion == IPV4_VERSION) {
+ ASSERT(ipha != NULL);
+ nce = ire_to_nce(ire, ipha->ipha_dst, NULL);
+ } else {
+ ASSERT(ip6h != NULL);
+ nce = ire_to_nce(ire, INADDR_ANY, &ip6h->ip6_dst);
+ }
+
+ if (nce == NULL)
+ return (NULL);
+ if (nce->nce_is_condemned) {
+ nce_refrele(nce);
+ return (NULL);
+ }
+ return (nce);
+}
+
+/*
+ * The caller has found that the ire is bad, either due to a reference to an NCE
+ * in ND_UNREACHABLE state, or a MULTIRT route whose gateway can't be resolved.
+ * We update things so a subsequent attempt to send to the destination
+ * is likely to find different IRE, or that a new NCE would be created.
+ *
+ * Returns B_TRUE if it is likely that a subsequent ire_ftable_lookup would
+ * find a different route (either due to having deleted a redirect, or there
+ * being ECMP routes.)
+ *
+ * If we have a redirect (RTF_DYNAMIC) we delete it.
+ * Otherwise we increment ire_badcnt and increment the generation number so
+ * that a cached ixa_ire will redo the route selection. ire_badcnt is taken
+ * into account in the route selection when we have multiple choices (multiple
+ * default routes or ECMP in general).
+ * Any time ip_select_route find an ire with a condemned ire_nce_cache
+ * (e.g., if no equal cost route to the bad one) ip_select_route will make
+ * sure the NCE is revalidated to avoid getting stuck on a
+ * NCE_F_CONDEMNED ncec that caused ire_no_good to be called.
+ */
+boolean_t
+ire_no_good(ire_t *ire)
+{
+ ip_stack_t *ipst = ire->ire_ipst;
+ ire_t *ire2;
+ nce_t *nce;
+
+ if (ire->ire_flags & RTF_DYNAMIC) {
+ ire_delete(ire);
+ return (B_TRUE);
+ }
+ if (ire->ire_flags & RTF_INDIRECT) {
+ /* Check if next IRE is a redirect */
+ rw_enter(&ipst->ips_ire_dep_lock, RW_READER);
+ if (ire->ire_dep_parent != NULL &&
+ (ire->ire_dep_parent->ire_flags & RTF_DYNAMIC)) {
+ ire2 = ire->ire_dep_parent;
+ ire_refhold(ire2);
+ } else {
+ ire2 = NULL;
}
- if (curr->ire_marks & IRE_MARK_CONDEMNED) {
- /* skip over deleted IREs */
- continue;
+ rw_exit(&ipst->ips_ire_dep_lock);
+ if (ire2 != NULL) {
+ ire_delete(ire2);
+ ire_refrele(ire2);
+ return (B_TRUE);
}
- return (curr);
}
- return (NULL);
+ /*
+ * No redirect involved. Increment badcnt so that if we have ECMP
+ * routes we are likely to pick a different one for the next packet.
+ *
+ * If the NCE is unreachable and condemned we should drop the reference
+ * to it so that a new NCE can be created.
+ *
+ * Finally we increment the generation number so that any ixa_ire
+ * cache will be revalidated.
+ */
+ mutex_enter(&ire->ire_lock);
+ ire->ire_badcnt++;
+ ire->ire_last_badcnt = TICK_TO_SEC(lbolt64);
+ nce = ire->ire_nce_cache;
+ if (nce != NULL && nce->nce_is_condemned &&
+ nce->nce_common->ncec_state == ND_UNREACHABLE)
+ ire->ire_nce_cache = NULL;
+ else
+ nce = NULL;
+ mutex_exit(&ire->ire_lock);
+ if (nce != NULL)
+ nce_refrele(nce);
+
+ ire_increment_generation(ire);
+ ire_dep_incr_generation(ire);
+
+ return (ire->ire_bucket->irb_ire_cnt > 1);
}
-#ifdef DEBUG
-void
-ire_trace_ref(ire_t *ire)
+/*
+ * Walk ire_dep_parent chain and validate that ire_dep_parent->ire_generation ==
+ * ire_dep_parent_generation.
+ * If they all match we just return ire_generation from the topmost IRE.
+ * Otherwise we propagate the mismatch by setting all ire_dep_parent_generation
+ * above the mismatch to IRE_GENERATION_VERIFY and also returning
+ * IRE_GENERATION_VERIFY.
+ */
+uint_t
+ire_dep_validate_generations(ire_t *ire)
{
- mutex_enter(&ire->ire_lock);
- if (ire->ire_trace_disable) {
+ ip_stack_t *ipst = ire->ire_ipst;
+ uint_t generation;
+ ire_t *ire1;
+
+ rw_enter(&ipst->ips_ire_dep_lock, RW_READER);
+ generation = ire->ire_generation; /* Assuming things match */
+ for (ire1 = ire; ire1 != NULL; ire1 = ire1->ire_dep_parent) {
+ ASSERT(ire1->ire_ipversion == IPV4_VERSION ||
+ ire1->ire_ipversion == IPV6_VERSION);
+ if (ire1->ire_dep_parent == NULL)
+ break;
+ if (ire1->ire_dep_parent_generation !=
+ ire1->ire_dep_parent->ire_generation)
+ goto mismatch;
+ }
+ rw_exit(&ipst->ips_ire_dep_lock);
+ return (generation);
+
+mismatch:
+ generation = IRE_GENERATION_VERIFY;
+ /* Fill from top down to the mismatch with _VERIFY */
+ while (ire != ire1) {
+ ASSERT(ire->ire_ipversion == IPV4_VERSION ||
+ ire->ire_ipversion == IPV6_VERSION);
+ mutex_enter(&ire->ire_lock);
+ ire->ire_dep_parent_generation = IRE_GENERATION_VERIFY;
mutex_exit(&ire->ire_lock);
- return;
+ ire = ire->ire_dep_parent;
}
+ rw_exit(&ipst->ips_ire_dep_lock);
+ return (generation);
+}
- if (th_trace_ref(ire, ire->ire_ipst)) {
- mutex_exit(&ire->ire_lock);
- } else {
- ire->ire_trace_disable = B_TRUE;
+/*
+ * Used when we need to return an ire with ire_dep_parent, but we
+ * know the chain is invalid for instance we didn't create an IRE_IF_CLONE
+ * Using IRE_GENERATION_VERIFY means that next time we'll redo the
+ * recursive lookup.
+ */
+void
+ire_dep_invalidate_generations(ire_t *ire)
+{
+ ip_stack_t *ipst = ire->ire_ipst;
+
+ rw_enter(&ipst->ips_ire_dep_lock, RW_READER);
+ while (ire != NULL) {
+ ASSERT(ire->ire_ipversion == IPV4_VERSION ||
+ ire->ire_ipversion == IPV6_VERSION);
+ mutex_enter(&ire->ire_lock);
+ ire->ire_dep_parent_generation = IRE_GENERATION_VERIFY;
mutex_exit(&ire->ire_lock);
- ire_trace_cleanup(ire);
+ ire = ire->ire_dep_parent;
}
+ rw_exit(&ipst->ips_ire_dep_lock);
}
-void
-ire_untrace_ref(ire_t *ire)
+/* Set _VERIFY ire_dep_parent_generation for all children recursively */
+static void
+ire_dep_invalidate_children(ire_t *child)
{
- mutex_enter(&ire->ire_lock);
- if (!ire->ire_trace_disable)
- th_trace_unref(ire);
- mutex_exit(&ire->ire_lock);
+ ip_stack_t *ipst = child->ire_ipst;
+
+ ASSERT(RW_WRITE_HELD(&ipst->ips_ire_dep_lock));
+ /* Depth first */
+ if (child->ire_dep_children != NULL)
+ ire_dep_invalidate_children(child->ire_dep_children);
+
+ while (child != NULL) {
+ mutex_enter(&child->ire_lock);
+ child->ire_dep_parent_generation = IRE_GENERATION_VERIFY;
+ mutex_exit(&child->ire_lock);
+ child = child->ire_dep_sib_next;
+ }
}
static void
-ire_trace_cleanup(const ire_t *ire)
+ire_dep_increment_children(ire_t *child)
{
- th_trace_cleanup(ire, ire->ire_trace_disable);
+ ip_stack_t *ipst = child->ire_ipst;
+
+ ASSERT(RW_READ_HELD(&ipst->ips_ire_dep_lock));
+ /* Depth first */
+ if (child->ire_dep_children != NULL)
+ ire_dep_increment_children(child->ire_dep_children);
+
+ while (child != NULL) {
+ if (!IRE_IS_CONDEMNED(child))
+ ire_increment_generation(child);
+ child = child->ire_dep_sib_next;
+ }
}
-#endif /* DEBUG */
/*
- * Generate a message chain with an arp request to resolve the in_ire.
- * It is assumed that in_ire itself is currently in the ire cache table,
- * so we create a fake_ire filled with enough information about ire_addr etc.
- * to retrieve in_ire when the DL_UNITDATA response from the resolver
- * comes back. The fake_ire itself is created by calling esballoc with
- * the fr_rtnp (free routine) set to ire_freemblk. This routine will be
- * invoked when the mblk containing fake_ire is freed.
+ * Walk all the children of this ire recursively and increment their
+ * generation number.
*/
void
-ire_arpresolve(ire_t *in_ire)
+ire_dep_incr_generation(ire_t *parent)
{
- areq_t *areq;
- ipaddr_t *addrp;
- mblk_t *ire_mp, *areq_mp;
- ire_t *ire, *buf;
- size_t bufsize;
- frtn_t *frtnp;
- ill_t *dst_ill;
- ip_stack_t *ipst;
+ ip_stack_t *ipst = parent->ire_ipst;
- ASSERT(in_ire->ire_nce != NULL);
+ rw_enter(&ipst->ips_ire_dep_lock, RW_READER);
+ if (parent->ire_dep_children != NULL)
+ ire_dep_increment_children(parent->ire_dep_children);
+ rw_exit(&ipst->ips_ire_dep_lock);
+}
- dst_ill = ire_to_ill(in_ire);
- ipst = dst_ill->ill_ipst;
+/*
+ * Get a new ire_nce_cache for this IRE as well as its nexthop.
+ * Returns zero if it succeeds. Can fail due to lack of memory or when
+ * the route has become unreachable. Returns ENOMEM and ENETUNREACH in those
+ * cases.
+ *
+ * In the in.mpathd case, the ire will have ire_testhidden
+ * set; so we should create the ncec for the underlying ill.
+ *
+ * Note that the error returned by ire_revalidate_nce() is ignored by most
+ * callers except ire_handle_condemned_nce(), which handles the ENETUNREACH
+ * error to mark potentially bad ire's. For all the other callers, an
+ * error return could indicate a transient condition like ENOMEM, or could
+ * be the result of an interface that is going down/unplumbing. In the former
+ * case (transient error), we would leave the old stale ire/ire_nce_cache
+ * in place, and possibly use incorrect link-layer information to send packets
+ * but would eventually recover. In the latter case (ill down/replumb),
+ * ire_revalidate_nce() might return a condemned nce back, but we would then
+ * recover in the packet output path.
+ */
+int
+ire_revalidate_nce(ire_t *ire)
+{
+ nce_t *nce, *old_nce;
+ ire_t *nexthop;
/*
- * Construct message chain for the resolver
- * of the form:
- * ARP_REQ_MBLK-->IRE_MBLK
- *
- * NOTE : If the response does not
- * come back, ARP frees the packet. For this reason,
- * we can't REFHOLD the bucket of save_ire to prevent
- * deletions. We may not be able to REFRELE the bucket
- * if the response never comes back. Thus, before
- * adding the ire, ire_add_v4 will make sure that the
- * interface route does not get deleted. This is the
- * only case unlike ip_newroute_v6, ip_newroute_ipif_v6
- * where we can always prevent deletions because of
- * the synchronous nature of adding IRES i.e
- * ire_add_then_send is called after creating the IRE.
+ * For multicast we conceptually have an NCE but we don't store it
+ * in ire_nce_cache; when ire_to_nce is called we allocate the nce.
*/
+ if (ire->ire_type & IRE_MULTICAST)
+ return (0);
- /*
- * We use esballoc to allocate the second part (IRE_MBLK)
- * of the message chain depicted above. This mblk will be freed
- * by arp when there is a timeout, and otherwise passed to IP
- * and IP will free it after processing the ARP response.
- */
+ /* ire_testhidden should only be set on under-interfaces */
+ ASSERT(!ire->ire_testhidden || !IS_IPMP(ire->ire_ill));
- bufsize = sizeof (ire_t) + sizeof (frtn_t);
- buf = kmem_alloc(bufsize, KM_NOSLEEP);
- if (buf == NULL) {
- ip1dbg(("ire_arpresolve: alloc buffer failed\n"));
- return;
- }
- frtnp = (frtn_t *)(buf + 1);
- frtnp->free_arg = (caddr_t)buf;
- frtnp->free_func = ire_freemblk;
-
- ire_mp = esballoc((unsigned char *)buf, bufsize, BPRI_MED, frtnp);
- if (ire_mp == NULL) {
- ip1dbg(("ire_arpresolve: esballoc failed\n"));
- kmem_free(buf, bufsize);
- return;
+ nexthop = ire_nexthop(ire);
+ if (nexthop == NULL) {
+ /* The route is potentially bad */
+ (void) ire_no_good(ire);
+ return (ENETUNREACH);
}
+ if (ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK)) {
+ ASSERT(ire->ire_ill != NULL);
- areq_mp = copyb(dst_ill->ill_resolver_mp);
- if (areq_mp == NULL) {
- freemsg(ire_mp);
- return;
+ if (ire->ire_ipversion == IPV4_VERSION)
+ nce = nce_lookup_v4(ire->ire_ill, &ire->ire_addr);
+ else
+ nce = nce_lookup_v6(ire->ire_ill, &ire->ire_addr_v6);
+ } else {
+ ASSERT(nexthop->ire_type & IRE_ONLINK);
+ if (ire->ire_ipversion == IPV4_VERSION) {
+ nce = arp_nce_init(nexthop->ire_ill, nexthop->ire_addr,
+ nexthop->ire_type);
+ } else {
+ nce = ndp_nce_init(nexthop->ire_ill,
+ &nexthop->ire_addr_v6, nexthop->ire_type);
+ }
+ }
+ if (nce == NULL) {
+ /*
+ * Leave the old stale one in place to avoid a NULL
+ * ire_nce_cache.
+ */
+ ire_refrele(nexthop);
+ return (ENOMEM);
}
- ire_mp->b_datap->db_type = IRE_ARPRESOLVE_TYPE;
- ire = (ire_t *)buf;
- /*
- * keep enough info in the fake ire so that we can pull up
- * the incomplete ire (in_ire) after result comes back from
- * arp and make it complete.
- */
- *ire = ire_null;
- ire->ire_u = in_ire->ire_u;
- ire->ire_ipif_seqid = in_ire->ire_ipif_seqid;
- ire->ire_ipif_ifindex = in_ire->ire_ipif_ifindex;
- ire->ire_ipif = in_ire->ire_ipif;
- ire->ire_stq = dst_ill->ill_wq;
- ire->ire_stq_ifindex = dst_ill->ill_phyint->phyint_ifindex;
- ire->ire_zoneid = in_ire->ire_zoneid;
- ire->ire_stackid = ipst->ips_netstack->netstack_stackid;
- ire->ire_ipst = ipst;
-
- /*
- * ire_freemblk will be called when ire_mp is freed, both for
- * successful and failed arp resolution. IRE_MARK_UNCACHED will be set
- * when the arp resolution failed.
- */
- ire->ire_marks |= IRE_MARK_UNCACHED;
- ire->ire_mp = ire_mp;
- ire_mp->b_wptr = (uchar_t *)&ire[1];
- ire_mp->b_cont = NULL;
- linkb(areq_mp, ire_mp);
-
- /*
- * Fill in the source and dest addrs for the resolver.
- * NOTE: this depends on memory layouts imposed by
- * ill_init().
- */
- areq = (areq_t *)areq_mp->b_rptr;
- addrp = (ipaddr_t *)((char *)areq + areq->areq_sender_addr_offset);
- *addrp = ire->ire_src_addr;
-
- addrp = (ipaddr_t *)((char *)areq + areq->areq_target_addr_offset);
- if (ire->ire_gateway_addr != INADDR_ANY) {
- *addrp = ire->ire_gateway_addr;
- } else {
- *addrp = ire->ire_addr;
+ if (nexthop != ire) {
+ /* Update the nexthop ire */
+ mutex_enter(&nexthop->ire_lock);
+ old_nce = nexthop->ire_nce_cache;
+ if (!IRE_IS_CONDEMNED(nexthop)) {
+ nce_refhold(nce);
+ nexthop->ire_nce_cache = nce;
+ } else {
+ nexthop->ire_nce_cache = NULL;
+ }
+ mutex_exit(&nexthop->ire_lock);
+ if (old_nce != NULL)
+ nce_refrele(old_nce);
}
+ ire_refrele(nexthop);
- /* Up to the resolver. */
- if (canputnext(dst_ill->ill_rq)) {
- putnext(dst_ill->ill_rq, areq_mp);
+ mutex_enter(&ire->ire_lock);
+ old_nce = ire->ire_nce_cache;
+ if (!IRE_IS_CONDEMNED(ire)) {
+ nce_refhold(nce);
+ ire->ire_nce_cache = nce;
} else {
- freemsg(areq_mp);
+ ire->ire_nce_cache = NULL;
}
+ mutex_exit(&ire->ire_lock);
+ if (old_nce != NULL)
+ nce_refrele(old_nce);
+
+ nce_refrele(nce);
+ return (0);
}
/*
- * Esballoc free function for AR_ENTRY_QUERY request to clean up any
- * unresolved ire_t and/or nce_t structures when ARP resolution fails.
- *
- * This function can be called by ARP via free routine for ire_mp or
- * by IPv4(both host and forwarding path) via ire_delete
- * in case ARP resolution fails.
- * NOTE: Since IP is MT, ARP can call into IP but not vice versa
- * (for IP to talk to ARP, it still has to send AR* messages).
- *
- * Note that the ARP/IP merge should replace the functioanlity by providing
- * direct function calls to clean up unresolved entries in ire/nce lists.
+ * Get a held nce for a given ire.
+ * In the common case this is just from ire_nce_cache.
+ * For IRE_MULTICAST this needs to do an explicit lookup since we do not
+ * have an IRE_MULTICAST per address.
+ * Note that this explicitly returns CONDEMNED NCEs. The caller needs those
+ * so they can check whether the NCE went unreachable (as opposed to was
+ * condemned for some other reason).
*/
-void
-ire_freemblk(ire_t *ire_mp)
+nce_t *
+ire_to_nce(ire_t *ire, ipaddr_t v4nexthop, const in6_addr_t *v6nexthop)
{
- nce_t *nce = NULL;
- ill_t *ill;
- ip_stack_t *ipst;
- netstack_t *ns = NULL;
+ nce_t *nce;
- ASSERT(ire_mp != NULL);
+ if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
+ return (NULL);
- if ((ire_mp->ire_addr == NULL) && (ire_mp->ire_gateway_addr == NULL)) {
- ip1dbg(("ire_freemblk(0x%p) ire_addr is NULL\n",
- (void *)ire_mp));
- goto cleanup;
- }
- if ((ire_mp->ire_marks & IRE_MARK_UNCACHED) == 0) {
- goto cleanup; /* everything succeeded. just free and return */
+ /* ire_testhidden should only be set on under-interfaces */
+ ASSERT(!ire->ire_testhidden || !IS_IPMP(ire->ire_ill));
+
+ mutex_enter(&ire->ire_lock);
+ nce = ire->ire_nce_cache;
+ if (nce != NULL) {
+ nce_refhold(nce);
+ mutex_exit(&ire->ire_lock);
+ return (nce);
}
+ mutex_exit(&ire->ire_lock);
- /*
- * the arp information corresponding to this ire_mp was not
- * transferred to an ire_cache entry. Need
- * to clean up incomplete ire's and nce, if necessary.
- */
- ASSERT(ire_mp->ire_stq != NULL);
- ASSERT(ire_mp->ire_stq_ifindex != 0);
- ASSERT(ire_mp->ire_ipst != NULL);
+ if (ire->ire_type & IRE_MULTICAST) {
+ ASSERT(ire->ire_ill != NULL);
- ns = netstack_find_by_stackid(ire_mp->ire_stackid);
- ipst = (ns ? ns->netstack_ip : NULL);
- if (ipst == NULL || ipst != ire_mp->ire_ipst) /* Disapeared on us */
- goto cleanup;
+ if (ire->ire_ipversion == IPV4_VERSION) {
+ ASSERT(v6nexthop == NULL);
- /*
- * Get any nce's corresponding to this ire_mp. We first have to
- * make sure that the ill is still around.
- */
- ill = ill_lookup_on_ifindex(ire_mp->ire_stq_ifindex,
- B_FALSE, NULL, NULL, NULL, NULL, ipst);
- if (ill == NULL || (ire_mp->ire_stq != ill->ill_wq) ||
- (ill->ill_state_flags & ILL_CONDEMNED)) {
- /*
- * ill went away. no nce to clean up.
- * Note that the ill_state_flags could be set to
- * ILL_CONDEMNED after this point, but if we know
- * that it is CONDEMNED now, we just bail out quickly.
- */
- if (ill != NULL)
- ill_refrele(ill);
- goto cleanup;
+ nce = arp_nce_init(ire->ire_ill, v4nexthop,
+ ire->ire_type);
+ } else {
+ ASSERT(v6nexthop != NULL);
+ ASSERT(v4nexthop == 0);
+ nce = ndp_nce_init(ire->ire_ill, v6nexthop,
+ ire->ire_type);
+ }
+ return (nce);
}
- nce = ndp_lookup_v4(ill,
- ((ire_mp->ire_gateway_addr != INADDR_ANY) ?
- &ire_mp->ire_gateway_addr : &ire_mp->ire_addr),
- B_FALSE);
- ill_refrele(ill);
+ return (NULL);
+}
- if ((nce != NULL) && (nce->nce_state != ND_REACHABLE)) {
- /*
- * some incomplete nce was found.
- */
- DTRACE_PROBE2(ire__freemblk__arp__resolv__fail,
- nce_t *, nce, ire_t *, ire_mp);
- /*
- * Send the icmp_unreachable messages for the queued mblks in
- * ire->ire_nce->nce_qd_mp, since ARP resolution failed
- * for this ire
- */
- arp_resolv_failed(nce);
- /*
- * Delete the nce and clean up all ire's pointing at this nce
- * in the cachetable
- */
- ndp_delete(nce);
- }
- if (nce != NULL)
- NCE_REFRELE(nce); /* release the ref taken by ndp_lookup_v4 */
+nce_t *
+ire_to_nce_pkt(ire_t *ire, mblk_t *mp)
+{
+ ipha_t *ipha;
+ ip6_t *ip6h;
-cleanup:
- if (ns != NULL)
- netstack_rele(ns);
- /*
- * Get rid of the ire buffer
- * We call kmem_free here(instead of ire_delete()), since
- * this is the freeb's callback.
- */
- kmem_free(ire_mp, sizeof (ire_t) + sizeof (frtn_t));
+ if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) {
+ ipha = (ipha_t *)mp->b_rptr;
+ return (ire_to_nce(ire, ipha->ipha_dst, NULL));
+ } else {
+ ip6h = (ip6_t *)mp->b_rptr;
+ return (ire_to_nce(ire, INADDR_ANY, &ip6h->ip6_dst));
+ }
}
/*
- * find, or create if needed, a neighbor cache entry nce_t for IRE_CACHE and
- * non-loopback IRE_BROADCAST ire's.
- *
- * If a neighbor-cache entry has to be created (i.e., one does not already
- * exist in the nce list) the nce_res_mp and nce_state of the neighbor cache
- * entry are initialized in ndp_add_v4(). These values are picked from
- * the src_nce, if one is passed in. Otherwise (if src_nce == NULL) the
- * ire->ire_type and the outgoing interface (ire_to_ill(ire)) values
- * determine the {nce_state, nce_res_mp} of the nce_t created. All
- * IRE_BROADCAST entries have nce_state = ND_REACHABLE, and the nce_res_mp
- * is set to the ill_bcast_mp of the outgoing inerface. For unicast ire
- * entries,
- * - if the outgoing interface is of type IRE_IF_RESOLVER, a newly created
- * nce_t will have a null nce_res_mp, and will be in the ND_INITIAL state.
- * - if the outgoing interface is a IRE_IF_NORESOLVER interface, no link
- * layer resolution is necessary, so that the nce_t will be in the
- * ND_REACHABLE state and the nce_res_mp will have a copy of the
- * ill_resolver_mp of the outgoing interface.
- *
- * The link layer information needed for broadcast addresses, and for
- * packets sent on IRE_IF_NORESOLVER interfaces is a constant mapping that
- * never needs re-verification for the lifetime of the nce_t. These are
- * therefore marked NCE_F_PERMANENT, and never allowed to expire via
- * NCE_EXPIRED.
- *
- * IRE_CACHE ire's contain the information for the nexthop (ire_gateway_addr)
- * in the case of indirect routes, and for the dst itself (ire_addr) in the
- * case of direct routes, with the nce_res_mp containing a template
- * DL_UNITDATA request.
- *
- * The actual association of the ire_nce to the nce created here is
- * typically done in ire_add_v4 for IRE_CACHE entries. Exceptions
- * to this rule are SO_DONTROUTE ire's (IRE_MARK_NO_ADD), for which
- * the ire_nce assignment is done in ire_add_then_send.
+ * Given an IRE_INTERFACE (that matches more than one address) create
+ * and return an IRE_IF_CLONE for the specific address.
+ * Return the generation number.
+ * Returns NULL is no memory for the IRE.
+ * Handles both IPv4 and IPv6.
*/
-int
-ire_nce_init(ire_t *ire, nce_t *src_nce)
+ire_t *
+ire_create_if_clone(ire_t *ire_if, const in6_addr_t *addr, uint_t *generationp)
{
- in_addr_t addr4;
- int err;
- nce_t *nce = NULL;
- ill_t *ire_ill;
- uint16_t nce_flags = 0;
- ip_stack_t *ipst;
-
- if (ire->ire_stq == NULL)
- return (0); /* no need to create nce for local/loopback */
-
- switch (ire->ire_type) {
- case IRE_CACHE:
- if (ire->ire_gateway_addr != INADDR_ANY)
- addr4 = ire->ire_gateway_addr; /* 'G' route */
- else
- addr4 = ire->ire_addr; /* direct route */
- break;
- case IRE_BROADCAST:
- addr4 = ire->ire_addr;
- nce_flags |= (NCE_F_PERMANENT|NCE_F_BCAST);
- break;
- default:
- return (0);
+ ire_t *ire;
+ ire_t *nire;
+
+ if (ire_if->ire_ipversion == IPV4_VERSION) {
+ ipaddr_t v4addr;
+ ipaddr_t mask = IP_HOST_MASK;
+
+ ASSERT(IN6_IS_ADDR_V4MAPPED(addr));
+ IN6_V4MAPPED_TO_IPADDR(addr, v4addr);
+
+ ire = ire_create(
+ (uchar_t *)&v4addr, /* dest address */
+ (uchar_t *)&mask, /* mask */
+ (uchar_t *)&ire_if->ire_gateway_addr,
+ IRE_IF_CLONE, /* IRE type */
+ ire_if->ire_ill,
+ ire_if->ire_zoneid,
+ ire_if->ire_flags | RTF_HOST,
+ NULL, /* No security attr for IRE_IF_ALL */
+ ire_if->ire_ipst);
+ } else {
+ ASSERT(!IN6_IS_ADDR_V4MAPPED(addr));
+ ire = ire_create_v6(
+ addr, /* dest address */
+ &ipv6_all_ones, /* mask */
+ &ire_if->ire_gateway_addr_v6, /* gateway addr */
+ IRE_IF_CLONE, /* IRE type */
+ ire_if->ire_ill,
+ ire_if->ire_zoneid,
+ ire_if->ire_flags | RTF_HOST,
+ NULL, /* No security attr for IRE_IF_ALL */
+ ire_if->ire_ipst);
}
+ if (ire == NULL)
+ return (NULL);
- /*
- * ire_ipif is picked based on RTF_SETSRC, usesrc etc.
- * rules in ire_forward_src_ipif. We want the dlureq_mp
- * for the outgoing interface, which we get from the ire_stq.
- */
- ire_ill = ire_to_ill(ire);
- ipst = ire_ill->ill_ipst;
-
- /*
- * IRE_IF_NORESOLVER entries never need re-verification and
- * do not expire, so we mark them as NCE_F_PERMANENT.
- */
- if (ire_ill->ill_net_type == IRE_IF_NORESOLVER)
- nce_flags |= NCE_F_PERMANENT;
-
-retry_nce:
- err = ndp_lookup_then_add_v4(ire_ill, &addr4, nce_flags,
- &nce, src_nce);
+ /* Take the metrics, in particular the mtu, from the IRE_IF */
+ ire->ire_metrics = ire_if->ire_metrics;
- if (err == EEXIST && NCE_EXPIRED(nce, ipst)) {
- /*
- * We looked up an expired nce.
- * Go back and try to create one again.
- */
- ndp_delete(nce);
- NCE_REFRELE(nce);
- nce = NULL;
- goto retry_nce;
- }
+ nire = ire_add(ire);
+ if (nire == NULL) /* Some failure */
+ return (NULL);
- ip1dbg(("ire 0x%p addr 0x%lx type 0x%x; found nce 0x%p err %d\n",
- (void *)ire, (ulong_t)addr4, ire->ire_type, (void *)nce, err));
+ if (generationp != NULL)
+ *generationp = nire->ire_generation;
- switch (err) {
- case 0:
- case EEXIST:
- /*
- * return a pointer to a newly created or existing nce_t;
- * note that the ire-nce mapping is many-one, i.e.,
- * multiple ire's could point to the same nce_t.
- */
- break;
- default:
- DTRACE_PROBE2(nce__init__fail, ill_t *, ire_ill, int, err);
- return (EINVAL);
- }
/*
- * IRE_BROADCAST ire's must be linked to NCE_F_BCAST nce's and
- * vice-versa (IRE_CACHE <-> unicast nce entries). We may have found an
- * existing unicast (or bcast) nce when trying to add a BROADCAST (or
- * unicast) ire, e.g., when address/netmask modifications were in
- * progress, and the ipif_ndp_down() call to quiesce existing state
- * during the addr/mask modification may have skipped the ndp_delete()
- * because the ipif being affected was not the last one on the ill. We
- * recover from the missed ndp_delete() now, by deleting the old nce and
- * adding a new one with the correct NCE_F_BCAST state.
+ * Make sure races don't add a duplicate by
+ * catching the case when an identical was returned.
*/
- if (ire->ire_type == IRE_BROADCAST) {
- if ((nce->nce_flags & NCE_F_BCAST) == 0) {
- /* IRE_BROADCAST needs NCE_F_BCAST */
- ndp_delete(nce);
- NCE_REFRELE(nce);
- goto retry_nce;
- }
- /*
- * Two bcast ires are created for each interface;
- * 1. loopback copy (which does not have an
- * ire_stq, and therefore has no ire_nce), and,
- * 2. the non-loopback copy, which has the nce_res_mp
- * initialized to a copy of the ill_bcast_mp, and
- * is marked as ND_REACHABLE at this point.
- * This nce does not undergo any further state changes,
- * and exists as long as the interface is plumbed.
- * Note: the assignment of ire_nce here is a historical
- * artifact of old code that used to inline ire_add().
- */
- ire->ire_nce = nce;
- /*
- * We are associating this nce to the ire,
- * so change the nce ref taken in
- * ndp_lookup_then_add_v4() from
- * NCE_REFHOLD to NCE_REFHOLD_NOTR
- */
- NCE_REFHOLD_TO_REFHOLD_NOTR(ire->ire_nce);
- } else {
- if ((nce->nce_flags & NCE_F_BCAST) != 0) {
- /* IRE_CACHE needs unicast nce */
- ndp_delete(nce);
- NCE_REFRELE(nce);
- goto retry_nce;
- }
- /*
- * We are not using this nce_t just yet so release
- * the ref taken in ndp_lookup_then_add_v4()
- */
- NCE_REFRELE(nce);
+ if (nire != ire) {
+ ASSERT(nire->ire_identical_ref > 1);
+ ire_delete(nire);
}
- return (0);
+ return (nire);
}
/*
- * This is the implementation of the IPv4 IRE cache lookup procedure.
- * Separating the interface from the implementation allows additional
- * flexibility when specifying search criteria.
+ * The argument is an IRE_INTERFACE. Delete all of IRE_IF_CLONE in the
+ * ire_dep_children (just walk the ire_dep_sib_next since they are all
+ * immediate children.)
+ * Since we hold a lock while we remove them we need to defer the actual
+ * calls to ire_delete() until we have dropped the lock. This makes things
+ * less efficient since we restart at the top after dropping the lock. But
+ * we only run when an IRE_INTERFACE is deleted which is infrquent.
+ *
+ * Note that ire_dep_children can be any mixture of offlink routes and
+ * IRE_IF_CLONE entries.
*/
-static ire_t *
-ip4_ctable_lookup_impl(ire_ctable_args_t *margs)
+void
+ire_dep_delete_if_clone(ire_t *parent)
{
- irb_t *irb_ptr;
- ire_t *ire;
- ip_stack_t *ipst = margs->ict_ipst;
+ ip_stack_t *ipst = parent->ire_ipst;
+ ire_t *child, *next;
- if ((margs->ict_flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) &&
- (margs->ict_ipif == NULL)) {
- return (NULL);
+restart:
+ rw_enter(&ipst->ips_ire_dep_lock, RW_READER);
+ if (parent->ire_dep_children == NULL) {
+ rw_exit(&ipst->ips_ire_dep_lock);
+ return;
}
-
- irb_ptr = &ipst->ips_ip_cache_table[IRE_ADDR_HASH(
- *((ipaddr_t *)margs->ict_addr), ipst->ips_ip_cache_table_size)];
- rw_enter(&irb_ptr->irb_lock, RW_READER);
- for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) {
- if (ire->ire_marks & IRE_MARK_CONDEMNED)
- continue;
- ASSERT(ire->ire_mask == IP_HOST_MASK);
- if (ire_match_args(ire, *((ipaddr_t *)margs->ict_addr),
- ire->ire_mask, *((ipaddr_t *)margs->ict_gateway),
- margs->ict_type, margs->ict_ipif, margs->ict_zoneid, 0,
- margs->ict_tsl, margs->ict_flags, margs->ict_wq)) {
- IRE_REFHOLD(ire);
- rw_exit(&irb_ptr->irb_lock);
- return (ire);
+ child = parent->ire_dep_children;
+ while (child != NULL) {
+ next = child->ire_dep_sib_next;
+ if ((child->ire_type & IRE_IF_CLONE) &&
+ !IRE_IS_CONDEMNED(child)) {
+ ire_refhold(child);
+ rw_exit(&ipst->ips_ire_dep_lock);
+ ire_delete(child);
+ ASSERT(IRE_IS_CONDEMNED(child));
+ ire_refrele(child);
+ goto restart;
}
+ child = next;
}
-
- rw_exit(&irb_ptr->irb_lock);
- return (NULL);
+ rw_exit(&ipst->ips_ire_dep_lock);
}
/*
- * This function locates IRE_CACHE entries which were added by the
- * ire_forward() path. We can fully specify the IRE we are looking for by
- * providing the ipif (MATCH_IRE_IPIF) *and* the stq (MATCH_IRE_WQ).
+ * ire_pref() is used in recursive route-resolution for a destination to
+ * determine the preference of an ire, where "preference" is determined
+ * based on the level of indirection to the destination of the ire.
+ * A higher preference indicates that fewer lookups are needed to complete
+ * recursive route lookup. Thus
+ * ire_pref(RTF_INDIRECT) < ire_pref(IRE_IF_RESOLVER) < ire_pref(IRE_PREF_CLONE)
*/
-ire_t *
-ire_arpresolve_lookup(ipaddr_t addr, ipaddr_t gw, ipif_t *ipif,
- zoneid_t zoneid, ip_stack_t *ipst, queue_t *wq)
-{
- ire_ctable_args_t margs;
-
- margs.ict_addr = &addr;
- margs.ict_gateway = &gw;
- margs.ict_type = IRE_CACHE;
- margs.ict_ipif = ipif;
- margs.ict_zoneid = zoneid;
- margs.ict_tsl = NULL;
- margs.ict_flags = MATCH_IRE_GW | MATCH_IRE_IPIF | MATCH_IRE_ZONEONLY |
- MATCH_IRE_TYPE | MATCH_IRE_WQ;
- margs.ict_ipst = ipst;
- margs.ict_wq = wq;
-
- return (ip4_ctable_lookup_impl(&margs));
+int
+ire_pref(ire_t *ire)
+{
+ if (ire->ire_flags & RTF_INDIRECT)
+ return (1);
+ if (ire->ire_type & IRE_OFFLINK)
+ return (2);
+ if (ire->ire_type & (IRE_IF_RESOLVER|IRE_IF_NORESOLVER))
+ return (3);
+ if (ire->ire_type & IRE_IF_CLONE)
+ return (4);
+ if (ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST))
+ return (5);
+ return (-1); /* unknown ire_type */
}
diff --git a/usr/src/uts/common/inet/ip/ip_mroute.c b/usr/src/uts/common/inet/ip/ip_mroute.c
index 5418c2d8d4..41f4f3f221 100644
--- a/usr/src/uts/common/inet/ip/ip_mroute.c
+++ b/usr/src/uts/common/inet/ip/ip_mroute.c
@@ -1,8 +1,4 @@
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
@@ -23,8 +19,8 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc.
- * All rights reserved. Use is subject to license terms.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -65,6 +61,7 @@
#include <netinet/in.h>
#include <net/if_dl.h>
+#include <inet/ipsec_impl.h>
#include <inet/common.h>
#include <inet/mi.h>
#include <inet/nd.h>
@@ -79,6 +76,7 @@
#include <netinet/ip_mroute.h>
#include <inet/ip_multi.h>
#include <inet/ip_ire.h>
+#include <inet/ip_ndp.h>
#include <inet/ip_if.h>
#include <inet/ipclassifier.h>
@@ -98,7 +96,7 @@
* is when the vif is marked VIF_MARK_NOTINUSE but refcnt != 0. This indicates
* that vif is being initalized.
* Each structure is freed when the refcnt goes down to zero. If a delete comes
- * in when the the recfnt is > 1, the vif structure is marked VIF_MARK_CONDEMNED
+ * in when the recfnt is > 1, the vif structure is marked VIF_MARK_CONDEMNED
* which prevents the struct from further use. When the refcnt goes to zero
* the struct is freed and is marked VIF_MARK_NOTINUSE.
* vif struct stores a pointer to the ipif in v_ipif, to prevent ipif/ill
@@ -171,9 +169,9 @@
/* Function declarations */
static int add_mfc(struct mfcctl *, ip_stack_t *);
-static int add_vif(struct vifctl *, conn_t *, mblk_t *, ip_stack_t *);
+static int add_vif(struct vifctl *, conn_t *, ip_stack_t *);
static int del_mfc(struct mfcctl *, ip_stack_t *);
-static int del_vif(vifi_t *, conn_t *, mblk_t *, ip_stack_t *);
+static int del_vif(vifi_t *, ip_stack_t *);
static void del_vifp(struct vif *);
static void encap_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
static void expire_upcalls(void *);
@@ -188,7 +186,7 @@ static int ip_mdq(mblk_t *, ipha_t *, ill_t *,
ipaddr_t, struct mfc *);
static int ip_mrouter_init(conn_t *, uchar_t *, int, ip_stack_t *);
static void phyint_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
-static int register_mforward(queue_t *, mblk_t *, ill_t *);
+static int register_mforward(mblk_t *, ip_recv_attr_t *);
static void register_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
static int set_assert(int *, ip_stack_t *);
@@ -331,10 +329,9 @@ static ipha_t multicast_encap_iphdr = {
* Handle MRT setsockopt commands to modify the multicast routing tables.
*/
int
-ip_mrouter_set(int cmd, queue_t *q, int checkonly, uchar_t *data,
- int datalen, mblk_t *first_mp)
+ip_mrouter_set(int cmd, conn_t *connp, int checkonly, uchar_t *data,
+ int datalen)
{
- conn_t *connp = Q_TO_CONN(q);
ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
@@ -376,11 +373,9 @@ ip_mrouter_set(int cmd, queue_t *q, int checkonly, uchar_t *data,
switch (cmd) {
case MRT_INIT: return (ip_mrouter_init(connp, data, datalen, ipst));
- case MRT_DONE: return (ip_mrouter_done(first_mp, ipst));
- case MRT_ADD_VIF: return (add_vif((struct vifctl *)data, connp,
- first_mp, ipst));
- case MRT_DEL_VIF: return (del_vif((vifi_t *)data, connp, first_mp,
- ipst));
+ case MRT_DONE: return (ip_mrouter_done(ipst));
+ case MRT_ADD_VIF: return (add_vif((struct vifctl *)data, connp, ipst));
+ case MRT_DEL_VIF: return (del_vif((vifi_t *)data, ipst));
case MRT_ADD_MFC: return (add_mfc((struct mfcctl *)data, ipst));
case MRT_DEL_MFC: return (del_mfc((struct mfcctl *)data, ipst));
case MRT_ASSERT: return (set_assert((int *)data, ipst));
@@ -392,9 +387,8 @@ ip_mrouter_set(int cmd, queue_t *q, int checkonly, uchar_t *data,
* Handle MRT getsockopt commands
*/
int
-ip_mrouter_get(int cmd, queue_t *q, uchar_t *data)
+ip_mrouter_get(int cmd, conn_t *connp, uchar_t *data)
{
- conn_t *connp = Q_TO_CONN(q);
ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
if (connp != ipst->ips_ip_g_mrouter)
@@ -611,7 +605,7 @@ ip_mrouter_stack_init(ip_stack_t *ipst)
* Didn't use global timeout_val (BSD version), instead check the mfctable.
*/
int
-ip_mrouter_done(mblk_t *mp, ip_stack_t *ipst)
+ip_mrouter_done(ip_stack_t *ipst)
{
conn_t *mrouter;
vifi_t vifi;
@@ -665,47 +659,19 @@ ip_mrouter_done(mblk_t *mp, ip_stack_t *ipst)
/* Phyint only */
if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
ipif_t *ipif = vifp->v_ipif;
- ipsq_t *ipsq;
- boolean_t suc;
- ill_t *ill;
+ ilm_t *ilm = vifp->v_ilm;
- ill = ipif->ipif_ill;
- suc = B_FALSE;
- if (mp == NULL) {
- /*
- * being called from ip_close,
- * lets do it synchronously.
- * Clear VIF_MARK_GOOD and
- * set VIF_MARK_CONDEMNED.
- */
- vifp->v_marks &= ~VIF_MARK_GOOD;
- vifp->v_marks |= VIF_MARK_CONDEMNED;
- mutex_exit(&(vifp)->v_lock);
- suc = ipsq_enter(ill, B_FALSE, NEW_OP);
- ipsq = ill->ill_phyint->phyint_ipsq;
- } else {
- ipsq = ipsq_try_enter(ipif, NULL,
- mrouter->conn_wq, mp,
- ip_restart_optmgmt, NEW_OP, B_TRUE);
- if (ipsq == NULL) {
- mutex_exit(&(vifp)->v_lock);
- ipif_refrele(ipif);
- return (EINPROGRESS);
- }
- /*
- * Clear VIF_MARK_GOOD and
- * set VIF_MARK_CONDEMNED.
- */
- vifp->v_marks &= ~VIF_MARK_GOOD;
- vifp->v_marks |= VIF_MARK_CONDEMNED;
- mutex_exit(&(vifp)->v_lock);
- suc = B_TRUE;
- }
+ vifp->v_ilm = NULL;
+ vifp->v_marks &= ~VIF_MARK_GOOD;
+ vifp->v_marks |= VIF_MARK_CONDEMNED;
- if (suc) {
- (void) ip_delmulti(INADDR_ANY, ipif,
- B_TRUE, B_TRUE);
- ipsq_exit(ipsq);
+ mutex_exit(&(vifp)->v_lock);
+ if (ilm != NULL) {
+ ill_t *ill = ipif->ipif_ill;
+
+ (void) ip_delmulti(ilm);
+ ASSERT(ill->ill_mrouter_cnt > 0);
+ atomic_dec_32(&ill->ill_mrouter_cnt);
}
mutex_enter(&vifp->v_lock);
}
@@ -866,14 +832,15 @@ lock_good_vif(struct vif *vifp)
* Add a vif to the vif table.
*/
static int
-add_vif(struct vifctl *vifcp, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst)
+add_vif(struct vifctl *vifcp, conn_t *connp, ip_stack_t *ipst)
{
struct vif *vifp = ipst->ips_vifs + vifcp->vifc_vifi;
ipif_t *ipif;
- int error;
+ int error = 0;
struct tbf *v_tbf = ipst->ips_tbfs + vifcp->vifc_vifi;
- ipsq_t *ipsq;
conn_t *mrouter = ipst->ips_ip_g_mrouter;
+ ilm_t *ilm;
+ ill_t *ill;
ASSERT(connp != NULL);
@@ -913,28 +880,12 @@ add_vif(struct vifctl *vifcp, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst)
mutex_exit(&vifp->v_lock);
/* Find the interface with the local address */
ipif = ipif_lookup_addr((ipaddr_t)vifcp->vifc_lcl_addr.s_addr, NULL,
- connp->conn_zoneid, CONNP_TO_WQ(connp), first_mp,
- ip_restart_optmgmt, &error, ipst);
+ IPCL_ZONEID(connp), ipst);
if (ipif == NULL) {
VIF_REFRELE(vifp);
- if (error == EINPROGRESS)
- return (error);
return (EADDRNOTAVAIL);
}
- /*
- * We have to be exclusive as we have to call ip_addmulti()
- * This is the best position to try to be exclusive in case
- * we have to wait.
- */
- ipsq = ipsq_try_enter(ipif, NULL, CONNP_TO_WQ(connp), first_mp,
- ip_restart_optmgmt, NEW_OP, B_TRUE);
- if ((ipsq) == NULL) {
- VIF_REFRELE(vifp);
- ipif_refrele(ipif);
- return (EINPROGRESS);
- }
-
if (ipst->ips_ip_mrtdebug > 1) {
(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
"add_vif: src 0x%x enter",
@@ -959,7 +910,6 @@ add_vif(struct vifctl *vifcp, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst)
"add_vif: source route tunnels not supported\n");
VIF_REFRELE_LOCKED(vifp);
ipif_refrele(ipif);
- ipsq_exit(ipsq);
return (EOPNOTSUPP);
}
vifp->v_rmt_addr = vifcp->vifc_rmt_addr;
@@ -981,7 +931,6 @@ add_vif(struct vifctl *vifcp, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst)
mutex_exit(&ipst->ips_numvifs_mutex);
VIF_REFRELE_LOCKED(vifp);
ipif_refrele(ipif);
- ipsq_exit(ipsq);
return (EADDRINUSE);
}
}
@@ -995,22 +944,39 @@ add_vif(struct vifctl *vifcp, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst)
ipst->ips_reg_vif_num = ALL_VIFS;
mutex_exit(&ipst->ips_numvifs_mutex);
}
- ipsq_exit(ipsq);
return (EOPNOTSUPP);
}
/* Enable promiscuous reception of all IP mcasts from the if */
mutex_exit(&vifp->v_lock);
- error = ip_addmulti(INADDR_ANY, ipif, ILGSTAT_NONE,
- MODE_IS_EXCLUDE, NULL);
+
+ ill = ipif->ipif_ill;
+ if (IS_UNDER_IPMP(ill))
+ ill = ipmp_ill_hold_ipmp_ill(ill);
+
+ if (ill == NULL) {
+ ilm = NULL;
+ } else {
+ ilm = ip_addmulti(&ipv6_all_zeros, ill,
+ ipif->ipif_zoneid, &error);
+ if (ilm != NULL)
+ atomic_inc_32(&ill->ill_mrouter_cnt);
+ if (IS_UNDER_IPMP(ipif->ipif_ill)) {
+ ill_refrele(ill);
+ ill = ipif->ipif_ill;
+ }
+ }
+
mutex_enter(&vifp->v_lock);
/*
* since we released the lock lets make sure that
* ip_mrouter_done() has not been called.
*/
- if (error != 0 || is_mrouter_off(ipst)) {
- if (error == 0)
- (void) ip_delmulti(INADDR_ANY, ipif, B_TRUE,
- B_TRUE);
+ if (ilm == NULL || is_mrouter_off(ipst)) {
+ if (ilm != NULL) {
+ (void) ip_delmulti(ilm);
+ ASSERT(ill->ill_mrouter_cnt > 0);
+ atomic_dec_32(&ill->ill_mrouter_cnt);
+ }
if (vifcp->vifc_flags & VIFF_REGISTER) {
mutex_enter(&ipst->ips_numvifs_mutex);
ipst->ips_reg_vif_num = ALL_VIFS;
@@ -1018,9 +984,9 @@ add_vif(struct vifctl *vifcp, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst)
}
VIF_REFRELE_LOCKED(vifp);
ipif_refrele(ipif);
- ipsq_exit(ipsq);
return (error?error:EINVAL);
}
+ vifp->v_ilm = ilm;
}
/* Define parameters for the tbf structure */
vifp->v_tbf = v_tbf;
@@ -1063,7 +1029,6 @@ add_vif(struct vifctl *vifcp, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst)
vifp->v_marks = VIF_MARK_GOOD;
mutex_exit(&vifp->v_lock);
- ipsq_exit(ipsq);
return (0);
}
@@ -1131,10 +1096,9 @@ del_vifp(struct vif *vifp)
}
static int
-del_vif(vifi_t *vifip, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst)
+del_vif(vifi_t *vifip, ip_stack_t *ipst)
{
struct vif *vifp = ipst->ips_vifs + *vifip;
- ipsq_t *ipsq;
if (*vifip >= ipst->ips_numvifs)
return (EINVAL);
@@ -1151,41 +1115,6 @@ del_vif(vifi_t *vifip, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst)
return (EADDRNOTAVAIL);
}
- /*
- * This is an optimization, if first_mp == NULL
- * than we are being called from reset_mrt_vif_ipif()
- * so we already have exclusive access to the ipsq.
- * the ASSERT below is a check for this condition.
- */
- if (first_mp != NULL &&
- !(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
- ASSERT(connp != NULL);
- /*
- * We have to be exclusive as we have to call ip_delmulti()
- * This is the best position to try to be exclusive in case
- * we have to wait.
- */
- ipsq = ipsq_try_enter(vifp->v_ipif, NULL, CONNP_TO_WQ(connp),
- first_mp, ip_restart_optmgmt, NEW_OP, B_TRUE);
- if ((ipsq) == NULL) {
- mutex_exit(&vifp->v_lock);
- return (EINPROGRESS);
- }
- /* recheck after being exclusive */
- if (vifp->v_lcl_addr.s_addr == 0 ||
- !vifp->v_marks & VIF_MARK_GOOD) {
- /*
- * someone beat us.
- */
- mutex_exit(&vifp->v_lock);
- ipsq_exit(ipsq);
- return (EADDRNOTAVAIL);
- }
- }
-
-
- ASSERT(IAM_WRITER_IPIF(vifp->v_ipif));
-
/* Clear VIF_MARK_GOOD and set VIF_MARK_CONDEMNED. */
vifp->v_marks &= ~VIF_MARK_GOOD;
vifp->v_marks |= VIF_MARK_CONDEMNED;
@@ -1193,18 +1122,30 @@ del_vif(vifi_t *vifip, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst)
/* Phyint only */
if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
ipif_t *ipif = vifp->v_ipif;
+ ilm_t *ilm = vifp->v_ilm;
+
+ vifp->v_ilm = NULL;
+
ASSERT(ipif != NULL);
/*
* should be OK to drop the lock as we
* have marked this as CONDEMNED.
*/
mutex_exit(&(vifp)->v_lock);
- (void) ip_delmulti(INADDR_ANY, ipif, B_TRUE, B_TRUE);
- if (first_mp != NULL)
- ipsq_exit(ipsq);
+ if (ilm != NULL) {
+ (void) ip_delmulti(ilm);
+ ASSERT(ipif->ipif_ill->ill_mrouter_cnt > 0);
+ atomic_dec_32(&ipif->ipif_ill->ill_mrouter_cnt);
+ }
mutex_enter(&(vifp)->v_lock);
}
+ if (vifp->v_flags & VIFF_REGISTER) {
+ mutex_enter(&ipst->ips_numvifs_mutex);
+ ipst->ips_reg_vif_num = ALL_VIFS;
+ mutex_exit(&ipst->ips_numvifs_mutex);
+ }
+
/*
* decreases the refcnt added in add_vif.
*/
@@ -1584,16 +1525,21 @@ del_mfc(struct mfcctl *mfccp, ip_stack_t *ipst)
* 1 - pkt came in on tunnel
*/
int
-ip_mforward(ill_t *ill, ipha_t *ipha, mblk_t *mp)
+ip_mforward(mblk_t *mp, ip_recv_attr_t *ira)
{
+ ipha_t *ipha = (ipha_t *)mp->b_rptr;
+ ill_t *ill = ira->ira_ill;
struct mfc *rt;
ipaddr_t src, dst, tunnel_src = 0;
static int srctun = 0;
vifi_t vifi;
boolean_t pim_reg_packet = B_FALSE;
- struct mfcb *mfcbp;
+ struct mfcb *mfcbp;
ip_stack_t *ipst = ill->ill_ipst;
conn_t *mrouter = ipst->ips_ip_g_mrouter;
+ ill_t *rill = ira->ira_rill;
+
+ ASSERT(ira->ira_pktlen == msgdsize(mp));
if (ipst->ips_ip_mrtdebug > 1) {
(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
@@ -1603,10 +1549,10 @@ ip_mforward(ill_t *ill, ipha_t *ipha, mblk_t *mp)
}
dst = ipha->ipha_dst;
- if ((uint32_t)(uintptr_t)mp->b_prev == PIM_REGISTER_MARKER)
+ if (ira->ira_flags & IRAF_PIM_REGISTER)
pim_reg_packet = B_TRUE;
- else
- tunnel_src = (ipaddr_t)(uintptr_t)mp->b_prev;
+ else if (ira->ira_flags & IRAF_MROUTE_TUNNEL_SET)
+ tunnel_src = ira->ira_mroute_tunnel;
/*
* Don't forward a packet with time-to-live of zero or one,
@@ -1620,7 +1566,6 @@ ip_mforward(ill_t *ill, ipha_t *ipha, mblk_t *mp)
" dst 0x%x ill %s",
ipha->ipha_ttl, ntohl(dst), ill->ill_name);
}
- mp->b_prev = NULL;
if (tunnel_src != 0)
return (1);
else
@@ -1630,10 +1575,8 @@ ip_mforward(ill_t *ill, ipha_t *ipha, mblk_t *mp)
if ((tunnel_src != 0) || pim_reg_packet) {
/*
* Packet arrived over an encapsulated tunnel or via a PIM
- * register message. Both ip_mroute_decap() and pim_input()
- * encode information in mp->b_prev.
+ * register message.
*/
- mp->b_prev = NULL;
if (ipst->ips_ip_mrtdebug > 1) {
if (tunnel_src != 0) {
(void) mi_strlog(mrouter->conn_rq, 1,
@@ -1926,10 +1869,16 @@ ip_mforward(ill_t *ill, ipha_t *ipha, mblk_t *mp)
mutex_exit(&mfc_rt->mfc_mutex);
mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
/* Pass to RAWIP */
- (mrouter->conn_recv)(mrouter, mp_copy, NULL);
+ ira->ira_ill = ira->ira_rill = NULL;
+ (mrouter->conn_recv)(mrouter, mp_copy, NULL, ira);
+ ira->ira_ill = ill;
+ ira->ira_rill = rill;
} else {
mutex_exit(&mfc_rt->mfc_mutex);
mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ip_mforward - upcall already waiting",
+ mp_copy, ill);
freemsg(mp_copy);
}
@@ -1945,8 +1894,11 @@ ip_mforward(ill_t *ill, ipha_t *ipha, mblk_t *mp)
mi_free((char *)mfc_rt);
if (rte != NULL)
mi_free((char *)rte);
- if (mp_copy != NULL)
+ if (mp_copy != NULL) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ip_mforward error", mp_copy, ill);
freemsg(mp_copy);
+ }
if (mp0 != NULL)
freemsg(mp0);
return (-1);
@@ -2023,7 +1975,6 @@ static int
ip_mdq(mblk_t *mp, ipha_t *ipha, ill_t *ill, ipaddr_t tunnel_src,
struct mfc *rt)
{
- ill_t *vill;
vifi_t vifi;
struct vif *vifp;
ipaddr_t dst = ipha->ipha_dst;
@@ -2031,6 +1982,7 @@ ip_mdq(mblk_t *mp, ipha_t *ipha, ill_t *ill, ipaddr_t tunnel_src,
vifi_t num_of_vifs;
ip_stack_t *ipst = ill->ill_ipst;
conn_t *mrouter = ipst->ips_ip_g_mrouter;
+ ip_recv_attr_t iras;
if (ipst->ips_ip_mrtdebug > 1) {
(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
@@ -2091,19 +2043,19 @@ ip_mdq(mblk_t *mp, ipha_t *ipha, ill_t *ill, ipaddr_t tunnel_src,
* Don't forward if it didn't arrive from the parent vif for its
* origin.
*/
- vill = ipst->ips_vifs[vifi].v_ipif->ipif_ill;
- if ((vill != ill && !IS_IN_SAME_ILLGRP(vill, ill)) ||
+ if ((ipst->ips_vifs[vifi].v_ipif->ipif_ill != ill) ||
(ipst->ips_vifs[vifi].v_rmt_addr.s_addr != tunnel_src)) {
/* Came in the wrong interface */
ip1dbg(("ip_mdq: arrived wrong if, vifi %d "
"numvifs %d ill %s viftable ill %s\n",
(int)vifi, (int)ipst->ips_numvifs, ill->ill_name,
- vill->ill_name));
+ ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name));
if (ipst->ips_ip_mrtdebug > 1) {
(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
"ip_mdq: arrived wrong if, vifi %d ill "
"%s viftable ill %s\n",
- (int)vifi, ill->ill_name, vill->ill_name);
+ (int)vifi, ill->ill_name,
+ ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name);
}
ipst->ips_mrtstat->mrts_wrong_if++;
rt->mfc_wrong_if++;
@@ -2137,7 +2089,14 @@ ip_mdq(mblk_t *mp, ipha_t *ipha, ill_t *ill, ipaddr_t tunnel_src,
im->im_mbz = 0;
im->im_vif = (ushort_t)vifi;
/* Pass to RAWIP */
- (mrouter->conn_recv)(mrouter, mp_copy, NULL);
+
+ bzero(&iras, sizeof (iras));
+ iras.ira_flags = IRAF_IS_IPV4;
+ iras.ira_ip_hdr_length =
+ IPH_HDR_LENGTH(mp_copy->b_rptr);
+ iras.ira_pktlen = msgdsize(mp_copy);
+ (mrouter->conn_recv)(mrouter, mp_copy, NULL, &iras);
+ ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
}
unlock_good_vif(&ipst->ips_vifs[vifi]);
if (tunnel_src != 0)
@@ -2239,8 +2198,10 @@ register_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
struct igmpmsg *im;
mblk_t *mp_copy;
ipha_t *ipha_copy;
- ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
+ ill_t *ill = vifp->v_ipif->ipif_ill;
+ ip_stack_t *ipst = ill->ill_ipst;
conn_t *mrouter = ipst->ips_ip_g_mrouter;
+ ip_recv_attr_t iras;
if (ipst->ips_ip_mrtdebug > 1) {
(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
@@ -2307,16 +2268,24 @@ register_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
im->im_mbz = 0;
++ipst->ips_mrtstat->mrts_upcalls;
- if (!canputnext(mrouter->conn_rq)) {
+ if (IPCL_IS_NONSTR(mrouter) ? mrouter->conn_flow_cntrld :
+ !canputnext(mrouter->conn_rq)) {
++ipst->ips_mrtstat->mrts_pim_regsend_drops;
if (ipst->ips_ip_mrtdebug > 3) {
(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
"register_send: register upcall failure.");
}
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("mrts_pim_regsend_drops", mp_copy, ill);
freemsg(mp_copy);
} else {
/* Pass to RAWIP */
- (mrouter->conn_recv)(mrouter, mp_copy, NULL);
+ bzero(&iras, sizeof (iras));
+ iras.ira_flags = IRAF_IS_IPV4;
+ iras.ira_ip_hdr_length = sizeof (ipha_t);
+ iras.ira_pktlen = msgdsize(mp_copy);
+ (mrouter->conn_recv)(mrouter, mp_copy, NULL, &iras);
+ ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
}
}
@@ -2349,18 +2318,22 @@ pim_validate_cksum(mblk_t *mp, ipha_t *ip, struct pim *pimp)
}
/*
- * int
- * pim_input(queue_t *, mblk_t *, ill_t *ill) - Process PIM protocol packets.
- * IP Protocol 103. Register messages are decapsulated and sent
- * onto multicast forwarding.
+ * Process PIM protocol packets i.e. IP Protocol 103.
+ * Register messages are decapsulated and sent onto multicast forwarding.
+ *
+ * Return NULL for a bad packet that is discarded here.
+ * Return mp if the message is OK and should be handed to "raw" receivers.
+ * Callers of pim_input() may need to reinitialize variables that were copied
+ * from the mblk as this calls pullupmsg().
*/
-int
-pim_input(queue_t *q, mblk_t *mp, ill_t *ill)
+mblk_t *
+pim_input(mblk_t *mp, ip_recv_attr_t *ira)
{
ipha_t *eip, *ip;
int iplen, pimlen, iphlen;
struct pim *pimp; /* pointer to a pim struct */
uint32_t *reghdr;
+ ill_t *ill = ira->ira_ill;
ip_stack_t *ipst = ill->ill_ipst;
conn_t *mrouter = ipst->ips_ip_g_mrouter;
@@ -2369,8 +2342,10 @@ pim_input(queue_t *q, mblk_t *mp, ill_t *ill)
*/
if (pullupmsg(mp, -1) == 0) {
++ipst->ips_mrtstat->mrts_pim_nomemory;
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("mrts_pim_nomemory", mp, ill);
freemsg(mp);
- return (-1);
+ return (NULL);
}
ip = (ipha_t *)mp->b_rptr;
@@ -2387,8 +2362,10 @@ pim_input(queue_t *q, mblk_t *mp, ill_t *ill)
(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
"pim_input: length not at least minlen");
}
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("mrts_pim_malformed", mp, ill);
freemsg(mp);
- return (-1);
+ return (NULL);
}
/*
@@ -2405,8 +2382,10 @@ pim_input(queue_t *q, mblk_t *mp, ill_t *ill)
(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
"pim_input: unknown version of PIM");
}
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("mrts_pim_badversion", mp, ill);
freemsg(mp);
- return (-1);
+ return (NULL);
}
/*
@@ -2418,12 +2397,14 @@ pim_input(queue_t *q, mblk_t *mp, ill_t *ill)
(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
"pim_input: invalid checksum");
}
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("pim_rcv_badcsum", mp, ill);
freemsg(mp);
- return (-1);
+ return (NULL);
}
if (pimp->pim_type != PIM_REGISTER)
- return (0);
+ return (mp);
reghdr = (uint32_t *)(pimp + 1);
eip = (ipha_t *)(reghdr + 1);
@@ -2437,8 +2418,10 @@ pim_input(queue_t *q, mblk_t *mp, ill_t *ill)
(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
"pim_input: Inner pkt not mcast .. !");
}
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("mrts_pim_badregisters", mp, ill);
freemsg(mp);
- return (-1);
+ return (NULL);
}
if (ipst->ips_ip_mrtdebug > 1) {
(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
@@ -2450,27 +2433,36 @@ pim_input(queue_t *q, mblk_t *mp, ill_t *ill)
/*
* If the null register bit is not set, decapsulate
* the packet before forwarding it.
+ * Avoid this when there is no register vif.
*/
- if (!(ntohl(*reghdr) & PIM_NULL_REGISTER)) {
+ if (!(ntohl(*reghdr) & PIM_NULL_REGISTER) &&
+ ipst->ips_reg_vif_num != ALL_VIFS) {
mblk_t *mp_copy;
+ uint_t saved_pktlen;
/* Copy the message */
if ((mp_copy = copymsg(mp)) == NULL) {
++ipst->ips_mrtstat->mrts_pim_nomemory;
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("mrts_pim_nomemory", mp, ill);
freemsg(mp);
- return (-1);
+ return (NULL);
}
/*
* Decapsulate the packet and give it to
* register_mforward.
*/
- mp_copy->b_rptr += iphlen + sizeof (pim_t) +
- sizeof (*reghdr);
- if (register_mforward(q, mp_copy, ill) != 0) {
+ mp_copy->b_rptr += iphlen + sizeof (pim_t) + sizeof (*reghdr);
+ saved_pktlen = ira->ira_pktlen;
+ ira->ira_pktlen -= iphlen + sizeof (pim_t) + sizeof (*reghdr);
+ if (register_mforward(mp_copy, ira) != 0) {
+ /* register_mforward already called ip_drop_input */
freemsg(mp);
- return (-1);
+ ira->ira_pktlen = saved_pktlen;
+ return (NULL);
}
+ ira->ira_pktlen = saved_pktlen;
}
/*
@@ -2478,7 +2470,7 @@ pim_input(queue_t *q, mblk_t *mp, ill_t *ill)
* PIM socket. For Solaris it is done right after pim_input() is
* called.
*/
- return (0);
+ return (mp);
}
/*
@@ -2486,38 +2478,52 @@ pim_input(queue_t *q, mblk_t *mp, ill_t *ill)
* the packet. Loop back the packet, as if we have received it.
* In pim_input() we have to check if the destination is a multicast address.
*/
-/* ARGSUSED */
static int
-register_mforward(queue_t *q, mblk_t *mp, ill_t *ill)
+register_mforward(mblk_t *mp, ip_recv_attr_t *ira)
{
+ ire_t *ire;
+ ipha_t *ipha = (ipha_t *)mp->b_rptr;
+ ill_t *ill = ira->ira_ill;
ip_stack_t *ipst = ill->ill_ipst;
conn_t *mrouter = ipst->ips_ip_g_mrouter;
ASSERT(ipst->ips_reg_vif_num <= ipst->ips_numvifs);
if (ipst->ips_ip_mrtdebug > 3) {
- ipha_t *ipha;
-
- ipha = (ipha_t *)mp->b_rptr;
(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
"register_mforward: src %x, dst %x\n",
ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst));
}
/*
* Need to pass in to ip_mforward() the information that the
- * packet has arrived on the register_vif. We use the solution that
- * ip_mroute_decap() employs: use mp->b_prev to pass some information
- * to ip_mforward(). Nonzero value means the packet has arrived on a
- * tunnel (ip_mroute_decap() puts the address of the other side of the
- * tunnel there.) This is safe since ip_rput() either frees the packet
- * or passes it to ip_mforward(). We use
- * PIM_REGISTER_MARKER = 0xffffffff to indicate the has arrived on the
- * register vif. If in the future we have more than one register vifs,
- * then this will need re-examination.
+ * packet has arrived on the register_vif. We mark it with
+ * the IRAF_PIM_REGISTER attribute.
+ * pim_input verified that the (inner) destination is multicast,
+ * hence we skip the generic code in ip_input.
*/
- mp->b_prev = (mblk_t *)PIM_REGISTER_MARKER;
+ ira->ira_flags |= IRAF_PIM_REGISTER;
++ipst->ips_mrtstat->mrts_pim_regforwards;
- ip_rput(q, mp);
+
+ if (!CLASSD(ipha->ipha_dst)) {
+ ire = ire_route_recursive_v4(ipha->ipha_dst, 0, NULL, ALL_ZONES,
+ ira->ira_tsl, MATCH_IRE_SECATTR, B_TRUE, 0, ipst, NULL,
+ NULL, NULL);
+ } else {
+ ire = ire_multicast(ill);
+ }
+ ASSERT(ire != NULL);
+ /* Normally this will return the IRE_MULTICAST */
+ if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("mrts_pim RTF_REJECT", mp, ill);
+ freemsg(mp);
+ ire_refrele(ire);
+ return (-1);
+ }
+ ASSERT(ire->ire_type & IRE_MULTICAST);
+ (*ire->ire_recvfn)(ire, mp, ipha, ira);
+ ire_refrele(ire);
+
return (0);
}
@@ -2575,6 +2581,8 @@ encap_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
ipha->ipha_hdr_checksum = 0;
ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
+ ipha_copy->ipha_ttl = ipha->ipha_ttl;
+
if (ipst->ips_ip_mrtdebug > 1) {
(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
"encap_send: group 0x%x", ntohl(ipha->ipha_dst));
@@ -2587,21 +2595,53 @@ encap_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
}
/*
- * De-encapsulate a packet and feed it back through IP input.
+ * De-encapsulate a packet and feed it back through IP input if it
+ * matches one of our multicast tunnels.
+ *
* This routine is called whenever IP gets a packet with prototype
- * IPPROTO_ENCAP and a local destination address.
+ * IPPROTO_ENCAP and a local destination address and the packet didn't
+ * match one of our configured IP-in-IP tunnels.
*/
void
-ip_mroute_decap(queue_t *q, mblk_t *mp, ill_t *ill)
+ip_mroute_decap(mblk_t *mp, ip_recv_attr_t *ira)
{
ipha_t *ipha = (ipha_t *)mp->b_rptr;
ipha_t *ipha_encap;
int hlen = IPH_HDR_LENGTH(ipha);
+ int hlen_encap;
ipaddr_t src;
struct vif *vifp;
+ ire_t *ire;
+ ill_t *ill = ira->ira_ill;
ip_stack_t *ipst = ill->ill_ipst;
conn_t *mrouter = ipst->ips_ip_g_mrouter;
+ /* Make sure we have all of the inner header */
+ ipha_encap = (ipha_t *)((char *)ipha + hlen);
+ if (mp->b_wptr - mp->b_rptr < hlen + IP_SIMPLE_HDR_LENGTH) {
+ ipha = ip_pullup(mp, hlen + IP_SIMPLE_HDR_LENGTH, ira);
+ if (ipha == NULL) {
+ ipst->ips_mrtstat->mrts_bad_tunnel++;
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ip_mroute_decap: too short", mp, ill);
+ freemsg(mp);
+ return;
+ }
+ ipha_encap = (ipha_t *)((char *)ipha + hlen);
+ }
+ hlen_encap = IPH_HDR_LENGTH(ipha_encap);
+ if (mp->b_wptr - mp->b_rptr < hlen + hlen_encap) {
+ ipha = ip_pullup(mp, hlen + hlen_encap, ira);
+ if (ipha == NULL) {
+ ipst->ips_mrtstat->mrts_bad_tunnel++;
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ip_mroute_decap: too short", mp, ill);
+ freemsg(mp);
+ return;
+ }
+ ipha_encap = (ipha_t *)((char *)ipha + hlen);
+ }
+
/*
* Dump the packet if it's not to a multicast destination or if
* we don't have an encapsulating tunnel with the source.
@@ -2609,10 +2649,11 @@ ip_mroute_decap(queue_t *q, mblk_t *mp, ill_t *ill)
* uniquely identifies the tunnel (i.e., that this site has
* at most one tunnel with the remote site).
*/
- ipha_encap = (ipha_t *)((char *)ipha + hlen);
if (!CLASSD(ipha_encap->ipha_dst)) {
ipst->ips_mrtstat->mrts_bad_tunnel++;
ip1dbg(("ip_mroute_decap: bad tunnel\n"));
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("mrts_bad_tunnel", mp, ill);
freemsg(mp);
return;
}
@@ -2648,6 +2689,8 @@ ip_mroute_decap(queue_t *q, mblk_t *mp, ill_t *ill)
if ((vifp = ipst->ips_last_encap_vif) == 0) {
mutex_exit(&ipst->ips_last_encap_lock);
ipst->ips_mrtstat->mrts_bad_tunnel++;
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("mrts_bad_tunnel", mp, ill);
freemsg(mp);
ip1dbg(("ip_mroute_decap: vif %ld no tunnel with %x\n",
(ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(src)));
@@ -2657,14 +2700,43 @@ ip_mroute_decap(queue_t *q, mblk_t *mp, ill_t *ill)
/*
* Need to pass in the tunnel source to ip_mforward (so that it can
- * verify that the packet arrived over the correct vif.) We use b_prev
- * to pass this information. This is safe since the ip_rput either
- * frees the packet or passes it to ip_mforward.
+ * verify that the packet arrived over the correct vif.)
*/
- mp->b_prev = (mblk_t *)(uintptr_t)src;
+ ira->ira_flags |= IRAF_MROUTE_TUNNEL_SET;
+ ira->ira_mroute_tunnel = src;
mp->b_rptr += hlen;
- /* Feed back into ip_rput as an M_DATA. */
- ip_rput(q, mp);
+ ira->ira_pktlen -= hlen;
+ ira->ira_ip_hdr_length = hlen_encap;
+
+ /*
+ * We don't redo any of the filtering in ill_input_full_v4 and we
+ * have checked that all of ipha_encap and any IP options are
+ * pulled up. Hence we call ire_recv_multicast_v4 directly.
+ * However, we have to check for RSVP as in ip_input_full_v4
+ * and if so we pass it to ire_recv_broadcast_v4 for local delivery
+ * to the rsvpd.
+ */
+ if (ipha_encap->ipha_protocol == IPPROTO_RSVP &&
+ ipst->ips_ipcl_proto_fanout_v4[IPPROTO_RSVP].connf_head != NULL) {
+ ire = ire_route_recursive_v4(INADDR_BROADCAST, 0, ill,
+ ALL_ZONES, ira->ira_tsl, MATCH_IRE_ILL|MATCH_IRE_SECATTR,
+ B_TRUE, 0, ipst, NULL, NULL, NULL);
+ } else {
+ ire = ire_multicast(ill);
+ }
+ ASSERT(ire != NULL);
+ /* Normally this will return the IRE_MULTICAST or IRE_BROADCAST */
+ if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ip_mroute_decap: RTF_REJECT", mp, ill);
+ freemsg(mp);
+ ire_refrele(ire);
+ return;
+ }
+ ire->ire_ib_pkt_count++;
+ ASSERT(ire->ire_type & (IRE_MULTICAST|IRE_BROADCAST));
+ (*ire->ire_recvfn)(ire, mp, ipha_encap, ira);
+ ire_refrele(ire);
}
/*
@@ -2687,7 +2759,7 @@ reset_mrt_vif_ipif(ipif_t *ipif)
for (vifi = num_of_vifs; vifi != 0; vifi--) {
tmp_vifi = vifi - 1;
if (ipst->ips_vifs[tmp_vifi].v_ipif == ipif) {
- (void) del_vif(&tmp_vifi, NULL, NULL, ipst);
+ (void) del_vif(&tmp_vifi, ipst);
}
}
}
@@ -2696,11 +2768,12 @@ reset_mrt_vif_ipif(ipif_t *ipif)
void
reset_mrt_ill(ill_t *ill)
{
- struct mfc *rt;
+ struct mfc *rt;
struct rtdetq *rte;
- int i;
+ int i;
ip_stack_t *ipst = ill->ill_ipst;
conn_t *mrouter = ipst->ips_ip_g_mrouter;
+ timeout_id_t id;
for (i = 0; i < MFCTBLSIZ; i++) {
MFCB_REFHOLD(&ipst->ips_mfcs[i]);
@@ -2713,6 +2786,18 @@ reset_mrt_ill(ill_t *ill)
while (rt != NULL) {
mutex_enter(&rt->mfc_mutex);
while ((rte = rt->mfc_rte) != NULL) {
+ if (rte->ill == ill &&
+ (id = rt->mfc_timeout_id) != 0) {
+ /*
+ * Its ok to drop the lock, the
+ * struct cannot be freed since
+ * we have a ref on the hash
+ * bucket.
+ */
+ mutex_exit(&rt->mfc_mutex);
+ (void) untimeout(id);
+ mutex_enter(&rt->mfc_mutex);
+ }
if (rte->ill == ill) {
if (ipst->ips_ip_mrtdebug > 1) {
(void) mi_strlog(
@@ -2744,12 +2829,15 @@ tbf_control(struct vif *vifp, mblk_t *mp, ipha_t *ipha)
size_t p_len = msgdsize(mp);
struct tbf *t = vifp->v_tbf;
timeout_id_t id = 0;
- ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
+ ill_t *ill = vifp->v_ipif->ipif_ill;
+ ip_stack_t *ipst = ill->ill_ipst;
conn_t *mrouter = ipst->ips_ip_g_mrouter;
/* Drop if packet is too large */
if (p_len > MAX_BKT_SIZE) {
ipst->ips_mrtstat->mrts_pkt2large++;
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
+ ip_drop_output("tbf_control - too large", mp, ill);
freemsg(mp);
return;
}
@@ -2800,6 +2888,9 @@ tbf_control(struct vif *vifp, mblk_t *mp, ipha_t *ipha)
if ((mp->b_wptr - mp->b_rptr) < hdr_length) {
if (!pullupmsg(mp, hdr_length)) {
+ BUMP_MIB(ill->ill_ip_mib,
+ ipIfStatsOutDiscards);
+ ip_drop_output("tbf_control - pullup", mp, ill);
freemsg(mp);
ip1dbg(("tbf_ctl: couldn't pullup udp hdr, "
"vif %ld src 0x%x dst 0x%x\n",
@@ -2818,6 +2909,8 @@ tbf_control(struct vif *vifp, mblk_t *mp, ipha_t *ipha)
*/
if (!tbf_dq_sel(vifp, ipha)) {
ipst->ips_mrtstat->mrts_q_overflow++;
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
+ ip_drop_output("mrts_q_overflow", mp, ill);
freemsg(mp);
} else {
tbf_queue(vifp, mp);
@@ -2958,7 +3051,8 @@ tbf_dq_sel(struct vif *vifp, ipha_t *ipha)
struct tbf *t = vifp->v_tbf;
mblk_t **np;
mblk_t *last, *mp;
- ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
+ ill_t *ill = vifp->v_ipif->ipif_ill;
+ ip_stack_t *ipst = ill->ill_ipst;
conn_t *mrouter = ipst->ips_ip_g_mrouter;
if (ipst->ips_ip_mrtdebug > 1) {
@@ -2979,6 +3073,8 @@ tbf_dq_sel(struct vif *vifp, ipha_t *ipha)
if (mp == t->tbf_t)
t->tbf_t = last;
mp->b_prev = mp->b_next = NULL;
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
+ ip_drop_output("tbf_dq_send", mp, ill);
freemsg(mp);
/*
* It's impossible for the queue to be empty, but
@@ -3000,76 +3096,97 @@ tbf_dq_sel(struct vif *vifp, ipha_t *ipha)
static void
tbf_send_packet(struct vif *vifp, mblk_t *mp)
{
- ipif_t *ipif;
- ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst;
+ ipif_t *ipif = vifp->v_ipif;
+ ill_t *ill = ipif->ipif_ill;
+ ip_stack_t *ipst = ill->ill_ipst;
conn_t *mrouter = ipst->ips_ip_g_mrouter;
+ ipha_t *ipha;
+ ipha = (ipha_t *)mp->b_rptr;
/* If encap tunnel options */
if (vifp->v_flags & VIFF_TUNNEL) {
+ ip_xmit_attr_t ixas;
+
if (ipst->ips_ip_mrtdebug > 1) {
(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
- "tbf_send_pkt: ENCAP tunnel vif %ld",
+ "tbf_send_packet: ENCAP tunnel vif %ld",
(ptrdiff_t)(vifp - ipst->ips_vifs));
}
+ bzero(&ixas, sizeof (ixas));
+ ixas.ixa_flags = IXAF_IS_IPV4 | IXAF_NO_TTL_CHANGE;
+ ixas.ixa_ipst = ipst;
+ ixas.ixa_ifindex = 0;
+ ixas.ixa_cred = kcred;
+ ixas.ixa_cpid = NOPID;
+ ixas.ixa_tsl = NULL;
+ ixas.ixa_zoneid = GLOBAL_ZONEID; /* Multicast router in GZ */
+ ixas.ixa_pktlen = ntohs(ipha->ipha_length);
+ ixas.ixa_ip_hdr_length = IPH_HDR_LENGTH(ipha);
/*
- * Feed into ip_wput which will set the ident field and
- * checksum the encapsulating header.
+ * Feed into ip_output_simple which will set the ident field
+ * and checksum the encapsulating header.
* BSD gets the cached route vifp->v_route from ip_output()
* to speed up route table lookups. Not necessary in SunOS 5.x.
+ * One could make multicast forwarding faster by putting an
+ * ip_xmit_attr_t in each vif thereby caching the ire/nce.
*/
- put(vifp->v_ipif->ipif_wq, mp);
+ (void) ip_output_simple(mp, &ixas);
+ ixa_cleanup(&ixas);
return;
/* phyint */
} else {
/* Need to loop back to members on the outgoing interface. */
- ipha_t *ipha;
- ipaddr_t dst;
- ipha = (ipha_t *)mp->b_rptr;
- dst = ipha->ipha_dst;
- ipif = vifp->v_ipif;
-
- if (ilm_lookup_ipif(ipif, dst) != NULL) {
- /*
- * The packet is not yet reassembled, thus we need to
- * pass it to ip_rput_local for checksum verification
- * and reassembly (and fanout the user stream).
- */
- mblk_t *mp_loop;
- ire_t *ire;
-
- if (ipst->ips_ip_mrtdebug > 1) {
- (void) mi_strlog(mrouter->conn_rq, 1,
- SL_TRACE,
- "tbf_send_pkt: loopback vif %ld",
- (ptrdiff_t)(vifp - ipst->ips_vifs));
- }
- mp_loop = copymsg(mp);
- ire = ire_ctable_lookup(~0, 0, IRE_BROADCAST, NULL,
- ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
-
- if (mp_loop != NULL && ire != NULL) {
- IP_RPUT_LOCAL(ipif->ipif_rq, mp_loop,
- ((ipha_t *)mp_loop->b_rptr),
- ire, (ill_t *)ipif->ipif_rq->q_ptr);
- } else {
- /* Either copymsg failed or no ire */
- (void) mi_strlog(mrouter->conn_rq, 1,
- SL_TRACE,
- "tbf_send_pkt: mp_loop 0x%p, ire 0x%p "
- "vif %ld\n", (void *)mp_loop, (void *)ire,
- (ptrdiff_t)(vifp - ipst->ips_vifs));
- }
- if (ire != NULL)
- ire_refrele(ire);
+ ipaddr_t dst;
+ ip_recv_attr_t iras;
+ nce_t *nce;
+
+ bzero(&iras, sizeof (iras));
+ iras.ira_flags = IRAF_IS_IPV4;
+ iras.ira_ill = iras.ira_rill = ill;
+ iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
+ iras.ira_zoneid = GLOBAL_ZONEID; /* Multicast router in GZ */
+ iras.ira_pktlen = ntohs(ipha->ipha_length);
+ iras.ira_ip_hdr_length = IPH_HDR_LENGTH(ipha);
+
+ dst = ipha->ipha_dst;
+ if (ill_hasmembers_v4(ill, dst)) {
+ iras.ira_flags |= IRAF_LOOPBACK_COPY;
}
if (ipst->ips_ip_mrtdebug > 1) {
(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
"tbf_send_pkt: phyint forward vif %ld dst = 0x%x",
(ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(dst));
}
- ip_rput_forward_multicast(dst, mp, ipif);
+ /*
+ * Find an NCE which matches the nexthop.
+ * For a pt-pt interface we use the other end of the pt-pt
+ * link.
+ */
+ if (ipif->ipif_flags & IPIF_POINTOPOINT) {
+ dst = ipif->ipif_pp_dst_addr;
+ nce = arp_nce_init(ill, dst, ill->ill_net_type);
+ } else {
+ nce = arp_nce_init(ill, dst, IRE_MULTICAST);
+ }
+ if (nce == NULL) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
+ ip_drop_output("tbf_send_packet - no nce", mp, ill);
+ freemsg(mp);
+ return;
+ }
+
+ /*
+ * We don't remember the incoming ill. Thus we
+ * pretend the packet arrived on the outbound ill. This means
+ * statistics for input errors will be increased on the wrong
+ * ill but that isn't a big deal.
+ */
+ ip_forward_xmit_v4(nce, ill, mp, ipha, &iras, ill->ill_mtu, 0);
+ ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
+
+ nce_refrele(nce);
}
}
diff --git a/usr/src/uts/common/inet/ip/ip_multi.c b/usr/src/uts/common/inet/ip/ip_multi.c
index d7be67cd26..0912d87227 100644
--- a/usr/src/uts/common/inet/ip/ip_multi.c
+++ b/usr/src/uts/common/inet/ip/ip_multi.c
@@ -66,29 +66,41 @@ static void ilm_bld_flists(conn_t *conn, void *arg);
static void ilm_gen_filter(ilm_t *ilm, mcast_record_t *fmode,
slist_t *flist);
-static ilm_t *ilm_add_v6(ipif_t *ipif, const in6_addr_t *group,
+static ilm_t *ilm_add(ill_t *ill, const in6_addr_t *group,
ilg_stat_t ilgstat, mcast_record_t ilg_fmode, slist_t *ilg_flist,
zoneid_t zoneid);
static void ilm_delete(ilm_t *ilm);
-static int ip_ll_addmulti_v6(ipif_t *ipif, const in6_addr_t *group);
-static int ip_ll_delmulti_v6(ipif_t *ipif, const in6_addr_t *group);
-static ilg_t *ilg_lookup_ipif(conn_t *connp, ipaddr_t group,
- ipif_t *ipif);
-static int ilg_add(conn_t *connp, ipaddr_t group, ipif_t *ipif,
- mcast_record_t fmode, ipaddr_t src);
-static int ilg_add_v6(conn_t *connp, const in6_addr_t *group, ill_t *ill,
- mcast_record_t fmode, const in6_addr_t *v6src);
+static int ilm_numentries(ill_t *, const in6_addr_t *);
+
+static ilm_t *ip_addmulti_serial(const in6_addr_t *, ill_t *, zoneid_t,
+ ilg_stat_t, mcast_record_t, slist_t *, int *);
+static ilm_t *ip_addmulti_impl(const in6_addr_t *, ill_t *,
+ zoneid_t, ilg_stat_t, mcast_record_t, slist_t *, int *);
+static int ip_delmulti_serial(ilm_t *, boolean_t, boolean_t);
+static int ip_delmulti_impl(ilm_t *, boolean_t, boolean_t);
+
+static int ip_ll_multireq(ill_t *ill, const in6_addr_t *group,
+ t_uscalar_t);
+static ilg_t *ilg_lookup(conn_t *, const in6_addr_t *, ipaddr_t ifaddr,
+ uint_t ifindex);
+
+static int ilg_add(conn_t *connp, const in6_addr_t *group,
+ ipaddr_t ifaddr, uint_t ifindex, ill_t *ill, mcast_record_t fmode,
+ const in6_addr_t *v6src);
static void ilg_delete(conn_t *connp, ilg_t *ilg, const in6_addr_t *src);
static mblk_t *ill_create_dl(ill_t *ill, uint32_t dl_primitive,
- uint32_t length, uint32_t *addr_lenp, uint32_t *addr_offp);
-static void conn_ilg_reap(conn_t *connp);
-static int ip_opt_delete_group_excl(conn_t *connp, ipaddr_t group,
- ipif_t *ipif, mcast_record_t fmode, ipaddr_t src);
-static int ip_opt_delete_group_excl_v6(conn_t *connp,
- const in6_addr_t *v6group, ill_t *ill, mcast_record_t fmode,
- const in6_addr_t *v6src);
-static void ill_ilm_walker_hold(ill_t *ill);
-static void ill_ilm_walker_rele(ill_t *ill);
+ uint32_t *addr_lenp, uint32_t *addr_offp);
+static int ip_opt_delete_group_excl(conn_t *connp,
+ const in6_addr_t *v6group, ipaddr_t ifaddr, uint_t ifindex,
+ mcast_record_t fmode, const in6_addr_t *v6src);
+
+static ilm_t *ilm_lookup(ill_t *, const in6_addr_t *, zoneid_t);
+
+static int ip_msfilter_ill(conn_t *, mblk_t *, const ip_ioctl_cmd_t *,
+ ill_t **);
+
+static void ilg_check_detach(conn_t *, ill_t *);
+static void ilg_check_reattach(conn_t *);
/*
* MT notes:
@@ -98,124 +110,122 @@ static void ill_ilm_walker_rele(ill_t *ill);
* need to synchronize when operating on the ilg. Multiple threads
* potentially operating on different conn (socket endpoints) trying to
* do multicast joins could eventually end up trying to manipulate the
- * ilm simultaneously and need to synchronize access to the ilm. Currently,
- * this is done by synchronizing join/leave via per-phyint ipsq_t
- * serialization.
+ * ilm simultaneously and need to synchronize on the access to the ilm.
+ * The access and lookup of the ilm, as well as other ill multicast state,
+ * is under ill_mcast_lock.
+ * The modifications and lookup of ilg entries is serialized using conn_ilg_lock
+ * rwlock. An ilg will not be freed until ilg_refcnt drops to zero.
+ *
+ * In some cases we hold ill_mcast_lock and then acquire conn_ilg_lock, but
+ * never the other way around.
*
* An ilm is an IP data structure used to track multicast join/leave.
* An ilm is associated with a <multicast group, ipif> tuple in IPv4 and
* with just <multicast group> in IPv6. ilm_refcnt is the number of ilg's
- * referencing the ilm. ilms are created / destroyed only as writer. ilms
- * are not passed around, instead they are looked up and used under the
- * ill_lock or as writer. So we don't need a dynamic refcount of the number
+ * referencing the ilm.
+ * The modifications and lookup of ilm entries is serialized using the
+ * ill_mcast_lock rwlock; that lock handles all the igmp/mld modifications
+ * of the ilm state.
+ * ilms are created / destroyed only as writer. ilms
+ * are not passed around. The datapath (anything outside of this file
+ * and igmp.c) use functions that do not return ilms - just the number
+ * of members. So we don't need a dynamic refcount of the number
* of threads holding reference to an ilm.
*
- * Multicast Join operation:
- *
- * The first step is to determine the ipif (v4) or ill (v6) on which
- * the join operation is to be done. The join is done after becoming
- * exclusive on the ipsq associated with the ipif or ill. The conn->conn_ilg
- * and ill->ill_ilm are thus accessed and modified exclusively per ill.
- * Multiple threads can attempt to join simultaneously on different ipif/ill
- * on the same conn. In this case the ipsq serialization does not help in
- * protecting the ilg. It is the conn_lock that is used to protect the ilg.
- * The conn_lock also protects all the ilg_t members.
+ * In the cases where we serially access the ilg and ilm, which happens when
+ * we handle the applications requests to join or leave groups and sources,
+ * we use the ill_mcast_serializer mutex to ensure that a multithreaded
+ * application which does concurrent joins and/or leaves on the same group on
+ * the same socket always results in a consistent order for the ilg and ilm
+ * modifications.
*
- * Leave operation.
- *
- * Similar to the join operation, the first step is to determine the ipif
- * or ill (v6) on which the leave operation is to be done. The leave operation
- * is done after becoming exclusive on the ipsq associated with the ipif or ill.
- * As with join ilg modification is done under the protection of the conn lock.
+ * When a multicast operation results in needing to send a message to
+ * the driver (to join/leave a L2 multicast address), we use ill_dlpi_queue()
+ * which serialized the DLPI requests. The IGMP/MLD code uses ill_mcast_queue()
+ * to send IGMP/MLD IP packet to avoid dropping the lock just to send a packet.
*/
-#define IPSQ_ENTER_IPIF(ipif, connp, first_mp, func, ipsq, type) \
- ASSERT(connp != NULL); \
- (ipsq) = ipsq_try_enter((ipif), NULL, CONNP_TO_WQ(connp), \
- (first_mp), (func), (type), B_TRUE); \
- if ((ipsq) == NULL) { \
- ipif_refrele(ipif); \
- return (EINPROGRESS); \
- }
-
-#define IPSQ_ENTER_ILL(ill, connp, first_mp, func, ipsq, type) \
- ASSERT(connp != NULL); \
- (ipsq) = ipsq_try_enter(NULL, ill, CONNP_TO_WQ(connp), \
- (first_mp), (func), (type), B_TRUE); \
- if ((ipsq) == NULL) { \
- ill_refrele(ill); \
- return (EINPROGRESS); \
- }
-
-#define IPSQ_EXIT(ipsq) \
- if (ipsq != NULL) \
- ipsq_exit(ipsq);
+#define GETSTRUCT(structure, number) \
+ ((structure *)mi_zalloc(sizeof (structure) * (number)))
-#define ILG_WALKER_HOLD(connp) (connp)->conn_ilg_walker_cnt++
+/*
+ * Caller must ensure that the ilg has not been condemned
+ * The condemned flag is only set in ilg_delete under conn_ilg_lock.
+ *
+ * The caller must hold conn_ilg_lock as writer.
+ */
+static void
+ilg_refhold(ilg_t *ilg)
+{
+ ASSERT(ilg->ilg_refcnt != 0);
+ ASSERT(!ilg->ilg_condemned);
+ ASSERT(RW_WRITE_HELD(&ilg->ilg_connp->conn_ilg_lock));
-#define ILG_WALKER_RELE(connp) \
- { \
- (connp)->conn_ilg_walker_cnt--; \
- if ((connp)->conn_ilg_walker_cnt == 0) \
- conn_ilg_reap(connp); \
- }
+ ilg->ilg_refcnt++;
+}
static void
-conn_ilg_reap(conn_t *connp)
+ilg_inactive(ilg_t *ilg)
{
- int to;
- int from;
- ilg_t *ilg;
-
- ASSERT(MUTEX_HELD(&connp->conn_lock));
+ ASSERT(ilg->ilg_ill == NULL);
+ ASSERT(ilg->ilg_ilm == NULL);
+ ASSERT(ilg->ilg_filter == NULL);
+ ASSERT(ilg->ilg_condemned);
- to = 0;
- from = 0;
- while (from < connp->conn_ilg_inuse) {
- if (connp->conn_ilg[from].ilg_flags & ILG_DELETED) {
- ilg = &connp->conn_ilg[from];
- FREE_SLIST(ilg->ilg_filter);
- ilg->ilg_flags &= ~ILG_DELETED;
- from++;
- continue;
- }
- if (to != from)
- connp->conn_ilg[to] = connp->conn_ilg[from];
- to++;
- from++;
- }
+ /* Unlink from list */
+ *ilg->ilg_ptpn = ilg->ilg_next;
+ if (ilg->ilg_next != NULL)
+ ilg->ilg_next->ilg_ptpn = ilg->ilg_ptpn;
+ ilg->ilg_next = NULL;
+ ilg->ilg_ptpn = NULL;
- connp->conn_ilg_inuse = to;
+ ilg->ilg_connp = NULL;
+ kmem_free(ilg, sizeof (*ilg));
+}
- if (connp->conn_ilg_inuse == 0) {
- mi_free((char *)connp->conn_ilg);
- connp->conn_ilg = NULL;
- cv_broadcast(&connp->conn_refcv);
- }
+/*
+ * The caller must hold conn_ilg_lock as writer.
+ */
+static void
+ilg_refrele(ilg_t *ilg)
+{
+ ASSERT(RW_WRITE_HELD(&ilg->ilg_connp->conn_ilg_lock));
+ ASSERT(ilg->ilg_refcnt != 0);
+ if (--ilg->ilg_refcnt == 0)
+ ilg_inactive(ilg);
}
-#define GETSTRUCT(structure, number) \
- ((structure *)mi_zalloc(sizeof (structure) * (number)))
+/*
+ * Acquire reference on ilg and drop reference on held_ilg.
+ * In the case when held_ilg is the same as ilg we already have
+ * a reference, but the held_ilg might be condemned. In that case
+ * we avoid the ilg_refhold/rele so that we can assert in ilg_refhold
+ * that the ilg isn't condemned.
+ */
+static void
+ilg_transfer_hold(ilg_t *held_ilg, ilg_t *ilg)
+{
+ if (held_ilg == ilg)
+ return;
-#define ILG_ALLOC_CHUNK 16
+ ilg_refhold(ilg);
+ if (held_ilg != NULL)
+ ilg_refrele(held_ilg);
+}
/*
- * Returns a pointer to the next available ilg in conn_ilg. Allocs more
- * buffers in size of ILG_ALLOC_CHUNK ilgs when needed, and updates conn's
- * ilg tracking fields appropriately (conn_ilg_inuse reflects usage of the
- * returned ilg). Returns NULL on failure, in which case `*errp' will be
+ * Allocate a new ilg_t and links it into conn_ilg.
+ * Returns NULL on failure, in which case `*errp' will be
* filled in with the reason.
*
- * Assumes connp->conn_lock is held.
+ * Assumes connp->conn_ilg_lock is held.
*/
static ilg_t *
conn_ilg_alloc(conn_t *connp, int *errp)
{
- ilg_t *new, *ret;
- int curcnt;
+ ilg_t *ilg;
- ASSERT(MUTEX_HELD(&connp->conn_lock));
- ASSERT(connp->conn_ilg_inuse <= connp->conn_ilg_allocated);
+ ASSERT(RW_WRITE_HELD(&connp->conn_ilg_lock));
/*
* If CONN_CLOSING is set, conn_ilg cleanup has begun and we must not
@@ -226,44 +236,23 @@ conn_ilg_alloc(conn_t *connp, int *errp)
return (NULL);
}
- if (connp->conn_ilg == NULL) {
- connp->conn_ilg = GETSTRUCT(ilg_t, ILG_ALLOC_CHUNK);
- if (connp->conn_ilg == NULL) {
- *errp = ENOMEM;
- return (NULL);
- }
- connp->conn_ilg_allocated = ILG_ALLOC_CHUNK;
- connp->conn_ilg_inuse = 0;
- }
- if (connp->conn_ilg_inuse == connp->conn_ilg_allocated) {
- if (connp->conn_ilg_walker_cnt != 0) {
- /*
- * XXX We cannot grow the array at this point
- * because a list walker could be in progress, and
- * we cannot wipe out the existing array until the
- * walker is done. Just return NULL for now.
- * ilg_delete_all() will have to be changed when
- * this logic is changed.
- */
- *errp = EBUSY;
- return (NULL);
- }
- curcnt = connp->conn_ilg_allocated;
- new = GETSTRUCT(ilg_t, curcnt + ILG_ALLOC_CHUNK);
- if (new == NULL) {
- *errp = ENOMEM;
- return (NULL);
- }
- bcopy(connp->conn_ilg, new, sizeof (ilg_t) * curcnt);
- mi_free((char *)connp->conn_ilg);
- connp->conn_ilg = new;
- connp->conn_ilg_allocated += ILG_ALLOC_CHUNK;
+ ilg = kmem_zalloc(sizeof (ilg_t), KM_NOSLEEP);
+ if (ilg == NULL) {
+ *errp = ENOMEM;
+ return (NULL);
}
- ret = &connp->conn_ilg[connp->conn_ilg_inuse++];
- ASSERT((ret->ilg_flags & ILG_DELETED) == 0);
- bzero(ret, sizeof (*ret));
- return (ret);
+ ilg->ilg_refcnt = 1;
+
+ /* Insert at head */
+ if (connp->conn_ilg != NULL)
+ connp->conn_ilg->ilg_ptpn = &ilg->ilg_next;
+ ilg->ilg_next = connp->conn_ilg;
+ ilg->ilg_ptpn = &connp->conn_ilg;
+ connp->conn_ilg = ilg;
+
+ ilg->ilg_connp = connp;
+ return (ilg);
}
typedef struct ilm_fbld_s {
@@ -275,15 +264,18 @@ typedef struct ilm_fbld_s {
boolean_t fbld_in_overflow;
} ilm_fbld_t;
+/*
+ * Caller must hold ill_mcast_lock
+ */
static void
-ilm_bld_flists(conn_t *conn, void *arg)
+ilm_bld_flists(conn_t *connp, void *arg)
{
- int i;
+ ilg_t *ilg;
ilm_fbld_t *fbld = (ilm_fbld_t *)(arg);
ilm_t *ilm = fbld->fbld_ilm;
in6_addr_t *v6group = &ilm->ilm_v6addr;
- if (conn->conn_ilg_inuse == 0)
+ if (connp->conn_ilg == NULL)
return;
/*
@@ -300,12 +292,26 @@ ilm_bld_flists(conn_t *conn, void *arg)
* ilm (group, interface match). If so, update the master
* include and exclude lists we're building in the fbld struct
* with this ilg's filter info.
+ *
+ * Note that the caller has already serialized on the ill we care
+ * about.
*/
- mutex_enter(&conn->conn_lock);
- for (i = 0; i < conn->conn_ilg_inuse; i++) {
- ilg_t *ilg = &conn->conn_ilg[i];
+ ASSERT(MUTEX_HELD(&ilm->ilm_ill->ill_mcast_serializer));
+
+ rw_enter(&connp->conn_ilg_lock, RW_READER);
+ for (ilg = connp->conn_ilg; ilg != NULL; ilg = ilg->ilg_next) {
+ if (ilg->ilg_condemned)
+ continue;
+
+ /*
+ * Since we are under the ill_mcast_serializer we know
+ * that any ilg+ilm operations on this ilm have either
+ * not started or completed, except for the last ilg
+ * (the one that caused us to be called) which doesn't
+ * have ilg_ilm set yet. Hence we compare using ilg_ill
+ * and the address.
+ */
if ((ilg->ilg_ill == ilm->ilm_ill) &&
- (ilg->ilg_ipif == ilm->ilm_ipif) &&
IN6_ARE_ADDR_EQUAL(&ilg->ilg_v6group, v6group)) {
if (ilg->ilg_fmode == MODE_IS_INCLUDE) {
fbld->fbld_in_cnt++;
@@ -337,9 +343,12 @@ ilm_bld_flists(conn_t *conn, void *arg)
break;
}
}
- mutex_exit(&conn->conn_lock);
+ rw_exit(&connp->conn_ilg_lock);
}
+/*
+ * Caller must hold ill_mcast_lock
+ */
static void
ilm_gen_filter(ilm_t *ilm, mcast_record_t *fmode, slist_t *flist)
{
@@ -385,15 +394,17 @@ ilm_gen_filter(ilm_t *ilm, mcast_record_t *fmode, slist_t *flist)
}
}
+/*
+ * Caller must hold ill_mcast_lock
+ */
static int
-ilm_update_add(ilm_t *ilm, ilg_stat_t ilgstat, slist_t *ilg_flist,
- boolean_t isv6)
+ilm_update_add(ilm_t *ilm, ilg_stat_t ilgstat, slist_t *ilg_flist)
{
mcast_record_t fmode;
slist_t *flist;
boolean_t fdefault;
char buf[INET6_ADDRSTRLEN];
- ill_t *ill = isv6 ? ilm->ilm_ill : ilm->ilm_ipif->ipif_ill;
+ ill_t *ill = ilm->ilm_ill;
/*
* There are several cases where the ilm's filter state
@@ -444,7 +455,7 @@ ilm_update_add(ilm_t *ilm, ilg_stat_t ilgstat, slist_t *ilg_flist,
/* send the state change report */
if (!IS_LOOPBACK(ill)) {
- if (isv6)
+ if (ill->ill_isv6)
mld_statechange(ilm, fmode, flist);
else
igmp_statechange(ilm, fmode, flist);
@@ -464,12 +475,15 @@ ilm_update_add(ilm_t *ilm, ilg_stat_t ilgstat, slist_t *ilg_flist,
return (0);
}
+/*
+ * Caller must hold ill_mcast_lock
+ */
static int
-ilm_update_del(ilm_t *ilm, boolean_t isv6)
+ilm_update_del(ilm_t *ilm)
{
mcast_record_t fmode;
slist_t *flist;
- ill_t *ill = isv6 ? ilm->ilm_ill : ilm->ilm_ipif->ipif_ill;
+ ill_t *ill = ilm->ilm_ill;
ip1dbg(("ilm_update_del: still %d left; updating state\n",
ilm->ilm_refcnt));
@@ -500,7 +514,7 @@ ilm_update_del(ilm_t *ilm, boolean_t isv6)
}
if (!IS_LOOPBACK(ill)) {
- if (isv6)
+ if (ill->ill_isv6)
mld_statechange(ilm, fmode, flist);
else
igmp_statechange(ilm, fmode, flist);
@@ -531,240 +545,245 @@ ilm_update_del(ilm_t *ilm, boolean_t isv6)
}
/*
- * INADDR_ANY means all multicast addresses.
- * INADDR_ANY is stored as IPv6 unspecified addr.
+ * Create/update the ilm for the group/ill. Used by other parts of IP to
+ * do the ILGSTAT_NONE (no ilg), MODE_IS_EXCLUDE, with no slist join.
+ * Returns with a refhold on the ilm.
+ *
+ * The unspecified address means all multicast addresses in both the
+ * IPv4 and IPv6 cases.
+ *
+ * The caller should have already mapped an IPMP under ill to the upper.
*/
-int
-ip_addmulti(ipaddr_t group, ipif_t *ipif, ilg_stat_t ilgstat,
- mcast_record_t ilg_fmode, slist_t *ilg_flist)
+ilm_t *
+ip_addmulti(const in6_addr_t *v6group, ill_t *ill, zoneid_t zoneid,
+ int *errorp)
{
- ill_t *ill = ipif->ipif_ill;
- ilm_t *ilm;
- in6_addr_t v6group;
- int ret;
-
- ASSERT(IAM_WRITER_IPIF(ipif));
-
- if (!CLASSD(group) && group != INADDR_ANY)
- return (EINVAL);
-
- if (IS_UNDER_IPMP(ill))
- return (EINVAL);
-
- /*
- * INADDR_ANY is represented as the IPv6 unspecified addr.
- */
- if (group == INADDR_ANY)
- v6group = ipv6_all_zeros;
- else
- IN6_IPADDR_TO_V4MAPPED(group, &v6group);
-
- ilm = ilm_lookup_ipif(ipif, group);
- /*
- * Since we are writer, we know the ilm_flags itself cannot
- * change at this point, and ilm_lookup_ipif would not have
- * returned a DELETED ilm. However, the data path can free
- * ilm->ilm_next via ilm_walker_cleanup() so we can safely
- * access anything in ilm except ilm_next (for safe access to
- * ilm_next we'd have to take the ill_lock).
- */
- if (ilm != NULL)
- return (ilm_update_add(ilm, ilgstat, ilg_flist, B_FALSE));
-
- ilm = ilm_add_v6(ipif, &v6group, ilgstat, ilg_fmode, ilg_flist,
- ipif->ipif_zoneid);
- if (ilm == NULL)
- return (ENOMEM);
-
- if (group == INADDR_ANY) {
- /*
- * Check how many ipif's have members in this group -
- * if more then one we should not tell the driver to join
- * this time
- */
- if (ilm_numentries_v6(ill, &v6group) > 1)
- return (0);
- ret = ill_join_allmulti(ill);
- if (ret != 0)
- ilm_delete(ilm);
- return (ret);
- }
-
- if (!IS_LOOPBACK(ill))
- igmp_joingroup(ilm);
-
- if (ilm_numentries_v6(ill, &v6group) > 1)
- return (0);
+ ilm_t *ilm;
- ret = ip_ll_addmulti_v6(ipif, &v6group);
- if (ret != 0)
- ilm_delete(ilm);
- return (ret);
+ /* Acquire serializer to keep assert in ilm_bld_flists happy */
+ mutex_enter(&ill->ill_mcast_serializer);
+ ilm = ip_addmulti_serial(v6group, ill, zoneid, ILGSTAT_NONE,
+ MODE_IS_EXCLUDE, NULL, errorp);
+ mutex_exit(&ill->ill_mcast_serializer);
+ return (ilm);
}
/*
- * The unspecified address means all multicast addresses.
+ * Create/update the ilm for the group/ill. If ILGSTAT_CHANGE is not set
+ * then this returns with a refhold on the ilm.
+ *
+ * Internal routine which assumes the caller has already acquired
+ * ill_mcast_serializer.
*
- * ill identifies the interface to join on.
+ * The unspecified address means all multicast addresses in both the
+ * IPv4 and IPv6 cases.
*
* ilgstat tells us if there's an ilg associated with this join,
* and if so, if it's a new ilg or a change to an existing one.
* ilg_fmode and ilg_flist give us the current filter state of
* the ilg (and will be EXCLUDE {NULL} in the case of no ilg).
+ *
+ * The caller should have already mapped an IPMP under ill to the upper.
*/
-int
-ip_addmulti_v6(const in6_addr_t *v6group, ill_t *ill, zoneid_t zoneid,
- ilg_stat_t ilgstat, mcast_record_t ilg_fmode, slist_t *ilg_flist)
+static ilm_t *
+ip_addmulti_serial(const in6_addr_t *v6group, ill_t *ill, zoneid_t zoneid,
+ ilg_stat_t ilgstat, mcast_record_t ilg_fmode, slist_t *ilg_flist,
+ int *errorp)
{
- ilm_t *ilm;
- int ret;
+ ilm_t *ilm;
- ASSERT(IAM_WRITER_ILL(ill));
+ ASSERT(MUTEX_HELD(&ill->ill_mcast_serializer));
- if (!IN6_IS_ADDR_MULTICAST(v6group) &&
- !IN6_IS_ADDR_UNSPECIFIED(v6group)) {
- return (EINVAL);
+ if (ill->ill_isv6) {
+ if (!IN6_IS_ADDR_MULTICAST(v6group) &&
+ !IN6_IS_ADDR_UNSPECIFIED(v6group)) {
+ *errorp = EINVAL;
+ return (NULL);
+ }
+ } else {
+ if (IN6_IS_ADDR_V4MAPPED(v6group)) {
+ ipaddr_t v4group;
+
+ IN6_V4MAPPED_TO_IPADDR(v6group, v4group);
+ if (!CLASSD(v4group)) {
+ *errorp = EINVAL;
+ return (NULL);
+ }
+ } else if (!IN6_IS_ADDR_UNSPECIFIED(v6group)) {
+ *errorp = EINVAL;
+ return (NULL);
+ }
}
- if (IS_UNDER_IPMP(ill) && !IN6_IS_ADDR_MC_SOLICITEDNODE(v6group))
- return (EINVAL);
+ if (IS_UNDER_IPMP(ill)) {
+ *errorp = EINVAL;
+ return (NULL);
+ }
+
+ rw_enter(&ill->ill_mcast_lock, RW_WRITER);
+ /*
+ * We do the equivalent of a lookup by checking after we get the lock
+ * This is needed since the ill could have been condemned after
+ * we looked it up, and we need to check condemned after we hold
+ * ill_mcast_lock to synchronize with the unplumb code.
+ */
+ if (ill->ill_state_flags & ILL_CONDEMNED) {
+ rw_exit(&ill->ill_mcast_lock);
+ *errorp = ENXIO;
+ return (NULL);
+ }
+ ilm = ip_addmulti_impl(v6group, ill, zoneid, ilgstat, ilg_fmode,
+ ilg_flist, errorp);
+ rw_exit(&ill->ill_mcast_lock);
+
+ /* Send any deferred/queued DLPI or IP packets */
+ ill_mcast_send_queued(ill);
+ ill_dlpi_send_queued(ill);
+ ill_mcast_timer_start(ill->ill_ipst);
+ return (ilm);
+}
+
+static ilm_t *
+ip_addmulti_impl(const in6_addr_t *v6group, ill_t *ill, zoneid_t zoneid,
+ ilg_stat_t ilgstat, mcast_record_t ilg_fmode, slist_t *ilg_flist,
+ int *errorp)
+{
+ ilm_t *ilm;
+ int ret = 0;
+
+ ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock));
+ *errorp = 0;
/*
* An ilm is uniquely identified by the tuple of (group, ill) where
* `group' is the multicast group address, and `ill' is the interface
* on which it is currently joined.
*/
- ilm = ilm_lookup_ill_v6(ill, v6group, B_TRUE, zoneid);
- if (ilm != NULL)
- return (ilm_update_add(ilm, ilgstat, ilg_flist, B_TRUE));
- ilm = ilm_add_v6(ill->ill_ipif, v6group, ilgstat, ilg_fmode,
- ilg_flist, zoneid);
- if (ilm == NULL)
- return (ENOMEM);
+ ilm = ilm_lookup(ill, v6group, zoneid);
+ if (ilm != NULL) {
+ /* ilm_update_add bumps ilm_refcnt unless ILGSTAT_CHANGE */
+ ret = ilm_update_add(ilm, ilgstat, ilg_flist);
+ if (ret == 0)
+ return (ilm);
- if (IN6_IS_ADDR_UNSPECIFIED(v6group)) {
- /*
- * Check how many ipif's that have members in this group -
- * if more then one we should not tell the driver to join
- * this time
- */
- if (ilm_numentries_v6(ill, v6group) > 1)
- return (0);
- ret = ill_join_allmulti(ill);
- if (ret != 0)
- ilm_delete(ilm);
- return (ret);
+ *errorp = ret;
+ return (NULL);
}
- if (!IS_LOOPBACK(ill))
- mld_joingroup(ilm);
-
/*
- * If we have more then one we should not tell the driver
- * to join this time.
+ * The callers checks on the ilg and the ilg+ilm consistency under
+ * ill_mcast_serializer ensures that we can not have ILGSTAT_CHANGE
+ * and no ilm.
*/
- if (ilm_numentries_v6(ill, v6group) > 1)
- return (0);
-
- ret = ip_ll_addmulti_v6(ill->ill_ipif, v6group);
- if (ret != 0)
- ilm_delete(ilm);
- return (ret);
-}
+ ASSERT(ilgstat != ILGSTAT_CHANGE);
+ ilm = ilm_add(ill, v6group, ilgstat, ilg_fmode, ilg_flist, zoneid);
+ if (ilm == NULL) {
+ *errorp = ENOMEM;
+ return (NULL);
+ }
-/*
- * Mapping the given IP multicast address to the L2 multicast mac address.
- */
-static void
-ill_multicast_mapping(ill_t *ill, ipaddr_t ip_addr, uint8_t *hw_addr,
- uint32_t hw_addrlen)
-{
- dl_unitdata_req_t *dlur;
- ipaddr_t proto_extract_mask;
- uint8_t *from, *bcast_addr;
- uint32_t hw_extract_start;
- int len;
+ if (IN6_IS_ADDR_UNSPECIFIED(v6group)) {
+ /*
+ * If we have more than one we should not tell the driver
+ * to join this time.
+ */
+ if (ilm_numentries(ill, v6group) == 1) {
+ ret = ill_join_allmulti(ill);
+ }
+ } else {
+ if (!IS_LOOPBACK(ill)) {
+ if (ill->ill_isv6)
+ mld_joingroup(ilm);
+ else
+ igmp_joingroup(ilm);
+ }
- ASSERT(IN_CLASSD(ntohl(ip_addr)));
- ASSERT(hw_addrlen == ill->ill_phys_addr_length);
- ASSERT((ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST) == 0);
- ASSERT((ill->ill_flags & ILLF_MULTICAST) != 0);
+ /*
+ * If we have more than one we should not tell the driver
+ * to join this time.
+ */
+ if (ilm_numentries(ill, v6group) == 1) {
+ ret = ip_ll_multireq(ill, v6group, DL_ENABMULTI_REQ);
+ }
+ }
+ if (ret != 0) {
+ if (ret == ENETDOWN) {
+ char buf[INET6_ADDRSTRLEN];
- /*
- * Find the physical broadcast address.
- */
- dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr;
- bcast_addr = (uint8_t *)dlur + dlur->dl_dest_addr_offset;
- if (ill->ill_sap_length > 0)
- bcast_addr += ill->ill_sap_length;
-
- VERIFY(MEDIA_V4MINFO(ill->ill_media, hw_addrlen, bcast_addr,
- hw_addr, &hw_extract_start, &proto_extract_mask));
-
- len = MIN((int)hw_addrlen - hw_extract_start, IP_ADDR_LEN);
- ip_addr &= proto_extract_mask;
- from = (uint8_t *)&ip_addr;
- while (len-- > 0)
- hw_addr[hw_extract_start + len] |= from[len];
+ ip0dbg(("ip_addmulti: ENETDOWN for %s on %s",
+ inet_ntop(AF_INET6, &ilm->ilm_v6addr,
+ buf, sizeof (buf)), ill->ill_name));
+ }
+ ilm_delete(ilm);
+ *errorp = ret;
+ return (NULL);
+ } else {
+ return (ilm);
+ }
}
/*
- * Send a multicast request to the driver for enabling multicast reception
- * for v6groupp address. The caller has already checked whether it is
- * appropriate to send one or not.
+ * Send a multicast request to the driver for enabling or disabling
+ * multicast reception for v6groupp address. The caller has already
+ * checked whether it is appropriate to send one or not.
+ *
+ * For IPMP we switch to the cast_ill since it has the right hardware
+ * information.
*/
-int
-ip_ll_send_enabmulti_req(ill_t *ill, const in6_addr_t *v6groupp)
+static int
+ip_ll_send_multireq(ill_t *ill, const in6_addr_t *v6groupp, t_uscalar_t prim)
{
mblk_t *mp;
uint32_t addrlen, addroff;
- char group_buf[INET6_ADDRSTRLEN];
-
- ASSERT(IAM_WRITER_ILL(ill));
-
- /*
- * If we're on the IPMP ill, use the nominated multicast interface to
- * send and receive DLPI messages, if one exists. (If none exists,
- * there are no usable interfaces and thus nothing to do.)
- */
- if (IS_IPMP(ill) && (ill = ipmp_illgrp_cast_ill(ill->ill_grp)) == NULL)
- return (0);
-
- /*
- * Create a DL_ENABMULTI_REQ.
- */
- mp = ill_create_dl(ill, DL_ENABMULTI_REQ, sizeof (dl_enabmulti_req_t),
- &addrlen, &addroff);
- if (!mp)
- return (ENOMEM);
-
- if (IN6_IS_ADDR_V4MAPPED(v6groupp)) {
- ipaddr_t v4group;
+ ill_t *release_ill = NULL;
+ int err = 0;
- IN6_V4MAPPED_TO_IPADDR(v6groupp, v4group);
+ ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock));
- ill_multicast_mapping(ill, v4group,
- mp->b_rptr + addroff, addrlen);
+ if (IS_IPMP(ill)) {
+ /* On the upper IPMP ill. */
+ release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp);
+ if (release_ill == NULL) {
+ /*
+ * Avoid sending it down to the ipmpstub.
+ * We will be called again once the members of the
+ * group are in place
+ */
+ ip1dbg(("ip_ll_send_multireq: no cast_ill for %s %d\n",
+ ill->ill_name, ill->ill_isv6));
+ return (0);
+ }
+ ill = release_ill;
+ }
+ /* Create a DL_ENABMULTI_REQ or DL_DISABMULTI_REQ message. */
+ mp = ill_create_dl(ill, prim, &addrlen, &addroff);
+ if (mp == NULL) {
+ err = ENOMEM;
+ goto done;
+ }
- ip1dbg(("ip_ll_send_enabmulti_req: IPv4 %s on %s\n",
- inet_ntop(AF_INET6, v6groupp, group_buf,
- sizeof (group_buf)),
- ill->ill_name));
+ mp = ndp_mcastreq(ill, v6groupp, addrlen, addroff, mp);
+ if (mp == NULL) {
+ ip0dbg(("null from ndp_mcastreq(ill %s)\n", ill->ill_name));
+ err = ENOMEM;
+ goto done;
+ }
+ switch (((union DL_primitives *)mp->b_rptr)->dl_primitive) {
+ case DL_ENABMULTI_REQ:
+ mutex_enter(&ill->ill_lock);
/* Track the state if this is the first enabmulti */
if (ill->ill_dlpi_multicast_state == IDS_UNKNOWN)
ill->ill_dlpi_multicast_state = IDS_INPROGRESS;
- ill_dlpi_send(ill, mp);
- } else {
- ip1dbg(("ip_ll_send_enabmulti_req: IPv6 ndp_mcastreq %s on"
- " %s\n",
- inet_ntop(AF_INET6, v6groupp, group_buf,
- sizeof (group_buf)),
- ill->ill_name));
- return (ndp_mcastreq(ill, v6groupp, addrlen, addroff, mp));
+ mutex_exit(&ill->ill_lock);
+ break;
}
- return (0);
+ ill_dlpi_queue(ill, mp);
+done:
+ if (release_ill != NULL)
+ ill_refrele(release_ill);
+ return (err);
}
/*
@@ -772,132 +791,71 @@ ip_ll_send_enabmulti_req(ill_t *ill, const in6_addr_t *v6groupp)
* membership for v6group if appropriate.
*/
static int
-ip_ll_addmulti_v6(ipif_t *ipif, const in6_addr_t *v6groupp)
+ip_ll_multireq(ill_t *ill, const in6_addr_t *v6groupp, t_uscalar_t prim)
{
- ill_t *ill = ipif->ipif_ill;
-
- ASSERT(IAM_WRITER_IPIF(ipif));
-
if (ill->ill_net_type != IRE_IF_RESOLVER ||
- ipif->ipif_flags & IPIF_POINTOPOINT) {
- ip1dbg(("ip_ll_addmulti_v6: not resolver\n"));
+ ill->ill_ipif->ipif_flags & IPIF_POINTOPOINT) {
+ ip1dbg(("ip_ll_multireq: not resolver\n"));
return (0); /* Must be IRE_IF_NORESOLVER */
}
if (ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST) {
- ip1dbg(("ip_ll_addmulti_v6: MULTI_BCAST\n"));
- return (0);
- }
- if (!ill->ill_dl_up) {
- /*
- * Nobody there. All multicast addresses will be re-joined
- * when we get the DL_BIND_ACK bringing the interface up.
- */
- ip1dbg(("ip_ll_addmulti_v6: nobody up\n"));
+ ip1dbg(("ip_ll_multireq: MULTI_BCAST\n"));
return (0);
}
- return (ip_ll_send_enabmulti_req(ill, v6groupp));
+ return (ip_ll_send_multireq(ill, v6groupp, prim));
}
/*
- * INADDR_ANY means all multicast addresses.
- * INADDR_ANY is stored as the IPv6 unspecified addr.
+ * Delete the ilm. Used by other parts of IP for the case of no_ilg/leaving
+ * being true.
*/
int
-ip_delmulti(ipaddr_t group, ipif_t *ipif, boolean_t no_ilg, boolean_t leaving)
+ip_delmulti(ilm_t *ilm)
{
- ill_t *ill = ipif->ipif_ill;
- ilm_t *ilm;
- in6_addr_t v6group;
-
- ASSERT(IAM_WRITER_IPIF(ipif));
-
- if (!CLASSD(group) && group != INADDR_ANY)
- return (EINVAL);
-
- /*
- * INADDR_ANY is represented as the IPv6 unspecified addr.
- */
- if (group == INADDR_ANY)
- v6group = ipv6_all_zeros;
- else
- IN6_IPADDR_TO_V4MAPPED(group, &v6group);
+ ill_t *ill = ilm->ilm_ill;
+ int error;
- /*
- * Look for a match on the ipif.
- * (IP_DROP_MEMBERSHIP specifies an ipif using an IP address).
- */
- ilm = ilm_lookup_ipif(ipif, group);
- if (ilm == NULL)
- return (ENOENT);
-
- /* Update counters */
- if (no_ilg)
- ilm->ilm_no_ilg_cnt--;
-
- if (leaving)
- ilm->ilm_refcnt--;
-
- if (ilm->ilm_refcnt > 0)
- return (ilm_update_del(ilm, B_FALSE));
-
- if (group == INADDR_ANY) {
- ilm_delete(ilm);
- /*
- * Check how many ipif's that have members in this group -
- * if there are still some left then don't tell the driver
- * to drop it.
- */
- if (ilm_numentries_v6(ill, &v6group) != 0)
- return (0);
-
- /* If we never joined, then don't leave. */
- if (ill->ill_join_allmulti)
- ill_leave_allmulti(ill);
-
- return (0);
- }
-
- if (!IS_LOOPBACK(ill))
- igmp_leavegroup(ilm);
-
- ilm_delete(ilm);
- /*
- * Check how many ipif's that have members in this group -
- * if there are still some left then don't tell the driver
- * to drop it.
- */
- if (ilm_numentries_v6(ill, &v6group) != 0)
- return (0);
- return (ip_ll_delmulti_v6(ipif, &v6group));
+ /* Acquire serializer to keep assert in ilm_bld_flists happy */
+ mutex_enter(&ill->ill_mcast_serializer);
+ error = ip_delmulti_serial(ilm, B_TRUE, B_TRUE);
+ mutex_exit(&ill->ill_mcast_serializer);
+ return (error);
}
+
/*
- * The unspecified address means all multicast addresses.
+ * Delete the ilm.
+ * Assumes ill_mcast_serializer is held by the caller.
*/
-int
-ip_delmulti_v6(const in6_addr_t *v6group, ill_t *ill, zoneid_t zoneid,
- boolean_t no_ilg, boolean_t leaving)
+static int
+ip_delmulti_serial(ilm_t *ilm, boolean_t no_ilg, boolean_t leaving)
{
- ipif_t *ipif;
- ilm_t *ilm;
+ ill_t *ill = ilm->ilm_ill;
+ int ret;
- ASSERT(IAM_WRITER_ILL(ill));
+ ASSERT(MUTEX_HELD(&ill->ill_mcast_serializer));
+ ASSERT(!(IS_UNDER_IPMP(ill)));
- if (!IN6_IS_ADDR_MULTICAST(v6group) &&
- !IN6_IS_ADDR_UNSPECIFIED(v6group))
- return (EINVAL);
+ rw_enter(&ill->ill_mcast_lock, RW_WRITER);
+ ret = ip_delmulti_impl(ilm, no_ilg, leaving);
+ rw_exit(&ill->ill_mcast_lock);
+ /* Send any deferred/queued DLPI or IP packets */
+ ill_mcast_send_queued(ill);
+ ill_dlpi_send_queued(ill);
+ ill_mcast_timer_start(ill->ill_ipst);
- /*
- * Look for a match on the ill.
- */
- ilm = ilm_lookup_ill_v6(ill, v6group, B_TRUE, zoneid);
- if (ilm == NULL)
- return (ENOENT);
+ return (ret);
+}
- ASSERT(ilm->ilm_ill == ill);
+static int
+ip_delmulti_impl(ilm_t *ilm, boolean_t no_ilg, boolean_t leaving)
+{
+ ill_t *ill = ilm->ilm_ill;
+ int error;
+ in6_addr_t v6group;
- ipif = ill->ill_ipif;
+ ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock));
/* Update counters */
if (no_ilg)
@@ -907,150 +865,90 @@ ip_delmulti_v6(const in6_addr_t *v6group, ill_t *ill, zoneid_t zoneid,
ilm->ilm_refcnt--;
if (ilm->ilm_refcnt > 0)
- return (ilm_update_del(ilm, B_TRUE));
+ return (ilm_update_del(ilm));
- if (IN6_IS_ADDR_UNSPECIFIED(v6group)) {
+ v6group = ilm->ilm_v6addr;
+
+ if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) {
ilm_delete(ilm);
/*
- * Check how many ipif's that have members in this group -
- * if there are still some left then don't tell the driver
- * to drop it.
+ * If we have some left then we should not tell the driver
+ * to leave.
*/
- if (ilm_numentries_v6(ill, v6group) != 0)
+ if (ilm_numentries(ill, &v6group) != 0)
return (0);
- /* If we never joined, then don't leave. */
- if (ill->ill_join_allmulti)
- ill_leave_allmulti(ill);
+ ill_leave_allmulti(ill);
return (0);
}
- if (!IS_LOOPBACK(ill))
- mld_leavegroup(ilm);
+ if (!IS_LOOPBACK(ill)) {
+ if (ill->ill_isv6)
+ mld_leavegroup(ilm);
+ else
+ igmp_leavegroup(ilm);
+ }
ilm_delete(ilm);
/*
- * Check how many ipif's that have members in this group -
- * if there are still some left then don't tell the driver
- * to drop it.
+ * If we have some left then we should not tell the driver
+ * to leave.
*/
- if (ilm_numentries_v6(ill, v6group) != 0)
+ if (ilm_numentries(ill, &v6group) != 0)
return (0);
- return (ip_ll_delmulti_v6(ipif, v6group));
-}
-/*
- * Send a multicast request to the driver for disabling multicast reception
- * for v6groupp address. The caller has already checked whether it is
- * appropriate to send one or not.
- */
-int
-ip_ll_send_disabmulti_req(ill_t *ill, const in6_addr_t *v6groupp)
-{
- mblk_t *mp;
- char group_buf[INET6_ADDRSTRLEN];
- uint32_t addrlen, addroff;
+ error = ip_ll_multireq(ill, &v6group, DL_DISABMULTI_REQ);
+ /* We ignore the case when ill_dl_up is not set */
+ if (error == ENETDOWN) {
+ char buf[INET6_ADDRSTRLEN];
- ASSERT(IAM_WRITER_ILL(ill));
-
- /*
- * See comment in ip_ll_send_enabmulti_req().
- */
- if (IS_IPMP(ill) && (ill = ipmp_illgrp_cast_ill(ill->ill_grp)) == NULL)
- return (0);
-
- /*
- * Create a DL_DISABMULTI_REQ.
- */
- mp = ill_create_dl(ill, DL_DISABMULTI_REQ,
- sizeof (dl_disabmulti_req_t), &addrlen, &addroff);
- if (!mp)
- return (ENOMEM);
-
- if (IN6_IS_ADDR_V4MAPPED(v6groupp)) {
- ipaddr_t v4group;
-
- IN6_V4MAPPED_TO_IPADDR(v6groupp, v4group);
-
- ill_multicast_mapping(ill, v4group,
- mp->b_rptr + addroff, addrlen);
-
- ip1dbg(("ip_ll_send_disabmulti_req: IPv4 %s on %s\n",
- inet_ntop(AF_INET6, v6groupp, group_buf,
- sizeof (group_buf)),
+ ip0dbg(("ip_delmulti: ENETDOWN for %s on %s",
+ inet_ntop(AF_INET6, &v6group, buf, sizeof (buf)),
ill->ill_name));
- ill_dlpi_send(ill, mp);
- } else {
- ip1dbg(("ip_ll_send_disabmulti_req: IPv6 ndp_mcastreq %s on"
- " %s\n",
- inet_ntop(AF_INET6, v6groupp, group_buf,
- sizeof (group_buf)),
- ill->ill_name));
- return (ndp_mcastreq(ill, v6groupp, addrlen, addroff, mp));
- }
- return (0);
-}
-
-/*
- * Send a multicast request to the driver for disabling multicast
- * membership for v6group if appropriate.
- */
-static int
-ip_ll_delmulti_v6(ipif_t *ipif, const in6_addr_t *v6group)
-{
- ill_t *ill = ipif->ipif_ill;
-
- ASSERT(IAM_WRITER_IPIF(ipif));
-
- if (ill->ill_net_type != IRE_IF_RESOLVER ||
- ipif->ipif_flags & IPIF_POINTOPOINT) {
- return (0); /* Must be IRE_IF_NORESOLVER */
- }
- if (ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST) {
- ip1dbg(("ip_ll_delmulti_v6: MULTI_BCAST\n"));
- return (0);
}
- if (!ill->ill_dl_up) {
- /*
- * Nobody there. All multicast addresses will be re-joined
- * when we get the DL_BIND_ACK bringing the interface up.
- */
- ip1dbg(("ip_ll_delmulti_v6: nobody up\n"));
- return (0);
- }
- return (ip_ll_send_disabmulti_req(ill, v6group));
+ return (error);
}
/*
- * Make the driver pass up all multicast packets. NOTE: to keep callers
- * IPMP-unaware, if an IPMP ill is passed in, the ill_join_allmulti flag is
- * set on it (rather than the cast ill).
+ * Make the driver pass up all multicast packets.
*/
int
ill_join_allmulti(ill_t *ill)
{
- mblk_t *promiscon_mp, *promiscoff_mp;
+ mblk_t *promiscon_mp, *promiscoff_mp = NULL;
uint32_t addrlen, addroff;
- ill_t *join_ill = ill;
+ ill_t *release_ill = NULL;
- ASSERT(IAM_WRITER_ILL(ill));
+ ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock));
if (!ill->ill_dl_up) {
/*
* Nobody there. All multicast addresses will be re-joined
* when we get the DL_BIND_ACK bringing the interface up.
*/
- return (0);
+ return (ENETDOWN);
}
- /*
- * See comment in ip_ll_send_enabmulti_req().
- */
- if (IS_IPMP(ill) && (ill = ipmp_illgrp_cast_ill(ill->ill_grp)) == NULL)
- return (0);
-
- ASSERT(!join_ill->ill_join_allmulti);
+ if (IS_IPMP(ill)) {
+ /* On the upper IPMP ill. */
+ release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp);
+ if (release_ill == NULL) {
+ /*
+ * Avoid sending it down to the ipmpstub.
+ * We will be called again once the members of the
+ * group are in place
+ */
+ ip1dbg(("ill_join_allmulti: no cast_ill for %s %d\n",
+ ill->ill_name, ill->ill_isv6));
+ return (0);
+ }
+ ill = release_ill;
+ if (!ill->ill_dl_up) {
+ ill_refrele(ill);
+ return (ENETDOWN);
+ }
+ }
/*
* Create a DL_PROMISCON_REQ message and send it directly to the DLPI
@@ -1062,19 +960,24 @@ ill_join_allmulti(ill_t *ill)
if ((ill->ill_net_type == IRE_IF_RESOLVER) &&
!(ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST)) {
promiscon_mp = ill_create_dl(ill, DL_PROMISCON_REQ,
- sizeof (dl_promiscon_req_t), &addrlen, &addroff);
- promiscoff_mp = ill_create_dl(ill, DL_PROMISCOFF_REQ,
- sizeof (dl_promiscoff_req_t), &addrlen, &addroff);
- if (promiscon_mp == NULL || promiscoff_mp == NULL) {
+ &addrlen, &addroff);
+ if (ill->ill_promiscoff_mp == NULL)
+ promiscoff_mp = ill_create_dl(ill, DL_PROMISCOFF_REQ,
+ &addrlen, &addroff);
+ if (promiscon_mp == NULL ||
+ (ill->ill_promiscoff_mp == NULL && promiscoff_mp == NULL)) {
freemsg(promiscon_mp);
freemsg(promiscoff_mp);
+ if (release_ill != NULL)
+ ill_refrele(release_ill);
return (ENOMEM);
}
- ill->ill_promiscoff_mp = promiscoff_mp;
- ill_dlpi_send(ill, promiscon_mp);
+ if (ill->ill_promiscoff_mp == NULL)
+ ill->ill_promiscoff_mp = promiscoff_mp;
+ ill_dlpi_queue(ill, promiscon_mp);
}
-
- join_ill->ill_join_allmulti = B_TRUE;
+ if (release_ill != NULL)
+ ill_refrele(release_ill);
return (0);
}
@@ -1085,9 +988,9 @@ void
ill_leave_allmulti(ill_t *ill)
{
mblk_t *promiscoff_mp;
- ill_t *leave_ill = ill;
+ ill_t *release_ill = NULL;
- ASSERT(IAM_WRITER_ILL(ill));
+ ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock));
if (!ill->ill_dl_up) {
/*
@@ -1097,105 +1000,130 @@ ill_leave_allmulti(ill_t *ill)
return;
}
- /*
- * See comment in ip_ll_send_enabmulti_req().
- */
- if (IS_IPMP(ill) && (ill = ipmp_illgrp_cast_ill(ill->ill_grp)) == NULL)
- return;
-
- ASSERT(leave_ill->ill_join_allmulti);
+ if (IS_IPMP(ill)) {
+ /* On the upper IPMP ill. */
+ release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp);
+ if (release_ill == NULL) {
+ /*
+ * Avoid sending it down to the ipmpstub.
+ * We will be called again once the members of the
+ * group are in place
+ */
+ ip1dbg(("ill_leave_allmulti: no cast_ill on %s %d\n",
+ ill->ill_name, ill->ill_isv6));
+ return;
+ }
+ ill = release_ill;
+ if (!ill->ill_dl_up)
+ goto done;
+ }
/*
- * Create a DL_PROMISCOFF_REQ message and send it directly to
- * the DLPI provider. We don't need to do this for certain
- * media types for which we never need to turn promiscuous
- * mode on.
+ * In the case of IPMP and ill_dl_up not being set when we joined
+ * we didn't allocate a promiscoff_mp. In that case we have
+ * nothing to do when we leave.
+ * Ditto for PHYI_MULTI_BCAST
*/
- if ((ill->ill_net_type == IRE_IF_RESOLVER) &&
- !(ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST)) {
- promiscoff_mp = ill->ill_promiscoff_mp;
- ASSERT(promiscoff_mp != NULL);
+ promiscoff_mp = ill->ill_promiscoff_mp;
+ if (promiscoff_mp != NULL) {
ill->ill_promiscoff_mp = NULL;
- ill_dlpi_send(ill, promiscoff_mp);
- }
-
- leave_ill->ill_join_allmulti = B_FALSE;
-}
-
-static ill_t *
-ipsq_enter_byifindex(uint_t ifindex, boolean_t isv6, ip_stack_t *ipst)
-{
- ill_t *ill;
- boolean_t in_ipsq;
-
- ill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL, NULL,
- ipst);
- if (ill != NULL) {
- if (!ill_waiter_inc(ill)) {
- ill_refrele(ill);
- return (NULL);
- }
- ill_refrele(ill);
- in_ipsq = ipsq_enter(ill, B_FALSE, NEW_OP);
- ill_waiter_dcr(ill);
- if (!in_ipsq)
- ill = NULL;
+ ill_dlpi_queue(ill, promiscoff_mp);
}
- return (ill);
+done:
+ if (release_ill != NULL)
+ ill_refrele(release_ill);
}
int
ip_join_allmulti(uint_t ifindex, boolean_t isv6, ip_stack_t *ipst)
{
ill_t *ill;
- int ret = 0;
+ int ret;
+ ilm_t *ilm;
- if ((ill = ipsq_enter_byifindex(ifindex, isv6, ipst)) == NULL)
+ ill = ill_lookup_on_ifindex(ifindex, isv6, ipst);
+ if (ill == NULL)
return (ENODEV);
/*
- * The ip_addmulti*() functions won't allow IPMP underlying interfaces
+ * The ip_addmulti() function doesn't allow IPMP underlying interfaces
* to join allmulti since only the nominated underlying interface in
* the group should receive multicast. We silently succeed to avoid
* having to teach IPobs (currently the only caller of this routine)
* to ignore failures in this case.
*/
- if (IS_UNDER_IPMP(ill))
- goto out;
+ if (IS_UNDER_IPMP(ill)) {
+ ill_refrele(ill);
+ return (0);
+ }
+ mutex_enter(&ill->ill_lock);
+ if (ill->ill_ipallmulti_cnt > 0) {
+ /* Already joined */
+ ASSERT(ill->ill_ipallmulti_ilm != NULL);
+ ill->ill_ipallmulti_cnt++;
+ mutex_exit(&ill->ill_lock);
+ goto done;
+ }
+ mutex_exit(&ill->ill_lock);
- if (isv6) {
- ret = ip_addmulti_v6(&ipv6_all_zeros, ill, ill->ill_zoneid,
- ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL);
- } else {
- ret = ip_addmulti(INADDR_ANY, ill->ill_ipif, ILGSTAT_NONE,
- MODE_IS_EXCLUDE, NULL);
+ ilm = ip_addmulti(&ipv6_all_zeros, ill, ill->ill_zoneid, &ret);
+ if (ilm == NULL) {
+ ASSERT(ret != 0);
+ ill_refrele(ill);
+ return (ret);
}
+
+ mutex_enter(&ill->ill_lock);
+ if (ill->ill_ipallmulti_cnt > 0) {
+ /* Another thread added it concurrently */
+ (void) ip_delmulti(ilm);
+ mutex_exit(&ill->ill_lock);
+ goto done;
+ }
+ ASSERT(ill->ill_ipallmulti_ilm == NULL);
+ ill->ill_ipallmulti_ilm = ilm;
ill->ill_ipallmulti_cnt++;
-out:
- ipsq_exit(ill->ill_phyint->phyint_ipsq);
- return (ret);
+ mutex_exit(&ill->ill_lock);
+done:
+ ill_refrele(ill);
+ return (0);
}
-
int
ip_leave_allmulti(uint_t ifindex, boolean_t isv6, ip_stack_t *ipst)
{
ill_t *ill;
+ ilm_t *ilm;
- if ((ill = ipsq_enter_byifindex(ifindex, isv6, ipst)) == NULL)
+ ill = ill_lookup_on_ifindex(ifindex, isv6, ipst);
+ if (ill == NULL)
return (ENODEV);
- if (ill->ill_ipallmulti_cnt > 0) {
- if (isv6) {
- (void) ip_delmulti_v6(&ipv6_all_zeros, ill,
- ill->ill_zoneid, B_TRUE, B_TRUE);
- } else {
- (void) ip_delmulti(INADDR_ANY, ill->ill_ipif, B_TRUE,
- B_TRUE);
- }
- ill->ill_ipallmulti_cnt--;
+ if (IS_UNDER_IPMP(ill)) {
+ ill_refrele(ill);
+ return (0);
+ }
+
+ mutex_enter(&ill->ill_lock);
+ if (ill->ill_ipallmulti_cnt == 0) {
+ /* ip_purge_allmulti could have removed them all */
+ mutex_exit(&ill->ill_lock);
+ goto done;
+ }
+ ill->ill_ipallmulti_cnt--;
+ if (ill->ill_ipallmulti_cnt == 0) {
+ /* Last one */
+ ilm = ill->ill_ipallmulti_ilm;
+ ill->ill_ipallmulti_ilm = NULL;
+ } else {
+ ilm = NULL;
}
- ipsq_exit(ill->ill_phyint->phyint_ipsq);
+ mutex_exit(&ill->ill_lock);
+ if (ilm != NULL)
+ (void) ip_delmulti(ilm);
+
+done:
+ ill_refrele(ill);
return (0);
}
@@ -1206,108 +1134,34 @@ ip_leave_allmulti(uint_t ifindex, boolean_t isv6, ip_stack_t *ipst)
void
ip_purge_allmulti(ill_t *ill)
{
- ASSERT(IAM_WRITER_ILL(ill));
-
- for (; ill->ill_ipallmulti_cnt > 0; ill->ill_ipallmulti_cnt--) {
- if (ill->ill_isv6) {
- (void) ip_delmulti_v6(&ipv6_all_zeros, ill,
- ill->ill_zoneid, B_TRUE, B_TRUE);
- } else {
- (void) ip_delmulti(INADDR_ANY, ill->ill_ipif, B_TRUE,
- B_TRUE);
- }
- }
-}
-
-/*
- * Copy mp_orig and pass it in as a local message.
- */
-void
-ip_multicast_loopback(queue_t *q, ill_t *ill, mblk_t *mp_orig, int fanout_flags,
- zoneid_t zoneid)
-{
- mblk_t *mp;
- mblk_t *ipsec_mp;
- ipha_t *iph;
- ip_stack_t *ipst = ill->ill_ipst;
-
- if (DB_TYPE(mp_orig) == M_DATA &&
- ((ipha_t *)mp_orig->b_rptr)->ipha_protocol == IPPROTO_UDP) {
- uint_t hdrsz;
-
- hdrsz = IPH_HDR_LENGTH((ipha_t *)mp_orig->b_rptr) +
- sizeof (udpha_t);
- ASSERT(MBLKL(mp_orig) >= hdrsz);
-
- if (((mp = allocb(hdrsz, BPRI_MED)) != NULL) &&
- (mp_orig = dupmsg(mp_orig)) != NULL) {
- cred_t *cr;
-
- bcopy(mp_orig->b_rptr, mp->b_rptr, hdrsz);
- mp->b_wptr += hdrsz;
- mp->b_cont = mp_orig;
- mp_orig->b_rptr += hdrsz;
- if (is_system_labeled() &&
- (cr = msg_getcred(mp_orig, NULL)) != NULL)
- mblk_setcred(mp, cr, NOPID);
- if (MBLKL(mp_orig) == 0) {
- mp->b_cont = mp_orig->b_cont;
- mp_orig->b_cont = NULL;
- freeb(mp_orig);
- }
- } else if (mp != NULL) {
- freeb(mp);
- mp = NULL;
- }
- } else {
- mp = ip_copymsg(mp_orig); /* No refcnt on ipsec_out netstack */
- }
-
- if (mp == NULL)
- return;
- if (DB_TYPE(mp) == M_CTL) {
- ipsec_mp = mp;
- mp = mp->b_cont;
- } else {
- ipsec_mp = mp;
- }
-
- iph = (ipha_t *)mp->b_rptr;
-
- /*
- * DTrace this as ip:::send. A blocked packet will fire the send
- * probe, but not the receive probe.
- */
- DTRACE_IP7(send, mblk_t *, ipsec_mp, conn_t *, NULL, void_ip_t *, iph,
- __dtrace_ipsr_ill_t *, ill, ipha_t *, iph, ip6_t *, NULL, int, 1);
-
- DTRACE_PROBE4(ip4__loopback__out__start,
- ill_t *, NULL, ill_t *, ill,
- ipha_t *, iph, mblk_t *, ipsec_mp);
+ ilm_t *ilm;
- FW_HOOKS(ipst->ips_ip4_loopback_out_event,
- ipst->ips_ipv4firewall_loopback_out,
- NULL, ill, iph, ipsec_mp, mp, HPE_MULTICAST, ipst);
+ ASSERT(IAM_WRITER_ILL(ill));
- DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, ipsec_mp);
+ mutex_enter(&ill->ill_lock);
+ ilm = ill->ill_ipallmulti_ilm;
+ ill->ill_ipallmulti_ilm = NULL;
+ ill->ill_ipallmulti_cnt = 0;
+ mutex_exit(&ill->ill_lock);
- if (ipsec_mp != NULL)
- ip_wput_local(q, ill, iph, ipsec_mp, NULL,
- fanout_flags, zoneid);
+ if (ilm != NULL)
+ (void) ip_delmulti(ilm);
}
/*
- * Create a DLPI message; for DL_{ENAB,DISAB}MULTI_REQ, room is left for
- * the hardware address.
+ * Create a dlpi message with room for phys+sap. Later
+ * we will strip the sap for those primitives which
+ * only need a physical address.
*/
static mblk_t *
-ill_create_dl(ill_t *ill, uint32_t dl_primitive, uint32_t length,
+ill_create_dl(ill_t *ill, uint32_t dl_primitive,
uint32_t *addr_lenp, uint32_t *addr_offp)
{
mblk_t *mp;
uint32_t hw_addr_length;
char *cp;
uint32_t offset;
+ uint32_t length;
uint32_t size;
*addr_lenp = *addr_offp = 0;
@@ -1318,14 +1172,18 @@ ill_create_dl(ill_t *ill, uint32_t dl_primitive, uint32_t length,
return (NULL);
}
- size = length;
switch (dl_primitive) {
case DL_ENABMULTI_REQ:
+ length = sizeof (dl_enabmulti_req_t);
+ size = length + hw_addr_length;
+ break;
case DL_DISABMULTI_REQ:
- size += hw_addr_length;
+ length = sizeof (dl_disabmulti_req_t);
+ size = length + hw_addr_length;
break;
case DL_PROMISCON_REQ:
case DL_PROMISCOFF_REQ:
+ size = length = sizeof (dl_promiscon_req_t);
break;
default:
return (NULL);
@@ -1373,33 +1231,29 @@ ill_create_dl(ill_t *ill, uint32_t dl_primitive, uint32_t length,
}
/*
- * Rejoin any groups which have been explicitly joined by the application (we
- * left all explicitly joined groups as part of ill_leave_multicast() prior to
- * bringing the interface down). Note that because groups can be joined and
- * left while an interface is down, this may not be the same set of groups
- * that we left in ill_leave_multicast().
+ * Rejoin any groups for which we have ilms.
+ *
+ * This is only needed for IPMP when the cast_ill changes since that
+ * change is invisible to the ilm. Other interface changes are handled
+ * by conn_update_ill.
*/
void
ill_recover_multicast(ill_t *ill)
{
ilm_t *ilm;
- ipif_t *ipif = ill->ill_ipif;
char addrbuf[INET6_ADDRSTRLEN];
- ASSERT(IAM_WRITER_ILL(ill));
-
ill->ill_need_recover_multicast = 0;
- ill_ilm_walker_hold(ill);
+ rw_enter(&ill->ill_mcast_lock, RW_WRITER);
for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
/*
- * Check how many ipif's that have members in this group -
- * if more then one we make sure that this entry is first
- * in the list.
+ * If we have more then one ilm for the group (e.g., with
+ * different zoneid) then we should not tell the driver
+ * to join unless this is the first ilm for the group.
*/
- if (ilm_numentries_v6(ill, &ilm->ilm_v6addr) > 1 &&
- ilm_lookup_ill_v6(ill, &ilm->ilm_v6addr, B_TRUE,
- ALL_ZONES) != ilm) {
+ if (ilm_numentries(ill, &ilm->ilm_v6addr) > 1 &&
+ ilm_lookup(ill, &ilm->ilm_v6addr, ALL_ZONES) != ilm) {
continue;
}
@@ -1414,38 +1268,42 @@ ill_recover_multicast(ill_t *ill)
else
igmp_joingroup(ilm);
- (void) ip_ll_addmulti_v6(ipif, &ilm->ilm_v6addr);
+ (void) ip_ll_multireq(ill, &ilm->ilm_v6addr,
+ DL_ENABMULTI_REQ);
}
}
- ill_ilm_walker_rele(ill);
-
+ rw_exit(&ill->ill_mcast_lock);
+ /* Send any deferred/queued DLPI or IP packets */
+ ill_mcast_send_queued(ill);
+ ill_dlpi_send_queued(ill);
+ ill_mcast_timer_start(ill->ill_ipst);
}
/*
* The opposite of ill_recover_multicast() -- leaves all multicast groups
* that were explicitly joined.
+ *
+ * This is only needed for IPMP when the cast_ill changes since that
+ * change is invisible to the ilm. Other interface changes are handled
+ * by conn_update_ill.
*/
void
ill_leave_multicast(ill_t *ill)
{
ilm_t *ilm;
- ipif_t *ipif = ill->ill_ipif;
char addrbuf[INET6_ADDRSTRLEN];
- ASSERT(IAM_WRITER_ILL(ill));
-
ill->ill_need_recover_multicast = 1;
- ill_ilm_walker_hold(ill);
+ rw_enter(&ill->ill_mcast_lock, RW_WRITER);
for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
/*
- * Check how many ipif's that have members in this group -
- * if more then one we make sure that this entry is first
- * in the list.
+ * If we have more then one ilm for the group (e.g., with
+ * different zoneid) then we should not tell the driver
+ * to leave unless this is the first ilm for the group.
*/
- if (ilm_numentries_v6(ill, &ilm->ilm_v6addr) > 1 &&
- ilm_lookup_ill_v6(ill, &ilm->ilm_v6addr, B_TRUE,
- ALL_ZONES) != ilm) {
+ if (ilm_numentries(ill, &ilm->ilm_v6addr) > 1 &&
+ ilm_lookup(ill, &ilm->ilm_v6addr, ALL_ZONES) != ilm) {
continue;
}
@@ -1460,126 +1318,186 @@ ill_leave_multicast(ill_t *ill)
else
igmp_leavegroup(ilm);
- (void) ip_ll_delmulti_v6(ipif, &ilm->ilm_v6addr);
+ (void) ip_ll_multireq(ill, &ilm->ilm_v6addr,
+ DL_DISABMULTI_REQ);
}
}
- ill_ilm_walker_rele(ill);
+ rw_exit(&ill->ill_mcast_lock);
+ /* Send any deferred/queued DLPI or IP packets */
+ ill_mcast_send_queued(ill);
+ ill_dlpi_send_queued(ill);
+ ill_mcast_timer_start(ill->ill_ipst);
}
-/* Find an ilm for matching the ill */
-ilm_t *
-ilm_lookup_ill(ill_t *ill, ipaddr_t group, zoneid_t zoneid)
+/*
+ * Interface used by IP input/output.
+ * Returns true if there is a member on the ill for any zoneid.
+ */
+boolean_t
+ill_hasmembers_v6(ill_t *ill, const in6_addr_t *v6group)
+{
+ ilm_t *ilm;
+
+ rw_enter(&ill->ill_mcast_lock, RW_READER);
+ ilm = ilm_lookup(ill, v6group, ALL_ZONES);
+ rw_exit(&ill->ill_mcast_lock);
+ return (ilm != NULL);
+}
+
+/*
+ * Interface used by IP input/output.
+ * Returns true if there is a member on the ill for any zoneid.
+ *
+ * The group and source can't be INADDR_ANY here so no need to translate to
+ * the unspecified IPv6 address.
+ */
+boolean_t
+ill_hasmembers_v4(ill_t *ill, ipaddr_t group)
{
in6_addr_t v6group;
- /*
- * INADDR_ANY is represented as the IPv6 unspecified addr.
- */
- if (group == INADDR_ANY)
- v6group = ipv6_all_zeros;
- else
- IN6_IPADDR_TO_V4MAPPED(group, &v6group);
+ IN6_IPADDR_TO_V4MAPPED(group, &v6group);
+ return (ill_hasmembers_v6(ill, &v6group));
+}
+
+/*
+ * Interface used by IP input/output.
+ * Returns true if there is a member on the ill for any zoneid except skipzone.
+ */
+boolean_t
+ill_hasmembers_otherzones_v6(ill_t *ill, const in6_addr_t *v6group,
+ zoneid_t skipzone)
+{
+ ilm_t *ilm;
- return (ilm_lookup_ill_v6(ill, &v6group, B_TRUE, zoneid));
+ rw_enter(&ill->ill_mcast_lock, RW_READER);
+ for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
+ if (IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group) &&
+ ilm->ilm_zoneid != skipzone) {
+ rw_exit(&ill->ill_mcast_lock);
+ return (B_TRUE);
+ }
+ }
+ rw_exit(&ill->ill_mcast_lock);
+ return (B_FALSE);
}
/*
- * Find an ilm for address `v6group' on `ill' and zone `zoneid' (which may be
- * ALL_ZONES). In general, if `ill' is in an IPMP group, we will match
- * against any ill in the group. However, if `restrict_solicited' is set,
- * then specifically for IPv6 solicited-node multicast, the match will be
- * restricted to the specified `ill'.
+ * Interface used by IP input/output.
+ * Returns true if there is a member on the ill for any zoneid except skipzone.
+ *
+ * The group and source can't be INADDR_ANY here so no need to translate to
+ * the unspecified IPv6 address.
*/
-ilm_t *
-ilm_lookup_ill_v6(ill_t *ill, const in6_addr_t *v6group,
- boolean_t restrict_solicited, zoneid_t zoneid)
+boolean_t
+ill_hasmembers_otherzones_v4(ill_t *ill, ipaddr_t group, zoneid_t skipzone)
{
- ilm_t *ilm;
- ilm_walker_t ilw;
- boolean_t restrict_ill = B_FALSE;
+ in6_addr_t v6group;
- /*
- * In general, underlying interfaces cannot have multicast memberships
- * and thus lookups always match across the illgrp. However, we must
- * allow IPv6 solicited-node multicast memberships on underlying
- * interfaces, and thus an IPMP meta-interface and one of its
- * underlying ills may have the same solicited-node multicast address.
- * In that case, we need to restrict the lookup to the requested ill.
- * However, we may receive packets on an underlying interface that
- * are for the corresponding IPMP interface's solicited-node multicast
- * address, and thus in that case we need to match across the group --
- * hence the unfortunate `restrict_solicited' argument.
- */
- if (IN6_IS_ADDR_MC_SOLICITEDNODE(v6group) && restrict_solicited)
- restrict_ill = (IS_IPMP(ill) || IS_UNDER_IPMP(ill));
+ IN6_IPADDR_TO_V4MAPPED(group, &v6group);
+ return (ill_hasmembers_otherzones_v6(ill, &v6group, skipzone));
+}
- ilm = ilm_walker_start(&ilw, ill);
- for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) {
- if (!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group))
- continue;
- if (zoneid != ALL_ZONES && zoneid != ilm->ilm_zoneid)
- continue;
- if (!restrict_ill || ill == (ill->ill_isv6 ?
- ilm->ilm_ill : ilm->ilm_ipif->ipif_ill)) {
- break;
+/*
+ * Interface used by IP input.
+ * Returns the next numerically larger zoneid that has a member. If none exist
+ * then returns -1 (ALL_ZONES).
+ * The normal usage is for the caller to start with a -1 zoneid (ALL_ZONES)
+ * to find the first zoneid which has a member, and then pass that in for
+ * subsequent calls until ALL_ZONES is returned.
+ *
+ * The implementation of ill_hasmembers_nextzone() assumes the ilms
+ * are sorted by zoneid for efficiency.
+ */
+zoneid_t
+ill_hasmembers_nextzone_v6(ill_t *ill, const in6_addr_t *v6group,
+ zoneid_t zoneid)
+{
+ ilm_t *ilm;
+
+ rw_enter(&ill->ill_mcast_lock, RW_READER);
+ for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
+ if (IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group) &&
+ ilm->ilm_zoneid > zoneid) {
+ zoneid = ilm->ilm_zoneid;
+ rw_exit(&ill->ill_mcast_lock);
+ return (zoneid);
}
}
- ilm_walker_finish(&ilw);
- return (ilm);
+ rw_exit(&ill->ill_mcast_lock);
+ return (ALL_ZONES);
}
/*
- * Find an ilm for the ipif. Only needed for IPv4 which does
- * ipif specific socket options.
+ * Interface used by IP input.
+ * Returns the next numerically larger zoneid that has a member. If none exist
+ * then returns -1 (ALL_ZONES).
+ *
+ * The group and source can't be INADDR_ANY here so no need to translate to
+ * the unspecified IPv6 address.
*/
-ilm_t *
-ilm_lookup_ipif(ipif_t *ipif, ipaddr_t group)
+zoneid_t
+ill_hasmembers_nextzone_v4(ill_t *ill, ipaddr_t group, zoneid_t zoneid)
{
- ilm_t *ilm;
- ilm_walker_t ilw;
+ in6_addr_t v6group;
- ilm = ilm_walker_start(&ilw, ipif->ipif_ill);
- for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) {
- if (ilm->ilm_ipif == ipif && ilm->ilm_addr == group)
- break;
+ IN6_IPADDR_TO_V4MAPPED(group, &v6group);
+
+ return (ill_hasmembers_nextzone_v6(ill, &v6group, zoneid));
+}
+
+/*
+ * Find an ilm matching the ill, group, and zoneid.
+ */
+static ilm_t *
+ilm_lookup(ill_t *ill, const in6_addr_t *v6group, zoneid_t zoneid)
+{
+ ilm_t *ilm;
+
+ ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock));
+
+ for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
+ if (!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group))
+ continue;
+ if (zoneid != ALL_ZONES && zoneid != ilm->ilm_zoneid)
+ continue;
+
+ ASSERT(ilm->ilm_ill == ill);
+ return (ilm);
}
- ilm_walker_finish(&ilw);
- return (ilm);
+ return (NULL);
}
/*
* How many members on this ill?
+ * Since each shared-IP zone has a separate ilm for the same group/ill
+ * we can have several.
*/
-int
-ilm_numentries_v6(ill_t *ill, const in6_addr_t *v6group)
+static int
+ilm_numentries(ill_t *ill, const in6_addr_t *v6group)
{
ilm_t *ilm;
int i = 0;
- mutex_enter(&ill->ill_lock);
+ ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock));
for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
- if (ilm->ilm_flags & ILM_DELETED)
- continue;
if (IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group)) {
i++;
}
}
- mutex_exit(&ill->ill_lock);
return (i);
}
/* Caller guarantees that the group is not already on the list */
static ilm_t *
-ilm_add_v6(ipif_t *ipif, const in6_addr_t *v6group, ilg_stat_t ilgstat,
+ilm_add(ill_t *ill, const in6_addr_t *v6group, ilg_stat_t ilgstat,
mcast_record_t ilg_fmode, slist_t *ilg_flist, zoneid_t zoneid)
{
- ill_t *ill = ipif->ipif_ill;
ilm_t *ilm;
ilm_t *ilm_cur;
ilm_t **ilm_ptpn;
- ASSERT(IAM_WRITER_IPIF(ipif));
-
+ ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock));
ilm = GETSTRUCT(ilm_t, 1);
if (ilm == NULL)
return (NULL);
@@ -1596,44 +1514,23 @@ ilm_add_v6(ipif_t *ipif, const in6_addr_t *v6group, ilg_stat_t ilgstat,
ilm->ilm_timer = INFINITY;
ilm->ilm_rtx.rtx_timer = INFINITY;
- /*
- * IPv4 Multicast groups are joined using ipif.
- * IPv6 Multicast groups are joined using ill.
- */
- if (ill->ill_isv6) {
- ilm->ilm_ill = ill;
- ilm->ilm_ipif = NULL;
- DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
- (char *), "ilm", (void *), ilm);
- ill->ill_ilm_cnt++;
- } else {
- ASSERT(ilm->ilm_zoneid == ipif->ipif_zoneid);
- ilm->ilm_ipif = ipif;
- ilm->ilm_ill = NULL;
- DTRACE_PROBE3(ipif__incr__cnt, (ipif_t *), ipif,
- (char *), "ilm", (void *), ilm);
- ipif->ipif_ilm_cnt++;
- }
+ ilm->ilm_ill = ill;
+ DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
+ (char *), "ilm", (void *), ilm);
+ ill->ill_ilm_cnt++;
ASSERT(ill->ill_ipst);
ilm->ilm_ipst = ill->ill_ipst; /* No netstack_hold */
- ASSERT(!(ipif->ipif_state_flags & IPIF_CONDEMNED));
- ASSERT(!(ill->ill_state_flags & ILL_CONDEMNED));
+ /* The ill/ipif could have just been marked as condemned */
/*
- * Grab lock to give consistent view to readers
- */
- mutex_enter(&ill->ill_lock);
- /*
- * All ilms in the same zone are contiguous in the ill_ilm list.
- * The loops in ip_proto_input() and ip_wput_local() use this to avoid
- * sending duplicates up when two applications in the same zone join the
- * same group on different logical interfaces.
+ * To make ill_hasmembers_nextzone_v6 work we keep the list
+ * sorted by zoneid.
*/
ilm_cur = ill->ill_ilm;
ilm_ptpn = &ill->ill_ilm;
- while (ilm_cur != NULL && ilm_cur->ilm_zoneid != ilm->ilm_zoneid) {
+ while (ilm_cur != NULL && ilm_cur->ilm_zoneid < ilm->ilm_zoneid) {
ilm_ptpn = &ilm_cur->ilm_next;
ilm_cur = ilm_cur->ilm_next;
}
@@ -1653,7 +1550,6 @@ ilm_add_v6(ipif_t *ipif, const in6_addr_t *v6group, ilg_stat_t ilgstat,
ilm->ilm_fmode = MODE_IS_EXCLUDE;
}
- mutex_exit(&ill->ill_lock);
return (ilm);
}
@@ -1668,118 +1564,40 @@ ilm_inactive(ilm_t *ilm)
mi_free((char *)ilm);
}
-void
-ilm_walker_cleanup(ill_t *ill)
-{
- ilm_t **ilmp;
- ilm_t *ilm;
- boolean_t need_wakeup = B_FALSE;
-
- ASSERT(MUTEX_HELD(&ill->ill_lock));
- ASSERT(ill->ill_ilm_walker_cnt == 0);
-
- ilmp = &ill->ill_ilm;
- while (*ilmp != NULL) {
- if ((*ilmp)->ilm_flags & ILM_DELETED) {
- ilm = *ilmp;
- *ilmp = ilm->ilm_next;
- /*
- * check if there are any pending FREE or unplumb
- * operations that need to be restarted.
- */
- if (ilm->ilm_ipif != NULL) {
- /*
- * IPv4 ilms hold a ref on the ipif.
- */
- DTRACE_PROBE3(ipif__decr__cnt,
- (ipif_t *), ilm->ilm_ipif,
- (char *), "ilm", (void *), ilm);
- ilm->ilm_ipif->ipif_ilm_cnt--;
- if (IPIF_FREE_OK(ilm->ilm_ipif))
- need_wakeup = B_TRUE;
- } else {
- /*
- * IPv6 ilms hold a ref on the ill.
- */
- ASSERT(ilm->ilm_ill == ill);
- DTRACE_PROBE3(ill__decr__cnt,
- (ill_t *), ill,
- (char *), "ilm", (void *), ilm);
- ASSERT(ill->ill_ilm_cnt > 0);
- ill->ill_ilm_cnt--;
- if (ILL_FREE_OK(ill))
- need_wakeup = B_TRUE;
- }
- ilm_inactive(ilm); /* frees ilm */
- } else {
- ilmp = &(*ilmp)->ilm_next;
- }
- }
- ill->ill_ilm_cleanup_reqd = 0;
- if (need_wakeup)
- ipif_ill_refrele_tail(ill);
- else
- mutex_exit(&ill->ill_lock);
-}
-
/*
* Unlink ilm and free it.
*/
static void
ilm_delete(ilm_t *ilm)
{
- ill_t *ill;
+ ill_t *ill = ilm->ilm_ill;
ilm_t **ilmp;
boolean_t need_wakeup;
-
- if (ilm->ilm_ipif != NULL) {
- ASSERT(IAM_WRITER_IPIF(ilm->ilm_ipif));
- ASSERT(ilm->ilm_ill == NULL);
- ill = ilm->ilm_ipif->ipif_ill;
- ASSERT(!ill->ill_isv6);
- } else {
- ASSERT(IAM_WRITER_ILL(ilm->ilm_ill));
- ASSERT(ilm->ilm_ipif == NULL);
- ill = ilm->ilm_ill;
- ASSERT(ill->ill_isv6);
- }
/*
* Delete under lock protection so that readers don't stumble
* on bad ilm_next
*/
- mutex_enter(&ill->ill_lock);
- if (ill->ill_ilm_walker_cnt != 0) {
- ilm->ilm_flags |= ILM_DELETED;
- ill->ill_ilm_cleanup_reqd = 1;
- mutex_exit(&ill->ill_lock);
- return;
- }
+ ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock));
for (ilmp = &ill->ill_ilm; *ilmp != ilm; ilmp = &(*ilmp)->ilm_next)
- ;
+ ;
+
*ilmp = ilm->ilm_next;
+ mutex_enter(&ill->ill_lock);
/*
- * if we are the last reference to the ipif (for IPv4 ilms)
- * or the ill (for IPv6 ilms), we may need to wakeup any
- * pending FREE or unplumb operations.
+ * if we are the last reference to the ill, we may need to wakeup any
+ * pending FREE or unplumb operations. This is because conn_update_ill
+ * bails if there is a ilg_delete_all in progress.
*/
need_wakeup = B_FALSE;
- if (ilm->ilm_ipif != NULL) {
- DTRACE_PROBE3(ipif__decr__cnt, (ipif_t *), ilm->ilm_ipif,
- (char *), "ilm", (void *), ilm);
- ilm->ilm_ipif->ipif_ilm_cnt--;
- if (IPIF_FREE_OK(ilm->ilm_ipif))
- need_wakeup = B_TRUE;
- } else {
- DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
- (char *), "ilm", (void *), ilm);
- ASSERT(ill->ill_ilm_cnt > 0);
- ill->ill_ilm_cnt--;
- if (ILL_FREE_OK(ill))
- need_wakeup = B_TRUE;
- }
+ DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
+ (char *), "ilm", (void *), ilm);
+ ASSERT(ill->ill_ilm_cnt > 0);
+ ill->ill_ilm_cnt--;
+ if (ILL_FREE_OK(ill))
+ need_wakeup = B_TRUE;
ilm_inactive(ilm); /* frees this ilm */
@@ -1791,185 +1609,103 @@ ilm_delete(ilm_t *ilm)
}
}
-/* Increment the ILM walker count for `ill' */
-static void
-ill_ilm_walker_hold(ill_t *ill)
-{
- mutex_enter(&ill->ill_lock);
- ill->ill_ilm_walker_cnt++;
- mutex_exit(&ill->ill_lock);
-}
-
-/* Decrement the ILM walker count for `ill' */
-static void
-ill_ilm_walker_rele(ill_t *ill)
-{
- mutex_enter(&ill->ill_lock);
- ill->ill_ilm_walker_cnt--;
- if (ill->ill_ilm_walker_cnt == 0 && ill->ill_ilm_cleanup_reqd)
- ilm_walker_cleanup(ill); /* drops ill_lock */
- else
- mutex_exit(&ill->ill_lock);
-}
-
-/*
- * Start walking the ILMs associated with `ill'; the first ILM in the walk
- * (if any) is returned. State associated with the walk is stored in `ilw'.
- * Note that walks associated with interfaces under IPMP also walk the ILMs
- * on the associated IPMP interface; this is handled transparently to callers
- * via ilm_walker_step(). (Usually with IPMP all ILMs will be on the IPMP
- * interface; the only exception is to support IPv6 test addresses, which
- * require ILMs for their associated solicited-node multicast addresses.)
- */
-ilm_t *
-ilm_walker_start(ilm_walker_t *ilw, ill_t *ill)
-{
- ilw->ilw_ill = ill;
- if (IS_UNDER_IPMP(ill))
- ilw->ilw_ipmp_ill = ipmp_ill_hold_ipmp_ill(ill);
- else
- ilw->ilw_ipmp_ill = NULL;
-
- ill_ilm_walker_hold(ill);
- if (ilw->ilw_ipmp_ill != NULL)
- ill_ilm_walker_hold(ilw->ilw_ipmp_ill);
-
- if (ilw->ilw_ipmp_ill != NULL && ilw->ilw_ipmp_ill->ill_ilm != NULL)
- ilw->ilw_walk_ill = ilw->ilw_ipmp_ill;
- else
- ilw->ilw_walk_ill = ilw->ilw_ill;
-
- return (ilm_walker_step(ilw, NULL));
-}
-
/*
- * Helper function for ilm_walker_step() that returns the next ILM
- * associated with `ilw', regardless of whether it's deleted.
+ * Lookup an ill based on the group, ifindex, ifaddr, and zoneid.
+ * Applies to both IPv4 and IPv6, although ifaddr is only used with
+ * IPv4.
+ * Returns an error for IS_UNDER_IPMP and VNI interfaces.
+ * On error it sets *errorp.
*/
-static ilm_t *
-ilm_walker_step_all(ilm_walker_t *ilw, ilm_t *ilm)
+static ill_t *
+ill_mcast_lookup(const in6_addr_t *group, ipaddr_t ifaddr, uint_t ifindex,
+ zoneid_t zoneid, ip_stack_t *ipst, int *errorp)
{
- if (ilm == NULL)
- return (ilw->ilw_walk_ill->ill_ilm);
+ ill_t *ill;
+ ipaddr_t v4group;
- if (ilm->ilm_next != NULL)
- return (ilm->ilm_next);
+ if (IN6_IS_ADDR_V4MAPPED(group)) {
+ IN6_V4MAPPED_TO_IPADDR(group, v4group);
- if (ilw->ilw_ipmp_ill != NULL && IS_IPMP(ilw->ilw_walk_ill)) {
- ilw->ilw_walk_ill = ilw->ilw_ill;
- /*
- * It's possible that ilw_ill left the group during our walk,
- * so we can't ASSERT() that it's under IPMP. Callers that
- * care will be writer on the IPSQ anyway.
- */
- return (ilw->ilw_walk_ill->ill_ilm);
- }
- return (NULL);
-}
+ if (ifindex != 0) {
+ ill = ill_lookup_on_ifindex_zoneid(ifindex, zoneid,
+ B_FALSE, ipst);
+ } else if (ifaddr != INADDR_ANY) {
+ ipif_t *ipif;
-/*
- * Step to the next ILM associated with `ilw'.
- */
-ilm_t *
-ilm_walker_step(ilm_walker_t *ilw, ilm_t *ilm)
-{
- while ((ilm = ilm_walker_step_all(ilw, ilm)) != NULL) {
- if (!(ilm->ilm_flags & ILM_DELETED))
- break;
- }
- return (ilm);
-}
-
-/*
- * Finish the ILM walk associated with `ilw'.
- */
-void
-ilm_walker_finish(ilm_walker_t *ilw)
-{
- ill_ilm_walker_rele(ilw->ilw_ill);
- if (ilw->ilw_ipmp_ill != NULL) {
- ill_ilm_walker_rele(ilw->ilw_ipmp_ill);
- ill_refrele(ilw->ilw_ipmp_ill);
+ ipif = ipif_lookup_addr(ifaddr, NULL, zoneid, ipst);
+ if (ipif == NULL) {
+ ill = NULL;
+ } else {
+ ill = ipif->ipif_ill;
+ ill_refhold(ill);
+ ipif_refrele(ipif);
+ }
+ } else {
+ ill = ill_lookup_group_v4(v4group, zoneid, ipst, NULL,
+ NULL);
+ }
+ } else {
+ if (ifindex != 0) {
+ ill = ill_lookup_on_ifindex_zoneid(ifindex, zoneid,
+ B_TRUE, ipst);
+ } else {
+ ill = ill_lookup_group_v6(group, zoneid, ipst, NULL,
+ NULL);
+ }
}
- bzero(&ilw, sizeof (ilw));
-}
-
-/*
- * Looks up the appropriate ipif given a v4 multicast group and interface
- * address. On success, returns 0, with *ipifpp pointing to the found
- * struct. On failure, returns an errno and *ipifpp is NULL.
- */
-int
-ip_opt_check(conn_t *connp, ipaddr_t group, ipaddr_t src, ipaddr_t ifaddr,
- uint_t *ifindexp, mblk_t *first_mp, ipsq_func_t func, ipif_t **ipifpp)
-{
- ipif_t *ipif;
- int err = 0;
- zoneid_t zoneid;
- ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
-
- if (!CLASSD(group) || CLASSD(src)) {
- return (EINVAL);
+ if (ill == NULL) {
+ if (ifindex != 0)
+ *errorp = ENXIO;
+ else
+ *errorp = EADDRNOTAVAIL;
+ return (NULL);
}
- *ipifpp = NULL;
-
- zoneid = IPCL_ZONEID(connp);
-
- ASSERT(!(ifaddr != INADDR_ANY && ifindexp != NULL && *ifindexp != 0));
- if (ifaddr != INADDR_ANY) {
- ipif = ipif_lookup_addr(ifaddr, NULL, zoneid,
- CONNP_TO_WQ(connp), first_mp, func, &err, ipst);
- if (err != 0 && err != EINPROGRESS)
- err = EADDRNOTAVAIL;
- } else if (ifindexp != NULL && *ifindexp != 0) {
- ipif = ipif_lookup_on_ifindex(*ifindexp, B_FALSE, zoneid,
- CONNP_TO_WQ(connp), first_mp, func, &err, ipst);
- } else {
- ipif = ipif_lookup_group(group, zoneid, ipst);
- if (ipif == NULL)
- return (EADDRNOTAVAIL);
+ /* operation not supported on the virtual network interface */
+ if (IS_UNDER_IPMP(ill) || IS_VNI(ill)) {
+ ill_refrele(ill);
+ *errorp = EINVAL;
+ return (NULL);
}
- if (ipif == NULL)
- return (err);
-
- *ipifpp = ipif;
- return (0);
+ return (ill);
}
/*
- * Looks up the appropriate ill (or ipif if v4mapped) given an interface
- * index and IPv6 multicast group. On success, returns 0, with *illpp (or
- * *ipifpp if v4mapped) pointing to the found struct. On failure, returns
- * an errno and *illpp and *ipifpp are undefined.
+ * Looks up the appropriate ill given an interface index (or interface address)
+ * and multicast group. On success, returns 0, with *illpp pointing to the
+ * found struct. On failure, returns an errno and *illpp is set to NULL.
+ *
+ * Returns an error for IS_UNDER_IPMP and VNI interfaces.
+ *
+ * Handles both IPv4 and IPv6. The ifaddr argument only applies in the
+ * case of IPv4.
*/
int
-ip_opt_check_v6(conn_t *connp, const in6_addr_t *v6group, ipaddr_t *v4group,
- const in6_addr_t *v6src, ipaddr_t *v4src, boolean_t *isv6, int ifindex,
- mblk_t *first_mp, ipsq_func_t func, ill_t **illpp, ipif_t **ipifpp)
+ip_opt_check(conn_t *connp, const in6_addr_t *v6group,
+ const in6_addr_t *v6src, ipaddr_t ifaddr, uint_t ifindex, ill_t **illpp)
{
boolean_t src_unspec;
ill_t *ill = NULL;
- ipif_t *ipif = NULL;
- int err;
- zoneid_t zoneid = connp->conn_zoneid;
- queue_t *wq = CONNP_TO_WQ(connp);
ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
+ int error = 0;
+
+ *illpp = NULL;
src_unspec = IN6_IS_ADDR_UNSPECIFIED(v6src);
if (IN6_IS_ADDR_V4MAPPED(v6group)) {
+ ipaddr_t v4group;
+ ipaddr_t v4src;
+
if (!IN6_IS_ADDR_V4MAPPED(v6src) && !src_unspec)
return (EINVAL);
- IN6_V4MAPPED_TO_IPADDR(v6group, *v4group);
+ IN6_V4MAPPED_TO_IPADDR(v6group, v4group);
if (src_unspec) {
- *v4src = INADDR_ANY;
+ v4src = INADDR_ANY;
} else {
- IN6_V4MAPPED_TO_IPADDR(v6src, *v4src);
+ IN6_V4MAPPED_TO_IPADDR(v6src, v4src);
}
- if (!CLASSD(*v4group) || CLASSD(*v4src))
+ if (!CLASSD(v4group) || CLASSD(v4src))
return (EINVAL);
- *ipifpp = NULL;
- *isv6 = B_FALSE;
} else {
if (IN6_IS_ADDR_V4MAPPED(v6src) && !src_unspec)
return (EINVAL);
@@ -1977,43 +1713,17 @@ ip_opt_check_v6(conn_t *connp, const in6_addr_t *v6group, ipaddr_t *v4group,
IN6_IS_ADDR_MULTICAST(v6src)) {
return (EINVAL);
}
- *illpp = NULL;
- *isv6 = B_TRUE;
}
- if (ifindex == 0) {
- if (*isv6)
- ill = ill_lookup_group_v6(v6group, zoneid, ipst);
- else
- ipif = ipif_lookup_group(*v4group, zoneid, ipst);
- if (ill == NULL && ipif == NULL)
- return (EADDRNOTAVAIL);
- } else {
- if (*isv6) {
- ill = ill_lookup_on_ifindex(ifindex, B_TRUE,
- wq, first_mp, func, &err, ipst);
- if (ill != NULL &&
- !ipif_lookup_zoneid(ill, zoneid, 0, NULL)) {
- ill_refrele(ill);
- ill = NULL;
- err = EADDRNOTAVAIL;
- }
- } else {
- ipif = ipif_lookup_on_ifindex(ifindex, B_FALSE,
- zoneid, wq, first_mp, func, &err, ipst);
- }
- if (ill == NULL && ipif == NULL)
- return (err);
- }
-
- *ipifpp = ipif;
+ ill = ill_mcast_lookup(v6group, ifaddr, ifindex, IPCL_ZONEID(connp),
+ ipst, &error);
*illpp = ill;
- return (0);
+ return (error);
}
static int
ip_get_srcfilter(conn_t *connp, struct group_filter *gf,
- struct ip_msfilter *imsf, ipaddr_t grp, ipif_t *ipif, boolean_t isv4mapped)
+ struct ip_msfilter *imsf, const struct in6_addr *group, boolean_t issin6)
{
ilg_t *ilg;
int i, numsrc, fmode, outsrcs;
@@ -2022,24 +1732,30 @@ ip_get_srcfilter(conn_t *connp, struct group_filter *gf,
struct in_addr *addrp;
slist_t *fp;
boolean_t is_v4only_api;
-
- mutex_enter(&connp->conn_lock);
-
- ilg = ilg_lookup_ipif(connp, grp, ipif);
- if (ilg == NULL) {
- mutex_exit(&connp->conn_lock);
- return (EADDRNOTAVAIL);
- }
+ ipaddr_t ifaddr;
+ uint_t ifindex;
if (gf == NULL) {
ASSERT(imsf != NULL);
- ASSERT(!isv4mapped);
+ ASSERT(!issin6);
is_v4only_api = B_TRUE;
outsrcs = imsf->imsf_numsrc;
+ ifaddr = imsf->imsf_interface.s_addr;
+ ifindex = 0;
} else {
ASSERT(imsf == NULL);
is_v4only_api = B_FALSE;
outsrcs = gf->gf_numsrc;
+ ifaddr = INADDR_ANY;
+ ifindex = gf->gf_interface;
+ }
+
+ /* No need to use ill_mcast_serializer for the reader */
+ rw_enter(&connp->conn_ilg_lock, RW_READER);
+ ilg = ilg_lookup(connp, group, ifaddr, ifindex);
+ if (ilg == NULL) {
+ rw_exit(&connp->conn_ilg_lock);
+ return (EADDRNOTAVAIL);
}
/*
@@ -2055,7 +1771,7 @@ ip_get_srcfilter(conn_t *connp, struct group_filter *gf,
for (i = 0; i < outsrcs; i++) {
if (i == fp->sl_numsrc)
break;
- if (isv4mapped) {
+ if (issin6) {
sin6 = (struct sockaddr_in6 *)&gf->gf_slist[i];
sin6->sin6_family = AF_INET6;
sin6->sin6_addr = fp->sl_addr[i];
@@ -2082,57 +1798,18 @@ ip_get_srcfilter(conn_t *connp, struct group_filter *gf,
gf->gf_fmode = fmode;
}
- mutex_exit(&connp->conn_lock);
-
- return (0);
-}
-
-static int
-ip_get_srcfilter_v6(conn_t *connp, struct group_filter *gf,
- const struct in6_addr *grp, ill_t *ill)
-{
- ilg_t *ilg;
- int i;
- struct sockaddr_storage *sl;
- struct sockaddr_in6 *sin6;
- slist_t *fp;
-
- mutex_enter(&connp->conn_lock);
-
- ilg = ilg_lookup_ill_v6(connp, grp, ill);
- if (ilg == NULL) {
- mutex_exit(&connp->conn_lock);
- return (EADDRNOTAVAIL);
- }
-
- /*
- * In the kernel, we use the state definitions MODE_IS_[IN|EX]CLUDE
- * to identify the filter mode; but the API uses MCAST_[IN|EX]CLUDE.
- * So we need to translate here.
- */
- gf->gf_fmode = (ilg->ilg_fmode == MODE_IS_INCLUDE) ?
- MCAST_INCLUDE : MCAST_EXCLUDE;
- if ((fp = ilg->ilg_filter) == NULL) {
- gf->gf_numsrc = 0;
- } else {
- for (i = 0, sl = gf->gf_slist; i < gf->gf_numsrc; i++, sl++) {
- if (i == fp->sl_numsrc)
- break;
- sin6 = (struct sockaddr_in6 *)sl;
- sin6->sin6_family = AF_INET6;
- sin6->sin6_addr = fp->sl_addr[i];
- }
- gf->gf_numsrc = fp->sl_numsrc;
- }
-
- mutex_exit(&connp->conn_lock);
+ rw_exit(&connp->conn_ilg_lock);
return (0);
}
+/*
+ * Common for IPv4 and IPv6.
+ */
static int
ip_set_srcfilter(conn_t *connp, struct group_filter *gf,
- struct ip_msfilter *imsf, ipaddr_t grp, ipif_t *ipif, boolean_t isv4mapped)
+ struct ip_msfilter *imsf, const struct in6_addr *group, ill_t *ill,
+ boolean_t issin6)
{
ilg_t *ilg;
int i, err, infmode, new_fmode;
@@ -2143,20 +1820,27 @@ ip_set_srcfilter(conn_t *connp, struct group_filter *gf,
slist_t *orig_filter = NULL;
slist_t *new_filter = NULL;
mcast_record_t orig_fmode;
- boolean_t leave_grp, is_v4only_api;
+ boolean_t leave_group, is_v4only_api;
ilg_stat_t ilgstat;
+ ilm_t *ilm;
+ ipaddr_t ifaddr;
+ uint_t ifindex;
if (gf == NULL) {
ASSERT(imsf != NULL);
- ASSERT(!isv4mapped);
+ ASSERT(!issin6);
is_v4only_api = B_TRUE;
insrcs = imsf->imsf_numsrc;
infmode = imsf->imsf_fmode;
+ ifaddr = imsf->imsf_interface.s_addr;
+ ifindex = 0;
} else {
ASSERT(imsf == NULL);
is_v4only_api = B_FALSE;
insrcs = gf->gf_numsrc;
infmode = gf->gf_fmode;
+ ifaddr = INADDR_ANY;
+ ifindex = gf->gf_interface;
}
/* Make sure we can handle the source list */
@@ -2167,32 +1851,52 @@ ip_set_srcfilter(conn_t *connp, struct group_filter *gf,
* setting the filter to (INCLUDE, NULL) is treated
* as a request to leave the group.
*/
- leave_grp = (infmode == MCAST_INCLUDE && insrcs == 0);
-
- ASSERT(IAM_WRITER_IPIF(ipif));
+ leave_group = (infmode == MCAST_INCLUDE && insrcs == 0);
- mutex_enter(&connp->conn_lock);
-
- ilg = ilg_lookup_ipif(connp, grp, ipif);
+ mutex_enter(&ill->ill_mcast_serializer);
+ rw_enter(&connp->conn_ilg_lock, RW_WRITER);
+ ilg = ilg_lookup(connp, group, ifaddr, ifindex);
if (ilg == NULL) {
/*
* if the request was actually to leave, and we
* didn't find an ilg, there's nothing to do.
*/
- if (!leave_grp)
- ilg = conn_ilg_alloc(connp, &err);
- if (leave_grp || ilg == NULL) {
- mutex_exit(&connp->conn_lock);
- return (leave_grp ? 0 : err);
+ if (leave_group) {
+ rw_exit(&connp->conn_ilg_lock);
+ mutex_exit(&ill->ill_mcast_serializer);
+ return (0);
+ }
+ ilg = conn_ilg_alloc(connp, &err);
+ if (ilg == NULL) {
+ rw_exit(&connp->conn_ilg_lock);
+ mutex_exit(&ill->ill_mcast_serializer);
+ return (err);
}
ilgstat = ILGSTAT_NEW;
- IN6_IPADDR_TO_V4MAPPED(grp, &ilg->ilg_v6group);
- ilg->ilg_ipif = ipif;
- ilg->ilg_ill = NULL;
- } else if (leave_grp) {
+ ilg->ilg_v6group = *group;
+ ilg->ilg_ill = ill;
+ ilg->ilg_ifaddr = ifaddr;
+ ilg->ilg_ifindex = ifindex;
+ } else if (leave_group) {
+ /*
+ * Make sure we have the correct serializer. The ill argument
+ * might not match ilg_ill.
+ */
+ ilg_refhold(ilg);
+ mutex_exit(&ill->ill_mcast_serializer);
+ ill = ilg->ilg_ill;
+ rw_exit(&connp->conn_ilg_lock);
+
+ mutex_enter(&ill->ill_mcast_serializer);
+ rw_enter(&connp->conn_ilg_lock, RW_WRITER);
+ ilm = ilg->ilg_ilm;
+ ilg->ilg_ilm = NULL;
ilg_delete(connp, ilg, NULL);
- mutex_exit(&connp->conn_lock);
- (void) ip_delmulti(grp, ipif, B_FALSE, B_TRUE);
+ ilg_refrele(ilg);
+ rw_exit(&connp->conn_ilg_lock);
+ if (ilm != NULL)
+ (void) ip_delmulti_serial(ilm, B_FALSE, B_TRUE);
+ mutex_exit(&ill->ill_mcast_serializer);
return (0);
} else {
ilgstat = ILGSTAT_CHANGE;
@@ -2203,7 +1907,8 @@ ip_set_srcfilter(conn_t *connp, struct group_filter *gf,
} else {
orig_filter = l_alloc_copy(ilg->ilg_filter);
if (orig_filter == NULL) {
- mutex_exit(&connp->conn_lock);
+ rw_exit(&connp->conn_ilg_lock);
+ mutex_exit(&ill->ill_mcast_serializer);
return (ENOMEM);
}
}
@@ -2214,7 +1919,7 @@ ip_set_srcfilter(conn_t *connp, struct group_filter *gf,
* we make any changes, so we can bail if it fails.
*/
if ((new_filter = l_alloc()) == NULL) {
- mutex_exit(&connp->conn_lock);
+ rw_exit(&connp->conn_ilg_lock);
err = ENOMEM;
goto free_and_exit;
}
@@ -2228,7 +1933,7 @@ ip_set_srcfilter(conn_t *connp, struct group_filter *gf,
if (fp == NULL) {
if (ilgstat == ILGSTAT_NEW)
ilg_delete(connp, ilg, NULL);
- mutex_exit(&connp->conn_lock);
+ rw_exit(&connp->conn_ilg_lock);
err = ENOMEM;
goto free_and_exit;
}
@@ -2236,7 +1941,7 @@ ip_set_srcfilter(conn_t *connp, struct group_filter *gf,
fp = ilg->ilg_filter;
}
for (i = 0; i < insrcs; i++) {
- if (isv4mapped) {
+ if (issin6) {
sin6 = (struct sockaddr_in6 *)&gf->gf_slist[i];
fp->sl_addr[i] = sin6->sin6_addr;
} else {
@@ -2263,177 +1968,70 @@ ip_set_srcfilter(conn_t *connp, struct group_filter *gf,
/*
* Save copy of ilg's filter state to pass to other functions,
- * so we can release conn_lock now.
+ * so we can release conn_ilg_lock now.
*/
new_fmode = ilg->ilg_fmode;
l_copy(ilg->ilg_filter, new_filter);
- mutex_exit(&connp->conn_lock);
-
- err = ip_addmulti(grp, ipif, ilgstat, new_fmode, new_filter);
- if (err != 0) {
- /*
- * Restore the original filter state, or delete the
- * newly-created ilg. We need to look up the ilg
- * again, though, since we've not been holding the
- * conn_lock.
- */
- mutex_enter(&connp->conn_lock);
- ilg = ilg_lookup_ipif(connp, grp, ipif);
- ASSERT(ilg != NULL);
- if (ilgstat == ILGSTAT_NEW) {
- ilg_delete(connp, ilg, NULL);
- } else {
- ilg->ilg_fmode = orig_fmode;
- if (SLIST_IS_EMPTY(orig_filter)) {
- CLEAR_SLIST(ilg->ilg_filter);
- } else {
- /*
- * We didn't free the filter, even if we
- * were trying to make the source list empty;
- * so if orig_filter isn't empty, the ilg
- * must still have a filter alloc'd.
- */
- l_copy(orig_filter, ilg->ilg_filter);
- }
- }
- mutex_exit(&connp->conn_lock);
- }
-
-free_and_exit:
- l_free(orig_filter);
- l_free(new_filter);
-
- return (err);
-}
-
-static int
-ip_set_srcfilter_v6(conn_t *connp, struct group_filter *gf,
- const struct in6_addr *grp, ill_t *ill)
-{
- ilg_t *ilg;
- int i, orig_fmode, new_fmode, err;
- slist_t *orig_filter = NULL;
- slist_t *new_filter = NULL;
- struct sockaddr_storage *sl;
- struct sockaddr_in6 *sin6;
- boolean_t leave_grp;
- ilg_stat_t ilgstat;
-
- /* Make sure we can handle the source list */
- if (gf->gf_numsrc > MAX_FILTER_SIZE)
- return (ENOBUFS);
+ rw_exit(&connp->conn_ilg_lock);
/*
- * setting the filter to (INCLUDE, NULL) is treated
- * as a request to leave the group.
+ * Now update the ill. We wait to do this until after the ilg
+ * has been updated because we need to update the src filter
+ * info for the ill, which involves looking at the status of
+ * all the ilgs associated with this group/interface pair.
*/
- leave_grp = (gf->gf_fmode == MCAST_INCLUDE && gf->gf_numsrc == 0);
-
- ASSERT(IAM_WRITER_ILL(ill));
-
- mutex_enter(&connp->conn_lock);
- ilg = ilg_lookup_ill_v6(connp, grp, ill);
- if (ilg == NULL) {
- /*
- * if the request was actually to leave, and we
- * didn't find an ilg, there's nothing to do.
- */
- if (!leave_grp)
- ilg = conn_ilg_alloc(connp, &err);
- if (leave_grp || ilg == NULL) {
- mutex_exit(&connp->conn_lock);
- return (leave_grp ? 0 : err);
- }
- ilgstat = ILGSTAT_NEW;
- ilg->ilg_v6group = *grp;
- ilg->ilg_ipif = NULL;
- ilg->ilg_ill = ill;
- } else if (leave_grp) {
- ilg_delete(connp, ilg, NULL);
- mutex_exit(&connp->conn_lock);
- (void) ip_delmulti_v6(grp, ill, connp->conn_zoneid, B_FALSE,
- B_TRUE);
- return (0);
- } else {
- ilgstat = ILGSTAT_CHANGE;
- /* preserve existing state in case ip_addmulti() fails */
- orig_fmode = ilg->ilg_fmode;
- if (ilg->ilg_filter == NULL) {
- orig_filter = NULL;
- } else {
- orig_filter = l_alloc_copy(ilg->ilg_filter);
- if (orig_filter == NULL) {
- mutex_exit(&connp->conn_lock);
- return (ENOMEM);
- }
- }
- }
+ ilm = ip_addmulti_serial(group, ill, connp->conn_zoneid, ilgstat,
+ new_fmode, new_filter, &err);
+ rw_enter(&connp->conn_ilg_lock, RW_WRITER);
/*
- * Alloc buffer to copy new state into (see below) before
- * we make any changes, so we can bail if it fails.
+ * Must look up the ilg again since we've not been holding
+ * conn_ilg_lock. The ilg could have disappeared due to an unplumb
+ * having called conn_update_ill, which can run once we dropped the
+ * conn_ilg_lock above.
*/
- if ((new_filter = l_alloc()) == NULL) {
- mutex_exit(&connp->conn_lock);
- err = ENOMEM;
+ ilg = ilg_lookup(connp, group, ifaddr, ifindex);
+ if (ilg == NULL) {
+ rw_exit(&connp->conn_ilg_lock);
+ if (ilm != NULL) {
+ (void) ip_delmulti_serial(ilm, B_FALSE,
+ (ilgstat == ILGSTAT_NEW));
+ }
+ err = ENXIO;
goto free_and_exit;
}
- if (gf->gf_numsrc == 0) {
- CLEAR_SLIST(ilg->ilg_filter);
- } else {
- slist_t *fp;
- if (ilg->ilg_filter == NULL) {
- fp = l_alloc();
- if (fp == NULL) {
- if (ilgstat == ILGSTAT_NEW)
- ilg_delete(connp, ilg, NULL);
- mutex_exit(&connp->conn_lock);
- err = ENOMEM;
- goto free_and_exit;
- }
+ if (ilm != NULL) {
+ /* Succeeded. Update the ilg to point at the ilm */
+ if (ilgstat == ILGSTAT_NEW) {
+ ASSERT(ilg->ilg_ilm == NULL);
+ ilg->ilg_ilm = ilm;
+ ilm->ilm_ifaddr = ifaddr; /* For netstat */
} else {
- fp = ilg->ilg_filter;
- }
- for (i = 0, sl = gf->gf_slist; i < gf->gf_numsrc; i++, sl++) {
- sin6 = (struct sockaddr_in6 *)sl;
- fp->sl_addr[i] = sin6->sin6_addr;
+ /*
+ * ip_addmulti didn't get a held ilm for
+ * ILGSTAT_CHANGE; ilm_refcnt was unchanged.
+ */
+ ASSERT(ilg->ilg_ilm == ilm);
}
- fp->sl_numsrc = gf->gf_numsrc;
- ilg->ilg_filter = fp;
- }
- /*
- * In the kernel, we use the state definitions MODE_IS_[IN|EX]CLUDE
- * to identify the filter mode; but the API uses MCAST_[IN|EX]CLUDE.
- * So we need to translate here.
- */
- ilg->ilg_fmode = (gf->gf_fmode == MCAST_INCLUDE) ?
- MODE_IS_INCLUDE : MODE_IS_EXCLUDE;
-
- /*
- * Save copy of ilg's filter state to pass to other functions,
- * so we can release conn_lock now.
- */
- new_fmode = ilg->ilg_fmode;
- l_copy(ilg->ilg_filter, new_filter);
-
- mutex_exit(&connp->conn_lock);
-
- err = ip_addmulti_v6(grp, ill, connp->conn_zoneid, ilgstat, new_fmode,
- new_filter);
- if (err != 0) {
+ } else {
+ ASSERT(err != 0);
/*
+ * Failed to allocate the ilm.
* Restore the original filter state, or delete the
- * newly-created ilg. We need to look up the ilg
- * again, though, since we've not been holding the
- * conn_lock.
+ * newly-created ilg.
+ * If ENETDOWN just clear ill_ilg since so that we
+ * will rejoin when the ill comes back; don't report ENETDOWN
+ * to application.
*/
- mutex_enter(&connp->conn_lock);
- ilg = ilg_lookup_ill_v6(connp, grp, ill);
- ASSERT(ilg != NULL);
if (ilgstat == ILGSTAT_NEW) {
- ilg_delete(connp, ilg, NULL);
+ if (err == ENETDOWN) {
+ ilg->ilg_ill = NULL;
+ err = 0;
+ } else {
+ ilg_delete(connp, ilg, NULL);
+ }
} else {
ilg->ilg_fmode = orig_fmode;
if (SLIST_IS_EMPTY(orig_filter)) {
@@ -2448,10 +2046,11 @@ ip_set_srcfilter_v6(conn_t *connp, struct group_filter *gf,
l_copy(orig_filter, ilg->ilg_filter);
}
}
- mutex_exit(&connp->conn_lock);
}
+ rw_exit(&connp->conn_ilg_lock);
free_and_exit:
+ mutex_exit(&ill->ill_mcast_serializer);
l_free(orig_filter);
l_free(new_filter);
@@ -2475,11 +2074,17 @@ ip_sioctl_msfilter(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
boolean_t isv6, is_v4only_api, getcmd;
struct sockaddr_in *gsin;
struct sockaddr_in6 *gsin6;
- ipaddr_t v4grp;
- in6_addr_t v6grp;
+ ipaddr_t v4group;
+ in6_addr_t v6group;
struct group_filter *gf = NULL;
struct ip_msfilter *imsf = NULL;
mblk_t *ndp;
+ ill_t *ill;
+
+ connp = Q_TO_CONN(q);
+ err = ip_msfilter_ill(connp, mp, ipip, &ill);
+ if (err != 0)
+ return (err);
if (data_mp->b_cont != NULL) {
if ((ndp = msgpullup(data_mp, -1)) == NULL)
@@ -2519,132 +2124,119 @@ ip_sioctl_msfilter(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
if (datalen < expsize)
return (EINVAL);
- connp = Q_TO_CONN(q);
-
- /* operation not supported on the virtual network interface */
- if (IS_VNI(ipif->ipif_ill))
- return (EINVAL);
-
if (isv6) {
- ill_t *ill = ipif->ipif_ill;
- ill_refhold(ill);
-
gsin6 = (struct sockaddr_in6 *)&gf->gf_group;
- v6grp = gsin6->sin6_addr;
- if (getcmd)
- err = ip_get_srcfilter_v6(connp, gf, &v6grp, ill);
- else
- err = ip_set_srcfilter_v6(connp, gf, &v6grp, ill);
-
- ill_refrele(ill);
+ v6group = gsin6->sin6_addr;
+ if (getcmd) {
+ err = ip_get_srcfilter(connp, gf, NULL, &v6group,
+ B_TRUE);
+ } else {
+ err = ip_set_srcfilter(connp, gf, NULL, &v6group, ill,
+ B_TRUE);
+ }
} else {
- boolean_t isv4mapped = B_FALSE;
+ boolean_t issin6 = B_FALSE;
if (is_v4only_api) {
- v4grp = (ipaddr_t)imsf->imsf_multiaddr.s_addr;
+ v4group = (ipaddr_t)imsf->imsf_multiaddr.s_addr;
+ IN6_IPADDR_TO_V4MAPPED(v4group, &v6group);
} else {
if (gf->gf_group.ss_family == AF_INET) {
gsin = (struct sockaddr_in *)&gf->gf_group;
- v4grp = (ipaddr_t)gsin->sin_addr.s_addr;
+ v4group = (ipaddr_t)gsin->sin_addr.s_addr;
+ IN6_IPADDR_TO_V4MAPPED(v4group, &v6group);
} else {
gsin6 = (struct sockaddr_in6 *)&gf->gf_group;
IN6_V4MAPPED_TO_IPADDR(&gsin6->sin6_addr,
- v4grp);
- isv4mapped = B_TRUE;
+ v4group);
+ issin6 = B_TRUE;
}
}
- if (getcmd)
- err = ip_get_srcfilter(connp, gf, imsf, v4grp, ipif,
- isv4mapped);
+ /*
+ * INADDR_ANY is represented as the IPv6 unspecifed addr.
+ */
+ if (v4group == INADDR_ANY)
+ v6group = ipv6_all_zeros;
else
- err = ip_set_srcfilter(connp, gf, imsf, v4grp, ipif,
- isv4mapped);
+ IN6_IPADDR_TO_V4MAPPED(v4group, &v6group);
+
+ if (getcmd) {
+ err = ip_get_srcfilter(connp, gf, imsf, &v6group,
+ issin6);
+ } else {
+ err = ip_set_srcfilter(connp, gf, imsf, &v6group, ill,
+ issin6);
+ }
}
+ ill_refrele(ill);
return (err);
}
/*
- * Finds the ipif based on information in the ioctl headers. Needed to make
- * ip_process_ioctl() happy (it needs to know the ipif for IPI_WR-flagged
- * ioctls prior to calling the ioctl's handler function).
+ * Determine the ill for the SIOC*MSFILTER ioctls
+ *
+ * Returns an error for IS_UNDER_IPMP interfaces.
+ *
+ * Finds the ill based on information in the ioctl headers.
*/
-int
-ip_extract_msfilter(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip,
- cmd_info_t *ci, ipsq_func_t func)
+static int
+ip_msfilter_ill(conn_t *connp, mblk_t *mp, const ip_ioctl_cmd_t *ipip,
+ ill_t **illp)
{
int cmd = ipip->ipi_cmd;
int err = 0;
- conn_t *connp;
- ipif_t *ipif;
+ ill_t *ill;
/* caller has verified this mblk exists */
char *dbuf = (char *)mp->b_cont->b_cont->b_rptr;
struct ip_msfilter *imsf;
struct group_filter *gf;
- ipaddr_t v4addr, v4grp;
- in6_addr_t v6grp;
+ ipaddr_t v4addr, v4group;
+ in6_addr_t v6group;
uint32_t index;
- zoneid_t zoneid;
ip_stack_t *ipst;
- connp = Q_TO_CONN(q);
- zoneid = connp->conn_zoneid;
ipst = connp->conn_netstack->netstack_ip;
+ *illp = NULL;
+
/* don't allow multicast operations on a tcp conn */
if (IPCL_IS_TCP(connp))
return (ENOPROTOOPT);
if (cmd == SIOCSIPMSFILTER || cmd == SIOCGIPMSFILTER) {
/* don't allow v4-specific ioctls on v6 socket */
- if (connp->conn_af_isv6)
+ if (connp->conn_family == AF_INET6)
return (EAFNOSUPPORT);
imsf = (struct ip_msfilter *)dbuf;
v4addr = imsf->imsf_interface.s_addr;
- v4grp = imsf->imsf_multiaddr.s_addr;
- if (v4addr == INADDR_ANY) {
- ipif = ipif_lookup_group(v4grp, zoneid, ipst);
- if (ipif == NULL)
- err = EADDRNOTAVAIL;
- } else {
- ipif = ipif_lookup_addr(v4addr, NULL, zoneid, q, mp,
- func, &err, ipst);
- }
+ v4group = imsf->imsf_multiaddr.s_addr;
+ IN6_IPADDR_TO_V4MAPPED(v4group, &v6group);
+ ill = ill_mcast_lookup(&v6group, v4addr, 0, IPCL_ZONEID(connp),
+ ipst, &err);
+ if (ill == NULL && v4addr != INADDR_ANY)
+ err = ENXIO;
} else {
- boolean_t isv6 = B_FALSE;
gf = (struct group_filter *)dbuf;
index = gf->gf_interface;
if (gf->gf_group.ss_family == AF_INET6) {
struct sockaddr_in6 *sin6;
+
sin6 = (struct sockaddr_in6 *)&gf->gf_group;
- v6grp = sin6->sin6_addr;
- if (IN6_IS_ADDR_V4MAPPED(&v6grp))
- IN6_V4MAPPED_TO_IPADDR(&v6grp, v4grp);
- else
- isv6 = B_TRUE;
+ v6group = sin6->sin6_addr;
} else if (gf->gf_group.ss_family == AF_INET) {
struct sockaddr_in *sin;
+
sin = (struct sockaddr_in *)&gf->gf_group;
- v4grp = sin->sin_addr.s_addr;
+ v4group = sin->sin_addr.s_addr;
+ IN6_IPADDR_TO_V4MAPPED(v4group, &v6group);
} else {
return (EAFNOSUPPORT);
}
- if (index == 0) {
- if (isv6) {
- ipif = ipif_lookup_group_v6(&v6grp, zoneid,
- ipst);
- } else {
- ipif = ipif_lookup_group(v4grp, zoneid, ipst);
- }
- if (ipif == NULL)
- err = EADDRNOTAVAIL;
- } else {
- ipif = ipif_lookup_on_ifindex(index, isv6, zoneid,
- q, mp, func, &err, ipst);
- }
+ ill = ill_mcast_lookup(&v6group, INADDR_ANY, index,
+ IPCL_ZONEID(connp), ipst, &err);
}
-
- ci->ci_ipif = ipif;
+ *illp = ill;
return (err);
}
@@ -2695,6 +2287,7 @@ ip_copyin_msfilter(queue_t *q, mblk_t *mp)
/*
* Handle the following optmgmt:
* IP_ADD_MEMBERSHIP must not have joined already
+ * IPV6_JOIN_GROUP must not have joined already
* MCAST_JOIN_GROUP must not have joined already
* IP_BLOCK_SOURCE must have joined already
* MCAST_BLOCK_SOURCE must have joined already
@@ -2702,91 +2295,15 @@ ip_copyin_msfilter(queue_t *q, mblk_t *mp)
* MCAST_JOIN_SOURCE_GROUP may have joined already
*
* fmode and src parameters may be used to determine which option is
- * being set, as follows (the IP_* and MCAST_* versions of each option
- * are functionally equivalent):
- * opt fmode src
- * IP_ADD_MEMBERSHIP MODE_IS_EXCLUDE INADDR_ANY
- * MCAST_JOIN_GROUP MODE_IS_EXCLUDE INADDR_ANY
- * IP_BLOCK_SOURCE MODE_IS_EXCLUDE v4 addr
- * MCAST_BLOCK_SOURCE MODE_IS_EXCLUDE v4 addr
- * IP_JOIN_SOURCE_GROUP MODE_IS_INCLUDE v4 addr
- * MCAST_JOIN_SOURCE_GROUP MODE_IS_INCLUDE v4 addr
- *
- * Changing the filter mode is not allowed; if a matching ilg already
- * exists and fmode != ilg->ilg_fmode, EINVAL is returned.
- *
- * Verifies that there is a source address of appropriate scope for
- * the group; if not, EADDRNOTAVAIL is returned.
- *
- * The interface to be used may be identified by an address or by an
- * index. A pointer to the index is passed; if it is NULL, use the
- * address, otherwise, use the index.
- */
-int
-ip_opt_add_group(conn_t *connp, boolean_t checkonly, ipaddr_t group,
- ipaddr_t ifaddr, uint_t *ifindexp, mcast_record_t fmode, ipaddr_t src,
- mblk_t *first_mp)
-{
- ipif_t *ipif;
- ipsq_t *ipsq;
- int err = 0;
- ill_t *ill;
-
- err = ip_opt_check(connp, group, src, ifaddr, ifindexp, first_mp,
- ip_restart_optmgmt, &ipif);
- if (err != 0) {
- if (err != EINPROGRESS) {
- ip1dbg(("ip_opt_add_group: no ipif for group 0x%x, "
- "ifaddr 0x%x, ifindex %d\n", ntohl(group),
- ntohl(ifaddr), (ifindexp == NULL) ? 0 : *ifindexp));
- }
- return (err);
- }
- ASSERT(ipif != NULL);
-
- ill = ipif->ipif_ill;
- /* Operation not supported on a virtual network interface */
- if (IS_VNI(ill)) {
- ipif_refrele(ipif);
- return (EINVAL);
- }
-
- if (checkonly) {
- /*
- * do not do operation, just pretend to - new T_CHECK
- * semantics. The error return case above if encountered
- * considered a good enough "check" here.
- */
- ipif_refrele(ipif);
- return (0);
- }
-
- IPSQ_ENTER_IPIF(ipif, connp, first_mp, ip_restart_optmgmt, ipsq,
- NEW_OP);
-
- /* unspecified source addr => no source filtering */
- err = ilg_add(connp, group, ipif, fmode, src);
-
- IPSQ_EXIT(ipsq);
-
- ipif_refrele(ipif);
- return (err);
-}
-
-/*
- * Handle the following optmgmt:
- * IPV6_JOIN_GROUP must not have joined already
- * MCAST_JOIN_GROUP must not have joined already
- * MCAST_BLOCK_SOURCE must have joined already
- * MCAST_JOIN_SOURCE_GROUP may have joined already
- *
- * fmode and src parameters may be used to determine which option is
* being set, as follows (IPV6_JOIN_GROUP and MCAST_JOIN_GROUP options
* are functionally equivalent):
* opt fmode v6src
+ * IP_ADD_MEMBERSHIP MODE_IS_EXCLUDE unspecified
* IPV6_JOIN_GROUP MODE_IS_EXCLUDE unspecified
* MCAST_JOIN_GROUP MODE_IS_EXCLUDE unspecified
+ * IP_BLOCK_SOURCE MODE_IS_EXCLUDE IPv4-mapped addr
* MCAST_BLOCK_SOURCE MODE_IS_EXCLUDE v6 addr
+ * IP_JOIN_SOURCE_GROUP MODE_IS_INCLUDE IPv4-mapped addr
* MCAST_JOIN_SOURCE_GROUP MODE_IS_INCLUDE v6 addr
*
* Changing the filter mode is not allowed; if a matching ilg already
@@ -2795,47 +2312,29 @@ ip_opt_add_group(conn_t *connp, boolean_t checkonly, ipaddr_t group,
* Verifies that there is a source address of appropriate scope for
* the group; if not, EADDRNOTAVAIL is returned.
*
+ * The interface to be used may be identified by an IPv4 address or by an
+ * interface index.
+ *
* Handles IPv4-mapped IPv6 multicast addresses by associating them
- * with the link-local ipif. Assumes that if v6group is v4-mapped,
+ * with the IPv4 address. Assumes that if v6group is v4-mapped,
* v6src is also v4-mapped.
*/
int
-ip_opt_add_group_v6(conn_t *connp, boolean_t checkonly,
- const in6_addr_t *v6group, int ifindex, mcast_record_t fmode,
- const in6_addr_t *v6src, mblk_t *first_mp)
+ip_opt_add_group(conn_t *connp, boolean_t checkonly,
+ const in6_addr_t *v6group, ipaddr_t ifaddr, uint_t ifindex,
+ mcast_record_t fmode, const in6_addr_t *v6src)
{
ill_t *ill;
- ipif_t *ipif;
char buf[INET6_ADDRSTRLEN];
- ipaddr_t v4group, v4src;
- boolean_t isv6;
- ipsq_t *ipsq;
int err;
- err = ip_opt_check_v6(connp, v6group, &v4group, v6src, &v4src, &isv6,
- ifindex, first_mp, ip_restart_optmgmt, &ill, &ipif);
+ err = ip_opt_check(connp, v6group, v6src, ifaddr, ifindex, &ill);
if (err != 0) {
- if (err != EINPROGRESS) {
- ip1dbg(("ip_opt_add_group_v6: no ill for group %s/"
- "index %d\n", inet_ntop(AF_INET6, v6group, buf,
- sizeof (buf)), ifindex));
- }
+ ip1dbg(("ip_opt_add_group: no ill for group %s/"
+ "index %d\n", inet_ntop(AF_INET6, v6group, buf,
+ sizeof (buf)), ifindex));
return (err);
}
- ASSERT((!isv6 && ipif != NULL) || (isv6 && ill != NULL));
-
- /* operation is not supported on the virtual network interface */
- if (isv6) {
- if (IS_VNI(ill)) {
- ill_refrele(ill);
- return (EINVAL);
- }
- } else {
- if (IS_VNI(ipif->ipif_ill)) {
- ipif_refrele(ipif);
- return (EINVAL);
- }
- }
if (checkonly) {
/*
@@ -2843,104 +2342,70 @@ ip_opt_add_group_v6(conn_t *connp, boolean_t checkonly,
* semantics. The error return case above if encountered
* considered a good enough "check" here.
*/
- if (isv6)
- ill_refrele(ill);
- else
- ipif_refrele(ipif);
- return (0);
- }
-
- if (!isv6) {
- IPSQ_ENTER_IPIF(ipif, connp, first_mp, ip_restart_optmgmt,
- ipsq, NEW_OP);
- err = ilg_add(connp, v4group, ipif, fmode, v4src);
- IPSQ_EXIT(ipsq);
- ipif_refrele(ipif);
- } else {
- IPSQ_ENTER_ILL(ill, connp, first_mp, ip_restart_optmgmt,
- ipsq, NEW_OP);
- err = ilg_add_v6(connp, v6group, ill, fmode, v6src);
- IPSQ_EXIT(ipsq);
ill_refrele(ill);
+ return (0);
}
+ mutex_enter(&ill->ill_mcast_serializer);
+ err = ilg_add(connp, v6group, ifaddr, ifindex, ill, fmode, v6src);
+ mutex_exit(&ill->ill_mcast_serializer);
+ ill_refrele(ill);
return (err);
}
+/*
+ * Common for IPv6 and IPv4.
+ * Here we handle ilgs that are still attached to their original ill
+ * (the one ifaddr/ifindex points at), as well as detached ones.
+ * The detached ones might have been attached to some other ill.
+ */
static int
-ip_opt_delete_group_excl(conn_t *connp, ipaddr_t group, ipif_t *ipif,
- mcast_record_t fmode, ipaddr_t src)
+ip_opt_delete_group_excl(conn_t *connp, const in6_addr_t *v6group,
+ ipaddr_t ifaddr, uint_t ifindex, mcast_record_t fmode,
+ const in6_addr_t *v6src)
{
ilg_t *ilg;
- in6_addr_t v6src;
- boolean_t leaving = B_FALSE;
-
- ASSERT(IAM_WRITER_IPIF(ipif));
-
- /*
- * The ilg is valid only while we hold the conn lock. Once we drop
- * the lock, another thread can locate another ilg on this connp,
- * but on a different ipif, and delete it, and cause the ilg array
- * to be reallocated and copied. Hence do the ilg_delete before
- * dropping the lock.
- */
- mutex_enter(&connp->conn_lock);
- ilg = ilg_lookup_ipif(connp, group, ipif);
- if ((ilg == NULL) || (ilg->ilg_flags & ILG_DELETED)) {
- mutex_exit(&connp->conn_lock);
- return (EADDRNOTAVAIL);
- }
+ boolean_t leaving;
+ ilm_t *ilm;
+ ill_t *ill;
+ int err = 0;
- /*
- * Decide if we're actually deleting the ilg or just removing a
- * source filter address; if just removing an addr, make sure we
- * aren't trying to change the filter mode, and that the addr is
- * actually in our filter list already. If we're removing the
- * last src in an include list, just delete the ilg.
- */
- if (src == INADDR_ANY) {
- v6src = ipv6_all_zeros;
- leaving = B_TRUE;
- } else {
- int err = 0;
- IN6_IPADDR_TO_V4MAPPED(src, &v6src);
- if (fmode != ilg->ilg_fmode)
- err = EINVAL;
- else if (ilg->ilg_filter == NULL ||
- !list_has_addr(ilg->ilg_filter, &v6src))
+retry:
+ rw_enter(&connp->conn_ilg_lock, RW_WRITER);
+ ilg = ilg_lookup(connp, v6group, ifaddr, ifindex);
+ if (ilg == NULL) {
+ rw_exit(&connp->conn_ilg_lock);
+ /*
+ * Since we didn't have any ilg we now do the error checks
+ * to determine the best errno.
+ */
+ err = ip_opt_check(connp, v6group, v6src, ifaddr, ifindex,
+ &ill);
+ if (ill != NULL) {
+ /* The only error was a missing ilg for the group */
+ ill_refrele(ill);
err = EADDRNOTAVAIL;
- if (err != 0) {
- mutex_exit(&connp->conn_lock);
- return (err);
- }
- if (fmode == MODE_IS_INCLUDE &&
- ilg->ilg_filter->sl_numsrc == 1) {
- v6src = ipv6_all_zeros;
- leaving = B_TRUE;
}
+ return (err);
}
- ilg_delete(connp, ilg, &v6src);
- mutex_exit(&connp->conn_lock);
-
- (void) ip_delmulti(group, ipif, B_FALSE, leaving);
- return (0);
-}
-
-static int
-ip_opt_delete_group_excl_v6(conn_t *connp, const in6_addr_t *v6group,
- ill_t *ill, mcast_record_t fmode, const in6_addr_t *v6src)
-{
- ilg_t *ilg;
- boolean_t leaving = B_TRUE;
-
- ASSERT(IAM_WRITER_ILL(ill));
-
- mutex_enter(&connp->conn_lock);
- ilg = ilg_lookup_ill_v6(connp, v6group, ill);
- if ((ilg == NULL) || (ilg->ilg_flags & ILG_DELETED)) {
- mutex_exit(&connp->conn_lock);
- return (EADDRNOTAVAIL);
+ /* If the ilg is attached then we serialize using that ill */
+ ill = ilg->ilg_ill;
+ if (ill != NULL) {
+ /* Prevent the ill and ilg from being freed */
+ ill_refhold(ill);
+ ilg_refhold(ilg);
+ rw_exit(&connp->conn_ilg_lock);
+ mutex_enter(&ill->ill_mcast_serializer);
+ rw_enter(&connp->conn_ilg_lock, RW_WRITER);
+ if (ilg->ilg_condemned) {
+ /* Disappeared */
+ ilg_refrele(ilg);
+ rw_exit(&connp->conn_ilg_lock);
+ mutex_exit(&ill->ill_mcast_serializer);
+ ill_refrele(ill);
+ goto retry;
+ }
}
/*
@@ -2950,198 +2415,107 @@ ip_opt_delete_group_excl_v6(conn_t *connp, const in6_addr_t *v6group,
* actually in our filter list already. If we're removing the
* last src in an include list, just delete the ilg.
*/
- if (!IN6_IS_ADDR_UNSPECIFIED(v6src)) {
- int err = 0;
+ if (IN6_IS_ADDR_UNSPECIFIED(v6src)) {
+ leaving = B_TRUE;
+ } else {
if (fmode != ilg->ilg_fmode)
err = EINVAL;
else if (ilg->ilg_filter == NULL ||
!list_has_addr(ilg->ilg_filter, v6src))
err = EADDRNOTAVAIL;
if (err != 0) {
- mutex_exit(&connp->conn_lock);
- return (err);
+ if (ill != NULL)
+ ilg_refrele(ilg);
+ rw_exit(&connp->conn_ilg_lock);
+ goto done;
}
if (fmode == MODE_IS_INCLUDE &&
- ilg->ilg_filter->sl_numsrc == 1)
+ ilg->ilg_filter->sl_numsrc == 1) {
+ leaving = B_TRUE;
v6src = NULL;
- else
+ } else {
leaving = B_FALSE;
+ }
}
+ ilm = ilg->ilg_ilm;
+ if (leaving)
+ ilg->ilg_ilm = NULL;
ilg_delete(connp, ilg, v6src);
- mutex_exit(&connp->conn_lock);
- (void) ip_delmulti_v6(v6group, ill, connp->conn_zoneid, B_FALSE,
- leaving);
-
- return (0);
-}
-
-/*
- * Handle the following optmgmt:
- * IP_DROP_MEMBERSHIP will leave
- * MCAST_LEAVE_GROUP will leave
- * IP_UNBLOCK_SOURCE will not leave
- * MCAST_UNBLOCK_SOURCE will not leave
- * IP_LEAVE_SOURCE_GROUP may leave (if leaving last source)
- * MCAST_LEAVE_SOURCE_GROUP may leave (if leaving last source)
- *
- * fmode and src parameters may be used to determine which option is
- * being set, as follows (the IP_* and MCAST_* versions of each option
- * are functionally equivalent):
- * opt fmode src
- * IP_DROP_MEMBERSHIP MODE_IS_INCLUDE INADDR_ANY
- * MCAST_LEAVE_GROUP MODE_IS_INCLUDE INADDR_ANY
- * IP_UNBLOCK_SOURCE MODE_IS_EXCLUDE v4 addr
- * MCAST_UNBLOCK_SOURCE MODE_IS_EXCLUDE v4 addr
- * IP_LEAVE_SOURCE_GROUP MODE_IS_INCLUDE v4 addr
- * MCAST_LEAVE_SOURCE_GROUP MODE_IS_INCLUDE v4 addr
- *
- * Changing the filter mode is not allowed; if a matching ilg already
- * exists and fmode != ilg->ilg_fmode, EINVAL is returned.
- *
- * The interface to be used may be identified by an address or by an
- * index. A pointer to the index is passed; if it is NULL, use the
- * address, otherwise, use the index.
- */
-int
-ip_opt_delete_group(conn_t *connp, boolean_t checkonly, ipaddr_t group,
- ipaddr_t ifaddr, uint_t *ifindexp, mcast_record_t fmode, ipaddr_t src,
- mblk_t *first_mp)
-{
- ipif_t *ipif;
- ipsq_t *ipsq;
- int err;
- ill_t *ill;
-
- err = ip_opt_check(connp, group, src, ifaddr, ifindexp, first_mp,
- ip_restart_optmgmt, &ipif);
- if (err != 0) {
- if (err != EINPROGRESS) {
- ip1dbg(("ip_opt_delete_group: no ipif for group "
- "0x%x, ifaddr 0x%x\n",
- (int)ntohl(group), (int)ntohl(ifaddr)));
- }
- return (err);
- }
- ASSERT(ipif != NULL);
+ if (ill != NULL)
+ ilg_refrele(ilg);
+ rw_exit(&connp->conn_ilg_lock);
- ill = ipif->ipif_ill;
- /* Operation not supported on a virtual network interface */
- if (IS_VNI(ill)) {
- ipif_refrele(ipif);
- return (EINVAL);
+ if (ilm != NULL) {
+ ASSERT(ill != NULL);
+ (void) ip_delmulti_serial(ilm, B_FALSE, leaving);
}
-
- if (checkonly) {
- /*
- * do not do operation, just pretend to - new T_CHECK
- * semantics. The error return case above if encountered
- * considered a good enough "check" here.
- */
- ipif_refrele(ipif);
- return (0);
+done:
+ if (ill != NULL) {
+ mutex_exit(&ill->ill_mcast_serializer);
+ ill_refrele(ill);
}
-
- IPSQ_ENTER_IPIF(ipif, connp, first_mp, ip_restart_optmgmt, ipsq,
- NEW_OP);
- err = ip_opt_delete_group_excl(connp, group, ipif, fmode, src);
- IPSQ_EXIT(ipsq);
-
- ipif_refrele(ipif);
return (err);
}
/*
* Handle the following optmgmt:
+ * IP_DROP_MEMBERSHIP will leave
* IPV6_LEAVE_GROUP will leave
* MCAST_LEAVE_GROUP will leave
+ * IP_UNBLOCK_SOURCE will not leave
* MCAST_UNBLOCK_SOURCE will not leave
+ * IP_LEAVE_SOURCE_GROUP may leave (if leaving last source)
* MCAST_LEAVE_SOURCE_GROUP may leave (if leaving last source)
*
* fmode and src parameters may be used to determine which option is
- * being set, as follows (IPV6_LEAVE_GROUP and MCAST_LEAVE_GROUP options
- * are functionally equivalent):
+ * being set, as follows:
* opt fmode v6src
+ * IP_DROP_MEMBERSHIP MODE_IS_INCLUDE unspecified
* IPV6_LEAVE_GROUP MODE_IS_INCLUDE unspecified
* MCAST_LEAVE_GROUP MODE_IS_INCLUDE unspecified
+ * IP_UNBLOCK_SOURCE MODE_IS_EXCLUDE IPv4-mapped addr
* MCAST_UNBLOCK_SOURCE MODE_IS_EXCLUDE v6 addr
+ * IP_LEAVE_SOURCE_GROUP MODE_IS_INCLUDE IPv4-mapped addr
* MCAST_LEAVE_SOURCE_GROUP MODE_IS_INCLUDE v6 addr
*
* Changing the filter mode is not allowed; if a matching ilg already
* exists and fmode != ilg->ilg_fmode, EINVAL is returned.
*
+ * The interface to be used may be identified by an IPv4 address or by an
+ * interface index.
+ *
* Handles IPv4-mapped IPv6 multicast addresses by associating them
- * with the link-local ipif. Assumes that if v6group is v4-mapped,
+ * with the IPv4 address. Assumes that if v6group is v4-mapped,
* v6src is also v4-mapped.
*/
int
-ip_opt_delete_group_v6(conn_t *connp, boolean_t checkonly,
- const in6_addr_t *v6group, int ifindex, mcast_record_t fmode,
- const in6_addr_t *v6src, mblk_t *first_mp)
+ip_opt_delete_group(conn_t *connp, boolean_t checkonly,
+ const in6_addr_t *v6group, ipaddr_t ifaddr, uint_t ifindex,
+ mcast_record_t fmode, const in6_addr_t *v6src)
{
- ill_t *ill;
- ipif_t *ipif;
- char buf[INET6_ADDRSTRLEN];
- ipaddr_t v4group, v4src;
- boolean_t isv6;
- ipsq_t *ipsq;
- int err;
-
- err = ip_opt_check_v6(connp, v6group, &v4group, v6src, &v4src, &isv6,
- ifindex, first_mp, ip_restart_optmgmt, &ill, &ipif);
- if (err != 0) {
- if (err != EINPROGRESS) {
- ip1dbg(("ip_opt_delete_group_v6: no ill for group %s/"
- "index %d\n", inet_ntop(AF_INET6, v6group, buf,
- sizeof (buf)), ifindex));
- }
- return (err);
- }
- ASSERT((isv6 && ill != NULL) || (!isv6 && ipif != NULL));
-
- /* operation is not supported on the virtual network interface */
- if (isv6) {
- if (IS_VNI(ill)) {
- ill_refrele(ill);
- return (EINVAL);
- }
- } else {
- if (IS_VNI(ipif->ipif_ill)) {
- ipif_refrele(ipif);
- return (EINVAL);
- }
- }
+ /*
+ * In the normal case below we don't check for the ill existing.
+ * Instead we look for an existing ilg in _excl.
+ * If checkonly we sanity check the arguments
+ */
if (checkonly) {
+ ill_t *ill;
+ int err;
+
+ err = ip_opt_check(connp, v6group, v6src, ifaddr, ifindex,
+ &ill);
/*
- * do not do operation, just pretend to - new T_CHECK
- * semantics. The error return case above if encountered
- * considered a good enough "check" here.
+ * do not do operation, just pretend to - new T_CHECK semantics.
+ * ip_opt_check is considered a good enough "check" here.
*/
- if (isv6)
+ if (ill != NULL)
ill_refrele(ill);
- else
- ipif_refrele(ipif);
- return (0);
- }
-
- if (!isv6) {
- IPSQ_ENTER_IPIF(ipif, connp, first_mp, ip_restart_optmgmt,
- ipsq, NEW_OP);
- err = ip_opt_delete_group_excl(connp, v4group, ipif, fmode,
- v4src);
- IPSQ_EXIT(ipsq);
- ipif_refrele(ipif);
- } else {
- IPSQ_ENTER_ILL(ill, connp, first_mp, ip_restart_optmgmt,
- ipsq, NEW_OP);
- err = ip_opt_delete_group_excl_v6(connp, v6group, ill, fmode,
- v6src);
- IPSQ_EXIT(ipsq);
- ill_refrele(ill);
+ return (err);
}
-
- return (err);
+ return (ip_opt_delete_group_excl(connp, v6group, ifaddr, ifindex,
+ fmode, v6src));
}
/*
@@ -3155,185 +2529,26 @@ ip_opt_delete_group_v6(conn_t *connp, boolean_t checkonly,
/*
* Add a group to an upper conn group data structure and pass things down
* to the interface multicast list (and DLPI)
+ * Common for IPv4 and IPv6; for IPv4 we can have an ifaddr.
*/
static int
-ilg_add(conn_t *connp, ipaddr_t group, ipif_t *ipif, mcast_record_t fmode,
- ipaddr_t src)
-{
- int error = 0;
- ill_t *ill;
- ilg_t *ilg;
- ilg_stat_t ilgstat;
- slist_t *new_filter = NULL;
- int new_fmode;
-
- ASSERT(IAM_WRITER_IPIF(ipif));
-
- ill = ipif->ipif_ill;
-
- if (!(ill->ill_flags & ILLF_MULTICAST))
- return (EADDRNOTAVAIL);
-
- /*
- * conn_ilg[] is protected by conn_lock. Need to hold the conn_lock
- * to walk the conn_ilg[] list in ilg_lookup_ipif(); also needed to
- * serialize 2 threads doing join (sock, group1, hme0:0) and
- * (sock, group2, hme1:0) where hme0 and hme1 map to different ipsqs,
- * but both operations happen on the same conn.
- */
- mutex_enter(&connp->conn_lock);
- ilg = ilg_lookup_ipif(connp, group, ipif);
-
- /*
- * Depending on the option we're handling, may or may not be okay
- * if group has already been added. Figure out our rules based
- * on fmode and src params. Also make sure there's enough room
- * in the filter if we're adding a source to an existing filter.
- */
- if (src == INADDR_ANY) {
- /* we're joining for all sources, must not have joined */
- if (ilg != NULL)
- error = EADDRINUSE;
- } else {
- if (fmode == MODE_IS_EXCLUDE) {
- /* (excl {addr}) => block source, must have joined */
- if (ilg == NULL)
- error = EADDRNOTAVAIL;
- }
- /* (incl {addr}) => join source, may have joined */
-
- if (ilg != NULL &&
- SLIST_CNT(ilg->ilg_filter) == MAX_FILTER_SIZE)
- error = ENOBUFS;
- }
- if (error != 0) {
- mutex_exit(&connp->conn_lock);
- return (error);
- }
-
- ASSERT(!(ipif->ipif_state_flags & IPIF_CONDEMNED));
-
- /*
- * Alloc buffer to copy new state into (see below) before
- * we make any changes, so we can bail if it fails.
- */
- if ((new_filter = l_alloc()) == NULL) {
- mutex_exit(&connp->conn_lock);
- return (ENOMEM);
- }
-
- if (ilg == NULL) {
- ilgstat = ILGSTAT_NEW;
- if ((ilg = conn_ilg_alloc(connp, &error)) == NULL) {
- mutex_exit(&connp->conn_lock);
- l_free(new_filter);
- return (error);
- }
- if (src != INADDR_ANY) {
- ilg->ilg_filter = l_alloc();
- if (ilg->ilg_filter == NULL) {
- ilg_delete(connp, ilg, NULL);
- mutex_exit(&connp->conn_lock);
- l_free(new_filter);
- return (ENOMEM);
- }
- ilg->ilg_filter->sl_numsrc = 1;
- IN6_IPADDR_TO_V4MAPPED(src,
- &ilg->ilg_filter->sl_addr[0]);
- }
- if (group == INADDR_ANY) {
- ilg->ilg_v6group = ipv6_all_zeros;
- } else {
- IN6_IPADDR_TO_V4MAPPED(group, &ilg->ilg_v6group);
- }
- ilg->ilg_ipif = ipif;
- ilg->ilg_ill = NULL;
- ilg->ilg_fmode = fmode;
- } else {
- int index;
- in6_addr_t v6src;
- ilgstat = ILGSTAT_CHANGE;
- if (ilg->ilg_fmode != fmode || src == INADDR_ANY) {
- mutex_exit(&connp->conn_lock);
- l_free(new_filter);
- return (EINVAL);
- }
- if (ilg->ilg_filter == NULL) {
- ilg->ilg_filter = l_alloc();
- if (ilg->ilg_filter == NULL) {
- mutex_exit(&connp->conn_lock);
- l_free(new_filter);
- return (ENOMEM);
- }
- }
- IN6_IPADDR_TO_V4MAPPED(src, &v6src);
- if (list_has_addr(ilg->ilg_filter, &v6src)) {
- mutex_exit(&connp->conn_lock);
- l_free(new_filter);
- return (EADDRNOTAVAIL);
- }
- index = ilg->ilg_filter->sl_numsrc++;
- ilg->ilg_filter->sl_addr[index] = v6src;
- }
-
- /*
- * Save copy of ilg's filter state to pass to other functions,
- * so we can release conn_lock now.
- */
- new_fmode = ilg->ilg_fmode;
- l_copy(ilg->ilg_filter, new_filter);
-
- mutex_exit(&connp->conn_lock);
-
- error = ip_addmulti(group, ipif, ilgstat, new_fmode, new_filter);
- if (error != 0) {
- /*
- * Need to undo what we did before calling ip_addmulti()!
- * Must look up the ilg again since we've not been holding
- * conn_lock.
- */
- in6_addr_t v6src;
- if (ilgstat == ILGSTAT_NEW)
- v6src = ipv6_all_zeros;
- else
- IN6_IPADDR_TO_V4MAPPED(src, &v6src);
- mutex_enter(&connp->conn_lock);
- ilg = ilg_lookup_ipif(connp, group, ipif);
- ASSERT(ilg != NULL);
- ilg_delete(connp, ilg, &v6src);
- mutex_exit(&connp->conn_lock);
- l_free(new_filter);
- return (error);
- }
-
- l_free(new_filter);
- return (0);
-}
-
-static int
-ilg_add_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill,
- mcast_record_t fmode, const in6_addr_t *v6src)
+ilg_add(conn_t *connp, const in6_addr_t *v6group, ipaddr_t ifaddr,
+ uint_t ifindex, ill_t *ill, mcast_record_t fmode, const in6_addr_t *v6src)
{
int error = 0;
ilg_t *ilg;
ilg_stat_t ilgstat;
slist_t *new_filter = NULL;
int new_fmode;
-
- ASSERT(IAM_WRITER_ILL(ill));
+ ilm_t *ilm;
if (!(ill->ill_flags & ILLF_MULTICAST))
return (EADDRNOTAVAIL);
- /*
- * conn_lock protects the ilg list. Serializes 2 threads doing
- * join (sock, group1, hme0) and (sock, group2, hme1) where hme0
- * and hme1 map to different ipsq's, but both operations happen
- * on the same conn.
- */
- mutex_enter(&connp->conn_lock);
-
- ilg = ilg_lookup_ill_v6(connp, v6group, ill);
+ /* conn_ilg_lock protects the ilg list. */
+ ASSERT(MUTEX_HELD(&ill->ill_mcast_serializer));
+ rw_enter(&connp->conn_ilg_lock, RW_WRITER);
+ ilg = ilg_lookup(connp, v6group, ifaddr, ifindex);
/*
* Depending on the option we're handling, may or may not be okay
@@ -3358,7 +2573,7 @@ ilg_add_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill,
error = ENOBUFS;
}
if (error != 0) {
- mutex_exit(&connp->conn_lock);
+ rw_exit(&connp->conn_ilg_lock);
return (error);
}
@@ -3367,21 +2582,23 @@ ilg_add_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill,
* we make any changes, so we can bail if it fails.
*/
if ((new_filter = l_alloc()) == NULL) {
- mutex_exit(&connp->conn_lock);
+ rw_exit(&connp->conn_ilg_lock);
return (ENOMEM);
}
if (ilg == NULL) {
if ((ilg = conn_ilg_alloc(connp, &error)) == NULL) {
- mutex_exit(&connp->conn_lock);
+ rw_exit(&connp->conn_ilg_lock);
l_free(new_filter);
return (error);
}
+ ilg->ilg_ifindex = ifindex;
+ ilg->ilg_ifaddr = ifaddr;
if (!IN6_IS_ADDR_UNSPECIFIED(v6src)) {
ilg->ilg_filter = l_alloc();
if (ilg->ilg_filter == NULL) {
ilg_delete(connp, ilg, NULL);
- mutex_exit(&connp->conn_lock);
+ rw_exit(&connp->conn_ilg_lock);
l_free(new_filter);
return (ENOMEM);
}
@@ -3391,25 +2608,24 @@ ilg_add_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill,
ilgstat = ILGSTAT_NEW;
ilg->ilg_v6group = *v6group;
ilg->ilg_fmode = fmode;
- ilg->ilg_ipif = NULL;
ilg->ilg_ill = ill;
} else {
int index;
if (ilg->ilg_fmode != fmode || IN6_IS_ADDR_UNSPECIFIED(v6src)) {
- mutex_exit(&connp->conn_lock);
+ rw_exit(&connp->conn_ilg_lock);
l_free(new_filter);
return (EINVAL);
}
if (ilg->ilg_filter == NULL) {
ilg->ilg_filter = l_alloc();
if (ilg->ilg_filter == NULL) {
- mutex_exit(&connp->conn_lock);
+ rw_exit(&connp->conn_ilg_lock);
l_free(new_filter);
return (ENOMEM);
}
}
if (list_has_addr(ilg->ilg_filter, v6src)) {
- mutex_exit(&connp->conn_lock);
+ rw_exit(&connp->conn_ilg_lock);
l_free(new_filter);
return (EADDRNOTAVAIL);
}
@@ -3420,12 +2636,12 @@ ilg_add_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill,
/*
* Save copy of ilg's filter state to pass to other functions,
- * so we can release conn_lock now.
+ * so we can release conn_ilg_lock now.
*/
new_fmode = ilg->ilg_fmode;
l_copy(ilg->ilg_filter, new_filter);
- mutex_exit(&connp->conn_lock);
+ rw_exit(&connp->conn_ilg_lock);
/*
* Now update the ill. We wait to do this until after the ilg
@@ -3433,72 +2649,105 @@ ilg_add_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill,
* info for the ill, which involves looking at the status of
* all the ilgs associated with this group/interface pair.
*/
- error = ip_addmulti_v6(v6group, ill, connp->conn_zoneid, ilgstat,
- new_fmode, new_filter);
- if (error != 0) {
+ ilm = ip_addmulti_serial(v6group, ill, connp->conn_zoneid, ilgstat,
+ new_fmode, new_filter, &error);
+
+ rw_enter(&connp->conn_ilg_lock, RW_WRITER);
+ /*
+ * Must look up the ilg again since we've not been holding
+ * conn_ilg_lock. The ilg could have disappeared due to an unplumb
+ * having called conn_update_ill, which can run once we dropped the
+ * conn_ilg_lock above.
+ */
+ ilg = ilg_lookup(connp, v6group, ifaddr, ifindex);
+ if (ilg == NULL) {
+ rw_exit(&connp->conn_ilg_lock);
+ if (ilm != NULL) {
+ (void) ip_delmulti_serial(ilm, B_FALSE,
+ (ilgstat == ILGSTAT_NEW));
+ }
+ error = ENXIO;
+ goto free_and_exit;
+ }
+
+ if (ilm != NULL) {
+ /* Succeeded. Update the ilg to point at the ilm */
+ if (ilgstat == ILGSTAT_NEW) {
+ ASSERT(ilg->ilg_ilm == NULL);
+ ilg->ilg_ilm = ilm;
+ ilm->ilm_ifaddr = ifaddr; /* For netstat */
+ } else {
+ /*
+ * ip_addmulti didn't get a held ilm for
+ * ILGSTAT_CHANGE; ilm_refcnt was unchanged.
+ */
+ ASSERT(ilg->ilg_ilm == ilm);
+ }
+ } else {
+ ASSERT(error != 0);
/*
- * But because we waited, we have to undo the ilg update
- * if ip_addmulti_v6() fails. We also must lookup ilg
- * again, since we've not been holding conn_lock.
+ * Failed to allocate the ilm.
+ * Need to undo what we did before calling ip_addmulti()
+		 * If ENETDOWN just clear ilg_ill so that we
+		 * will rejoin when the ill comes back; don't report ENETDOWN
+		 * to the application.
*/
- in6_addr_t delsrc =
- (ilgstat == ILGSTAT_NEW) ? ipv6_all_zeros : *v6src;
- mutex_enter(&connp->conn_lock);
- ilg = ilg_lookup_ill_v6(connp, v6group, ill);
- ASSERT(ilg != NULL);
- ilg_delete(connp, ilg, &delsrc);
- mutex_exit(&connp->conn_lock);
- l_free(new_filter);
- return (error);
+ if (ilgstat == ILGSTAT_NEW && error == ENETDOWN) {
+ ilg->ilg_ill = NULL;
+ error = 0;
+ } else {
+ in6_addr_t delsrc =
+ (ilgstat == ILGSTAT_NEW) ? ipv6_all_zeros : *v6src;
+
+ ilg_delete(connp, ilg, &delsrc);
+ }
}
+ rw_exit(&connp->conn_ilg_lock);
+free_and_exit:
l_free(new_filter);
-
- return (0);
+ return (error);
}
/*
- * Find an IPv4 ilg matching group, ill and source
+ * Find an IPv4 ilg matching group, ill and source.
+ * The group and source can't be INADDR_ANY here so no need to translate to
+ * the unspecified IPv6 address.
*/
-ilg_t *
-ilg_lookup_ill_withsrc(conn_t *connp, ipaddr_t group, ipaddr_t src, ill_t *ill)
+boolean_t
+conn_hasmembers_ill_withsrc_v4(conn_t *connp, ipaddr_t group, ipaddr_t src,
+ ill_t *ill)
{
in6_addr_t v6group, v6src;
int i;
boolean_t isinlist;
ilg_t *ilg;
- ipif_t *ipif;
- ill_t *ilg_ill;
-
- ASSERT(MUTEX_HELD(&connp->conn_lock));
- /*
- * INADDR_ANY is represented as the IPv6 unspecified addr.
- */
- if (group == INADDR_ANY)
- v6group = ipv6_all_zeros;
- else
- IN6_IPADDR_TO_V4MAPPED(group, &v6group);
+ rw_enter(&connp->conn_ilg_lock, RW_READER);
+ IN6_IPADDR_TO_V4MAPPED(group, &v6group);
+ for (ilg = connp->conn_ilg; ilg != NULL; ilg = ilg->ilg_next) {
+ if (ilg->ilg_condemned)
+ continue;
- for (i = 0; i < connp->conn_ilg_inuse; i++) {
- ilg = &connp->conn_ilg[i];
- if ((ipif = ilg->ilg_ipif) == NULL ||
- (ilg->ilg_flags & ILG_DELETED) != 0)
+ /* ilg_ill could be NULL if an add is in progress */
+ if (ilg->ilg_ill != ill)
continue;
- ASSERT(ilg->ilg_ill == NULL);
- ilg_ill = ipif->ipif_ill;
- ASSERT(!ilg_ill->ill_isv6);
- if (IS_ON_SAME_LAN(ilg_ill, ill) &&
- IN6_ARE_ADDR_EQUAL(&ilg->ilg_v6group, &v6group)) {
+
+ /* The callers use upper ill for IPMP */
+ ASSERT(!IS_UNDER_IPMP(ill));
+ if (IN6_ARE_ADDR_EQUAL(&ilg->ilg_v6group, &v6group)) {
if (SLIST_IS_EMPTY(ilg->ilg_filter)) {
/* no source filter, so this is a match */
- return (ilg);
+ rw_exit(&connp->conn_ilg_lock);
+ return (B_TRUE);
}
break;
}
}
- if (i == connp->conn_ilg_inuse)
- return (NULL);
+ if (ilg == NULL) {
+ rw_exit(&connp->conn_ilg_lock);
+ return (B_FALSE);
+ }
/*
* we have an ilg with matching ill and group; but
@@ -3514,44 +2763,49 @@ ilg_lookup_ill_withsrc(conn_t *connp, ipaddr_t group, ipaddr_t src, ill_t *ill)
}
if ((isinlist && ilg->ilg_fmode == MODE_IS_INCLUDE) ||
- (!isinlist && ilg->ilg_fmode == MODE_IS_EXCLUDE))
- return (ilg);
-
- return (NULL);
+ (!isinlist && ilg->ilg_fmode == MODE_IS_EXCLUDE)) {
+ rw_exit(&connp->conn_ilg_lock);
+ return (B_TRUE);
+ }
+ rw_exit(&connp->conn_ilg_lock);
+ return (B_FALSE);
}
/*
* Find an IPv6 ilg matching group, ill, and source
*/
-ilg_t *
-ilg_lookup_ill_withsrc_v6(conn_t *connp, const in6_addr_t *v6group,
+boolean_t
+conn_hasmembers_ill_withsrc_v6(conn_t *connp, const in6_addr_t *v6group,
const in6_addr_t *v6src, ill_t *ill)
{
int i;
boolean_t isinlist;
ilg_t *ilg;
- ill_t *ilg_ill;
- ASSERT(MUTEX_HELD(&connp->conn_lock));
+ rw_enter(&connp->conn_ilg_lock, RW_READER);
+ for (ilg = connp->conn_ilg; ilg != NULL; ilg = ilg->ilg_next) {
+ if (ilg->ilg_condemned)
+ continue;
- for (i = 0; i < connp->conn_ilg_inuse; i++) {
- ilg = &connp->conn_ilg[i];
- if ((ilg_ill = ilg->ilg_ill) == NULL ||
- (ilg->ilg_flags & ILG_DELETED) != 0)
+ /* ilg_ill could be NULL if an add is in progress */
+ if (ilg->ilg_ill != ill)
continue;
- ASSERT(ilg->ilg_ipif == NULL);
- ASSERT(ilg_ill->ill_isv6);
- if (IS_ON_SAME_LAN(ilg_ill, ill) &&
- IN6_ARE_ADDR_EQUAL(&ilg->ilg_v6group, v6group)) {
+
+ /* The callers use upper ill for IPMP */
+ ASSERT(!IS_UNDER_IPMP(ill));
+ if (IN6_ARE_ADDR_EQUAL(&ilg->ilg_v6group, v6group)) {
if (SLIST_IS_EMPTY(ilg->ilg_filter)) {
/* no source filter, so this is a match */
- return (ilg);
+ rw_exit(&connp->conn_ilg_lock);
+ return (B_TRUE);
}
break;
}
}
- if (i == connp->conn_ilg_inuse)
- return (NULL);
+ if (ilg == NULL) {
+ rw_exit(&connp->conn_ilg_lock);
+ return (B_FALSE);
+ }
/*
* we have an ilg with matching ill and group; but
@@ -3566,61 +2820,34 @@ ilg_lookup_ill_withsrc_v6(conn_t *connp, const in6_addr_t *v6group,
}
if ((isinlist && ilg->ilg_fmode == MODE_IS_INCLUDE) ||
- (!isinlist && ilg->ilg_fmode == MODE_IS_EXCLUDE))
- return (ilg);
-
- return (NULL);
-}
-
-/*
- * Find an IPv6 ilg matching group and ill
- */
-ilg_t *
-ilg_lookup_ill_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill)
-{
- ilg_t *ilg;
- int i;
- ill_t *mem_ill;
-
- ASSERT(MUTEX_HELD(&connp->conn_lock));
-
- for (i = 0; i < connp->conn_ilg_inuse; i++) {
- ilg = &connp->conn_ilg[i];
- if ((mem_ill = ilg->ilg_ill) == NULL ||
- (ilg->ilg_flags & ILG_DELETED) != 0)
- continue;
- ASSERT(ilg->ilg_ipif == NULL);
- ASSERT(mem_ill->ill_isv6);
- if (mem_ill == ill &&
- IN6_ARE_ADDR_EQUAL(&ilg->ilg_v6group, v6group))
- return (ilg);
+ (!isinlist && ilg->ilg_fmode == MODE_IS_EXCLUDE)) {
+ rw_exit(&connp->conn_ilg_lock);
+ return (B_TRUE);
}
- return (NULL);
+ rw_exit(&connp->conn_ilg_lock);
+ return (B_FALSE);
}
/*
- * Find an IPv4 ilg matching group and ipif
+ * Find an ilg matching group and ifaddr/ifindex.
+ * We check both ifaddr and ifindex even though at most one of them
+ * will be non-zero; that way we always find the right one.
*/
static ilg_t *
-ilg_lookup_ipif(conn_t *connp, ipaddr_t group, ipif_t *ipif)
+ilg_lookup(conn_t *connp, const in6_addr_t *v6group, ipaddr_t ifaddr,
+ uint_t ifindex)
{
- in6_addr_t v6group;
- int i;
ilg_t *ilg;
- ASSERT(MUTEX_HELD(&connp->conn_lock));
- ASSERT(!ipif->ipif_ill->ill_isv6);
+ ASSERT(RW_LOCK_HELD(&connp->conn_ilg_lock));
- if (group == INADDR_ANY)
- v6group = ipv6_all_zeros;
- else
- IN6_IPADDR_TO_V4MAPPED(group, &v6group);
+ for (ilg = connp->conn_ilg; ilg != NULL; ilg = ilg->ilg_next) {
+ if (ilg->ilg_condemned)
+ continue;
- for (i = 0; i < connp->conn_ilg_inuse; i++) {
- ilg = &connp->conn_ilg[i];
- if ((ilg->ilg_flags & ILG_DELETED) == 0 &&
- IN6_ARE_ADDR_EQUAL(&ilg->ilg_v6group, &v6group) &&
- ilg->ilg_ipif == ipif)
+ if (ilg->ilg_ifaddr == ifaddr &&
+ ilg->ilg_ifindex == ifindex &&
+ IN6_ARE_ADDR_EQUAL(&ilg->ilg_v6group, v6group))
return (ilg);
}
return (NULL);
@@ -3634,363 +2861,479 @@ ilg_lookup_ipif(conn_t *connp, ipaddr_t group, ipif_t *ipif)
static void
ilg_delete(conn_t *connp, ilg_t *ilg, const in6_addr_t *src)
{
- int i;
-
- ASSERT((ilg->ilg_ipif != NULL) ^ (ilg->ilg_ill != NULL));
- ASSERT(ilg->ilg_ipif == NULL || IAM_WRITER_IPIF(ilg->ilg_ipif));
- ASSERT(ilg->ilg_ill == NULL || IAM_WRITER_ILL(ilg->ilg_ill));
- ASSERT(MUTEX_HELD(&connp->conn_lock));
- ASSERT(!(ilg->ilg_flags & ILG_DELETED));
+ ASSERT(RW_WRITE_HELD(&connp->conn_ilg_lock));
+ ASSERT(ilg->ilg_ptpn != NULL);
+ ASSERT(!ilg->ilg_condemned);
if (src == NULL || IN6_IS_ADDR_UNSPECIFIED(src)) {
- if (connp->conn_ilg_walker_cnt != 0) {
- ilg->ilg_flags |= ILG_DELETED;
- return;
- }
-
FREE_SLIST(ilg->ilg_filter);
+ ilg->ilg_filter = NULL;
- i = ilg - &connp->conn_ilg[0];
- ASSERT(i >= 0 && i < connp->conn_ilg_inuse);
-
- /* Move other entries up one step */
- connp->conn_ilg_inuse--;
- for (; i < connp->conn_ilg_inuse; i++)
- connp->conn_ilg[i] = connp->conn_ilg[i+1];
+ ASSERT(ilg->ilg_ilm == NULL);
+ ilg->ilg_ill = NULL;
+ ilg->ilg_condemned = B_TRUE;
- if (connp->conn_ilg_inuse == 0) {
- mi_free((char *)connp->conn_ilg);
- connp->conn_ilg = NULL;
- cv_broadcast(&connp->conn_refcv);
- }
+ /* ilg_inactive will unlink from the list */
+ ilg_refrele(ilg);
} else {
l_remove(ilg->ilg_filter, src);
}
}
/*
- * Called from conn close. No new ilg can be added or removed.
+ * Called from conn close. No new ilg can be added or removed
* because CONN_CLOSING has been set by ip_close. ilg_add / ilg_delete
* will return error if conn has started closing.
+ *
+ * We handle locking as follows.
+ * Under conn_ilg_lock we get the first ilg. As we drop the conn_ilg_lock to
+ * proceed with the ilm part of the delete we hold a reference on both the ill
+ * and the ilg. This doesn't prevent changes to the ilg, but prevents it from
+ * being deleted.
+ *
+ * Since the ilg_add code path uses two locks (conn_ilg_lock for the ilg part,
+ * and ill_mcast_lock for the ip_addmulti part) we can run at a point between
+ * the two. At that point ilg_ill is set, but ilg_ilm hasn't yet been set. In
+ * that case we delete the ilg here, which makes ilg_add discover that the ilg
+ * has disappeared when ip_addmulti returns, so it will discard the ilm it just
+ * added.
*/
void
ilg_delete_all(conn_t *connp)
{
- int i;
- ipif_t *ipif = NULL;
- ill_t *ill = NULL;
- ilg_t *ilg;
- in6_addr_t v6group;
- boolean_t success;
- ipsq_t *ipsq;
+ ilg_t *ilg, *next_ilg, *held_ilg;
+ ilm_t *ilm;
+ ill_t *ill;
+ boolean_t need_refrele;
+ /*
+ * Can not run if there is a conn_update_ill already running.
+ * Wait for it to complete. Caller should have already set CONN_CLOSING
+	 * which prevents any new threads from running in conn_update_ill.
+ */
mutex_enter(&connp->conn_lock);
-retry:
- ILG_WALKER_HOLD(connp);
- for (i = connp->conn_ilg_inuse - 1; i >= 0; i--) {
- ilg = &connp->conn_ilg[i];
- /*
- * Since this walk is not atomic (we drop the
- * conn_lock and wait in ipsq_enter) we need
- * to check for the ILG_DELETED flag.
- */
- if (ilg->ilg_flags & ILG_DELETED)
- continue;
-
- if (IN6_IS_ADDR_V4MAPPED(&ilg->ilg_v6group)) {
- ipif = ilg->ilg_ipif;
- ill = ipif->ipif_ill;
- } else {
- ipif = NULL;
- ill = ilg->ilg_ill;
- }
+ ASSERT(connp->conn_state_flags & CONN_CLOSING);
+ while (connp->conn_state_flags & CONN_UPDATE_ILL)
+ cv_wait(&connp->conn_cv, &connp->conn_lock);
+ mutex_exit(&connp->conn_lock);
- /*
- * We may not be able to refhold the ill if the ill/ipif
- * is changing. But we need to make sure that the ill will
- * not vanish. So we just bump up the ill_waiter count.
- * If we are unable to do even that, then the ill is closing,
- * in which case the unplumb thread will handle the cleanup,
- * and we move on to the next ilg.
- */
- if (!ill_waiter_inc(ill))
+ rw_enter(&connp->conn_ilg_lock, RW_WRITER);
+ ilg = connp->conn_ilg;
+ held_ilg = NULL;
+ while (ilg != NULL) {
+ if (ilg->ilg_condemned) {
+ ilg = ilg->ilg_next;
continue;
-
- mutex_exit(&connp->conn_lock);
- /*
- * To prevent deadlock between ill close which waits inside
- * the perimeter, and conn close, ipsq_enter returns error,
- * the moment ILL_CONDEMNED is set, in which case ill close
- * takes responsibility to cleanup the ilgs. Note that we
- * have not yet set condemned flag, otherwise the conn can't
- * be refheld for cleanup by those routines and it would be
- * a mutual deadlock.
- */
- success = ipsq_enter(ill, B_FALSE, NEW_OP);
- ipsq = ill->ill_phyint->phyint_ipsq;
- ill_waiter_dcr(ill);
- mutex_enter(&connp->conn_lock);
- if (!success)
+ }
+ /* If the ilg is detached then no need to serialize */
+ if (ilg->ilg_ilm == NULL) {
+ next_ilg = ilg->ilg_next;
+ ilg_delete(connp, ilg, NULL);
+ ilg = next_ilg;
continue;
+ }
+ ill = ilg->ilg_ilm->ilm_ill;
/*
- * Move on if the ilg was deleted while conn_lock was dropped.
+ * In order to serialize on the ill we try to enter
+ * and if that fails we unlock and relock and then
+ * check that we still have an ilm.
*/
- if (ilg->ilg_flags & ILG_DELETED) {
- mutex_exit(&connp->conn_lock);
- ipsq_exit(ipsq);
- mutex_enter(&connp->conn_lock);
- continue;
+ need_refrele = B_FALSE;
+ if (!mutex_tryenter(&ill->ill_mcast_serializer)) {
+ ill_refhold(ill);
+ need_refrele = B_TRUE;
+ ilg_refhold(ilg);
+ if (held_ilg != NULL)
+ ilg_refrele(held_ilg);
+ held_ilg = ilg;
+ rw_exit(&connp->conn_ilg_lock);
+ mutex_enter(&ill->ill_mcast_serializer);
+ rw_enter(&connp->conn_ilg_lock, RW_WRITER);
+ if (ilg->ilg_condemned) {
+ ilg = ilg->ilg_next;
+ goto next;
+ }
}
- v6group = ilg->ilg_v6group;
+ ilm = ilg->ilg_ilm;
+ ilg->ilg_ilm = NULL;
+ next_ilg = ilg->ilg_next;
ilg_delete(connp, ilg, NULL);
- mutex_exit(&connp->conn_lock);
+ ilg = next_ilg;
+ rw_exit(&connp->conn_ilg_lock);
- if (ipif != NULL) {
- (void) ip_delmulti(V4_PART_OF_V6(v6group), ipif,
- B_FALSE, B_TRUE);
- } else {
- (void) ip_delmulti_v6(&v6group, ill,
- connp->conn_zoneid, B_FALSE, B_TRUE);
- }
- ipsq_exit(ipsq);
- mutex_enter(&connp->conn_lock);
- }
- ILG_WALKER_RELE(connp);
+ if (ilm != NULL)
+ (void) ip_delmulti_serial(ilm, B_FALSE, B_TRUE);
- /* If any ill was skipped above wait and retry */
- if (connp->conn_ilg_inuse != 0) {
- cv_wait(&connp->conn_refcv, &connp->conn_lock);
- goto retry;
+ next:
+ mutex_exit(&ill->ill_mcast_serializer);
+ if (need_refrele) {
+ /* Drop ill reference while we hold no locks */
+ ill_refrele(ill);
+ }
+ rw_enter(&connp->conn_ilg_lock, RW_WRITER);
}
- mutex_exit(&connp->conn_lock);
+ if (held_ilg != NULL)
+ ilg_refrele(held_ilg);
+ rw_exit(&connp->conn_ilg_lock);
}
/*
- * Called from ill close by ipcl_walk for clearing conn_ilg and
- * conn_multicast_ipif for a given ipif. conn is held by caller.
- * Note that ipcl_walk only walks conns that are not yet condemned.
- * condemned conns can't be refheld. For this reason, conn must become clean
- * first, i.e. it must not refer to any ill/ire/ipif and then only set
- * condemned flag.
+ * Attach the ilg to an ilm on the ill. If it fails we leave ilg_ill as NULL so
+ * that a subsequent attempt can attach it.
+ * Drops and reacquires conn_ilg_lock.
*/
static void
-conn_delete_ipif(conn_t *connp, caddr_t arg)
+ilg_attach(conn_t *connp, ilg_t *ilg, ill_t *ill)
{
- ipif_t *ipif = (ipif_t *)arg;
- int i;
- char group_buf1[INET6_ADDRSTRLEN];
- char group_buf2[INET6_ADDRSTRLEN];
- ipaddr_t group;
- ilg_t *ilg;
+ ilg_stat_t ilgstat;
+ slist_t *new_filter;
+ int new_fmode;
+ in6_addr_t v6group;
+ ipaddr_t ifaddr;
+ uint_t ifindex;
+ ilm_t *ilm;
+ int error = 0;
+ ASSERT(RW_WRITE_HELD(&connp->conn_ilg_lock));
/*
- * Even though conn_ilg_inuse can change while we are in this loop,
- * i.e.ilgs can be created or deleted on this connp, no new ilgs can
- * be created or deleted for this connp, on this ill, since this ill
- * is the perimeter. So we won't miss any ilg in this cleanup.
+ * Alloc buffer to copy new state into (see below) before
+ * we make any changes, so we can bail if it fails.
*/
- mutex_enter(&connp->conn_lock);
+ if ((new_filter = l_alloc()) == NULL)
+ return;
/*
- * Increment the walker count, so that ilg repacking does not
- * occur while we are in the loop.
+ * Save copy of ilg's filter state to pass to other functions, so
+ * we can release conn_ilg_lock now.
+ * Set ilg_ill so that an unplumb can find us.
*/
- ILG_WALKER_HOLD(connp);
- for (i = connp->conn_ilg_inuse - 1; i >= 0; i--) {
- ilg = &connp->conn_ilg[i];
- if (ilg->ilg_ipif != ipif || (ilg->ilg_flags & ILG_DELETED))
- continue;
- /*
- * ip_close cannot be cleaning this ilg at the same time.
- * since it also has to execute in this ill's perimeter which
- * we are now holding. Only a clean conn can be condemned.
- */
- ASSERT(!(connp->conn_state_flags & CONN_CONDEMNED));
-
- /* Blow away the membership */
- ip1dbg(("conn_delete_ilg_ipif: %s on %s (%s)\n",
- inet_ntop(AF_INET6, &connp->conn_ilg[i].ilg_v6group,
- group_buf1, sizeof (group_buf1)),
- inet_ntop(AF_INET6, &ipif->ipif_v6lcl_addr,
- group_buf2, sizeof (group_buf2)),
- ipif->ipif_ill->ill_name));
-
- /* ilg_ipif is NULL for V6, so we won't be here */
- ASSERT(IN6_IS_ADDR_V4MAPPED(&ilg->ilg_v6group));
+ new_fmode = ilg->ilg_fmode;
+ l_copy(ilg->ilg_filter, new_filter);
+ v6group = ilg->ilg_v6group;
+ ifaddr = ilg->ilg_ifaddr;
+ ifindex = ilg->ilg_ifindex;
+ ilgstat = ILGSTAT_NEW;
- group = V4_PART_OF_V6(ilg->ilg_v6group);
- ilg_delete(connp, &connp->conn_ilg[i], NULL);
- mutex_exit(&connp->conn_lock);
+ ilg->ilg_ill = ill;
+ ASSERT(ilg->ilg_ilm == NULL);
+ rw_exit(&connp->conn_ilg_lock);
- (void) ip_delmulti(group, ipif, B_FALSE, B_TRUE);
- mutex_enter(&connp->conn_lock);
- }
+ ilm = ip_addmulti_serial(&v6group, ill, connp->conn_zoneid, ilgstat,
+ new_fmode, new_filter, &error);
+ l_free(new_filter);
+ rw_enter(&connp->conn_ilg_lock, RW_WRITER);
/*
- * If we are the last walker, need to physically delete the
- * ilgs and repack.
+ * Must look up the ilg again since we've not been holding
+ * conn_ilg_lock. The ilg could have disappeared due to an unplumb
+ * having called conn_update_ill, which can run once we dropped the
+ * conn_ilg_lock above.
*/
- ILG_WALKER_RELE(connp);
-
- if (connp->conn_multicast_ipif == ipif) {
- /* Revert to late binding */
- connp->conn_multicast_ipif = NULL;
+ ilg = ilg_lookup(connp, &v6group, ifaddr, ifindex);
+ if (ilg == NULL) {
+ if (ilm != NULL) {
+ rw_exit(&connp->conn_ilg_lock);
+ (void) ip_delmulti_serial(ilm, B_FALSE,
+ (ilgstat == ILGSTAT_NEW));
+ rw_enter(&connp->conn_ilg_lock, RW_WRITER);
+ }
+ return;
}
- mutex_exit(&connp->conn_lock);
-
- conn_delete_ire(connp, (caddr_t)ipif);
+ if (ilm == NULL) {
+ ilg->ilg_ill = NULL;
+ return;
+ }
+ ASSERT(ilg->ilg_ilm == NULL);
+ ilg->ilg_ilm = ilm;
+ ilm->ilm_ifaddr = ifaddr; /* For netstat */
}
/*
- * Called from ill close by ipcl_walk for clearing conn_ilg and
- * conn_multicast_ill for a given ill. conn is held by caller.
+ * Called when an ill is unplumbed to make sure that there are no
+ * dangling conn references to that ill. In that case ill is non-NULL and
+ * we make sure we remove all references to it.
+ * Also called when we should revisit the ilg_ill used for multicast
+ * memberships, in which case ill is NULL.
+ *
+ * conn is held by caller.
+ *
* Note that ipcl_walk only walks conns that are not yet condemned.
* condemned conns can't be refheld. For this reason, conn must become clean
- * first, i.e. it must not refer to any ill/ire/ipif and then only set
+ * first, i.e. it must not refer to any ill/ire and then only set
* condemned flag.
+ *
+ * We leave ixa_multicast_ifindex in place. We prefer dropping
+ * packets instead of sending them out the wrong interface.
+ *
+ * We keep the ilg around in a detached state (with ilg_ill and ilg_ilm being
+ * NULL) so that the application can leave it later. Also, if ilg_ifaddr and
+ * ilg_ifindex are zero, indicating that the system should pick the interface,
+ * then we attempt to reselect the ill and join on it.
+ *
+ * Locking notes:
+ * Under conn_ilg_lock we get the first ilg. As we drop the conn_ilg_lock to
+ * proceed with the ilm part of the delete we hold a reference on both the ill
+ * and the ilg. This doesn't prevent changes to the ilg, but prevents it from
+ * being deleted.
+ *
+ * Note: if this function is called when new ill/ipif's arrive or change status
+ * (SIOCSLIFINDEX, SIOCSLIFADDR) then we will attempt to attach any ilgs with
+ * a NULL ilg_ill to an ill/ilm.
*/
static void
-conn_delete_ill(conn_t *connp, caddr_t arg)
+conn_update_ill(conn_t *connp, caddr_t arg)
{
ill_t *ill = (ill_t *)arg;
- int i;
- char group_buf[INET6_ADDRSTRLEN];
- in6_addr_t v6group;
- ilg_t *ilg;
/*
- * Even though conn_ilg_inuse can change while we are in this loop,
- * no new ilgs can be created/deleted for this connp, on this
- * ill, since this ill is the perimeter. So we won't miss any ilg
- * in this cleanup.
+ * We have to prevent ip_close/ilg_delete_all from running at
+	 * the same time. ip_close sets CONN_CLOSING before doing the
+	 * ilg_delete_all, and we set CONN_UPDATE_ILL. That ensures only one of
+ * ilg_delete_all and conn_update_ill run at a time for a given conn.
+ * If ilg_delete_all got here first, then we have nothing to do.
*/
mutex_enter(&connp->conn_lock);
+ if (connp->conn_state_flags & (CONN_CLOSING|CONN_UPDATE_ILL)) {
+ /* Caller has to wait for ill_ilm_cnt to drop to zero */
+ mutex_exit(&connp->conn_lock);
+ return;
+ }
+ connp->conn_state_flags |= CONN_UPDATE_ILL;
+ mutex_exit(&connp->conn_lock);
- /*
- * Increment the walker count, so that ilg repacking does not
- * occur while we are in the loop.
- */
- ILG_WALKER_HOLD(connp);
- for (i = connp->conn_ilg_inuse - 1; i >= 0; i--) {
- ilg = &connp->conn_ilg[i];
- if ((ilg->ilg_ill == ill) && !(ilg->ilg_flags & ILG_DELETED)) {
- /*
- * ip_close cannot be cleaning this ilg at the same
- * time, since it also has to execute in this ill's
- * perimeter which we are now holding. Only a clean
- * conn can be condemned.
- */
- ASSERT(!(connp->conn_state_flags & CONN_CONDEMNED));
-
- /* Blow away the membership */
- ip1dbg(("conn_delete_ilg_ill: %s on %s\n",
- inet_ntop(AF_INET6, &ilg->ilg_v6group,
- group_buf, sizeof (group_buf)),
- ill->ill_name));
+ if (ill != NULL)
+ ilg_check_detach(connp, ill);
- v6group = ilg->ilg_v6group;
- ilg_delete(connp, ilg, NULL);
- mutex_exit(&connp->conn_lock);
+ ilg_check_reattach(connp);
- (void) ip_delmulti_v6(&v6group, ill,
- connp->conn_zoneid, B_FALSE, B_TRUE);
- mutex_enter(&connp->conn_lock);
- }
- }
- /*
- * If we are the last walker, need to physically delete the
- * ilgs and repack.
- */
- ILG_WALKER_RELE(connp);
-
- if (connp->conn_multicast_ill == ill) {
- /* Revert to late binding */
- connp->conn_multicast_ill = NULL;
- }
+ /* Do we need to wake up a thread in ilg_delete_all? */
+ mutex_enter(&connp->conn_lock);
+ connp->conn_state_flags &= ~CONN_UPDATE_ILL;
+ if (connp->conn_state_flags & CONN_CLOSING)
+ cv_broadcast(&connp->conn_cv);
mutex_exit(&connp->conn_lock);
}
-/*
- * Called when an ipif is unplumbed to make sure that there are no
- * dangling conn references to that ipif.
- * Handles ilg_ipif and conn_multicast_ipif
- */
-void
-reset_conn_ipif(ipif)
- ipif_t *ipif;
+/* Detach from an ill that is going away */
+static void
+ilg_check_detach(conn_t *connp, ill_t *ill)
{
- ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
+ char group_buf[INET6_ADDRSTRLEN];
+ ilg_t *ilg, *held_ilg;
+ ilm_t *ilm;
- ipcl_walk(conn_delete_ipif, (caddr_t)ipif, ipst);
-}
+ mutex_enter(&ill->ill_mcast_serializer);
+ rw_enter(&connp->conn_ilg_lock, RW_WRITER);
+ held_ilg = NULL;
+ for (ilg = connp->conn_ilg; ilg != NULL; ilg = ilg->ilg_next) {
+ if (ilg->ilg_condemned)
+ continue;
-/*
- * Called when an ill is unplumbed to make sure that there are no
- * dangling conn references to that ill.
- * Handles ilg_ill, conn_multicast_ill.
- */
-void
-reset_conn_ill(ill_t *ill)
-{
- ip_stack_t *ipst = ill->ill_ipst;
+ if (ilg->ilg_ill != ill)
+ continue;
+
+ /* Detach from current ill */
+ ip1dbg(("ilg_check_detach: detach %s on %s\n",
+ inet_ntop(AF_INET6, &ilg->ilg_v6group,
+ group_buf, sizeof (group_buf)),
+ ilg->ilg_ill->ill_name));
+
+ /* Detach this ilg from the ill/ilm */
+ ilm = ilg->ilg_ilm;
+ ilg->ilg_ilm = NULL;
+ ilg->ilg_ill = NULL;
+ if (ilm == NULL)
+ continue;
- ipcl_walk(conn_delete_ill, (caddr_t)ill, ipst);
+ /* Prevent ilg from disappearing */
+ ilg_transfer_hold(held_ilg, ilg);
+ held_ilg = ilg;
+ rw_exit(&connp->conn_ilg_lock);
+
+ (void) ip_delmulti_serial(ilm, B_FALSE, B_TRUE);
+ rw_enter(&connp->conn_ilg_lock, RW_WRITER);
+ }
+ if (held_ilg != NULL)
+ ilg_refrele(held_ilg);
+ rw_exit(&connp->conn_ilg_lock);
+ mutex_exit(&ill->ill_mcast_serializer);
}
-#ifdef DEBUG
/*
- * Walk functions walk all the interfaces in the system to make
- * sure that there is no refernece to the ipif or ill that is
- * going away.
+ * Check if there is a place to attach the conn_ilgs. We do this for both
+ * detached ilgs and attached ones, since for the latter there could be
+ * a better ill to attach them to.
*/
-int
-ilm_walk_ill(ill_t *ill)
+static void
+ilg_check_reattach(conn_t *connp)
{
- int cnt = 0;
- ill_t *till;
- ilm_t *ilm;
- ill_walk_context_t ctx;
- ip_stack_t *ipst = ill->ill_ipst;
-
- rw_enter(&ipst->ips_ill_g_lock, RW_READER);
- till = ILL_START_WALK_ALL(&ctx, ipst);
- for (; till != NULL; till = ill_next(&ctx, till)) {
- mutex_enter(&till->ill_lock);
- for (ilm = till->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
- if (ilm->ilm_ill == ill) {
- cnt++;
+ ill_t *ill;
+ char group_buf[INET6_ADDRSTRLEN];
+ ilg_t *ilg, *held_ilg;
+ ilm_t *ilm;
+ zoneid_t zoneid = IPCL_ZONEID(connp);
+ int error;
+ ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
+
+ rw_enter(&connp->conn_ilg_lock, RW_WRITER);
+ held_ilg = NULL;
+ for (ilg = connp->conn_ilg; ilg != NULL; ilg = ilg->ilg_next) {
+ if (ilg->ilg_condemned)
+ continue;
+
+ /* Check if the conn_ill matches what we would pick now */
+ ill = ill_mcast_lookup(&ilg->ilg_v6group, ilg->ilg_ifaddr,
+ ilg->ilg_ifindex, zoneid, ipst, &error);
+
+ /*
+ * Make sure the ill is usable for multicast and that
+ * we can send the DL_ADDMULTI_REQ before we create an
+ * ilm.
+ */
+ if (ill != NULL &&
+ (!(ill->ill_flags & ILLF_MULTICAST) || !ill->ill_dl_up)) {
+ /* Drop locks across ill_refrele */
+ ilg_transfer_hold(held_ilg, ilg);
+ held_ilg = ilg;
+ rw_exit(&connp->conn_ilg_lock);
+ ill_refrele(ill);
+ ill = NULL;
+ rw_enter(&connp->conn_ilg_lock, RW_WRITER);
+ /* Note that ilg could have become condemned */
+ }
+
+ /* Is the ill unchanged, even if both are NULL? */
+ if (ill == ilg->ilg_ill) {
+ if (ill != NULL) {
+ /* Drop locks across ill_refrele */
+ ilg_transfer_hold(held_ilg, ilg);
+ held_ilg = ilg;
+ rw_exit(&connp->conn_ilg_lock);
+ ill_refrele(ill);
+ rw_enter(&connp->conn_ilg_lock, RW_WRITER);
}
+ continue;
}
- mutex_exit(&till->ill_lock);
- }
- rw_exit(&ipst->ips_ill_g_lock);
- return (cnt);
+ /* Something changed; detach from old first if needed */
+ if (ilg->ilg_ill != NULL) {
+ ill_t *ill2 = ilg->ilg_ill;
+ boolean_t need_refrele = B_FALSE;
+
+ /*
+ * In order to serialize on the ill we try to enter
+ * and if that fails we unlock and relock.
+ */
+ if (!mutex_tryenter(&ill2->ill_mcast_serializer)) {
+ ill_refhold(ill2);
+ need_refrele = B_TRUE;
+ ilg_transfer_hold(held_ilg, ilg);
+ held_ilg = ilg;
+ rw_exit(&connp->conn_ilg_lock);
+ mutex_enter(&ill2->ill_mcast_serializer);
+ rw_enter(&connp->conn_ilg_lock, RW_WRITER);
+ /* Note that ilg could have become condemned */
+ }
+ /*
+ * Check that nobody else re-attached the ilg while we
+ * dropped the lock.
+ */
+ if (ilg->ilg_ill == ill2) {
+ ASSERT(!ilg->ilg_condemned);
+ /* Detach from current ill */
+ ip1dbg(("conn_check_reattach: detach %s/%s\n",
+ inet_ntop(AF_INET6, &ilg->ilg_v6group,
+ group_buf, sizeof (group_buf)),
+ ill2->ill_name));
+
+ ilm = ilg->ilg_ilm;
+ ilg->ilg_ilm = NULL;
+ ilg->ilg_ill = NULL;
+ } else {
+ ilm = NULL;
+ }
+ rw_exit(&connp->conn_ilg_lock);
+ if (ilm != NULL)
+ (void) ip_delmulti_serial(ilm, B_FALSE, B_TRUE);
+ mutex_exit(&ill2->ill_mcast_serializer);
+ if (need_refrele) {
+ /* Drop ill reference while we hold no locks */
+ ill_refrele(ill2);
+ }
+ rw_enter(&connp->conn_ilg_lock, RW_WRITER);
+ /*
+ * While we dropped conn_ilg_lock some other thread
+ * could have attached this ilg, thus we check again.
+ */
+ if (ilg->ilg_ill != NULL) {
+ if (ill != NULL) {
+ /* Drop locks across ill_refrele */
+ ilg_transfer_hold(held_ilg, ilg);
+ held_ilg = ilg;
+ rw_exit(&connp->conn_ilg_lock);
+ ill_refrele(ill);
+ rw_enter(&connp->conn_ilg_lock,
+ RW_WRITER);
+ }
+ continue;
+ }
+ }
+ if (ill != NULL) {
+ /*
+ * In order to serialize on the ill we try to enter
+ * and if that fails we unlock and relock.
+ */
+ if (!mutex_tryenter(&ill->ill_mcast_serializer)) {
+ /* Already have a refhold on ill */
+ ilg_transfer_hold(held_ilg, ilg);
+ held_ilg = ilg;
+ rw_exit(&connp->conn_ilg_lock);
+ mutex_enter(&ill->ill_mcast_serializer);
+ rw_enter(&connp->conn_ilg_lock, RW_WRITER);
+ /* Note that ilg could have become condemned */
+ }
+
+ /*
+ * Check that nobody else attached the ilg and that
+ * it wasn't condemned while we dropped the lock.
+ */
+ if (ilg->ilg_ill == NULL && !ilg->ilg_condemned) {
+ /*
+ * Attach to the new ill. Can fail in which
+ * case ilg_ill will remain NULL. ilg_attach
+ * drops and reacquires conn_ilg_lock.
+ */
+ ip1dbg(("conn_check_reattach: attach %s/%s\n",
+ inet_ntop(AF_INET6, &ilg->ilg_v6group,
+ group_buf, sizeof (group_buf)),
+ ill->ill_name));
+ ilg_attach(connp, ilg, ill);
+ ASSERT(RW_WRITE_HELD(&connp->conn_ilg_lock));
+ }
+ mutex_exit(&ill->ill_mcast_serializer);
+ /* Drop locks across ill_refrele */
+ ilg_transfer_hold(held_ilg, ilg);
+ held_ilg = ilg;
+ rw_exit(&connp->conn_ilg_lock);
+ ill_refrele(ill);
+ rw_enter(&connp->conn_ilg_lock, RW_WRITER);
+ }
+ }
+ if (held_ilg != NULL)
+ ilg_refrele(held_ilg);
+ rw_exit(&connp->conn_ilg_lock);
}
/*
- * This function is called before the ipif is freed.
+ * Called when an ill is unplumbed to make sure that there are no
+ * dangling conn references to that ill. In that case ill is non-NULL and
+ * we make sure we remove all references to it.
+ * Also called when we should revisit the ilg_ill used for multicast
+ * memberships, in which case ill is NULL.
*/
-int
-ilm_walk_ipif(ipif_t *ipif)
+void
+update_conn_ill(ill_t *ill, ip_stack_t *ipst)
{
- int cnt = 0;
- ill_t *till;
- ilm_t *ilm;
- ill_walk_context_t ctx;
- ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
-
- till = ILL_START_WALK_ALL(&ctx, ipst);
- for (; till != NULL; till = ill_next(&ctx, till)) {
- mutex_enter(&till->ill_lock);
- for (ilm = till->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
- if (ilm->ilm_ipif == ipif) {
- cnt++;
- }
- }
- mutex_exit(&till->ill_lock);
- }
- return (cnt);
+ ipcl_walk(conn_update_ill, (caddr_t)ill, ipst);
}
-#endif
diff --git a/usr/src/uts/common/inet/ip/ip_ndp.c b/usr/src/uts/common/inet/ip/ip_ndp.c
index 35f9d541e8..97096bea99 100644
--- a/usr/src/uts/common/inet/ip/ip_ndp.c
+++ b/usr/src/uts/common/inet/ip/ip_ndp.c
@@ -40,6 +40,7 @@
#include <sys/zone.h>
#include <sys/ethernet.h>
#include <sys/sdt.h>
+#include <sys/mac.h>
#include <net/if.h>
#include <net/if_types.h>
@@ -61,53 +62,93 @@
#include <inet/ip_rts.h>
#include <inet/ip6.h>
#include <inet/ip_ndp.h>
-#include <inet/ipsec_impl.h>
-#include <inet/ipsec_info.h>
#include <inet/sctp_ip.h>
+#include <inet/ip_arp.h>
#include <inet/ip2mac_impl.h>
+#define ANNOUNCE_INTERVAL(isv6) \
+ (isv6 ? ipst->ips_ip_ndp_unsolicit_interval : \
+ ipst->ips_ip_arp_publish_interval)
+
+#define DEFENSE_INTERVAL(isv6) \
+ (isv6 ? ipst->ips_ndp_defend_interval : \
+ ipst->ips_arp_defend_interval)
+
+/* Non-tunable probe interval, based on link capabilities */
+#define ILL_PROBE_INTERVAL(ill) ((ill)->ill_note_link ? 150 : 1500)
+
+/*
+ * The IPv4 Link Local address space is special; we do extra duplicate checking
+ * there, as the entire assignment mechanism rests on random numbers.
+ */
+#define IS_IPV4_LL_SPACE(ptr) (((uchar_t *)ptr)[0] == 169 && \
+ ((uchar_t *)ptr)[1] == 254)
+
+/*
+ * NCE_EXTERNAL_FLAGS_MASK defines the set of ncec_flags that may be passed
+ * in to the ncec*add* functions.
+ *
+ * NCE_F_AUTHORITY means that we ignore any incoming adverts for that
+ * mapping (though DAD is performed for the mapping). NCE_F_PUBLISH means
+ * that we will respond to requests for the protocol address.
+ */
+#define NCE_EXTERNAL_FLAGS_MASK \
+ (NCE_F_MYADDR | NCE_F_ISROUTER | NCE_F_NONUD | \
+ NCE_F_ANYCAST | NCE_F_UNSOL_ADV | NCE_F_BCAST | NCE_F_MCAST | \
+ NCE_F_AUTHORITY | NCE_F_PUBLISH | NCE_F_STATIC)
+
/*
* Function names with nce_ prefix are static while function
* names with ndp_ prefix are used by rest of the IP.
*
* Lock ordering:
*
- * ndp_g_lock -> ill_lock -> nce_lock
+ * ndp_g_lock -> ill_lock -> ncec_lock
*
* The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and
- * nce_next. Nce_lock protects the contents of the NCE (particularly
- * nce_refcnt).
- */
-
-static boolean_t nce_cmp_ll_addr(const nce_t *nce, const uchar_t *new_ll_addr,
- uint32_t ll_addr_len);
-static void nce_ire_delete(nce_t *nce);
-static void nce_ire_delete1(ire_t *ire, char *nce_arg);
-static void nce_set_ll(nce_t *nce, uchar_t *ll_addr);
-static nce_t *nce_lookup_addr(ill_t *, boolean_t, const in6_addr_t *,
- nce_t *);
-static nce_t *nce_lookup_mapping(ill_t *, const in6_addr_t *);
-static void nce_make_mapping(nce_t *nce, uchar_t *addrpos,
- uchar_t *addr);
-static int nce_set_multicast(ill_t *ill, const in6_addr_t *addr);
-static void nce_queue_mp(nce_t *nce, mblk_t *mp);
-static mblk_t *nce_udreq_alloc(ill_t *ill);
-static void nce_update(nce_t *nce, uint16_t new_state,
- uchar_t *new_ll_addr);
-static uint32_t nce_solicit(nce_t *nce, in6_addr_t src);
-static boolean_t nce_xmit(ill_t *ill, uint8_t type,
- boolean_t use_lla_addr, const in6_addr_t *sender,
+ * ncec_next. ncec_lock protects the contents of the NCE (particularly
+ * ncec_refcnt).
+ */
+
+static void nce_cleanup_list(ncec_t *ncec);
+static void nce_set_ll(ncec_t *ncec, uchar_t *ll_addr);
+static ncec_t *ncec_lookup_illgrp(ill_t *, const in6_addr_t *,
+ ncec_t *);
+static nce_t *nce_lookup_addr(ill_t *, const in6_addr_t *);
+static int nce_set_multicast_v6(ill_t *ill, const in6_addr_t *addr,
+ uint16_t ncec_flags, nce_t **newnce);
+static int nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
+ uint16_t ncec_flags, nce_t **newnce);
+static boolean_t ndp_xmit(ill_t *ill, uint32_t operation,
+ uint8_t *hwaddr, uint_t hwaddr_len, const in6_addr_t *sender,
const in6_addr_t *target, int flag);
-static boolean_t nce_xmit_advert(nce_t *nce, boolean_t use_nd_lla,
- const in6_addr_t *target, uint_t flags);
-static boolean_t nce_xmit_solicit(nce_t *nce, boolean_t use_nd_lla,
- const in6_addr_t *src, uint_t flags);
-static int ndp_add_v4(ill_t *, const in_addr_t *, uint16_t,
- nce_t **, nce_t *);
-static ipif_t *ip_ndp_lookup_addr_v6(const in6_addr_t *v6addrp, ill_t *ill);
+static void ncec_refhold_locked(ncec_t *);
+static boolean_t ill_defend_rate_limit(ill_t *, ncec_t *);
+static void nce_queue_mp_common(ncec_t *, mblk_t *, boolean_t);
+static int nce_add_common(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
+ uint16_t, uint16_t, nce_t **);
+static nce_t *nce_add_impl(ill_t *, ncec_t *, nce_t *, mblk_t *);
+static nce_t *nce_add(ill_t *, ncec_t *);
+static void nce_inactive(nce_t *);
+extern nce_t *nce_lookup(ill_t *, const in6_addr_t *);
+static nce_t *nce_ill_lookup_then_add(ill_t *, ncec_t *);
+static int nce_add_v6(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
+ uint16_t, uint16_t, nce_t **);
+static int nce_add_v4(ill_t *, uchar_t *, uint_t, const in_addr_t *,
+ uint16_t, uint16_t, nce_t **);
+static int nce_add_v6_postprocess(nce_t *);
+static int nce_add_v4_postprocess(nce_t *);
+static ill_t *nce_resolve_src(ncec_t *, in6_addr_t *);
+static clock_t nce_fuzz_interval(clock_t, boolean_t);
+static void nce_resolv_ipmp_ok(ncec_t *);
+static void nce_walk_common(ill_t *, pfi_t, void *);
+static void nce_start_timer(ncec_t *, uint_t);
+static nce_t *nce_fastpath_create(ill_t *, ncec_t *);
+static void nce_fastpath_trigger(nce_t *);
+static nce_t *nce_fastpath(ncec_t *, boolean_t, nce_t *);
#ifdef DEBUG
-static void nce_trace_cleanup(const nce_t *);
+static void ncec_trace_cleanup(const ncec_t *);
#endif
#define NCE_HASH_PTR_V4(ipst, addr) \
@@ -117,233 +158,245 @@ static void nce_trace_cleanup(const nce_t *);
(&((ipst)->ips_ndp6->nce_hash_tbl[NCE_ADDR_HASH_V6(addr, \
NCE_TABLE_SIZE)]))
-/* Non-tunable probe interval, based on link capabilities */
-#define ILL_PROBE_INTERVAL(ill) ((ill)->ill_note_link ? 150 : 1500)
+extern kmem_cache_t *ncec_cache;
+extern kmem_cache_t *nce_cache;
+
+/*
+ * Send out a IPv6 (unicast) or IPv4 (broadcast) DAD probe
+ * If src_ill is not null, the ncec_addr is bound to src_ill. The
+ * src_ill is ignored by nce_dad for IPv4 Neighbor Cache entries where
+ * the probe is sent on the ncec_ill (in the non-IPMP case) or the
+ * IPMP cast_ill (in the IPMP case).
+ *
+ * Note that the probe interval is based on ncec->ncec_ill which
+ * may be the ipmp_ill.
+ */
+static void
+nce_dad(ncec_t *ncec, ill_t *src_ill, boolean_t send_probe)
+{
+ boolean_t dropped;
+ uint32_t probe_interval;
+
+ ASSERT(!(ncec->ncec_flags & NCE_F_MCAST));
+ ASSERT(!(ncec->ncec_flags & NCE_F_BCAST));
+ if (ncec->ncec_ipversion == IPV6_VERSION) {
+ dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
+ ncec->ncec_lladdr, ncec->ncec_lladdr_length,
+ &ipv6_all_zeros, &ncec->ncec_addr, NDP_PROBE);
+ probe_interval = ILL_PROBE_INTERVAL(ncec->ncec_ill);
+ } else {
+ /* IPv4 DAD delay the initial probe. */
+ if (send_probe)
+ dropped = arp_probe(ncec);
+ else
+ dropped = B_TRUE;
+ probe_interval = nce_fuzz_interval(ncec->ncec_xmit_interval,
+ !send_probe);
+ }
+ if (!dropped) {
+ mutex_enter(&ncec->ncec_lock);
+ ncec->ncec_pcnt--;
+ mutex_exit(&ncec->ncec_lock);
+ }
+ nce_restart_timer(ncec, probe_interval);
+}
+
+/*
+ * Compute default flags to use for an advertisement of this ncec's address.
+ */
+static int
+nce_advert_flags(const ncec_t *ncec)
+{
+ int flag = 0;
+
+ if (ncec->ncec_flags & NCE_F_ISROUTER)
+ flag |= NDP_ISROUTER;
+ if (!(ncec->ncec_flags & NCE_F_ANYCAST))
+ flag |= NDP_ORIDE;
+
+ return (flag);
+}
/*
* NDP Cache Entry creation routine.
* Mapped entries will never do NUD .
* This routine must always be called with ndp6->ndp_g_lock held.
- * Prior to return, nce_refcnt is incremented.
*/
int
-ndp_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr,
- const in6_addr_t *mask, const in6_addr_t *extract_mask,
- uint32_t hw_extract_start, uint16_t flags, uint16_t state,
- nce_t **newnce)
+nce_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
+ const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
{
- static nce_t nce_nil;
- nce_t *nce;
- mblk_t *mp;
- mblk_t *template;
- nce_t **ncep;
int err;
- boolean_t dropped = B_FALSE;
- ip_stack_t *ipst = ill->ill_ipst;
+ nce_t *nce;
- ASSERT(MUTEX_HELD(&ipst->ips_ndp6->ndp_g_lock));
+ ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
ASSERT(ill != NULL && ill->ill_isv6);
- if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
- ip0dbg(("ndp_add_v6: no addr\n"));
- return (EINVAL);
- }
- if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
- ip0dbg(("ndp_add_v6: flags = %x\n", (int)flags));
- return (EINVAL);
- }
- if (IN6_IS_ADDR_UNSPECIFIED(extract_mask) &&
- (flags & NCE_F_MAPPING)) {
- ip0dbg(("ndp_add_v6: extract mask zero for mapping"));
- return (EINVAL);
- }
- /*
- * Allocate the mblk to hold the nce.
- *
- * XXX This can come out of a separate cache - nce_cache.
- * We don't need the mp anymore as there are no more
- * "qwriter"s
- */
- mp = allocb(sizeof (nce_t), BPRI_MED);
- if (mp == NULL)
- return (ENOMEM);
- nce = (nce_t *)mp->b_rptr;
- mp->b_wptr = (uchar_t *)&nce[1];
- *nce = nce_nil;
-
- /*
- * This one holds link layer address
- */
- if (ill->ill_net_type == IRE_IF_RESOLVER) {
- template = nce_udreq_alloc(ill);
- } else {
- if (ill->ill_phys_addr_length == IPV6_ADDR_LEN &&
- ill->ill_mactype != DL_IPV6) {
- /*
- * We create a nce_res_mp with the IP nexthop address
- * as the destination address if the physical length
- * is exactly 16 bytes for point-to-multipoint links
- * that do their own resolution from IP to link-layer
- * address.
- */
- template = ill_dlur_gen((uchar_t *)addr,
- ill->ill_phys_addr_length, ill->ill_sap,
- ill->ill_sap_length);
- } else {
- if (ill->ill_resolver_mp == NULL) {
- freeb(mp);
- return (EINVAL);
- }
- ASSERT((ill->ill_net_type == IRE_IF_NORESOLVER));
- template = copyb(ill->ill_resolver_mp);
- }
- }
- if (template == NULL) {
- freeb(mp);
- return (ENOMEM);
- }
- nce->nce_ill = ill;
- nce->nce_ipversion = IPV6_VERSION;
- nce->nce_flags = flags;
- nce->nce_state = state;
- nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
- nce->nce_rcnt = ill->ill_xmit_count;
- nce->nce_addr = *addr;
- nce->nce_mask = *mask;
- nce->nce_extract_mask = *extract_mask;
- nce->nce_ll_extract_start = hw_extract_start;
- nce->nce_fp_mp = NULL;
- nce->nce_res_mp = template;
- if (state == ND_REACHABLE)
- nce->nce_last = TICK_TO_MSEC(lbolt64);
- else
- nce->nce_last = 0;
- nce->nce_qd_mp = NULL;
- nce->nce_mp = mp;
- if (hw_addr != NULL)
- nce_set_ll(nce, hw_addr);
- /* This one is for nce getting created */
- nce->nce_refcnt = 1;
- mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
- if (nce->nce_flags & NCE_F_MAPPING) {
- ASSERT(IN6_IS_ADDR_MULTICAST(addr));
- ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_mask));
- ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask));
- ncep = &ipst->ips_ndp6->nce_mask_entries;
- } else {
- ncep = ((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
- }
+ err = nce_add_common(ill, hw_addr, hw_addr_len, addr, flags, state,
+ &nce);
+ if (err != 0)
+ return (err);
+ ASSERT(newnce != NULL);
+ *newnce = nce;
+ return (err);
+}
- nce->nce_trace_disable = B_FALSE;
+/*
+ * Post-processing routine to be executed after nce_add_v6(). This function
+ * triggers fastpath (if appropriate) and DAD on the newly added nce entry
+ * and must be called without any locks held.
+ */
+int
+nce_add_v6_postprocess(nce_t *nce)
+{
+ ncec_t *ncec = nce->nce_common;
+ boolean_t dropped = B_FALSE;
+ uchar_t *hw_addr = ncec->ncec_lladdr;
+ uint_t hw_addr_len = ncec->ncec_lladdr_length;
+ ill_t *ill = ncec->ncec_ill;
+ int err = 0;
+ uint16_t flags = ncec->ncec_flags;
+ ip_stack_t *ipst = ill->ill_ipst;
+ boolean_t trigger_fastpath = B_TRUE;
- list_create(&nce->nce_cb, sizeof (nce_cb_t),
- offsetof(nce_cb_t, nce_cb_node));
/*
- * Atomically ensure that the ill is not CONDEMNED, before
- * adding the NCE.
+ * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
+ * we call nce_fastpath as soon as the ncec is resolved in nce_process.
+ * We call nce_fastpath from nce_update if the link layer address of
+ * the peer changes from nce_update
*/
- mutex_enter(&ill->ill_lock);
- if (ill->ill_state_flags & ILL_CONDEMNED) {
- mutex_exit(&ill->ill_lock);
- freeb(mp);
- freeb(template);
- return (EINVAL);
- }
- if ((nce->nce_next = *ncep) != NULL)
- nce->nce_next->nce_ptpn = &nce->nce_next;
- *ncep = nce;
- nce->nce_ptpn = ncep;
- *newnce = nce;
- /* This one is for nce being used by an active thread */
- NCE_REFHOLD(*newnce);
-
- /* Bump up the number of nce's referencing this ill */
- DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
- (char *), "nce", (void *), nce);
- ill->ill_nce_cnt++;
- mutex_exit(&ill->ill_lock);
-
- err = 0;
- if ((flags & NCE_F_PERMANENT) && state == ND_PROBE) {
- mutex_enter(&nce->nce_lock);
- mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
- nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
- mutex_exit(&nce->nce_lock);
- dropped = nce_xmit_solicit(nce, B_FALSE, NULL, NDP_PROBE);
- if (dropped) {
- mutex_enter(&nce->nce_lock);
- nce->nce_pcnt++;
- mutex_exit(&nce->nce_lock);
+ if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) ||
+ (hw_addr == NULL && ill->ill_net_type != IRE_IF_NORESOLVER))
+ trigger_fastpath = B_FALSE;
+
+ if (trigger_fastpath)
+ nce_fastpath_trigger(nce);
+ if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
+ ill_t *hwaddr_ill;
+ /*
+ * Unicast entry that needs DAD.
+ */
+ if (IS_IPMP(ill)) {
+ hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
+ hw_addr, hw_addr_len);
+ } else {
+ hwaddr_ill = ill;
}
- NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(ill));
- mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
+ nce_dad(ncec, hwaddr_ill, B_TRUE);
err = EINPROGRESS;
} else if (flags & NCE_F_UNSOL_ADV) {
/*
* We account for the transmit below by assigning one
* less than the ndd variable. Subsequent decrements
- * are done in ndp_timer.
+ * are done in nce_timer.
*/
- mutex_enter(&nce->nce_lock);
- mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
- nce->nce_unsolicit_count = ipst->ips_ip_ndp_unsolicit_count - 1;
- mutex_exit(&nce->nce_lock);
- dropped = nce_xmit_advert(nce, B_TRUE, &ipv6_all_hosts_mcast,
- 0);
- mutex_enter(&nce->nce_lock);
+ mutex_enter(&ncec->ncec_lock);
+ ncec->ncec_unsolicit_count =
+ ipst->ips_ip_ndp_unsolicit_count - 1;
+ mutex_exit(&ncec->ncec_lock);
+ dropped = ndp_xmit(ill,
+ ND_NEIGHBOR_ADVERT,
+ hw_addr,
+ hw_addr_len,
+ &ncec->ncec_addr, /* Source and target of the adv */
+ &ipv6_all_hosts_mcast, /* Destination of the packet */
+ nce_advert_flags(ncec));
+ mutex_enter(&ncec->ncec_lock);
if (dropped)
- nce->nce_unsolicit_count++;
- if (nce->nce_unsolicit_count != 0) {
- ASSERT(nce->nce_timeout_id == 0);
- nce->nce_timeout_id = timeout(ndp_timer, nce,
- MSEC_TO_TICK(ipst->ips_ip_ndp_unsolicit_interval));
+ ncec->ncec_unsolicit_count++;
+ else
+ ncec->ncec_last_time_defended = ddi_get_lbolt();
+ if (ncec->ncec_unsolicit_count != 0) {
+ nce_start_timer(ncec,
+ ipst->ips_ip_ndp_unsolicit_interval);
}
- mutex_exit(&nce->nce_lock);
- mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
+ mutex_exit(&ncec->ncec_lock);
}
-
- /*
- * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
- * we call nce_fastpath as soon as the nce is resolved in ndp_process.
- * We call nce_fastpath from nce_update if the link layer address of
- * the peer changes from nce_update
- */
- if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER)
- nce_fastpath(nce);
return (err);
}
+/*
+ * Atomically lookup and add (if needed) Neighbor Cache information for
+ * an address.
+ *
+ * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses
+ * are always added pointing at the ipmp_ill. Thus, when the ill passed
+ * to nce_add_v6 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t
+ * entries will be created, both pointing at the same ncec_t. The nce_t
+ * entries will have their nce_ill set to the ipmp_ill and the under_ill
+ * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill.
+ * Local addresses are always created on the ill passed to nce_add_v6.
+ */
int
-ndp_lookup_then_add_v6(ill_t *ill, boolean_t match_illgrp, uchar_t *hw_addr,
- const in6_addr_t *addr, const in6_addr_t *mask,
- const in6_addr_t *extract_mask, uint32_t hw_extract_start, uint16_t flags,
- uint16_t state, nce_t **newnce)
+nce_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
+ const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
{
- int err = 0;
- nce_t *nce;
+ int err = 0;
ip_stack_t *ipst = ill->ill_ipst;
+ nce_t *nce, *upper_nce = NULL;
+ ill_t *in_ill = ill;
+ boolean_t need_ill_refrele = B_FALSE;
+ if (flags & NCE_F_MCAST) {
+ /*
+ * hw_addr will be figured out in nce_set_multicast_v6;
+ * caller has to select the cast_ill
+ */
+ ASSERT(hw_addr == NULL);
+ ASSERT(!IS_IPMP(ill));
+ err = nce_set_multicast_v6(ill, addr, flags, newnce);
+ return (err);
+ }
ASSERT(ill->ill_isv6);
- mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
+ if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) {
+ ill = ipmp_ill_hold_ipmp_ill(ill);
+ if (ill == NULL)
+ return (ENXIO);
+ need_ill_refrele = B_TRUE;
+ }
- /* Get head of v6 hash table */
- nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
- nce = nce_lookup_addr(ill, match_illgrp, addr, nce);
+ mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
+ nce = nce_lookup_addr(ill, addr);
if (nce == NULL) {
- err = ndp_add_v6(ill,
- hw_addr,
- addr,
- mask,
- extract_mask,
- hw_extract_start,
- flags,
- state,
- newnce);
+ err = nce_add_v6(ill, hw_addr, hw_addr_len, addr, flags, state,
+ &nce);
} else {
- *newnce = nce;
err = EEXIST;
}
mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
+ if (err == 0)
+ err = nce_add_v6_postprocess(nce);
+ if (in_ill != ill && nce != NULL) {
+ nce_t *under_nce;
+
+ /*
+ * in_ill was the under_ill. Try to create the under_nce.
+ * Hold the ill_g_lock to prevent changes to group membership
+ * until we are done.
+ */
+ rw_enter(&ipst->ips_ill_g_lock, RW_READER);
+ if (IS_IN_SAME_ILLGRP(in_ill, ill)) {
+ under_nce = nce_fastpath_create(in_ill,
+ nce->nce_common);
+ upper_nce = nce;
+ if ((nce = under_nce) == NULL)
+ err = EINVAL;
+ }
+ rw_exit(&ipst->ips_ill_g_lock);
+ if (under_nce != NULL && NCE_ISREACHABLE(nce->nce_common))
+ nce_fastpath_trigger(under_nce);
+ }
+ if (nce != NULL) {
+ if (newnce != NULL)
+ *newnce = nce;
+ else
+ nce_refrele(nce);
+ }
+ /* nce_refrele is deferred until the lock is dropped */
+ if (upper_nce != NULL)
+ nce_refrele(upper_nce);
+ if (need_ill_refrele)
+ ill_refrele(ill);
return (err);
}
@@ -351,53 +404,51 @@ ndp_lookup_then_add_v6(ill_t *ill, boolean_t match_illgrp, uchar_t *hw_addr,
* Remove all the CONDEMNED nces from the appropriate hash table.
* We create a private list of NCEs, these may have ires pointing
* to them, so the list will be passed through to clean up dependent
- * ires and only then we can do NCE_REFRELE which can make NCE inactive.
+ * ires and only then we can do ncec_refrele() which can make NCE inactive.
*/
static void
-nce_remove(ndp_g_t *ndp, nce_t *nce, nce_t **free_nce_list)
+nce_remove(ndp_g_t *ndp, ncec_t *ncec, ncec_t **free_nce_list)
{
- nce_t *nce1;
- nce_t **ptpn;
+ ncec_t *ncec1;
+ ncec_t **ptpn;
ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
ASSERT(ndp->ndp_g_walker == 0);
- for (; nce; nce = nce1) {
- nce1 = nce->nce_next;
- mutex_enter(&nce->nce_lock);
- if (nce->nce_flags & NCE_F_CONDEMNED) {
- ptpn = nce->nce_ptpn;
- nce1 = nce->nce_next;
- if (nce1 != NULL)
- nce1->nce_ptpn = ptpn;
- *ptpn = nce1;
- nce->nce_ptpn = NULL;
- nce->nce_next = NULL;
- nce->nce_next = *free_nce_list;
- *free_nce_list = nce;
+ for (; ncec; ncec = ncec1) {
+ ncec1 = ncec->ncec_next;
+ mutex_enter(&ncec->ncec_lock);
+ if (NCE_ISCONDEMNED(ncec)) {
+ ptpn = ncec->ncec_ptpn;
+ ncec1 = ncec->ncec_next;
+ if (ncec1 != NULL)
+ ncec1->ncec_ptpn = ptpn;
+ *ptpn = ncec1;
+ ncec->ncec_ptpn = NULL;
+ ncec->ncec_next = NULL;
+ ncec->ncec_next = *free_nce_list;
+ *free_nce_list = ncec;
}
- mutex_exit(&nce->nce_lock);
+ mutex_exit(&ncec->ncec_lock);
}
}
/*
- * 1. Mark the nce CONDEMNED. This ensures that no new nce_lookup()
- * will return this NCE. Also no new IREs will be created that
- * point to this NCE (See ire_add_v6). Also no new timeouts will
- * be started (See NDP_RESTART_TIMER).
+ * 1. Mark the entry CONDEMNED. This ensures that no new nce_lookup()
+ * will return this NCE. Also no new timeouts will
+ * be started (See nce_restart_timer).
* 2. Cancel any currently running timeouts.
* 3. If there is an ndp walker, return. The walker will do the cleanup.
* This ensures that walkers see a consistent list of NCEs while walking.
* 4. Otherwise remove the NCE from the list of NCEs
- * 5. Delete all IREs pointing to this NCE.
*/
void
-ndp_delete(nce_t *nce)
+ncec_delete(ncec_t *ncec)
{
- nce_t **ptpn;
- nce_t *nce1;
- int ipversion = nce->nce_ipversion;
+ ncec_t **ptpn;
+ ncec_t *ncec1;
+ int ipversion = ncec->ncec_ipversion;
ndp_g_t *ndp;
- ip_stack_t *ipst = nce->nce_ill->ill_ipst;
+ ip_stack_t *ipst = ncec->ncec_ipst;
if (ipversion == IPV4_VERSION)
ndp = ipst->ips_ndp4;
@@ -405,40 +456,42 @@ ndp_delete(nce_t *nce)
ndp = ipst->ips_ndp6;
/* Serialize deletes */
- mutex_enter(&nce->nce_lock);
- if (nce->nce_flags & NCE_F_CONDEMNED) {
+ mutex_enter(&ncec->ncec_lock);
+ if (NCE_ISCONDEMNED(ncec)) {
/* Some other thread is doing the delete */
- mutex_exit(&nce->nce_lock);
+ mutex_exit(&ncec->ncec_lock);
return;
}
/*
* Caller has a refhold. Also 1 ref for being in the list. Thus
* refcnt has to be >= 2
*/
- ASSERT(nce->nce_refcnt >= 2);
- nce->nce_flags |= NCE_F_CONDEMNED;
- mutex_exit(&nce->nce_lock);
+ ASSERT(ncec->ncec_refcnt >= 2);
+ ncec->ncec_flags |= NCE_F_CONDEMNED;
+ mutex_exit(&ncec->ncec_lock);
- nce_fastpath_list_delete(nce);
+ /* Count how many condemned ires for kmem_cache callback */
+ atomic_add_32(&ipst->ips_num_nce_condemned, 1);
+ nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
/* Complete any waiting callbacks */
- nce_cb_dispatch(nce);
+ ncec_cb_dispatch(ncec);
/*
* Cancel any running timer. Timeout can't be restarted
- * since CONDEMNED is set. Can't hold nce_lock across untimeout.
+ * since CONDEMNED is set. Can't hold ncec_lock across untimeout.
* Passing invalid timeout id is fine.
*/
- if (nce->nce_timeout_id != 0) {
- (void) untimeout(nce->nce_timeout_id);
- nce->nce_timeout_id = 0;
+ if (ncec->ncec_timeout_id != 0) {
+ (void) untimeout(ncec->ncec_timeout_id);
+ ncec->ncec_timeout_id = 0;
}
mutex_enter(&ndp->ndp_g_lock);
- if (nce->nce_ptpn == NULL) {
+ if (ncec->ncec_ptpn == NULL) {
/*
- * The last ndp walker has already removed this nce from
- * the list after we marked the nce CONDEMNED and before
+ * The last ndp walker has already removed this ncec from
+ * the list after we marked the ncec CONDEMNED and before
* we grabbed the global lock.
*/
mutex_exit(&ndp->ndp_g_lock);
@@ -454,62 +507,68 @@ ndp_delete(nce_t *nce)
}
/*
- * Now remove the nce from the list. NDP_RESTART_TIMER won't restart
+ * Now remove the ncec from the list. nce_restart_timer won't restart
* the timer since it is marked CONDEMNED.
*/
- ptpn = nce->nce_ptpn;
- nce1 = nce->nce_next;
- if (nce1 != NULL)
- nce1->nce_ptpn = ptpn;
- *ptpn = nce1;
- nce->nce_ptpn = NULL;
- nce->nce_next = NULL;
+ ptpn = ncec->ncec_ptpn;
+ ncec1 = ncec->ncec_next;
+ if (ncec1 != NULL)
+ ncec1->ncec_ptpn = ptpn;
+ *ptpn = ncec1;
+ ncec->ncec_ptpn = NULL;
+ ncec->ncec_next = NULL;
mutex_exit(&ndp->ndp_g_lock);
- nce_ire_delete(nce);
+ /* Removed from ncec_ptpn/ncec_next list */
+ ncec_refrele_notr(ncec);
}
void
-ndp_inactive(nce_t *nce)
+ncec_inactive(ncec_t *ncec)
{
mblk_t **mpp;
- ill_t *ill;
+ ill_t *ill = ncec->ncec_ill;
+ ip_stack_t *ipst = ncec->ncec_ipst;
- ASSERT(nce->nce_refcnt == 0);
- ASSERT(MUTEX_HELD(&nce->nce_lock));
- ASSERT(nce->nce_fastpath == NULL);
+ ASSERT(ncec->ncec_refcnt == 0);
+ ASSERT(MUTEX_HELD(&ncec->ncec_lock));
- /* Free all nce allocated messages */
- mpp = &nce->nce_first_mp_to_free;
- do {
- while (*mpp != NULL) {
- mblk_t *mp;
+ /* Count how many condemned nces for kmem_cache callback */
+ if (NCE_ISCONDEMNED(ncec))
+ atomic_add_32(&ipst->ips_num_nce_condemned, -1);
- mp = *mpp;
- *mpp = mp->b_next;
+ /* Free all allocated messages */
+ mpp = &ncec->ncec_qd_mp;
+ while (*mpp != NULL) {
+ mblk_t *mp;
- inet_freemsg(mp);
- }
- } while (mpp++ != &nce->nce_last_mp_to_free);
+ mp = *mpp;
+ *mpp = mp->b_next;
- if (nce->nce_ipversion == IPV6_VERSION) {
- /*
- * must have been cleaned up in nce_delete
- */
- ASSERT(list_is_empty(&nce->nce_cb));
- list_destroy(&nce->nce_cb);
+ inet_freemsg(mp);
}
+ /*
+ * must have been cleaned up in ncec_delete
+ */
+ ASSERT(list_is_empty(&ncec->ncec_cb));
+ list_destroy(&ncec->ncec_cb);
+ /*
+ * free the ncec_lladdr if one was allocated in nce_add_common()
+ */
+ if (ncec->ncec_lladdr_length > 0)
+ kmem_free(ncec->ncec_lladdr, ncec->ncec_lladdr_length);
+
#ifdef DEBUG
- nce_trace_cleanup(nce);
+ ncec_trace_cleanup(ncec);
#endif
- ill = nce->nce_ill;
mutex_enter(&ill->ill_lock);
DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
- (char *), "nce", (void *), nce);
- ill->ill_nce_cnt--;
+ (char *), "ncec", (void *), ncec);
+ ill->ill_ncec_cnt--;
+ ncec->ncec_ill = NULL;
/*
- * If the number of nce's associated with this ill have dropped
+ * If the number of ncec's associated with this ill have dropped
* to zero, check whether we need to restart any operation that
* is waiting for this to happen.
*/
@@ -519,104 +578,59 @@ ndp_inactive(nce_t *nce)
} else {
mutex_exit(&ill->ill_lock);
}
- mutex_destroy(&nce->nce_lock);
- if (nce->nce_mp != NULL)
- inet_freemsg(nce->nce_mp);
+
+ mutex_destroy(&ncec->ncec_lock);
+ kmem_cache_free(ncec_cache, ncec);
}
/*
- * ndp_walk routine. Delete the nce if it is associated with the ill
+ * ncec_walk routine. Delete the ncec if it is associated with the ill
* that is going away. Always called as a writer.
*/
void
-ndp_delete_per_ill(nce_t *nce, uchar_t *arg)
+ncec_delete_per_ill(ncec_t *ncec, uchar_t *arg)
{
- if ((nce != NULL) && nce->nce_ill == (ill_t *)arg) {
- ndp_delete(nce);
+ if ((ncec != NULL) && ncec->ncec_ill == (ill_t *)arg) {
+ ncec_delete(ncec);
}
}
/*
- * Walk a list of to be inactive NCEs and blow away all the ires.
+ * Neighbor Cache cleanup logic for a list of ncec_t entries.
*/
static void
-nce_ire_delete_list(nce_t *nce)
+nce_cleanup_list(ncec_t *ncec)
{
- nce_t *nce_next;
+ ncec_t *ncec_next;
- ASSERT(nce != NULL);
- while (nce != NULL) {
- nce_next = nce->nce_next;
- nce->nce_next = NULL;
+ ASSERT(ncec != NULL);
+ while (ncec != NULL) {
+ ncec_next = ncec->ncec_next;
+ ncec->ncec_next = NULL;
/*
* It is possible for the last ndp walker (this thread)
- * to come here after ndp_delete has marked the nce CONDEMNED
- * and before it has removed the nce from the fastpath list
+ * to come here after ncec_delete has marked the ncec CONDEMNED
+ * and before it has removed the ncec from the fastpath list
* or called untimeout. So we need to do it here. It is safe
- * for both ndp_delete and this thread to do it twice or
+ * for both ncec_delete and this thread to do it twice or
* even simultaneously since each of the threads has a
- * reference on the nce.
+ * reference on the ncec.
*/
- nce_fastpath_list_delete(nce);
+ nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
/*
* Cancel any running timer. Timeout can't be restarted
- * since CONDEMNED is set. Can't hold nce_lock across untimeout.
- * Passing invalid timeout id is fine.
+ * since CONDEMNED is set. The ncec_lock can't be
+ * held across untimeout though passing invalid timeout
+ * id is fine.
*/
- if (nce->nce_timeout_id != 0) {
- (void) untimeout(nce->nce_timeout_id);
- nce->nce_timeout_id = 0;
+ if (ncec->ncec_timeout_id != 0) {
+ (void) untimeout(ncec->ncec_timeout_id);
+ ncec->ncec_timeout_id = 0;
}
- /*
- * We might hit this func thus in the v4 case:
- * ipif_down->ipif_ndp_down->ndp_walk
- */
-
- if (nce->nce_ipversion == IPV4_VERSION) {
- ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE,
- IRE_CACHE, nce_ire_delete1, nce, nce->nce_ill);
- } else {
- ASSERT(nce->nce_ipversion == IPV6_VERSION);
- ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE,
- IRE_CACHE, nce_ire_delete1, nce, nce->nce_ill);
- }
- NCE_REFRELE_NOTR(nce);
- nce = nce_next;
- }
-}
-
-/*
- * Delete an ire when the nce goes away.
- */
-/* ARGSUSED */
-static void
-nce_ire_delete(nce_t *nce)
-{
- if (nce->nce_ipversion == IPV6_VERSION) {
- ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
- nce_ire_delete1, (char *)nce, nce->nce_ill);
- NCE_REFRELE_NOTR(nce);
- } else {
- ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
- nce_ire_delete1, (char *)nce, nce->nce_ill);
- NCE_REFRELE_NOTR(nce);
- }
-}
-
-/*
- * ire_walk routine used to delete every IRE that shares this nce
- */
-static void
-nce_ire_delete1(ire_t *ire, char *nce_arg)
-{
- nce_t *nce = (nce_t *)nce_arg;
-
- ASSERT(ire->ire_type == IRE_CACHE);
-
- if (ire->ire_nce == nce) {
- ASSERT(ire->ire_ipversion == nce->nce_ipversion);
- ire_delete(ire);
+ /* Removed from ncec_ptpn/ncec_next list */
+ ncec_refrele_notr(ncec);
+ ncec = ncec_next;
}
}
@@ -624,100 +638,97 @@ nce_ire_delete1(ire_t *ire, char *nce_arg)
* Restart DAD on given NCE. Returns B_TRUE if DAD has been restarted.
*/
boolean_t
-ndp_restart_dad(nce_t *nce)
+nce_restart_dad(ncec_t *ncec)
{
boolean_t started;
- boolean_t dropped;
+ ill_t *ill, *hwaddr_ill;
- if (nce == NULL)
+ if (ncec == NULL)
return (B_FALSE);
- mutex_enter(&nce->nce_lock);
- if (nce->nce_state == ND_PROBE) {
- mutex_exit(&nce->nce_lock);
+ ill = ncec->ncec_ill;
+ mutex_enter(&ncec->ncec_lock);
+ if (ncec->ncec_state == ND_PROBE) {
+ mutex_exit(&ncec->ncec_lock);
started = B_TRUE;
- } else if (nce->nce_state == ND_REACHABLE) {
- nce->nce_state = ND_PROBE;
- nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT - 1;
- mutex_exit(&nce->nce_lock);
- dropped = nce_xmit_solicit(nce, B_FALSE, NULL, NDP_PROBE);
- if (dropped) {
- mutex_enter(&nce->nce_lock);
- nce->nce_pcnt++;
- mutex_exit(&nce->nce_lock);
+ } else if (ncec->ncec_state == ND_REACHABLE) {
+ ASSERT(ncec->ncec_lladdr != NULL);
+ ncec->ncec_state = ND_PROBE;
+ ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
+ /*
+ * Slight cheat here: we don't use the initial probe delay
+ * for IPv4 in this obscure case.
+ */
+ mutex_exit(&ncec->ncec_lock);
+ if (IS_IPMP(ill)) {
+ hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp,
+ ncec->ncec_lladdr, ncec->ncec_lladdr_length);
+ } else {
+ hwaddr_ill = ill;
}
- NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(nce->nce_ill));
+ nce_dad(ncec, hwaddr_ill, B_TRUE);
started = B_TRUE;
} else {
- mutex_exit(&nce->nce_lock);
+ mutex_exit(&ncec->ncec_lock);
started = B_FALSE;
}
return (started);
}
/*
- * IPv6 Cache entry lookup. Try to find an nce matching the parameters passed.
- * If one is found, the refcnt on the nce will be incremented.
+ * IPv6 Cache entry lookup. Try to find an ncec matching the parameters passed.
+ * If one is found, the refcnt on the ncec will be incremented.
*/
-nce_t *
-ndp_lookup_v6(ill_t *ill, boolean_t match_illgrp, const in6_addr_t *addr,
- boolean_t caller_holds_lock)
+ncec_t *
+ncec_lookup_illgrp_v6(ill_t *ill, const in6_addr_t *addr)
{
- nce_t *nce;
- ip_stack_t *ipst = ill->ill_ipst;
+ ncec_t *ncec;
+ ip_stack_t *ipst = ill->ill_ipst;
- ASSERT(ill->ill_isv6);
- if (!caller_holds_lock)
- mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
+ rw_enter(&ipst->ips_ill_g_lock, RW_READER);
+ mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
/* Get head of v6 hash table */
- nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
- nce = nce_lookup_addr(ill, match_illgrp, addr, nce);
- if (nce == NULL)
- nce = nce_lookup_mapping(ill, addr);
- if (!caller_holds_lock)
- mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
- return (nce);
+ ncec = *((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr));
+ ncec = ncec_lookup_illgrp(ill, addr, ncec);
+ mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
+ rw_exit(&ipst->ips_ill_g_lock);
+ return (ncec);
}
/*
- * IPv4 Cache entry lookup. Try to find an nce matching the parameters passed.
- * If one is found, the refcnt on the nce will be incremented.
- * Since multicast mappings are handled in arp, there are no nce_mcast_entries
- * so we skip the nce_lookup_mapping call.
- * XXX TODO: if the nce is found to be ND_STALE, ndp_delete it and return NULL
+ * IPv4 Cache entry lookup. Try to find an ncec matching the parameters passed.
+ * If one is found, the refcnt on the ncec will be incremented.
*/
-nce_t *
-ndp_lookup_v4(ill_t *ill, const in_addr_t *addr, boolean_t caller_holds_lock)
+ncec_t *
+ncec_lookup_illgrp_v4(ill_t *ill, const in_addr_t *addr)
{
- nce_t *nce;
+ ncec_t *ncec = NULL;
in6_addr_t addr6;
ip_stack_t *ipst = ill->ill_ipst;
- if (!caller_holds_lock)
- mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
+ rw_enter(&ipst->ips_ill_g_lock, RW_READER);
+ mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
/* Get head of v4 hash table */
- nce = *((nce_t **)NCE_HASH_PTR_V4(ipst, *addr));
+ ncec = *((ncec_t **)NCE_HASH_PTR_V4(ipst, *addr));
IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
- /*
- * NOTE: IPv4 never matches across the illgrp since the NCE's we're
- * looking up have fastpath headers that are inherently per-ill.
- */
- nce = nce_lookup_addr(ill, B_FALSE, &addr6, nce);
- if (!caller_holds_lock)
- mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
- return (nce);
+ ncec = ncec_lookup_illgrp(ill, &addr6, ncec);
+ mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
+ rw_exit(&ipst->ips_ill_g_lock);
+ return (ncec);
}
/*
- * Cache entry lookup. Try to find an nce matching the parameters passed.
- * Look only for exact entries (no mappings). If an nce is found, increment
- * the hold count on that nce. The caller passes in the start of the
- * appropriate hash table, and must be holding the appropriate global
- * lock (ndp_g_lock).
+ * Cache entry lookup. Try to find an ncec matching the parameters passed.
+ * If an ncec is found, increment the hold count on that ncec.
+ * The caller passes in the start of the appropriate hash table, and must
+ * be holding the appropriate global lock (ndp_g_lock). In addition, since
+ * this function matches ncec_t entries across the illgrp, the ips_ill_g_lock
+ * must be held as reader.
+ *
+ * This function always matches across the ipmp group.
*/
-static nce_t *
-nce_lookup_addr(ill_t *ill, boolean_t match_illgrp, const in6_addr_t *addr,
- nce_t *nce)
+ncec_t *
+ncec_lookup_illgrp(ill_t *ill, const in6_addr_t *addr, ncec_t *ncec)
{
ndp_g_t *ndp;
ip_stack_t *ipst = ill->ill_ipst;
@@ -727,348 +738,246 @@ nce_lookup_addr(ill_t *ill, boolean_t match_illgrp, const in6_addr_t *addr,
else
ndp = ipst->ips_ndp4;
+ ASSERT(ill != NULL);
ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
if (IN6_IS_ADDR_UNSPECIFIED(addr))
return (NULL);
- for (; nce != NULL; nce = nce->nce_next) {
- if (nce->nce_ill == ill ||
- match_illgrp && IS_IN_SAME_ILLGRP(ill, nce->nce_ill)) {
- if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr) &&
- IN6_ARE_ADDR_EQUAL(&nce->nce_mask,
- &ipv6_all_ones)) {
- mutex_enter(&nce->nce_lock);
- if (!(nce->nce_flags & NCE_F_CONDEMNED)) {
- NCE_REFHOLD_LOCKED(nce);
- mutex_exit(&nce->nce_lock);
+ for (; ncec != NULL; ncec = ncec->ncec_next) {
+ if (ncec->ncec_ill == ill ||
+ IS_IN_SAME_ILLGRP(ill, ncec->ncec_ill)) {
+ if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) {
+ mutex_enter(&ncec->ncec_lock);
+ if (!NCE_ISCONDEMNED(ncec)) {
+ ncec_refhold_locked(ncec);
+ mutex_exit(&ncec->ncec_lock);
break;
}
- mutex_exit(&nce->nce_lock);
+ mutex_exit(&ncec->ncec_lock);
}
}
}
+ return (ncec);
+}
+
+/*
+ * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t
+ * entries for ill only, i.e., when ill is part of an ipmp group,
+ * nce_lookup_v4 will never try to match across the group.
+ */
+nce_t *
+nce_lookup_v4(ill_t *ill, const in_addr_t *addr)
+{
+ nce_t *nce;
+ in6_addr_t addr6;
+ ip_stack_t *ipst = ill->ill_ipst;
+
+ mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
+ IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
+ nce = nce_lookup_addr(ill, &addr6);
+ mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
return (nce);
}
/*
- * Cache entry lookup. Try to find an nce matching the parameters passed.
- * Look only for mappings.
+ * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t
+ * entries for ill only, i.e., when ill is part of an ipmp group,
+ * nce_lookup_v6 will never try to match across the group.
*/
+nce_t *
+nce_lookup_v6(ill_t *ill, const in6_addr_t *addr6)
+{
+ nce_t *nce;
+ ip_stack_t *ipst = ill->ill_ipst;
+
+ mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
+ nce = nce_lookup_addr(ill, addr6);
+ mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
+ return (nce);
+}
+
static nce_t *
-nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr)
+nce_lookup_addr(ill_t *ill, const in6_addr_t *addr)
{
- nce_t *nce;
- ip_stack_t *ipst = ill->ill_ipst;
+ nce_t *nce;
- ASSERT(ill != NULL && ill->ill_isv6);
- ASSERT(MUTEX_HELD(&ipst->ips_ndp6->ndp_g_lock));
- if (!IN6_IS_ADDR_MULTICAST(addr))
- return (NULL);
- nce = ipst->ips_ndp6->nce_mask_entries;
- for (; nce != NULL; nce = nce->nce_next)
- if (nce->nce_ill == ill &&
- (V6_MASK_EQ(*addr, nce->nce_mask, nce->nce_addr))) {
- mutex_enter(&nce->nce_lock);
- if (!(nce->nce_flags & NCE_F_CONDEMNED)) {
- NCE_REFHOLD_LOCKED(nce);
- mutex_exit(&nce->nce_lock);
- break;
- }
- mutex_exit(&nce->nce_lock);
- }
+ ASSERT(ill != NULL);
+#ifdef DEBUG
+ if (ill->ill_isv6)
+ ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock));
+ else
+ ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
+#endif
+ mutex_enter(&ill->ill_lock);
+ nce = nce_lookup(ill, addr);
+ mutex_exit(&ill->ill_lock);
return (nce);
}
+
+/*
+ * Router turned to host. We need to make sure that cached copies of the ncec
+ * are not used for forwarding packets if they were derived from the default
+ * route, and that the default route itself is removed, as required by
+ * section 7.2.5 of RFC 2461.
+ *
+ * Note that the ncec itself probably has valid link-layer information for the
+ * nexthop, so that there is no reason to delete the ncec, as long as the
+ * ISROUTER flag is turned off.
+ */
+static void
+ncec_router_to_host(ncec_t *ncec)
+{
+ ire_t *ire;
+ ip_stack_t *ipst = ncec->ncec_ipst;
+
+ mutex_enter(&ncec->ncec_lock);
+ ncec->ncec_flags &= ~NCE_F_ISROUTER;
+ mutex_exit(&ncec->ncec_lock);
+
+ ire = ire_ftable_lookup_v6(&ipv6_all_zeros, &ipv6_all_zeros,
+ &ncec->ncec_addr, IRE_DEFAULT, ncec->ncec_ill, ALL_ZONES, NULL,
+ MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW, 0, ipst, NULL);
+ if (ire != NULL) {
+ ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst);
+ ire_delete(ire);
+ ire_refrele(ire);
+ }
+}
+
/*
* Process passed in parameters either from an incoming packet or via
* user ioctl.
*/
-static void
-nce_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
+void
+nce_process(ncec_t *ncec, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
{
- ill_t *ill = nce->nce_ill;
- uint32_t hw_addr_len = ill->ill_nd_lla_len;
- mblk_t *mp;
+ ill_t *ill = ncec->ncec_ill;
+ uint32_t hw_addr_len = ill->ill_phys_addr_length;
boolean_t ll_updated = B_FALSE;
boolean_t ll_changed;
- ip_stack_t *ipst = ill->ill_ipst;
+ nce_t *nce;
- ASSERT(nce->nce_ipversion == IPV6_VERSION);
+ ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
/*
* No updates of link layer address or the neighbor state is
* allowed, when the cache is in NONUD state. This still
* allows for responding to reachability solicitation.
*/
- mutex_enter(&nce->nce_lock);
- if (nce->nce_state == ND_INCOMPLETE) {
+ mutex_enter(&ncec->ncec_lock);
+ if (ncec->ncec_state == ND_INCOMPLETE) {
if (hw_addr == NULL) {
- mutex_exit(&nce->nce_lock);
+ mutex_exit(&ncec->ncec_lock);
return;
}
- nce_set_ll(nce, hw_addr);
+ nce_set_ll(ncec, hw_addr);
/*
- * Update nce state and send the queued packets
+ * Update ncec state and send the queued packets
* back to ip this time ire will be added.
*/
if (flag & ND_NA_FLAG_SOLICITED) {
- nce_update(nce, ND_REACHABLE, NULL);
+ nce_update(ncec, ND_REACHABLE, NULL);
} else {
- nce_update(nce, ND_STALE, NULL);
- }
- mutex_exit(&nce->nce_lock);
- nce_fastpath(nce);
- nce_cb_dispatch(nce); /* complete callbacks */
- mutex_enter(&nce->nce_lock);
- mp = nce->nce_qd_mp;
- nce->nce_qd_mp = NULL;
- mutex_exit(&nce->nce_lock);
- while (mp != NULL) {
- mblk_t *nxt_mp, *data_mp;
-
- nxt_mp = mp->b_next;
- mp->b_next = NULL;
-
- if (mp->b_datap->db_type == M_CTL)
- data_mp = mp->b_cont;
- else
- data_mp = mp;
- if (data_mp->b_prev != NULL) {
- ill_t *inbound_ill;
- queue_t *fwdq = NULL;
- uint_t ifindex;
-
- ifindex = (uint_t)(uintptr_t)data_mp->b_prev;
- inbound_ill = ill_lookup_on_ifindex(ifindex,
- B_TRUE, NULL, NULL, NULL, NULL, ipst);
- if (inbound_ill == NULL) {
- data_mp->b_prev = NULL;
- freemsg(mp);
- return;
- } else {
- fwdq = inbound_ill->ill_rq;
- }
- data_mp->b_prev = NULL;
- /*
- * Send a forwarded packet back into ip_rput_v6
- * just as in ire_send_v6().
- * Extract the queue from b_prev (set in
- * ip_rput_data_v6).
- */
- if (fwdq != NULL) {
- /*
- * Forwarded packets hop count will
- * get decremented in ip_rput_data_v6
- */
- if (data_mp != mp)
- freeb(mp);
- put(fwdq, data_mp);
- } else {
- /*
- * Send locally originated packets back
- * into ip_wput_v6.
- */
- put(ill->ill_wq, mp);
- }
- ill_refrele(inbound_ill);
- } else {
- put(ill->ill_wq, mp);
- }
- mp = nxt_mp;
+ nce_update(ncec, ND_STALE, NULL);
}
+ mutex_exit(&ncec->ncec_lock);
+ nce = nce_fastpath(ncec, B_TRUE, NULL);
+ nce_resolv_ok(ncec);
+ if (nce != NULL)
+ nce_refrele(nce);
return;
}
- ll_changed = nce_cmp_ll_addr(nce, hw_addr, hw_addr_len);
+ ll_changed = nce_cmp_ll_addr(ncec, hw_addr, hw_addr_len);
if (!is_adv) {
/* If this is a SOLICITATION request only */
if (ll_changed)
- nce_update(nce, ND_STALE, hw_addr);
- mutex_exit(&nce->nce_lock);
- nce_cb_dispatch(nce);
+ nce_update(ncec, ND_STALE, hw_addr);
+ mutex_exit(&ncec->ncec_lock);
+ ncec_cb_dispatch(ncec);
return;
}
if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) {
/* If in any other state than REACHABLE, ignore */
- if (nce->nce_state == ND_REACHABLE) {
- nce_update(nce, ND_STALE, NULL);
+ if (ncec->ncec_state == ND_REACHABLE) {
+ nce_update(ncec, ND_STALE, NULL);
}
- mutex_exit(&nce->nce_lock);
- nce_cb_dispatch(nce);
+ mutex_exit(&ncec->ncec_lock);
+ ncec_cb_dispatch(ncec);
return;
} else {
if (ll_changed) {
- nce_update(nce, ND_UNCHANGED, hw_addr);
+ nce_update(ncec, ND_UNCHANGED, hw_addr);
ll_updated = B_TRUE;
}
if (flag & ND_NA_FLAG_SOLICITED) {
- nce_update(nce, ND_REACHABLE, NULL);
+ nce_update(ncec, ND_REACHABLE, NULL);
} else {
if (ll_updated) {
- nce_update(nce, ND_STALE, NULL);
+ nce_update(ncec, ND_STALE, NULL);
}
}
- mutex_exit(&nce->nce_lock);
- if (!(flag & ND_NA_FLAG_ROUTER) && (nce->nce_flags &
+ mutex_exit(&ncec->ncec_lock);
+ if (!(flag & ND_NA_FLAG_ROUTER) && (ncec->ncec_flags &
NCE_F_ISROUTER)) {
- ire_t *ire;
-
- /*
- * Router turned to host. We need to remove the
- * entry as well as any default route that may be
- * using this as a next hop. This is required by
- * section 7.2.5 of RFC 2461.
- */
- ire = ire_ftable_lookup_v6(&ipv6_all_zeros,
- &ipv6_all_zeros, &nce->nce_addr, IRE_DEFAULT,
- nce->nce_ill->ill_ipif, NULL, ALL_ZONES, 0, NULL,
- MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW |
- MATCH_IRE_DEFAULT, ipst);
- if (ire != NULL) {
- ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst);
- ire_delete(ire);
- ire_refrele(ire);
- }
- ndp_delete(nce); /* will do nce_cb_dispatch */
+ ncec_router_to_host(ncec);
} else {
- nce_cb_dispatch(nce);
+ ncec_cb_dispatch(ncec);
}
}
}
/*
- * Walker state structure used by ndp_process() / ndp_process_entry().
- */
-typedef struct ndp_process_data {
- ill_t *np_ill; /* ill/illgrp to match against */
- const in6_addr_t *np_addr; /* IPv6 address to match */
- uchar_t *np_hw_addr; /* passed to nce_process() */
- uint32_t np_flag; /* passed to nce_process() */
- boolean_t np_is_adv; /* passed to nce_process() */
-} ndp_process_data_t;
-
-/*
- * Walker callback used by ndp_process() for IPMP groups: calls nce_process()
- * for each NCE with a matching address that's in the same IPMP group.
- */
-static void
-ndp_process_entry(nce_t *nce, void *arg)
-{
- ndp_process_data_t *npp = arg;
-
- if (IS_IN_SAME_ILLGRP(nce->nce_ill, npp->np_ill) &&
- IN6_ARE_ADDR_EQUAL(&nce->nce_addr, npp->np_addr) &&
- IN6_ARE_ADDR_EQUAL(&nce->nce_mask, &ipv6_all_ones)) {
- nce_process(nce, npp->np_hw_addr, npp->np_flag, npp->np_is_adv);
- }
-}
-
-/*
- * Wrapper around nce_process() that handles IPMP. In particular, for IPMP,
- * NCEs are per-underlying-ill (because of nce_fp_mp) and thus we may have
- * more than one NCE for a given IPv6 address to tend to. In that case, we
- * need to walk all NCEs and callback nce_process() for each one. Since this
- * is expensive, in the non-IPMP case we just directly call nce_process().
- * Ultimately, nce_fp_mp needs to be moved out of the nce_t so that all IP
- * interfaces in an IPMP group share the same NCEs -- at which point this
- * function can be removed entirely.
- */
-void
-ndp_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv)
-{
- ill_t *ill = nce->nce_ill;
- struct ndp_g_s *ndp = ill->ill_ipst->ips_ndp6;
- ndp_process_data_t np;
-
- if (ill->ill_grp == NULL) {
- nce_process(nce, hw_addr, flag, is_adv);
- return;
- }
-
- /* IPMP case: walk all NCEs */
- np.np_ill = ill;
- np.np_addr = &nce->nce_addr;
- np.np_flag = flag;
- np.np_is_adv = is_adv;
- np.np_hw_addr = hw_addr;
-
- ndp_walk_common(ndp, NULL, (pfi_t)ndp_process_entry, &np, ALL_ZONES);
-}
-
-/*
- * Pass arg1 to the pfi supplied, along with each nce in existence.
- * ndp_walk() places a REFHOLD on the nce and drops the lock when
+ * Pass arg1 to the pfi supplied, along with each ncec in existence.
+ * ncec_walk() places a REFHOLD on the ncec and drops the lock when
* walking the hash list.
*/
void
-ndp_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1,
+ncec_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1,
boolean_t trace)
{
- nce_t *nce;
- nce_t *nce1;
- nce_t **ncep;
- nce_t *free_nce_list = NULL;
+ ncec_t *ncec;
+ ncec_t *ncec1;
+ ncec_t **ncep;
+ ncec_t *free_nce_list = NULL;
mutex_enter(&ndp->ndp_g_lock);
- /* Prevent ndp_delete from unlink and free of NCE */
+ /* Prevent ncec_delete from unlink and free of NCE */
ndp->ndp_g_walker++;
mutex_exit(&ndp->ndp_g_lock);
for (ncep = ndp->nce_hash_tbl;
ncep < A_END(ndp->nce_hash_tbl); ncep++) {
- for (nce = *ncep; nce != NULL; nce = nce1) {
- nce1 = nce->nce_next;
- if (ill == NULL || nce->nce_ill == ill) {
+ for (ncec = *ncep; ncec != NULL; ncec = ncec1) {
+ ncec1 = ncec->ncec_next;
+ if (ill == NULL || ncec->ncec_ill == ill) {
if (trace) {
- NCE_REFHOLD(nce);
- (*pfi)(nce, arg1);
- NCE_REFRELE(nce);
+ ncec_refhold(ncec);
+ (*pfi)(ncec, arg1);
+ ncec_refrele(ncec);
} else {
- NCE_REFHOLD_NOTR(nce);
- (*pfi)(nce, arg1);
- NCE_REFRELE_NOTR(nce);
+ ncec_refhold_notr(ncec);
+ (*pfi)(ncec, arg1);
+ ncec_refrele_notr(ncec);
}
}
}
}
- for (nce = ndp->nce_mask_entries; nce != NULL; nce = nce1) {
- nce1 = nce->nce_next;
- if (ill == NULL || nce->nce_ill == ill) {
- if (trace) {
- NCE_REFHOLD(nce);
- (*pfi)(nce, arg1);
- NCE_REFRELE(nce);
- } else {
- NCE_REFHOLD_NOTR(nce);
- (*pfi)(nce, arg1);
- NCE_REFRELE_NOTR(nce);
- }
- }
- }
mutex_enter(&ndp->ndp_g_lock);
ndp->ndp_g_walker--;
- /*
- * While NCE's are removed from global list they are placed
- * in a private list, to be passed to nce_ire_delete_list().
- * The reason is, there may be ires pointing to this nce
- * which needs to cleaned up.
- */
if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) {
/* Time to delete condemned entries */
for (ncep = ndp->nce_hash_tbl;
ncep < A_END(ndp->nce_hash_tbl); ncep++) {
- nce = *ncep;
- if (nce != NULL) {
- nce_remove(ndp, nce, &free_nce_list);
+ ncec = *ncep;
+ if (ncec != NULL) {
+ nce_remove(ndp, ncec, &free_nce_list);
}
}
- nce = ndp->nce_mask_entries;
- if (nce != NULL) {
- nce_remove(ndp, nce, &free_nce_list);
- }
ndp->ndp_g_walker_cleanup = B_FALSE;
}
mutex_exit(&ndp->ndp_g_lock);
if (free_nce_list != NULL) {
- nce_ire_delete_list(free_nce_list);
+ nce_cleanup_list(free_nce_list);
}
}
@@ -1077,198 +986,10 @@ ndp_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1,
* Note that ill can be NULL hence can't derive the ipst from it.
*/
void
-ndp_walk(ill_t *ill, pfi_t pfi, void *arg1, ip_stack_t *ipst)
-{
- ndp_walk_common(ipst->ips_ndp4, ill, pfi, arg1, B_TRUE);
- ndp_walk_common(ipst->ips_ndp6, ill, pfi, arg1, B_TRUE);
-}
-
-/*
- * Process resolve requests. Handles both mapped entries
- * as well as cases that needs to be send out on the wire.
- * Lookup a NCE for a given IRE. Regardless of whether one exists
- * or one is created, we defer making ire point to nce until the
- * ire is actually added at which point the nce_refcnt on the nce is
- * incremented. This is done primarily to have symmetry between ire_add()
- * and ire_delete() which decrements the nce_refcnt, when an ire is deleted.
- */
-int
-ndp_resolver(ill_t *ill, const in6_addr_t *dst, mblk_t *mp, zoneid_t zoneid)
-{
- nce_t *nce, *hw_nce = NULL;
- int err;
- ill_t *ipmp_ill;
- uint16_t nce_flags;
- mblk_t *mp_nce = NULL;
- ip_stack_t *ipst = ill->ill_ipst;
- uchar_t *hwaddr = NULL;
-
- ASSERT(ill->ill_isv6);
-
- if (IN6_IS_ADDR_MULTICAST(dst))
- return (nce_set_multicast(ill, dst));
-
- nce_flags = (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0;
-
- /*
- * If `ill' is under IPMP, then first check to see if there's an NCE
- * for `dst' on the IPMP meta-interface (e.g., because an application
- * explicitly did an SIOCLIFSETND to tie a hardware address to `dst').
- * If so, we use that hardware address when creating the NCE below.
- * Note that we don't yet have a mechanism to remove these NCEs if the
- * NCE for `dst' on the IPMP meta-interface is subsequently removed --
- * but rather than build such a beast, we should fix NCEs so that they
- * can be properly shared across an IPMP group.
- */
- if (IS_UNDER_IPMP(ill)) {
- if ((ipmp_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) {
- hw_nce = ndp_lookup_v6(ipmp_ill, B_FALSE, dst, B_FALSE);
- if (hw_nce != NULL && hw_nce->nce_res_mp != NULL) {
- hwaddr = hw_nce->nce_res_mp->b_rptr +
- NCE_LL_ADDR_OFFSET(ipmp_ill);
- nce_flags |= hw_nce->nce_flags;
- }
- ill_refrele(ipmp_ill);
- }
- }
-
- err = ndp_lookup_then_add_v6(ill,
- B_FALSE, /* NCE fastpath is per ill; don't match across group */
- hwaddr,
- dst,
- &ipv6_all_ones,
- &ipv6_all_zeros,
- 0,
- nce_flags,
- hwaddr != NULL ? ND_REACHABLE : ND_INCOMPLETE,
- &nce);
-
- if (hw_nce != NULL)
- NCE_REFRELE(hw_nce);
-
- switch (err) {
- case 0:
- /*
- * New cache entry was created. Make sure that the state
- * is not ND_INCOMPLETE. It can be in some other state
- * even before we send out the solicitation as we could
- * get un-solicited advertisements.
- *
- * If this is an XRESOLV interface, simply return 0,
- * since we don't want to solicit just yet.
- */
- if (ill->ill_flags & ILLF_XRESOLV) {
- NCE_REFRELE(nce);
- return (0);
- }
-
- mutex_enter(&nce->nce_lock);
- if (nce->nce_state != ND_INCOMPLETE) {
- mutex_exit(&nce->nce_lock);
- NCE_REFRELE(nce);
- return (0);
- }
- if (nce->nce_rcnt == 0) {
- /* The caller will free mp */
- mutex_exit(&nce->nce_lock);
- ndp_delete(nce);
- NCE_REFRELE(nce);
- return (ESRCH);
- }
- mp_nce = ip_prepend_zoneid(mp, zoneid, ipst);
- if (mp_nce == NULL) {
- /* The caller will free mp */
- mutex_exit(&nce->nce_lock);
- ndp_delete(nce);
- NCE_REFRELE(nce);
- return (ENOMEM);
- }
- nce_queue_mp(nce, mp_nce);
- ip_ndp_resolve(nce);
- mutex_exit(&nce->nce_lock);
- NCE_REFRELE(nce);
- return (EINPROGRESS);
- case EEXIST:
- /* Resolution in progress just queue the packet */
- mutex_enter(&nce->nce_lock);
- if (nce->nce_state == ND_INCOMPLETE) {
- mp_nce = ip_prepend_zoneid(mp, zoneid, ipst);
- if (mp_nce == NULL) {
- err = ENOMEM;
- } else {
- nce_queue_mp(nce, mp_nce);
- err = EINPROGRESS;
- }
- } else {
- /*
- * Any other state implies we have
- * a nce but IRE needs to be added ...
- * ire_add_v6() will take care of the
- * the case when the nce becomes CONDEMNED
- * before the ire is added to the table.
- */
- err = 0;
- }
- mutex_exit(&nce->nce_lock);
- NCE_REFRELE(nce);
- break;
- default:
- ip1dbg(("ndp_resolver: Can't create NCE %d\n", err));
- break;
- }
- return (err);
-}
-
-/*
- * When there is no resolver, the link layer template is passed in
- * the IRE.
- * Lookup a NCE for a given IRE. Regardless of whether one exists
- * or one is created, we defer making ire point to nce until the
- * ire is actually added at which point the nce_refcnt on the nce is
- * incremented. This is done primarily to have symmetry between ire_add()
- * and ire_delete() which decrements the nce_refcnt, when an ire is deleted.
- */
-int
-ndp_noresolver(ill_t *ill, const in6_addr_t *dst)
+ncec_walk(ill_t *ill, pfi_t pfi, void *arg1, ip_stack_t *ipst)
{
- nce_t *nce;
- int err = 0;
-
- ASSERT(ill != NULL);
- ASSERT(ill->ill_isv6);
- if (IN6_IS_ADDR_MULTICAST(dst)) {
- err = nce_set_multicast(ill, dst);
- return (err);
- }
-
- err = ndp_lookup_then_add_v6(ill,
- B_FALSE, /* NCE fastpath is per ill; don't match across group */
- ill->ill_dest_addr, /* hardware address is NULL in most cases */
- dst,
- &ipv6_all_ones,
- &ipv6_all_zeros,
- 0,
- (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0,
- ND_REACHABLE,
- &nce);
-
- switch (err) {
- case 0:
- /*
- * Cache entry with a proper resolver cookie was
- * created.
- */
- NCE_REFRELE(nce);
- break;
- case EEXIST:
- err = 0;
- NCE_REFRELE(nce);
- break;
- default:
- ip1dbg(("ndp_noresolver: Can't create NCE %d\n", err));
- break;
- }
- return (err);
+ ncec_walk_common(ipst->ips_ndp4, ill, pfi, arg1, B_TRUE);
+ ncec_walk_common(ipst->ips_ndp6, ill, pfi, arg1, B_TRUE);
}
/*
@@ -1277,83 +998,73 @@ ndp_noresolver(ill_t *ill, const in6_addr_t *dst)
* multicast destination.
*/
static int
-nce_set_multicast(ill_t *ill, const in6_addr_t *dst)
+nce_set_multicast_v6(ill_t *ill, const in6_addr_t *dst,
+ uint16_t flags, nce_t **newnce)
{
- nce_t *mnce; /* Multicast mapping entry */
- nce_t *nce;
- uchar_t *hw_addr = NULL;
+ uchar_t *hw_addr;
int err = 0;
ip_stack_t *ipst = ill->ill_ipst;
+ nce_t *nce;
ASSERT(ill != NULL);
ASSERT(ill->ill_isv6);
ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst)));
mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
- nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *dst));
- nce = nce_lookup_addr(ill, B_FALSE, dst, nce);
+ nce = nce_lookup_addr(ill, dst);
if (nce != NULL) {
mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
- NCE_REFRELE(nce);
- return (0);
- }
- /* No entry, now lookup for a mapping this should never fail */
- mnce = nce_lookup_mapping(ill, dst);
- if (mnce == NULL) {
- /* Something broken for the interface. */
- mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
- return (ESRCH);
+ goto done;
}
- ASSERT(mnce->nce_flags & NCE_F_MAPPING);
if (ill->ill_net_type == IRE_IF_RESOLVER) {
/*
* For IRE_IF_RESOLVER a hardware mapping can be
- * generated, for IRE_IF_NORESOLVER, resolution cookie
- * in the ill is copied in ndp_add_v6().
+ * generated.
*/
hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP);
if (hw_addr == NULL) {
mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
- NCE_REFRELE(mnce);
return (ENOMEM);
}
- nce_make_mapping(mnce, hw_addr, (uchar_t *)dst);
+ ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
+ } else {
+ /*
+ * So no hw_addr is needed for IRE_IF_NORESOLVER.
+ */
+ hw_addr = NULL;
}
- NCE_REFRELE(mnce);
- /*
- * IRE_IF_NORESOLVER type simply copies the resolution
- * cookie passed in. So no hw_addr is needed.
- */
- err = ndp_add_v6(ill,
- hw_addr,
- dst,
- &ipv6_all_ones,
- &ipv6_all_zeros,
- 0,
- NCE_F_NONUD,
- ND_REACHABLE,
- &nce);
+ ASSERT((flags & NCE_F_MCAST) != 0);
+ ASSERT((flags & NCE_F_NONUD) != 0);
+ /* nce_state will be computed by nce_add_common() */
+ err = nce_add_v6(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
+ ND_UNCHANGED, &nce);
mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
+ if (err == 0)
+ err = nce_add_v6_postprocess(nce);
if (hw_addr != NULL)
kmem_free(hw_addr, ill->ill_nd_lla_len);
if (err != 0) {
- ip1dbg(("nce_set_multicast: create failed" "%d\n", err));
+ ip1dbg(("nce_set_multicast_v6: create failed" "%d\n", err));
return (err);
}
- NCE_REFRELE(nce);
+done:
+ ASSERT(nce->nce_common->ncec_state == ND_REACHABLE);
+ if (newnce != NULL)
+ *newnce = nce;
+ else
+ nce_refrele(nce);
return (0);
}
/*
- * Return the link layer address, and any flags of a nce.
+ * Return the link layer address, and any flags of a ncec.
*/
int
ndp_query(ill_t *ill, struct lif_nd_req *lnr)
{
- nce_t *nce;
+ ncec_t *ncec;
in6_addr_t *addr;
sin6_t *sin6;
- dl_unitdata_req_t *dl;
ASSERT(ill != NULL && ill->ill_isv6);
sin6 = (sin6_t *)&lnr->lnr_addr;
@@ -1363,158 +1074,135 @@ ndp_query(ill_t *ill, struct lif_nd_req *lnr)
* NOTE: if the ill is an IPMP interface, then match against the whole
* illgrp. This e.g. allows in.ndpd to retrieve the link layer
* addresses for the data addresses on an IPMP interface even though
- * ipif_ndp_up() created them with an nce_ill of ipif_bound_ill.
+ * ipif_ndp_up() created them with an ncec_ill of ipif_bound_ill.
*/
- nce = ndp_lookup_v6(ill, IS_IPMP(ill), addr, B_FALSE);
- if (nce == NULL)
+ ncec = ncec_lookup_illgrp_v6(ill, addr);
+ if (ncec == NULL)
return (ESRCH);
- /* If in INCOMPLETE state, no link layer address is available yet */
- if (!NCE_ISREACHABLE(nce)) {
- NCE_REFRELE(nce);
+ /* If no link layer address is available yet, return ESRCH */
+ if (!NCE_ISREACHABLE(ncec)) {
+ ncec_refrele(ncec);
return (ESRCH);
}
- dl = (dl_unitdata_req_t *)nce->nce_res_mp->b_rptr;
- if (ill->ill_flags & ILLF_XRESOLV)
- lnr->lnr_hdw_len = dl->dl_dest_addr_length;
- else
- lnr->lnr_hdw_len = ill->ill_nd_lla_len;
- ASSERT(NCE_LL_ADDR_OFFSET(ill) + lnr->lnr_hdw_len <=
- sizeof (lnr->lnr_hdw_addr));
- bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill),
- (uchar_t *)&lnr->lnr_hdw_addr, lnr->lnr_hdw_len);
- if (nce->nce_flags & NCE_F_ISROUTER)
+ lnr->lnr_hdw_len = ill->ill_phys_addr_length;
+ bcopy(ncec->ncec_lladdr, (uchar_t *)&lnr->lnr_hdw_addr,
+ lnr->lnr_hdw_len);
+ if (ncec->ncec_flags & NCE_F_ISROUTER)
lnr->lnr_flags = NDF_ISROUTER_ON;
- if (nce->nce_flags & NCE_F_ANYCAST)
+ if (ncec->ncec_flags & NCE_F_ANYCAST)
lnr->lnr_flags |= NDF_ANYCAST_ON;
- NCE_REFRELE(nce);
+ ncec_refrele(ncec);
return (0);
}
/*
- * Send Enable/Disable multicast reqs to driver.
+ * Finish setting up the Enable/Disable multicast for the driver.
*/
-int
-ndp_mcastreq(ill_t *ill, const in6_addr_t *addr, uint32_t hw_addr_len,
+mblk_t *
+ndp_mcastreq(ill_t *ill, const in6_addr_t *v6group, uint32_t hw_addr_len,
uint32_t hw_addr_offset, mblk_t *mp)
{
- nce_t *nce;
uchar_t *hw_addr;
- ip_stack_t *ipst = ill->ill_ipst;
+ ipaddr_t v4group;
+ uchar_t *addr;
- ASSERT(ill != NULL && ill->ill_isv6);
ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
- hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len);
- if (hw_addr == NULL || !IN6_IS_ADDR_MULTICAST(addr)) {
- freemsg(mp);
- return (EINVAL);
+ if (IN6_IS_ADDR_V4MAPPED(v6group)) {
+ IN6_V4MAPPED_TO_IPADDR(v6group, v4group);
+
+ ASSERT(CLASSD(v4group));
+ ASSERT(!(ill->ill_isv6));
+
+ addr = (uchar_t *)&v4group;
+ } else {
+ ASSERT(IN6_IS_ADDR_MULTICAST(v6group));
+ ASSERT(ill->ill_isv6);
+
+ addr = (uchar_t *)v6group;
}
- mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
- nce = nce_lookup_mapping(ill, addr);
- if (nce == NULL) {
- mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
+ hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len);
+ if (hw_addr == NULL) {
+ ip0dbg(("ndp_mcastreq NULL hw_addr\n"));
freemsg(mp);
- return (ESRCH);
- }
- mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
- /*
- * Update dl_addr_length and dl_addr_offset for primitives that
- * have physical addresses as opposed to full saps
- */
- switch (((union DL_primitives *)mp->b_rptr)->dl_primitive) {
- case DL_ENABMULTI_REQ:
- /* Track the state if this is the first enabmulti */
- if (ill->ill_dlpi_multicast_state == IDS_UNKNOWN)
- ill->ill_dlpi_multicast_state = IDS_INPROGRESS;
- ip1dbg(("ndp_mcastreq: ENABMULTI\n"));
- break;
- case DL_DISABMULTI_REQ:
- ip1dbg(("ndp_mcastreq: DISABMULTI\n"));
- break;
- default:
- NCE_REFRELE(nce);
- ip1dbg(("ndp_mcastreq: default\n"));
- return (EINVAL);
+ return (NULL);
}
- nce_make_mapping(nce, hw_addr, (uchar_t *)addr);
- NCE_REFRELE(nce);
- ill_dlpi_send(ill, mp);
- return (0);
-}
+ ip_mcast_mapping(ill, addr, hw_addr);
+ return (mp);
+}
-/*
- * Send out a NS for resolving the ip address in nce.
- */
void
-ip_ndp_resolve(nce_t *nce)
+ip_ndp_resolve(ncec_t *ncec)
{
+ in_addr_t sender4 = INADDR_ANY;
in6_addr_t sender6 = ipv6_all_zeros;
+ ill_t *src_ill;
uint32_t ms;
- mblk_t *mp;
- ip6_t *ip6h;
- ASSERT(MUTEX_HELD(&nce->nce_lock));
- /*
- * Pick the src from outgoing packet, if one is available.
- * Otherwise let nce_xmit figure out the src.
- */
- if ((mp = nce->nce_qd_mp) != NULL) {
- /* Handle ip_newroute_v6 giving us IPSEC packets */
- if (mp->b_datap->db_type == M_CTL)
- mp = mp->b_cont;
- ip6h = (ip6_t *)mp->b_rptr;
- if (ip6h->ip6_nxt == IPPROTO_RAW) {
- /*
- * This message should have been pulled up already in
- * ip_wput_v6. We can't do pullups here because
- * the message could be from the nce_qd_mp which could
- * have b_next/b_prev non-NULL.
- */
- ASSERT(MBLKL(mp) >= sizeof (ip6i_t) + IPV6_HDR_LEN);
- ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t));
- }
- sender6 = ip6h->ip6_src;
+ src_ill = nce_resolve_src(ncec, &sender6);
+ if (src_ill == NULL) {
+ /* Make sure we try again later */
+ ms = ncec->ncec_ill->ill_reachable_retrans_time;
+ nce_restart_timer(ncec, (clock_t)ms);
+ return;
}
- ms = nce_solicit(nce, sender6);
- mutex_exit(&nce->nce_lock);
+ if (ncec->ncec_ipversion == IPV4_VERSION)
+ IN6_V4MAPPED_TO_IPADDR(&sender6, sender4);
+ mutex_enter(&ncec->ncec_lock);
+ if (ncec->ncec_ipversion == IPV6_VERSION)
+ ms = ndp_solicit(ncec, sender6, src_ill);
+ else
+ ms = arp_request(ncec, sender4, src_ill);
+ mutex_exit(&ncec->ncec_lock);
if (ms == 0) {
- if (nce->nce_state != ND_REACHABLE) {
- nce_resolv_failed(nce);
- ndp_delete(nce);
+ if (ncec->ncec_state != ND_REACHABLE) {
+ if (ncec->ncec_ipversion == IPV6_VERSION)
+ ndp_resolv_failed(ncec);
+ else
+ arp_resolv_failed(ncec);
+ ASSERT((ncec->ncec_flags & NCE_F_STATIC) == 0);
+ nce_make_unreachable(ncec);
+ ncec_delete(ncec);
}
} else {
- NDP_RESTART_TIMER(nce, (clock_t)ms);
+ nce_restart_timer(ncec, (clock_t)ms);
}
- mutex_enter(&nce->nce_lock);
+done:
+ ill_refrele(src_ill);
}
/*
- * Send a neighbor solicitation.
+ * Send an IPv6 neighbor solicitation.
* Returns number of milliseconds after which we should either rexmit or abort.
* Return of zero means we should abort.
- * The caller holds the nce_lock to protect nce_qd_mp and nce_rcnt.
+ * The caller holds the ncec_lock to protect ncec_qd_mp and ncec_rcnt.
+ * The optional source address is used as a hint to ndp_solicit for
+ * which source to use in the packet.
*
- * NOTE: This routine drops nce_lock (and later reacquires it) when sending
+ * NOTE: This routine drops ncec_lock (and later reacquires it) when sending
* the packet.
*/
uint32_t
-nce_solicit(nce_t *nce, in6_addr_t sender)
+ndp_solicit(ncec_t *ncec, in6_addr_t src, ill_t *ill)
{
- boolean_t dropped;
+ in6_addr_t dst;
+ boolean_t dropped = B_FALSE;
- ASSERT(nce->nce_ipversion == IPV6_VERSION);
- ASSERT(MUTEX_HELD(&nce->nce_lock));
+ ASSERT(ncec->ncec_ipversion == IPV6_VERSION);
+ ASSERT(MUTEX_HELD(&ncec->ncec_lock));
- if (nce->nce_rcnt == 0)
+ if (ncec->ncec_rcnt == 0)
return (0);
- nce->nce_rcnt--;
- mutex_exit(&nce->nce_lock);
- dropped = nce_xmit_solicit(nce, B_TRUE, &sender, 0);
- mutex_enter(&nce->nce_lock);
+ dst = ncec->ncec_addr;
+ ncec->ncec_rcnt--;
+ mutex_exit(&ncec->ncec_lock);
+ dropped = ndp_xmit(ill, ND_NEIGHBOR_SOLICIT, ill->ill_phys_addr,
+ ill->ill_phys_addr_length, &src, &dst, 0);
+ mutex_enter(&ncec->ncec_lock);
if (dropped)
- nce->nce_rcnt++;
- return (nce->nce_ill->ill_reachable_retrans_time);
+ ncec->ncec_rcnt++;
+ return (ncec->ncec_ill->ill_reachable_retrans_time);
}
/*
@@ -1528,23 +1216,30 @@ nce_solicit(nce_t *nce, in6_addr_t sender)
* ip_ndp_excl.
*/
/* ARGSUSED */
-static void
-ip_ndp_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
+void
+ip_addr_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
{
ill_t *ill = rq->q_ptr;
ipif_t *ipif;
- in6_addr_t *addr = (in6_addr_t *)mp->b_rptr;
+ in6_addr_t *addr6 = (in6_addr_t *)mp->b_rptr;
+ in_addr_t *addr4 = (in_addr_t *)mp->b_rptr;
+ boolean_t addr_equal;
for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
/*
* We do not support recovery of proxy ARP'd interfaces,
* because the system lacks a complete proxy ARP mechanism.
*/
- if ((ipif->ipif_flags & IPIF_POINTOPOINT) ||
- !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, addr)) {
- continue;
+ if (ill->ill_isv6) {
+ addr_equal = IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
+ addr6);
+ } else {
+ addr_equal = (ipif->ipif_lcl_addr == *addr4);
}
+ if ((ipif->ipif_flags & IPIF_POINTOPOINT) || !addr_equal)
+ continue;
+
/*
* If we have already recovered or if the interface is going
* away, then ignore.
@@ -1561,13 +1256,20 @@ ip_ndp_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
mutex_exit(&ill->ill_lock);
ipif->ipif_was_dup = B_TRUE;
- VERIFY(ipif_ndp_up(ipif, B_TRUE) != EINPROGRESS);
- (void) ipif_up_done_v6(ipif);
+ if (ill->ill_isv6) {
+ VERIFY(ipif_ndp_up(ipif, B_TRUE) != EINPROGRESS);
+ (void) ipif_up_done_v6(ipif);
+ } else {
+ VERIFY(ipif_arp_up(ipif, Res_act_initial, B_TRUE) !=
+ EINPROGRESS);
+ (void) ipif_up_done(ipif);
+ }
}
freeb(mp);
}
/*
+ *
* Attempt to recover an IPv6 interface that's been shut down as a duplicate.
* As long as someone else holds the address, the interface will stay down.
* When that conflict goes away, the interface is brought back up. This is
@@ -1579,8 +1281,8 @@ ip_ndp_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
*
* This function is entered on a timer expiry; the ID is in ipif_recovery_id.
*/
-static void
-ipif6_dup_recovery(void *arg)
+void
+ipif_dup_recovery(void *arg)
{
ipif_t *ipif = arg;
@@ -1598,7 +1300,7 @@ ipif6_dup_recovery(void *arg)
if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING))
return;
- ndp_do_recovery(ipif);
+ ipif_do_recovery(ipif);
}
/*
@@ -1608,18 +1310,24 @@ ipif6_dup_recovery(void *arg)
* Called both by recovery timer expiry and link-up notification.
*/
void
-ndp_do_recovery(ipif_t *ipif)
+ipif_do_recovery(ipif_t *ipif)
{
ill_t *ill = ipif->ipif_ill;
mblk_t *mp;
ip_stack_t *ipst = ill->ill_ipst;
+ size_t mp_size;
- mp = allocb(sizeof (ipif->ipif_v6lcl_addr), BPRI_MED);
+ if (ipif->ipif_isv6)
+ mp_size = sizeof (ipif->ipif_v6lcl_addr);
+ else
+ mp_size = sizeof (ipif->ipif_lcl_addr);
+ mp = allocb(mp_size, BPRI_MED);
if (mp == NULL) {
mutex_enter(&ill->ill_lock);
- if (ipif->ipif_recovery_id == 0 &&
+ if (ipst->ips_ip_dup_recovery > 0 &&
+ ipif->ipif_recovery_id == 0 &&
!(ipif->ipif_state_flags & IPIF_CONDEMNED)) {
- ipif->ipif_recovery_id = timeout(ipif6_dup_recovery,
+ ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
}
mutex_exit(&ill->ill_lock);
@@ -1632,10 +1340,15 @@ ndp_do_recovery(ipif_t *ipif)
(void) untimeout(ipif->ipif_recovery_id);
ipif->ipif_recovery_id = 0;
- bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr,
- sizeof (ipif->ipif_v6lcl_addr));
+ if (ipif->ipif_isv6) {
+ bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr,
+ sizeof (ipif->ipif_v6lcl_addr));
+ } else {
+ bcopy(&ipif->ipif_lcl_addr, mp->b_rptr,
+ sizeof (ipif->ipif_lcl_addr));
+ }
ill_refhold(ill);
- qwriter_ip(ill, ill->ill_rq, mp, ip_ndp_recover, NEW_OP,
+ qwriter_ip(ill, ill->ill_rq, mp, ip_addr_recover, NEW_OP,
B_FALSE);
}
}
@@ -1644,80 +1357,19 @@ ndp_do_recovery(ipif_t *ipif)
* Find the MAC and IP addresses in an NA/NS message.
*/
static void
-ip_ndp_find_addresses(mblk_t *mp, mblk_t *dl_mp, ill_t *ill, in6_addr_t *targp,
- uchar_t **haddr, uint_t *haddrlenp)
+ip_ndp_find_addresses(mblk_t *mp, ip_recv_attr_t *ira, ill_t *ill,
+ in6_addr_t *targp, uchar_t **haddr, uint_t *haddrlenp)
{
- ip6_t *ip6h = (ip6_t *)mp->b_rptr;
icmp6_t *icmp6 = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
- nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;
nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
uchar_t *addr;
- int alen = 0;
+ int alen;
- if (dl_mp == NULL) {
- nd_opt_hdr_t *opt = NULL;
- int len;
-
- /*
- * If it's from the fast-path, then it can't be a probe
- * message, and thus must include a linkaddr option.
- * Extract that here.
- */
- switch (icmp6->icmp6_type) {
- case ND_NEIGHBOR_SOLICIT:
- len = mp->b_wptr - (uchar_t *)ns;
- if ((len -= sizeof (*ns)) > 0) {
- opt = ndp_get_option((nd_opt_hdr_t *)(ns + 1),
- len, ND_OPT_SOURCE_LINKADDR);
- }
- break;
- case ND_NEIGHBOR_ADVERT:
- len = mp->b_wptr - (uchar_t *)na;
- if ((len -= sizeof (*na)) > 0) {
- opt = ndp_get_option((nd_opt_hdr_t *)(na + 1),
- len, ND_OPT_TARGET_LINKADDR);
- }
- break;
- }
-
- if (opt != NULL && opt->nd_opt_len * 8 - sizeof (*opt) >=
- ill->ill_nd_lla_len) {
- addr = (uchar_t *)(opt + 1);
- alen = ill->ill_nd_lla_len;
- }
-
- /*
- * We cheat a bit here for the sake of printing usable log
- * messages in the rare case where the reply we got was unicast
- * without a source linkaddr option, and the interface is in
- * fastpath mode. (Sigh.)
- */
- if (alen == 0 && ill->ill_type == IFT_ETHER &&
- MBLKHEAD(mp) >= sizeof (struct ether_header)) {
- struct ether_header *pether;
-
- pether = (struct ether_header *)((char *)ip6h -
- sizeof (*pether));
- addr = pether->ether_shost.ether_addr_octet;
- alen = ETHERADDRL;
- }
- } else {
- dl_unitdata_ind_t *dlu;
-
- dlu = (dl_unitdata_ind_t *)dl_mp->b_rptr;
- alen = dlu->dl_src_addr_length;
- if (alen > 0 && dlu->dl_src_addr_offset >= sizeof (*dlu) &&
- dlu->dl_src_addr_offset + alen <= MBLKL(dl_mp)) {
- addr = dl_mp->b_rptr + dlu->dl_src_addr_offset;
- if (ill->ill_sap_length < 0) {
- alen += ill->ill_sap_length;
- } else {
- addr += ill->ill_sap_length;
- alen -= ill->ill_sap_length;
- }
- }
- }
+ /* icmp_inbound_v6 ensures this */
+ ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
+ addr = ira->ira_l2src;
+ alen = ill->ill_phys_addr_length;
if (alen > 0) {
*haddr = addr;
*haddrlenp = alen;
@@ -1740,35 +1392,58 @@ ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
{
ill_t *ill = rq->q_ptr;
ipif_t *ipif;
- mblk_t *dl_mp = NULL;
uchar_t *haddr;
uint_t haddrlen;
ip_stack_t *ipst = ill->ill_ipst;
in6_addr_t targ;
-
- if (DB_TYPE(mp) != M_DATA) {
- dl_mp = mp;
- mp = mp->b_cont;
+ ip_recv_attr_t iras;
+ mblk_t *attrmp;
+
+ attrmp = mp;
+ mp = mp->b_cont;
+ attrmp->b_cont = NULL;
+ if (!ip_recv_attr_from_mblk(attrmp, &iras)) {
+ /* The ill or ip_stack_t disappeared on us */
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ip_recv_attr_from_mblk", mp, ill);
+ freemsg(mp);
+ ira_cleanup(&iras, B_TRUE);
+ return;
}
- ip_ndp_find_addresses(mp, dl_mp, ill, &targ, &haddr, &haddrlen);
+ ASSERT(ill == iras.ira_rill);
+
+ ip_ndp_find_addresses(mp, &iras, ill, &targ, &haddr, &haddrlen);
if (haddr != NULL && haddrlen == ill->ill_phys_addr_length) {
/*
* Ignore conflicts generated by misbehaving switches that
* just reflect our own messages back to us. For IPMP, we may
* see reflections across any ill in the illgrp.
+ *
+ * RFC2462 and revisions tried to detect both the case
+ * when a statically configured IPv6 address is a duplicate,
+ * and the case when the L2 address itself is a duplicate. The
+ * latter is important because, with stateless address autoconf,
+ * if the L2 address is a duplicate, the resulting IPv6
+ * address(es) would also be duplicates. We rely on DAD of the
+ * IPv6 address itself to detect the latter case.
*/
+ /* For an under-IPMP ill, ill_grp can change; hold the lock */
+ rw_enter(&ipst->ips_ill_g_lock, RW_READER);
if (bcmp(haddr, ill->ill_phys_addr, haddrlen) == 0 ||
IS_UNDER_IPMP(ill) &&
- ipmp_illgrp_find_ill(ill->ill_grp, haddr, haddrlen) != NULL)
+ ipmp_illgrp_find_ill(ill->ill_grp, haddr,
+ haddrlen) != NULL) {
+ rw_exit(&ipst->ips_ill_g_lock);
goto ignore_conflict;
+ }
+ rw_exit(&ipst->ips_ill_g_lock);
}
/*
* Look up the appropriate ipif.
*/
- ipif = ipif_lookup_addr_v6(&targ, ill, ALL_ZONES, NULL, NULL, NULL,
- NULL, ipst);
+ ipif = ipif_lookup_addr_v6(&targ, ill, ALL_ZONES, ipst);
if (ipif == NULL)
goto ignore_conflict;
@@ -1802,43 +1477,64 @@ ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
ill->ill_ipif_dup_count++;
mutex_exit(&ill->ill_lock);
(void) ipif_down(ipif, NULL, NULL);
- ipif_down_tail(ipif);
+ (void) ipif_down_tail(ipif);
mutex_enter(&ill->ill_lock);
if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
ill->ill_net_type == IRE_IF_RESOLVER &&
!(ipif->ipif_state_flags & IPIF_CONDEMNED) &&
ipst->ips_ip_dup_recovery > 0) {
ASSERT(ipif->ipif_recovery_id == 0);
- ipif->ipif_recovery_id = timeout(ipif6_dup_recovery,
+ ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
}
mutex_exit(&ill->ill_lock);
ipif_refrele(ipif);
+
ignore_conflict:
- if (dl_mp != NULL)
- freeb(dl_mp);
freemsg(mp);
+ ira_cleanup(&iras, B_TRUE);
}
/*
* Handle failure by tearing down the ipifs with the specified address. Note
- * that tearing down the ipif also means deleting the nce through ipif_down, so
- * it's not possible to do recovery by just restarting the nce timer. Instead,
+ * that tearing down the ipif also means deleting the ncec through ipif_down, so
+ * it's not possible to do recovery by just restarting the ncec timer. Instead,
* we start a timer on the ipif.
+ * Caller has to free mp;
*/
static void
-ip_ndp_failure(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
+ndp_failure(mblk_t *mp, ip_recv_attr_t *ira)
{
+ const uchar_t *haddr;
+ ill_t *ill = ira->ira_rill;
+
+ /*
+ * Ignore conflicts generated by misbehaving switches that just
+ * reflect our own messages back to us.
+ */
+
+ /* icmp_inbound_v6 ensures this */
+ ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
+ haddr = ira->ira_l2src;
+ if (haddr != NULL &&
+ bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) {
+ return;
+ }
+
if ((mp = copymsg(mp)) != NULL) {
- if (dl_mp == NULL)
- dl_mp = mp;
- else if ((dl_mp = copyb(dl_mp)) != NULL)
- dl_mp->b_cont = mp;
- if (dl_mp == NULL) {
+ mblk_t *attrmp;
+
+ attrmp = ip_recv_attr_to_mblk(ira);
+ if (attrmp == NULL) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards", mp, ill);
freemsg(mp);
} else {
+ ASSERT(attrmp->b_cont == NULL);
+ attrmp->b_cont = mp;
+ mp = attrmp;
ill_refhold(ill);
- qwriter_ip(ill, ill->ill_rq, dl_mp, ip_ndp_excl, NEW_OP,
+ qwriter_ip(ill, ill->ill_rq, mp, ip_ndp_excl, NEW_OP,
B_FALSE);
}
}
@@ -1848,20 +1544,39 @@ ip_ndp_failure(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
* Handle a discovered conflict: some other system is advertising that it owns
* one of our IP addresses. We need to defend ourselves, or just shut down the
* interface.
+ *
+ * Handles both IPv4 and IPv6
*/
-static void
-ip_ndp_conflict(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce)
+boolean_t
+ip_nce_conflict(mblk_t *mp, ip_recv_attr_t *ira, ncec_t *ncec)
{
- ipif_t *ipif;
- uint32_t now;
- uint_t maxdefense;
- uint_t defs;
- ip_stack_t *ipst = ill->ill_ipst;
+ ipif_t *ipif;
+ clock_t now;
+ uint_t maxdefense;
+ uint_t defs;
+ ill_t *ill = ira->ira_ill;
+ ip_stack_t *ipst = ill->ill_ipst;
+ uint32_t elapsed;
+ boolean_t isv6 = ill->ill_isv6;
+ ipaddr_t ncec_addr;
- ipif = ipif_lookup_addr_v6(&nce->nce_addr, ill, ALL_ZONES, NULL, NULL,
- NULL, NULL, ipst);
+ if (isv6) {
+ ipif = ipif_lookup_addr_v6(&ncec->ncec_addr, ill, ALL_ZONES,
+ ipst);
+ } else {
+ if (arp_no_defense) {
+ /*
+ * Yes, there is a conflict, but no, we do not
+ * defend ourself.
+ */
+ return (B_TRUE);
+ }
+ IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
+ ipif = ipif_lookup_addr(ncec_addr, ill, ALL_ZONES,
+ ipst);
+ }
if (ipif == NULL)
- return;
+ return (B_FALSE);
/*
* First, figure out if this address is disposable.
@@ -1875,50 +1590,51 @@ ip_ndp_conflict(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce)
* Now figure out how many times we've defended ourselves. Ignore
* defenses that happened long in the past.
*/
- now = gethrestime_sec();
- mutex_enter(&nce->nce_lock);
- if ((defs = nce->nce_defense_count) > 0 &&
- now - nce->nce_defense_time > ipst->ips_ip_defend_interval) {
- nce->nce_defense_count = defs = 0;
+ now = ddi_get_lbolt();
+ elapsed = (drv_hztousec(now - ncec->ncec_last_time_defended))/1000000;
+ mutex_enter(&ncec->ncec_lock);
+ if ((defs = ncec->ncec_defense_count) > 0 &&
+ elapsed > ipst->ips_ip_defend_interval) {
+ /*
+ * ip_defend_interval has elapsed.
+ * reset the defense count.
+ */
+ ncec->ncec_defense_count = defs = 0;
}
- nce->nce_defense_count++;
- nce->nce_defense_time = now;
- mutex_exit(&nce->nce_lock);
+ ncec->ncec_defense_count++;
+ ncec->ncec_last_time_defended = now;
+ mutex_exit(&ncec->ncec_lock);
ipif_refrele(ipif);
/*
* If we've defended ourselves too many times already, then give up and
- * tear down the interface(s) using this address. Otherwise, defend by
- * sending out an unsolicited Neighbor Advertisement.
+ * tear down the interface(s) using this address.
+ * Otherwise, caller has to defend by sending out an announce.
*/
if (defs >= maxdefense) {
- ip_ndp_failure(ill, mp, dl_mp);
+ if (isv6)
+ ndp_failure(mp, ira);
+ else
+ arp_failure(mp, ira);
} else {
- char hbuf[MAC_STR_LEN];
- char sbuf[INET6_ADDRSTRLEN];
- uchar_t *haddr;
- uint_t haddrlen;
- in6_addr_t targ;
-
- ip_ndp_find_addresses(mp, dl_mp, ill, &targ, &haddr, &haddrlen);
- cmn_err(CE_WARN, "node %s is using our IP address %s on %s",
- mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf)),
- inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)),
- ill->ill_name);
-
- (void) nce_xmit_advert(nce, B_FALSE, &ipv6_all_hosts_mcast, 0);
+ return (B_TRUE); /* caller must defend this address */
}
+ return (B_FALSE);
}
+/*
+ * Handle reception of Neighbor Solicitation messages.
+ */
static void
-ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
+ndp_input_solicit(mblk_t *mp, ip_recv_attr_t *ira)
{
+ ill_t *ill = ira->ira_ill, *under_ill;
nd_neighbor_solicit_t *ns;
- uint32_t hlen = ill->ill_nd_lla_len;
+ uint32_t hlen = ill->ill_phys_addr_length;
uchar_t *haddr = NULL;
icmp6_t *icmp_nd;
ip6_t *ip6h;
- nce_t *our_nce = NULL;
+ ncec_t *our_ncec = NULL;
in6_addr_t target;
in6_addr_t src;
int len;
@@ -1926,6 +1642,7 @@ ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
nd_opt_hdr_t *opt = NULL;
boolean_t bad_solicit = B_FALSE;
mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib;
+ boolean_t need_ill_refrele = B_FALSE;
ip6h = (ip6_t *)mp->b_rptr;
icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
@@ -1951,7 +1668,6 @@ ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
bad_solicit = B_TRUE;
goto done;
}
-
}
if (IN6_IS_ADDR_UNSPECIFIED(&src)) {
/* Check to see if this is a valid DAD solicitation */
@@ -1974,20 +1690,20 @@ ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
* e.g. the IPMP ill's data link-local. So we match across the illgrp
* to ensure we find the associated NCE.
*/
- our_nce = ndp_lookup_v6(ill, B_TRUE, &target, B_FALSE);
+ our_ncec = ncec_lookup_illgrp_v6(ill, &target);
/*
- * If this is a valid Solicitation, a permanent
- * entry should exist in the cache
+ * If this is a valid Solicitation for an address we are publishing,
+ * then a PUBLISH entry should exist in the cache
*/
- if (our_nce == NULL ||
- !(our_nce->nce_flags & NCE_F_PERMANENT)) {
+ if (our_ncec == NULL || !NCE_PUBLISH(our_ncec)) {
ip1dbg(("ndp_input_solicit: Wrong target in NS?!"
"ifname=%s ", ill->ill_name));
if (ip_debug > 2) {
/* ip1dbg */
pr_addr_dbg(" dst %s\n", AF_INET6, &target);
}
- bad_solicit = B_TRUE;
+ if (our_ncec == NULL)
+ bad_solicit = B_TRUE;
goto done;
}
@@ -1998,7 +1714,7 @@ ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
haddr = (uchar_t *)&opt[1];
if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) ||
hlen == 0) {
- ip1dbg(("ndp_input_solicit: bad SLLA\n"));
+ ip1dbg(("ndp_input_solicit: bad SLLA\n"));
bad_solicit = B_TRUE;
goto done;
}
@@ -2010,7 +1726,7 @@ ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
flag |= NDP_UNICAST;
/*
- * Create/update the entry for the soliciting node.
+ * Create/update the entry for the soliciting node on the ipmp_ill.
* or respond to outstanding queries, don't if
* the source is unspecified address.
*/
@@ -2035,7 +1751,7 @@ ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
* process of verifying the address, then don't respond at all
* and don't keep track of the sender.
*/
- if (our_nce->nce_state == ND_PROBE)
+ if (our_ncec->ncec_state == ND_PROBE)
goto done;
/*
@@ -2048,27 +1764,37 @@ ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
if (haddr == NULL)
goto no_source;
- err = ndp_lookup_then_add_v6(ill,
- B_FALSE,
- haddr,
+ under_ill = ill;
+ if (IS_UNDER_IPMP(under_ill)) {
+ ill = ipmp_ill_hold_ipmp_ill(under_ill);
+ if (ill == NULL)
+ ill = under_ill;
+ else
+ need_ill_refrele = B_TRUE;
+ }
+ err = nce_lookup_then_add_v6(ill,
+ haddr, hlen,
&src, /* Soliciting nodes address */
- &ipv6_all_ones,
- &ipv6_all_zeros,
- 0,
0,
ND_STALE,
&nnce);
+
+ if (need_ill_refrele) {
+ ill_refrele(ill);
+ ill = under_ill;
+ need_ill_refrele = B_FALSE;
+ }
switch (err) {
case 0:
/* done with this entry */
- NCE_REFRELE(nnce);
+ nce_refrele(nnce);
break;
case EEXIST:
/*
* B_FALSE indicates this is not an an advertisement.
*/
- ndp_process(nnce, haddr, 0, B_FALSE);
- NCE_REFRELE(nnce);
+ nce_process(nnce->nce_common, haddr, 0, B_FALSE);
+ nce_refrele(nnce);
break;
default:
ip1dbg(("ndp_input_solicit: Can't create NCE %d\n",
@@ -2088,19 +1814,18 @@ no_source:
bad_solicit = B_TRUE;
goto done;
}
- if (our_nce->nce_state == ND_PROBE) {
+ if (our_ncec->ncec_state == ND_PROBE) {
/*
- * Internally looped-back probes won't have DLPI
- * attached to them. External ones (which are sent by
- * multicast) always will. Just ignore our own
+ * Internally looped-back probes will have
+ * IRAF_L2SRC_LOOPBACK set so we can ignore our own
* transmissions.
*/
- if (dl_mp != NULL) {
+ if (!(ira->ira_flags & IRAF_L2SRC_LOOPBACK)) {
/*
* If someone else is probing our address, then
* we've crossed wires. Declare failure.
*/
- ip_ndp_failure(ill, mp, dl_mp);
+ ndp_failure(mp, ira);
}
goto done;
}
@@ -2110,24 +1835,34 @@ no_source:
*/
src = ipv6_all_hosts_mcast;
}
- /* Response to a solicitation */
- (void) nce_xmit_advert(our_nce, B_TRUE, &src, flag);
+ flag |= nce_advert_flags(our_ncec);
+ (void) ndp_xmit(ill,
+ ND_NEIGHBOR_ADVERT,
+ our_ncec->ncec_lladdr,
+ our_ncec->ncec_lladdr_length,
+ &target, /* Source and target of the advertisement pkt */
+ &src, /* IP Destination (source of original pkt) */
+ flag);
done:
if (bad_solicit)
BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations);
- if (our_nce != NULL)
- NCE_REFRELE(our_nce);
+ if (our_ncec != NULL)
+ ncec_refrele(our_ncec);
}
+/*
+ * Handle reception of Neighbor Advertisement messages
+ */
void
-ndp_input_advert(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
+ndp_input_advert(mblk_t *mp, ip_recv_attr_t *ira)
{
+ ill_t *ill = ira->ira_ill;
nd_neighbor_advert_t *na;
- uint32_t hlen = ill->ill_nd_lla_len;
+ uint32_t hlen = ill->ill_phys_addr_length;
uchar_t *haddr = NULL;
icmp6_t *icmp_nd;
ip6_t *ip6h;
- nce_t *dst_nce = NULL;
+ ncec_t *dst_ncec = NULL;
in6_addr_t target;
nd_opt_hdr_t *opt = NULL;
int len;
@@ -2138,6 +1873,7 @@ ndp_input_advert(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN);
len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN;
na = (nd_neighbor_advert_t *)icmp_nd;
+
if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
(na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) {
ip1dbg(("ndp_input_advert: Target is multicast but the "
@@ -2179,17 +1915,25 @@ ndp_input_advert(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
* our local addresses, and those are spread across all the active
* ills in the group.
*/
- if ((dst_nce = ndp_lookup_v6(ill, B_TRUE, &target, B_FALSE)) == NULL)
+ if ((dst_ncec = ncec_lookup_illgrp_v6(ill, &target)) == NULL)
return;
- if (dst_nce->nce_flags & NCE_F_PERMANENT) {
+ if (NCE_PUBLISH(dst_ncec)) {
/*
- * Someone just advertised one of our local addresses. First,
+ * Someone just advertised an address that we publish. First,
* check it it was us -- if so, we can safely ignore it.
+ * We don't get the haddr from the ira_l2src because, in the
+ * case that the packet originated from us, on an IPMP group,
+ * the ira_l2src would be the link-layer address of the
+ * cast_ill used to send the packet, which may not be the same
+ * as the dst_ncec->ncec_lladdr of the address.
*/
if (haddr != NULL) {
- if (!nce_cmp_ll_addr(dst_nce, haddr, hlen))
- goto out; /* from us -- no conflict */
+ if (ira->ira_flags & IRAF_L2SRC_LOOPBACK)
+ goto out;
+
+ if (!nce_cmp_ll_addr(dst_ncec, haddr, hlen))
+ goto out; /* from us -- no conflict */
/*
* If we're in an IPMP group, check if this is an echo
@@ -2209,59 +1953,96 @@ ndp_input_advert(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
}
/*
- * Our own (looped-back) unsolicited neighbor advertisements
- * will get here with dl_mp == NULL. (These will usually be
- * filtered by the `haddr' checks above, but point-to-point
- * links have no hardware address and thus make it here.)
- */
- if (dl_mp == NULL && dst_nce->nce_state != ND_PROBE)
- goto out;
-
- /*
* This appears to be a real conflict. If we're trying to
* configure this NCE (ND_PROBE), then shut it down.
* Otherwise, handle the discovered conflict.
- *
- * In the ND_PROBE case, dl_mp might be NULL if we're getting
- * a unicast reply. This isn't typically done (multicast is
- * the norm in response to a probe), but we can handle it.
*/
- if (dst_nce->nce_state == ND_PROBE)
- ip_ndp_failure(ill, mp, dl_mp);
- else
- ip_ndp_conflict(ill, mp, dl_mp, dst_nce);
+ if (dst_ncec->ncec_state == ND_PROBE) {
+ ndp_failure(mp, ira);
+ } else {
+ if (ip_nce_conflict(mp, ira, dst_ncec)) {
+ char hbuf[MAC_STR_LEN];
+ char sbuf[INET6_ADDRSTRLEN];
+
+ cmn_err(CE_WARN,
+ "node '%s' is using %s on %s",
+ inet_ntop(AF_INET6, &target, sbuf,
+ sizeof (sbuf)),
+ haddr == NULL ? "<none>" :
+ mac_colon_addr(haddr, hlen, hbuf,
+ sizeof (hbuf)), ill->ill_name);
+ /*
+ * RFC 4862, Section 5.4.4 does not mandate
+ * any specific behavior when an NA matches
+ * a non-tentative address assigned to the
+ * receiver. We make the choice of defending
+ * our address, based on the assumption that
+ * the sender has not detected the Duplicate.
+ *
+ * ncec_last_time_defended has been adjusted
+ * in ip_nce_conflict()
+ */
+ (void) ndp_announce(dst_ncec);
+ }
+ }
} else {
if (na->nd_na_flags_reserved & ND_NA_FLAG_ROUTER)
- dst_nce->nce_flags |= NCE_F_ISROUTER;
+ dst_ncec->ncec_flags |= NCE_F_ISROUTER;
/* B_TRUE indicates this an advertisement */
- ndp_process(dst_nce, haddr, na->nd_na_flags_reserved, B_TRUE);
+ nce_process(dst_ncec, haddr, na->nd_na_flags_reserved, B_TRUE);
}
out:
- NCE_REFRELE(dst_nce);
+ ncec_refrele(dst_ncec);
}
/*
* Process NDP neighbor solicitation/advertisement messages.
* The checksum has already checked o.k before reaching here.
+ * Information about the datalink header is contained in ira_l2src, but
+ * that should be ignored for loopback packets.
*/
void
-ndp_input(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
+ndp_input(mblk_t *mp, ip_recv_attr_t *ira)
{
+ ill_t *ill = ira->ira_rill;
icmp6_t *icmp_nd;
ip6_t *ip6h;
int len;
mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib;
+ ill_t *orig_ill = NULL;
-
+ /*
+ * Since ira_ill is where the IRE_LOCAL was hosted we use ira_rill
+ * and make it be the IPMP upper so avoid being confused by a packet
+ * addressed to a unicast address on a different ill.
+ */
+ if (IS_UNDER_IPMP(ill)) {
+ orig_ill = ill;
+ ill = ipmp_ill_hold_ipmp_ill(orig_ill);
+ if (ill == NULL) {
+ ill = orig_ill;
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards - IPMP ill",
+ mp, ill);
+ freemsg(mp);
+ return;
+ }
+ ASSERT(ill != orig_ill);
+ orig_ill = ira->ira_ill;
+ ira->ira_ill = ill;
+ mib = ill->ill_icmp6_mib;
+ }
if (!pullupmsg(mp, -1)) {
ip1dbg(("ndp_input: pullupmsg failed\n"));
BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards - pullupmsg", mp, ill);
goto done;
}
ip6h = (ip6_t *)mp->b_rptr;
if (ip6h->ip6_hops != IPV6_MAX_HOPS) {
ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n"));
+ ip_drop_input("ipv6IfIcmpBadHoplimit", mp, ill);
BUMP_MIB(mib, ipv6IfIcmpBadHoplimit);
goto done;
}
@@ -2275,6 +2056,7 @@ ndp_input(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
ip1dbg(("ndp_input: Wrong next header 0x%x\n",
ip6h->ip6_nxt));
+ ip_drop_input("Wrong next header", mp, ill);
BUMP_MIB(mib, ipv6IfIcmpInErrors);
goto done;
}
@@ -2283,6 +2065,7 @@ ndp_input(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT);
if (icmp_nd->icmp6_code != 0) {
ip1dbg(("ndp_input: icmp6 code != 0 \n"));
+ ip_drop_input("code non-zero", mp, ill);
BUMP_MIB(mib, ipv6IfIcmpInErrors);
goto done;
}
@@ -2293,54 +2076,25 @@ ndp_input(ill_t *ill, mblk_t *mp, mblk_t *dl_mp)
*/
if (len < sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) {
ip1dbg(("ndp_input: packet too short\n"));
+ ip_drop_input("packet too short", mp, ill);
BUMP_MIB(mib, ipv6IfIcmpInErrors);
goto done;
}
if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) {
- ndp_input_solicit(ill, mp, dl_mp);
+ ndp_input_solicit(mp, ira);
} else {
- ndp_input_advert(ill, mp, dl_mp);
+ ndp_input_advert(mp, ira);
}
done:
freemsg(mp);
+ if (orig_ill != NULL) {
+ ill_refrele(ill);
+ ira->ira_ill = orig_ill;
+ }
}
/*
- * Utility routine to send an advertisement. Assumes that the NCE cannot
- * go away (e.g., because it's refheld).
- */
-static boolean_t
-nce_xmit_advert(nce_t *nce, boolean_t use_nd_lla, const in6_addr_t *target,
- uint_t flags)
-{
- ASSERT((flags & NDP_PROBE) == 0);
-
- if (nce->nce_flags & NCE_F_ISROUTER)
- flags |= NDP_ISROUTER;
- if (!(nce->nce_flags & NCE_F_ANYCAST))
- flags |= NDP_ORIDE;
-
- return (nce_xmit(nce->nce_ill, ND_NEIGHBOR_ADVERT, use_nd_lla,
- &nce->nce_addr, target, flags));
-}
-
-/*
- * Utility routine to send a solicitation. Assumes that the NCE cannot
- * go away (e.g., because it's refheld).
- */
-static boolean_t
-nce_xmit_solicit(nce_t *nce, boolean_t use_nd_lla, const in6_addr_t *sender,
- uint_t flags)
-{
- if (flags & NDP_PROBE)
- sender = &ipv6_all_zeros;
-
- return (nce_xmit(nce->nce_ill, ND_NEIGHBOR_SOLICIT, use_nd_lla,
- sender, &nce->nce_addr, flags));
-}
-
-/*
- * nce_xmit is called to form and transmit a ND solicitation or
+ * ndp_xmit is called to form and transmit a ND solicitation or
* advertisement ICMP packet.
*
* If the source address is unspecified and this isn't a probe (used for
@@ -2353,112 +2107,123 @@ nce_xmit_solicit(nce_t *nce, boolean_t use_nd_lla, const in6_addr_t *sender,
* corresponding ill's ill_wq otherwise returns B_TRUE.
*/
static boolean_t
-nce_xmit(ill_t *ill, uint8_t type, boolean_t use_nd_lla,
+ndp_xmit(ill_t *ill, uint32_t operation, uint8_t *hw_addr, uint_t hw_addr_len,
const in6_addr_t *sender, const in6_addr_t *target, int flag)
{
- ill_t *hwaddr_ill;
uint32_t len;
icmp6_t *icmp6;
mblk_t *mp;
ip6_t *ip6h;
nd_opt_hdr_t *opt;
- uint_t plen, maxplen;
- ip6i_t *ip6i;
- ipif_t *src_ipif = NULL;
- uint8_t *hw_addr;
+ uint_t plen;
zoneid_t zoneid = GLOBAL_ZONEID;
- char buf[INET6_ADDRSTRLEN];
+ ill_t *hwaddr_ill = ill;
+ ip_xmit_attr_t ixas;
+ ip_stack_t *ipst = ill->ill_ipst;
+ boolean_t need_refrele = B_FALSE;
+ boolean_t probe = B_FALSE;
- ASSERT(!IS_IPMP(ill));
+ if (IS_UNDER_IPMP(ill)) {
+ probe = ipif_lookup_testaddr_v6(ill, sender, NULL);
+ /*
+ * We send non-probe packets on the upper IPMP interface.
+ * ip_output_simple() will use cast_ill for sending any
+ * multicast packets. Note that we can't follow the same
+ * logic for probe packets because all interfaces in the ipmp
+ * group may have failed, so that we really want to only try
+ * to send the ND packet on the ill corresponding to the src
+ * address.
+ */
+ if (!probe) {
+ ill = ipmp_ill_hold_ipmp_ill(ill);
+ if (ill != NULL)
+ need_refrele = B_TRUE;
+ else
+ ill = hwaddr_ill;
+ }
+ }
/*
- * Check that the sender is actually a usable address on `ill', and if
- * so, track that as the src_ipif. If not, for solicitations, set the
- * sender to :: so that a new one will be picked below; for adverts,
- * drop the packet since we expect nce_xmit_advert() to always provide
- * a valid sender.
+ * If we have a unspecified source(sender) address, select a
+ * proper source address for the solicitation here itself so
+ * that we can initialize the h/w address correctly.
+ *
+ * If the sender is specified then we use this address in order
+ * to lookup the zoneid before calling ip_output_v6(). This is to
+ * enable unicast ND_NEIGHBOR_ADVERT packets to be routed correctly
+ * by IP (we cannot guarantee that the global zone has an interface
+ * route to the destination).
+ *
+ * Note that the NA never comes here with the unspecified source
+ * address.
*/
- if (!IN6_IS_ADDR_UNSPECIFIED(sender)) {
- if ((src_ipif = ip_ndp_lookup_addr_v6(sender, ill)) == NULL ||
- !src_ipif->ipif_addr_ready) {
- if (src_ipif != NULL) {
- ipif_refrele(src_ipif);
- src_ipif = NULL;
- }
- if (type == ND_NEIGHBOR_ADVERT) {
- ip1dbg(("nce_xmit: No source ipif for src %s\n",
- inet_ntop(AF_INET6, sender, buf,
- sizeof (buf))));
- return (B_TRUE);
- }
- sender = &ipv6_all_zeros;
- }
- }
/*
- * If we still have an unspecified source (sender) address and this
- * isn't a probe, select a source address from `ill'.
+ * Probes will have unspec src at this point.
*/
- if (IN6_IS_ADDR_UNSPECIFIED(sender) && !(flag & NDP_PROBE)) {
- ASSERT(type != ND_NEIGHBOR_ADVERT);
+ if (!(IN6_IS_ADDR_UNSPECIFIED(sender))) {
+ zoneid = ipif_lookup_addr_zoneid_v6(sender, ill, ipst);
/*
- * Pick a source address for this solicitation, but restrict
- * the selection to addresses assigned to the output
- * interface. We do this because the destination will create
- * a neighbor cache entry for the source address of this
- * packet, so the source address needs to be a valid neighbor.
+ * It's possible for ipif_lookup_addr_zoneid_v6() to return
+ * ALL_ZONES if it cannot find a matching ipif for the address
+ * we are trying to use. In this case we err on the side of
+ * trying to send the packet by defaulting to the GLOBAL_ZONEID.
*/
- src_ipif = ipif_select_source_v6(ill, target, B_TRUE,
- IPV6_PREFER_SRC_DEFAULT, ALL_ZONES);
- if (src_ipif == NULL) {
- ip1dbg(("nce_xmit: No source ipif for dst %s\n",
- inet_ntop(AF_INET6, target, buf, sizeof (buf))));
- return (B_TRUE);
- }
- sender = &src_ipif->ipif_v6src_addr;
+ if (zoneid == ALL_ZONES)
+ zoneid = GLOBAL_ZONEID;
}
- /*
- * We're either sending a probe or we have a source address.
- */
- ASSERT((flag & NDP_PROBE) || src_ipif != NULL);
-
- maxplen = roundup(sizeof (nd_opt_hdr_t) + ND_MAX_HDW_LEN, 8);
- len = IPV6_HDR_LEN + sizeof (ip6i_t) + sizeof (nd_neighbor_advert_t) +
- maxplen;
+ plen = (sizeof (nd_opt_hdr_t) + hw_addr_len + 7) / 8;
+ len = IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t) + plen * 8;
mp = allocb(len, BPRI_LO);
if (mp == NULL) {
- if (src_ipif != NULL)
- ipif_refrele(src_ipif);
+ if (need_refrele)
+ ill_refrele(ill);
return (B_TRUE);
}
+
bzero((char *)mp->b_rptr, len);
mp->b_wptr = mp->b_rptr + len;
- ip6i = (ip6i_t *)mp->b_rptr;
- ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
- ip6i->ip6i_nxt = IPPROTO_RAW;
- ip6i->ip6i_flags = IP6I_HOPLIMIT;
- if (flag & NDP_PROBE)
- ip6i->ip6i_flags |= IP6I_UNSPEC_SRC;
+ bzero(&ixas, sizeof (ixas));
+ ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6 | IXAF_NO_HW_CKSUM;
- ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t));
+ ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex;
+ ixas.ixa_ipst = ipst;
+ ixas.ixa_cred = kcred;
+ ixas.ixa_cpid = NOPID;
+ ixas.ixa_tsl = NULL;
+ ixas.ixa_zoneid = zoneid;
+
+ ip6h = (ip6_t *)mp->b_rptr;
ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
- ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t));
+ ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
ip6h->ip6_nxt = IPPROTO_ICMPV6;
ip6h->ip6_hops = IPV6_MAX_HOPS;
- ip6h->ip6_src = *sender;
+ ixas.ixa_multicast_ttl = ip6h->ip6_hops;
ip6h->ip6_dst = *target;
icmp6 = (icmp6_t *)&ip6h[1];
- opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN +
- sizeof (nd_neighbor_advert_t));
-
- if (type == ND_NEIGHBOR_SOLICIT) {
+ if (hw_addr_len != 0) {
+ opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN +
+ sizeof (nd_neighbor_advert_t));
+ } else {
+ opt = NULL;
+ }
+ if (operation == ND_NEIGHBOR_SOLICIT) {
nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6;
- if (!(flag & NDP_PROBE))
+ if (opt != NULL && !(flag & NDP_PROBE)) {
+ /*
+ * Note that we don't send out SLLA for ND probes
+ * per RFC 4862, even though we do send out the src
+ * haddr for IPv4 DAD probes, even though both IPv4
+ * and IPv6 go out with the unspecified/INADDR_ANY
+ * src IP addr.
+ */
opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
+ }
+ ip6h->ip6_src = *sender;
ns->nd_ns_target = *target;
if (!(flag & NDP_UNICAST)) {
/* Form multicast address of the target */
@@ -2470,7 +2235,9 @@ nce_xmit(ill_t *ill, uint8_t type, boolean_t use_nd_lla,
nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6;
ASSERT(!(flag & NDP_PROBE));
- opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
+ if (opt != NULL)
+ opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
+ ip6h->ip6_src = *sender;
na->nd_na_target = *sender;
if (flag & NDP_ISROUTER)
na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER;
@@ -2480,231 +2247,223 @@ nce_xmit(ill_t *ill, uint8_t type, boolean_t use_nd_lla,
na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE;
}
- hw_addr = NULL;
if (!(flag & NDP_PROBE)) {
- /*
- * Use our source address to find the hardware address to put
- * in the packet, so that the hardware address and IP address
- * will match up -- even if that hardware address doesn't
- * match the ill we actually transmit the packet through.
- */
- if (IS_IPMP(src_ipif->ipif_ill)) {
- hwaddr_ill = ipmp_ipif_hold_bound_ill(src_ipif);
- if (hwaddr_ill == NULL) {
- ip1dbg(("nce_xmit: no bound ill!\n"));
- ipif_refrele(src_ipif);
- freemsg(mp);
- return (B_TRUE);
- }
- } else {
- hwaddr_ill = src_ipif->ipif_ill;
- ill_refhold(hwaddr_ill); /* for symmetry */
- }
-
- plen = roundup(sizeof (nd_opt_hdr_t) +
- hwaddr_ill->ill_nd_lla_len, 8);
-
- hw_addr = use_nd_lla ? hwaddr_ill->ill_nd_lla :
- hwaddr_ill->ill_phys_addr;
- if (hw_addr != NULL) {
+ if (hw_addr != NULL && opt != NULL) {
/* Fill in link layer address and option len */
- opt->nd_opt_len = (uint8_t)(plen / 8);
- bcopy(hw_addr, &opt[1], hwaddr_ill->ill_nd_lla_len);
+ opt->nd_opt_len = (uint8_t)plen;
+ bcopy(hw_addr, &opt[1], hw_addr_len);
}
-
- ill_refrele(hwaddr_ill);
+ }
+ if (opt != NULL && opt->nd_opt_type == 0) {
+ /* If there's no link layer address option, then strip it. */
+ len -= plen * 8;
+ mp->b_wptr = mp->b_rptr + len;
+ ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
}
- if (hw_addr == NULL)
- plen = 0;
-
- /* Fix up the length of the packet now that plen is known */
- len -= (maxplen - plen);
- mp->b_wptr = mp->b_rptr + len;
- ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t));
-
- icmp6->icmp6_type = type;
+ icmp6->icmp6_type = (uint8_t)operation;
icmp6->icmp6_code = 0;
/*
* Prepare for checksum by putting icmp length in the icmp
- * checksum field. The checksum is calculated in ip_wput_v6.
+ * checksum field. The checksum is calculated in ip_output.c.
*/
icmp6->icmp6_cksum = ip6h->ip6_plen;
- /*
- * Before we toss the src_ipif, look up the zoneid to pass to
- * ip_output_v6(). This is to ensure unicast ND_NEIGHBOR_ADVERT
- * packets to be routed correctly by IP (we cannot guarantee that the
- * global zone has an interface route to the destination).
- */
- if (src_ipif != NULL) {
- if ((zoneid = src_ipif->ipif_zoneid) == ALL_ZONES)
- zoneid = GLOBAL_ZONEID;
- ipif_refrele(src_ipif);
- }
-
- ip_output_v6((void *)(uintptr_t)zoneid, mp, ill->ill_wq, IP_WPUT);
+ (void) ip_output_simple(mp, &ixas);
+ ixa_cleanup(&ixas);
+ if (need_refrele)
+ ill_refrele(ill);
return (B_FALSE);
}
/*
- * Make a link layer address (does not include the SAP) from an nce.
- * To form the link layer address, use the last four bytes of ipv6
- * address passed in and the fixed offset stored in nce.
+ * Used to set ND_UNREACHBLE before ncec_delete sets it NCE_F_CONDEMNED.
+ * The datapath uses this as an indication that there
+ * is a problem (as opposed to a NCE that was just
+ * reclaimed due to lack of memory.
+ * Note that static ARP entries never become unreachable.
*/
-static void
-nce_make_mapping(nce_t *nce, uchar_t *addrpos, uchar_t *addr)
-{
- uchar_t *mask, *to;
- ill_t *ill = nce->nce_ill;
- int len;
-
- if (ill->ill_net_type == IRE_IF_NORESOLVER)
- return;
- ASSERT(nce->nce_res_mp != NULL);
- ASSERT(ill->ill_net_type == IRE_IF_RESOLVER);
- ASSERT(nce->nce_flags & NCE_F_MAPPING);
- ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask));
- ASSERT(addr != NULL);
- bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill),
- addrpos, ill->ill_nd_lla_len);
- len = MIN((int)ill->ill_nd_lla_len - nce->nce_ll_extract_start,
- IPV6_ADDR_LEN);
- mask = (uchar_t *)&nce->nce_extract_mask;
- mask += (IPV6_ADDR_LEN - len);
- addr += (IPV6_ADDR_LEN - len);
- to = addrpos + nce->nce_ll_extract_start;
- while (len-- > 0)
- *to++ |= *mask++ & *addr++;
-}
-
-mblk_t *
-nce_udreq_alloc(ill_t *ill)
+void
+nce_make_unreachable(ncec_t *ncec)
{
- mblk_t *template_mp = NULL;
- dl_unitdata_req_t *dlur;
- int sap_length;
-
- ASSERT(ill->ill_isv6);
-
- sap_length = ill->ill_sap_length;
- template_mp = ip_dlpi_alloc(sizeof (dl_unitdata_req_t) +
- ill->ill_nd_lla_len + ABS(sap_length), DL_UNITDATA_REQ);
- if (template_mp == NULL)
- return (NULL);
-
- dlur = (dl_unitdata_req_t *)template_mp->b_rptr;
- dlur->dl_priority.dl_min = 0;
- dlur->dl_priority.dl_max = 0;
- dlur->dl_dest_addr_length = ABS(sap_length) + ill->ill_nd_lla_len;
- dlur->dl_dest_addr_offset = sizeof (dl_unitdata_req_t);
-
- /* Copy in the SAP value. */
- NCE_LL_SAP_COPY(ill, template_mp);
-
- return (template_mp);
+ mutex_enter(&ncec->ncec_lock);
+ ncec->ncec_state = ND_UNREACHABLE;
+ mutex_exit(&ncec->ncec_lock);
}
/*
- * NDP retransmit timer.
+ * NCE retransmit timer. Common to IPv4 and IPv6.
* This timer goes off when:
- * a. It is time to retransmit NS for resolver.
+ * a. It is time to retransmit a resolution for resolver.
* b. It is time to send reachability probes.
*/
void
-ndp_timer(void *arg)
+nce_timer(void *arg)
{
- nce_t *nce = arg;
- ill_t *ill = nce->nce_ill;
+ ncec_t *ncec = arg;
+ ill_t *ill = ncec->ncec_ill, *src_ill;
char addrbuf[INET6_ADDRSTRLEN];
boolean_t dropped = B_FALSE;
- ip_stack_t *ipst = ill->ill_ipst;
+ ip_stack_t *ipst = ncec->ncec_ipst;
+ boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
+ in_addr_t sender4 = INADDR_ANY;
+ in6_addr_t sender6 = ipv6_all_zeros;
/*
- * The timer has to be cancelled by ndp_delete before doing the final
+ * The timer has to be cancelled by ncec_delete before doing the final
* refrele. So the NCE is guaranteed to exist when the timer runs
* until it clears the timeout_id. Before clearing the timeout_id
- * bump up the refcnt so that we can continue to use the nce
+ * bump up the refcnt so that we can continue to use the ncec
*/
- ASSERT(nce != NULL);
-
- mutex_enter(&nce->nce_lock);
- NCE_REFHOLD_LOCKED(nce);
- nce->nce_timeout_id = 0;
+ ASSERT(ncec != NULL);
+ mutex_enter(&ncec->ncec_lock);
+ ncec_refhold_locked(ncec);
+ ncec->ncec_timeout_id = 0;
+ mutex_exit(&ncec->ncec_lock);
+
+ src_ill = nce_resolve_src(ncec, &sender6);
+ /* if we could not find a sender address, return */
+ if (src_ill == NULL) {
+ if (!isv6) {
+ IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, sender4);
+ ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET,
+ &sender4, addrbuf, sizeof (addrbuf))));
+ } else {
+ ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET6,
+ &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
+ }
+ nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
+ ncec_refrele(ncec);
+ return;
+ }
+ if (!isv6)
+ IN6_V4MAPPED_TO_IPADDR(&sender6, sender4);
+ mutex_enter(&ncec->ncec_lock);
/*
- * Check the reachability state first.
+ * Check the reachability state.
*/
- switch (nce->nce_state) {
+ switch (ncec->ncec_state) {
case ND_DELAY:
- nce->nce_state = ND_PROBE;
- mutex_exit(&nce->nce_lock);
- (void) nce_xmit_solicit(nce, B_FALSE, &ipv6_all_zeros,
- NDP_UNICAST);
+ ASSERT(ncec->ncec_lladdr != NULL);
+ ncec->ncec_state = ND_PROBE;
+ ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
+ if (isv6) {
+ mutex_exit(&ncec->ncec_lock);
+ (void) ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT,
+ src_ill->ill_phys_addr,
+ src_ill->ill_phys_addr_length,
+ &sender6, &ncec->ncec_addr,
+ NDP_UNICAST);
+ } else {
+ (void) arp_request(ncec, sender4, src_ill);
+ mutex_exit(&ncec->ncec_lock);
+ }
if (ip_debug > 3) {
/* ip2dbg */
- pr_addr_dbg("ndp_timer: state for %s changed "
- "to PROBE\n", AF_INET6, &nce->nce_addr);
+ pr_addr_dbg("nce_timer: state for %s changed "
+ "to PROBE\n", AF_INET6, &ncec->ncec_addr);
}
- NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time);
- NCE_REFRELE(nce);
- return;
+ nce_restart_timer(ncec, ill->ill_reachable_retrans_time);
+ break;
case ND_PROBE:
/* must be retransmit timer */
- nce->nce_pcnt--;
- ASSERT(nce->nce_pcnt < ND_MAX_UNICAST_SOLICIT &&
- nce->nce_pcnt >= -1);
- if (nce->nce_pcnt > 0) {
+ ASSERT(ncec->ncec_pcnt >= -1);
+ if (ncec->ncec_pcnt > 0) {
/*
- * As per RFC2461, the nce gets deleted after
+ * As per RFC2461, the ncec gets deleted after
* MAX_UNICAST_SOLICIT unsuccessful re-transmissions.
* Note that the first unicast solicitation is sent
* during the DELAY state.
*/
- ip2dbg(("ndp_timer: pcount=%x dst %s\n",
- nce->nce_pcnt, inet_ntop(AF_INET6, &nce->nce_addr,
- addrbuf, sizeof (addrbuf))));
- mutex_exit(&nce->nce_lock);
- dropped = nce_xmit_solicit(nce, B_FALSE,
- &ipv6_all_zeros,
- (nce->nce_flags & NCE_F_PERMANENT) ? NDP_PROBE :
- NDP_UNICAST);
- if (dropped) {
- mutex_enter(&nce->nce_lock);
- nce->nce_pcnt++;
- mutex_exit(&nce->nce_lock);
+ ip2dbg(("nce_timer: pcount=%x dst %s\n",
+ ncec->ncec_pcnt,
+ inet_ntop((isv6? AF_INET6 : AF_INET),
+ &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
+ if (NCE_PUBLISH(ncec)) {
+ mutex_exit(&ncec->ncec_lock);
+ /*
+ * send out a probe; note that src_ill
+ * is ignored by nce_dad() for all
+ * DAD message types other than IPv6
+ * unicast probes
+ */
+ nce_dad(ncec, src_ill, B_TRUE);
+ } else {
+ ASSERT(src_ill != NULL);
+ ncec->ncec_pcnt--;
+ if (isv6) {
+ mutex_exit(&ncec->ncec_lock);
+ (void) ndp_xmit(src_ill,
+ ND_NEIGHBOR_SOLICIT,
+ src_ill->ill_phys_addr,
+ src_ill->ill_phys_addr_length,
+ &sender6, &ncec->ncec_addr,
+ NDP_UNICAST);
+ } else {
+ /*
+ * since the nce is REACHABLE,
+ * the ARP request will be sent out
+ * as a link-layer unicast.
+ */
+ (void) arp_request(ncec, sender4,
+ src_ill);
+ mutex_exit(&ncec->ncec_lock);
+ }
+ nce_restart_timer(ncec,
+ ill->ill_reachable_retrans_time);
}
- NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(ill));
- } else if (nce->nce_pcnt < 0) {
- /* No hope, delete the nce */
- nce->nce_state = ND_UNREACHABLE;
- mutex_exit(&nce->nce_lock);
+ } else if (ncec->ncec_pcnt < 0) {
+ /* No hope, delete the ncec */
+ /* Tell datapath it went bad */
+ ncec->ncec_state = ND_UNREACHABLE;
+ mutex_exit(&ncec->ncec_lock);
if (ip_debug > 2) {
/* ip1dbg */
- pr_addr_dbg("ndp_timer: Delete IRE for"
- " dst %s\n", AF_INET6, &nce->nce_addr);
+ pr_addr_dbg("nce_timer: Delete NCE for"
+ " dst %s\n", (isv6? AF_INET6: AF_INET),
+ &ncec->ncec_addr);
}
- ndp_delete(nce);
- } else if (!(nce->nce_flags & NCE_F_PERMANENT)) {
- /* Wait RetransTimer, before deleting the entry */
- ip2dbg(("ndp_timer: pcount=%x dst %s\n",
- nce->nce_pcnt, inet_ntop(AF_INET6,
- &nce->nce_addr, addrbuf, sizeof (addrbuf))));
- mutex_exit(&nce->nce_lock);
+ /* if static ARP can't delete. */
+ if ((ncec->ncec_flags & NCE_F_STATIC) == 0)
+ ncec_delete(ncec);
+
+ } else if (!NCE_PUBLISH(ncec)) {
+ /*
+ * Probe count is 0 for a dynamic entry (one that we
+ * ourselves are not publishing). We should never get
+ * here if NONUD was requested, hence the ASSERT below.
+ */
+ ASSERT((ncec->ncec_flags & NCE_F_NONUD) == 0);
+ ip2dbg(("nce_timer: pcount=%x dst %s\n",
+ ncec->ncec_pcnt, inet_ntop(AF_INET6,
+ &ncec->ncec_addr, addrbuf, sizeof (addrbuf))));
+ ncec->ncec_pcnt--;
+ mutex_exit(&ncec->ncec_lock);
/* Wait one interval before killing */
- NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time);
+ nce_restart_timer(ncec,
+ ill->ill_reachable_retrans_time);
} else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) {
ipif_t *ipif;
+ ipaddr_t ncec_addr;
/*
* We're done probing, and we can now declare this
* address to be usable. Let IP know that it's ok to
* use.
*/
- nce->nce_state = ND_REACHABLE;
- mutex_exit(&nce->nce_lock);
- ipif = ip_ndp_lookup_addr_v6(&nce->nce_addr,
- nce->nce_ill);
+ ncec->ncec_state = ND_REACHABLE;
+ ncec->ncec_flags &= ~NCE_F_UNVERIFIED;
+ mutex_exit(&ncec->ncec_lock);
+ if (isv6) {
+ ipif = ipif_lookup_addr_exact_v6(
+ &ncec->ncec_addr, ill, ipst);
+ } else {
+ IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
+ ncec_addr);
+ ipif = ipif_lookup_addr_exact(ncec_addr, ill,
+ ipst);
+ }
if (ipif != NULL) {
if (ipif->ipif_was_dup) {
char ibuf[LIFNAMSIZ + 10];
@@ -2725,17 +2484,28 @@ ndp_timer(void *arg)
ipif->ipif_addr_ready = 1;
ipif_refrele(ipif);
}
+ if (!isv6 && arp_no_defense)
+ break;
/* Begin defending our new address */
- nce->nce_unsolicit_count = 0;
- dropped = nce_xmit_advert(nce, B_FALSE,
- &ipv6_all_hosts_mcast, 0);
- if (dropped) {
- nce->nce_unsolicit_count = 1;
- NDP_RESTART_TIMER(nce,
- ipst->ips_ip_ndp_unsolicit_interval);
- } else if (ipst->ips_ip_ndp_defense_interval != 0) {
- NDP_RESTART_TIMER(nce,
- ipst->ips_ip_ndp_defense_interval);
+ if (ncec->ncec_unsolicit_count > 0) {
+ ncec->ncec_unsolicit_count--;
+ if (isv6) {
+ dropped = ndp_announce(ncec);
+ } else {
+ dropped = arp_announce(ncec);
+ }
+
+ if (dropped)
+ ncec->ncec_unsolicit_count++;
+ else
+ ncec->ncec_last_time_defended =
+ ddi_get_lbolt();
+ }
+ if (ncec->ncec_unsolicit_count > 0) {
+ nce_restart_timer(ncec,
+ ANNOUNCE_INTERVAL(isv6));
+ } else if (DEFENSE_INTERVAL(isv6) != 0) {
+ nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
}
} else {
/*
@@ -2744,76 +2514,93 @@ ndp_timer(void *arg)
* doing anything, but switch to reachable state so
* that the restart will work.
*/
- nce->nce_state = ND_REACHABLE;
- mutex_exit(&nce->nce_lock);
+ ncec->ncec_state = ND_REACHABLE;
+ mutex_exit(&ncec->ncec_lock);
}
- NCE_REFRELE(nce);
- return;
+ break;
case ND_INCOMPLETE: {
- ip6_t *ip6h;
- ip6i_t *ip6i;
- mblk_t *mp, *datamp, *nextmp, **prevmpp;
+ mblk_t *mp, *nextmp;
+ mblk_t **prevmpp;
/*
- * Per case (2) in the nce_queue_mp() comments, scan nce_qd_mp
- * for any IPMP probe packets, and toss 'em. IPMP probe
- * packets will always be at the head of nce_qd_mp and always
- * have an ip6i_t header, so we can stop at the first queued
- * ND packet without an ip6i_t.
+ * Per case (2) in the nce_queue_mp() comments, scan ncec_qd_mp
+ * for any IPMP probe packets, and toss them. IPMP probe
+ * packets will always be at the head of ncec_qd_mp, so that
+ * we can stop at the first queued ND packet that is
+ * not a probe packet.
*/
- prevmpp = &nce->nce_qd_mp;
- for (mp = nce->nce_qd_mp; mp != NULL; mp = nextmp) {
+ prevmpp = &ncec->ncec_qd_mp;
+ for (mp = ncec->ncec_qd_mp; mp != NULL; mp = nextmp) {
nextmp = mp->b_next;
- datamp = (DB_TYPE(mp) == M_CTL) ? mp->b_cont : mp;
- ip6h = (ip6_t *)datamp->b_rptr;
- if (ip6h->ip6_nxt != IPPROTO_RAW)
- break;
- ip6i = (ip6i_t *)ip6h;
- if (ip6i->ip6i_flags & IP6I_IPMP_PROBE) {
+ if (IS_UNDER_IPMP(ill) && ncec->ncec_nprobes > 0) {
inet_freemsg(mp);
+ ncec->ncec_nprobes--;
*prevmpp = nextmp;
} else {
prevmpp = &mp->b_next;
}
}
- ip_ndp_resolve(nce);
- mutex_exit(&nce->nce_lock);
- NCE_REFRELE(nce);
+
+ /*
+ * Must be resolver's retransmit timer.
+ */
+ mutex_exit(&ncec->ncec_lock);
+ ip_ndp_resolve(ncec);
break;
}
case ND_REACHABLE:
- if (((nce->nce_flags & NCE_F_UNSOL_ADV) &&
- nce->nce_unsolicit_count != 0) ||
- ((nce->nce_flags & NCE_F_PERMANENT) &&
- ipst->ips_ip_ndp_defense_interval != 0)) {
- if (nce->nce_unsolicit_count > 0)
- nce->nce_unsolicit_count--;
- mutex_exit(&nce->nce_lock);
- dropped = nce_xmit_advert(nce, B_FALSE,
- &ipv6_all_hosts_mcast, 0);
+ if (((ncec->ncec_flags & NCE_F_UNSOL_ADV) &&
+ ncec->ncec_unsolicit_count != 0) ||
+ (NCE_PUBLISH(ncec) && DEFENSE_INTERVAL(isv6) != 0)) {
+ if (ncec->ncec_unsolicit_count > 0) {
+ ncec->ncec_unsolicit_count--;
+ mutex_exit(&ncec->ncec_lock);
+ /*
+ * When we get to zero announcements left,
+ * switch to address defense
+ */
+ } else {
+ boolean_t rate_limit;
+
+ mutex_exit(&ncec->ncec_lock);
+ rate_limit = ill_defend_rate_limit(ill, ncec);
+ if (rate_limit) {
+ nce_restart_timer(ncec,
+ DEFENSE_INTERVAL(isv6));
+ break;
+ }
+ }
+ if (isv6) {
+ dropped = ndp_announce(ncec);
+ } else {
+ dropped = arp_announce(ncec);
+ }
+ mutex_enter(&ncec->ncec_lock);
if (dropped) {
- mutex_enter(&nce->nce_lock);
- nce->nce_unsolicit_count++;
- mutex_exit(&nce->nce_lock);
+ ncec->ncec_unsolicit_count++;
+ } else {
+ ncec->ncec_last_time_defended =
+ ddi_get_lbolt();
}
- if (nce->nce_unsolicit_count != 0) {
- NDP_RESTART_TIMER(nce,
- ipst->ips_ip_ndp_unsolicit_interval);
+ mutex_exit(&ncec->ncec_lock);
+ if (ncec->ncec_unsolicit_count != 0) {
+ nce_restart_timer(ncec,
+ ANNOUNCE_INTERVAL(isv6));
} else {
- NDP_RESTART_TIMER(nce,
- ipst->ips_ip_ndp_defense_interval);
+ nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6));
}
} else {
- mutex_exit(&nce->nce_lock);
+ mutex_exit(&ncec->ncec_lock);
}
- NCE_REFRELE(nce);
break;
default:
- mutex_exit(&nce->nce_lock);
- NCE_REFRELE(nce);
+ mutex_exit(&ncec->ncec_lock);
break;
}
+done:
+ ncec_refrele(ncec);
+ ill_refrele(src_ill);
}
/*
@@ -2821,31 +2608,21 @@ ndp_timer(void *arg)
* Copy SAP from ill.
*/
static void
-nce_set_ll(nce_t *nce, uchar_t *ll_addr)
+nce_set_ll(ncec_t *ncec, uchar_t *ll_addr)
{
- ill_t *ill = nce->nce_ill;
- uchar_t *woffset;
+ ill_t *ill = ncec->ncec_ill;
ASSERT(ll_addr != NULL);
- /* Always called before fast_path_probe */
- ASSERT(nce->nce_fp_mp == NULL);
- if (ill->ill_sap_length != 0) {
- /*
- * Copy the SAP type specified in the
- * request into the xmit template.
- */
- NCE_LL_SAP_COPY(ill, nce->nce_res_mp);
- }
if (ill->ill_phys_addr_length > 0) {
/*
* The bcopy() below used to be called for the physical address
* length rather than the link layer address length. For
* ethernet and many other media, the phys_addr and lla are
* identical.
- * However, with xresolv interfaces being introduced, the
- * phys_addr and lla are no longer the same, and the physical
- * address may not have any useful meaning, so we use the lla
- * for IPv6 address resolution and destination addressing.
+ *
+ * The phys_addr and lla may not be the same for devices that
+ * support DL_IPV6_LINK_LAYER_ADDR, though there are currently
+ * no known instances of these.
*
* For PPP or other interfaces with a zero length
* physical address, don't do anything here.
@@ -2854,22 +2631,18 @@ nce_set_ll(nce_t *nce, uchar_t *ll_addr)
* Using the lla for them would change the way they operate.
* Doing nothing in such cases preserves expected behavior.
*/
- woffset = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
- bcopy(ll_addr, woffset, ill->ill_nd_lla_len);
+ bcopy(ll_addr, ncec->ncec_lladdr, ill->ill_nd_lla_len);
}
}
-static boolean_t
-nce_cmp_ll_addr(const nce_t *nce, const uchar_t *ll_addr, uint32_t ll_addr_len)
+boolean_t
+nce_cmp_ll_addr(const ncec_t *ncec, const uchar_t *ll_addr,
+ uint32_t ll_addr_len)
{
- ill_t *ill = nce->nce_ill;
- uchar_t *ll_offset;
-
- ASSERT(nce->nce_res_mp != NULL);
+ ASSERT(ncec->ncec_lladdr != NULL);
if (ll_addr == NULL)
return (B_FALSE);
- ll_offset = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
- if (bcmp(ll_addr, ll_offset, ll_addr_len) != 0)
+ if (bcmp(ll_addr, ncec->ncec_lladdr, ll_addr_len) != 0)
return (B_TRUE);
return (B_FALSE);
}
@@ -2878,15 +2651,16 @@ nce_cmp_ll_addr(const nce_t *nce, const uchar_t *ll_addr, uint32_t ll_addr_len)
* Updates the link layer address or the reachability state of
* a cache entry. Reset probe counter if needed.
*/
-static void
-nce_update(nce_t *nce, uint16_t new_state, uchar_t *new_ll_addr)
+void
+nce_update(ncec_t *ncec, uint16_t new_state, uchar_t *new_ll_addr)
{
- ill_t *ill = nce->nce_ill;
+ ill_t *ill = ncec->ncec_ill;
boolean_t need_stop_timer = B_FALSE;
boolean_t need_fastpath_update = B_FALSE;
+ nce_t *nce = NULL;
+ timeout_id_t tid;
- ASSERT(MUTEX_HELD(&nce->nce_lock));
- ASSERT(nce->nce_ipversion == IPV6_VERSION);
+ ASSERT(MUTEX_HELD(&ncec->ncec_lock));
/*
* If this interface does not do NUD, there is no point
* in allowing an update to the cache entry. Although
@@ -2896,184 +2670,251 @@ nce_update(nce_t *nce, uint16_t new_state, uchar_t *new_ll_addr)
* Non-Resolvers will always be created as REACHABLE.
*/
if (new_state != ND_UNCHANGED) {
- if ((nce->nce_flags & NCE_F_NONUD) &&
- (nce->nce_state != ND_INCOMPLETE))
+ if ((ncec->ncec_flags & NCE_F_NONUD) &&
+ (ncec->ncec_state != ND_INCOMPLETE))
return;
ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN);
ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX);
need_stop_timer = B_TRUE;
if (new_state == ND_REACHABLE)
- nce->nce_last = TICK_TO_MSEC(lbolt64);
+ ncec->ncec_last = TICK_TO_MSEC(lbolt64);
else {
/* We force NUD in this case */
- nce->nce_last = 0;
+ ncec->ncec_last = 0;
}
- nce->nce_state = new_state;
- nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
+ ncec->ncec_state = new_state;
+ ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
+ ASSERT(ncec->ncec_lladdr != NULL || new_state == ND_INITIAL ||
+ new_state == ND_INCOMPLETE);
+ }
+ if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
+ tid = ncec->ncec_timeout_id;
+ ncec->ncec_timeout_id = 0;
}
/*
- * In case of fast path we need to free the the fastpath
- * M_DATA and do another probe. Otherwise we can just
+ * Re-trigger fastpath probe and
* overwrite the DL_UNITDATA_REQ data, noting we'll lose
* whatever packets that happens to be transmitting at the time.
*/
if (new_ll_addr != NULL) {
- ASSERT(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill) +
- ill->ill_nd_lla_len <= nce->nce_res_mp->b_wptr);
- bcopy(new_ll_addr, nce->nce_res_mp->b_rptr +
- NCE_LL_ADDR_OFFSET(ill), ill->ill_nd_lla_len);
- if (nce->nce_fp_mp != NULL) {
- freemsg(nce->nce_fp_mp);
- nce->nce_fp_mp = NULL;
- }
+ bcopy(new_ll_addr, ncec->ncec_lladdr,
+ ill->ill_phys_addr_length);
need_fastpath_update = B_TRUE;
}
- mutex_exit(&nce->nce_lock);
- if (need_stop_timer) {
- (void) untimeout(nce->nce_timeout_id);
- nce->nce_timeout_id = 0;
+ mutex_exit(&ncec->ncec_lock);
+ if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) {
+ if (tid != 0)
+ (void) untimeout(tid);
}
- if (need_fastpath_update)
- nce_fastpath(nce);
- mutex_enter(&nce->nce_lock);
+ if (need_fastpath_update) {
+ /*
+ * Delete any existing existing dlur_mp and fp_mp information.
+ * For IPMP interfaces, all underlying ill's must be checked
+ * and purged.
+ */
+ nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL);
+ /*
+ * add the new dlur_mp and fp_mp
+ */
+ nce = nce_fastpath(ncec, B_TRUE, NULL);
+ if (nce != NULL)
+ nce_refrele(nce);
+ }
+ mutex_enter(&ncec->ncec_lock);
}
-void
-nce_queue_mp_common(nce_t *nce, mblk_t *mp, boolean_t head_insert)
+static void
+nce_queue_mp_common(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
{
uint_t count = 0;
mblk_t **mpp, *tmp;
- ASSERT(MUTEX_HELD(&nce->nce_lock));
+ ASSERT(MUTEX_HELD(&ncec->ncec_lock));
- for (mpp = &nce->nce_qd_mp; *mpp != NULL; mpp = &(*mpp)->b_next) {
- if (++count > nce->nce_ill->ill_max_buf) {
- tmp = nce->nce_qd_mp->b_next;
- nce->nce_qd_mp->b_next = NULL;
- nce->nce_qd_mp->b_prev = NULL;
- freemsg(nce->nce_qd_mp);
- nce->nce_qd_mp = tmp;
+ for (mpp = &ncec->ncec_qd_mp; *mpp != NULL; mpp = &(*mpp)->b_next) {
+ if (++count > ncec->ncec_ill->ill_max_buf) {
+ tmp = ncec->ncec_qd_mp->b_next;
+ ncec->ncec_qd_mp->b_next = NULL;
+ /*
+ * if we never create data addrs on the under_ill
+ * does this matter?
+ */
+ BUMP_MIB(ncec->ncec_ill->ill_ip_mib,
+ ipIfStatsOutDiscards);
+ ip_drop_output("ipIfStatsOutDiscards", ncec->ncec_qd_mp,
+ ncec->ncec_ill);
+ freemsg(ncec->ncec_qd_mp);
+ ncec->ncec_qd_mp = tmp;
}
}
if (head_insert) {
- mp->b_next = nce->nce_qd_mp;
- nce->nce_qd_mp = mp;
+ ncec->ncec_nprobes++;
+ mp->b_next = ncec->ncec_qd_mp;
+ ncec->ncec_qd_mp = mp;
} else {
*mpp = mp;
}
}
-static void
-nce_queue_mp(nce_t *nce, mblk_t *mp)
+/*
+ * nce_queue_mp will queue the packet into the ncec_qd_mp. The packet will be
+ * queued at the head or tail of the queue based on the input argument
+ * 'head_insert'. The caller should specify this argument as B_TRUE if this
+ * packet is an IPMP probe packet, in which case the following happens:
+ *
+ * 1. Insert it at the head of the ncec_qd_mp list. Consider the normal
+ * (non-ipmp_probe) load-speading case where the source address of the ND
+ * packet is not tied to ncec_ill. If the ill bound to the source address
+ * cannot receive, the response to the ND packet will not be received.
+ * However, if ND packets for ncec_ill's probes are queued behind that ND
+ * packet, those probes will also fail to be sent, and thus in.mpathd will
+ * erroneously conclude that ncec_ill has also failed.
+ *
+ * 2. Drop the ipmp_probe packet in ndp_timer() if the ND did not succeed on
+ * the first attempt. This ensures that ND problems do not manifest as
+ * probe RTT spikes.
+ *
+ * We achieve this by inserting ipmp_probe() packets at the head of the
+ * nce_queue.
+ *
+ * The ncec for the probe target is created with ncec_ill set to the ipmp_ill,
+ * but the caller needs to set head_insert to B_TRUE if this is a probe packet.
+ */
+void
+nce_queue_mp(ncec_t *ncec, mblk_t *mp, boolean_t head_insert)
{
- boolean_t head_insert = B_FALSE;
- ip6_t *ip6h;
- ip6i_t *ip6i;
- mblk_t *data_mp;
+ ASSERT(MUTEX_HELD(&ncec->ncec_lock));
+ nce_queue_mp_common(ncec, mp, head_insert);
+}
- ASSERT(MUTEX_HELD(&nce->nce_lock));
+/*
+ * Called when address resolution failed due to a timeout.
+ * Send an ICMP unreachable in response to all queued packets.
+ */
+void
+ndp_resolv_failed(ncec_t *ncec)
+{
+ mblk_t *mp, *nxt_mp;
+ char buf[INET6_ADDRSTRLEN];
+ ill_t *ill = ncec->ncec_ill;
+ ip_recv_attr_t iras;
- if (mp->b_datap->db_type == M_CTL)
- data_mp = mp->b_cont;
- else
- data_mp = mp;
- ip6h = (ip6_t *)data_mp->b_rptr;
- if (ip6h->ip6_nxt == IPPROTO_RAW) {
- /*
- * This message should have been pulled up already in
- * ip_wput_v6. We can't do pullups here because the message
- * could be from the nce_qd_mp which could have b_next/b_prev
- * non-NULL.
- */
- ip6i = (ip6i_t *)ip6h;
- ASSERT(MBLKL(data_mp) >= sizeof (ip6i_t) + IPV6_HDR_LEN);
+ bzero(&iras, sizeof (iras));
+ iras.ira_flags = 0;
+ /*
+ * we are setting the ira_rill to the ipmp_ill (instead of
+ * the actual ill on which the packet was received), but this
+ * is ok because we don't actually need the real ira_rill.
+ * to send the icmp unreachable to the sender.
+ */
+ iras.ira_ill = iras.ira_rill = ill;
+ iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
+ iras.ira_rifindex = iras.ira_ruifindex;
+
+ ip1dbg(("ndp_resolv_failed: dst %s\n",
+ inet_ntop(AF_INET6, (char *)&ncec->ncec_addr, buf, sizeof (buf))));
+ mutex_enter(&ncec->ncec_lock);
+ mp = ncec->ncec_qd_mp;
+ ncec->ncec_qd_mp = NULL;
+ ncec->ncec_nprobes = 0;
+ mutex_exit(&ncec->ncec_lock);
+ while (mp != NULL) {
+ nxt_mp = mp->b_next;
+ mp->b_next = NULL;
- /*
- * If this packet is marked IP6I_IPMP_PROBE, then we need to:
- *
- * 1. Insert it at the head of the nce_qd_mp list. Consider
- * the normal (non-probe) load-speading case where the
- * source address of the ND packet is not tied to nce_ill.
- * If the ill bound to the source address cannot receive,
- * the response to the ND packet will not be received.
- * However, if ND packets for nce_ill's probes are queued
- * behind that ND packet, those probes will also fail to
- * be sent, and thus in.mpathd will erroneously conclude
- * that nce_ill has also failed.
- *
- * 2. Drop the probe packet in ndp_timer() if the ND did
- * not succeed on the first attempt. This ensures that
- * ND problems do not manifest as probe RTT spikes.
- */
- if (ip6i->ip6i_flags & IP6I_IPMP_PROBE)
- head_insert = B_TRUE;
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
+ ip_drop_output("ipIfStatsOutDiscards - address unreachable",
+ mp, ill);
+ icmp_unreachable_v6(mp,
+ ICMP6_DST_UNREACH_ADDR, B_FALSE, &iras);
+ ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
+ mp = nxt_mp;
}
- nce_queue_mp_common(nce, mp, head_insert);
+ ncec_cb_dispatch(ncec); /* finish off waiting callbacks */
}
/*
- * Called when address resolution failed due to a timeout.
- * Send an ICMP unreachable in response to all queued packets.
+ * Handle the completion of NDP and ARP resolution.
*/
void
-nce_resolv_failed(nce_t *nce)
+nce_resolv_ok(ncec_t *ncec)
{
- mblk_t *mp, *nxt_mp, *first_mp;
- char buf[INET6_ADDRSTRLEN];
- ip6_t *ip6h;
- zoneid_t zoneid = GLOBAL_ZONEID;
- ip_stack_t *ipst = nce->nce_ill->ill_ipst;
+ mblk_t *mp;
+ uint_t pkt_len;
+ iaflags_t ixaflags = IXAF_NO_TRACE;
+ nce_t *nce;
+ ill_t *ill = ncec->ncec_ill;
+ boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
+ ip_stack_t *ipst = ill->ill_ipst;
+
+ if (IS_IPMP(ncec->ncec_ill)) {
+ nce_resolv_ipmp_ok(ncec);
+ return;
+ }
+ /* non IPMP case */
+
+ mutex_enter(&ncec->ncec_lock);
+ ASSERT(ncec->ncec_nprobes == 0);
+ mp = ncec->ncec_qd_mp;
+ ncec->ncec_qd_mp = NULL;
+ mutex_exit(&ncec->ncec_lock);
- ip1dbg(("nce_resolv_failed: dst %s\n",
- inet_ntop(AF_INET6, (char *)&nce->nce_addr, buf, sizeof (buf))));
- mutex_enter(&nce->nce_lock);
- mp = nce->nce_qd_mp;
- nce->nce_qd_mp = NULL;
- mutex_exit(&nce->nce_lock);
while (mp != NULL) {
+ mblk_t *nxt_mp;
+
+ if (ill->ill_isv6) {
+ ip6_t *ip6h = (ip6_t *)mp->b_rptr;
+
+ pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
+ } else {
+ ipha_t *ipha = (ipha_t *)mp->b_rptr;
+
+ ixaflags |= IXAF_IS_IPV4;
+ pkt_len = ntohs(ipha->ipha_length);
+ }
nxt_mp = mp->b_next;
mp->b_next = NULL;
- mp->b_prev = NULL;
-
- first_mp = mp;
- if (mp->b_datap->db_type == M_CTL) {
- ipsec_out_t *io = (ipsec_out_t *)mp->b_rptr;
- ASSERT(io->ipsec_out_type == IPSEC_OUT);
- zoneid = io->ipsec_out_zoneid;
- ASSERT(zoneid != ALL_ZONES);
- mp = mp->b_cont;
- mp->b_next = NULL;
- mp->b_prev = NULL;
- }
-
- ip6h = (ip6_t *)mp->b_rptr;
- if (ip6h->ip6_nxt == IPPROTO_RAW) {
- ip6i_t *ip6i;
+ /*
+ * IXAF_NO_DEV_FLOW_CTL information for TCP packets is no
+ * longer available, but it's ok to drop this flag because TCP
+ * has its own flow-control in effect, so TCP packets
+ * are not likely to get here when flow-control is in effect.
+ */
+ mutex_enter(&ill->ill_lock);
+ nce = nce_lookup(ill, &ncec->ncec_addr);
+ mutex_exit(&ill->ill_lock);
+
+ if (nce == NULL) {
+ if (isv6) {
+ BUMP_MIB(&ipst->ips_ip6_mib,
+ ipIfStatsOutDiscards);
+ } else {
+ BUMP_MIB(&ipst->ips_ip_mib,
+ ipIfStatsOutDiscards);
+ }
+ ip_drop_output("ipIfStatsOutDiscards - no nce",
+ mp, NULL);
+ freemsg(mp);
+ } else {
/*
- * This message should have been pulled up already
- * in ip_wput_v6. ip_hdr_complete_v6 assumes that
- * the header is pulled up.
+ * We don't know the zoneid, but
+ * ip_xmit does not care since IXAF_NO_TRACE
+ * is set. (We traced the packet the first
+ * time through ip_xmit.)
*/
- ip6i = (ip6i_t *)ip6h;
- ASSERT((mp->b_wptr - (uchar_t *)ip6i) >=
- sizeof (ip6i_t) + IPV6_HDR_LEN);
- mp->b_rptr += sizeof (ip6i_t);
+ (void) ip_xmit(mp, nce, ixaflags, pkt_len, 0,
+ ALL_ZONES, 0, NULL);
+ nce_refrele(nce);
}
- /*
- * Ignore failure since icmp_unreachable_v6 will silently
- * drop packets with an unspecified source address.
- */
- (void) ip_hdr_complete_v6((ip6_t *)mp->b_rptr, zoneid, ipst);
- icmp_unreachable_v6(nce->nce_ill->ill_wq, first_mp,
- ICMP6_DST_UNREACH_ADDR, B_FALSE, B_FALSE, zoneid, ipst);
mp = nxt_mp;
}
- nce_cb_dispatch(nce);
+
+ ncec_cb_dispatch(ncec); /* complete callbacks */
}
/*
- * Called by SIOCSNDP* ioctl to add/change an nce entry
+ * Called by SIOCSNDP* ioctl to add/change an ncec entry
* and the corresponding attributes.
* Disallow states other than ND_REACHABLE or ND_STALE.
*/
@@ -3082,31 +2923,28 @@ ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
{
sin6_t *sin6;
in6_addr_t *addr;
+ ncec_t *ncec;
nce_t *nce;
- int err;
+ int err = 0;
uint16_t new_flags = 0;
uint16_t old_flags = 0;
int inflags = lnr->lnr_flags;
ip_stack_t *ipst = ill->ill_ipst;
+ boolean_t do_postprocess = B_FALSE;
ASSERT(ill->ill_isv6);
if ((lnr->lnr_state_create != ND_REACHABLE) &&
(lnr->lnr_state_create != ND_STALE))
return (EINVAL);
- if (lnr->lnr_hdw_len > ND_MAX_HDW_LEN)
- return (EINVAL);
-
sin6 = (sin6_t *)&lnr->lnr_addr;
addr = &sin6->sin6_addr;
mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
- /* We know it can not be mapping so just look in the hash table */
- nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr));
- /* See comment in ndp_query() regarding IS_IPMP(ill) usage */
- nce = nce_lookup_addr(ill, IS_IPMP(ill), addr, nce);
+ ASSERT(!IS_UNDER_IPMP(ill));
+ nce = nce_lookup_addr(ill, addr);
if (nce != NULL)
- new_flags = nce->nce_flags;
+ new_flags = nce->nce_common->ncec_flags;
switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) {
case NDF_ISROUTER_ON:
@@ -3118,7 +2956,7 @@ ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON):
mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
if (nce != NULL)
- NCE_REFRELE(nce);
+ nce_refrele(nce);
return (EINVAL);
}
@@ -3132,17 +2970,15 @@ ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON):
mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
if (nce != NULL)
- NCE_REFRELE(nce);
+ nce_refrele(nce);
return (EINVAL);
}
if (nce == NULL) {
- err = ndp_add_v6(ill,
+ err = nce_add_v6(ill,
(uchar_t *)lnr->lnr_hdw_addr,
+ ill->ill_phys_addr_length,
addr,
- &ipv6_all_ones,
- &ipv6_all_zeros,
- 0,
new_flags,
lnr->lnr_state_create,
&nce);
@@ -3150,269 +2986,354 @@ ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr)
mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err));
return (err);
+ } else {
+ do_postprocess = B_TRUE;
}
}
- old_flags = nce->nce_flags;
+ ncec = nce->nce_common;
+ old_flags = ncec->ncec_flags;
if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) {
- /*
- * Router turned to host, delete all ires.
- * XXX Just delete the entry, but we need to add too.
- */
- nce->nce_flags &= ~NCE_F_ISROUTER;
+ ncec_router_to_host(ncec);
mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
- ndp_delete(nce);
- NCE_REFRELE(nce);
+ if (do_postprocess)
+ err = nce_add_v6_postprocess(nce);
+ nce_refrele(nce);
return (0);
}
mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
- mutex_enter(&nce->nce_lock);
- nce->nce_flags = new_flags;
- mutex_exit(&nce->nce_lock);
+ if (do_postprocess)
+ err = nce_add_v6_postprocess(nce);
+ /*
+ * err cannot be anything other than 0 because we don't support
+ * proxy arp of static addresses.
+ */
+ ASSERT(err == 0);
+
+ mutex_enter(&ncec->ncec_lock);
+ ncec->ncec_flags = new_flags;
+ mutex_exit(&ncec->ncec_lock);
/*
* Note that we ignore the state at this point, which
* should be either STALE or REACHABLE. Instead we let
* the link layer address passed in to determine the state
* much like incoming packets.
*/
- nce_process(nce, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE);
- NCE_REFRELE(nce);
+ nce_process(ncec, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE);
+ nce_refrele(nce);
return (0);
}
/*
- * If the device driver supports it, we make nce_fp_mp to have
- * an M_DATA prepend. Otherwise nce_fp_mp will be null.
- * The caller ensures there is hold on nce for this function.
- * Note that since ill_fastpath_probe() copies the mblk there is
- * no need for the hold beyond this function.
+ * Create an nce_t structure for ill using the ncec->ncec_lladdr to set up
+ * the nce_dlur_mp. If ill != ncec->ncec_ill, then the ips_ill_g_lock must
+ * be held to ensure that they are in the same group.
*/
-void
-nce_fastpath(nce_t *nce)
+static nce_t *
+nce_fastpath_create(ill_t *ill, ncec_t *ncec)
{
- ill_t *ill = nce->nce_ill;
- int res;
- ASSERT(ill != NULL);
- ASSERT(nce->nce_state != ND_INITIAL && nce->nce_state != ND_INCOMPLETE);
+ nce_t *nce;
- if (nce->nce_fp_mp != NULL) {
- /* Already contains fastpath info */
- return;
- }
- if (nce->nce_res_mp != NULL) {
- nce_fastpath_list_add(nce);
- res = ill_fastpath_probe(ill, nce->nce_res_mp);
- /*
- * EAGAIN is an indication of a transient error
- * i.e. allocation failure etc. leave the nce in the list it
- * will be updated when another probe happens for another ire
- * if not it will be taken out of the list when the ire is
- * deleted.
- */
+ nce = nce_ill_lookup_then_add(ill, ncec);
+
+ if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
+ return (nce);
- if (res != 0 && res != EAGAIN)
- nce_fastpath_list_delete(nce);
+ /*
+ * hold the ncec_lock to synchronize with nce_update() so that,
+ * at the end of this function, the contents of nce_dlur_mp are
+ * consistent with ncec->ncec_lladdr, even though some intermediate
+ * packet may have been sent out with a mangled address, which would
+ * only be a transient condition.
+ */
+ mutex_enter(&ncec->ncec_lock);
+ if (ncec->ncec_lladdr != NULL) {
+ bcopy(ncec->ncec_lladdr, nce->nce_dlur_mp->b_rptr +
+ NCE_LL_ADDR_OFFSET(ill), ill->ill_phys_addr_length);
+ } else {
+ nce->nce_dlur_mp = ill_dlur_gen(NULL, 0, ill->ill_sap,
+ ill->ill_sap_length);
}
+ mutex_exit(&ncec->ncec_lock);
+ return (nce);
}
/*
- * Drain the list of nce's waiting for fastpath response.
+ * we make nce_fp_mp to have an M_DATA prepend.
+ * The caller ensures there is hold on ncec for this function.
+ * Note that since ill_fastpath_probe() copies the mblk there is
+ * no need to hold the nce or ncec beyond this function.
+ *
+ * If the caller has passed in a non-null ncec_nce to nce_fastpath() that
+ * ncec_nce must correspond to the nce for ncec with nce_ill == ncec->ncec_ill
+ * and will be returned back by this function, so that no extra nce_refrele
+ * is required for the caller. The calls from nce_add_common() use this
+ * method. All other callers (that pass in NULL ncec_nce) will have to do a
+ * nce_refrele of the returned nce (when it is non-null).
*/
-void
-nce_fastpath_list_dispatch(ill_t *ill, boolean_t (*func)(nce_t *, void *),
- void *arg)
+nce_t *
+nce_fastpath(ncec_t *ncec, boolean_t trigger_fp_req, nce_t *ncec_nce)
{
+ nce_t *nce;
+ ill_t *ill = ncec->ncec_ill;
- nce_t *next_nce;
- nce_t *current_nce;
- nce_t *first_nce;
- nce_t *prev_nce = NULL;
+ ASSERT(ill != NULL);
+
+ if (IS_IPMP(ill) && trigger_fp_req) {
+ trigger_fp_req = B_FALSE;
+ ipmp_ncec_fastpath(ncec, ill);
- mutex_enter(&ill->ill_lock);
- first_nce = current_nce = (nce_t *)ill->ill_fastpath_list;
- while (current_nce != (nce_t *)&ill->ill_fastpath_list) {
- next_nce = current_nce->nce_fastpath;
- /*
- * Take it off the list if we're flushing, or if the callback
- * routine tells us to do so. Otherwise, leave the nce in the
- * fastpath list to handle any pending response from the lower
- * layer. We can't drain the list when the callback routine
- * comparison failed, because the response is asynchronous in
- * nature, and may not arrive in the same order as the list
- * insertion.
- */
- if (func == NULL || func(current_nce, arg)) {
- current_nce->nce_fastpath = NULL;
- if (current_nce == first_nce)
- ill->ill_fastpath_list = first_nce = next_nce;
- else
- prev_nce->nce_fastpath = next_nce;
- } else {
- /* previous element that is still in the list */
- prev_nce = current_nce;
- }
- current_nce = next_nce;
}
- mutex_exit(&ill->ill_lock);
+ /*
+ * If the caller already has the nce corresponding to the ill, use
+ * that one. Otherwise we have to lookup/add the nce. Calls from
+ * nce_add_common() fall in the former category, and have just done
+ * the nce lookup/add that can be reused.
+ */
+ if (ncec_nce == NULL)
+ nce = nce_fastpath_create(ill, ncec);
+ else
+ nce = ncec_nce;
+
+ if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill))
+ return (nce);
+
+ if (trigger_fp_req)
+ nce_fastpath_trigger(nce);
+ return (nce);
}
/*
- * Add nce to the nce fastpath list.
+ * Trigger fastpath on nce. No locks may be held.
*/
-void
-nce_fastpath_list_add(nce_t *nce)
+static void
+nce_fastpath_trigger(nce_t *nce)
{
- ill_t *ill;
+ int res;
+ ill_t *ill = nce->nce_ill;
+ ncec_t *ncec = nce->nce_common;
- ill = nce->nce_ill;
+ res = ill_fastpath_probe(ill, nce->nce_dlur_mp);
+ /*
+	 * EAGAIN is an indication of a transient error, e.g. an
+	 * allocation failure.  In that case, leave the ncec on the
+	 * fastpath list; it will be updated when another probe happens
+	 * for another ire.  Otherwise it is removed from the list when
+	 * the ire is deleted.
+ */
+ if (res != 0 && res != EAGAIN && res != ENOTSUP)
+ nce_fastpath_list_delete(ill, ncec, NULL);
+}
- mutex_enter(&ill->ill_lock);
- mutex_enter(&nce->nce_lock);
+/*
+ * Add ncec to the nce fastpath list on ill.
+ */
+static nce_t *
+nce_ill_lookup_then_add_locked(ill_t *ill, ncec_t *ncec)
+{
+ nce_t *nce = NULL;
+ ASSERT(MUTEX_HELD(&ill->ill_lock));
/*
- * if nce has not been deleted and
+ * Atomically ensure that the ill is not CONDEMNED and is not going
+ * down, before adding the NCE.
+ */
+ if (ill->ill_state_flags & ILL_CONDEMNED)
+ return (NULL);
+ mutex_enter(&ncec->ncec_lock);
+ /*
+ * if ncec has not been deleted and
* is not already in the list add it.
*/
- if (!(nce->nce_flags & NCE_F_CONDEMNED) &&
- (nce->nce_fastpath == NULL)) {
- nce->nce_fastpath = (nce_t *)ill->ill_fastpath_list;
- ill->ill_fastpath_list = nce;
+ if (!NCE_ISCONDEMNED(ncec)) {
+ nce = nce_lookup(ill, &ncec->ncec_addr);
+ if (nce != NULL)
+ goto done;
+ nce = nce_add(ill, ncec);
}
+done:
+ mutex_exit(&ncec->ncec_lock);
+ return (nce);
+}
- mutex_exit(&nce->nce_lock);
+nce_t *
+nce_ill_lookup_then_add(ill_t *ill, ncec_t *ncec)
+{
+ nce_t *nce;
+
+ mutex_enter(&ill->ill_lock);
+ nce = nce_ill_lookup_then_add_locked(ill, ncec);
mutex_exit(&ill->ill_lock);
+ return (nce);
}
+
/*
- * remove nce from the nce fastpath list.
+ * remove ncec from the ill_nce list. If 'dead' is non-null, the deleted
+ * nce is added to the 'dead' list, and the caller must nce_refrele() the
+ * entry after all locks have been dropped.
*/
void
-nce_fastpath_list_delete(nce_t *nce)
+nce_fastpath_list_delete(ill_t *ill, ncec_t *ncec, list_t *dead)
{
- nce_t *nce_ptr;
-
- ill_t *ill;
+ nce_t *nce;
- ill = nce->nce_ill;
ASSERT(ill != NULL);
- mutex_enter(&ill->ill_lock);
- if (nce->nce_fastpath == NULL)
- goto done;
-
- ASSERT(ill->ill_fastpath_list != &ill->ill_fastpath_list);
+ /* first clean out any nce pointers in the under_ills */
+ if (IS_IPMP(ill))
+ ipmp_ncec_flush_nce(ncec);
- if (ill->ill_fastpath_list == nce) {
- ill->ill_fastpath_list = nce->nce_fastpath;
- } else {
- nce_ptr = ill->ill_fastpath_list;
- while (nce_ptr != (nce_t *)&ill->ill_fastpath_list) {
- if (nce_ptr->nce_fastpath == nce) {
- nce_ptr->nce_fastpath = nce->nce_fastpath;
- break;
- }
- nce_ptr = nce_ptr->nce_fastpath;
+ /* now the ill itself */
+ mutex_enter(&ill->ill_lock);
+ for (nce = list_head(&ill->ill_nce); nce != NULL;
+ nce = list_next(&ill->ill_nce, nce)) {
+ if (nce->nce_common == ncec) {
+ nce_refhold(nce);
+ nce_delete(nce);
+ break;
}
}
-
- nce->nce_fastpath = NULL;
-done:
mutex_exit(&ill->ill_lock);
+ if (nce != NULL) {
+ if (dead == NULL)
+ nce_refrele(nce);
+ else
+ list_insert_tail(dead, nce);
+ }
}
/*
- * Update all NCE's that are not in fastpath mode and
- * have an nce_fp_mp that matches mp. mp->b_cont contains
- * the fastpath header.
- *
- * Returns TRUE if entry should be dequeued, or FALSE otherwise.
+ * when the fastpath response does not fit in the datab
+ * associated with the existing nce_fp_mp, we delete and
+ * add the nce to retrigger fastpath based on the information
+ * in the ncec_t.
*/
-boolean_t
-ndp_fastpath_update(nce_t *nce, void *arg)
+static nce_t *
+nce_delete_then_add(nce_t *nce)
+{
+ ill_t *ill = nce->nce_ill;
+ nce_t *newnce = NULL;
+
+ ip0dbg(("nce_delete_then_add nce %p ill %s\n",
+ (void *)nce, ill->ill_name));
+ mutex_enter(&ill->ill_lock);
+ mutex_enter(&nce->nce_common->ncec_lock);
+ nce_delete(nce);
+ /*
+ * Make sure that ncec is not condemned before adding. We hold the
+ * ill_lock and ncec_lock to synchronize with ncec_delete() and
+ * ipmp_ncec_flush_nce()
+ */
+ if (!NCE_ISCONDEMNED(nce->nce_common))
+ newnce = nce_add(ill, nce->nce_common);
+ mutex_exit(&nce->nce_common->ncec_lock);
+ mutex_exit(&ill->ill_lock);
+ nce_refrele(nce);
+ return (newnce); /* could be null if nomem */
+}
+
+typedef struct nce_fp_match_s {
+ nce_t *nce_fp_match_res;
+ mblk_t *nce_fp_match_ack_mp;
+} nce_fp_match_t;
+
+/* ARGSUSED */
+static int
+nce_fastpath_match_dlur(ill_t *ill, nce_t *nce, void *arg)
{
- mblk_t *mp, *fp_mp;
+ nce_fp_match_t *nce_fp_marg = arg;
+ ncec_t *ncec = nce->nce_common;
+ mblk_t *mp = nce_fp_marg->nce_fp_match_ack_mp;
uchar_t *mp_rptr, *ud_mp_rptr;
- mblk_t *ud_mp = nce->nce_res_mp;
+ mblk_t *ud_mp = nce->nce_dlur_mp;
ptrdiff_t cmplen;
- if (nce->nce_flags & NCE_F_MAPPING)
- return (B_TRUE);
- if ((nce->nce_fp_mp != NULL) || (ud_mp == NULL))
- return (B_TRUE);
-
- ip2dbg(("ndp_fastpath_update: trying\n"));
- mp = (mblk_t *)arg;
+ /*
+ * mp is the mp associated with the fastpath ack.
+ * ud_mp is the outstanding DL_UNITDATA_REQ on the nce_t
+ * under consideration. If the contents match, then the
+ * fastpath ack is used to update the nce.
+ */
+ if (ud_mp == NULL)
+ return (0); /* MH_WALK_CONTINUE */
mp_rptr = mp->b_rptr;
cmplen = mp->b_wptr - mp_rptr;
ASSERT(cmplen >= 0);
+
ud_mp_rptr = ud_mp->b_rptr;
/*
- * The nce is locked here to prevent any other threads
- * from accessing and changing nce_res_mp when the IPv6 address
- * becomes resolved to an lla while we're in the middle
- * of looking at and comparing the hardware address (lla).
- * It is also locked to prevent multiple threads in nce_fastpath_update
- * from examining nce_res_mp atthe same time.
+ * The ncec is locked here to prevent any other threads from accessing
+ * and changing nce_dlur_mp when the address becomes resolved to an
+ * lla while we're in the middle of looking at and comparing the
+ * hardware address (lla). It is also locked to prevent multiple
+ * threads in nce_fastpath() from examining nce_dlur_mp at the same
+ * time.
*/
- mutex_enter(&nce->nce_lock);
+ mutex_enter(&ncec->ncec_lock);
if (ud_mp->b_wptr - ud_mp_rptr != cmplen ||
- bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) != 0) {
- mutex_exit(&nce->nce_lock);
- /*
- * Don't take the ire off the fastpath list yet,
- * since the response may come later.
- */
- return (B_FALSE);
- }
- /* Matched - install mp as the fastpath mp */
- ip1dbg(("ndp_fastpath_update: match\n"));
- fp_mp = dupb(mp->b_cont);
- if (fp_mp != NULL) {
- nce->nce_fp_mp = fp_mp;
+ bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) == 0) {
+ nce_fp_marg->nce_fp_match_res = nce;
+ mutex_exit(&ncec->ncec_lock);
+ nce_refhold(nce);
+ return (1); /* MH_WALK_TERMINATE */
}
- mutex_exit(&nce->nce_lock);
- return (B_TRUE);
+ mutex_exit(&ncec->ncec_lock);
+ return (0); /* MH_WALK_CONTINUE */
}
/*
- * This function handles the DL_NOTE_FASTPATH_FLUSH notification from
- * driver. Note that it assumes IP is exclusive...
+ * Update all NCE's that are not in fastpath mode and
+ * have an nce_fp_mp that matches mp. mp->b_cont contains
+ * the fastpath header.
+ *
+ * The matching nce has the fastpath header from mp->b_cont installed
+ * in its nce_fp_mp (re-adding the nce first if the header no longer fits).
*/
-/* ARGSUSED */
void
-ndp_fastpath_flush(nce_t *nce, char *arg)
+nce_fastpath_update(ill_t *ill, mblk_t *mp)
{
- if (nce->nce_flags & NCE_F_MAPPING)
- return;
- /* No fastpath info? */
- if (nce->nce_fp_mp == NULL || nce->nce_res_mp == NULL)
+ nce_fp_match_t nce_fp_marg;
+ nce_t *nce;
+ mblk_t *nce_fp_mp, *fp_mp;
+
+ nce_fp_marg.nce_fp_match_res = NULL;
+ nce_fp_marg.nce_fp_match_ack_mp = mp;
+
+ nce_walk(ill, nce_fastpath_match_dlur, &nce_fp_marg);
+
+ if ((nce = nce_fp_marg.nce_fp_match_res) == NULL)
return;
- if (nce->nce_ipversion == IPV4_VERSION &&
- nce->nce_flags & NCE_F_BCAST) {
- /*
- * IPv4 BROADCAST entries:
- * We can't delete the nce since it is difficult to
- * recreate these without going through the
- * ipif down/up dance.
- *
- * All access to nce->nce_fp_mp in the case of these
- * is protected by nce_lock.
- */
- mutex_enter(&nce->nce_lock);
- if (nce->nce_fp_mp != NULL) {
- freeb(nce->nce_fp_mp);
- nce->nce_fp_mp = NULL;
- mutex_exit(&nce->nce_lock);
- nce_fastpath(nce);
- } else {
+ mutex_enter(&nce->nce_lock);
+ nce_fp_mp = nce->nce_fp_mp;
+
+ if (nce_fp_mp != NULL) {
+ fp_mp = mp->b_cont;
+ if (nce_fp_mp->b_rptr + MBLKL(fp_mp) >
+ nce_fp_mp->b_datap->db_lim) {
mutex_exit(&nce->nce_lock);
+ nce = nce_delete_then_add(nce);
+ if (nce == NULL) {
+ return;
+ }
+ mutex_enter(&nce->nce_lock);
+ nce_fp_mp = nce->nce_fp_mp;
}
+ }
+
+ /* Matched - install mp as the fastpath mp */
+ if (nce_fp_mp == NULL) {
+ fp_mp = dupb(mp->b_cont);
+ nce->nce_fp_mp = fp_mp;
} else {
- /* Just delete the NCE... */
- ndp_delete(nce);
+ fp_mp = mp->b_cont;
+ bcopy(fp_mp->b_rptr, nce_fp_mp->b_rptr, MBLKL(fp_mp));
+ nce->nce_fp_mp->b_wptr = nce->nce_fp_mp->b_rptr
+ + MBLKL(fp_mp);
}
+ mutex_exit(&nce->nce_lock);
+ nce_refrele(nce);
}
/*
@@ -3451,74 +3372,103 @@ ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen)
}
/*
- * ndp_walk function.
+ * ncec_walk function.
* Free a fraction of the NCE cache entries.
- * A fraction of zero means to not free any in that category.
+ *
+ * A possible optimization here would be to use ncec_last where possible, and
+ * delete the least-frequently used entry, which would require more complex
+ * computation as we walk through the ncec's (e.g., track ncec entries by
+ * order of ncec_last and/or maintain state)
*/
-void
-ndp_cache_reclaim(nce_t *nce, char *arg)
+static void
+ncec_cache_reclaim(ncec_t *ncec, char *arg)
{
- nce_cache_reclaim_t *ncr = (nce_cache_reclaim_t *)arg;
- uint_t rand;
+ ip_stack_t *ipst = ncec->ncec_ipst;
+ uint_t fraction = *(uint_t *)arg;
+ uint_t rand;
- if (nce->nce_flags & NCE_F_PERMANENT)
+ if ((ncec->ncec_flags &
+ (NCE_F_MYADDR | NCE_F_STATIC | NCE_F_BCAST)) != 0) {
return;
+ }
rand = (uint_t)lbolt +
- NCE_ADDR_HASH_V6(nce->nce_addr, NCE_TABLE_SIZE);
- if (ncr->ncr_host != 0 &&
- (rand/ncr->ncr_host)*ncr->ncr_host == rand) {
- ndp_delete(nce);
- return;
+ NCE_ADDR_HASH_V6(ncec->ncec_addr, NCE_TABLE_SIZE);
+ if ((rand/fraction)*fraction == rand) {
+ IP_STAT(ipst, ip_nce_reclaim_deleted);
+ ncec_delete(ncec);
}
}
/*
- * ndp_walk function.
- * Count the number of NCEs that can be deleted.
- * These would be hosts but not routers.
+ * kmem_cache callback to free up memory.
+ *
+ * For now we just delete a fixed fraction.
*/
-void
-ndp_cache_count(nce_t *nce, char *arg)
+static void
+ip_nce_reclaim_stack(ip_stack_t *ipst)
{
- ncc_cache_count_t *ncc = (ncc_cache_count_t *)arg;
+ uint_t fraction = ipst->ips_ip_nce_reclaim_fraction;
- if (nce->nce_flags & NCE_F_PERMANENT)
- return;
+ IP_STAT(ipst, ip_nce_reclaim_calls);
- ncc->ncc_total++;
- if (!(nce->nce_flags & NCE_F_ISROUTER))
- ncc->ncc_host++;
+ ncec_walk(NULL, (pfi_t)ncec_cache_reclaim, (uchar_t *)&fraction, ipst);
+
+ /*
+ * Walk all CONNs that can have a reference on an ire, ncec or dce.
+ * Get them to update any stale references to drop any refholds they
+ * have.
+ */
+ ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
+}
+
+/*
+ * Called by the memory allocator subsystem directly, when the system
+ * is running low on memory.
+ */
+/* ARGSUSED */
+void
+ip_nce_reclaim(void *args)
+{
+ netstack_handle_t nh;
+ netstack_t *ns;
+
+ netstack_next_init(&nh);
+ while ((ns = netstack_next(&nh)) != NULL) {
+ ip_nce_reclaim_stack(ns->netstack_ip);
+ netstack_rele(ns);
+ }
+ netstack_next_fini(&nh);
}
#ifdef DEBUG
void
-nce_trace_ref(nce_t *nce)
+ncec_trace_ref(ncec_t *ncec)
{
- ASSERT(MUTEX_HELD(&nce->nce_lock));
+ ASSERT(MUTEX_HELD(&ncec->ncec_lock));
- if (nce->nce_trace_disable)
+ if (ncec->ncec_trace_disable)
return;
- if (!th_trace_ref(nce, nce->nce_ill->ill_ipst)) {
- nce->nce_trace_disable = B_TRUE;
- nce_trace_cleanup(nce);
+ if (!th_trace_ref(ncec, ncec->ncec_ipst)) {
+ ncec->ncec_trace_disable = B_TRUE;
+ ncec_trace_cleanup(ncec);
}
}
void
-nce_untrace_ref(nce_t *nce)
+ncec_untrace_ref(ncec_t *ncec)
{
- ASSERT(MUTEX_HELD(&nce->nce_lock));
+ ASSERT(MUTEX_HELD(&ncec->ncec_lock));
- if (!nce->nce_trace_disable)
- th_trace_unref(nce);
+ if (!ncec->ncec_trace_disable)
+ th_trace_unref(ncec);
}
static void
-nce_trace_cleanup(const nce_t *nce)
+ncec_trace_cleanup(const ncec_t *ncec)
{
- th_trace_cleanup(nce, nce->nce_trace_disable);
+ th_trace_cleanup(ncec, ncec->ncec_trace_disable);
}
#endif
@@ -3527,64 +3477,159 @@ nce_trace_cleanup(const nce_t *nce)
* Send an ICMP unreachable in response to all queued packets.
*/
void
-arp_resolv_failed(nce_t *nce)
+arp_resolv_failed(ncec_t *ncec)
{
- mblk_t *mp, *nxt_mp, *first_mp;
+ mblk_t *mp, *nxt_mp;
char buf[INET6_ADDRSTRLEN];
- zoneid_t zoneid = GLOBAL_ZONEID;
struct in_addr ipv4addr;
- ip_stack_t *ipst = nce->nce_ill->ill_ipst;
+ ill_t *ill = ncec->ncec_ill;
+ ip_stack_t *ipst = ncec->ncec_ipst;
+ ip_recv_attr_t iras;
- IN6_V4MAPPED_TO_INADDR(&nce->nce_addr, &ipv4addr);
+ bzero(&iras, sizeof (iras));
+ iras.ira_flags = IRAF_IS_IPV4;
+ /*
+ * we are setting the ira_rill to the ipmp_ill (instead of
+ * the actual ill on which the packet was received), but this
+	 * is ok because we don't actually need the real ira_rill
+	 * to send the icmp unreachable to the sender.
+ */
+ iras.ira_ill = iras.ira_rill = ill;
+ iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
+ iras.ira_rifindex = iras.ira_ruifindex;
+
+ IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &ipv4addr);
ip3dbg(("arp_resolv_failed: dst %s\n",
inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf))));
- mutex_enter(&nce->nce_lock);
- mp = nce->nce_qd_mp;
- nce->nce_qd_mp = NULL;
- mutex_exit(&nce->nce_lock);
-
+ mutex_enter(&ncec->ncec_lock);
+ mp = ncec->ncec_qd_mp;
+ ncec->ncec_qd_mp = NULL;
+ ncec->ncec_nprobes = 0;
+ mutex_exit(&ncec->ncec_lock);
while (mp != NULL) {
nxt_mp = mp->b_next;
mp->b_next = NULL;
- mp->b_prev = NULL;
- first_mp = mp;
- /*
- * Send icmp unreachable messages
- * to the hosts.
- */
- (void) ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid, ipst);
- ip3dbg(("arp_resolv_failed: Calling icmp_unreachable\n"));
- icmp_unreachable(nce->nce_ill->ill_wq, first_mp,
- ICMP_HOST_UNREACHABLE, zoneid, ipst);
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
+ ip_drop_output("ipIfStatsOutDiscards - address unreachable",
+ mp, ill);
+ if (ipst->ips_ip_arp_icmp_error) {
+ ip3dbg(("arp_resolv_failed: "
+ "Calling icmp_unreachable\n"));
+ icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras);
+ } else {
+ freemsg(mp);
+ }
+ ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
mp = nxt_mp;
}
+ ncec_cb_dispatch(ncec); /* finish off waiting callbacks */
}
+/*
+ * if ill is an under_ill, translate it to the ipmp_ill and add the
+ * nce on the ipmp_ill. Two nce_t entries (one on the ipmp_ill, and
+ * one on the underlying in_ill) will be created for the
+ * ncec_t in this case. The ncec_t itself will be created on the ipmp_ill.
+ */
int
-ndp_lookup_then_add_v4(ill_t *ill, const in_addr_t *addr, uint16_t flags,
- nce_t **newnce, nce_t *src_nce)
+nce_lookup_then_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
+ const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
{
int err;
- nce_t *nce;
in6_addr_t addr6;
ip_stack_t *ipst = ill->ill_ipst;
+ nce_t *nce, *upper_nce = NULL;
+ ill_t *in_ill = ill, *under = NULL;
+ boolean_t need_ill_refrele = B_FALSE;
+
+ if (flags & NCE_F_MCAST) {
+ /*
+ * hw_addr will be figured out in nce_set_multicast_v4;
+ * caller needs to pass in the cast_ill for ipmp
+ */
+ ASSERT(hw_addr == NULL);
+ ASSERT(!IS_IPMP(ill));
+ err = nce_set_multicast_v4(ill, addr, flags, newnce);
+ return (err);
+ }
+
+ if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) {
+ ill = ipmp_ill_hold_ipmp_ill(ill);
+ if (ill == NULL)
+ return (ENXIO);
+ need_ill_refrele = B_TRUE;
+ }
+ if ((flags & NCE_F_BCAST) != 0) {
+ /*
+ * IPv4 broadcast ncec: compute the hwaddr.
+ */
+ if (IS_IPMP(ill)) {
+ under = ipmp_ill_get_xmit_ill(ill, B_FALSE);
+ if (under == NULL) {
+ if (need_ill_refrele)
+ ill_refrele(ill);
+ return (ENETDOWN);
+ }
+ hw_addr = under->ill_bcast_mp->b_rptr +
+ NCE_LL_ADDR_OFFSET(under);
+ hw_addr_len = under->ill_phys_addr_length;
+ } else {
+ hw_addr = ill->ill_bcast_mp->b_rptr +
+ NCE_LL_ADDR_OFFSET(ill),
+ hw_addr_len = ill->ill_phys_addr_length;
+ }
+ }
mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
- nce = *((nce_t **)NCE_HASH_PTR_V4(ipst, *addr));
IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
- /*
- * NOTE: IPv4 never matches across the illgrp since the NCE's we're
- * looking up have fastpath headers that are inherently per-ill.
- */
- nce = nce_lookup_addr(ill, B_FALSE, &addr6, nce);
+ nce = nce_lookup_addr(ill, &addr6);
if (nce == NULL) {
- err = ndp_add_v4(ill, addr, flags, newnce, src_nce);
+ err = nce_add_v4(ill, hw_addr, hw_addr_len, addr, flags,
+ state, &nce);
} else {
- *newnce = nce;
err = EEXIST;
}
mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
+ if (err == 0)
+ err = nce_add_v4_postprocess(nce);
+
+ if (in_ill != ill && nce != NULL) {
+ nce_t *under_nce;
+
+ /*
+ * in_ill was the under_ill. Try to create the under_nce.
+ * Hold the ill_g_lock to prevent changes to group membership
+ * until we are done.
+ */
+ rw_enter(&ipst->ips_ill_g_lock, RW_READER);
+ if (IS_IN_SAME_ILLGRP(in_ill, ill)) {
+ under_nce = nce_fastpath_create(in_ill,
+ nce->nce_common);
+ upper_nce = nce;
+ if ((nce = under_nce) == NULL)
+ err = EINVAL;
+ }
+ rw_exit(&ipst->ips_ill_g_lock);
+ if (under_nce != NULL && NCE_ISREACHABLE(nce->nce_common))
+ nce_fastpath_trigger(under_nce);
+ }
+ if (nce != NULL) {
+ if (newnce != NULL)
+ *newnce = nce;
+ else
+ nce_refrele(nce);
+ }
+
+ if (under != NULL)
+ ill_refrele(under);
+
+ if (upper_nce != NULL)
+ nce_refrele(upper_nce);
+
+ if (need_ill_refrele)
+ ill_refrele(ill);
+
return (err);
}
@@ -3592,102 +3637,860 @@ ndp_lookup_then_add_v4(ill_t *ill, const in_addr_t *addr, uint16_t flags,
* NDP Cache Entry creation routine for IPv4.
* Mapped entries are handled in arp.
* This routine must always be called with ndp4->ndp_g_lock held.
- * Prior to return, nce_refcnt is incremented.
+ * Prior to return, ncec_refcnt is incremented.
+ *
+ * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses
+ * are always added pointing at the ipmp_ill. Thus, when the ill passed
+ * to nce_add_v4 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t
+ * entries will be created, both pointing at the same ncec_t. The nce_t
+ * entries will have their nce_ill set to the ipmp_ill and the under_ill
+ * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill.
+ * Local addresses are always created on the ill passed to nce_add_v4.
*/
-static int
-ndp_add_v4(ill_t *ill, const in_addr_t *addr, uint16_t flags,
- nce_t **newnce, nce_t *src_nce)
+int
+nce_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
+ const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce)
{
- static nce_t nce_nil;
- nce_t *nce;
- mblk_t *mp;
- mblk_t *template = NULL;
- nce_t **ncep;
- ip_stack_t *ipst = ill->ill_ipst;
- uint16_t state = ND_INITIAL;
int err;
+ boolean_t is_multicast = (flags & NCE_F_MCAST);
+ struct in6_addr addr6;
+ nce_t *nce;
- ASSERT(MUTEX_HELD(&ipst->ips_ndp4->ndp_g_lock));
+ ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock));
ASSERT(!ill->ill_isv6);
- ASSERT((flags & NCE_F_MAPPING) == 0);
+ ASSERT(!IN_MULTICAST(htonl(*addr)) || is_multicast);
+
+ IN6_IPADDR_TO_V4MAPPED(*addr, &addr6);
+ err = nce_add_common(ill, hw_addr, hw_addr_len, &addr6, flags, state,
+ &nce);
+ ASSERT(newnce != NULL);
+ *newnce = nce;
+ return (err);
+}
+
+/*
+ * Post-processing routine to be executed after nce_add_v4(). This function
+ * triggers fastpath (if appropriate) and DAD on the newly added nce entry
+ * and must be called without any locks held.
+ *
+ * Always returns 0, but we return an int to keep this symmetric with the
+ * IPv6 counter-part.
+ */
+int
+nce_add_v4_postprocess(nce_t *nce)
+{
+ ncec_t *ncec = nce->nce_common;
+ uint16_t flags = ncec->ncec_flags;
+ boolean_t ndp_need_dad = B_FALSE;
+ boolean_t dropped;
+ clock_t delay;
+ ip_stack_t *ipst = ncec->ncec_ill->ill_ipst;
+ uchar_t *hw_addr = ncec->ncec_lladdr;
+ boolean_t trigger_fastpath = B_TRUE;
- if (ill->ill_resolver_mp == NULL)
- return (EINVAL);
/*
- * Allocate the mblk to hold the nce.
+ * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
+ * we call nce_fastpath as soon as the ncec is resolved in nce_process.
+	 * We call nce_fastpath from nce_update if the link layer address of
+	 * the peer changes.
*/
- mp = allocb(sizeof (nce_t), BPRI_MED);
- if (mp == NULL)
- return (ENOMEM);
+ if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) || (hw_addr == NULL &&
+ ncec->ncec_ill->ill_net_type != IRE_IF_NORESOLVER))
+ trigger_fastpath = B_FALSE;
- nce = (nce_t *)mp->b_rptr;
- mp->b_wptr = (uchar_t *)&nce[1];
- *nce = nce_nil;
- nce->nce_ill = ill;
- nce->nce_ipversion = IPV4_VERSION;
- nce->nce_flags = flags;
- nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
- nce->nce_rcnt = ill->ill_xmit_count;
- IN6_IPADDR_TO_V4MAPPED(*addr, &nce->nce_addr);
- nce->nce_mask = ipv6_all_ones;
- nce->nce_extract_mask = ipv6_all_zeros;
- nce->nce_ll_extract_start = 0;
- nce->nce_qd_mp = NULL;
- nce->nce_mp = mp;
- /* This one is for nce getting created */
- nce->nce_refcnt = 1;
- mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
- ncep = ((nce_t **)NCE_HASH_PTR_V4(ipst, *addr));
+ if (trigger_fastpath)
+ nce_fastpath_trigger(nce);
+
+ if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) {
+ /*
+ * Either the caller (by passing in ND_PROBE)
+ * or nce_add_common() (by the internally computed state
+ * based on ncec_addr and ill_net_type) has determined
+ * that this unicast entry needs DAD. Trigger DAD.
+ */
+ ndp_need_dad = B_TRUE;
+ } else if (flags & NCE_F_UNSOL_ADV) {
+ /*
+ * We account for the transmit below by assigning one
+ * less than the ndd variable. Subsequent decrements
+ * are done in nce_timer.
+ */
+ mutex_enter(&ncec->ncec_lock);
+ ncec->ncec_unsolicit_count =
+ ipst->ips_ip_arp_publish_count - 1;
+ mutex_exit(&ncec->ncec_lock);
+ dropped = arp_announce(ncec);
+ mutex_enter(&ncec->ncec_lock);
+ if (dropped)
+ ncec->ncec_unsolicit_count++;
+ else
+ ncec->ncec_last_time_defended = ddi_get_lbolt();
+ if (ncec->ncec_unsolicit_count != 0) {
+ nce_start_timer(ncec,
+ ipst->ips_ip_arp_publish_interval);
+ }
+ mutex_exit(&ncec->ncec_lock);
+ }
- nce->nce_trace_disable = B_FALSE;
+ /*
+ * If ncec_xmit_interval is 0, user has configured us to send the first
+ * probe right away. Do so, and set up for the subsequent probes.
+ */
+ if (ndp_need_dad) {
+ mutex_enter(&ncec->ncec_lock);
+ if (ncec->ncec_pcnt == 0) {
+ /*
+ * DAD probes and announce can be
+ * administratively disabled by setting the
+ * probe_count to zero. Restart the timer in
+ * this case to mark the ipif as ready.
+ */
+ ncec->ncec_unsolicit_count = 0;
+ mutex_exit(&ncec->ncec_lock);
+ nce_restart_timer(ncec, 0);
+ } else {
+ mutex_exit(&ncec->ncec_lock);
+ delay = ((ncec->ncec_flags & NCE_F_FAST) ?
+ ipst->ips_arp_probe_delay :
+ ipst->ips_arp_fastprobe_delay);
+ nce_dad(ncec, NULL, (delay == 0 ? B_TRUE : B_FALSE));
+ }
+ }
+ return (0);
+}
- if (src_nce != NULL) {
+/*
+ * ncec_walk routine to update all entries that have a given destination or
+ * gateway address and cached link layer (MAC) address. This is used when ARP
+ * informs us that a network-to-link-layer mapping may have changed.
+ */
+void
+nce_update_hw_changed(ncec_t *ncec, void *arg)
+{
+ nce_hw_map_t *hwm = arg;
+ ipaddr_t ncec_addr;
+
+ if (ncec->ncec_state != ND_REACHABLE)
+ return;
+
+ IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
+ if (ncec_addr != hwm->hwm_addr)
+ return;
+
+ mutex_enter(&ncec->ncec_lock);
+ if (hwm->hwm_flags != 0)
+ ncec->ncec_flags = hwm->hwm_flags;
+ nce_update(ncec, ND_STALE, hwm->hwm_hwaddr);
+ mutex_exit(&ncec->ncec_lock);
+}
+
+void
+ncec_refhold(ncec_t *ncec)
+{
+ mutex_enter(&(ncec)->ncec_lock);
+ (ncec)->ncec_refcnt++;
+ ASSERT((ncec)->ncec_refcnt != 0);
+#ifdef DEBUG
+ ncec_trace_ref(ncec);
+#endif
+ mutex_exit(&(ncec)->ncec_lock);
+}
+
+void
+ncec_refhold_notr(ncec_t *ncec)
+{
+ mutex_enter(&(ncec)->ncec_lock);
+ (ncec)->ncec_refcnt++;
+ ASSERT((ncec)->ncec_refcnt != 0);
+ mutex_exit(&(ncec)->ncec_lock);
+}
+
+static void
+ncec_refhold_locked(ncec_t *ncec)
+{
+ ASSERT(MUTEX_HELD(&(ncec)->ncec_lock));
+ (ncec)->ncec_refcnt++;
+#ifdef DEBUG
+ ncec_trace_ref(ncec);
+#endif
+}
+
+/* ncec_inactive destroys the mutex thus no mutex_exit is needed */
+void
+ncec_refrele(ncec_t *ncec)
+{
+ mutex_enter(&(ncec)->ncec_lock);
+#ifdef DEBUG
+ ncec_untrace_ref(ncec);
+#endif
+ ASSERT((ncec)->ncec_refcnt != 0);
+ if (--(ncec)->ncec_refcnt == 0) {
+ ncec_inactive(ncec);
+ } else {
+ mutex_exit(&(ncec)->ncec_lock);
+ }
+}
+
+void
+ncec_refrele_notr(ncec_t *ncec)
+{
+ mutex_enter(&(ncec)->ncec_lock);
+ ASSERT((ncec)->ncec_refcnt != 0);
+ if (--(ncec)->ncec_refcnt == 0) {
+ ncec_inactive(ncec);
+ } else {
+ mutex_exit(&(ncec)->ncec_lock);
+ }
+}
+
+/*
+ * Common to IPv4 and IPv6.
+ */
+void
+nce_restart_timer(ncec_t *ncec, uint_t ms)
+{
+ timeout_id_t tid;
+
+ ASSERT(!MUTEX_HELD(&(ncec)->ncec_lock));
+
+ /* First cancel any running timer */
+ mutex_enter(&ncec->ncec_lock);
+ tid = ncec->ncec_timeout_id;
+ ncec->ncec_timeout_id = 0;
+ if (tid != 0) {
+ mutex_exit(&ncec->ncec_lock);
+ (void) untimeout(tid);
+ mutex_enter(&ncec->ncec_lock);
+ }
+
+ /* Restart timer */
+ nce_start_timer(ncec, ms);
+ mutex_exit(&ncec->ncec_lock);
+}
+
+static void
+nce_start_timer(ncec_t *ncec, uint_t ms)
+{
+ ASSERT(MUTEX_HELD(&ncec->ncec_lock));
+ /*
+ * Don't start the timer if the ncec has been deleted, or if the timer
+ * is already running
+ */
+ if (!NCE_ISCONDEMNED(ncec) && ncec->ncec_timeout_id == 0) {
+ ncec->ncec_timeout_id = timeout(nce_timer, ncec,
+ MSEC_TO_TICK(ms) == 0 ? 1 : MSEC_TO_TICK(ms));
+ }
+}
+
+int
+nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
+ uint16_t flags, nce_t **newnce)
+{
+ uchar_t *hw_addr;
+ int err = 0;
+ ip_stack_t *ipst = ill->ill_ipst;
+ in6_addr_t dst6;
+ nce_t *nce;
+
+ ASSERT(!ill->ill_isv6);
+
+ IN6_IPADDR_TO_V4MAPPED(*dst, &dst6);
+ mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
+ if ((nce = nce_lookup_addr(ill, &dst6)) != NULL) {
+ mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
+ goto done;
+ }
+ if (ill->ill_net_type == IRE_IF_RESOLVER) {
+ /*
+ * For IRE_IF_RESOLVER a hardware mapping can be
+ * generated, for IRE_IF_NORESOLVER, resolution cookie
+ * in the ill is copied in nce_add_v4().
+ */
+ hw_addr = kmem_alloc(ill->ill_phys_addr_length, KM_NOSLEEP);
+ if (hw_addr == NULL) {
+ mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
+ return (ENOMEM);
+ }
+ ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr);
+ } else {
/*
- * src_nce has been provided by the caller. The only
- * caller who provides a non-null, non-broadcast
- * src_nce is from ip_newroute() which must pass in
- * a ND_REACHABLE src_nce (this condition is verified
- * via an ASSERT for the save_ire->ire_nce in ip_newroute())
+ * IRE_IF_NORESOLVER type simply copies the resolution
+ * cookie passed in. So no hw_addr is needed.
*/
- mutex_enter(&src_nce->nce_lock);
- state = src_nce->nce_state;
- if ((src_nce->nce_flags & NCE_F_CONDEMNED) ||
- (ipst->ips_ndp4->ndp_g_hw_change > 0)) {
+ hw_addr = NULL;
+ }
+ ASSERT(flags & NCE_F_MCAST);
+ ASSERT(flags & NCE_F_NONUD);
+ /* nce_state will be computed by nce_add_common() */
+ err = nce_add_v4(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
+ ND_UNCHANGED, &nce);
+ mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
+ if (err == 0)
+ err = nce_add_v4_postprocess(nce);
+ if (hw_addr != NULL)
+ kmem_free(hw_addr, ill->ill_phys_addr_length);
+ if (err != 0) {
+ ip1dbg(("nce_set_multicast_v4: create failed" "%d\n", err));
+ return (err);
+ }
+done:
+ if (newnce != NULL)
+ *newnce = nce;
+ else
+ nce_refrele(nce);
+ return (0);
+}
+
+/*
+ * This is used when scanning for "old" (least recently broadcast) NCEs. We
+ * don't want to have to walk the list for every single one, so we gather up
+ * batches at a time.
+ */
+#define NCE_RESCHED_LIST_LEN 8
+
+typedef struct {
+ ill_t *ncert_ill;
+ uint_t ncert_num;
+ ncec_t *ncert_nces[NCE_RESCHED_LIST_LEN];
+} nce_resched_t;
+
+/*
+ * Pick the longest waiting NCEs for defense.
+ */
+/* ARGSUSED */
+static int
+ncec_reschedule(ill_t *ill, nce_t *nce, void *arg)
+{
+ nce_resched_t *ncert = arg;
+ ncec_t **ncecs;
+ ncec_t **ncec_max;
+ ncec_t *ncec_temp;
+ ncec_t *ncec = nce->nce_common;
+
+ ASSERT(ncec->ncec_ill == ncert->ncert_ill);
+ /*
+ * Only reachable entries that are ready for announcement are eligible.
+ */
+ if (!NCE_MYADDR(ncec) || ncec->ncec_state != ND_REACHABLE)
+ return (0);
+ if (ncert->ncert_num < NCE_RESCHED_LIST_LEN) {
+ ncec_refhold(ncec);
+ ncert->ncert_nces[ncert->ncert_num++] = ncec;
+ } else {
+ ncecs = ncert->ncert_nces;
+ ncec_max = ncecs + NCE_RESCHED_LIST_LEN;
+ ncec_refhold(ncec);
+ for (; ncecs < ncec_max; ncecs++) {
+ ASSERT(ncec != NULL);
+ if ((*ncecs)->ncec_last_time_defended >
+ ncec->ncec_last_time_defended) {
+ ncec_temp = *ncecs;
+ *ncecs = ncec;
+ ncec = ncec_temp;
+ }
+ }
+ ncec_refrele(ncec);
+ }
+ return (0);
+}
+
+/*
+ * Reschedule the ARP defense of any long-waiting NCEs. It's assumed that this
+ * doesn't happen very often (if at all), and thus it needn't be highly
+ * optimized. (Note, though, that it's actually O(N) complexity, because the
+ * outer loop is bounded by a constant rather than by the length of the list.)
+ */
+static void
+nce_ill_reschedule(ill_t *ill, nce_resched_t *ncert)
+{
+ ncec_t *ncec;
+ ip_stack_t *ipst = ill->ill_ipst;
+ uint_t i, defend_rate;
+
+ i = ill->ill_defend_count;
+ ill->ill_defend_count = 0;
+ if (ill->ill_isv6)
+ defend_rate = ipst->ips_ndp_defend_rate;
+ else
+ defend_rate = ipst->ips_arp_defend_rate;
+ /* If none could be sitting around, then don't reschedule */
+ if (i < defend_rate) {
+ DTRACE_PROBE1(reschedule_none, ill_t *, ill);
+ return;
+ }
+ ncert->ncert_ill = ill;
+ while (ill->ill_defend_count < defend_rate) {
+ nce_walk_common(ill, ncec_reschedule, ncert);
+ for (i = 0; i < ncert->ncert_num; i++) {
+
+ ncec = ncert->ncert_nces[i];
+ mutex_enter(&ncec->ncec_lock);
+ ncec->ncec_flags |= NCE_F_DELAYED;
+ mutex_exit(&ncec->ncec_lock);
/*
- * src_nce has been deleted, or
- * ip_arp_news is in the middle of
- * flushing entries in the the nce.
- * Fail the add, since we don't know
- * if it is safe to copy the contents of
- * src_nce
+ * we plan to schedule this ncec, so incr the
+ * defend_count in anticipation.
*/
- DTRACE_PROBE2(nce__bad__src__nce,
- nce_t *, src_nce, ill_t *, ill);
- mutex_exit(&src_nce->nce_lock);
- err = EINVAL;
- goto err_ret;
+ if (++ill->ill_defend_count >= defend_rate)
+ break;
}
- template = copyb(src_nce->nce_res_mp);
- mutex_exit(&src_nce->nce_lock);
- if (template == NULL) {
- err = ENOMEM;
- goto err_ret;
+ if (ncert->ncert_num < NCE_RESCHED_LIST_LEN)
+ break;
+ }
+}
+
+/*
+ * Check if the current rate-limiting parameters permit the sending
+ * of another address defense announcement for both IPv4 and IPv6.
+ * Returns B_TRUE if rate-limiting is in effect (i.e., send is not
+ * permitted), and B_FALSE otherwise. The `defend_rate' parameter
+ * determines how many address defense announcements are permitted
+ * in any `defend_period' interval.
+ */
+static boolean_t
+ill_defend_rate_limit(ill_t *ill, ncec_t *ncec)
+{
+ clock_t now = ddi_get_lbolt();
+ ip_stack_t *ipst = ill->ill_ipst;
+ clock_t start = ill->ill_defend_start;
+ uint32_t elapsed, defend_period, defend_rate;
+ nce_resched_t ncert;
+ boolean_t ret;
+ int i;
+
+ if (ill->ill_isv6) {
+ defend_period = ipst->ips_ndp_defend_period;
+ defend_rate = ipst->ips_ndp_defend_rate;
+ } else {
+ defend_period = ipst->ips_arp_defend_period;
+ defend_rate = ipst->ips_arp_defend_rate;
+ }
+ if (defend_rate == 0)
+ return (B_TRUE);
+ bzero(&ncert, sizeof (ncert));
+ mutex_enter(&ill->ill_lock);
+ if (start > 0) {
+ elapsed = now - start;
+ if (elapsed > SEC_TO_TICK(defend_period)) {
+ ill->ill_defend_start = now;
+ /*
+ * nce_ill_reschedule will attempt to
+ * prevent starvation by rescheduling the
+ * oldest entries, which are marked with
+ * the NCE_F_DELAYED flag.
+ */
+ nce_ill_reschedule(ill, &ncert);
+ }
+ } else {
+ ill->ill_defend_start = now;
+ }
+ ASSERT(ill->ill_defend_count <= defend_rate);
+ mutex_enter(&ncec->ncec_lock);
+ if (ncec->ncec_flags & NCE_F_DELAYED) {
+ /*
+ * This ncec was rescheduled as one of the really old
+ * entries needing on-going defense. The
+ * ill_defend_count was already incremented in
+ * nce_ill_reschedule. Go ahead and send the announce.
+ */
+ ncec->ncec_flags &= ~NCE_F_DELAYED;
+ mutex_exit(&ncec->ncec_lock);
+ ret = B_FALSE;
+ goto done;
+ }
+ mutex_exit(&ncec->ncec_lock);
+ if (ill->ill_defend_count < defend_rate)
+ ill->ill_defend_count++;
+ if (ill->ill_defend_count == defend_rate) {
+ /*
+ * we are no longer allowed to send unbidden defense
+ * messages. Wait for rescheduling.
+ */
+ ret = B_TRUE;
+ } else {
+ ret = B_FALSE;
+ }
+done:
+ mutex_exit(&ill->ill_lock);
+ /*
+ * After all the locks have been dropped we can restart nce timer,
+ * and refrele the delayed ncecs
+ */
+ for (i = 0; i < ncert.ncert_num; i++) {
+ clock_t xmit_interval;
+ ncec_t *tmp;
+
+ tmp = ncert.ncert_nces[i];
+ xmit_interval = nce_fuzz_interval(tmp->ncec_xmit_interval,
+ B_FALSE);
+ nce_restart_timer(tmp, xmit_interval);
+ ncec_refrele(tmp);
+ }
+ return (ret);
+}
+
+boolean_t
+ndp_announce(ncec_t *ncec)
+{
+ return (ndp_xmit(ncec->ncec_ill, ND_NEIGHBOR_ADVERT, ncec->ncec_lladdr,
+ ncec->ncec_lladdr_length, &ncec->ncec_addr, &ipv6_all_hosts_mcast,
+ nce_advert_flags(ncec)));
+}
+
+ill_t *
+nce_resolve_src(ncec_t *ncec, in6_addr_t *src)
+{
+ mblk_t *mp;
+ in6_addr_t src6;
+ ipaddr_t src4;
+ ill_t *ill = ncec->ncec_ill;
+ ill_t *src_ill = NULL;
+ ipif_t *ipif = NULL;
+ boolean_t is_myaddr = NCE_MYADDR(ncec);
+ boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
+
+ ASSERT(src != NULL);
+ ASSERT(IN6_IS_ADDR_UNSPECIFIED(src));
+ src6 = *src;
+ if (is_myaddr) {
+ src6 = ncec->ncec_addr;
+ if (!isv6)
+ IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, src4);
+ } else {
+ /*
+ * try to find one from the outgoing packet.
+ */
+ mutex_enter(&ncec->ncec_lock);
+ mp = ncec->ncec_qd_mp;
+ if (mp != NULL) {
+ if (isv6) {
+ ip6_t *ip6h = (ip6_t *)mp->b_rptr;
+
+ src6 = ip6h->ip6_src;
+ } else {
+ ipha_t *ipha = (ipha_t *)mp->b_rptr;
+
+ src4 = ipha->ipha_src;
+ IN6_IPADDR_TO_V4MAPPED(src4, &src6);
+ }
+ }
+ mutex_exit(&ncec->ncec_lock);
+ }
+
+ /*
+ * For outgoing packets, if the src of outgoing packet is one
+ * of the assigned interface addresses use it, otherwise we
+ * will pick the source address below.
+ * For local addresses (is_myaddr) doing DAD, NDP announce
+ * messages are mcast. So we use the (IPMP) cast_ill or the
+ * (non-IPMP) ncec_ill for these message types. The only case
+ * of unicast DAD messages are for IPv6 ND probes, for which
+ * we find the ipif_bound_ill corresponding to the ncec_addr.
+ */
+ if (!IN6_IS_ADDR_UNSPECIFIED(&src6) || is_myaddr) {
+ if (isv6) {
+ ipif = ipif_lookup_addr_nondup_v6(&src6, ill, ALL_ZONES,
+ ill->ill_ipst);
+ } else {
+ ipif = ipif_lookup_addr_nondup(src4, ill, ALL_ZONES,
+ ill->ill_ipst);
+ }
+
+ /*
+ * If no relevant ipif can be found, then it's not one of our
+ * addresses. Reset to :: and try to find a src for the NS or
+ * ARP request using ipif_select_source_v[4,6] below.
+ * If an ipif can be found, but it's not yet done with
+ * DAD verification, and we are not being invoked for
+ * DAD (i.e., !is_myaddr), then just postpone this
+ * transmission until later.
+ */
+ if (ipif == NULL) {
+ src6 = ipv6_all_zeros;
+ src4 = INADDR_ANY;
+ } else if (!ipif->ipif_addr_ready && !is_myaddr) {
+ DTRACE_PROBE2(nce__resolve__ipif__not__ready,
+ ncec_t *, ncec, ipif_t *, ipif);
+ ipif_refrele(ipif);
+ return (NULL);
}
- } else if (flags & NCE_F_BCAST) {
+ }
+
+ if (IN6_IS_ADDR_UNSPECIFIED(&src6) && !is_myaddr) {
/*
- * broadcast nce.
+ * Pick a source address for this solicitation, but
+ * restrict the selection to addresses assigned to the
+ * output interface. We do this because the destination will
+ * create a neighbor cache entry for the source address of
+ * this packet, so the source address had better be a valid
+ * neighbor.
*/
- template = copyb(ill->ill_bcast_mp);
+ if (isv6) {
+ ipif = ipif_select_source_v6(ill, &ncec->ncec_addr,
+ B_TRUE, IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
+ B_FALSE, NULL);
+ } else {
+ ipaddr_t nce_addr;
+
+ IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, nce_addr);
+ ipif = ipif_select_source_v4(ill, nce_addr, ALL_ZONES,
+ B_FALSE, NULL);
+ }
+ if (ipif == NULL && IS_IPMP(ill)) {
+ ill_t *send_ill = ipmp_ill_get_xmit_ill(ill, B_TRUE);
+
+ if (send_ill != NULL) {
+ if (isv6) {
+ ipif = ipif_select_source_v6(send_ill,
+ &ncec->ncec_addr, B_TRUE,
+ IPV6_PREFER_SRC_DEFAULT, ALL_ZONES,
+ B_FALSE, NULL);
+ } else {
+ IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr,
+ src4);
+ ipif = ipif_select_source_v4(send_ill,
+ src4, ALL_ZONES, B_TRUE, NULL);
+ }
+ ill_refrele(send_ill);
+ }
+ }
+
+ if (ipif == NULL) {
+ char buf[INET6_ADDRSTRLEN];
+
+ ip1dbg(("nce_resolve_src: No source ipif for dst %s\n",
+ inet_ntop((isv6 ? AF_INET6 : AF_INET),
+ (char *)&ncec->ncec_addr, buf, sizeof (buf))));
+ DTRACE_PROBE1(nce__resolve__no__ipif, ncec_t *, ncec);
+ return (NULL);
+ }
+ src6 = ipif->ipif_v6lcl_addr;
+ }
+ *src = src6;
+ if (ipif != NULL) {
+ src_ill = ipif->ipif_ill;
+ if (IS_IPMP(src_ill))
+ src_ill = ipmp_ipif_hold_bound_ill(ipif);
+ else
+ ill_refhold(src_ill);
+ ipif_refrele(ipif);
+ DTRACE_PROBE2(nce__resolve__src__ill, ncec_t *, ncec,
+ ill_t *, src_ill);
+ }
+ return (src_ill);
+}
+
+void
+ip_nce_lookup_and_update(ipaddr_t *addr, ipif_t *ipif, ip_stack_t *ipst,
+ uchar_t *hwaddr, int hwaddr_len, int flags)
+{
+ ill_t *ill;
+ ncec_t *ncec;
+ nce_t *nce;
+ uint16_t new_state;
+
+ ill = (ipif ? ipif->ipif_ill : NULL);
+ if (ill != NULL) {
+ /*
+ * only one ncec is possible
+ */
+ nce = nce_lookup_v4(ill, addr);
+ if (nce != NULL) {
+ ncec = nce->nce_common;
+ mutex_enter(&ncec->ncec_lock);
+ if (NCE_ISREACHABLE(ncec))
+ new_state = ND_UNCHANGED;
+ else
+ new_state = ND_STALE;
+ ncec->ncec_flags = flags;
+ nce_update(ncec, new_state, hwaddr);
+ mutex_exit(&ncec->ncec_lock);
+ nce_refrele(nce);
+ return;
+ }
+ } else {
+ /*
+ * ill is wildcard; clean up all ncec's and ire's
+ * that match on addr.
+ */
+ nce_hw_map_t hwm;
+
+ hwm.hwm_addr = *addr;
+ hwm.hwm_hwlen = hwaddr_len;
+ hwm.hwm_hwaddr = hwaddr;
+ hwm.hwm_flags = flags;
+
+ ncec_walk_common(ipst->ips_ndp4, NULL,
+ (pfi_t)nce_update_hw_changed, (uchar_t *)&hwm, B_TRUE);
+ }
+}
+
+/*
+ * Common function to add ncec entries.
+ * we always add the ncec with ncec_ill == ill, and always create
+ * nce_t on ncec_ill. A dlpi fastpath message may be triggered if the
+ * ncec is !reachable.
+ *
+ * When the caller passes in an nce_state of ND_UNCHANGED,
+ * nce_add_common() will determine the state of the created nce based
+ * on the ill_net_type and nce_flags used. Otherwise, the nce will
+ * be created with state set to the passed in nce_state.
+ */
+static int
+nce_add_common(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len,
+ const in6_addr_t *addr, uint16_t flags, uint16_t nce_state, nce_t **retnce)
+{
+ static ncec_t nce_nil;
+ uchar_t *template = NULL;
+ int err;
+ ncec_t *ncec;
+ ncec_t **ncep;
+ ip_stack_t *ipst = ill->ill_ipst;
+ uint16_t state;
+ boolean_t fastprobe = B_FALSE;
+ struct ndp_g_s *ndp;
+ nce_t *nce = NULL;
+ mblk_t *dlur_mp = NULL;
+
+ if (ill->ill_isv6)
+ ndp = ill->ill_ipst->ips_ndp6;
+ else
+ ndp = ill->ill_ipst->ips_ndp4;
+
+ *retnce = NULL;
+
+ ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
+
+ if (IN6_IS_ADDR_UNSPECIFIED(addr)) {
+ ip0dbg(("nce_add_common: no addr\n"));
+ return (EINVAL);
+ }
+ if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) {
+ ip0dbg(("nce_add_common: flags = %x\n", (int)flags));
+ return (EINVAL);
+ }
+
+ if (ill->ill_isv6) {
+ ncep = ((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr));
+ } else {
+ ipaddr_t v4addr;
+
+ IN6_V4MAPPED_TO_IPADDR(addr, v4addr);
+ ncep = ((ncec_t **)NCE_HASH_PTR_V4(ipst, v4addr));
+ }
+
+ /*
+ * The caller has ensured that there is no nce on ill, but there could
+ * still be an nce_common_t for the address, so that we find existing
+ * ncec_t structures first, and atomically add a new nce_t if
+ * one is found. The ndp_g_lock ensures that we don't cross threads
+ * with an ncec_delete(). Unlike ncec_lookup_illgrp() we do not
+ * compare for matches across the illgrp because this function is
+ * called via nce_lookup_then_add_v* -> nce_add_v* -> nce_add_common,
+ * with the nce_lookup_then_add_v* passing in the ipmp_ill where
+ * appropriate.
+ */
+ ncec = *ncep;
+ for (; ncec != NULL; ncec = ncec->ncec_next) {
+ if (ncec->ncec_ill == ill) {
+ if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) {
+ *retnce = nce_ill_lookup_then_add(ill, ncec);
+ if (*retnce != NULL)
+ break;
+ }
+ }
+ }
+ if (*retnce != NULL) {
+ /*
+ * We should never find *retnce to be MYADDR, since the caller
+ * may then incorrectly restart a DAD timer that's already
+ * running.
+ */
+ ASSERT(!NCE_MYADDR(ncec));
+ /* caller must trigger fastpath on nce */
+ return (0);
+ }
+ ncec = kmem_cache_alloc(ncec_cache, KM_NOSLEEP);
+ if (ncec == NULL)
+ return (ENOMEM);
+ *ncec = nce_nil;
+ ncec->ncec_ill = ill;
+ ncec->ncec_ipversion = (ill->ill_isv6 ? IPV6_VERSION : IPV4_VERSION);
+ ncec->ncec_flags = flags;
+ ncec->ncec_ipst = ipst; /* No netstack_hold */
+
+ if (!ill->ill_isv6) {
+ ipaddr_t addr4;
+
+ /*
+ * DAD probe interval and probe count are set based on
+ * fast/slow probe settings. If the underlying link doesn't
+ * have reliably up/down notifications or if we're working
+ * with IPv4 169.254.0.0/16 Link Local Address space, then
+ * don't use the fast timers. Otherwise, use them.
+ */
+ ASSERT(IN6_IS_ADDR_V4MAPPED(addr));
+ IN6_V4MAPPED_TO_IPADDR(addr, addr4);
+ if (ill->ill_note_link && !IS_IPV4_LL_SPACE(&addr4))
+ fastprobe = B_TRUE;
+ if (fastprobe) {
+ ncec->ncec_xmit_interval =
+ ipst->ips_arp_fastprobe_interval;
+ ncec->ncec_pcnt =
+ ipst->ips_arp_fastprobe_count;
+ ncec->ncec_flags |= NCE_F_FAST;
+ } else {
+ ncec->ncec_xmit_interval =
+ ipst->ips_arp_probe_interval;
+ ncec->ncec_pcnt =
+ ipst->ips_arp_probe_count;
+ }
+ if (NCE_PUBLISH(ncec)) {
+ ncec->ncec_unsolicit_count =
+ ipst->ips_ip_arp_publish_count;
+ }
+ } else {
+ /*
+ * probe interval is constant: ILL_PROBE_INTERVAL
+ * probe count is constant: ND_MAX_UNICAST_SOLICIT
+ */
+ ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT;
+ if (NCE_PUBLISH(ncec)) {
+ ncec->ncec_unsolicit_count =
+ ipst->ips_ip_ndp_unsolicit_count;
+ }
+ }
+ ncec->ncec_rcnt = ill->ill_xmit_count;
+ ncec->ncec_addr = *addr;
+ ncec->ncec_qd_mp = NULL;
+ ncec->ncec_refcnt = 1; /* for ncec getting created */
+ mutex_init(&ncec->ncec_lock, NULL, MUTEX_DEFAULT, NULL);
+ ncec->ncec_trace_disable = B_FALSE;
+
+ /*
+ * ncec_lladdr holds link layer address
+ */
+ if (hw_addr_len > 0) {
+ template = kmem_alloc(hw_addr_len, KM_NOSLEEP);
if (template == NULL) {
err = ENOMEM;
goto err_ret;
}
+ ncec->ncec_lladdr = template;
+ ncec->ncec_lladdr_length = hw_addr_len;
+ bzero(ncec->ncec_lladdr, hw_addr_len);
+ }
+ if ((flags & NCE_F_BCAST) != 0) {
state = ND_REACHABLE;
+ ASSERT(hw_addr_len > 0);
+ } else if (ill->ill_net_type == IRE_IF_RESOLVER) {
+ state = ND_INITIAL;
} else if (ill->ill_net_type == IRE_IF_NORESOLVER) {
/*
* NORESOLVER entries are always created in the REACHABLE
* state.
*/
+ state = ND_REACHABLE;
if (ill->ill_phys_addr_length == IP_ADDR_LEN &&
ill->ill_mactype != DL_IPV4 &&
ill->ill_mactype != DL_6TO4) {
@@ -3698,32 +4501,91 @@ ndp_add_v4(ill_t *ill, const in_addr_t *addr, uint16_t flags,
* that do their own resolution from IP to link-layer
* address (e.g. IP over X.25).
*/
- template = ill_dlur_gen((uchar_t *)addr,
- ill->ill_phys_addr_length,
- ill->ill_sap, ill->ill_sap_length);
- } else {
- template = copyb(ill->ill_resolver_mp);
+ bcopy((uchar_t *)addr,
+ ncec->ncec_lladdr, ill->ill_phys_addr_length);
}
- if (template == NULL) {
- err = ENOMEM;
- goto err_ret;
+ if (ill->ill_phys_addr_length == IPV6_ADDR_LEN &&
+ ill->ill_mactype != DL_IPV6) {
+ /*
+ * We create a nce_res_mp with the IP nexthop address
+ * as the destination address if the physical length
+ * is exactly 16 bytes for point-to-multipoint links
+ * that do their own resolution from IP to link-layer
+ * address.
+ */
+ bcopy((uchar_t *)addr,
+ ncec->ncec_lladdr, ill->ill_phys_addr_length);
}
+ /*
+ * Since NUD is not part of the base IPv4 protocol definition,
+ * IPv4 neighbor entries on NORESOLVER interfaces will never
+ * age, and are marked NCE_F_NONUD.
+ */
+ if (!ill->ill_isv6)
+ ncec->ncec_flags |= NCE_F_NONUD;
+ } else if (ill->ill_net_type == IRE_LOOPBACK) {
state = ND_REACHABLE;
}
- nce->nce_fp_mp = NULL;
- nce->nce_res_mp = template;
- nce->nce_state = state;
+
+ if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER) {
+ /*
+ * We are adding an ncec with a deterministic hw_addr,
+ * so the state can only be one of {REACHABLE, STALE, PROBE}.
+ *
+ * if we are adding a unicast ncec for the local address
+ * it would be REACHABLE; we would be adding a ND_STALE entry
+ * for the requestor of an ARP_REQUEST/ND_SOLICIT. Our own
+ * addresses are added in PROBE to trigger DAD.
+ */
+ if ((flags & (NCE_F_MCAST|NCE_F_BCAST)) ||
+ ill->ill_net_type == IRE_IF_NORESOLVER)
+ state = ND_REACHABLE;
+ else if (!NCE_PUBLISH(ncec))
+ state = ND_STALE;
+ else
+ state = ND_PROBE;
+ if (hw_addr != NULL)
+ nce_set_ll(ncec, hw_addr);
+ }
+ /* caller overrides internally computed state */
+ if (nce_state != ND_UNCHANGED)
+ state = nce_state;
+
+ if (state == ND_PROBE)
+ ncec->ncec_flags |= NCE_F_UNVERIFIED;
+
+ ncec->ncec_state = state;
+
if (state == ND_REACHABLE) {
- nce->nce_last = TICK_TO_MSEC(lbolt64);
- nce->nce_init_time = TICK_TO_MSEC(lbolt64);
+ ncec->ncec_last = TICK_TO_MSEC(lbolt64);
+ ncec->ncec_init_time = TICK_TO_MSEC(lbolt64);
} else {
- nce->nce_last = 0;
+ ncec->ncec_last = 0;
if (state == ND_INITIAL)
- nce->nce_init_time = TICK_TO_MSEC(lbolt64);
+ ncec->ncec_init_time = TICK_TO_MSEC(lbolt64);
+ }
+ list_create(&ncec->ncec_cb, sizeof (ncec_cb_t),
+ offsetof(ncec_cb_t, ncec_cb_node));
+ /*
+ * have all the memory allocations out of the way before taking locks
+ * and adding the nce.
+ */
+ nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
+ if (nce == NULL) {
+ err = ENOMEM;
+ goto err_ret;
+ }
+ if (ncec->ncec_lladdr != NULL ||
+ ill->ill_net_type == IRE_IF_NORESOLVER) {
+ dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
+ ill->ill_phys_addr_length, ill->ill_sap,
+ ill->ill_sap_length);
+ if (dlur_mp == NULL) {
+ err = ENOMEM;
+ goto err_ret;
+ }
}
- ASSERT((nce->nce_res_mp == NULL && nce->nce_state == ND_INITIAL) ||
- (nce->nce_res_mp != NULL && nce->nce_state == ND_REACHABLE));
/*
* Atomically ensure that the ill is not CONDEMNED, before
* adding the NCE.
@@ -3734,128 +4596,423 @@ ndp_add_v4(ill_t *ill, const in_addr_t *addr, uint16_t flags,
err = EINVAL;
goto err_ret;
}
- if ((nce->nce_next = *ncep) != NULL)
- nce->nce_next->nce_ptpn = &nce->nce_next;
- *ncep = nce;
- nce->nce_ptpn = ncep;
- *newnce = nce;
- /* This one is for nce being used by an active thread */
- NCE_REFHOLD(*newnce);
+ if (!NCE_MYADDR(ncec) &&
+ (ill->ill_state_flags & ILL_DOWN_IN_PROGRESS)) {
+ mutex_exit(&ill->ill_lock);
+ DTRACE_PROBE1(nce__add__on__down__ill, ncec_t *, ncec);
+ err = EINVAL;
+ goto err_ret;
+ }
+ /*
+ * Acquire the ncec_lock even before adding the ncec to the list
+ * so that it cannot get deleted after the ncec is added, but
+ * before we add the nce.
+ */
+ mutex_enter(&ncec->ncec_lock);
+ if ((ncec->ncec_next = *ncep) != NULL)
+ ncec->ncec_next->ncec_ptpn = &ncec->ncec_next;
+ *ncep = ncec;
+ ncec->ncec_ptpn = ncep;
- /* Bump up the number of nce's referencing this ill */
+ /* Bump up the number of ncec's referencing this ill */
DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
- (char *), "nce", (void *), nce);
- ill->ill_nce_cnt++;
+ (char *), "ncec", (void *), ncec);
+ ill->ill_ncec_cnt++;
+ /*
+ * Since we hold the ncec_lock at this time, the ncec cannot be
+ * condemned, and we can safely add the nce.
+ */
+ *retnce = nce_add_impl(ill, ncec, nce, dlur_mp);
+ mutex_exit(&ncec->ncec_lock);
mutex_exit(&ill->ill_lock);
- DTRACE_PROBE1(ndp__add__v4, nce_t *, nce);
+
+ /* caller must trigger fastpath on *retnce */
return (0);
+
err_ret:
- freeb(mp);
- freemsg(template);
+ if (ncec != NULL)
+ kmem_cache_free(ncec_cache, ncec);
+ if (nce != NULL)
+ kmem_cache_free(nce_cache, nce);
+ freemsg(dlur_mp);
+ if (template != NULL)
+ kmem_free(template, ill->ill_phys_addr_length);
return (err);
}
/*
- * ndp_walk routine to delete all entries that have a given destination or
- * gateway address and cached link layer (MAC) address. This is used when ARP
- * informs us that a network-to-link-layer mapping may have changed.
+ * take a ref on the nce
*/
void
-nce_delete_hw_changed(nce_t *nce, void *arg)
+nce_refhold(nce_t *nce)
{
- nce_hw_map_t *hwm = arg;
- mblk_t *mp;
- dl_unitdata_req_t *dlu;
- uchar_t *macaddr;
- ill_t *ill;
- int saplen;
- ipaddr_t nce_addr;
+ mutex_enter(&nce->nce_lock);
+ nce->nce_refcnt++;
+ ASSERT((nce)->nce_refcnt != 0);
+ mutex_exit(&nce->nce_lock);
+}
- if (nce->nce_state != ND_REACHABLE)
- return;
+/*
+ * release a ref on the nce; In general, this
+ * cannot be called with locks held because nce_refrele
+ * may result in nce_inactive which will take the ill_lock,
+ * do ipif_ill_refrele_tail etc. Thus the one exception
+ * where this can be called with locks held is when the caller
+ * is certain that the nce_refcnt is sufficient to prevent
+ * the invocation of nce_inactive.
+ */
+void
+nce_refrele(nce_t *nce)
+{
+ ASSERT((nce)->nce_refcnt != 0);
+ mutex_enter(&nce->nce_lock);
+ if (--nce->nce_refcnt == 0)
+ nce_inactive(nce); /* destroys the mutex */
+ else
+ mutex_exit(&nce->nce_lock);
+}
- IN6_V4MAPPED_TO_IPADDR(&nce->nce_addr, nce_addr);
- if (nce_addr != hwm->hwm_addr)
- return;
+/*
+ * free the nce after all refs have gone away.
+ */
+static void
+nce_inactive(nce_t *nce)
+{
+ ill_t *ill = nce->nce_ill;
+
+ ASSERT(nce->nce_refcnt == 0);
+
+ ncec_refrele_notr(nce->nce_common);
+ nce->nce_common = NULL;
+ freemsg(nce->nce_fp_mp);
+ freemsg(nce->nce_dlur_mp);
+
+ mutex_enter(&ill->ill_lock);
+ DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill,
+ (char *), "nce", (void *), nce);
+ ill->ill_nce_cnt--;
+ nce->nce_ill = NULL;
+ /*
+ * If the number of ncec's associated with this ill have dropped
+ * to zero, check whether we need to restart any operation that
+ * is waiting for this to happen.
+ */
+ if (ILL_DOWN_OK(ill)) {
+ /* ipif_ill_refrele_tail drops the ill_lock */
+ ipif_ill_refrele_tail(ill);
+ } else {
+ mutex_exit(&ill->ill_lock);
+ }
+
+ mutex_destroy(&nce->nce_lock);
+ kmem_cache_free(nce_cache, nce);
+}
+
+/*
+ * Add an nce to the ill_nce list.
+ */
+static nce_t *
+nce_add_impl(ill_t *ill, ncec_t *ncec, nce_t *nce, mblk_t *dlur_mp)
+{
+ bzero(nce, sizeof (*nce));
+ mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
+ nce->nce_common = ncec;
+ nce->nce_addr = ncec->ncec_addr;
+ nce->nce_ill = ill;
+ DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
+ (char *), "nce", (void *), nce);
+ ill->ill_nce_cnt++;
+
+ nce->nce_refcnt = 1; /* for the thread */
+ ncec->ncec_refcnt++; /* want ncec_refhold_locked_notr(ncec) */
+ nce->nce_dlur_mp = dlur_mp;
+
+ /* add nce to the ill's fastpath list. */
+ nce->nce_refcnt++; /* for the list */
+ list_insert_head(&ill->ill_nce, nce);
+ return (nce);
+}
+
+static nce_t *
+nce_add(ill_t *ill, ncec_t *ncec)
+{
+ nce_t *nce;
+ mblk_t *dlur_mp = NULL;
+
+ ASSERT(MUTEX_HELD(&ill->ill_lock));
+ ASSERT(MUTEX_HELD(&ncec->ncec_lock));
+
+ nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
+ if (nce == NULL)
+ return (NULL);
+ if (ncec->ncec_lladdr != NULL ||
+ ill->ill_net_type == IRE_IF_NORESOLVER) {
+ dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
+ ill->ill_phys_addr_length, ill->ill_sap,
+ ill->ill_sap_length);
+ if (dlur_mp == NULL) {
+ kmem_cache_free(nce_cache, nce);
+ return (NULL);
+ }
+ }
+ return (nce_add_impl(ill, ncec, nce, dlur_mp));
+}
+
+/*
+ * remove the nce from the ill's fastpath (ill_nce) list
+ */
+void
+nce_delete(nce_t *nce)
+{
+ ill_t *ill = nce->nce_ill;
+
+ ASSERT(MUTEX_HELD(&ill->ill_lock));
mutex_enter(&nce->nce_lock);
- if ((mp = nce->nce_res_mp) == NULL) {
+ if (nce->nce_is_condemned) {
+ /*
+ * some other thread has removed this nce from the ill_nce list
+ */
mutex_exit(&nce->nce_lock);
return;
}
- dlu = (dl_unitdata_req_t *)mp->b_rptr;
- macaddr = (uchar_t *)(dlu + 1);
- ill = nce->nce_ill;
- if ((saplen = ill->ill_sap_length) > 0)
- macaddr += saplen;
- else
- saplen = -saplen;
+ nce->nce_is_condemned = B_TRUE;
+ mutex_exit(&nce->nce_lock);
+ list_remove(&ill->ill_nce, nce);
/*
- * If the hardware address is unchanged, then leave this one alone.
- * Note that saplen == abs(saplen) now.
+ * even though we are holding the ill_lock, it is ok to
+ * call nce_refrele here because we know that we should have
+ * at least 2 refs on the nce: one for the thread, and one
+ * for the list. The refrele below will release the one for
+ * the list.
*/
- if (hwm->hwm_hwlen == dlu->dl_dest_addr_length - saplen &&
- bcmp(hwm->hwm_hwaddr, macaddr, hwm->hwm_hwlen) == 0) {
- mutex_exit(&nce->nce_lock);
- return;
+ nce_refrele(nce);
+}
+
+nce_t *
+nce_lookup(ill_t *ill, const in6_addr_t *addr)
+{
+ nce_t *nce = NULL;
+
+ ASSERT(ill != NULL);
+ ASSERT(MUTEX_HELD(&ill->ill_lock));
+
+ for (nce = list_head(&ill->ill_nce); nce != NULL;
+ nce = list_next(&ill->ill_nce, nce)) {
+ if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr))
+ break;
}
- mutex_exit(&nce->nce_lock);
- DTRACE_PROBE1(nce__hw__deleted, nce_t *, nce);
- ndp_delete(nce);
+ /*
+ * if we found the nce on the ill_nce list while holding
+ * the ill_lock, then it cannot be condemned yet.
+ */
+ if (nce != NULL) {
+ ASSERT(!nce->nce_is_condemned);
+ nce_refhold(nce);
+ }
+ return (nce);
}
/*
- * This function verifies whether a given IPv4 address is potentially known to
- * the NCE subsystem. If so, then ARP must not delete the corresponding ace_t,
- * so that it can continue to look for hardware changes on that address.
+ * Walk the ill_nce list on ill. The callback function func() cannot perform
+ * any destructive actions.
*/
-boolean_t
-ndp_lookup_ipaddr(in_addr_t addr, netstack_t *ns)
+static void
+nce_walk_common(ill_t *ill, pfi_t func, void *arg)
{
- nce_t *nce;
- struct in_addr nceaddr;
- ip_stack_t *ipst = ns->netstack_ip;
+ nce_t *nce = NULL, *nce_next;
- if (addr == INADDR_ANY)
- return (B_FALSE);
+ ASSERT(MUTEX_HELD(&ill->ill_lock));
+ for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
+ nce_next = list_next(&ill->ill_nce, nce);
+ if (func(ill, nce, arg) != 0)
+ break;
+ nce = nce_next;
+ }
+}
- mutex_enter(&ipst->ips_ndp4->ndp_g_lock);
- nce = *(nce_t **)NCE_HASH_PTR_V4(ipst, addr);
- for (; nce != NULL; nce = nce->nce_next) {
- /* Note that only v4 mapped entries are in the table. */
- IN6_V4MAPPED_TO_INADDR(&nce->nce_addr, &nceaddr);
- if (addr == nceaddr.s_addr &&
- IN6_ARE_ADDR_EQUAL(&nce->nce_mask, &ipv6_all_ones)) {
- /* Single flag check; no lock needed */
- if (!(nce->nce_flags & NCE_F_CONDEMNED))
- break;
+void
+nce_walk(ill_t *ill, pfi_t func, void *arg)
+{
+ mutex_enter(&ill->ill_lock);
+ nce_walk_common(ill, func, arg);
+ mutex_exit(&ill->ill_lock);
+}
+
+void
+nce_flush(ill_t *ill, boolean_t flushall)
+{
+ nce_t *nce, *nce_next;
+ list_t dead;
+
+ list_create(&dead, sizeof (nce_t), offsetof(nce_t, nce_node));
+ mutex_enter(&ill->ill_lock);
+ for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
+ nce_next = list_next(&ill->ill_nce, nce);
+ if (!flushall && NCE_PUBLISH(nce->nce_common)) {
+ nce = nce_next;
+ continue;
}
+ /*
+ * nce_delete requires that the caller should either not
+ * be holding locks, or should hold a ref to ensure that
+ * we won't hit ncec_inactive. So take a ref and clean up
+ * after the list is flushed.
+ */
+ nce_refhold(nce);
+ nce_delete(nce);
+ list_insert_tail(&dead, nce);
+ nce = nce_next;
}
- mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
- return (nce != NULL);
+ mutex_exit(&ill->ill_lock);
+ while ((nce = list_head(&dead)) != NULL) {
+ list_remove(&dead, nce);
+ nce_refrele(nce);
+ }
+ ASSERT(list_is_empty(&dead));
+ list_destroy(&dead);
}
-/*
- * Wrapper around ipif_lookup_addr_exact_v6() that allows ND to work properly
- * with IPMP. Specifically, since neighbor discovery is always done on
- * underlying interfaces (even for addresses owned by an IPMP interface), we
- * need to check for `v6addrp' on both `ill' and on the IPMP meta-interface
- * associated with `ill' (if it exists).
- */
-static ipif_t *
-ip_ndp_lookup_addr_v6(const in6_addr_t *v6addrp, ill_t *ill)
+/* Return an interval that is anywhere in the [1 .. intv] range */
+static clock_t
+nce_fuzz_interval(clock_t intv, boolean_t initial_time)
+{
+ clock_t rnd, frac;
+
+ (void) random_get_pseudo_bytes((uint8_t *)&rnd, sizeof (rnd));
+ /* Note that clock_t is signed; must chop off bits */
+ rnd &= (1ul << (NBBY * sizeof (rnd) - 1)) - 1;
+ if (initial_time) {
+ if (intv <= 0)
+ intv = 1;
+ else
+ intv = (rnd % intv) + 1;
+ } else {
+ /* Compute 'frac' as 20% of the configured interval */
+ if ((frac = intv / 5) <= 1)
+ frac = 2;
+ /* Set intv randomly in the range [intv-frac .. intv+frac] */
+ if ((intv = intv - frac + rnd % (2 * frac + 1)) <= 0)
+ intv = 1;
+ }
+ return (intv);
+}
+
+void
+nce_resolv_ipmp_ok(ncec_t *ncec)
{
- ipif_t *ipif;
+ mblk_t *mp;
+ uint_t pkt_len;
+ iaflags_t ixaflags = IXAF_NO_TRACE;
+ nce_t *under_nce;
+ ill_t *ill = ncec->ncec_ill;
+ boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
+ ipif_t *src_ipif = NULL;
ip_stack_t *ipst = ill->ill_ipst;
+ ill_t *send_ill;
+ uint_t nprobes;
- ipif = ipif_lookup_addr_exact_v6(v6addrp, ill, ipst);
- if (ipif == NULL && IS_UNDER_IPMP(ill)) {
- if ((ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) {
- ipif = ipif_lookup_addr_exact_v6(v6addrp, ill, ipst);
- ill_refrele(ill);
+ ASSERT(IS_IPMP(ill));
+
+ mutex_enter(&ncec->ncec_lock);
+ nprobes = ncec->ncec_nprobes;
+ mp = ncec->ncec_qd_mp;
+ ncec->ncec_qd_mp = NULL;
+ ncec->ncec_nprobes = 0;
+ mutex_exit(&ncec->ncec_lock);
+
+ while (mp != NULL) {
+ mblk_t *nxt_mp;
+
+ nxt_mp = mp->b_next;
+ mp->b_next = NULL;
+ if (isv6) {
+ ip6_t *ip6h = (ip6_t *)mp->b_rptr;
+
+ pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
+ src_ipif = ipif_lookup_addr_nondup_v6(&ip6h->ip6_src,
+ ill, ALL_ZONES, ipst);
+ } else {
+ ipha_t *ipha = (ipha_t *)mp->b_rptr;
+
+ ixaflags |= IXAF_IS_IPV4;
+ pkt_len = ntohs(ipha->ipha_length);
+ src_ipif = ipif_lookup_addr_nondup(ipha->ipha_src,
+ ill, ALL_ZONES, ipst);
+ }
+
+ /*
+ * find a new nce based on an under_ill. The first IPMP probe
+ * packet gets queued, so we could still find a src_ipif that
+ * matches an IPMP test address.
+ */
+ if (src_ipif == NULL || IS_IPMP(src_ipif->ipif_ill)) {
+ /*
+ * if src_ipif is null, this could be either a
+ * forwarded packet or a probe whose src got deleted.
+ * We identify the former case by looking for the
+ * ncec_nprobes: the first ncec_nprobes packets are
+ * probes;
+ */
+ if (src_ipif == NULL && nprobes > 0)
+ goto drop_pkt;
+
+ /*
+ * For forwarded packets, we use the ipmp rotor
+ * to find send_ill.
+ */
+ send_ill = ipmp_ill_get_xmit_ill(ncec->ncec_ill,
+ B_TRUE);
+ } else {
+ send_ill = src_ipif->ipif_ill;
+ ill_refhold(send_ill);
+ }
+
+ DTRACE_PROBE4(nce__resolve__ipmp, (mblk_t *), mp,
+ (ncec_t *), ncec, (ipif_t *),
+ src_ipif, (ill_t *), send_ill);
+
+ if (send_ill == NULL) {
+ if (src_ipif != NULL)
+ ipif_refrele(src_ipif);
+ goto drop_pkt;
}
+ /* create an under_nce on send_ill */
+ rw_enter(&ipst->ips_ill_g_lock, RW_READER);
+ if (IS_IN_SAME_ILLGRP(send_ill, ncec->ncec_ill))
+ under_nce = nce_fastpath_create(send_ill, ncec);
+ else
+ under_nce = NULL;
+ rw_exit(&ipst->ips_ill_g_lock);
+ if (under_nce != NULL && NCE_ISREACHABLE(ncec))
+ nce_fastpath_trigger(under_nce);
+
+ ill_refrele(send_ill);
+ if (src_ipif != NULL)
+ ipif_refrele(src_ipif);
+
+ if (under_nce != NULL) {
+ (void) ip_xmit(mp, under_nce, ixaflags, pkt_len, 0,
+ ALL_ZONES, 0, NULL);
+ nce_refrele(under_nce);
+ if (nprobes > 0)
+ nprobes--;
+ mp = nxt_mp;
+ continue;
+ }
+drop_pkt:
+ if (isv6) {
+ BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
+ } else {
+ BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
+ }
+ ip_drop_output("ipIfStatsOutDiscards - no under_ill", mp, NULL);
+ freemsg(mp);
+ if (nprobes > 0)
+ nprobes--;
+ mp = nxt_mp;
}
- return (ipif);
+ ncec_cb_dispatch(ncec); /* complete callbacks */
}
diff --git a/usr/src/uts/common/inet/ip/ip_netinfo.c b/usr/src/uts/common/inet/ip/ip_netinfo.c
index 8b97462d13..33e791adac 100644
--- a/usr/src/uts/common/inet/ip/ip_netinfo.c
+++ b/usr/src/uts/common/inet/ip/ip_netinfo.c
@@ -38,6 +38,7 @@
#include <sys/cmn_err.h>
#include <netinet/in.h>
+#include <inet/ipsec_impl.h>
#include <inet/common.h>
#include <inet/mib2.h>
#include <inet/ip.h>
@@ -89,6 +90,20 @@ static phy_if_t ipv6_routeto(net_handle_t, struct sockaddr *,
struct sockaddr *);
static int ipv6_isvalidchecksum(net_handle_t, mblk_t *);
+static int net_no_getmtu(net_handle_t, phy_if_t, lif_if_t);
+static int net_no_getpmtuenabled(net_handle_t);
+static lif_if_t net_no_lifgetnext(net_handle_t, phy_if_t, lif_if_t);
+static int net_no_inject(net_handle_t, inject_t, net_inject_t *);
+static phy_if_t net_no_routeto(net_handle_t, struct sockaddr *,
+ struct sockaddr *);
+static int net_no_ispartialchecksum(net_handle_t, mblk_t *);
+static int net_no_getlifaddr(net_handle_t, phy_if_t, lif_if_t,
+ size_t, net_ifaddr_t [], void *);
+static int net_no_getlifzone(net_handle_t, phy_if_t, lif_if_t,
+ zoneid_t *);
+static int net_no_getlifflags(net_handle_t, phy_if_t, lif_if_t,
+ uint64_t *);
+
/* Netinfo private functions */
static int ip_getifname_impl(phy_if_t, char *,
const size_t, boolean_t, ip_stack_t *);
@@ -111,7 +126,6 @@ static void ip_ni_queue_in_func(void *);
static void ip_ni_queue_out_func(void *);
static void ip_ni_queue_func_impl(injection_t *, boolean_t);
-
static net_protocol_t ipv4info = {
NETINFO_VERSION,
NHF_INET,
@@ -149,6 +163,24 @@ static net_protocol_t ipv6info = {
ipv6_isvalidchecksum
};
+static net_protocol_t arp_netinfo = {
+ NETINFO_VERSION,
+ NHF_ARP,
+ ip_getifname,
+ net_no_getmtu,
+ net_no_getpmtuenabled,
+ net_no_getlifaddr,
+ net_no_getlifzone,
+ net_no_getlifflags,
+ ip_phygetnext,
+ ip_phylookup,
+ net_no_lifgetnext,
+ net_no_inject,
+ net_no_routeto,
+ net_no_ispartialchecksum,
+ ip_isvalidchecksum
+};
+
/*
* The taskq eventq_queue_in is used to process the upside inject messages.
* The taskq eventq_queue_out is used to process the downside inject messages.
@@ -230,6 +262,9 @@ ip_net_init(ip_stack_t *ipst, netstack_t *ns)
ipst->ips_ipv6_net_data = net_protocol_register(id, &ipv6info);
ASSERT(ipst->ips_ipv6_net_data != NULL);
+
+ ipst->ips_arp_net_data = net_protocol_register(id, &arp_netinfo);
+ ASSERT(ipst->ips_arp_net_data != NULL);
}
@@ -248,6 +283,11 @@ ip_net_destroy(ip_stack_t *ipst)
if (net_protocol_unregister(ipst->ips_ipv6_net_data) == 0)
ipst->ips_ipv6_net_data = NULL;
}
+
+ if (ipst->ips_arp_net_data != NULL) {
+ if (net_protocol_unregister(ipst->ips_arp_net_data) == 0)
+ ipst->ips_arp_net_data = NULL;
+ }
}
/*
@@ -612,8 +652,7 @@ ip_getifname_impl(phy_if_t phy_ifdata,
ASSERT(buffer != NULL);
- ill = ill_lookup_on_ifindex((uint_t)phy_ifdata, isv6, NULL, NULL,
- NULL, NULL, ipst);
+ ill = ill_lookup_on_ifindex((uint_t)phy_ifdata, isv6, ipst);
if (ill == NULL)
return (1);
@@ -667,17 +706,17 @@ ip_getmtu_impl(phy_if_t phy_ifdata, lif_if_t ifdata, boolean_t isv6,
if (ipif == NULL)
return (0);
- mtu = ipif->ipif_mtu;
+ mtu = ipif->ipif_ill->ill_mtu;
ipif_refrele(ipif);
if (mtu == 0) {
ill_t *ill;
if ((ill = ill_lookup_on_ifindex((uint_t)phy_ifdata, isv6,
- NULL, NULL, NULL, NULL, ipst)) == NULL) {
+ ipst)) == NULL) {
return (0);
}
- mtu = ill->ill_max_frag;
+ mtu = ill->ill_mtu;
ill_refrele(ill);
}
@@ -760,8 +799,7 @@ ip_phylookup_impl(const char *name, boolean_t isv6, ip_stack_t *ipst)
phy_if_t phy;
ill_t *ill;
- ill = ill_lookup_on_name((char *)name, B_FALSE, isv6, NULL, NULL,
- NULL, NULL, NULL, ipst);
+ ill = ill_lookup_on_name((char *)name, B_FALSE, isv6, NULL, ipst);
if (ill == NULL)
return (0);
@@ -813,8 +851,7 @@ ip_lifgetnext_impl(phy_if_t phy_ifdata, lif_if_t ifdata, boolean_t isv6,
ipif_t *ipif;
ill_t *ill;
- ill = ill_lookup_on_ifindex(phy_ifdata, isv6, NULL, NULL,
- NULL, NULL, ipst);
+ ill = ill_lookup_on_ifindex(phy_ifdata, isv6, ipst);
if (ill == NULL)
return (0);
@@ -898,14 +935,10 @@ static int
ip_inject_impl(inject_t style, net_inject_t *packet, boolean_t isv6,
ip_stack_t *ipst)
{
- struct sockaddr_in6 *sin6;
ddi_taskq_t *tq = NULL;
void (* func)(void *);
injection_t *inject;
- ip6_t *ip6h;
- ire_t *ire;
mblk_t *mp;
- zoneid_t zoneid;
ASSERT(packet != NULL);
ASSERT(packet->ni_packet != NULL);
@@ -941,130 +974,44 @@ ip_inject_impl(inject_t style, net_inject_t *packet, boolean_t isv6,
tq = eventq_queue_out;
break;
- case NI_DIRECT_OUT:
- /*
- * Note:
- * For IPv4, the code path below will be greatly simplified
- * with the delivery of surya - it will become a single
- * function call to X. A follow on project is aimed to
- * provide similar functionality for IPv6.
- */
- mp = packet->ni_packet;
- zoneid =
- netstackid_to_zoneid(ipst->ips_netstack->netstack_stackid);
-
- if (!isv6) {
- struct sockaddr *sock;
-
- sock = (struct sockaddr *)&packet->ni_addr;
- /*
- * ipfil_sendpkt was provided by surya to ease the
- * problems associated with sending out a packet.
- * Currently this function only supports IPv4.
- */
- switch (ipfil_sendpkt(sock, mp, packet->ni_physical,
- zoneid)) {
- case 0 :
- case EINPROGRESS:
- return (0);
- case ECOMM :
- case ENONET :
- return (1);
- default :
- return (1);
- }
- /* NOTREACHED */
-
- }
-
- ip6h = (ip6_t *)mp->b_rptr;
- sin6 = (struct sockaddr_in6 *)&packet->ni_addr;
- ASSERT(sin6->sin6_family == AF_INET6);
-
- ire = ire_route_lookup_v6(&sin6->sin6_addr, 0, 0, 0,
- NULL, NULL, zoneid, NULL,
- MATCH_IRE_DSTONLY|MATCH_IRE_DEFAULT|MATCH_IRE_RECURSIVE,
- ipst);
+ case NI_DIRECT_OUT: {
+ struct sockaddr *sock;
- if (ire == NULL) {
- ip2dbg(("ip_inject: ire_cache_lookup failed\n"));
- freemsg(mp);
- return (1);
- }
-
- if (ire->ire_stq == NULL) {
- /* Send to loopback destination. */
- if (ire->ire_rfq == NULL) {
- ip2dbg(("ip_inject: bad nexthop\n"));
- ire_refrele(ire);
- freemsg(mp);
- return (1);
- }
- DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL,
- void_ip_t *, ip6h, __dtrace_ipsr_ill_t *,
- ire->ire_ipif->ipif_ill, ipha_t *, NULL, ip6_t *,
- ip6h, int, 1);
- ip_wput_local_v6(ire->ire_rfq,
- ire->ire_ipif->ipif_ill, ip6h, mp, ire, 0, zoneid);
- ire_refrele(ire);
- return (0);
- }
-
- mp->b_queue = ire->ire_stq;
-
- if (ire->ire_nce == NULL ||
- ire->ire_nce->nce_fp_mp == NULL &&
- ire->ire_nce->nce_res_mp == NULL) {
- ip_newroute_v6(ire->ire_stq, mp, &sin6->sin6_addr,
- &ip6h->ip6_src, NULL, zoneid, ipst);
+ mp = packet->ni_packet;
- ire_refrele(ire);
+ sock = (struct sockaddr *)&packet->ni_addr;
+ /*
+ * ipfil_sendpkt was provided by surya to ease the
+ * problems associated with sending out a packet.
+ */
+ switch (ipfil_sendpkt(sock, mp, packet->ni_physical,
+ netstackid_to_zoneid(
+ ipst->ips_netstack->netstack_stackid))) {
+ case 0 :
+ case EINPROGRESS:
return (0);
- } else {
- /* prepend L2 header for IPv6 packets. */
- mblk_t *llmp;
-
- /*
- * Lock IREs, see 6420438
- */
- mutex_enter(&ire->ire_lock);
- llmp = ire->ire_nce->nce_fp_mp ?
- ire->ire_nce->nce_fp_mp :
- ire->ire_nce->nce_res_mp;
-
- if ((mp = dupb(llmp)) == NULL &&
- (mp = copyb(llmp)) == NULL) {
- ip2dbg(("ip_inject: llhdr failed\n"));
- mutex_exit(&ire->ire_lock);
- ire_refrele(ire);
- freemsg(mp);
- return (1);
- }
- mutex_exit(&ire->ire_lock);
- linkb(mp, packet->ni_packet);
+ case ECOMM :
+ case ENONET :
+ return (1);
+ default :
+ return (1);
}
-
- mp->b_queue = ire->ire_stq;
-
- break;
+ /* NOTREACHED */
+ }
default:
freemsg(packet->ni_packet);
return (1);
}
- if (tq) {
- inject->inj_ptr = ipst;
- if (ddi_taskq_dispatch(tq, func, (void *)inject,
- DDI_SLEEP) == DDI_FAILURE) {
- ip2dbg(("ip_inject: ddi_taskq_dispatch failed\n"));
- freemsg(packet->ni_packet);
- return (1);
- }
- } else {
- putnext(ire->ire_stq, mp);
- ire_refrele(ire);
- }
+ ASSERT(tq != NULL);
+ inject->inj_ptr = ipst;
+ if (ddi_taskq_dispatch(tq, func, (void *)inject,
+ DDI_SLEEP) == DDI_FAILURE) {
+ ip2dbg(("ip_inject: ddi_taskq_dispatch failed\n"));
+ freemsg(packet->ni_packet);
+ return (1);
+ }
return (0);
}
@@ -1121,64 +1068,57 @@ ip_routeto_impl(struct sockaddr *address, struct sockaddr *nexthop,
struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)address;
struct sockaddr_in *next = (struct sockaddr_in *)nexthop;
struct sockaddr_in *sin = (struct sockaddr_in *)address;
- ire_t *sire = NULL;
ire_t *ire;
- ill_t *ill;
+ ire_t *nexthop_ire;
phy_if_t phy_if;
zoneid_t zoneid;
zoneid = netstackid_to_zoneid(ipst->ips_netstack->netstack_stackid);
if (address->sa_family == AF_INET6) {
- ire = ire_route_lookup_v6(&sin6->sin6_addr, NULL,
- 0, 0, NULL, &sire, zoneid, NULL,
- MATCH_IRE_DSTONLY|MATCH_IRE_DEFAULT|MATCH_IRE_RECURSIVE,
- ipst);
+ ire = ire_route_recursive_v6(&sin6->sin6_addr, 0, NULL,
+ zoneid, NULL, MATCH_IRE_DSTONLY, B_TRUE, 0, ipst, NULL,
+ NULL, NULL);
} else {
- ire = ire_route_lookup(sin->sin_addr.s_addr, 0,
- 0, 0, NULL, &sire, zoneid, NULL,
- MATCH_IRE_DSTONLY|MATCH_IRE_DEFAULT|MATCH_IRE_RECURSIVE,
- ipst);
+ ire = ire_route_recursive_v4(sin->sin_addr.s_addr, 0, NULL,
+ zoneid, NULL, MATCH_IRE_DSTONLY, B_TRUE, 0, ipst, NULL,
+ NULL, NULL);
}
-
- if (ire == NULL)
- return (0);
-
+ ASSERT(ire != NULL);
/*
* For some destinations, we have routes that are dead ends, so
* return to indicate that no physical interface can be used to
* reach the destination.
*/
- if ((ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) != 0) {
- if (sire != NULL)
- ire_refrele(sire);
+ if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
ire_refrele(ire);
- return (0);
+ return (0);
}
- ill = ire_to_ill(ire);
- if (ill == NULL) {
- if (sire != NULL)
- ire_refrele(sire);
+ nexthop_ire = ire_nexthop(ire);
+ if (nexthop_ire == NULL) {
+ ire_refrele(ire);
+ return (0);
+ }
+ if (nexthop_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
+ ire_refrele(nexthop_ire);
ire_refrele(ire);
return (0);
}
+ ASSERT(nexthop_ire->ire_ill != NULL);
+
if (nexthop != NULL) {
if (address->sa_family == AF_INET6) {
- next->sin_addr.s_addr = sire ? sire->ire_gateway_addr :
- sin->sin_addr.s_addr;
+ next6->sin6_addr = nexthop_ire->ire_addr_v6;
} else {
- next6->sin6_addr = sire ? sire->ire_gateway_addr_v6 :
- sin6->sin6_addr;
+ next->sin_addr.s_addr = nexthop_ire->ire_addr;
}
}
- ASSERT(ill != NULL);
- phy_if = (phy_if_t)ill->ill_phyint->phyint_ifindex;
- if (sire != NULL)
- ire_refrele(sire);
+ phy_if = (phy_if_t)nexthop_ire->ire_ill->ill_phyint->phyint_ifindex;
ire_refrele(ire);
+ ire_refrele(nexthop_ire);
return (phy_if);
}
@@ -1477,8 +1417,7 @@ ip_getlifflags_impl(sa_family_t family, phy_if_t phy_ifdata, lif_if_t ifdata,
ipif_t *ipif;
ill_t *ill;
- ill = ill_lookup_on_ifindex(phy_ifdata,
- (family == AF_INET6), NULL, NULL, NULL, NULL, ipst);
+ ill = ill_lookup_on_ifindex(phy_ifdata, (family == AF_INET6), ipst);
if (ill == NULL)
return (-1);
phyi = ill->ill_phyint;
@@ -1538,59 +1477,43 @@ static void
ip_ni_queue_func_impl(injection_t *inject, boolean_t out)
{
net_inject_t *packet;
- conn_t *conn;
ill_t *ill;
ip_stack_t *ipst = (ip_stack_t *)inject->inj_ptr;
+ ip_xmit_attr_t ixas;
ASSERT(inject != NULL);
packet = &inject->inj_data;
ASSERT(packet->ni_packet != NULL);
- ill = ill_lookup_on_ifindex((uint_t)packet->ni_physical,
- B_FALSE, NULL, NULL, NULL, NULL, ipst);
- if (ill == NULL) {
- kmem_free(inject, sizeof (*inject));
- return;
- }
-
if (out == 0) {
+ ill = ill_lookup_on_ifindex((uint_t)packet->ni_physical,
+ inject->inj_isv6, ipst);
+
+ if (ill == NULL) {
+ kmem_free(inject, sizeof (*inject));
+ return;
+ }
+
if (inject->inj_isv6) {
- ip_rput_v6(ill->ill_rq, packet->ni_packet);
+ ip_input_v6(ill, NULL, packet->ni_packet, NULL);
} else {
ip_input(ill, NULL, packet->ni_packet, NULL);
}
- kmem_free(inject, sizeof (*inject));
ill_refrele(ill);
- return;
- }
-
- /*
- * Even though ipcl_conn_create requests that it be passed
- * a different value for "TCP", in this case there may not
- * be a TCP connection backing the packet and more than
- * likely, non-TCP packets will go here too.
- */
- conn = ipcl_conn_create(IPCL_IPCCONN, KM_NOSLEEP, ipst->ips_netstack);
- if (conn != NULL) {
+ } else {
+ bzero(&ixas, sizeof (ixas));
+ ixas.ixa_ifindex = packet->ni_physical;
+ ixas.ixa_ipst = ipst;
if (inject->inj_isv6) {
- conn->conn_af_isv6 = B_TRUE;
- conn->conn_src_preferences = IPV6_PREFER_SRC_DEFAULT;
- conn->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
- ip_output_v6(conn, packet->ni_packet, ill->ill_wq,
- IP_WPUT);
+ ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6;
} else {
- conn->conn_af_isv6 = B_FALSE;
- conn->conn_pkt_isv6 = B_FALSE;
- conn->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
- ip_output(conn, packet->ni_packet, ill->ill_wq,
- IP_WPUT);
+ ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
}
-
- CONN_DEC_REF(conn);
+ (void) ip_output_simple(packet->ni_packet, &ixas);
+ ixa_cleanup(&ixas);
}
kmem_free(inject, sizeof (*inject));
- ill_refrele(ill);
}
/*
@@ -1623,3 +1546,152 @@ done:
kmem_free(info->hnei_event.hne_data, info->hnei_event.hne_datalen);
kmem_free(arg, sizeof (hook_nic_event_int_t));
}
+
+/*
+ * Initialize ARP hook family and events
+ */
+void
+arp_hook_init(ip_stack_t *ipst)
+{
+ HOOK_FAMILY_INIT(&ipst->ips_arproot, Hn_ARP);
+ if (net_family_register(ipst->ips_arp_net_data, &ipst->ips_arproot)
+ != 0) {
+ cmn_err(CE_NOTE, "arp_hook_init"
+ "net_family_register failed for arp");
+ }
+
+ HOOK_EVENT_INIT(&ipst->ips_arp_physical_in_event, NH_PHYSICAL_IN);
+ ipst->ips_arp_physical_in = net_event_register(ipst->ips_arp_net_data,
+ &ipst->ips_arp_physical_in_event);
+ if (ipst->ips_arp_physical_in == NULL) {
+ cmn_err(CE_NOTE, "arp_hook_init: "
+ "net_event_register failed for arp/physical_in");
+ }
+
+ HOOK_EVENT_INIT(&ipst->ips_arp_physical_out_event, NH_PHYSICAL_OUT);
+ ipst->ips_arp_physical_out = net_event_register(ipst->ips_arp_net_data,
+ &ipst->ips_arp_physical_out_event);
+ if (ipst->ips_arp_physical_out == NULL) {
+ cmn_err(CE_NOTE, "arp_hook_init: "
+ "net_event_register failed for arp/physical_out");
+ }
+
+ HOOK_EVENT_INIT(&ipst->ips_arp_nic_events, NH_NIC_EVENTS);
+ ipst->ips_arpnicevents = net_event_register(ipst->ips_arp_net_data,
+ &ipst->ips_arp_nic_events);
+ if (ipst->ips_arpnicevents == NULL) {
+ cmn_err(CE_NOTE, "arp_hook_init: "
+ "net_event_register failed for arp/nic_events");
+ }
+}
+
+void
+arp_hook_destroy(ip_stack_t *ipst)
+{
+ if (ipst->ips_arpnicevents != NULL) {
+ if (net_event_unregister(ipst->ips_arp_net_data,
+ &ipst->ips_arp_nic_events) == 0)
+ ipst->ips_arpnicevents = NULL;
+ }
+
+ if (ipst->ips_arp_physical_out != NULL) {
+ if (net_event_unregister(ipst->ips_arp_net_data,
+ &ipst->ips_arp_physical_out_event) == 0)
+ ipst->ips_arp_physical_out = NULL;
+ }
+
+ if (ipst->ips_arp_physical_in != NULL) {
+ if (net_event_unregister(ipst->ips_arp_net_data,
+ &ipst->ips_arp_physical_in_event) == 0)
+ ipst->ips_arp_physical_in = NULL;
+ }
+
+ (void) net_family_unregister(ipst->ips_arp_net_data,
+ &ipst->ips_arproot);
+}
+
+void
+arp_hook_shutdown(ip_stack_t *ipst)
+{
+ if (ipst->ips_arp_physical_in != NULL) {
+ (void) net_event_shutdown(ipst->ips_arp_net_data,
+ &ipst->ips_arp_physical_in_event);
+ }
+ if (ipst->ips_arp_physical_out != NULL) {
+ (void) net_event_shutdown(ipst->ips_arp_net_data,
+ &ipst->ips_arp_physical_out_event);
+ }
+ if (ipst->ips_arpnicevents != NULL) {
+ (void) net_event_shutdown(ipst->ips_arp_net_data,
+ &ipst->ips_arp_nic_events);
+ }
+}
+
+/* netinfo routines for the unsupported cases */
+
+/* ARGSUSED */
+int
+net_no_getmtu(net_handle_t handle, phy_if_t phy_ifdata, lif_if_t ifdata)
+{
+ return (-1);
+}
+
+/* ARGSUSED */
+static int
+net_no_getpmtuenabled(net_handle_t neti)
+{
+ return (-1);
+}
+
+/* ARGSUSED */
+static lif_if_t
+net_no_lifgetnext(net_handle_t neti, phy_if_t phy_ifdata, lif_if_t ifdata)
+{
+ return (-1);
+}
+
+/* ARGSUSED */
+static int
+net_no_inject(net_handle_t neti, inject_t style, net_inject_t *packet)
+{
+ return (-1);
+}
+
+/* ARGSUSED */
+static phy_if_t
+net_no_routeto(net_handle_t neti, struct sockaddr *address,
+ struct sockaddr *next)
+{
+ return ((phy_if_t)-1);
+}
+
+/* ARGSUSED */
+static int
+net_no_ispartialchecksum(net_handle_t neti, mblk_t *mp)
+{
+ return (-1);
+}
+
+/* ARGSUSED */
+static int
+net_no_getlifaddr(net_handle_t neti, phy_if_t phy_ifdata, lif_if_t ifdata,
+ size_t nelem, net_ifaddr_t type[], void *storage)
+{
+ return (-1);
+}
+
+/* ARGSUSED */
+static int
+net_no_getlifzone(net_handle_t neti, phy_if_t phy_ifdata, lif_if_t ifdata,
+ zoneid_t *zoneid)
+{
+ return (-1);
+}
+
+/* ARGSUSED */
+static int
+net_no_getlifflags(net_handle_t neti, phy_if_t phy_ifdata, lif_if_t ifdata,
+ uint64_t *flags)
+{
+ return (-1);
+}
diff --git a/usr/src/uts/common/inet/ip/ip_opt_data.c b/usr/src/uts/common/inet/ip/ip_opt_data.c
deleted file mode 100644
index e86e59f67d..0000000000
--- a/usr/src/uts/common/inet/ip/ip_opt_data.c
+++ /dev/null
@@ -1,301 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#include <sys/types.h>
-#include <sys/stream.h>
-#define _SUN_TPI_VERSION 2
-#include <sys/tihdr.h>
-#include <sys/socket.h>
-#include <sys/xti_inet.h>
-
-#include <inet/common.h>
-#include <netinet/ip6.h>
-#include <inet/ip.h>
-
-#include <netinet/in.h>
-#include <netinet/ip_mroute.h>
-#include <inet/optcom.h>
-
-
-extern int ip_opt_default(queue_t *q, int level, int name, uchar_t *ptr);
-extern int ip_opt_get(queue_t *q, int level, int name, uchar_t *ptr);
-extern int ip_opt_set(queue_t *q, uint_t optset_context, int level,
- int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
- void *dummy, cred_t *cr, mblk_t *first_mp);
-
-/*
- * Table of all known options handled on a IP protocol stack.
- *
- * Note: Not all of these options are available through all protocol stacks
- * For example, multicast options are not accessible in TCP over IP.
- * The filtering for that happens in option table at transport level.
- * Also, this table excludes any options processed exclusively at the
- * transport protocol level.
- */
-opdes_t ip_opt_arr[] = {
-
-{ SO_DONTROUTE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
-{ SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
-{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
-{ SO_PROTOTYPE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
-{ SO_ANON_MLP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
-{ SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
-{ SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
-
-{ SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int),
- 0 },
-
-
-{ IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP,
- (OP_VARLEN|OP_NODEFAULT),
- IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
-{ T_IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP,
- (OP_VARLEN|OP_NODEFAULT),
- IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
-
-{ IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
-{ T_IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
-{ IP_TTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
-{ IP_MULTICAST_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0,
- sizeof (struct in_addr), 0 /* INADDR_ANY */ },
-
-{ IP_MULTICAST_LOOP, IPPROTO_IP, OA_RW, OA_RW, OP_NP, (OP_DEF_FN),
- sizeof (uchar_t), -1 /* not initialized */},
-
-{ IP_MULTICAST_TTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, (OP_DEF_FN),
- sizeof (uchar_t), -1 /* not initialized */ },
-
-{ IP_ADD_MEMBERSHIP, IPPROTO_IP, OA_X, OA_X, OP_NP, (OP_NODEFAULT),
- sizeof (struct ip_mreq), -1 /* not initialized */ },
-
-{ IP_DROP_MEMBERSHIP, IPPROTO_IP, OA_X, OA_X, OP_NP, (OP_NODEFAULT),
- sizeof (struct ip_mreq), -1 /* not initialized */ },
-
-{ IP_BLOCK_SOURCE, IPPROTO_IP, OA_X, OA_X, OP_NP, (OP_NODEFAULT),
- sizeof (struct ip_mreq_source), -1 /* not initialized */ },
-
-{ IP_UNBLOCK_SOURCE, IPPROTO_IP, OA_X, OA_X, OP_NP, (OP_NODEFAULT),
- sizeof (struct ip_mreq_source), -1 /* not initialized */ },
-
-{ IP_ADD_SOURCE_MEMBERSHIP, IPPROTO_IP, OA_X, OA_X, OP_NP,
- (OP_NODEFAULT), sizeof (struct ip_mreq_source), -1 },
-
-{ IP_DROP_SOURCE_MEMBERSHIP, IPPROTO_IP, OA_X, OA_X, OP_NP,
- (OP_NODEFAULT), sizeof (struct ip_mreq_source), -1 },
-
-{ IP_RECVOPTS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
-
-{ IP_RECVDSTADDR, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
- },
-
-{ IP_RECVIF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
-
-{ IP_PKTINFO, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
-
-{ IP_RECVSLLA, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
-
-{ IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0,
- sizeof (int), 0 /* no ifindex */ },
-
-{ IP_DHCPINIT_IF, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0,
- sizeof (int), 0 },
-
-{ IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0,
- sizeof (int), 0 },
-
-{ IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, (OP_NODEFAULT),
- sizeof (ipsec_req_t), -1 /* not initialized */ },
-
-{ IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0,
- sizeof (in_addr_t), -1 /* not initialized */ },
-
-{ MRT_INIT, IPPROTO_IP, 0, OA_X, OP_CONFIG,
- (OP_NODEFAULT), sizeof (int), -1 /* not initialized */ },
-
-{ MRT_DONE, IPPROTO_IP, 0, OA_X, OP_CONFIG,
- (OP_NODEFAULT), 0, -1 /* not initialized */ },
-
-{ MRT_ADD_VIF, IPPROTO_IP, 0, OA_X, OP_CONFIG, (OP_NODEFAULT),
- sizeof (struct vifctl), -1 /* not initialized */ },
-
-{ MRT_DEL_VIF, IPPROTO_IP, 0, OA_X, OP_CONFIG, (OP_NODEFAULT),
- sizeof (vifi_t), -1 /* not initialized */ },
-
-{ MRT_ADD_MFC, IPPROTO_IP, 0, OA_X, OP_CONFIG, (OP_NODEFAULT),
- sizeof (struct mfcctl), -1 /* not initialized */ },
-
-{ MRT_DEL_MFC, IPPROTO_IP, 0, OA_X, OP_CONFIG, (OP_NODEFAULT),
- sizeof (struct mfcctl), -1 /* not initialized */ },
-
-{ MRT_VERSION, IPPROTO_IP, OA_R, OA_R, OP_NP, (OP_NODEFAULT),
- sizeof (int), -1 /* not initialized */ },
-
-{ MRT_ASSERT, IPPROTO_IP, 0, OA_RW, OP_CONFIG, (OP_NODEFAULT),
- sizeof (int), -1 /* not initialized */ },
-
-{ MCAST_JOIN_GROUP, IPPROTO_IP, OA_X, OA_X, OP_NP,
- (OP_NODEFAULT), sizeof (struct group_req),
- -1 /* not initialized */ },
-{ MCAST_LEAVE_GROUP, IPPROTO_IP, OA_X, OA_X, OP_NP,
- (OP_NODEFAULT), sizeof (struct group_req),
- -1 /* not initialized */ },
-{ MCAST_BLOCK_SOURCE, IPPROTO_IP, OA_X, OA_X, OP_NP,
- (OP_NODEFAULT), sizeof (struct group_source_req),
- -1 /* not initialized */ },
-{ MCAST_UNBLOCK_SOURCE, IPPROTO_IP, OA_X, OA_X, OP_NP,
- (OP_NODEFAULT), sizeof (struct group_source_req),
- -1 /* not initialized */ },
-{ MCAST_JOIN_SOURCE_GROUP, IPPROTO_IP, OA_X, OA_X, OP_NP,
- (OP_NODEFAULT), sizeof (struct group_source_req),
- -1 /* not initialized */ },
-{ MCAST_LEAVE_SOURCE_GROUP, IPPROTO_IP, OA_X, OA_X, OP_NP,
- (OP_NODEFAULT), sizeof (struct group_source_req),
- -1 /* not initialized */ },
-
-{ IPV6_MULTICAST_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
- sizeof (int), 0 },
-
-{ IPV6_MULTICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
- (OP_DEF_FN), sizeof (int), -1 /* not initialized */ },
-
-{ IPV6_MULTICAST_LOOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
- (OP_DEF_FN), sizeof (int), -1 /* not initialized */},
-
-{ IPV6_JOIN_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP, (OP_NODEFAULT),
- sizeof (struct ipv6_mreq), -1 /* not initialized */ },
-
-{ IPV6_LEAVE_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP,
- (OP_NODEFAULT),
- sizeof (struct ipv6_mreq), -1 /* not initialized */ },
-
-{ IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
- (OP_DEF_FN), sizeof (int), -1 /* not initialized */ },
-
-{ IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
- sizeof (int), 0 /* no ifindex */ },
-
-{ IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0,
- sizeof (int), 0 },
-
-{ IPV6_PKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
- (OP_NODEFAULT|OP_VARLEN),
- sizeof (struct in6_pktinfo), -1 /* not initialized */ },
-{ IPV6_HOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
- (OP_NODEFAULT|OP_VARLEN),
- sizeof (int), -1 /* not initialized */ },
-{ IPV6_NEXTHOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
- (OP_NODEFAULT|OP_VARLEN),
- sizeof (sin6_t), -1 /* not initialized */ },
-{ IPV6_HOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
- (OP_VARLEN|OP_NODEFAULT), 255*8,
- -1 /* not initialized */ },
-{ IPV6_DSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
- (OP_VARLEN|OP_NODEFAULT), 255*8,
- -1 /* not initialized */ },
-{ IPV6_RTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
- (OP_VARLEN|OP_NODEFAULT), 255*8,
- -1 /* not initialized */ },
-{ IPV6_RTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
- (OP_VARLEN|OP_NODEFAULT), 255*8,
- -1 /* not initialized */ },
-{ IPV6_TCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
- (OP_NODEFAULT|OP_VARLEN),
- sizeof (int), -1 /* not initialized */ },
-{ IPV6_PATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
- sizeof (struct ip6_mtuinfo), -1 },
-{ IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
- sizeof (int), 0 },
-{ IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
- sizeof (int), -1 },
-{ IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
- sizeof (int), 0 },
-
-/* Enable receipt of ancillary data */
-{ IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
- sizeof (int), 0 },
-{ IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
- sizeof (int), 0 },
-{ IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
- sizeof (int), 0 },
-{ _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
- sizeof (int), 0 },
-{ IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
- sizeof (int), 0 },
-{ IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
- sizeof (int), 0 },
-{ IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
- sizeof (int), 0 },
-{ IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
- sizeof (int), 0 },
-{ IPV6_RECVPATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
- sizeof (int), 0 },
-
-{ IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, (OP_NODEFAULT),
- sizeof (ipsec_req_t), -1 /* not initialized */ },
-{ IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
- sizeof (uint32_t), IPV6_PREFER_SRC_DEFAULT },
-
-{ MCAST_JOIN_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP,
- (OP_NODEFAULT), sizeof (struct group_req),
- -1 /* not initialized */ },
-{ MCAST_LEAVE_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP,
- (OP_NODEFAULT), sizeof (struct group_req),
- -1 /* not initialized */ },
-{ MCAST_BLOCK_SOURCE, IPPROTO_IPV6, OA_X, OA_X, OP_NP,
- (OP_NODEFAULT), sizeof (struct group_source_req),
- -1 /* not initialized */ },
-{ MCAST_UNBLOCK_SOURCE, IPPROTO_IPV6, OA_X, OA_X, OP_NP,
- (OP_NODEFAULT), sizeof (struct group_source_req),
- -1 /* not initialized */ },
-{ MCAST_JOIN_SOURCE_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP,
- (OP_NODEFAULT), sizeof (struct group_source_req),
- -1 /* not initialized */ },
-{ MCAST_LEAVE_SOURCE_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP,
- (OP_NODEFAULT), sizeof (struct group_source_req),
- -1 /* not initialized */ },
-};
-
-
-#define IP_OPT_ARR_CNT A_CNT(ip_opt_arr)
-
-
-/*
- * Initialize option database object for IP
- *
- * This object represents database of options to search passed to
- * {sock,tpi}optcom_req() interface routine to take care of option
- * management and associated methods.
- */
-
-optdb_obj_t ip_opt_obj = {
- ip_opt_default, /* IP default value function pointer */
- ip_opt_get, /* IP get function pointer */
- ip_opt_set, /* IP set function pointer */
- B_FALSE, /* IP is NOT a tpi provider */
- IP_OPT_ARR_CNT, /* IP option database count of entries */
- ip_opt_arr, /* IP option database */
- 0, /* 0 - not needed if not top tpi provider */
- (optlevel_t *)0 /* null - not needed if not top tpi provider */
-};
diff --git a/usr/src/uts/common/inet/ip/ip_output.c b/usr/src/uts/common/inet/ip/ip_output.c
new file mode 100644
index 0000000000..a4940fd3e8
--- /dev/null
+++ b/usr/src/uts/common/inet/ip/ip_output.c
@@ -0,0 +1,2554 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/* Copyright (c) 1990 Mentat Inc. */
+
+#include <sys/types.h>
+#include <sys/stream.h>
+#include <sys/strsubr.h>
+#include <sys/dlpi.h>
+#include <sys/strsun.h>
+#include <sys/zone.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/cmn_err.h>
+#include <sys/debug.h>
+#include <sys/atomic.h>
+
+#include <sys/systm.h>
+#include <sys/param.h>
+#include <sys/kmem.h>
+#include <sys/sdt.h>
+#include <sys/socket.h>
+#include <sys/mac.h>
+#include <net/if.h>
+#include <net/if_arp.h>
+#include <net/route.h>
+#include <sys/sockio.h>
+#include <netinet/in.h>
+#include <net/if_dl.h>
+
+#include <inet/common.h>
+#include <inet/mi.h>
+#include <inet/mib2.h>
+#include <inet/nd.h>
+#include <inet/arp.h>
+#include <inet/snmpcom.h>
+#include <inet/kstatcom.h>
+
+#include <netinet/igmp_var.h>
+#include <netinet/ip6.h>
+#include <netinet/icmp6.h>
+#include <netinet/sctp.h>
+
+#include <inet/ip.h>
+#include <inet/ip_impl.h>
+#include <inet/ip6.h>
+#include <inet/ip6_asp.h>
+#include <inet/tcp.h>
+#include <inet/ip_multi.h>
+#include <inet/ip_if.h>
+#include <inet/ip_ire.h>
+#include <inet/ip_ftable.h>
+#include <inet/ip_rts.h>
+#include <inet/optcom.h>
+#include <inet/ip_ndp.h>
+#include <inet/ip_listutils.h>
+#include <netinet/igmp.h>
+#include <netinet/ip_mroute.h>
+#include <inet/ipp_common.h>
+
+#include <net/pfkeyv2.h>
+#include <inet/sadb.h>
+#include <inet/ipsec_impl.h>
+#include <inet/ipdrop.h>
+#include <inet/ip_netinfo.h>
+
+#include <sys/pattr.h>
+#include <inet/ipclassifier.h>
+#include <inet/sctp_ip.h>
+#include <inet/sctp/sctp_impl.h>
+#include <inet/udp_impl.h>
+#include <sys/sunddi.h>
+
+#include <sys/tsol/label.h>
+#include <sys/tsol/tnet.h>
+
+#ifdef DEBUG
+extern boolean_t skip_sctp_cksum;
+#endif
+
+static int ip_verify_nce(mblk_t *, ip_xmit_attr_t *);
+static int ip_verify_dce(mblk_t *, ip_xmit_attr_t *);
+static boolean_t ip_verify_lso(ill_t *, ip_xmit_attr_t *);
+static boolean_t ip_verify_zcopy(ill_t *, ip_xmit_attr_t *);
+static void ip_output_simple_broadcast(ip_xmit_attr_t *, mblk_t *);
+
+/*
+ * There are two types of output functions for IP used for different
+ * purposes:
+ * - ip_output_simple() is when sending ICMP errors, TCP resets, etc when there
+ * is no context in the form of a conn_t. However, there is a
+ * ip_xmit_attr_t that the callers use to influence interface selection
+ * (needed for ICMP echo as well as IPv6 link-locals) and IPsec.
+ *
+ * - conn_ip_output() is used when sending packets with a conn_t and
+ * ip_set_destination has been called to cache information. In that case
+ * various socket options are recorded in the ip_xmit_attr_t and should
+ * be taken into account.
+ */
+
+/*
+ * The caller *must* have called conn_connect() or ip_attr_connect()
+ * before calling conn_ip_output(). The caller needs to redo that each time
+ * the destination IP address or port changes, as well as each time there is
+ * a change to any socket option that would modify how packets are routed out
+ * of the box (e.g., SO_DONTROUTE, IP_NEXTHOP, IP_BOUND_IF).
+ *
+ * The ULP caller has to serialize the use of a single ip_xmit_attr_t.
+ * We assert for that here.
+ */
+int
+conn_ip_output(mblk_t *mp, ip_xmit_attr_t *ixa)
+{
+	iaflags_t	ixaflags = ixa->ixa_flags;
+	ire_t		*ire;
+	nce_t		*nce;
+	dce_t		*dce;
+	ill_t		*ill;
+	ip_stack_t	*ipst = ixa->ixa_ipst;
+	int		error;
+
+	/* We defer ipIfStatsHCOutRequests until an error or we have an ill */
+
+	ASSERT(ixa->ixa_ire != NULL);
+	/* Note there is no ixa_nce when reject and blackhole routes */
+	ASSERT(ixa->ixa_dce != NULL);	/* Could be default dce */
+
+#ifdef DEBUG
+	ASSERT(ixa->ixa_curthread == NULL);
+	ixa->ixa_curthread = curthread;
+#endif
+
+	/*
+	 * Even on labeled systems we can have a NULL ixa_tsl e.g.,
+	 * for IGMP/MLD traffic.
+	 */
+
+	ire = ixa->ixa_ire;
+
+	/*
+	 * If the ULP says the (old) IRE resulted in reachability we
+	 * record this before determining whether to use a new IRE.
+	 * No locking for performance reasons.
+	 */
+	if (ixaflags & IXAF_REACH_CONF)
+		ire->ire_badcnt = 0;
+
+	/*
+	 * Has routing changed since we cached the results of the lookup?
+	 *
+	 * This check captures all of:
+	 * - the cached ire being deleted (by means of the special
+	 *   IRE_GENERATION_CONDEMNED)
+	 * - A potentially better ire being added (ire_generation being
+	 *   increased)
+	 * - A deletion of the nexthop ire that was used when we did the
+	 *   lookup.
+	 * - An addition of a potentially better nexthop ire.
+	 * The last two are handled by walking and increasing the generation
+	 * number on all dependent IREs in ire_flush_cache().
+	 *
+	 * The check also handles all cases of RTF_REJECT and RTF_BLACKHOLE
+	 * since we ensure that each time we set ixa_ire to such an IRE we
+	 * make sure the ixa_ire_generation does not match (by using
+	 * IRE_GENERATION_VERIFY).
+	 */
+	if (ire->ire_generation != ixa->ixa_ire_generation) {
+		error = ip_verify_ire(mp, ixa);
+		if (error != 0) {
+			ip_drop_output("ipIfStatsOutDiscards - verify ire",
+			    mp, NULL);
+			goto drop;
+		}
+		ire = ixa->ixa_ire;
+		ASSERT(ire != NULL);
+		if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
+#ifdef DEBUG
+			ASSERT(ixa->ixa_curthread == curthread);
+			ixa->ixa_curthread = NULL;
+#endif
+			ire->ire_ob_pkt_count++;
+			/* ixa_dce might be condemned; use default one */
+			return ((ire->ire_sendfn)(ire, mp, mp->b_rptr, ixa,
+			    &ipst->ips_dce_default->dce_ident));
+		}
+		/*
+		 * If the ncec changed then ip_verify_ire already set
+		 * ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
+		 * so we can recheck the interface mtu.
+		 */
+
+		/*
+		 * Note that ire->ire_generation could already have changed.
+		 * We catch that next time we send a packet.
+		 */
+	}
+
+	/*
+	 * No need to lock access to ixa_nce since the ip_xmit_attr usage
+	 * is single threaded.
+	 */
+	ASSERT(ixa->ixa_nce != NULL);
+	nce = ixa->ixa_nce;
+	if (nce->nce_is_condemned) {
+		error = ip_verify_nce(mp, ixa);
+		/*
+		 * If the ZEROCOPY capability has become unavailable, we
+		 * copy the message and free the original one. We might
+		 * be copying more data than needed but it doesn't hurt
+		 * since such a change rarely happens.
+		 */
+		switch (error) {
+		case 0:
+			break;
+		case ENOTSUP: { /* ZEROCOPY */
+			mblk_t *nmp;
+
+			if ((nmp = copymsg(mp)) != NULL) {
+				freemsg(mp);
+				mp = nmp;
+
+				break;
+			}
+			/* FALLTHROUGH */
+		}
+		default:
+			ip_drop_output("ipIfStatsOutDiscards - verify nce",
+			    mp, NULL);
+			goto drop;
+		}
+		ire = ixa->ixa_ire;
+		ASSERT(ire != NULL);
+		if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
+#ifdef DEBUG
+			ASSERT(ixa->ixa_curthread == curthread);
+			ixa->ixa_curthread = NULL;
+#endif
+			ire->ire_ob_pkt_count++;
+			/* ixa_dce might be condemned; use default one */
+			return ((ire->ire_sendfn)(ire, mp, mp->b_rptr,
+			    ixa, &ipst->ips_dce_default->dce_ident));
+		}
+		ASSERT(ixa->ixa_nce != NULL);
+		nce = ixa->ixa_nce;
+
+		/*
+		 * Note that some other event could already have made
+		 * the new nce condemned. We catch that next time we
+		 * try to send a packet.
+		 */
+	}
+	/*
+	 * If there is no per-destination dce_t then we have a reference to
+	 * the default dce_t (which merely contains the dce_ipid).
+	 * The generation check captures both the introduction of a
+	 * per-destination dce_t (e.g., due to ICMP packet too big) and
+	 * any change to the per-destination dce (including it becoming
+	 * condemned by use of the special DCE_GENERATION_CONDEMNED).
+	 */
+	dce = ixa->ixa_dce;
+
+	/*
+	 * To avoid a periodic timer to increase the path MTU we
+	 * look at dce_last_change_time each time we send a packet.
+	 */
+	if ((dce->dce_flags & DCEF_PMTU) &&
+	    (TICK_TO_SEC(lbolt64) - dce->dce_last_change_time >
+	    ipst->ips_ip_pathmtu_interval)) {
+		/*
+		 * Older than 20 minutes. Drop the path MTU information.
+		 * Since the path MTU changes as a result of this, twiddle
+		 * ixa_dce_generation to make us go through the dce
+		 * verification code in conn_ip_output.
+		 */
+		mutex_enter(&dce->dce_lock);
+		dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU);
+		dce->dce_last_change_time = TICK_TO_SEC(lbolt64);
+		mutex_exit(&dce->dce_lock);
+		dce_increment_generation(dce);
+	}
+
+	if (dce->dce_generation != ixa->ixa_dce_generation) {
+		error = ip_verify_dce(mp, ixa);
+		if (error != 0) {
+			ip_drop_output("ipIfStatsOutDiscards - verify dce",
+			    mp, NULL);
+			goto drop;
+		}
+		dce = ixa->ixa_dce;
+
+		/*
+		 * Note that some other event could already have made the
+		 * new dce's generation number change.
+		 * We catch that next time we try to send a packet.
+		 */
+	}
+
+	ill = nce->nce_ill;
+
+	/*
+	 * An initial ixa_fragsize was set in ip_set_destination
+	 * and we update it if any routing changes above.
+	 * A change to ill_mtu with ifconfig will increase all dce_generation
+	 * so that we will detect that with the generation check.
+	 */
+
+	/*
+	 * Caller needs to make sure IXAF_VERIFY_SRC is not set if
+	 * conn_unspec_src.
+	 */
+	if ((ixaflags & IXAF_VERIFY_SOURCE) &&
+	    ixa->ixa_src_generation != ipst->ips_src_generation) {
+		/* Check if the IP source is still assigned to the host. */
+		uint_t gen;
+
+		if (!ip_verify_src(mp, ixa, &gen)) {
+			/* Don't send a packet with a source that isn't ours */
+			error = EADDRNOTAVAIL;
+			ip_drop_output("ipIfStatsOutDiscards - invalid src",
+			    mp, NULL);
+			goto drop;
+		}
+		/* The source is still valid - update the generation number */
+		ixa->ixa_src_generation = gen;
+	}
+
+	/*
+	 * We don't have an IRE when we fragment, hence ire_ob_pkt_count
+	 * can only count the use prior to fragmentation. However the MIB
+	 * counters on the ill will be incremented in post fragmentation.
+	 */
+	ire->ire_ob_pkt_count++;
+	BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests);
+
+	/*
+	 * Based on ire_type and ire_flags call one of:
+	 *	ire_send_local_v* - for IRE_LOCAL and IRE_LOOPBACK
+	 *	ire_send_multirt_v* - if RTF_MULTIRT
+	 *	ire_send_noroute_v* - if RTF_REJECT or RTF_BLACKHOLE
+	 *	ire_send_multicast_v* - for IRE_MULTICAST
+	 *	ire_send_broadcast_v4 - for IRE_BROADCAST
+	 *	ire_send_wire_v* - for the rest.
+	 */
+#ifdef DEBUG
+	ASSERT(ixa->ixa_curthread == curthread);
+	ixa->ixa_curthread = NULL;
+#endif
+	return ((ire->ire_sendfn)(ire, mp, mp->b_rptr, ixa, &dce->dce_ident));
+
+drop:
+	if (ixaflags & IXAF_IS_IPV4) {
+		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
+		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
+	} else {
+		BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsHCOutRequests);
+		BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
+	}
+	freemsg(mp);
+#ifdef DEBUG
+	ASSERT(ixa->ixa_curthread == curthread);
+	ixa->ixa_curthread = NULL;
+#endif
+	return (error);
+}
+
+/*
+ * Handle both IPv4 and IPv6. Sets the generation number
+ * to allow the caller to know when to call us again.
+ * Returns true if the source address in the packet is a valid source.
+ * We handle callers which try to send with a zero address (since we only
+ * get here if UNSPEC_SRC is not set).
+ */
+boolean_t
+ip_verify_src(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp)
+{
+	ip_stack_t	*ipst = ixa->ixa_ipst;
+
+	/*
+	 * Need to grab the generation number before we check to
+	 * avoid a race with a change to the set of local addresses.
+	 * No lock needed since the thread which updates the set of local
+	 * addresses uses ipif/ill locks and exits those (hence a store memory
+	 * barrier) before doing the atomic increase of ips_src_generation.
+	 */
+	if (generationp != NULL)
+		*generationp = ipst->ips_src_generation;
+
+	if (ixa->ixa_flags & IXAF_IS_IPV4) {
+		ipha_t	*ipha = (ipha_t *)mp->b_rptr;
+
+		if (ipha->ipha_src == INADDR_ANY)
+			return (B_FALSE);
+
+		return (ip_laddr_verify_v4(ipha->ipha_src, ixa->ixa_zoneid,
+		    ipst, B_FALSE) != IPVL_BAD);
+	} else {
+		ip6_t	*ip6h = (ip6_t *)mp->b_rptr;
+		uint_t	scopeid;
+
+		if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src))
+			return (B_FALSE);
+
+		if (ixa->ixa_flags & IXAF_SCOPEID_SET)
+			scopeid = ixa->ixa_scopeid;
+		else
+			scopeid = 0;
+
+		return (ip_laddr_verify_v6(&ip6h->ip6_src, ixa->ixa_zoneid,
+		    ipst, B_FALSE, scopeid) != IPVL_BAD);
+	}
+}
+
+/*
+ * Handle both IPv4 and IPv6. Reverify/recalculate the IRE to use.
+ */
+int
+ip_verify_ire(mblk_t *mp, ip_xmit_attr_t *ixa)
+{
+	uint_t		gen;
+	ire_t		*ire;
+	nce_t		*nce;
+	int		error;
+	boolean_t	multirt = B_FALSE;
+
+	/*
+	 * Redo ip_select_route.
+	 * Need to grab the generation number as part of the lookup to
+	 * avoid a race.
+	 */
+	error = 0;
+	ire = ip_select_route_pkt(mp, ixa, &gen, &error, &multirt);
+	ASSERT(ire != NULL);	/* IRE_NOROUTE if none found */
+	if (error != 0) {
+		ire_refrele(ire);
+		return (error);
+	}
+
+	if (ixa->ixa_ire != NULL)
+		ire_refrele_notr(ixa->ixa_ire);
+#ifdef DEBUG
+	ire_refhold_notr(ire);
+	ire_refrele(ire);
+#endif
+	ixa->ixa_ire = ire;
+	ixa->ixa_ire_generation = gen;
+	if (multirt) {
+		if (ixa->ixa_flags & IXAF_IS_IPV4)
+			ixa->ixa_postfragfn = ip_postfrag_multirt_v4;
+		else
+			ixa->ixa_postfragfn = ip_postfrag_multirt_v6;
+		ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST;
+	} else {
+		ixa->ixa_postfragfn = ire->ire_postfragfn;
+		ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST;
+	}
+
+	/*
+	 * Don't look for an nce for reject or blackhole.
+	 * They have ire_generation set to IRE_GENERATION_VERIFY which
+	 * makes conn_ip_output avoid references to ixa_nce.
+	 */
+	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
+		ASSERT(ixa->ixa_ire_generation == IRE_GENERATION_VERIFY);
+		ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
+		return (0);
+	}
+
+	/* The NCE could now be different */
+	nce = ire_to_nce_pkt(ire, mp);
+	if (nce == NULL) {
+		/*
+		 * Allocation failure. Make sure we redo ire/nce selection
+		 * next time we send.
+		 */
+		ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
+		ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
+		return (ENOBUFS);
+	}
+	if (nce == ixa->ixa_nce) {
+		/* No change */
+		nce_refrele(nce);
+		return (0);
+	}
+
+	/*
+	 * Since the path MTU might change as a result of this
+	 * route change, we twiddle ixa_dce_generation to
+	 * make conn_ip_output go through the ip_verify_dce code.
+	 */
+	ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
+
+	if (ixa->ixa_nce != NULL)
+		nce_refrele(ixa->ixa_nce);
+	ixa->ixa_nce = nce;
+	return (0);
+}
+
+/*
+ * Handle both IPv4 and IPv6. Reverify/recalculate the NCE to use.
+ */
+static int
+ip_verify_nce(mblk_t *mp, ip_xmit_attr_t *ixa)
+{
+	ire_t		*ire = ixa->ixa_ire;
+	nce_t		*nce;
+	int		error = 0;
+	ipha_t		*ipha = NULL;
+	ip6_t		*ip6h = NULL;
+
+	if (ire->ire_ipversion == IPV4_VERSION)
+		ipha = (ipha_t *)mp->b_rptr;
+	else
+		ip6h = (ip6_t *)mp->b_rptr;
+
+	nce = ire_handle_condemned_nce(ixa->ixa_nce, ire, ipha, ip6h, B_TRUE);
+	if (nce == NULL) {
+		/* Try to find a better ire */
+		return (ip_verify_ire(mp, ixa));
+	}
+
+	/*
+	 * The hardware offloading capabilities, for example LSO, of the
+	 * interface might have changed, so do sanity verification here.
+	 */
+	if (ixa->ixa_flags & IXAF_VERIFY_LSO) {
+		if (!ip_verify_lso(nce->nce_ill, ixa)) {
+			ASSERT(ixa->ixa_notify != NULL);
+			ixa->ixa_notify(ixa->ixa_notify_cookie, ixa,
+			    IXAN_LSO, 0);
+			error = ENOTSUP;
+		}
+	}
+
+	/*
+	 * Verify the ZEROCOPY capability of the underlying ill. Notify the
+	 * ULP about any ZEROCOPY changes. If the ZEROCOPY capability is no
+	 * longer available, return an error so that conn_ip_output() can
+	 * take care of the ZEROCOPY message properly. It's safe to continue
+	 * sending the message when ZEROCOPY newly becomes available.
+	 */
+	if (ixa->ixa_flags & IXAF_VERIFY_ZCOPY) {
+		if (!ip_verify_zcopy(nce->nce_ill, ixa)) {
+			ASSERT(ixa->ixa_notify != NULL);
+			ixa->ixa_notify(ixa->ixa_notify_cookie, ixa,
+			    IXAN_ZCOPY, 0);
+			if ((ixa->ixa_flags & IXAF_ZCOPY_CAPAB) == 0)
+				error = ENOTSUP;
+		}
+	}
+
+	/*
+	 * Since the path MTU might change as a result of this
+	 * change, we twiddle ixa_dce_generation to
+	 * make conn_ip_output go through the ip_verify_dce code.
+	 */
+	ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
+
+	nce_refrele(ixa->ixa_nce);
+	ixa->ixa_nce = nce;
+	return (error);
+}
+
+/*
+ * Handle both IPv4 and IPv6. Reverify/recalculate the DCE to use.
+ */
+static int
+ip_verify_dce(mblk_t *mp, ip_xmit_attr_t *ixa)
+{
+	dce_t		*dce;
+	uint_t		gen;
+	uint_t		pmtu;
+
+	dce = dce_lookup_pkt(mp, ixa, &gen);
+	ASSERT(dce != NULL);
+
+	dce_refrele_notr(ixa->ixa_dce);
+#ifdef DEBUG
+	dce_refhold_notr(dce);
+	dce_refrele(dce);
+#endif
+	ixa->ixa_dce = dce;
+	ixa->ixa_dce_generation = gen;
+
+	/* Extract the (path) mtu from the dce, ncec_ill etc */
+	pmtu = ip_get_pmtu(ixa);
+
+	/*
+	 * Tell ULP about PMTU changes - increase or decrease - by returning
+	 * an error if IXAF_VERIFY_PMTU is set. In such case, ULP should update
+	 * both ixa_pmtu and ixa_fragsize appropriately.
+	 *
+	 * If ULP doesn't set that flag then we need to update ixa_fragsize
+	 * since routing could have changed the ill after ixa_fragsize
+	 * was set previously in the conn_ip_output path or in
+	 * ip_set_destination.
+	 *
+	 * In case of LSO, ixa_fragsize might be greater than ixa_pmtu.
+	 *
+	 * In the case of a path MTU increase we send the packet after the
+	 * notify to the ULP.
+	 */
+	if (ixa->ixa_flags & IXAF_VERIFY_PMTU) {
+		if (ixa->ixa_pmtu != pmtu) {
+			uint_t oldmtu = ixa->ixa_pmtu;
+
+			DTRACE_PROBE2(verify_pmtu, uint32_t, pmtu,
+			    uint32_t, ixa->ixa_pmtu);
+			ASSERT(ixa->ixa_notify != NULL);
+			ixa->ixa_notify(ixa->ixa_notify_cookie, ixa,
+			    IXAN_PMTU, pmtu);
+			if (pmtu < oldmtu)
+				return (EMSGSIZE);
+		}
+	} else {
+		ixa->ixa_fragsize = pmtu;
+	}
+	return (0);
+}
+
+/*
+ * Verify LSO usability. Keep the return value simple to indicate whether
+ * the LSO capability has changed. Handle both IPv4 and IPv6.
+ */
+static boolean_t
+ip_verify_lso(ill_t *ill, ip_xmit_attr_t *ixa)
+{
+	ill_lso_capab_t	*lsoc = &ixa->ixa_lso_capab;
+	ill_lso_capab_t	*new_lsoc = ill->ill_lso_capab;
+
+	if (ixa->ixa_flags & IXAF_LSO_CAPAB) {
+		/*
+		 * Check whether LSO is no longer usable.
+		 */
+		if ((ixa->ixa_flags & IXAF_IPSEC_SECURE) ||
+		    (ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) ||
+		    (ixa->ixa_ire->ire_flags & RTF_MULTIRT) ||
+		    ((ixa->ixa_flags & IXAF_IS_IPV4) ?
+		    !ILL_LSO_TCP_IPV4_USABLE(ill) :
+		    !ILL_LSO_TCP_IPV6_USABLE(ill))) {
+			ixa->ixa_flags &= ~IXAF_LSO_CAPAB;
+
+			return (B_FALSE);
+		}
+
+		/*
+		 * If the capability has changed, refresh the copy in the ixa.
+		 */
+		if (lsoc->ill_lso_max != new_lsoc->ill_lso_max) {
+			*lsoc = *new_lsoc;
+
+			return (B_FALSE);
+		}
+	} else { /* Was not usable */
+		if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE) &&
+		    !(ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) &&
+		    !(ixa->ixa_ire->ire_flags & RTF_MULTIRT) &&
+		    ((ixa->ixa_flags & IXAF_IS_IPV4) ?
+		    ILL_LSO_TCP_IPV4_USABLE(ill) :
+		    ILL_LSO_TCP_IPV6_USABLE(ill))) {
+			*lsoc = *new_lsoc;
+			ixa->ixa_flags |= IXAF_LSO_CAPAB;
+
+			return (B_FALSE);
+		}
+	}
+
+	return (B_TRUE);
+}
+
+/*
+ * Verify ZEROCOPY usability. Keep the return value simple to indicate whether
+ * the ZEROCOPY capability has changed. Handle both IPv4 and IPv6.
+ */
+static boolean_t
+ip_verify_zcopy(ill_t *ill, ip_xmit_attr_t *ixa)
+{
+	if (ixa->ixa_flags & IXAF_ZCOPY_CAPAB) {
+		/*
+		 * Check whether ZEROCOPY is no longer usable.
+		 */
+		if ((ixa->ixa_flags & IXAF_IPSEC_SECURE) ||
+		    (ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) ||
+		    (ixa->ixa_ire->ire_flags & RTF_MULTIRT) ||
+		    !ILL_ZCOPY_USABLE(ill)) {
+			ixa->ixa_flags &= ~IXAF_ZCOPY_CAPAB;
+
+			return (B_FALSE);
+		}
+	} else { /* Was not usable */
+		if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE) &&
+		    !(ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) &&
+		    !(ixa->ixa_ire->ire_flags & RTF_MULTIRT) &&
+		    ILL_ZCOPY_USABLE(ill)) {
+			ixa->ixa_flags |= IXAF_ZCOPY_CAPAB;
+
+			return (B_FALSE);
+		}
+	}
+
+	return (B_TRUE);
+}
+
+
+/*
+ * When there is no conn_t context, this will send a packet.
+ * The caller must *not* have called conn_connect() or ip_attr_connect()
+ * before calling ip_output_simple().
+ * Handles IPv4 and IPv6. Returns zero or an errno such as ENETUNREACH.
+ * Honors IXAF_SET_SOURCE.
+ *
+ * We acquire the ire and after calling ire_sendfn we release
+ * the hold on the ire. Ditto for the nce and dce.
+ *
+ * This assumes that the caller has set the following in ip_xmit_attr_t:
+ *	ixa_tsl, ixa_zoneid, and ixa_ipst must always be set.
+ *	If ixa_ifindex is non-zero it means send out that ill. (If it is
+ *	an upper IPMP ill we load balance across the group; if a lower we send
+ *	on that lower ill without load balancing.)
+ *	IXAF_IS_IPV4 must be set correctly.
+ *	If IXAF_IPSEC_SECURE is set then the ixa_ipsec_* fields must be set.
+ *	If IXAF_NO_IPSEC is set we skip the IPsec policy lookup.
+ *	If neither of those two are set we do an IPsec policy lookup.
+ *
+ * We handle setting things like
+ *	ixa_pktlen
+ *	ixa_ip_hdr_length
+ *	ixa->ixa_protocol
+ *
+ * The caller may set ixa_xmit_hint, which is used for ECMP selection and
+ * transmit ring selecting in GLD.
+ *
+ * The caller must do an ixa_cleanup() to release any IPsec references
+ * after we return.
+ */
+int
+ip_output_simple(mblk_t *mp, ip_xmit_attr_t *ixa)
+{
+	ts_label_t	*effective_tsl = NULL;
+	int		err;
+
+	ASSERT(ixa->ixa_ipst != NULL);
+
+	if (is_system_labeled()) {
+		ip_stack_t *ipst = ixa->ixa_ipst;
+
+		if (ixa->ixa_flags & IXAF_IS_IPV4) {
+			err = tsol_check_label_v4(ixa->ixa_tsl, ixa->ixa_zoneid,
+			    &mp, CONN_MAC_DEFAULT, B_FALSE, ixa->ixa_ipst,
+			    &effective_tsl);
+		} else {
+			err = tsol_check_label_v6(ixa->ixa_tsl, ixa->ixa_zoneid,
+			    &mp, CONN_MAC_DEFAULT, B_FALSE, ixa->ixa_ipst,
+			    &effective_tsl);
+		}
+		if (err != 0) {
+			ip2dbg(("tsol_check: label check failed (%d)\n", err));
+			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
+			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
+			ip_drop_output("tsol_check_label", mp, NULL);
+			freemsg(mp);
+			return (err);
+		}
+		if (effective_tsl != NULL) {
+			/* Update the label */
+			ip_xmit_attr_replace_tsl(ixa, effective_tsl);
+		}
+	}
+
+	if (ixa->ixa_flags & IXAF_IS_IPV4)
+		return (ip_output_simple_v4(mp, ixa));
+	else
+		return (ip_output_simple_v6(mp, ixa));
+}
+
+int
+ip_output_simple_v4(mblk_t *mp, ip_xmit_attr_t *ixa)
+{
+	ipha_t		*ipha;
+	ipaddr_t	firsthop; /* In IP header */
+	ipaddr_t	dst;	/* End of source route, or ipha_dst if none */
+	ire_t		*ire;
+	ipaddr_t	setsrc;	/* RTF_SETSRC */
+	int		error;
+	ill_t		*ill = NULL;
+	dce_t		*dce = NULL;
+	nce_t		*nce;
+	iaflags_t	ixaflags = ixa->ixa_flags;
+	ip_stack_t	*ipst = ixa->ixa_ipst;
+	boolean_t	repeat = B_FALSE;
+	boolean_t	multirt = B_FALSE;
+
+	ipha = (ipha_t *)mp->b_rptr;
+	ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);
+
+	/*
+	 * Even on labeled systems we can have a NULL ixa_tsl e.g.,
+	 * for IGMP/MLD traffic.
+	 */
+
+	/* Caller already set flags */
+	ASSERT(ixa->ixa_flags & IXAF_IS_IPV4);
+
+	ASSERT(ixa->ixa_nce == NULL);
+
+	ixa->ixa_pktlen = ntohs(ipha->ipha_length);
+	ASSERT(ixa->ixa_pktlen == msgdsize(mp));
+	ixa->ixa_ip_hdr_length = IPH_HDR_LENGTH(ipha);
+	ixa->ixa_protocol = ipha->ipha_protocol;
+
+	/*
+	 * Assumes that source routed packets have already been massaged by
+	 * the ULP (ip_massage_options) and as a result ipha_dst is the next
+	 * hop in the source route. The final destination is used for IPsec
+	 * policy and DCE lookup.
+	 */
+	firsthop = ipha->ipha_dst;
+	dst = ip_get_dst(ipha);
+
+repeat_ire:
+	error = 0;
+	setsrc = INADDR_ANY;
+	ire = ip_select_route_v4(firsthop, ixa, NULL, &setsrc, &error,
+	    &multirt);
+	ASSERT(ire != NULL);	/* IRE_NOROUTE if none found */
+	if (error != 0) {
+		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
+		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
+		ip_drop_output("ipIfStatsOutDiscards - select route", mp, NULL);
+		freemsg(mp);
+		goto done;
+	}
+
+	if (ire->ire_flags & (RTF_BLACKHOLE|RTF_REJECT)) {
+		/* ire_ill might be NULL hence need to skip some code */
+		if (ixaflags & IXAF_SET_SOURCE)
+			ipha->ipha_src = htonl(INADDR_LOOPBACK);
+		ixa->ixa_fragsize = IP_MAXPACKET;
+		ill = NULL;
+		nce = NULL;
+		ire->ire_ob_pkt_count++;
+		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
+		/* No dce yet; use default one */
+		error = (ire->ire_sendfn)(ire, mp, ipha, ixa,
+		    &ipst->ips_dce_default->dce_ident);
+		goto done;
+	}
+
+	/* Note that ipha_dst is only used for IRE_MULTICAST */
+	nce = ire_to_nce(ire, ipha->ipha_dst, NULL);
+	if (nce == NULL) {
+		/* Allocation failure? */
+		ip_drop_output("ire_to_nce", mp, ill);
+		freemsg(mp);
+		error = ENOBUFS;
+		goto done;
+	}
+	if (nce->nce_is_condemned) {
+		nce_t *nce1;
+
+		nce1 = ire_handle_condemned_nce(nce, ire, ipha, NULL, B_TRUE);
+		nce_refrele(nce);
+		if (nce1 == NULL) {
+			if (!repeat) {
+				/* Try finding a better IRE */
+				repeat = B_TRUE;
+				ire_refrele(ire);
+				goto repeat_ire;
+			}
+			/* Tried twice - drop packet */
+			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
+			ip_drop_output("No nce", mp, ill);
+			freemsg(mp);
+			error = ENOBUFS;
+			goto done;
+		}
+		nce = nce1;
+	}
+
+	/*
+	 * For multicast with multirt we have a flag passed back from
+	 * ire_lookup_multi_ill_v4 since we don't have an IRE for each
+	 * possible multicast address.
+	 * We also need a flag for multicast since we can't check
+	 * whether RTF_MULTIRT is set in ixa_ire for multicast.
+	 */
+	if (multirt) {
+		ixa->ixa_postfragfn = ip_postfrag_multirt_v4;
+		ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST;
+	} else {
+		ixa->ixa_postfragfn = ire->ire_postfragfn;
+		ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST;
+	}
+	ASSERT(ixa->ixa_nce == NULL);
+	ixa->ixa_nce = nce;
+
+	/*
+	 * Check for a dce_t with a path mtu.
+	 */
+	dce = dce_lookup_v4(dst, ipst, NULL);
+	ASSERT(dce != NULL);
+
+	if (!(ixaflags & IXAF_PMTU_DISCOVERY)) {
+		ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire);
+	} else if (dce->dce_flags & DCEF_PMTU) {
+		/*
+		 * To avoid a periodic timer to increase the path MTU we
+		 * look at dce_last_change_time each time we send a packet.
+		 */
+		if (TICK_TO_SEC(lbolt64) - dce->dce_last_change_time >
+		    ipst->ips_ip_pathmtu_interval) {
+			/*
+			 * Older than 20 minutes. Drop the path MTU information.
+			 */
+			mutex_enter(&dce->dce_lock);
+			dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU);
+			dce->dce_last_change_time = TICK_TO_SEC(lbolt64);
+			mutex_exit(&dce->dce_lock);
+			dce_increment_generation(dce);
+			ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire);
+		} else {
+			uint_t fragsize;
+
+			fragsize = ip_get_base_mtu(nce->nce_ill, ire);
+			if (fragsize > dce->dce_pmtu)
+				fragsize = dce->dce_pmtu;
+			ixa->ixa_fragsize = fragsize;
+		}
+	} else {
+		ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire);
+	}
+
+	/*
+	 * We use ire_nexthop_ill (and not ncec_ill) to avoid the under-IPMP
+	 * interface for source address selection.
+	 */
+	ill = ire_nexthop_ill(ire);
+
+	if (ixaflags & IXAF_SET_SOURCE) {
+		ipaddr_t	src;
+
+		/*
+		 * We use the final destination to get
+		 * correct selection for source routed packets
+		 */
+
+		/* If unreachable we have no ill but need some source */
+		if (ill == NULL) {
+			src = htonl(INADDR_LOOPBACK);
+			error = 0;
+		} else {
+			error = ip_select_source_v4(ill, setsrc, dst,
+			    ixa->ixa_multicast_ifaddr, ixa->ixa_zoneid, ipst,
+			    &src, NULL, NULL);
+		}
+		if (error != 0) {
+			BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests);
+			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
+			ip_drop_output("ipIfStatsOutDiscards - no source",
+			    mp, ill);
+			freemsg(mp);
+			goto done;
+		}
+		ipha->ipha_src = src;
+	} else if (ixaflags & IXAF_VERIFY_SOURCE) {
+		/* Check if the IP source is assigned to the host. */
+		if (!ip_verify_src(mp, ixa, NULL)) {
+			/* Don't send a packet with a source that isn't ours */
+			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
+			BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
+			ip_drop_output("ipIfStatsOutDiscards - invalid source",
+			    mp, ill);
+			freemsg(mp);
+			error = EADDRNOTAVAIL;
+			goto done;
+		}
+	}
+
+
+	/*
+	 * Check against global IPsec policy to set the AH/ESP attributes.
+	 * IPsec will set IXAF_IPSEC_* and ixa_ipsec_* as appropriate.
+	 */
+	if (!(ixaflags & (IXAF_NO_IPSEC|IXAF_IPSEC_SECURE))) {
+		ASSERT(ixa->ixa_ipsec_policy == NULL);
+		mp = ip_output_attach_policy(mp, ipha, NULL, NULL, ixa);
+		if (mp == NULL) {
+			/* MIB and ip_drop_packet already done */
+			return (EHOSTUNREACH);	/* IPsec policy failure */
+		}
+	}
+
+	if (ill != NULL) {
+		BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests);
+	} else {
+		BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
+	}
+
+	/*
+	 * We update the statistics on the most specific IRE i.e., the first
+	 * one we found.
+	 * We don't have an IRE when we fragment, hence ire_ob_pkt_count
+	 * can only count the use prior to fragmentation. However the MIB
+	 * counters on the ill will be incremented in post fragmentation.
+	 */
+	ire->ire_ob_pkt_count++;
+
+	/*
+	 * Based on ire_type and ire_flags call one of:
+	 *	ire_send_local_v4 - for IRE_LOCAL and IRE_LOOPBACK
+	 *	ire_send_multirt_v4 - if RTF_MULTIRT
+	 *	ire_send_noroute_v4 - if RTF_REJECT or RTF_BLACKHOLE
+	 *	ire_send_multicast_v4 - for IRE_MULTICAST
+	 *	ire_send_broadcast_v4 - for IRE_BROADCAST
+	 *	ire_send_wire_v4 - for the rest.
+	 */
+	error = (ire->ire_sendfn)(ire, mp, ipha, ixa, &dce->dce_ident);
+done:
+	ire_refrele(ire);
+	if (dce != NULL)
+		dce_refrele(dce);
+	if (ill != NULL)
+		ill_refrele(ill);
+	if (ixa->ixa_nce != NULL)
+		nce_refrele(ixa->ixa_nce);
+	ixa->ixa_nce = NULL;
+	return (error);
+}
+
+/*
+ * ire_sendfn() functions.
+ * These functions use the following xmit_attr:
+ * - ixa_fragsize - read to determine whether or not to fragment
+ * - IXAF_IPSEC_SECURE - to determine whether or not to invoke IPsec
+ * - ixa_ipsec_* are used inside IPsec
+ * - IXAF_SET_SOURCE - replace IP source in broadcast case.
+ * - IXAF_LOOPBACK_COPY - for multicast and broadcast
+ */
+
+
+/*
+ * ire_sendfn for IRE_LOCAL and IRE_LOOPBACK
+ *
+ * Delivers a packet addressed to this host: after running the loopback
+ * firewall hooks and observability taps it hands the packet straight to
+ * ip_fanout_v4(). There is no wire transmission here.
+ *
+ * The checks for restrict_interzone_loopback are done in ire_route_recursive.
+ *
+ * Returns zero on success and on silent drops; if a firewall hook consumes
+ * the packet the hook's errno is returned instead.
+ */
+/* ARGSUSED4 */
+int
+ire_send_local_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
+    ip_xmit_attr_t *ixa, uint32_t *identp)
+{
+	ipha_t *ipha = (ipha_t *)iph_arg;
+	ip_stack_t *ipst = ixa->ixa_ipst;
+	ill_t *ill = ire->ire_ill;
+	ip_recv_attr_t iras; /* NOTE: No bzero for performance */
+	uint_t pktlen = ixa->ixa_pktlen;
+
+	/*
+	 * No fragmentation, no nce, no application of IPsec,
+	 * and no ipha_ident assignment.
+	 *
+	 * Note different order between IP provider and FW_HOOKS than in
+	 * send_wire case.
+	 */
+
+	/*
+	 * DTrace this as ip:::send. A packet blocked by FW_HOOKS will fire the
+	 * send probe, but not the receive probe.
+	 */
+	DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
+	    ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL,
+	    int, 1);
+
+	/* Outbound loopback firewall hooks; FW_HOOKS may consume mp */
+	if (HOOKS4_INTERESTED_LOOPBACK_OUT(ipst)) {
+		int error;
+
+		DTRACE_PROBE4(ip4__loopback__out__start, ill_t *, NULL,
+		    ill_t *, ill, ipha_t *, ipha, mblk_t *, mp);
+		FW_HOOKS(ipst->ips_ip4_loopback_out_event,
+		    ipst->ips_ipv4firewall_loopback_out,
+		    NULL, ill, ipha, mp, mp, 0, ipst, error);
+		DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, mp);
+		if (mp == NULL)
+			return (error);
+
+		/*
+		 * Even if the destination was changed by the filter we use the
+		 * forwarding decision that was made based on the address
+		 * in ip_output/ip_set_destination.
+		 */
+		/* Length could be different */
+		ipha = (ipha_t *)mp->b_rptr;
+		pktlen = ntohs(ipha->ipha_length);
+	}
+
+	/*
+	 * If a callback is enabled then we need to know the
+	 * source and destination zoneids for the packet. We already
+	 * have those handy.
+	 */
+	if (ipst->ips_ip4_observe.he_interested) {
+		zoneid_t szone, dzone;
+		zoneid_t stackzoneid;
+
+		stackzoneid = netstackid_to_zoneid(
+		    ipst->ips_netstack->netstack_stackid);
+
+		if (stackzoneid == GLOBAL_ZONEID) {
+			/* Shared-IP zone */
+			dzone = ire->ire_zoneid;
+			szone = ixa->ixa_zoneid;
+		} else {
+			/* Exclusive-IP zone: both endpoints are that zone */
+			szone = dzone = stackzoneid;
+		}
+		ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill, ipst);
+	}
+
+	/* Handle lo0 stats */
+	ipst->ips_loopback_packets++;
+
+	/* Map ixa to ira including IPsec policies */
+	ipsec_out_to_in(ixa, ill, &iras);
+	iras.ira_pktlen = pktlen;
+
+	if (!IS_SIMPLE_IPH(ipha)) {
+		/* Process IPv4 options locally before fanout */
+		ip_output_local_options(ipha, ipst);
+		iras.ira_flags |= IRAF_IPV4_OPTIONS;
+	}
+
+	/* Inbound loopback firewall hooks for the "received" side */
+	if (HOOKS4_INTERESTED_LOOPBACK_IN(ipst)) {
+		int error;
+
+		DTRACE_PROBE4(ip4__loopback__in__start, ill_t *, ill,
+		    ill_t *, NULL, ipha_t *, ipha, mblk_t *, mp);
+		FW_HOOKS(ipst->ips_ip4_loopback_in_event,
+		    ipst->ips_ipv4firewall_loopback_in,
+		    ill, NULL, ipha, mp, mp, 0, ipst, error);
+
+		DTRACE_PROBE1(ip4__loopback__in__end, mblk_t *, mp);
+		if (mp == NULL) {
+			/* iras may hold IPsec refs moved from ixa */
+			ira_cleanup(&iras, B_FALSE);
+			return (error);
+		}
+		/*
+		 * Even if the destination was changed by the filter we use the
+		 * forwarding decision that was made based on the address
+		 * in ip_output/ip_set_destination.
+		 */
+		/* Length could be different */
+		ipha = (ipha_t *)mp->b_rptr;
+		pktlen = iras.ira_pktlen = ntohs(ipha->ipha_length);
+	}
+
+	DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
+	    ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL,
+	    int, 1);
+
+	ire->ire_ib_pkt_count++;
+	BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives);
+	UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, pktlen);
+
+	/* Destined to ire_zoneid - use that for fanout */
+	iras.ira_zoneid = ire->ire_zoneid;
+
+	if (is_system_labeled()) {
+		iras.ira_flags |= IRAF_SYSTEM_LABELED;
+
+		/*
+		 * This updates ira_cred, ira_tsl and ira_free_flags based
+		 * on the label. We don't expect this to ever fail for
+		 * loopback packets, so we silently drop the packet should it
+		 * fail.
+		 */
+		if (!tsol_get_pkt_label(mp, IPV4_VERSION, &iras)) {
+			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+			ip_drop_input("tsol_get_pkt_label", mp, ill);
+			freemsg(mp);
+			/* Silent drop is reported as success to the caller */
+			return (0);
+		}
+		ASSERT(iras.ira_tsl != NULL);
+
+		/* tsol_get_pkt_label sometimes does pullupmsg */
+		ipha = (ipha_t *)mp->b_rptr;
+	}
+
+	ip_fanout_v4(mp, ipha, &iras);
+
+	/* We moved any IPsec refs from ixa to iras */
+	ira_cleanup(&iras, B_FALSE);
+	return (0);
+}
+
+/*
+ * ire_sendfn for IRE_BROADCAST
+ * If the broadcast address is present on multiple ills and ixa_ifindex
+ * isn't set, then we generate
+ * a separate datagram (potentially with different source address) for
+ * those ills. In any case, only one copy is looped back to ip_input_v4.
+ *
+ * Returns the errno from sending on the primary (argument) ire; errors on
+ * the additional per-ill copies are intentionally ignored.
+ */
+int
+ire_send_broadcast_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
+    ip_xmit_attr_t *ixa, uint32_t *identp)
+{
+	ipha_t *ipha = (ipha_t *)iph_arg;
+	ip_stack_t *ipst = ixa->ixa_ipst;
+	irb_t *irb = ire->ire_bucket;
+	ire_t *ire1;
+	mblk_t *mp1;
+	ipha_t *ipha1;
+	iaflags_t ixaflags = ixa->ixa_flags;
+	nce_t *nce1, *nce_orig;
+
+	/*
+	 * Unless ire_send_multirt_v4 already set a ttl, force the
+	 * ttl to a smallish value.
+	 */
+	if (!(ixa->ixa_flags & IXAF_NO_TTL_CHANGE)) {
+		/*
+		 * To avoid broadcast storms, we usually set the TTL to 1 for
+		 * broadcasts. This can
+		 * be overridden stack-wide through the ip_broadcast_ttl
+		 * ndd tunable, or on a per-connection basis through the
+		 * IP_BROADCAST_TTL socket option.
+		 *
+		 * If SO_DONTROUTE/IXAF_DONTROUTE is set, then ire_send_wire_v4
+		 * will force ttl to one after we've set this.
+		 */
+		if (ixaflags & IXAF_BROADCAST_TTL_SET)
+			ipha->ipha_ttl = ixa->ixa_broadcast_ttl;
+		else
+			ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl;
+	}
+	/*
+	 * Make sure we get a loopback copy (after IPsec and frag)
+	 * Skip hardware checksum so that loopback copy is checksumed.
+	 */
+	ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;
+
+	/* Do we need to potentially generate multiple copies? */
+	if (irb->irb_ire_cnt == 1 || ixa->ixa_ifindex != 0)
+		return (ire_send_wire_v4(ire, mp, ipha, ixa, identp));
+
+	/*
+	 * Loop over all IRE_BROADCAST in the bucket (might only be one).
+	 * Note that everything in the bucket has the same destination address.
+	 */
+	irb_refhold(irb);
+	for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
+		/* We do the main IRE after the end of the loop */
+		if (ire1 == ire)
+			continue;
+
+		/*
+		 * Only IREs for the same IP address should be in the same
+		 * bucket.
+		 * But could have IRE_HOSTs in the case of CGTP.
+		 * If we find any multirt routes we bail out of the loop
+		 * and just do the single packet at the end; ip_postfrag_multirt
+		 * will duplicate the packet.
+		 */
+		ASSERT(ire1->ire_addr == ire->ire_addr);
+		if (!(ire1->ire_type & IRE_BROADCAST))
+			continue;
+
+		if (IRE_IS_CONDEMNED(ire1))
+			continue;
+
+		if (ixa->ixa_zoneid != ALL_ZONES &&
+		    ire->ire_zoneid != ire1->ire_zoneid)
+			continue;
+
+		ASSERT(ire->ire_ill != ire1->ire_ill && ire1->ire_ill != NULL);
+
+		if (ire1->ire_flags & RTF_MULTIRT)
+			break;
+
+		/*
+		 * For IPMP we only send for the ipmp_ill. arp_nce_init() will
+		 * ensure that this goes out on the cast_ill.
+		 */
+		if (IS_UNDER_IPMP(ire1->ire_ill))
+			continue;
+
+		mp1 = copymsg(mp);
+		if (mp1 == NULL) {
+			/* Allocation failure; skip this ill, keep going */
+			BUMP_MIB(ire1->ire_ill->ill_ip_mib,
+			    ipIfStatsOutDiscards);
+			ip_drop_output("ipIfStatsOutDiscards",
+			    mp, ire1->ire_ill);
+			continue;
+		}
+
+		ipha1 = (ipha_t *)mp1->b_rptr;
+		if (ixa->ixa_flags & IXAF_SET_SOURCE) {
+			/*
+			 * Need to pick a different source address for each
+			 * interface. If we have a global IPsec policy and
+			 * no per-socket policy then we punt to
+			 * ip_output_simple_v4 using a separate ip_xmit_attr_t.
+			 */
+			if (ixaflags & IXAF_IPSEC_GLOBAL_POLICY) {
+				ip_output_simple_broadcast(ixa, mp1);
+				continue;
+			}
+			/* Pick a new source address for each interface */
+			if (ip_select_source_v4(ire1->ire_ill, INADDR_ANY,
+			    ipha1->ipha_dst, INADDR_ANY, ixa->ixa_zoneid, ipst,
+			    &ipha1->ipha_src, NULL, NULL) != 0) {
+				BUMP_MIB(ire1->ire_ill->ill_ip_mib,
+				    ipIfStatsOutDiscards);
+				ip_drop_output("ipIfStatsOutDiscards - select "
+				    "broadcast source", mp1, ire1->ire_ill);
+				freemsg(mp1);
+				continue;
+			}
+			/*
+			 * Check against global IPsec policy to set the AH/ESP
+			 * attributes. IPsec will set IXAF_IPSEC_* and
+			 * ixa_ipsec_* as appropriate.
+			 *
+			 * NOTE(review): this passes ipha (the original
+			 * header), not ipha1 (the copy whose source address
+			 * was just rewritten above) - confirm the policy
+			 * lookup is meant to use the original source.
+			 */
+			if (!(ixaflags & (IXAF_NO_IPSEC|IXAF_IPSEC_SECURE))) {
+				ASSERT(ixa->ixa_ipsec_policy == NULL);
+				mp1 = ip_output_attach_policy(mp1, ipha, NULL,
+				    NULL, ixa);
+				if (mp1 == NULL) {
+					/*
+					 * MIB and ip_drop_packet already
+					 * done
+					 */
+					continue;
+				}
+			}
+		}
+		/* Make sure we have an NCE on this ill */
+		nce1 = arp_nce_init(ire1->ire_ill, ire1->ire_addr,
+		    ire1->ire_type);
+		if (nce1 == NULL) {
+			BUMP_MIB(ire1->ire_ill->ill_ip_mib,
+			    ipIfStatsOutDiscards);
+			ip_drop_output("ipIfStatsOutDiscards - broadcast nce",
+			    mp1, ire1->ire_ill);
+			freemsg(mp1);
+			continue;
+		}
+		/* Temporarily swap in the per-ill nce for the send */
+		nce_orig = ixa->ixa_nce;
+		ixa->ixa_nce = nce1;
+
+		ire_refhold(ire1);
+		/*
+		 * Ignore any errors here. We just collect the errno for
+		 * the main ire below
+		 */
+		(void) ire_send_wire_v4(ire1, mp1, ipha1, ixa, identp);
+		ire_refrele(ire1);
+
+		ixa->ixa_nce = nce_orig;
+		nce_refrele(nce1);
+
+		/* Only the first copy sent gets looped back to ip_input */
+		ixa->ixa_flags &= ~IXAF_LOOPBACK_COPY;
+	}
+	irb_refrele(irb);
+	/* Finally, the main one */
+
+	/*
+	 * For IPMP we only send broadcasts on the ipmp_ill.
+	 */
+	if (IS_UNDER_IPMP(ire->ire_ill)) {
+		freemsg(mp);
+		return (0);
+	}
+
+	return (ire_send_wire_v4(ire, mp, ipha, ixa, identp));
+}
+
+/*
+ * Send a packet using a different source address and different
+ * IPsec policy.
+ *
+ * Builds a fresh, simple ip_xmit_attr_t that inherits only the stack,
+ * zone and credential attributes from the caller's ixa, then hands the
+ * packet to ip_output_simple().
+ */
+static void
+ip_output_simple_broadcast(ip_xmit_attr_t *ixa, mblk_t *mp)
+{
+	ip_xmit_attr_t tmp_ixa;
+
+	bzero(&tmp_ixa, sizeof (tmp_ixa));
+	tmp_ixa.ixa_flags = IXAF_BASIC_SIMPLE_V4;
+	tmp_ixa.ixa_ifindex = 0;
+	tmp_ixa.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
+
+	/* Carry over the caller's stack, zone and credentials */
+	tmp_ixa.ixa_ipst = ixa->ixa_ipst;
+	tmp_ixa.ixa_zoneid = ixa->ixa_zoneid;
+	tmp_ixa.ixa_cred = ixa->ixa_cred;
+	tmp_ixa.ixa_cpid = ixa->ixa_cpid;
+	tmp_ixa.ixa_tsl = ixa->ixa_tsl;
+
+	(void) ip_output_simple(mp, &tmp_ixa);
+	ixa_cleanup(&tmp_ixa);
+}
+
+
+/*
+ * Clamp the TTL on a packet taking an RTF_MULTIRT route and set
+ * IXAF_NO_TTL_CHANGE so that later send functions (broadcast/multicast)
+ * don't raise it again.
+ */
+static void
+multirt_check_v4(ire_t *ire, ipha_t *ipha, ip_xmit_attr_t *ixa)
+{
+	ip_stack_t *ipst = ixa->ixa_ipst;
+
+	if (ire->ire_type & IRE_MULTICAST) {
+		/* Multicast multirt traffic is limited to a TTL of one */
+		if (ipha->ipha_ttl > 1) {
+			ip2dbg(("ire_send_multirt_v4: forcing multicast "
+			    "multirt TTL to 1 (was %d), dst 0x%08x\n",
+			    ipha->ipha_ttl, ntohl(ire->ire_addr)));
+			ipha->ipha_ttl = 1;
+		}
+		ixa->ixa_flags |= IXAF_NO_TTL_CHANGE;
+		return;
+	}
+
+	/* Non-multicast: honor the stack-wide ip_multirt_ttl bound, if set */
+	if ((ipst->ips_ip_multirt_ttl > 0) &&
+	    (ipha->ipha_ttl > ipst->ips_ip_multirt_ttl)) {
+		ipha->ipha_ttl = ipst->ips_ip_multirt_ttl;
+		/*
+		 * Need to ensure we don't increase the ttl should we go
+		 * through ire_send_broadcast or multicast.
+		 */
+		ixa->ixa_flags |= IXAF_NO_TTL_CHANGE;
+	}
+}
+
+/*
+ * ire_sendfn for IRE_MULTICAST
+ *
+ * Decides whether a loopback copy of the transmitted packet is needed,
+ * clamps the TTL, and then hands off to ire_send_wire_v4().
+ */
+int
+ire_send_multicast_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
+    ip_xmit_attr_t *ixa, uint32_t *identp)
+{
+	ipha_t *ipha = (ipha_t *)iph_arg;
+	ip_stack_t *ipst = ixa->ixa_ipst;
+	ill_t *ill = ire->ire_ill;
+	iaflags_t ixaflags = ixa->ixa_flags;
+
+	/*
+	 * The IRE_MULTICAST is the same whether or not multirt is in use.
+	 * Hence we need special-case code.
+	 */
+	if (ixaflags & IXAF_MULTIRT_MULTICAST)
+		multirt_check_v4(ire, ipha, ixa);
+
+	/*
+	 * Check if anything in ip_input_v4 wants a copy of the transmitted
+	 * packet (after IPsec and fragmentation)
+	 *
+	 * 1. Multicast routers always need a copy unless SO_DONTROUTE is set
+	 *    RSVP and the rsvp daemon is an example of a
+	 *    protocol and user level process that
+	 *    handles it's own routing. Hence, it uses the
+	 *    SO_DONTROUTE option to accomplish this.
+	 * 2. If the sender has set IP_MULTICAST_LOOP, then we just
+	 *    check whether there are any receivers for the group on the ill
+	 *    (ignoring the zoneid).
+	 * 3. If IP_MULTICAST_LOOP is not set, then we check if there are
+	 *    any members in other shared-IP zones.
+	 *    If such members exist, then we indicate that the sending zone
+	 *    shouldn't get a loopback copy to preserve the IP_MULTICAST_LOOP
+	 *    behavior.
+	 *
+	 * When we loopback we skip hardware checksum to make sure loopback
+	 * copy is checksumed.
+	 *
+	 * Note that ire_ill is the upper in the case of IPMP.
+	 */
+	ixa->ixa_flags &= ~(IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM);
+	if (ipst->ips_ip_g_mrouter && ill->ill_mrouter_cnt > 0 &&
+	    !(ixaflags & IXAF_DONTROUTE)) {
+		/* Case 1: active multicast router on this ill */
+		ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;
+	} else if (ixaflags & IXAF_MULTICAST_LOOP) {
+		/*
+		 * If this zone or any other zone has members then loopback
+		 * a copy.
+		 */
+		if (ill_hasmembers_v4(ill, ipha->ipha_dst))
+			ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;
+	} else if (ipst->ips_netstack->netstack_numzones > 1) {
+		/*
+		 * This zone should not have a copy. But there are some other
+		 * zones which might have members.
+		 */
+		if (ill_hasmembers_otherzones_v4(ill, ipha->ipha_dst,
+		    ixa->ixa_zoneid)) {
+			/* Exclude the sending zone from the loopback copy */
+			ixa->ixa_flags |= IXAF_NO_LOOP_ZONEID_SET;
+			ixa->ixa_no_loop_zoneid = ixa->ixa_zoneid;
+			ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM;
+		}
+	}
+
+	/*
+	 * Unless ire_send_multirt_v4 or icmp_output_hdrincl already set a ttl,
+	 * force the ttl to the IP_MULTICAST_TTL value
+	 */
+	if (!(ixaflags & IXAF_NO_TTL_CHANGE)) {
+		ipha->ipha_ttl = ixa->ixa_multicast_ttl;
+	}
+
+	return (ire_send_wire_v4(ire, mp, ipha, ixa, identp));
+}
+
+/*
+ * ire_sendfn for IREs with RTF_MULTIRT
+ *
+ * Clamps the TTL for multirt and then dispatches to the sendfn that
+ * matches the ire type (multicast, broadcast or plain wire).
+ */
+int
+ire_send_multirt_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
+    ip_xmit_attr_t *ixa, uint32_t *identp)
+{
+	ipha_t *ipha = (ipha_t *)iph_arg;
+
+	multirt_check_v4(ire, ipha, ixa);
+
+	if (ire->ire_type & IRE_MULTICAST)
+		return (ire_send_multicast_v4(ire, mp, ipha, ixa, identp));
+	if (ire->ire_type & IRE_BROADCAST)
+		return (ire_send_broadcast_v4(ire, mp, ipha, ixa, identp));
+	return (ire_send_wire_v4(ire, mp, ipha, ixa, identp));
+}
+
+/*
+ * ire_sendfn for IREs with RTF_REJECT/RTF_BLACKHOLE, including IRE_NOROUTE
+ *
+ * RTF_BLACKHOLE drops the packet silently (returns 0); otherwise an ICMP
+ * unreachable is generated and EHOSTUNREACH is returned. A plain lack of
+ * route (IRE_NOROUTE) additionally reports RTM_MISS to routing sockets.
+ */
+int
+ire_send_noroute_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
+    ip_xmit_attr_t *ixa, uint32_t *identp)
+{
+	ip_stack_t *ipst = ixa->ixa_ipst;
+	ipha_t *ipha = (ipha_t *)iph_arg;
+	ill_t *ill;
+	ip_recv_attr_t iras;
+	boolean_t dummy;
+
+	/* We assign an IP ident for nice errors */
+	ipha->ipha_ident = atomic_add_32_nv(identp, 1);
+
+	BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes);
+
+	if (ire->ire_type & IRE_NOROUTE) {
+		/* A lack of a route as opposed to RTF_REJECT|BLACKHOLE */
+		ip_rts_change(RTM_MISS, ipha->ipha_dst, 0, 0, 0, 0, 0, 0,
+		    RTA_DST, ipst);
+	}
+
+	if (ire->ire_flags & RTF_BLACKHOLE) {
+		ip_drop_output("ipIfStatsOutNoRoutes RTF_BLACKHOLE", mp, NULL);
+		freemsg(mp);
+		/* No error even for local senders - silent blackhole */
+		return (0);
+	}
+	ip_drop_output("ipIfStatsOutNoRoutes RTF_REJECT", mp, NULL);
+
+	/*
+	 * We need an ill_t for the ip_recv_attr_t even though this packet
+	 * was never received and icmp_unreachable doesn't currently use
+	 * ira_ill.
+	 *
+	 * Test the transmit-side IXAF_IS_IPV4 flag here: ixa_flags holds
+	 * IXAF_* bits, so checking the receive-side IRAF_IS_IPV4 constant
+	 * against it (as the previous code did) tested the wrong bit.
+	 */
+	ill = ill_lookup_on_name("lo0", B_FALSE,
+	    !(ixa->ixa_flags & IXAF_IS_IPV4), &dummy, ipst);
+	if (ill == NULL) {
+		freemsg(mp);
+		return (EHOSTUNREACH);
+	}
+
+	bzero(&iras, sizeof (iras));
+	/* Map ixa to ira including IPsec policies */
+	ipsec_out_to_in(ixa, ill, &iras);
+
+	if (ip_source_routed(ipha, ipst)) {
+		icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, &iras);
+	} else {
+		icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras);
+	}
+	/* We moved any IPsec refs from ixa to iras */
+	ira_cleanup(&iras, B_FALSE);
+	ill_refrele(ill);
+	return (EHOSTUNREACH);
+}
+
+/*
+ * Calculate a checksum ignoring any hardware capabilities
+ *
+ * Returns B_FALSE if the packet was too short for the checksum. Caller
+ * should free and do stats.
+ *
+ * NOTE(review): as written, every path below returns B_TRUE; the
+ * too-short B_FALSE case described above never occurs here - confirm
+ * whether callers rely on it.
+ */
+static boolean_t
+ip_output_sw_cksum_v4(mblk_t *mp, ipha_t *ipha, ip_xmit_attr_t *ixa)
+{
+	ip_stack_t *ipst = ixa->ixa_ipst;
+	uint_t pktlen = ixa->ixa_pktlen;
+	uint16_t *cksump;
+	uint32_t cksum;
+	uint8_t protocol = ixa->ixa_protocol;
+	uint16_t ip_hdr_length = ixa->ixa_ip_hdr_length;
+	ipaddr_t dst = ipha->ipha_dst;
+	ipaddr_t src = ipha->ipha_src;
+
+	/* Just in case it contained garbage */
+	DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS;
+
+	/*
+	 * Calculate ULP checksum
+	 */
+	if (protocol == IPPROTO_TCP) {
+		cksump = IPH_TCPH_CHECKSUMP(ipha, ip_hdr_length);
+		cksum = IP_TCP_CSUM_COMP;
+	} else if (protocol == IPPROTO_UDP) {
+		cksump = IPH_UDPH_CHECKSUMP(ipha, ip_hdr_length);
+		cksum = IP_UDP_CSUM_COMP;
+	} else if (protocol == IPPROTO_SCTP) {
+		sctp_hdr_t *sctph;
+
+		ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph)));
+		sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length);
+		/*
+		 * Zero out the checksum field to ensure proper
+		 * checksum calculation.
+		 */
+		sctph->sh_chksum = 0;
+#ifdef	DEBUG
+		if (!skip_sctp_cksum)
+#endif
+			sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length);
+		/* SCTP carries its own CRC; only the IP header is left */
+		goto ip_hdr_cksum;
+	} else {
+		/* No ULP checksum for other protocols */
+		goto ip_hdr_cksum;
+	}
+
+	/* The ULP checksum field is in the first mblk */
+	ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr);
+
+	/*
+	 * We accumulate the pseudo header checksum in cksum.
+	 * This is pretty hairy code, so watch close. One
+	 * thing to keep in mind is that UDP and TCP have
+	 * stored their respective datagram lengths in their
+	 * checksum fields. This lines things up real nice.
+	 */
+	cksum += (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF);
+
+	cksum = IP_CSUM(mp, ip_hdr_length, cksum);
+	/*
+	 * For UDP/IPv4 a zero means that the packets wasn't checksummed.
+	 * Change to 0xffff
+	 */
+	if (protocol == IPPROTO_UDP && cksum == 0)
+		*cksump = ~cksum;
+	else
+		*cksump = cksum;
+
+	IP_STAT(ipst, ip_out_sw_cksum);
+	IP_STAT_UPDATE(ipst, ip_out_sw_cksum_bytes, pktlen);
+
+ip_hdr_cksum:
+	/* Calculate IPv4 header checksum */
+	ipha->ipha_hdr_checksum = 0;
+	ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
+	return (B_TRUE);
+}
+
+/*
+ * Calculate the ULP checksum - try to use hardware.
+ * In the case of MULTIRT, broadcast or multicast the
+ * IXAF_NO_HW_CKSUM is set in which case we use software.
+ *
+ * If the hardware supports IP header checksum offload; then clear the
+ * contents of IP header checksum field as expected by NIC.
+ * Do this only if we offloaded either full or partial sum.
+ *
+ * Returns B_FALSE if the packet was too short for the checksum. Caller
+ * should free and do stats.
+ */
+static boolean_t
+ip_output_cksum_v4(iaflags_t ixaflags, mblk_t *mp, ipha_t *ipha,
+    ip_xmit_attr_t *ixa, ill_t *ill)
+{
+	uint_t pktlen = ixa->ixa_pktlen;
+	uint16_t *cksump;
+	uint16_t hck_flags;
+	uint32_t cksum;
+	uint8_t protocol = ixa->ixa_protocol;
+	uint16_t ip_hdr_length = ixa->ixa_ip_hdr_length;
+
+	/* Fall back to software when offload is disallowed or unavailable */
+	if ((ixaflags & IXAF_NO_HW_CKSUM) || !ILL_HCKSUM_CAPABLE(ill) ||
+	    !dohwcksum) {
+		return (ip_output_sw_cksum_v4(mp, ipha, ixa));
+	}
+
+	/*
+	 * Calculate ULP checksum. Note that we don't use cksump and cksum
+	 * if the ill has FULL support.
+	 */
+	if (protocol == IPPROTO_TCP) {
+		cksump = IPH_TCPH_CHECKSUMP(ipha, ip_hdr_length);
+		cksum = IP_TCP_CSUM_COMP;	/* Pseudo-header cksum */
+	} else if (protocol == IPPROTO_UDP) {
+		cksump = IPH_UDPH_CHECKSUMP(ipha, ip_hdr_length);
+		cksum = IP_UDP_CSUM_COMP;	/* Pseudo-header cksum */
+	} else if (protocol == IPPROTO_SCTP) {
+		sctp_hdr_t *sctph;
+
+		ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph)));
+		sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length);
+		/*
+		 * Zero out the checksum field to ensure proper
+		 * checksum calculation.
+		 */
+		sctph->sh_chksum = 0;
+#ifdef	DEBUG
+		if (!skip_sctp_cksum)
+#endif
+			sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length);
+		goto ip_hdr_cksum;
+	} else {
+		/*
+		 * The label is intentionally placed inside this else so that
+		 * the SCTP branch above can jump here too: non-offloadable
+		 * protocols need only the IPv4 header checksum.
+		 */
+	ip_hdr_cksum:
+		/* Calculate IPv4 header checksum */
+		ipha->ipha_hdr_checksum = 0;
+		ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
+		return (B_TRUE);
+	}
+
+	/* The ULP checksum field is in the first mblk */
+	ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr);
+
+	/*
+	 * Underlying interface supports hardware checksum offload for
+	 * the payload; leave the payload checksum for the hardware to
+	 * calculate. N.B: We only need to set up checksum info on the
+	 * first mblk.
+	 */
+	hck_flags = ill->ill_hcksum_capab->ill_hcksum_txflags;
+
+	DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS;
+	if (hck_flags & HCKSUM_INET_FULL_V4) {
+		/*
+		 * Hardware calculates pseudo-header, header and the
+		 * payload checksums, so clear the checksum field in
+		 * the protocol header.
+		 */
+		*cksump = 0;
+		DB_CKSUMFLAGS(mp) |= HCK_FULLCKSUM;
+
+		ipha->ipha_hdr_checksum = 0;
+		if (hck_flags & HCKSUM_IPHDRCKSUM) {
+			DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM;
+		} else {
+			ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
+		}
+		return (B_TRUE);
+	}
+	if ((hck_flags) & HCKSUM_INET_PARTIAL)  {
+		ipaddr_t dst = ipha->ipha_dst;
+		ipaddr_t src = ipha->ipha_src;
+		/*
+		 * Partial checksum offload has been enabled. Fill
+		 * the checksum field in the protocol header with the
+		 * pseudo-header checksum value.
+		 *
+		 * We accumulate the pseudo header checksum in cksum.
+		 * This is pretty hairy code, so watch close. One
+		 * thing to keep in mind is that UDP and TCP have
+		 * stored their respective datagram lengths in their
+		 * checksum fields. This lines things up real nice.
+		 */
+		cksum += (dst >> 16) + (dst & 0xFFFF) +
+		    (src >> 16) + (src & 0xFFFF);
+		cksum += *(cksump);
+		cksum = (cksum & 0xFFFF) + (cksum >> 16);
+		*(cksump) = (cksum & 0xFFFF) + (cksum >> 16);
+
+		/*
+		 * Offsets are relative to beginning of IP header.
+		 */
+		DB_CKSUMSTART(mp) = ip_hdr_length;
+		DB_CKSUMSTUFF(mp) = (uint8_t *)cksump - (uint8_t *)ipha;
+		DB_CKSUMEND(mp) = pktlen;
+		DB_CKSUMFLAGS(mp) |= HCK_PARTIALCKSUM;
+
+		ipha->ipha_hdr_checksum = 0;
+		if (hck_flags & HCKSUM_IPHDRCKSUM) {
+			DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM;
+		} else {
+			ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
+		}
+		return (B_TRUE);
+	}
+	/* Hardware capabilities include neither full nor partial IPv4 */
+	return (ip_output_sw_cksum_v4(mp, ipha, ixa));
+}
+
+/*
+ * ire_sendfn for offlink and onlink destinations.
+ * Also called from the multicast, broadcast, multirt send functions.
+ *
+ * Assumes that the caller has a hold on the ire.
+ *
+ * This function doesn't care if the IRE just became condemned since that
+ * can happen at any time.
+ *
+ * Responsible for assigning ipha_ident, applying IPP, validating IPv4
+ * options, labeling, checksumming and fragmentation/IPsec hand-off before
+ * the packet reaches the wire via ixa_postfragfn.
+ */
+/* ARGSUSED */
+int
+ire_send_wire_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
+    ip_xmit_attr_t *ixa, uint32_t *identp)
+{
+	ip_stack_t *ipst = ixa->ixa_ipst;
+	ipha_t *ipha = (ipha_t *)iph_arg;
+	iaflags_t ixaflags = ixa->ixa_flags;
+	ill_t *ill;
+
+	ASSERT(ixa->ixa_nce != NULL);
+	ill = ixa->ixa_nce->nce_ill;
+
+	/* SO_DONTROUTE: never leave the local subnet */
+	if (ixaflags & IXAF_DONTROUTE)
+		ipha->ipha_ttl = 1;
+
+	/*
+	 * Assign an ident value for this packet. There could be other
+	 * threads targeting the same destination, so we have to arrange
+	 * for a atomic increment. Note that we use a 32-bit atomic add
+	 * because it has better performance than its 16-bit sibling.
+	 *
+	 * Normally ixa_extra_ident is 0, but in the case of LSO it will
+	 * be the number of TCP segments that the driver/hardware will
+	 * extraly construct.
+	 *
+	 * If running in cluster mode and if the source address
+	 * belongs to a replicated service then vector through
+	 * cl_inet_ipident vector to allocate ip identifier
+	 * NOTE: This is a contract private interface with the
+	 * clustering group.
+	 */
+	if (cl_inet_ipident != NULL) {
+		ipaddr_t src = ipha->ipha_src;
+		ipaddr_t dst = ipha->ipha_dst;
+		netstackid_t stack_id = ipst->ips_netstack->netstack_stackid;
+
+		ASSERT(cl_inet_isclusterwide != NULL);
+		if ((*cl_inet_isclusterwide)(stack_id, IPPROTO_IP,
+		    AF_INET, (uint8_t *)(uintptr_t)src, NULL)) {
+			/*
+			 * Note: not correct with LSO since we can't allocate
+			 * ixa_extra_ident+1 consecutive values.
+			 */
+			ipha->ipha_ident = (*cl_inet_ipident)(stack_id,
+			    IPPROTO_IP, AF_INET, (uint8_t *)(uintptr_t)src,
+			    (uint8_t *)(uintptr_t)dst, NULL);
+		} else {
+			ipha->ipha_ident = atomic_add_32_nv(identp,
+			    ixa->ixa_extra_ident + 1);
+		}
+	} else {
+		ipha->ipha_ident = atomic_add_32_nv(identp,
+		    ixa->ixa_extra_ident + 1);
+	}
+#ifndef _BIG_ENDIAN
+	ipha->ipha_ident = htons(ipha->ipha_ident);
+#endif
+
+	/*
+	 * This might set b_band, thus the IPsec and fragmentation
+	 * code in IP ensures that b_band is updated in the first mblk.
+	 */
+	if (IPP_ENABLED(IPP_LOCAL_OUT, ipst)) {
+		/* ip_process translates an IS_UNDER_IPMP */
+		mp = ip_process(IPP_LOCAL_OUT, mp, ill, ill);
+		if (mp == NULL) {
+			/* ip_drop_packet and MIB done */
+			return (0);	/* Might just be delayed */
+		}
+	}
+
+	/*
+	 * Verify any IPv4 options.
+	 *
+	 * The presence of IP options also forces the network stack to
+	 * calculate the checksum in software. This is because:
+	 *
+	 * Wrap around: certain partial-checksum NICs (eri, ce) limit
+	 * the size of "start offset" width to 6-bit. This effectively
+	 * sets the largest value of the offset to 64-bytes, starting
+	 * from the MAC header. When the cumulative MAC and IP headers
+	 * exceed such limit, the offset will wrap around. This causes
+	 * the checksum to be calculated at the wrong place.
+	 *
+	 * IPv4 source routing: none of the full-checksum capable NICs
+	 * is capable of correctly handling the IPv4 source-routing
+	 * option for purposes of calculating the pseudo-header; the
+	 * actual destination is different from the destination in the
+	 * header which is that of the next-hop. (This case may not be
+	 * true for NICs which can parse IPv6 extension headers, but
+	 * we choose to simplify the implementation by not offloading
+	 * checksum when they are present.)
+	 */
+	if (!IS_SIMPLE_IPH(ipha)) {
+		ixaflags = ixa->ixa_flags |= IXAF_NO_HW_CKSUM;
+		/* An IS_UNDER_IPMP ill is ok here */
+		if (ip_output_options(mp, ipha, ixa, ill)) {
+			/* Packet has been consumed and ICMP error sent */
+			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
+			return (EINVAL);
+		}
+	}
+
+	/*
+	 * To handle IPsec/iptun's labeling needs we need to tag packets
+	 * while we still have ixa_tsl
+	 */
+	if (is_system_labeled() && ixa->ixa_tsl != NULL &&
+	    (ill->ill_mactype == DL_6TO4 || ill->ill_mactype == DL_IPV4 ||
+	    ill->ill_mactype == DL_IPV6)) {
+		cred_t *newcr;
+
+		newcr = copycred_from_tslabel(ixa->ixa_cred, ixa->ixa_tsl,
+		    KM_NOSLEEP);
+		if (newcr == NULL) {
+			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
+			ip_drop_output("ipIfStatsOutDiscards - newcr",
+			    mp, ill);
+			freemsg(mp);
+			return (ENOBUFS);
+		}
+		mblk_setcred(mp, newcr, NOPID);
+		crfree(newcr);	/* mblk_setcred did its own crhold */
+	}
+
+	/* Slow path: the packet needs fragmentation and/or IPsec */
+	if (ixa->ixa_pktlen > ixa->ixa_fragsize ||
+	    (ixaflags & IXAF_IPSEC_SECURE)) {
+		uint32_t pktlen;
+
+		pktlen = ixa->ixa_pktlen;
+		if (ixaflags & IXAF_IPSEC_SECURE)
+			pktlen += ipsec_out_extra_length(ixa);
+
+		/*
+		 * NOTE(review): this return path does not freemsg(mp) -
+		 * confirm the caller owns the mblk on EMSGSIZE, otherwise
+		 * this leaks.
+		 */
+		if (pktlen > IP_MAXPACKET)
+			return (EMSGSIZE);
+
+		if (ixaflags & IXAF_SET_ULP_CKSUM) {
+			/*
+			 * Compute ULP checksum and IP header checksum
+			 * using software
+			 */
+			if (!ip_output_sw_cksum_v4(mp, ipha, ixa)) {
+				BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
+				ip_drop_output("ipIfStatsOutDiscards", mp, ill);
+				freemsg(mp);
+				return (EINVAL);
+			}
+		} else {
+			/* Calculate IPv4 header checksum */
+			ipha->ipha_hdr_checksum = 0;
+			ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
+		}
+
+		/*
+		 * If this packet would generate a icmp_frag_needed
+		 * message, we need to handle it before we do the IPsec
+		 * processing. Otherwise, we need to strip the IPsec
+		 * headers before we send up the message to the ULPs
+		 * which becomes messy and difficult.
+		 *
+		 * We check using IXAF_DONTFRAG. The DF bit in the header
+		 * is not inspected - it will be copied to any generated
+		 * fragments.
+		 */
+		if ((pktlen > ixa->ixa_fragsize) &&
+		    (ixaflags & IXAF_DONTFRAG)) {
+			/* Generate ICMP and return error */
+			ip_recv_attr_t iras;
+
+			DTRACE_PROBE4(ip4__fragsize__fail, uint_t, pktlen,
+			    uint_t, ixa->ixa_fragsize, uint_t, ixa->ixa_pktlen,
+			    uint_t, ixa->ixa_pmtu);
+
+			bzero(&iras, sizeof (iras));
+			/* Map ixa to ira including IPsec policies */
+			ipsec_out_to_in(ixa, ill, &iras);
+
+			ip_drop_output("ICMP_FRAG_NEEDED", mp, ill);
+			icmp_frag_needed(mp, ixa->ixa_fragsize, &iras);
+			/* We moved any IPsec refs from ixa to iras */
+			ira_cleanup(&iras, B_FALSE);
+			return (EMSGSIZE);
+		}
+		DTRACE_PROBE4(ip4__fragsize__ok, uint_t, pktlen,
+		    uint_t, ixa->ixa_fragsize, uint_t, ixa->ixa_pktlen,
+		    uint_t, ixa->ixa_pmtu);
+
+		if (ixaflags & IXAF_IPSEC_SECURE) {
+			/*
+			 * Pass in sufficient information so that
+			 * IPsec can determine whether to fragment, and
+			 * which function to call after fragmentation.
+			 */
+			return (ipsec_out_process(mp, ixa));
+		}
+		return (ip_fragment_v4(mp, ixa->ixa_nce, ixaflags,
+		    ixa->ixa_pktlen, ixa->ixa_fragsize, ixa->ixa_xmit_hint,
+		    ixa->ixa_zoneid, ixa->ixa_no_loop_zoneid,
+		    ixa->ixa_postfragfn, &ixa->ixa_cookie));
+	}
+	/* Fast path: no fragmentation, no IPsec */
+	if (ixaflags & IXAF_SET_ULP_CKSUM) {
+		/* Compute ULP checksum and IP header checksum */
+		/* An IS_UNDER_IPMP ill is ok here */
+		if (!ip_output_cksum_v4(ixaflags, mp, ipha, ixa, ill)) {
+			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
+			ip_drop_output("ipIfStatsOutDiscards", mp, ill);
+			freemsg(mp);
+			return (EINVAL);
+		}
+	} else {
+		/* Calculate IPv4 header checksum */
+		ipha->ipha_hdr_checksum = 0;
+		ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
+	}
+	return ((ixa->ixa_postfragfn)(mp, ixa->ixa_nce, ixaflags,
+	    ixa->ixa_pktlen, ixa->ixa_xmit_hint, ixa->ixa_zoneid,
+	    ixa->ixa_no_loop_zoneid, &ixa->ixa_cookie));
+}
+
+/*
+ * Send mp into ip_input
+ * Common for IPv4 and IPv6
+ *
+ * Builds an ip_recv_attr_t describing the packet as if the driver had
+ * looped it back and calls the ill's input function directly.
+ */
+void
+ip_postfrag_loopback(mblk_t *mp, nce_t *nce, iaflags_t ixaflags,
+    uint_t pkt_len, zoneid_t nolzid)
+{
+	rtc_t rtc;
+	ill_t *ill = nce->nce_ill;
+	ip_recv_attr_t iras;	/* NOTE: No bzero for performance */
+	ncec_t *ncec;
+
+	/*
+	 * iras is only partially initialized below; every field read by
+	 * ip_input must be explicitly set here since there is no bzero.
+	 */
+	ncec = nce->nce_common;
+	iras.ira_flags = IRAF_VERIFY_IP_CKSUM | IRAF_VERIFY_ULP_CKSUM |
+	    IRAF_LOOPBACK | IRAF_L2SRC_LOOPBACK;
+	if (ncec->ncec_flags & NCE_F_BCAST)
+		iras.ira_flags |= IRAF_L2DST_BROADCAST;
+	else if (ncec->ncec_flags & NCE_F_MCAST)
+		iras.ira_flags |= IRAF_L2DST_MULTICAST;
+
+	iras.ira_free_flags = 0;
+	iras.ira_cred = NULL;
+	iras.ira_cpid = NOPID;
+	iras.ira_tsl = NULL;
+	iras.ira_zoneid = ALL_ZONES;
+	iras.ira_pktlen = pkt_len;
+	UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, iras.ira_pktlen);
+	BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives);
+
+	if (ixaflags & IXAF_IS_IPV4)
+		iras.ira_flags |= IRAF_IS_IPV4;
+
+	iras.ira_ill = iras.ira_rill = ill;
+	iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
+	iras.ira_rifindex = iras.ira_ruifindex;
+	iras.ira_mhip = NULL;
+
+	/* Carry the transmit attributes shared with receive (IAF_MASK) */
+	iras.ira_flags |= ixaflags & IAF_MASK;
+	iras.ira_no_loop_zoneid = nolzid;
+
+	/* Broadcast and multicast doesn't care about the squeue */
+	iras.ira_sqp = NULL;
+
+	rtc.rtc_ire = NULL;
+	if (ixaflags & IXAF_IS_IPV4) {
+		ipha_t *ipha = (ipha_t *)mp->b_rptr;
+
+		rtc.rtc_ipaddr = INADDR_ANY;
+
+		(*ill->ill_inputfn)(mp, ipha, &ipha->ipha_dst, &iras, &rtc);
+		if (rtc.rtc_ire != NULL) {
+			ASSERT(rtc.rtc_ipaddr != INADDR_ANY);
+			ire_refrele(rtc.rtc_ire);
+		}
+	} else {
+		ip6_t *ip6h = (ip6_t *)mp->b_rptr;
+
+		rtc.rtc_ip6addr = ipv6_all_zeros;
+
+		(*ill->ill_inputfn)(mp, ip6h, &ip6h->ip6_dst, &iras, &rtc);
+		if (rtc.rtc_ire != NULL) {
+			ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&rtc.rtc_ip6addr));
+			ire_refrele(rtc.rtc_ire);
+		}
+	}
+	/* Any references to clean up? No hold on ira */
+	if (iras.ira_flags & (IRAF_IPSEC_SECURE|IRAF_SYSTEM_LABELED))
+		ira_cleanup(&iras, B_FALSE);
+}
+
+/*
+ * Post fragmentation function for IRE_MULTICAST and IRE_BROADCAST which
+ * looks at the IXAF_LOOPBACK_COPY flag.
+ * Common for IPv4 and IPv6.
+ *
+ * If the loopback copy fails (due to no memory) but we send the packet out
+ * on the wire we return no failure. Only in the case we suppress the wire
+ * sending do we take the loopback failure into account.
+ *
+ * Note that we do not perform DTRACE_IP7 and FW_HOOKS for the looped back copy.
+ * Those operations are performed on this packet in ip_xmit() and it would
+ * be odd to do it twice for the same packet.
+ */
+int
+ip_postfrag_loopcheck(mblk_t *mp, nce_t *nce, iaflags_t ixaflags,
+    uint_t pkt_len, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid,
+    uintptr_t *ixacookie)
+{
+	ill_t *ill = nce->nce_ill;
+	int error = 0;
+
+	/*
+	 * Check for IXAF_LOOPBACK_COPY - send a copy to ip as if the driver
+	 * had looped it back
+	 */
+	if (ixaflags & IXAF_LOOPBACK_COPY) {
+		mblk_t *mp1;
+
+		mp1 = copymsg(mp);
+		if (mp1 == NULL) {
+			/* Failed to deliver the loopback copy. */
+			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
+			ip_drop_output("ipIfStatsOutDiscards", mp, ill);
+			error = ENOBUFS;
+		} else {
+			ip_postfrag_loopback(mp1, nce, ixaflags, pkt_len,
+			    nolzid);
+		}
+	}
+
+	/*
+	 * If TTL = 0 then only do the loopback to this host i.e. we are
+	 * done. We are also done if this was the
+	 * loopback interface since it is sufficient
+	 * to loopback one copy of a multicast packet.
+	 */
+	if (ixaflags & IXAF_IS_IPV4) {
+		ipha_t *ipha = (ipha_t *)mp->b_rptr;
+
+		if (ipha->ipha_ttl == 0) {
+			ip_drop_output("multicast ipha_ttl not sent to wire",
+			    mp, ill);
+			freemsg(mp);
+			return (error);
+		}
+	} else {
+		ip6_t *ip6h = (ip6_t *)mp->b_rptr;
+
+		/*
+		 * NOTE(review): the drop string below says "ipha_ttl" even
+		 * though this is the IPv6 hop-limit branch - presumably a
+		 * copy/paste; confirm before changing the diagnostic string.
+		 */
+		if (ip6h->ip6_hops == 0) {
+			ip_drop_output("multicast ipha_ttl not sent to wire",
+			    mp, ill);
+			freemsg(mp);
+			return (error);
+		}
+	}
+	if (nce->nce_ill->ill_wq == NULL) {
+		/* Loopback interface */
+		ip_drop_output("multicast on lo0 not sent to wire", mp, ill);
+		freemsg(mp);
+		return (error);
+	}
+
+	return (ip_xmit(mp, nce, ixaflags, pkt_len, xmit_hint, szone, 0,
+	    ixacookie));
+}
+
+/*
+ * Post fragmentation function for RTF_MULTIRT routes.
+ * Since IRE_BROADCASTs can have RTF_MULTIRT, this function
+ * checks IXAF_LOOPBACK_COPY.
+ *
+ * If no packet is sent due to failures then we return an errno, but if at
+ * least one succeeded we return zero.
+ */
+int
+ip_postfrag_multirt_v4(mblk_t *mp, nce_t *nce, iaflags_t ixaflags,
+ uint_t pkt_len, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid,
+ uintptr_t *ixacookie)
+{
+ irb_t *irb;
+ ipha_t *ipha = (ipha_t *)mp->b_rptr;
+ ire_t *ire;
+ ire_t *ire1;
+ mblk_t *mp1;
+ nce_t *nce1;
+ ill_t *ill = nce->nce_ill;
+ ill_t *ill1;
+ ip_stack_t *ipst = ill->ill_ipst;
+ int error = 0;
+ int num_sent = 0;
+ int err;
+ uint_t ire_type;
+ ipaddr_t nexthop;
+
+ ASSERT(ixaflags & IXAF_IS_IPV4);
+
+ /* Check for IXAF_LOOPBACK_COPY */
+ if (ixaflags & IXAF_LOOPBACK_COPY) {
+ mblk_t *mp1;
+
+ mp1 = copymsg(mp);
+ if (mp1 == NULL) {
+ /* Failed to deliver the loopback copy. */
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
+ ip_drop_output("ipIfStatsOutDiscards", mp, ill);
+ error = ENOBUFS;
+ } else {
+ ip_postfrag_loopback(mp1, nce, ixaflags, pkt_len,
+ nolzid);
+ }
+ }
+
+ /*
+ * Loop over RTF_MULTIRT for ipha_dst in the same bucket. Send
+ * a copy to each one.
+ * Use the nce (nexthop) and ipha_dst to find the ire.
+ *
+ * MULTIRT is not designed to work with shared-IP zones thus we don't
+ * need to pass a zoneid or a label to the IRE lookup.
+ */
+ if (V4_PART_OF_V6(nce->nce_addr) == ipha->ipha_dst) {
+ /* Broadcast and multicast case */
+ ire = ire_ftable_lookup_v4(ipha->ipha_dst, 0, 0, 0,
+ NULL, ALL_ZONES, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL);
+ } else {
+ ipaddr_t v4addr = V4_PART_OF_V6(nce->nce_addr);
+
+ /* Unicast case */
+ ire = ire_ftable_lookup_v4(ipha->ipha_dst, 0, v4addr, 0,
+ NULL, ALL_ZONES, NULL, MATCH_IRE_GW, 0, ipst, NULL);
+ }
+
+ if (ire == NULL ||
+ (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
+ !(ire->ire_flags & RTF_MULTIRT)) {
+ /* Drop */
+ ip_drop_output("ip_postfrag_multirt didn't find route",
+ mp, nce->nce_ill);
+ if (ire != NULL)
+ ire_refrele(ire);
+ return (ENETUNREACH);
+ }
+
+ irb = ire->ire_bucket;
+ irb_refhold(irb);
+ for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
+ /*
+ * For broadcast we can have a mixture of IRE_BROADCAST and
+ * IRE_HOST due to the manually added IRE_HOSTs that are used
+ * to trigger the creation of the special CGTP broadcast routes.
+ * Thus we have to skip if ire_type doesn't match the original.
+ */
+ if (IRE_IS_CONDEMNED(ire1) ||
+ !(ire1->ire_flags & RTF_MULTIRT) ||
+ ire1->ire_type != ire->ire_type)
+ continue;
+
+ /* Do the ire argument one after the loop */
+ if (ire1 == ire)
+ continue;
+
+ ill1 = ire_nexthop_ill(ire1);
+ if (ill1 == NULL) {
+ /*
+ * This ire might not have been picked by
+ * ire_route_recursive, in which case ire_dep might
+ * not have been setup yet.
+ * We kick ire_route_recursive to try to resolve
+ * starting at ire1.
+ */
+ ire_t *ire2;
+
+ ire2 = ire_route_recursive_impl_v4(ire1,
+ ire1->ire_addr, ire1->ire_type, ire1->ire_ill,
+ ire1->ire_zoneid, NULL, MATCH_IRE_DSTONLY,
+ B_TRUE, 0, ipst, NULL, NULL, NULL);
+ if (ire2 != NULL)
+ ire_refrele(ire2);
+ ill1 = ire_nexthop_ill(ire1);
+ }
+
+ if (ill1 == NULL) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
+ ip_drop_output("ipIfStatsOutDiscards - no ill",
+ mp, ill);
+ error = ENETUNREACH;
+ continue;
+ }
+
+ /* Pick the addr and type to use for arp_nce_init */
+ if (nce->nce_common->ncec_flags & NCE_F_BCAST) {
+ ire_type = IRE_BROADCAST;
+ nexthop = ire1->ire_gateway_addr;
+ } else if (nce->nce_common->ncec_flags & NCE_F_MCAST) {
+ ire_type = IRE_MULTICAST;
+ nexthop = ipha->ipha_dst;
+ } else {
+ ire_type = ire1->ire_type; /* Doesn't matter */
+ nexthop = ire1->ire_gateway_addr;
+ }
+
+ /* If IPMP meta or under, then we just drop */
+ if (ill1->ill_grp != NULL) {
+ BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards);
+ ip_drop_output("ipIfStatsOutDiscards - IPMP",
+ mp, ill1);
+ ill_refrele(ill1);
+ error = ENETUNREACH;
+ continue;
+ }
+
+ nce1 = arp_nce_init(ill1, nexthop, ire_type);
+ if (nce1 == NULL) {
+ BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards);
+ ip_drop_output("ipIfStatsOutDiscards - no nce",
+ mp, ill1);
+ ill_refrele(ill1);
+ error = ENETUNREACH;
+ continue;
+ }
+ mp1 = copymsg(mp);
+ if (mp1 == NULL) {
+ BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards);
+ ip_drop_output("ipIfStatsOutDiscards", mp, ill1);
+ nce_refrele(nce1);
+ ill_refrele(ill1);
+ error = ENOBUFS;
+ continue;
+ }
+ /* Preserve HW checksum for this copy */
+ DB_CKSUMSTART(mp1) = DB_CKSUMSTART(mp);
+ DB_CKSUMSTUFF(mp1) = DB_CKSUMSTUFF(mp);
+ DB_CKSUMEND(mp1) = DB_CKSUMEND(mp);
+ DB_CKSUMFLAGS(mp1) = DB_CKSUMFLAGS(mp);
+ DB_LSOMSS(mp1) = DB_LSOMSS(mp);
+
+ ire1->ire_ob_pkt_count++;
+ err = ip_xmit(mp1, nce1, ixaflags, pkt_len, xmit_hint, szone,
+ 0, ixacookie);
+ if (err == 0)
+ num_sent++;
+ else
+ error = err;
+ nce_refrele(nce1);
+ ill_refrele(ill1);
+ }
+ irb_refrele(irb);
+ ire_refrele(ire);
+ /* Finally, the main one */
+ err = ip_xmit(mp, nce, ixaflags, pkt_len, xmit_hint, szone, 0,
+ ixacookie);
+ if (err == 0)
+ num_sent++;
+ else
+ error = err;
+ if (num_sent > 0)
+ return (0);
+ else
+ return (error);
+}
+
+/*
+ * Verify local connectivity. This check is called by ULP fusion code.
+ * The generation number on an IRE_LOCAL or IRE_LOOPBACK only changes if
+ * the interface is brought down and back up. So we simply fail the local
+ * process. The caller, TCP Fusion, should unfuse the connection.
+ */
+boolean_t
+ip_output_verify_local(ip_xmit_attr_t *ixa)
+{
+ ire_t *ire = ixa->ixa_ire;
+
+ if (!(ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)))
+ return (B_FALSE);
+
+ return (ixa->ixa_ire->ire_generation == ixa->ixa_ire_generation);
+}
+
+/*
+ * Local process for ULP loopback, TCP Fusion. Handle both IPv4 and IPv6.
+ *
+ * The caller must call ip_output_verify_local() first. This function handles
+ * IPobs, FW_HOOKS, and/or IPsec cases sequentially.
+ */
+mblk_t *
+ip_output_process_local(mblk_t *mp, ip_xmit_attr_t *ixa, boolean_t hooks_out,
+ boolean_t hooks_in, conn_t *peer_connp)
+{
+ ill_t *ill = ixa->ixa_ire->ire_ill;
+ ipha_t *ipha = NULL;
+ ip6_t *ip6h = NULL;
+ ip_stack_t *ipst = ixa->ixa_ipst;
+ iaflags_t ixaflags = ixa->ixa_flags;
+ ip_recv_attr_t iras;
+ int error;
+
+ ASSERT(mp != NULL);
+
+ if (ixaflags & IXAF_IS_IPV4) {
+ ipha = (ipha_t *)mp->b_rptr;
+
+ /*
+ * If a callback is enabled then we need to know the
+ * source and destination zoneids for the packet. We already
+ * have those handy.
+ */
+ if (ipst->ips_ip4_observe.he_interested) {
+ zoneid_t szone, dzone;
+ zoneid_t stackzoneid;
+
+ stackzoneid = netstackid_to_zoneid(
+ ipst->ips_netstack->netstack_stackid);
+
+ if (stackzoneid == GLOBAL_ZONEID) {
+ /* Shared-IP zone */
+ dzone = ixa->ixa_ire->ire_zoneid;
+ szone = ixa->ixa_zoneid;
+ } else {
+ szone = dzone = stackzoneid;
+ }
+ ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill,
+ ipst);
+ }
+ DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
+ ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *,
+ NULL, int, 1);
+
+ /* FW_HOOKS: LOOPBACK_OUT */
+ if (hooks_out) {
+ DTRACE_PROBE4(ip4__loopback__out__start, ill_t *, NULL,
+ ill_t *, ill, ipha_t *, ipha, mblk_t *, mp);
+ FW_HOOKS(ipst->ips_ip4_loopback_out_event,
+ ipst->ips_ipv4firewall_loopback_out,
+ NULL, ill, ipha, mp, mp, 0, ipst, error);
+ DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, mp);
+ }
+ if (mp == NULL)
+ return (NULL);
+
+ /* FW_HOOKS: LOOPBACK_IN */
+ if (hooks_in) {
+ DTRACE_PROBE4(ip4__loopback__in__start, ill_t *, ill,
+ ill_t *, NULL, ipha_t *, ipha, mblk_t *, mp);
+ FW_HOOKS(ipst->ips_ip4_loopback_in_event,
+ ipst->ips_ipv4firewall_loopback_in,
+ ill, NULL, ipha, mp, mp, 0, ipst, error);
+ DTRACE_PROBE1(ip4__loopback__in__end, mblk_t *, mp);
+ }
+ if (mp == NULL)
+ return (NULL);
+
+ DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
+ ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *,
+ NULL, int, 1);
+
+		/* Inbound IPsec policies */
+ if (peer_connp != NULL) {
+ /* Map ixa to ira including IPsec policies. */
+ ipsec_out_to_in(ixa, ill, &iras);
+ mp = ipsec_check_inbound_policy(mp, peer_connp, ipha,
+ NULL, &iras);
+ }
+ } else {
+ ip6h = (ip6_t *)mp->b_rptr;
+
+ /*
+ * If a callback is enabled then we need to know the
+ * source and destination zoneids for the packet. We already
+ * have those handy.
+ */
+ if (ipst->ips_ip6_observe.he_interested) {
+ zoneid_t szone, dzone;
+ zoneid_t stackzoneid;
+
+ stackzoneid = netstackid_to_zoneid(
+ ipst->ips_netstack->netstack_stackid);
+
+ if (stackzoneid == GLOBAL_ZONEID) {
+ /* Shared-IP zone */
+ dzone = ixa->ixa_ire->ire_zoneid;
+ szone = ixa->ixa_zoneid;
+ } else {
+ szone = dzone = stackzoneid;
+ }
+ ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill,
+ ipst);
+ }
+ DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
+ ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *,
+ ip6h, int, 1);
+
+ /* FW_HOOKS: LOOPBACK_OUT */
+ if (hooks_out) {
+ DTRACE_PROBE4(ip6__loopback__out__start, ill_t *, NULL,
+ ill_t *, ill, ip6_t *, ip6h, mblk_t *, mp);
+ FW_HOOKS6(ipst->ips_ip6_loopback_out_event,
+ ipst->ips_ipv6firewall_loopback_out,
+ NULL, ill, ip6h, mp, mp, 0, ipst, error);
+ DTRACE_PROBE1(ip6__loopback__out__end, mblk_t *, mp);
+ }
+ if (mp == NULL)
+ return (NULL);
+
+ /* FW_HOOKS: LOOPBACK_IN */
+ if (hooks_in) {
+ DTRACE_PROBE4(ip6__loopback__in__start, ill_t *, ill,
+ ill_t *, NULL, ip6_t *, ip6h, mblk_t *, mp);
+ FW_HOOKS6(ipst->ips_ip6_loopback_in_event,
+ ipst->ips_ipv6firewall_loopback_in,
+ ill, NULL, ip6h, mp, mp, 0, ipst, error);
+ DTRACE_PROBE1(ip6__loopback__in__end, mblk_t *, mp);
+ }
+ if (mp == NULL)
+ return (NULL);
+
+ DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
+ ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *,
+ ip6h, int, 1);
+
+		/* Inbound IPsec policies */
+ if (peer_connp != NULL) {
+ /* Map ixa to ira including IPsec policies. */
+ ipsec_out_to_in(ixa, ill, &iras);
+ mp = ipsec_check_inbound_policy(mp, peer_connp, NULL,
+ ip6h, &iras);
+ }
+ }
+
+ if (mp == NULL) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards", NULL, ill);
+ }
+
+ return (mp);
+}
diff --git a/usr/src/uts/common/inet/ip/ip_rts.c b/usr/src/uts/common/inet/ip/ip_rts.c
index 70c8bd2ea1..228c7581a3 100644
--- a/usr/src/uts/common/inet/ip/ip_rts.c
+++ b/usr/src/uts/common/inet/ip/ip_rts.c
@@ -81,24 +81,33 @@
static size_t rts_copyfromsockaddr(struct sockaddr *sa, in6_addr_t *addrp);
static void rts_fill_msg(int type, int rtm_addrs, ipaddr_t dst,
ipaddr_t mask, ipaddr_t gateway, ipaddr_t src_addr, ipaddr_t brd_addr,
- ipaddr_t author, const ipif_t *ipif, mblk_t *mp, uint_t, const tsol_gc_t *);
+ ipaddr_t author, ipaddr_t ifaddr, const ill_t *ill, mblk_t *mp,
+ const tsol_gc_t *);
static int rts_getaddrs(rt_msghdr_t *rtm, in6_addr_t *dst_addrp,
in6_addr_t *gw_addrp, in6_addr_t *net_maskp, in6_addr_t *authorp,
in6_addr_t *if_addrp, in6_addr_t *src_addrp, ushort_t *indexp,
sa_family_t *afp, tsol_rtsecattr_t *rtsecattr, int *error);
static void rts_getifdata(if_data_t *if_data, const ipif_t *ipif);
static int rts_getmetrics(ire_t *ire, rt_metrics_t *metrics);
-static mblk_t *rts_rtmget(mblk_t *mp, ire_t *ire, ire_t *sire,
- sa_family_t af);
+static mblk_t *rts_rtmget(mblk_t *mp, ire_t *ire, ire_t *ifire,
+ const in6_addr_t *setsrc, tsol_ire_gw_secattr_t *attrp, sa_family_t af);
static void rts_setmetrics(ire_t *ire, uint_t which, rt_metrics_t *metrics);
-static void ip_rts_request_retry(ipsq_t *, queue_t *q, mblk_t *mp, void *);
+static ire_t *ire_lookup_v4(ipaddr_t dst_addr, ipaddr_t net_mask,
+ ipaddr_t gw_addr, const ill_t *ill, zoneid_t zoneid,
+ const ts_label_t *tsl, int match_flags, ip_stack_t *ipst, ire_t **pifire,
+ ipaddr_t *v4setsrcp, tsol_ire_gw_secattr_t **gwattrp);
+static ire_t *ire_lookup_v6(const in6_addr_t *dst_addr_v6,
+ const in6_addr_t *net_mask_v6, const in6_addr_t *gw_addr_v6,
+ const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl, int match_flags,
+ ip_stack_t *ipst, ire_t **pifire,
+ in6_addr_t *v6setsrcp, tsol_ire_gw_secattr_t **gwattrp);
/*
* Send `mp' to all eligible routing queues. A queue is ineligible if:
*
* 1. SO_USELOOPBACK is off and it is not the originating queue.
- * 2. RTAW_UNDER_IPMP is on and RTSQ_UNDER_IPMP is clear in `flags'.
- * 3. RTAW_UNDER_IPMP is off and RTSQ_NORMAL is clear in `flags'.
+ * 2. RTA_UNDER_IPMP is on and RTSQ_UNDER_IPMP is not set in `flags'.
+ * 3. RTA_UNDER_IPMP is off and RTSQ_NORMAL is not set in `flags'.
* 4. It is not the same address family as `af', and `af' isn't AF_UNSPEC.
*/
void
@@ -110,7 +119,7 @@ rts_queue_input(mblk_t *mp, conn_t *o_connp, sa_family_t af, uint_t flags,
/*
* Since we don't have an ill_t here, RTSQ_DEFAULT must already be
- * resolved to one or more of RTSQ_NORMAL|RTSQ_UNDER_IPMP by now.
+ * resolved to one or more of RTSQ_NORMAL|RTSQ_UNDER_IPMP at this point.
*/
ASSERT(!(flags & RTSQ_DEFAULT));
@@ -119,7 +128,6 @@ rts_queue_input(mblk_t *mp, conn_t *o_connp, sa_family_t af, uint_t flags,
for (; connp != NULL; connp = next_connp) {
next_connp = connp->conn_next;
-
/*
* If there was a family specified when this routing socket was
* created and it doesn't match the family of the message to
@@ -139,28 +147,27 @@ rts_queue_input(mblk_t *mp, conn_t *o_connp, sa_family_t af, uint_t flags,
if (!(flags & RTSQ_NORMAL))
continue;
}
-
/*
* For the originating queue, we only copy the message upstream
* if loopback is set. For others reading on the routing
* socket, we check if there is room upstream for a copy of the
* message.
*/
- if ((o_connp == connp) && connp->conn_loopback == 0) {
+ if ((o_connp == connp) && connp->conn_useloopback == 0) {
connp = connp->conn_next;
continue;
}
CONN_INC_REF(connp);
mutex_exit(&ipst->ips_rts_clients->connf_lock);
/* Pass to rts_input */
- if ((IPCL_IS_NONSTR(connp) && !PROTO_FLOW_CNTRLD(connp))||
- (!IPCL_IS_NONSTR(connp) &&
- canputnext(CONNP_TO_RQ(connp)))) {
+ if (IPCL_IS_NONSTR(connp) ? !connp->conn_flow_cntrld :
+ canputnext(connp->conn_rq)) {
mp1 = dupmsg(mp);
if (mp1 == NULL)
mp1 = copymsg(mp);
+ /* Note that we pass a NULL ira to rts_input */
if (mp1 != NULL)
- (connp->conn_recv)(connp, mp1, NULL);
+ (connp->conn_recv)(connp, mp1, NULL, NULL);
}
mutex_enter(&ipst->ips_rts_clients->connf_lock);
@@ -176,7 +183,7 @@ rts_queue_input(mblk_t *mp, conn_t *o_connp, sa_family_t af, uint_t flags,
* Takes an ire and sends an ack to all the routing sockets. This
* routine is used
* - when a route is created/deleted through the ioctl interface.
- * - when ire_expire deletes a stale redirect
+ * - when a stale redirect is deleted
*/
void
ip_rts_rtmsg(int type, ire_t *ire, int error, ip_stack_t *ipst)
@@ -192,6 +199,8 @@ ip_rts_rtmsg(int type, ire_t *ire, int error, ip_stack_t *ipst)
ASSERT(ire->ire_ipversion == IPV4_VERSION ||
ire->ire_ipversion == IPV6_VERSION);
+ ASSERT(!(ire->ire_type & IRE_IF_CLONE));
+
if (ire->ire_flags & RTF_SETSRC)
rtm_addrs |= RTA_SRC;
@@ -202,8 +211,8 @@ ip_rts_rtmsg(int type, ire_t *ire, int error, ip_stack_t *ipst)
if (mp == NULL)
return;
rts_fill_msg(type, rtm_addrs, ire->ire_addr, ire->ire_mask,
- ire->ire_gateway_addr, ire->ire_src_addr, 0, 0, NULL, mp,
- 0, NULL);
+ ire->ire_gateway_addr, ire->ire_setsrc_addr, 0, 0, 0, NULL,
+ mp, NULL);
break;
case IPV6_VERSION:
af = AF_INET6;
@@ -215,8 +224,8 @@ ip_rts_rtmsg(int type, ire_t *ire, int error, ip_stack_t *ipst)
mutex_exit(&ire->ire_lock);
rts_fill_msg_v6(type, rtm_addrs, &ire->ire_addr_v6,
&ire->ire_mask_v6, &gw_addr_v6,
- &ire->ire_src_addr_v6, &ipv6_all_zeros, &ipv6_all_zeros,
- NULL, mp, 0, NULL);
+ &ire->ire_setsrc_addr_v6, &ipv6_all_zeros, &ipv6_all_zeros,
+ &ipv6_all_zeros, NULL, mp, NULL);
break;
}
rtm = (rt_msghdr_t *)mp->b_rptr;
@@ -230,13 +239,6 @@ ip_rts_rtmsg(int type, ire_t *ire, int error, ip_stack_t *ipst)
rts_queue_input(mp, NULL, af, RTSQ_ALL, ipst);
}
-/* ARGSUSED */
-static void
-ip_rts_request_retry(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp, void *dummy)
-{
- (void) ip_rts_request(q, mp, msg_getcred(mp, NULL));
-}
-
/*
* This is a call from the RTS module
* indicating that this is a Routing Socket
@@ -248,7 +250,7 @@ ip_rts_register(conn_t *connp)
{
ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
- connp->conn_loopback = 1;
+ connp->conn_useloopback = 1;
ipcl_hash_insert_wildcard(ipst->ips_rts_clients, connp);
}
@@ -269,18 +271,9 @@ ip_rts_unregister(conn_t *connp)
*
* In general, this function does not consume the message supplied but rather
* sends the message upstream with an appropriate UNIX errno.
- *
- * We may need to restart this operation if the ipif cannot be looked up
- * due to an exclusive operation that is currently in progress. The restart
- * entry point is ip_rts_request_retry. While the request is enqueud in the
- * ipsq the ioctl could be aborted and the conn close. To ensure that we don't
- * have stale conn pointers, ip_wput_ioctl does a conn refhold. This is
- * released at the completion of the rts ioctl at the end of this function
- * by calling CONN_OPER_PENDING_DONE or when the ioctl is aborted and
- * conn close occurs in conn_ioctl_cleanup.
*/
int
-ip_rts_request_common(queue_t *q, mblk_t *mp, conn_t *connp, cred_t *ioc_cr)
+ip_rts_request_common(mblk_t *mp, conn_t *connp, cred_t *ioc_cr)
{
rt_msghdr_t *rtm = NULL;
in6_addr_t dst_addr_v6;
@@ -289,9 +282,12 @@ ip_rts_request_common(queue_t *q, mblk_t *mp, conn_t *connp, cred_t *ioc_cr)
in6_addr_t net_mask_v6;
in6_addr_t author_v6;
in6_addr_t if_addr_v6;
- mblk_t *mp1, *ioc_mp = mp;
+ mblk_t *mp1;
ire_t *ire = NULL;
- ire_t *sire = NULL;
+ ire_t *ifire = NULL;
+ ipaddr_t v4setsrc;
+ in6_addr_t v6setsrc = ipv6_all_zeros;
+ tsol_ire_gw_secattr_t *gwattr = NULL;
int error = 0;
int match_flags = MATCH_IRE_DSTONLY;
int match_flags_local = MATCH_IRE_TYPE | MATCH_IRE_GW;
@@ -302,9 +298,6 @@ ip_rts_request_common(queue_t *q, mblk_t *mp, conn_t *connp, cred_t *ioc_cr)
ipaddr_t src_addr;
ipaddr_t net_mask;
ushort_t index;
- ipif_t *ipif = NULL;
- ipif_t *tmp_ipif = NULL;
- IOCP iocp = (IOCP)mp->b_rptr;
boolean_t gcgrp_xtraref = B_FALSE;
tsol_gcgrp_addr_t ga;
tsol_rtsecattr_t rtsecattr;
@@ -314,42 +307,11 @@ ip_rts_request_common(queue_t *q, mblk_t *mp, conn_t *connp, cred_t *ioc_cr)
ts_label_t *tsl = NULL;
zoneid_t zoneid;
ip_stack_t *ipst;
-
- ip1dbg(("ip_rts_request: mp is %x\n", DB_TYPE(mp)));
+ ill_t *ill = NULL;
zoneid = connp->conn_zoneid;
ipst = connp->conn_netstack->netstack_ip;
- ASSERT(mp->b_cont != NULL);
- /* ioc_mp holds mp */
- mp = mp->b_cont;
-
- /*
- * The Routing Socket data starts on
- * next block. If there is no next block
- * this is an indication from routing module
- * that it is a routing socket stream queue.
- * We need to support that for compatibility with SDP since
- * it has a contract private interface to use IP_IOC_RTS_REQUEST.
- */
- if (mp->b_cont == NULL) {
- /*
- * This is a message from SDP
- * indicating that this is a Routing Socket
- * Stream. Insert this conn_t in routing
- * socket client list.
- */
- connp->conn_loopback = 1;
- ipcl_hash_insert_wildcard(ipst->ips_rts_clients, connp);
- goto done;
- }
- mp1 = dupmsg(mp->b_cont);
- if (mp1 == NULL) {
- error = ENOBUFS;
- goto done;
- }
- mp = mp1;
-
if (mp->b_cont != NULL && !pullupmsg(mp, -1)) {
freemsg(mp);
error = EINVAL;
@@ -446,20 +408,13 @@ ip_rts_request_common(queue_t *q, mblk_t *mp, conn_t *connp, cred_t *ioc_cr)
*/
ASSERT(af == AF_INET || af == AF_INET6);
+ /* Handle RTA_IFP */
if (index != 0) {
- ill_t *ill;
+ ipif_t *ipif;
lookup:
- /*
- * IPC must be refheld somewhere in ip_wput_nondata or
- * ip_wput_ioctl etc... and cleaned up if ioctl is killed.
- * If ILL_CHANGING the request is queued in the ipsq.
- */
- ill = ill_lookup_on_ifindex(index, af == AF_INET6,
- CONNP_TO_WQ(connp), ioc_mp, ip_rts_request_retry, &error,
- ipst);
+ ill = ill_lookup_on_ifindex(index, af == AF_INET6, ipst);
if (ill == NULL) {
- if (error != EINPROGRESS)
- error = EINVAL;
+ error = EINVAL;
goto done;
}
@@ -474,13 +429,13 @@ lookup:
switch (rtm->rtm_type) {
case RTM_CHANGE:
case RTM_DELETE:
- ill_refrele(ill);
error = EINVAL;
goto done;
case RTM_ADD:
index = ipmp_ill_get_ipmp_ifindex(ill);
ill_refrele(ill);
if (index == 0) {
+ ill = NULL; /* already refrele'd */
error = EINVAL;
goto done;
}
@@ -488,9 +443,18 @@ lookup:
}
}
- ipif = ipif_get_next_ipif(NULL, ill);
- ill_refrele(ill);
match_flags |= MATCH_IRE_ILL;
+ /*
+ * This provides the same zoneid as in Solaris 10
+ * that -ifp picks the zoneid from the first ipif on the ill.
+ * But it might not be useful since the first ipif will always
+ * have the same zoneid as the ill.
+ */
+ ipif = ipif_get_next_ipif(NULL, ill);
+ if (ipif != NULL) {
+ zoneid = ipif->ipif_zoneid;
+ ipif_refrele(ipif);
+ }
}
/*
@@ -545,6 +509,8 @@ lookup:
switch (af) {
case AF_INET:
if (src_addr != INADDR_ANY) {
+ uint_t type;
+
/*
* The RTF_SETSRC flag is present, check that
* the supplied src address is not the loopback
@@ -556,20 +522,11 @@ lookup:
}
/*
* Also check that the supplied address is a
- * valid, local one.
+ * valid, local one. Only allow IFF_UP ones
*/
- tmp_ipif = ipif_lookup_addr(src_addr, NULL,
- ALL_ZONES, CONNP_TO_WQ(connp), ioc_mp,
- ip_rts_request_retry, &error, ipst);
- if (tmp_ipif == NULL) {
- if (error != EINPROGRESS)
- error = EADDRNOTAVAIL;
- goto done;
- }
- if (!(tmp_ipif->ipif_flags & IPIF_UP) ||
- (tmp_ipif->ipif_flags &
- (IPIF_NOLOCAL | IPIF_ANYCAST))) {
- error = EINVAL;
+ type = ip_type_v4(src_addr, ipst);
+ if (!(type & (IRE_LOCAL|IRE_LOOPBACK))) {
+ error = EADDRNOTAVAIL;
goto done;
}
} else {
@@ -584,14 +541,15 @@ lookup:
}
error = ip_rt_add(dst_addr, net_mask, gw_addr, src_addr,
- rtm->rtm_flags, ipif, &ire, B_FALSE,
- WR(q), ioc_mp, ip_rts_request_retry,
- rtsap, ipst);
- if (ipif != NULL)
- ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock));
+ rtm->rtm_flags, ill, &ire, B_FALSE,
+ rtsap, ipst, zoneid);
+ if (ill != NULL)
+ ASSERT(!MUTEX_HELD(&ill->ill_lock));
break;
case AF_INET6:
if (!IN6_IS_ADDR_UNSPECIFIED(&src_addr_v6)) {
+ uint_t type;
+
/*
* The RTF_SETSRC flag is present, check that
* the supplied src address is not the loopback
@@ -603,28 +561,17 @@ lookup:
}
/*
* Also check that the supplied address is a
- * valid, local one.
+ * valid, local one. Only allow UP ones.
*/
- tmp_ipif = ipif_lookup_addr_v6(&src_addr_v6,
- NULL, ALL_ZONES, CONNP_TO_WQ(connp), ioc_mp,
- ip_rts_request_retry, &error, ipst);
- if (tmp_ipif == NULL) {
- if (error != EINPROGRESS)
- error = EADDRNOTAVAIL;
- goto done;
- }
-
- if (!(tmp_ipif->ipif_flags & IPIF_UP) ||
- (tmp_ipif->ipif_flags &
- (IPIF_NOLOCAL | IPIF_ANYCAST))) {
- error = EINVAL;
+ type = ip_type_v6(&src_addr_v6, ipst);
+ if (!(type & (IRE_LOCAL|IRE_LOOPBACK))) {
+ error = EADDRNOTAVAIL;
goto done;
}
error = ip_rt_add_v6(&dst_addr_v6, &net_mask_v6,
&gw_addr_v6, &src_addr_v6, rtm->rtm_flags,
- ipif, &ire, WR(q), ioc_mp,
- ip_rts_request_retry, rtsap, ipst);
+ ill, &ire, rtsap, ipst, zoneid);
break;
}
/*
@@ -637,10 +584,9 @@ lookup:
}
error = ip_rt_add_v6(&dst_addr_v6, &net_mask_v6,
&gw_addr_v6, NULL, rtm->rtm_flags,
- ipif, &ire, WR(q), ioc_mp,
- ip_rts_request_retry, rtsap, ipst);
- if (ipif != NULL)
- ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock));
+ ill, &ire, rtsap, ipst, zoneid);
+ if (ill != NULL)
+ ASSERT(!MUTEX_HELD(&ill->ill_lock));
break;
}
if (error != 0)
@@ -666,13 +612,13 @@ lookup:
switch (af) {
case AF_INET:
error = ip_rt_delete(dst_addr, net_mask, gw_addr,
- found_addrs, rtm->rtm_flags, ipif, B_FALSE,
- WR(q), ioc_mp, ip_rts_request_retry, ipst);
+ found_addrs, rtm->rtm_flags, ill, B_FALSE,
+ ipst, zoneid);
break;
case AF_INET6:
error = ip_rt_delete_v6(&dst_addr_v6, &net_mask_v6,
- &gw_addr_v6, found_addrs, rtm->rtm_flags, ipif,
- WR(q), ioc_mp, ip_rts_request_retry, ipst);
+ &gw_addr_v6, found_addrs, rtm->rtm_flags, ill,
+ ipst, zoneid);
break;
}
break;
@@ -680,8 +626,7 @@ lookup:
case RTM_CHANGE:
/*
* In the case of RTM_GET, the forwarding table should be
- * searched recursively with default being matched if the
- * specific route doesn't exist. Also, if a gateway was
+ * searched recursively. Also, if a gateway was
* specified then the gateway address must also be matched.
*
* In the case of RTM_CHANGE, the gateway address (if supplied)
@@ -706,9 +651,7 @@ lookup:
}
if (rtm->rtm_type == RTM_GET) {
- match_flags |=
- (MATCH_IRE_DEFAULT | MATCH_IRE_RECURSIVE |
- MATCH_IRE_SECATTR);
+ match_flags |= MATCH_IRE_SECATTR;
match_flags_local |= MATCH_IRE_SECATTR;
if ((found_addrs & RTA_GATEWAY) != 0)
match_flags |= MATCH_IRE_GW;
@@ -749,57 +692,34 @@ lookup:
* IRE_LOCAL entry.
*
* If we didn't check for or find an IRE_LOOPBACK or IRE_LOCAL
- * entry, then look in the forwarding table.
+ * entry, then look for any other type of IRE.
*/
switch (af) {
case AF_INET:
if (net_mask == IP_HOST_MASK) {
- ire = ire_ctable_lookup(dst_addr, gw_addr,
+ ire = ire_ftable_lookup_v4(dst_addr, 0, gw_addr,
IRE_LOCAL | IRE_LOOPBACK, NULL, zoneid,
- tsl, match_flags_local, ipst);
- /*
- * If we found an IRE_LOCAL, make sure
- * it is one that would be used by this
- * zone to send packets.
- */
- if (ire != NULL &&
- ire->ire_type == IRE_LOCAL &&
- ipst->ips_ip_restrict_interzone_loopback &&
- !ire_local_ok_across_zones(ire,
- zoneid, &dst_addr, tsl, ipst)) {
- ire_refrele(ire);
- ire = NULL;
- }
+ tsl, match_flags_local, 0, ipst, NULL);
}
if (ire == NULL) {
- ire = ire_ftable_lookup(dst_addr, net_mask,
- gw_addr, 0, ipif, &sire, zoneid, 0,
- tsl, match_flags, ipst);
+ ire = ire_lookup_v4(dst_addr, net_mask,
+ gw_addr, ill, zoneid, tsl, match_flags,
+ ipst, &ifire, &v4setsrc, &gwattr);
+ IN6_IPADDR_TO_V4MAPPED(v4setsrc, &v6setsrc);
}
break;
case AF_INET6:
if (IN6_ARE_ADDR_EQUAL(&net_mask_v6, &ipv6_all_ones)) {
- ire = ire_ctable_lookup_v6(&dst_addr_v6,
+ ire = ire_ftable_lookup_v6(&dst_addr_v6, NULL,
&gw_addr_v6, IRE_LOCAL | IRE_LOOPBACK, NULL,
- zoneid, tsl, match_flags_local, ipst);
- /*
- * If we found an IRE_LOCAL, make sure
- * it is one that would be used by this
- * zone to send packets.
- */
- if (ire != NULL &&
- ire->ire_type == IRE_LOCAL &&
- ipst->ips_ip_restrict_interzone_loopback &&
- !ire_local_ok_across_zones(ire,
- zoneid, (void *)&dst_addr_v6, tsl, ipst)) {
- ire_refrele(ire);
- ire = NULL;
- }
+ zoneid, tsl, match_flags_local, 0, ipst,
+ NULL);
}
if (ire == NULL) {
- ire = ire_ftable_lookup_v6(&dst_addr_v6,
- &net_mask_v6, &gw_addr_v6, 0, ipif, &sire,
- zoneid, 0, tsl, match_flags, ipst);
+ ire = ire_lookup_v6(&dst_addr_v6,
+ &net_mask_v6, &gw_addr_v6, ill, zoneid,
+ tsl, match_flags, ipst, &ifire, &v6setsrc,
+ &gwattr);
}
break;
}
@@ -810,10 +730,21 @@ lookup:
error = ESRCH;
goto done;
}
+ /*
+ * Want to return failure if we get an IRE_NOROUTE from
+ * ire_route_recursive
+ */
+ if (ire->ire_type & IRE_NOROUTE) {
+ ire_refrele(ire);
+ ire = NULL;
+ error = ESRCH;
+ goto done;
+ }
+
/* we know the IRE before we come here */
switch (rtm->rtm_type) {
case RTM_GET:
- mp1 = rts_rtmget(mp, ire, sire, af);
+ mp1 = rts_rtmget(mp, ire, ifire, &v6setsrc, gwattr, af);
if (mp1 == NULL) {
error = ENOBUFS;
goto done;
@@ -843,7 +774,6 @@ lookup:
*/
switch (af) {
case AF_INET:
- ire_flush_cache_v4(ire, IRE_FLUSH_DELETE);
if ((found_addrs & RTA_GATEWAY) != 0 &&
(ire->ire_gateway_addr != gw_addr)) {
ire->ire_gateway_addr = gw_addr;
@@ -863,9 +793,10 @@ lookup:
if ((found_addrs & RTA_SRC) != 0 &&
(rtm->rtm_flags & RTF_SETSRC) != 0 &&
- (ire->ire_src_addr != src_addr)) {
-
+ (ire->ire_setsrc_addr != src_addr)) {
if (src_addr != INADDR_ANY) {
+ uint_t type;
+
/*
* The RTF_SETSRC flag is
* present, check that the
@@ -880,50 +811,47 @@ lookup:
goto done;
}
/*
- * Also check that the the
+ * Also check that the
* supplied addr is a valid
* local address.
*/
- tmp_ipif = ipif_lookup_addr(
- src_addr, NULL, ALL_ZONES,
- WR(q), ioc_mp,
- ip_rts_request_retry,
- &error, ipst);
- if (tmp_ipif == NULL) {
- error = (error ==
- EINPROGRESS) ?
- error :
- EADDRNOTAVAIL;
- goto done;
- }
-
- if (!(tmp_ipif->ipif_flags &
- IPIF_UP) ||
- (tmp_ipif->ipif_flags &
- (IPIF_NOLOCAL |
- IPIF_ANYCAST))) {
- error = EINVAL;
+ type = ip_type_v4(src_addr,
+ ipst);
+ if (!(type &
+ (IRE_LOCAL|IRE_LOOPBACK))) {
+ error = EADDRNOTAVAIL;
goto done;
}
ire->ire_flags |= RTF_SETSRC;
+ ire->ire_setsrc_addr =
+ src_addr;
} else {
ire->ire_flags &= ~RTF_SETSRC;
+ ire->ire_setsrc_addr =
+ INADDR_ANY;
}
- ire->ire_src_addr = src_addr;
+ /*
+ * Let conn_ixa caching know that
+ * source address selection changed
+ */
+ ip_update_source_selection(ipst);
}
+ ire_flush_cache_v4(ire, IRE_FLUSH_GWCHANGE);
break;
case AF_INET6:
- ire_flush_cache_v6(ire, IRE_FLUSH_DELETE);
mutex_enter(&ire->ire_lock);
if ((found_addrs & RTA_GATEWAY) != 0 &&
!IN6_ARE_ADDR_EQUAL(
&ire->ire_gateway_addr_v6, &gw_addr_v6)) {
ire->ire_gateway_addr_v6 = gw_addr_v6;
}
+ mutex_exit(&ire->ire_lock);
if (rtsap != NULL) {
ga.ga_af = AF_INET6;
+ mutex_enter(&ire->ire_lock);
ga.ga_addr = ire->ire_gateway_addr_v6;
+ mutex_exit(&ire->ire_lock);
gcgrp = gcgrp_lookup(&ga, B_TRUE);
if (gcgrp == NULL) {
@@ -935,10 +863,11 @@ lookup:
if ((found_addrs & RTA_SRC) != 0 &&
(rtm->rtm_flags & RTF_SETSRC) != 0 &&
!IN6_ARE_ADDR_EQUAL(
- &ire->ire_src_addr_v6, &src_addr_v6)) {
-
+ &ire->ire_setsrc_addr_v6, &src_addr_v6)) {
if (!IN6_IS_ADDR_UNSPECIFIED(
&src_addr_v6)) {
+ uint_t type;
+
/*
* The RTF_SETSRC flag is
* present, check that the
@@ -949,54 +878,44 @@ lookup:
*/
if (IN6_IS_ADDR_LOOPBACK(
&src_addr_v6)) {
- mutex_exit(
- &ire->ire_lock);
error = EINVAL;
goto done;
}
/*
- * Also check that the the
+ * Also check that the
* supplied addr is a valid
* local address.
*/
- tmp_ipif = ipif_lookup_addr_v6(
- &src_addr_v6, NULL,
- ALL_ZONES,
- CONNP_TO_WQ(connp), ioc_mp,
- ip_rts_request_retry,
- &error, ipst);
- if (tmp_ipif == NULL) {
- mutex_exit(
- &ire->ire_lock);
- error = (error ==
- EINPROGRESS) ?
- error :
- EADDRNOTAVAIL;
- goto done;
- }
- if (!(tmp_ipif->ipif_flags &
- IPIF_UP) ||
- (tmp_ipif->ipif_flags &
- (IPIF_NOLOCAL |
- IPIF_ANYCAST))) {
- mutex_exit(
- &ire->ire_lock);
- error = EINVAL;
+ type = ip_type_v6(&src_addr_v6,
+ ipst);
+ if (!(type &
+ (IRE_LOCAL|IRE_LOOPBACK))) {
+ error = EADDRNOTAVAIL;
goto done;
}
+ mutex_enter(&ire->ire_lock);
ire->ire_flags |= RTF_SETSRC;
+ ire->ire_setsrc_addr_v6 =
+ src_addr_v6;
+ mutex_exit(&ire->ire_lock);
} else {
+ mutex_enter(&ire->ire_lock);
ire->ire_flags &= ~RTF_SETSRC;
+ ire->ire_setsrc_addr_v6 =
+ ipv6_all_zeros;
+ mutex_exit(&ire->ire_lock);
}
- ire->ire_src_addr_v6 = src_addr_v6;
+ /*
+ * Let conn_ixa caching know that
+ * source address selection changed
+ */
+ ip_update_source_selection(ipst);
}
- mutex_exit(&ire->ire_lock);
+ ire_flush_cache_v6(ire, IRE_FLUSH_GWCHANGE);
break;
}
if (rtsap != NULL) {
- in_addr_t ga_addr4;
-
ASSERT(gcgrp != NULL);
/*
@@ -1010,7 +929,7 @@ lookup:
gc = gc_create(rtsap, gcgrp, &gcgrp_xtraref);
if (gc == NULL ||
(error = tsol_ire_init_gwattr(ire,
- ire->ire_ipversion, gc, NULL)) != 0) {
+ ire->ire_ipversion, gc)) != 0) {
if (gc != NULL) {
GC_REFRELE(gc);
} else {
@@ -1019,21 +938,6 @@ lookup:
}
goto done;
}
-
- /*
- * Now delete any existing gateway IRE caches
- * as well as all caches using the gateway,
- * and allow them to be created on demand
- * through ip_newroute{_v6}.
- */
- IN6_V4MAPPED_TO_IPADDR(&ga.ga_addr, ga_addr4);
- if (af == AF_INET) {
- ire_clookup_delete_cache_gw(
- ga_addr4, ALL_ZONES, ipst);
- } else {
- ire_clookup_delete_cache_gw_v6(
- &ga.ga_addr, ALL_ZONES, ipst);
- }
}
rts_setmetrics(ire, rtm->rtm_inits, &rtm->rtm_rmx);
break;
@@ -1046,21 +950,14 @@ lookup:
done:
if (ire != NULL)
ire_refrele(ire);
- if (sire != NULL)
- ire_refrele(sire);
- if (ipif != NULL)
- ipif_refrele(ipif);
- if (tmp_ipif != NULL)
- ipif_refrele(tmp_ipif);
+ if (ifire != NULL)
+ ire_refrele(ifire);
+ if (ill != NULL)
+ ill_refrele(ill);
if (gcgrp_xtraref)
GCGRP_REFRELE(gcgrp);
- if (error == EINPROGRESS) {
- if (rtm != NULL)
- freemsg(mp);
- return (error);
- }
if (rtm != NULL) {
ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
if (error != 0) {
@@ -1074,12 +971,190 @@ done:
}
rts_queue_input(mp, connp, af, RTSQ_ALL, ipst);
}
+ return (error);
+}
+
+/*
+ * Helper function that can do recursive lookups including when
+ * MATCH_IRE_GW and/or MATCH_IRE_MASK is set.
+ */
+static ire_t *
+ire_lookup_v4(ipaddr_t dst_addr, ipaddr_t net_mask, ipaddr_t gw_addr,
+ const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl,
+ int match_flags, ip_stack_t *ipst, ire_t **pifire, ipaddr_t *v4setsrcp,
+ tsol_ire_gw_secattr_t **gwattrp)
+{
+ ire_t *ire;
+ ire_t *ifire = NULL;
+ uint_t ire_type;
+
+ *pifire = NULL;
+ *v4setsrcp = INADDR_ANY;
+ *gwattrp = NULL;
+
+ /* Skip IRE_IF_CLONE */
+ match_flags |= MATCH_IRE_TYPE;
+ ire_type = (IRE_ONLINK|IRE_OFFLINK) & ~IRE_IF_CLONE;
+
+ /*
+ * ire_route_recursive can't match gateway or mask thus if they are
+ * set we have to do two steps of lookups
+ */
+ if (match_flags & (MATCH_IRE_GW|MATCH_IRE_MASK)) {
+ ire = ire_ftable_lookup_v4(dst_addr, net_mask, gw_addr,
+ ire_type, ill, zoneid, tsl, match_flags, 0, ipst, NULL);
+
+ if (ire == NULL ||(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)))
+ return (ire);
+
+ if (ire->ire_type & IRE_ONLINK)
+ return (ire);
+
+ if (ire->ire_flags & RTF_SETSRC) {
+ ASSERT(ire->ire_setsrc_addr != INADDR_ANY);
+ *v4setsrcp = ire->ire_setsrc_addr;
+ v4setsrcp = NULL;
+ }
+
+ /* The first ire_gw_secattr is passed back */
+ if (ire->ire_gw_secattr != NULL) {
+ *gwattrp = ire->ire_gw_secattr;
+ gwattrp = NULL;
+ }
+
+ /* Look for an interface ire recursively based on the gateway */
+ dst_addr = ire->ire_gateway_addr;
+ match_flags &= ~(MATCH_IRE_GW|MATCH_IRE_MASK);
+ ifire = ire_route_recursive_v4(dst_addr, ire_type, ill, zoneid,
+ tsl, match_flags, B_FALSE, 0, ipst, v4setsrcp, gwattrp,
+ NULL);
+ } else {
+ ire = ire_route_recursive_v4(dst_addr, ire_type, ill, zoneid,
+ tsl, match_flags, B_FALSE, 0, ipst, v4setsrcp, gwattrp,
+ NULL);
+ }
+ *pifire = ifire;
+ return (ire);
+}
+
+static ire_t *
+ire_lookup_v6(const in6_addr_t *dst_addr_v6,
+ const in6_addr_t *net_mask_v6, const in6_addr_t *gw_addr_v6,
+ const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl, int match_flags,
+ ip_stack_t *ipst, ire_t **pifire,
+ in6_addr_t *v6setsrcp, tsol_ire_gw_secattr_t **gwattrp)
+{
+ ire_t *ire;
+ ire_t *ifire = NULL;
+ uint_t ire_type;
+
+ *pifire = NULL;
+ *v6setsrcp = ipv6_all_zeros;
+ *gwattrp = NULL;
+
+ /* Skip IRE_IF_CLONE */
+ match_flags |= MATCH_IRE_TYPE;
+ ire_type = (IRE_ONLINK|IRE_OFFLINK) & ~IRE_IF_CLONE;
+
+ /*
+ * ire_route_recursive can't match gateway or mask thus if they are
+ * set we have to do two steps of lookups
+ */
+ if (match_flags & (MATCH_IRE_GW|MATCH_IRE_MASK)) {
+ in6_addr_t dst;
+
+ ire = ire_ftable_lookup_v6(dst_addr_v6, net_mask_v6,
+ gw_addr_v6, ire_type, ill, zoneid, tsl, match_flags, 0,
+ ipst, NULL);
+
+ if (ire == NULL ||(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)))
+ return (ire);
+
+ if (ire->ire_type & IRE_ONLINK)
+ return (ire);
+
+ if (ire->ire_flags & RTF_SETSRC) {
+ ASSERT(!IN6_IS_ADDR_UNSPECIFIED(
+ &ire->ire_setsrc_addr_v6));
+ *v6setsrcp = ire->ire_setsrc_addr_v6;
+ v6setsrcp = NULL;
+ }
+
+ /* The first ire_gw_secattr is passed back */
+ if (ire->ire_gw_secattr != NULL) {
+ *gwattrp = ire->ire_gw_secattr;
+ gwattrp = NULL;
+ }
+
+ mutex_enter(&ire->ire_lock);
+ dst = ire->ire_gateway_addr_v6;
+ mutex_exit(&ire->ire_lock);
+ match_flags &= ~(MATCH_IRE_GW|MATCH_IRE_MASK);
+ ifire = ire_route_recursive_v6(&dst, ire_type, ill, zoneid, tsl,
+ match_flags, B_FALSE, 0, ipst, v6setsrcp, gwattrp, NULL);
+ } else {
+ ire = ire_route_recursive_v6(dst_addr_v6, ire_type, ill, zoneid,
+ tsl, match_flags, B_FALSE, 0, ipst, v6setsrcp, gwattrp,
+ NULL);
+ }
+ *pifire = ifire;
+ return (ire);
+}
+
+
+/*
+ * Handle IP_IOC_RTS_REQUEST ioctls
+ */
+int
+ip_rts_request(queue_t *q, mblk_t *mp, cred_t *ioc_cr)
+{
+ conn_t *connp = Q_TO_CONN(q);
+ IOCP iocp = (IOCP)mp->b_rptr;
+ mblk_t *mp1, *ioc_mp = mp;
+ int error = 0;
+ ip_stack_t *ipst;
+ ipst = connp->conn_netstack->netstack_ip;
+
+ ASSERT(mp->b_cont != NULL);
+ /* ioc_mp holds mp */
+ mp = mp->b_cont;
+
+ /*
+ * The Routing Socket data starts on
+ * next block. If there is no next block
+ * this is an indication from routing module
+ * that it is a routing socket stream queue.
+ * We need to support that for compatibility with SDP since
+ * it has a contract private interface to use IP_IOC_RTS_REQUEST.
+ * Note: SDP no longer uses IP_IOC_RTS_REQUEST - we can remove this.
+ */
+ if (mp->b_cont == NULL) {
+ /*
+ * This is a message from SDP
+ * indicating that this is a Routing Socket
+ * Stream. Insert this conn_t in routing
+ * socket client list.
+ */
+ connp->conn_useloopback = 1;
+ ipcl_hash_insert_wildcard(ipst->ips_rts_clients, connp);
+ goto done;
+ }
+ mp1 = dupmsg(mp->b_cont);
+ if (mp1 == NULL) {
+ error = ENOBUFS;
+ goto done;
+ }
+ mp = mp1;
+
+ error = ip_rts_request_common(mp, connp, ioc_cr);
+done:
iocp->ioc_error = error;
ioc_mp->b_datap->db_type = M_IOCACK;
if (iocp->ioc_error != 0)
iocp->ioc_count = 0;
- (connp->conn_recv)(connp, ioc_mp, NULL);
+ /* Note that we pass a NULL ira to rts_input */
+ (connp->conn_recv)(connp, ioc_mp, NULL, NULL);
/* conn was refheld in ip_wput_ioctl. */
CONN_OPER_PENDING_DONE(connp);
@@ -1087,12 +1162,6 @@ done:
return (error);
}
-int
-ip_rts_request(queue_t *q, mblk_t *mp, cred_t *ioc_cr)
-{
- return (ip_rts_request_common(q, mp, Q_TO_CONN(q), ioc_cr));
-}
-
/*
* Build a reply to the RTM_GET request contained in the given message block
* using the retrieved IRE of the destination address, the parent IRE (if it
@@ -1102,26 +1171,34 @@ ip_rts_request(queue_t *q, mblk_t *mp, cred_t *ioc_cr)
* otherwise NULL is returned.
*/
static mblk_t *
-rts_rtmget(mblk_t *mp, ire_t *ire, ire_t *sire, sa_family_t af)
+rts_rtmget(mblk_t *mp, ire_t *ire, ire_t *ifire, const in6_addr_t *setsrc,
+ tsol_ire_gw_secattr_t *attrp, sa_family_t af)
{
rt_msghdr_t *rtm;
rt_msghdr_t *new_rtm;
mblk_t *new_mp;
int rtm_addrs;
int rtm_flags;
- in6_addr_t gw_addr_v6;
- tsol_ire_gw_secattr_t *attrp = NULL;
tsol_gc_t *gc = NULL;
tsol_gcgrp_t *gcgrp = NULL;
- int sacnt = 0;
+ ill_t *ill;
+ ipif_t *ipif = NULL;
+ ipaddr_t brdaddr; /* IFF_POINTOPOINT destination */
+ ipaddr_t ifaddr;
+ in6_addr_t brdaddr6; /* IFF_POINTOPOINT destination */
+ in6_addr_t ifaddr6;
+ ipaddr_t v4setsrc;
- ASSERT(ire->ire_ipif != NULL);
rtm = (rt_msghdr_t *)mp->b_rptr;
- if (sire != NULL && sire->ire_gw_secattr != NULL)
- attrp = sire->ire_gw_secattr;
- else if (ire->ire_gw_secattr != NULL)
- attrp = ire->ire_gw_secattr;
+ /*
+ * Find the ill used to send packets. This will be NULL in case
+ * of a reject or blackhole.
+ */
+ if (ifire != NULL)
+ ill = ire_nexthop_ill(ifire);
+ else
+ ill = ire_nexthop_ill(ire);
if (attrp != NULL) {
mutex_enter(&attrp->igsa_lock);
@@ -1129,29 +1206,9 @@ rts_rtmget(mblk_t *mp, ire_t *ire, ire_t *sire, sa_family_t af)
gcgrp = gc->gc_grp;
ASSERT(gcgrp != NULL);
rw_enter(&gcgrp->gcgrp_rwlock, RW_READER);
- sacnt = 1;
- } else if ((gcgrp = attrp->igsa_gcgrp) != NULL) {
- rw_enter(&gcgrp->gcgrp_rwlock, RW_READER);
- gc = gcgrp->gcgrp_head;
- sacnt = gcgrp->gcgrp_count;
}
mutex_exit(&attrp->igsa_lock);
-
- /* do nothing if there's no gc to report */
- if (gc == NULL) {
- ASSERT(sacnt == 0);
- if (gcgrp != NULL) {
- /* we might as well drop the lock now */
- rw_exit(&gcgrp->gcgrp_rwlock);
- gcgrp = NULL;
- }
- attrp = NULL;
- }
-
- ASSERT(gc == NULL || (gcgrp != NULL &&
- RW_LOCK_HELD(&gcgrp->gcgrp_rwlock)));
}
- ASSERT(sacnt == 0 || gc != NULL);
/*
* Always return RTA_DST, RTA_GATEWAY and RTA_NETMASK.
@@ -1162,16 +1219,36 @@ rts_rtmget(mblk_t *mp, ire_t *ire, ire_t *sire, sa_family_t af)
* point-to-point.
*/
rtm_addrs = (RTA_DST | RTA_GATEWAY | RTA_NETMASK);
- if (rtm->rtm_addrs & (RTA_IFP | RTA_IFA)) {
+ if ((rtm->rtm_addrs & (RTA_IFP | RTA_IFA)) && ill != NULL) {
rtm_addrs |= (RTA_IFP | RTA_IFA);
- if (ire->ire_ipif->ipif_flags & IPIF_POINTOPOINT)
- rtm_addrs |= RTA_BRD;
+ /*
+ * We associate an IRE with an ILL, hence we don't exactly
+ * know what might make sense for RTA_IFA and RTA_BRD. We
+ * pick the first ipif on the ill.
+ */
+ ipif = ipif_get_next_ipif(NULL, ill);
+ if (ipif != NULL) {
+ if (ipif->ipif_isv6)
+ ifaddr6 = ipif->ipif_v6lcl_addr;
+ else
+ ifaddr = ipif->ipif_lcl_addr;
+ if (ipif->ipif_flags & IPIF_POINTOPOINT) {
+ rtm_addrs |= RTA_BRD;
+ if (ipif->ipif_isv6)
+ brdaddr6 = ipif->ipif_v6pp_dst_addr;
+ else
+ brdaddr = ipif->ipif_pp_dst_addr;
+ }
+ ipif_refrele(ipif);
+ }
}
- new_mp = rts_alloc_msg(RTM_GET, rtm_addrs, af, sacnt);
+ new_mp = rts_alloc_msg(RTM_GET, rtm_addrs, af, gc != NULL ? 1 : 0);
if (new_mp == NULL) {
if (gcgrp != NULL)
rw_exit(&gcgrp->gcgrp_rwlock);
+ if (ill != NULL)
+ ill_refrele(ill);
return (NULL);
}
@@ -1187,49 +1264,24 @@ rts_rtmget(mblk_t *mp, ire_t *ire, ire_t *sire, sa_family_t af)
ASSERT(af == AF_INET || af == AF_INET6);
switch (af) {
case AF_INET:
- if (sire == NULL) {
- rtm_flags = ire->ire_flags;
- rts_fill_msg(RTM_GET, rtm_addrs, ire->ire_addr,
- ire->ire_mask, ire->ire_src_addr, ire->ire_src_addr,
- ire->ire_ipif->ipif_pp_dst_addr, 0, ire->ire_ipif,
- new_mp, sacnt, gc);
- } else {
- if (sire->ire_flags & RTF_SETSRC)
- rtm_addrs |= RTA_SRC;
-
- rtm_flags = sire->ire_flags;
- rts_fill_msg(RTM_GET, rtm_addrs, sire->ire_addr,
- sire->ire_mask, sire->ire_gateway_addr,
- (sire->ire_flags & RTF_SETSRC) ?
- sire->ire_src_addr : ire->ire_src_addr,
- ire->ire_ipif->ipif_pp_dst_addr,
- 0, ire->ire_ipif, new_mp, sacnt, gc);
- }
+ IN6_V4MAPPED_TO_IPADDR(setsrc, v4setsrc);
+ if (v4setsrc != INADDR_ANY)
+ rtm_addrs |= RTA_SRC;
+
+ rtm_flags = ire->ire_flags;
+ rts_fill_msg(RTM_GET, rtm_addrs, ire->ire_addr,
+ ire->ire_mask, ire->ire_gateway_addr, v4setsrc,
+ brdaddr, 0, ifaddr, ill, new_mp, gc);
break;
case AF_INET6:
- if (sire == NULL) {
- rtm_flags = ire->ire_flags;
- rts_fill_msg_v6(RTM_GET, rtm_addrs, &ire->ire_addr_v6,
- &ire->ire_mask_v6, &ire->ire_src_addr_v6,
- &ire->ire_src_addr_v6,
- &ire->ire_ipif->ipif_v6pp_dst_addr,
- &ipv6_all_zeros, ire->ire_ipif, new_mp,
- sacnt, gc);
- } else {
- if (sire->ire_flags & RTF_SETSRC)
- rtm_addrs |= RTA_SRC;
-
- rtm_flags = sire->ire_flags;
- mutex_enter(&sire->ire_lock);
- gw_addr_v6 = sire->ire_gateway_addr_v6;
- mutex_exit(&sire->ire_lock);
- rts_fill_msg_v6(RTM_GET, rtm_addrs, &sire->ire_addr_v6,
- &sire->ire_mask_v6, &gw_addr_v6,
- (sire->ire_flags & RTF_SETSRC) ?
- &sire->ire_src_addr_v6 : &ire->ire_src_addr_v6,
- &ire->ire_ipif->ipif_v6pp_dst_addr, &ipv6_all_zeros,
- ire->ire_ipif, new_mp, sacnt, gc);
- }
+ if (!IN6_IS_ADDR_UNSPECIFIED(setsrc))
+ rtm_addrs |= RTA_SRC;
+
+ rtm_flags = ire->ire_flags;
+ rts_fill_msg_v6(RTM_GET, rtm_addrs, &ire->ire_addr_v6,
+ &ire->ire_mask_v6, &ire->ire_gateway_addr_v6,
+ setsrc, &brdaddr6, &ipv6_all_zeros,
+ &ifaddr6, ill, new_mp, gc);
break;
}
@@ -1259,11 +1311,9 @@ rts_rtmget(mblk_t *mp, ire_t *ire, ire_t *sire, sa_family_t af)
new_rtm->rtm_use = rtm->rtm_use;
new_rtm->rtm_addrs = rtm_addrs;
new_rtm->rtm_flags = rtm_flags;
- if (sire == NULL)
- new_rtm->rtm_inits = rts_getmetrics(ire, &new_rtm->rtm_rmx);
- else
- new_rtm->rtm_inits = rts_getmetrics(sire, &new_rtm->rtm_rmx);
-
+ new_rtm->rtm_inits = rts_getmetrics(ire, &new_rtm->rtm_rmx);
+ if (ill != NULL)
+ ill_refrele(ill);
return (new_mp);
}
@@ -1273,10 +1323,11 @@ rts_rtmget(mblk_t *mp, ire_t *ire, ire_t *sire, sa_family_t af)
static void
rts_getifdata(if_data_t *if_data, const ipif_t *ipif)
{
- if_data->ifi_type = ipif->ipif_type; /* ethernet, tokenring, etc */
+ if_data->ifi_type = ipif->ipif_ill->ill_type;
+ /* ethernet, tokenring, etc */
if_data->ifi_addrlen = 0; /* media address length */
if_data->ifi_hdrlen = 0; /* media header length */
- if_data->ifi_mtu = ipif->ipif_mtu; /* maximum transmission unit */
+ if_data->ifi_mtu = ipif->ipif_ill->ill_mtu; /* mtu */
if_data->ifi_metric = ipif->ipif_metric; /* metric (external only) */
if_data->ifi_baudrate = 0; /* linespeed */
@@ -1302,18 +1353,19 @@ rts_setmetrics(ire_t *ire, uint_t which, rt_metrics_t *metrics)
{
clock_t rtt;
clock_t rtt_sd;
- ipif_t *ipif;
+ ill_t *ill;
ifrt_t *ifrt;
mblk_t *mp;
in6_addr_t gw_addr_v6;
+ /* Need to add back some metrics to the IRE? */
/*
- * Bypass obtaining the lock and searching ipif_saved_ire_mp in the
+ * Bypass obtaining the lock and searching ill_saved_ire_mp in the
* common case of no metrics.
*/
if (which == 0)
return;
- ire->ire_uinfo.iulp_set = B_TRUE;
+ ire->ire_metrics.iulp_set = B_TRUE;
/*
* iulp_rtt and iulp_rtt_sd are in milliseconds, but 4.4BSD-Lite2's
@@ -1330,42 +1382,41 @@ rts_setmetrics(ire_t *ire, uint_t which, rt_metrics_t *metrics)
*/
mutex_enter(&ire->ire_lock);
if (which & RTV_MTU)
- ire->ire_max_frag = metrics->rmx_mtu;
+ ire->ire_metrics.iulp_mtu = metrics->rmx_mtu;
if (which & RTV_RTT)
- ire->ire_uinfo.iulp_rtt = rtt;
+ ire->ire_metrics.iulp_rtt = rtt;
if (which & RTV_SSTHRESH)
- ire->ire_uinfo.iulp_ssthresh = metrics->rmx_ssthresh;
+ ire->ire_metrics.iulp_ssthresh = metrics->rmx_ssthresh;
if (which & RTV_RTTVAR)
- ire->ire_uinfo.iulp_rtt_sd = rtt_sd;
+ ire->ire_metrics.iulp_rtt_sd = rtt_sd;
if (which & RTV_SPIPE)
- ire->ire_uinfo.iulp_spipe = metrics->rmx_sendpipe;
+ ire->ire_metrics.iulp_spipe = metrics->rmx_sendpipe;
if (which & RTV_RPIPE)
- ire->ire_uinfo.iulp_rpipe = metrics->rmx_recvpipe;
+ ire->ire_metrics.iulp_rpipe = metrics->rmx_recvpipe;
mutex_exit(&ire->ire_lock);
/*
- * Search through the ifrt_t chain hanging off the IPIF in order to
+ * Search through the ifrt_t chain hanging off the ILL in order to
* reflect the metric change there.
*/
- ipif = ire->ire_ipif;
- if (ipif == NULL)
+ ill = ire->ire_ill;
+ if (ill == NULL)
return;
- ASSERT((ipif->ipif_isv6 && ire->ire_ipversion == IPV6_VERSION) ||
- ((!ipif->ipif_isv6 && ire->ire_ipversion == IPV4_VERSION)));
- if (ipif->ipif_isv6) {
+ ASSERT((ill->ill_isv6 && ire->ire_ipversion == IPV6_VERSION) ||
+ ((!ill->ill_isv6 && ire->ire_ipversion == IPV4_VERSION)));
+ if (ill->ill_isv6) {
mutex_enter(&ire->ire_lock);
gw_addr_v6 = ire->ire_gateway_addr_v6;
mutex_exit(&ire->ire_lock);
}
- mutex_enter(&ipif->ipif_saved_ire_lock);
- for (mp = ipif->ipif_saved_ire_mp; mp != NULL; mp = mp->b_cont) {
+ mutex_enter(&ill->ill_saved_ire_lock);
+ for (mp = ill->ill_saved_ire_mp; mp != NULL; mp = mp->b_cont) {
/*
- * On a given ipif, the triple of address, gateway and mask is
- * unique for each saved IRE (in the case of ordinary interface
- * routes, the gateway address is all-zeroes).
+ * On a given ill, the tuple of address, gateway, mask,
+ * ire_type and zoneid unique for each saved IRE.
*/
ifrt = (ifrt_t *)mp->b_rptr;
- if (ipif->ipif_isv6) {
+ if (ill->ill_isv6) {
if (!IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6addr,
&ire->ire_addr_v6) ||
!IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6gateway_addr,
@@ -1379,23 +1430,36 @@ rts_setmetrics(ire_t *ire, uint_t which, rt_metrics_t *metrics)
ifrt->ifrt_mask != ire->ire_mask)
continue;
}
+ if (ifrt->ifrt_zoneid != ire->ire_zoneid ||
+ ifrt->ifrt_type != ire->ire_type)
+ continue;
+
if (which & RTV_MTU)
- ifrt->ifrt_max_frag = metrics->rmx_mtu;
+ ifrt->ifrt_metrics.iulp_mtu = metrics->rmx_mtu;
if (which & RTV_RTT)
- ifrt->ifrt_iulp_info.iulp_rtt = rtt;
+ ifrt->ifrt_metrics.iulp_rtt = rtt;
if (which & RTV_SSTHRESH) {
- ifrt->ifrt_iulp_info.iulp_ssthresh =
+ ifrt->ifrt_metrics.iulp_ssthresh =
metrics->rmx_ssthresh;
}
if (which & RTV_RTTVAR)
- ifrt->ifrt_iulp_info.iulp_rtt_sd = metrics->rmx_rttvar;
+ ifrt->ifrt_metrics.iulp_rtt_sd = metrics->rmx_rttvar;
if (which & RTV_SPIPE)
- ifrt->ifrt_iulp_info.iulp_spipe = metrics->rmx_sendpipe;
+ ifrt->ifrt_metrics.iulp_spipe = metrics->rmx_sendpipe;
if (which & RTV_RPIPE)
- ifrt->ifrt_iulp_info.iulp_rpipe = metrics->rmx_recvpipe;
+ ifrt->ifrt_metrics.iulp_rpipe = metrics->rmx_recvpipe;
break;
}
- mutex_exit(&ipif->ipif_saved_ire_lock);
+ mutex_exit(&ill->ill_saved_ire_lock);
+
+ /*
+	 * Update any IRE_IF_CLONE entries created from this IRE_IF so they
+ * get any new iulp_mtu.
+ * We do that by deleting them; ire_create_if_clone will pick
+ * up the new metrics.
+ */
+ if ((ire->ire_type & IRE_INTERFACE) && ire->ire_dep_children != 0)
+ ire_dep_delete_if_clone(ire);
}
/*
@@ -1407,27 +1471,69 @@ rts_getmetrics(ire_t *ire, rt_metrics_t *metrics)
int metrics_set = 0;
bzero(metrics, sizeof (rt_metrics_t));
+
/*
* iulp_rtt and iulp_rtt_sd are in milliseconds, but 4.4BSD-Lite2's
* <net/route.h> says: rmx_rtt and rmx_rttvar are stored as
* microseconds.
*/
- metrics->rmx_rtt = ire->ire_uinfo.iulp_rtt * 1000;
+ metrics->rmx_rtt = ire->ire_metrics.iulp_rtt * 1000;
metrics_set |= RTV_RTT;
- metrics->rmx_mtu = ire->ire_max_frag;
+ metrics->rmx_mtu = ire->ire_metrics.iulp_mtu;
metrics_set |= RTV_MTU;
- metrics->rmx_ssthresh = ire->ire_uinfo.iulp_ssthresh;
+ metrics->rmx_ssthresh = ire->ire_metrics.iulp_ssthresh;
metrics_set |= RTV_SSTHRESH;
- metrics->rmx_rttvar = ire->ire_uinfo.iulp_rtt_sd * 1000;
+ metrics->rmx_rttvar = ire->ire_metrics.iulp_rtt_sd * 1000;
metrics_set |= RTV_RTTVAR;
- metrics->rmx_sendpipe = ire->ire_uinfo.iulp_spipe;
+ metrics->rmx_sendpipe = ire->ire_metrics.iulp_spipe;
metrics_set |= RTV_SPIPE;
- metrics->rmx_recvpipe = ire->ire_uinfo.iulp_rpipe;
+ metrics->rmx_recvpipe = ire->ire_metrics.iulp_rpipe;
metrics_set |= RTV_RPIPE;
return (metrics_set);
}
/*
+ * Given two sets of metrics (src and dst), use the dst values if they are
+ * set. If a dst value is not set but the src value is set, then we use
+ * the src value.
+ * dst is updated with the new values.
+ * This is used to merge information from a dce_t and ire_metrics, where the
+ * dce values takes precedence.
+ */
+void
+rts_merge_metrics(iulp_t *dst, const iulp_t *src)
+{
+ if (!src->iulp_set)
+ return;
+
+ if (dst->iulp_ssthresh == 0)
+ dst->iulp_ssthresh = src->iulp_ssthresh;
+ if (dst->iulp_rtt == 0)
+ dst->iulp_rtt = src->iulp_rtt;
+ if (dst->iulp_rtt_sd == 0)
+ dst->iulp_rtt_sd = src->iulp_rtt_sd;
+ if (dst->iulp_spipe == 0)
+ dst->iulp_spipe = src->iulp_spipe;
+ if (dst->iulp_rpipe == 0)
+ dst->iulp_rpipe = src->iulp_rpipe;
+ if (dst->iulp_rtomax == 0)
+ dst->iulp_rtomax = src->iulp_rtomax;
+ if (dst->iulp_sack == 0)
+ dst->iulp_sack = src->iulp_sack;
+ if (dst->iulp_tstamp_ok == 0)
+ dst->iulp_tstamp_ok = src->iulp_tstamp_ok;
+ if (dst->iulp_wscale_ok == 0)
+ dst->iulp_wscale_ok = src->iulp_wscale_ok;
+ if (dst->iulp_ecn_ok == 0)
+ dst->iulp_ecn_ok = src->iulp_ecn_ok;
+ if (dst->iulp_pmtud_ok == 0)
+ dst->iulp_pmtud_ok = src->iulp_pmtud_ok;
+ if (dst->iulp_mtu == 0)
+ dst->iulp_mtu = src->iulp_mtu;
+}
+
+
+/*
* Takes a pointer to a routing message and extracts necessary info by looking
* at the rtm->rtm_addrs bits and store the requested sockaddrs in the pointers
* passed (all of which must be valid).
@@ -1552,7 +1658,8 @@ rts_getaddrs(rt_msghdr_t *rtm, in6_addr_t *dst_addrp, in6_addr_t *gw_addrp,
static void
rts_fill_msg(int type, int rtm_addrs, ipaddr_t dst, ipaddr_t mask,
ipaddr_t gateway, ipaddr_t src_addr, ipaddr_t brd_addr, ipaddr_t author,
- const ipif_t *ipif, mblk_t *mp, uint_t sacnt, const tsol_gc_t *gc)
+ ipaddr_t ifaddr, const ill_t *ill, mblk_t *mp,
+ const tsol_gc_t *gc)
{
rt_msghdr_t *rtm;
sin_t *sin;
@@ -1561,7 +1668,6 @@ rts_fill_msg(int type, int rtm_addrs, ipaddr_t dst, ipaddr_t mask,
int i;
ASSERT(mp != NULL);
- ASSERT(sacnt == 0 || gc != NULL);
/*
* First find the type of the message
* and its length.
@@ -1571,7 +1677,7 @@ rts_fill_msg(int type, int rtm_addrs, ipaddr_t dst, ipaddr_t mask,
* Now find the size of the data
* that follows the message header.
*/
- data_size = rts_data_msg_size(rtm_addrs, AF_INET, sacnt);
+ data_size = rts_data_msg_size(rtm_addrs, AF_INET, gc != NULL ? 1 : 0);
rtm = (rt_msghdr_t *)mp->b_rptr;
mp->b_wptr = &mp->b_rptr[header_size];
@@ -1596,9 +1702,13 @@ rts_fill_msg(int type, int rtm_addrs, ipaddr_t dst, ipaddr_t mask,
cp += sizeof (sin_t);
break;
case RTA_IFP:
- cp += ill_dls_info((struct sockaddr_dl *)cp, ipif);
+ cp += ill_dls_info((struct sockaddr_dl *)cp, ill);
break;
case RTA_IFA:
+ sin->sin_addr.s_addr = ifaddr;
+ sin->sin_family = AF_INET;
+ cp += sizeof (sin_t);
+ break;
case RTA_SRC:
sin->sin_addr.s_addr = src_addr;
sin->sin_family = AF_INET;
@@ -1625,24 +1735,20 @@ rts_fill_msg(int type, int rtm_addrs, ipaddr_t dst, ipaddr_t mask,
rtm_ext_t *rtm_ext;
struct rtsa_s *rp_dst;
tsol_rtsecattr_t *rsap;
- int i;
ASSERT(gc->gc_grp != NULL);
ASSERT(RW_LOCK_HELD(&gc->gc_grp->gcgrp_rwlock));
- ASSERT(sacnt > 0);
rtm_ext = (rtm_ext_t *)cp;
rtm_ext->rtmex_type = RTMEX_GATEWAY_SECATTR;
- rtm_ext->rtmex_len = TSOL_RTSECATTR_SIZE(sacnt);
+ rtm_ext->rtmex_len = TSOL_RTSECATTR_SIZE(1);
rsap = (tsol_rtsecattr_t *)(rtm_ext + 1);
- rsap->rtsa_cnt = sacnt;
+ rsap->rtsa_cnt = 1;
rp_dst = rsap->rtsa_attr;
- for (i = 0; i < sacnt; i++, gc = gc->gc_next, rp_dst++) {
- ASSERT(gc->gc_db != NULL);
- bcopy(&gc->gc_db->gcdb_attr, rp_dst, sizeof (*rp_dst));
- }
+ ASSERT(gc->gc_db != NULL);
+ bcopy(&gc->gc_db->gcdb_attr, rp_dst, sizeof (*rp_dst));
cp = (uchar_t *)rp_dst;
}
@@ -1659,6 +1765,7 @@ rts_fill_msg(int type, int rtm_addrs, ipaddr_t dst, ipaddr_t mask,
/*
* Allocates and initializes a routing socket message.
+ * Note that sacnt is either zero or one.
*/
mblk_t *
rts_alloc_msg(int type, int rtm_addrs, sa_family_t af, uint_t sacnt)
@@ -1755,7 +1862,7 @@ ip_rts_change(int type, ipaddr_t dst_addr, ipaddr_t gw_addr, ipaddr_t net_mask,
if (mp == NULL)
return;
rts_fill_msg(type, rtm_addrs, dst_addr, net_mask, gw_addr, source, 0,
- author, NULL, mp, 0, NULL);
+ author, 0, NULL, mp, NULL);
rtm = (rt_msghdr_t *)mp->b_rptr;
rtm->rtm_flags = flags;
rtm->rtm_errno = error;
@@ -1784,12 +1891,12 @@ ip_rts_xifmsg(const ipif_t *ipif, uint64_t set, uint64_t clear, uint_t flags)
ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
/*
- * This message should be generated only when the physical interface
- * is changing state.
+ * This message should be generated only
+ * when the physical device is changing
+ * state.
*/
if (ipif->ipif_id != 0)
return;
-
if (ipif->ipif_isv6) {
af = AF_INET6;
mp = rts_alloc_msg(RTM_IFINFO, RTA_IFP, af, 0);
@@ -1797,14 +1904,15 @@ ip_rts_xifmsg(const ipif_t *ipif, uint64_t set, uint64_t clear, uint_t flags)
return;
rts_fill_msg_v6(RTM_IFINFO, RTA_IFP, &ipv6_all_zeros,
&ipv6_all_zeros, &ipv6_all_zeros, &ipv6_all_zeros,
- &ipv6_all_zeros, &ipv6_all_zeros, ipif, mp, 0, NULL);
+ &ipv6_all_zeros, &ipv6_all_zeros, &ipv6_all_zeros,
+ ipif->ipif_ill, mp, NULL);
} else {
af = AF_INET;
mp = rts_alloc_msg(RTM_IFINFO, RTA_IFP, af, 0);
if (mp == NULL)
return;
- rts_fill_msg(RTM_IFINFO, RTA_IFP, 0, 0, 0, 0, 0, 0, ipif, mp,
- 0, NULL);
+ rts_fill_msg(RTM_IFINFO, RTA_IFP, 0, 0, 0, 0, 0, 0, 0,
+ ipif->ipif_ill, mp, NULL);
}
ifm = (if_msghdr_t *)mp->b_rptr;
ifm->ifm_index = ipif->ipif_ill->ill_phyint->phyint_ifindex;
@@ -1843,6 +1951,12 @@ ip_rts_newaddrmsg(int cmd, int error, const ipif_t *ipif, uint_t flags)
sa_family_t af;
ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
+ /*
+ * Let conn_ixa caching know that source address selection
+ * changed
+ */
+ ip_update_source_selection(ipst);
+
if (ipif->ipif_isv6)
af = AF_INET6;
else
@@ -1875,15 +1989,17 @@ ip_rts_newaddrmsg(int cmd, int error, const ipif_t *ipif, uint_t flags)
case AF_INET:
rts_fill_msg(ncmd, rtm_addrs, 0,
ipif->ipif_net_mask, 0, ipif->ipif_lcl_addr,
- ipif->ipif_pp_dst_addr, 0, ipif, mp,
- 0, NULL);
+ ipif->ipif_pp_dst_addr, 0,
+ ipif->ipif_lcl_addr, ipif->ipif_ill,
+ mp, NULL);
break;
case AF_INET6:
rts_fill_msg_v6(ncmd, rtm_addrs,
&ipv6_all_zeros, &ipif->ipif_v6net_mask,
&ipv6_all_zeros, &ipif->ipif_v6lcl_addr,
&ipif->ipif_v6pp_dst_addr, &ipv6_all_zeros,
- ipif, mp, 0, NULL);
+ &ipif->ipif_v6lcl_addr, ipif->ipif_ill,
+ mp, NULL);
break;
}
ifam = (ifa_msghdr_t *)mp->b_rptr;
@@ -1904,14 +2020,15 @@ ip_rts_newaddrmsg(int cmd, int error, const ipif_t *ipif, uint_t flags)
case AF_INET:
rts_fill_msg(cmd, rtm_addrs,
ipif->ipif_lcl_addr, ipif->ipif_net_mask, 0,
- 0, 0, 0, NULL, mp, 0, NULL);
+ 0, 0, 0, 0, NULL, mp, NULL);
break;
case AF_INET6:
rts_fill_msg_v6(cmd, rtm_addrs,
&ipif->ipif_v6lcl_addr,
&ipif->ipif_v6net_mask, &ipv6_all_zeros,
&ipv6_all_zeros, &ipv6_all_zeros,
- &ipv6_all_zeros, NULL, mp, 0, NULL);
+ &ipv6_all_zeros, &ipv6_all_zeros,
+ NULL, mp, NULL);
break;
}
rtm = (rt_msghdr_t *)mp->b_rptr;
diff --git a/usr/src/uts/common/inet/ip/ip_sadb.c b/usr/src/uts/common/inet/ip/ip_sadb.c
index 35b822902a..e099d04427 100644
--- a/usr/src/uts/common/inet/ip/ip_sadb.c
+++ b/usr/src/uts/common/inet/ip/ip_sadb.c
@@ -36,7 +36,6 @@
#include <inet/ip6.h>
#include <net/pfkeyv2.h>
-#include <inet/ipsec_info.h>
#include <inet/sadb.h>
#include <inet/ipsec_impl.h>
#include <inet/ipdrop.h>
@@ -57,35 +56,21 @@ ipsec_match_outbound_ids(ipsec_latch_t *ipl, ipsa_t *sa)
ipsid_equal(ipl->ipl_remote_cid, sa->ipsa_dst_cid);
}
-/* cr1 is packet cred; cr2 is SA credential */
+/* l1 is packet label; l2 is SA label */
boolean_t
-ipsec_label_match(cred_t *cr1, cred_t *cr2)
+ipsec_label_match(ts_label_t *l1, ts_label_t *l2)
{
- ts_label_t *l1, *l2;
-
if (!is_system_labeled())
return (B_TRUE);
/*
- * Check for NULL creds. Unlabeled SA always matches;
+ * Check for NULL label. Unlabeled SA (l2) always matches;
* unlabeled user with labeled SA always fails
*/
- if (cr2 == NULL)
+ if (l2 == NULL)
return (B_TRUE);
- if (cr1 == NULL)
- return (B_FALSE);
-
- /* If we reach here, we have two passed-in creds. */
- ASSERT(cr2 != NULL && cr1 != NULL);
-
- /* Check for NULL labels. Two is good, one is bad, zero is good. */
- l1 = crgetlabel(cr1);
- l2 = crgetlabel(cr2);
if (l1 == NULL)
- return (l2 == NULL);
-
- if (l2 == NULL)
return (B_FALSE);
/* Simple IPsec MLS policy: labels must be equal */
@@ -109,32 +94,32 @@ ipsec_label_match(cred_t *cr1, cred_t *cr2)
* The SA ptr I return will have its reference count incremented by one.
*/
ipsa_t *
-ipsec_getassocbyconn(isaf_t *bucket, ipsec_out_t *io, uint32_t *src,
- uint32_t *dst, sa_family_t af, uint8_t protocol, cred_t *cr)
+ipsec_getassocbyconn(isaf_t *bucket, ip_xmit_attr_t *ixa, uint32_t *src,
+ uint32_t *dst, sa_family_t af, uint8_t protocol, ts_label_t *tsl)
{
ipsa_t *retval, *candidate;
ipsec_action_t *candact;
boolean_t need_unique;
- boolean_t tunnel_mode = io->ipsec_out_tunnel;
+ boolean_t tunnel_mode = (ixa->ixa_flags & IXAF_IPSEC_TUNNEL);
uint64_t unique_id;
uint32_t old_flags, excludeflags;
- ipsec_policy_t *pp = io->ipsec_out_policy;
- ipsec_action_t *actlist = io->ipsec_out_act;
+ ipsec_policy_t *pp = ixa->ixa_ipsec_policy;
+ ipsec_action_t *actlist = ixa->ixa_ipsec_action;
ipsec_action_t *act;
- ipsec_latch_t *ipl = io->ipsec_out_latch;
+ ipsec_latch_t *ipl = ixa->ixa_ipsec_latch;
ipsa_ref_t *ipr = NULL;
- sa_family_t inaf = io->ipsec_out_inaf;
- uint32_t *insrc = io->ipsec_out_insrc;
- uint32_t *indst = io->ipsec_out_indst;
- uint8_t insrcpfx = io->ipsec_out_insrcpfx;
- uint8_t indstpfx = io->ipsec_out_indstpfx;
+ sa_family_t inaf = ixa->ixa_ipsec_inaf;
+ uint32_t *insrc = ixa->ixa_ipsec_insrc;
+ uint32_t *indst = ixa->ixa_ipsec_indst;
+ uint8_t insrcpfx = ixa->ixa_ipsec_insrcpfx;
+ uint8_t indstpfx = ixa->ixa_ipsec_indstpfx;
ASSERT(MUTEX_HELD(&bucket->isaf_lock));
/*
- * Caller must set ipsec_out_t structure such that we know
+ * Caller must set ip_xmit_attr_t structure such that we know
* whether this is tunnel mode or transport mode based on
- * io->ipsec_out_tunnel. If this flag is set, we assume that
+ * IXAF_IPSEC_TUNNEL. If this flag is set, we assume that
* there are valid inner src and destination addresses to compare.
*/
@@ -145,7 +130,7 @@ ipsec_getassocbyconn(isaf_t *bucket, ipsec_out_t *io, uint32_t *src,
if (ipl != NULL) {
ASSERT((protocol == IPPROTO_AH) || (protocol == IPPROTO_ESP));
- ipr = &ipl->ipl_ref[protocol - IPPROTO_ESP];
+ ipr = &ixa->ixa_ipsec_ref[protocol - IPPROTO_ESP];
retval = ipr->ipsr_sa;
@@ -169,7 +154,7 @@ ipsec_getassocbyconn(isaf_t *bucket, ipsec_out_t *io, uint32_t *src,
ASSERT(actlist != NULL);
need_unique = actlist->ipa_want_unique;
- unique_id = SA_FORM_UNIQUE_ID(io);
+ unique_id = SA_FORM_UNIQUE_ID(ixa);
/*
* Precompute mask for SA flags comparison: If we need a
@@ -332,7 +317,7 @@ ipsec_getassocbyconn(isaf_t *bucket, ipsec_out_t *io, uint32_t *src,
/*
* Do labels match?
*/
- if (!ipsec_label_match(cr, retval->ipsa_cred))
+ if (!ipsec_label_match(tsl, retval->ipsa_tsl))
goto next_ipsa;
/*
@@ -451,10 +436,9 @@ next_ipsa:
ipsec_latch_ids(ipl,
retval->ipsa_src_cid, retval->ipsa_dst_cid);
}
- if (!ipl->ipl_out_action_latched) {
+ if (ixa->ixa_ipsec_action == NULL) {
IPACT_REFHOLD(act);
- ipl->ipl_out_action = act;
- ipl->ipl_out_action_latched = B_TRUE;
+ ixa->ixa_ipsec_action = act;
}
}
@@ -471,7 +455,7 @@ next_ipsa:
retval->ipsa_flags |= IPSA_F_UNIQUE;
retval->ipsa_unique_id = unique_id;
retval->ipsa_unique_mask = SA_UNIQUE_MASK(
- io->ipsec_out_src_port, io->ipsec_out_dst_port,
+ ixa->ixa_ipsec_src_port, ixa->ixa_ipsec_dst_port,
protocol, 0);
}
@@ -581,45 +565,41 @@ ipsec_getassocbyspi(isaf_t *bucket, uint32_t spi, uint32_t *src, uint32_t *dst,
}
boolean_t
-ipsec_outbound_sa(mblk_t *mp, uint_t proto)
+ipsec_outbound_sa(mblk_t *data_mp, ip_xmit_attr_t *ixa, uint_t proto)
{
- mblk_t *data_mp;
- ipsec_out_t *io;
ipaddr_t dst;
uint32_t *dst_ptr, *src_ptr;
isaf_t *bucket;
ipsa_t *assoc;
- ip6_pkt_t ipp;
+ ip_pkt_t ipp;
in6_addr_t dst6;
ipsa_t **sa;
sadbp_t *sadbp;
sadb_t *sp;
sa_family_t af;
- cred_t *cr;
- netstack_t *ns;
+ ip_stack_t *ipst = ixa->ixa_ipst;
+ netstack_t *ns = ipst->ips_netstack;
- data_mp = mp->b_cont;
- io = (ipsec_out_t *)mp->b_rptr;
- ns = io->ipsec_out_ns;
+ ASSERT(ixa->ixa_flags & IXAF_IPSEC_SECURE);
if (proto == IPPROTO_ESP) {
ipsecesp_stack_t *espstack;
espstack = ns->netstack_ipsecesp;
- sa = &io->ipsec_out_esp_sa;
+ sa = &ixa->ixa_ipsec_esp_sa;
sadbp = &espstack->esp_sadb;
} else {
ipsecah_stack_t *ahstack;
ASSERT(proto == IPPROTO_AH);
ahstack = ns->netstack_ipsecah;
- sa = &io->ipsec_out_ah_sa;
+ sa = &ixa->ixa_ipsec_ah_sa;
sadbp = &ahstack->ah_sadb;
}
ASSERT(*sa == NULL);
- if (io->ipsec_out_v4) {
+ if (ixa->ixa_flags & IXAF_IS_IPV4) {
ipha_t *ipha = (ipha_t *)data_mp->b_rptr;
ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);
@@ -651,11 +631,9 @@ ipsec_outbound_sa(mblk_t *mp, uint_t proto)
dst_ptr = (uint32_t *)&dst6;
}
- cr = msg_getcred(data_mp, NULL);
-
mutex_enter(&bucket->isaf_lock);
- assoc = ipsec_getassocbyconn(bucket, io, src_ptr, dst_ptr, af,
- proto, cr);
+ assoc = ipsec_getassocbyconn(bucket, ixa, src_ptr, dst_ptr, af,
+ proto, ixa->ixa_tsl);
mutex_exit(&bucket->isaf_lock);
if (assoc == NULL)
@@ -674,17 +652,16 @@ ipsec_outbound_sa(mblk_t *mp, uint_t proto)
/*
* Inbound IPsec SA selection.
+ * Can return a pulled up mblk.
+ * When it returns non-NULL ahp is updated
*/
-
-ah_t *
-ipsec_inbound_ah_sa(mblk_t *mp, netstack_t *ns)
+mblk_t *
+ipsec_inbound_ah_sa(mblk_t *mp, ip_recv_attr_t *ira, ah_t **ahp)
{
- mblk_t *ipsec_in;
ipha_t *ipha;
ipsa_t *assoc;
ah_t *ah;
isaf_t *hptr;
- ipsec_in_t *ii;
boolean_t isv6;
ip6_t *ip6h;
int ah_offset;
@@ -692,20 +669,13 @@ ipsec_inbound_ah_sa(mblk_t *mp, netstack_t *ns)
int pullup_len;
sadb_t *sp;
sa_family_t af;
+ netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack;
ipsec_stack_t *ipss = ns->netstack_ipsec;
ipsecah_stack_t *ahstack = ns->netstack_ipsecah;
IP_AH_BUMP_STAT(ipss, in_requests);
- ASSERT(mp->b_datap->db_type == M_CTL);
-
- ipsec_in = mp;
- ii = (ipsec_in_t *)ipsec_in->b_rptr;
- mp = mp->b_cont;
-
- ASSERT(mp->b_datap->db_type == M_DATA);
-
- isv6 = !ii->ipsec_in_v4;
+ isv6 = !(ira->ira_flags & IRAF_IS_IPV4);
if (isv6) {
ip6h = (ip6_t *)mp->b_rptr;
ah_offset = ipsec_ah_get_hdr_size_v6(mp, B_TRUE);
@@ -729,7 +699,7 @@ ipsec_inbound_ah_sa(mblk_t *mp, netstack_t *ns)
SL_WARN | SL_ERROR,
"ipsec_inbound_ah_sa: Small AH header\n");
IP_AH_BUMP_STAT(ipss, in_discards);
- ip_drop_packet(ipsec_in, B_TRUE, NULL, NULL,
+ ip_drop_packet(mp, B_TRUE, ira->ira_ill,
DROPPER(ipss, ipds_ah_bad_length),
&ipss->ipsec_dropper);
return (NULL);
@@ -763,11 +733,11 @@ ipsec_inbound_ah_sa(mblk_t *mp, netstack_t *ns)
assoc->ipsa_state == IPSA_STATE_ACTIVE_ELSEWHERE) {
IP_AH_BUMP_STAT(ipss, lookup_failure);
IP_AH_BUMP_STAT(ipss, in_discards);
- ipsecah_in_assocfailure(ipsec_in, 0,
+ ipsecah_in_assocfailure(mp, 0,
SL_ERROR | SL_CONSOLE | SL_WARN,
"ipsec_inbound_ah_sa: No association found for "
"spi 0x%x, dst addr %s\n",
- ah->ah_spi, dst_ptr, af, ahstack);
+ ah->ah_spi, dst_ptr, af, ira);
if (assoc != NULL) {
IPSA_REFRELE(assoc);
}
@@ -775,33 +745,44 @@ ipsec_inbound_ah_sa(mblk_t *mp, netstack_t *ns)
}
if (assoc->ipsa_state == IPSA_STATE_LARVAL &&
- sadb_set_lpkt(assoc, ipsec_in, ns)) {
+ sadb_set_lpkt(assoc, mp, ira)) {
/* Not fully baked; swap the packet under a rock until then */
IPSA_REFRELE(assoc);
return (NULL);
}
+ /* Are the IPsec fields initialized at all? */
+ if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) {
+ ira->ira_ipsec_action = NULL;
+ ira->ira_ipsec_ah_sa = NULL;
+ ira->ira_ipsec_esp_sa = NULL;
+ }
+
/*
* Save a reference to the association so that it can
* be retrieved after execution. We free any AH SA reference
* already there (innermost SA "wins". The reference to
* the SA will also be used later when doing the policy checks.
*/
-
- if (ii->ipsec_in_ah_sa != NULL) {
- IPSA_REFRELE(ii->ipsec_in_ah_sa);
+ if (ira->ira_ipsec_ah_sa != NULL) {
+ IPSA_REFRELE(ira->ira_ipsec_ah_sa);
}
- ii->ipsec_in_ah_sa = assoc;
+ ira->ira_flags |= IRAF_IPSEC_SECURE;
+ ira->ira_ipsec_ah_sa = assoc;
- return (ah);
+ *ahp = ah;
+ return (mp);
}
-esph_t *
-ipsec_inbound_esp_sa(mblk_t *ipsec_in_mp, netstack_t *ns)
+/*
+ * Can return a pulled up mblk.
+ * When it returns non-NULL esphp is updated
+ */
+mblk_t *
+ipsec_inbound_esp_sa(mblk_t *data_mp, ip_recv_attr_t *ira, esph_t **esphp)
{
- mblk_t *data_mp, *placeholder;
+ mblk_t *placeholder;
uint32_t *src_ptr, *dst_ptr;
- ipsec_in_t *ii;
ipha_t *ipha;
ip6_t *ip6h;
esph_t *esph;
@@ -811,19 +792,13 @@ ipsec_inbound_esp_sa(mblk_t *ipsec_in_mp, netstack_t *ns)
sa_family_t af;
boolean_t isv6;
sadb_t *sp;
+ netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack;
ipsec_stack_t *ipss = ns->netstack_ipsec;
ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
IP_ESP_BUMP_STAT(ipss, in_requests);
- ASSERT(ipsec_in_mp->b_datap->db_type == M_CTL);
-
- /* We have IPSEC_IN already! */
- ii = (ipsec_in_t *)ipsec_in_mp->b_rptr;
- data_mp = ipsec_in_mp->b_cont;
- ASSERT(ii->ipsec_in_type == IPSEC_IN);
-
- isv6 = !ii->ipsec_in_v4;
+ isv6 = !(ira->ira_flags & IRAF_IS_IPV4);
if (isv6) {
ip6h = (ip6_t *)data_mp->b_rptr;
} else {
@@ -841,17 +816,11 @@ ipsec_inbound_esp_sa(mblk_t *ipsec_in_mp, netstack_t *ns)
* actual packet length.
*/
if (data_mp->b_datap->db_ref > 1 ||
- (data_mp->b_wptr - data_mp->b_rptr) <
- (isv6 ? (ntohs(ip6h->ip6_plen) + sizeof (ip6_t))
- : ntohs(ipha->ipha_length))) {
+ (data_mp->b_wptr - data_mp->b_rptr) < ira->ira_pktlen) {
placeholder = msgpullup(data_mp, -1);
if (placeholder == NULL) {
IP_ESP_BUMP_STAT(ipss, in_discards);
- /*
- * TODO: Extract inbound interface from the IPSEC_IN
- * message's ii->ipsec_in_rill_index.
- */
- ip_drop_packet(ipsec_in_mp, B_TRUE, NULL, NULL,
+ ip_drop_packet(data_mp, B_TRUE, ira->ira_ill,
DROPPER(ipss, ipds_esp_nomem),
&ipss->ipsec_dropper);
return (NULL);
@@ -859,7 +828,6 @@ ipsec_inbound_esp_sa(mblk_t *ipsec_in_mp, netstack_t *ns)
/* Reset packet with new pulled up mblk. */
freemsg(data_mp);
data_mp = placeholder;
- ipsec_in_mp->b_cont = data_mp;
}
}
@@ -904,11 +872,11 @@ ipsec_inbound_esp_sa(mblk_t *ipsec_in_mp, netstack_t *ns)
/* This is a loggable error! AUDIT ME! */
IP_ESP_BUMP_STAT(ipss, lookup_failure);
IP_ESP_BUMP_STAT(ipss, in_discards);
- ipsecesp_in_assocfailure(ipsec_in_mp, 0,
+ ipsecesp_in_assocfailure(data_mp, 0,
SL_ERROR | SL_CONSOLE | SL_WARN,
"ipsec_inbound_esp_sa: No association found for "
"spi 0x%x, dst addr %s\n",
- esph->esph_spi, dst_ptr, af, espstack);
+ esph->esph_spi, dst_ptr, af, ira);
if (ipsa != NULL) {
IPSA_REFRELE(ipsa);
}
@@ -916,22 +884,31 @@ ipsec_inbound_esp_sa(mblk_t *ipsec_in_mp, netstack_t *ns)
}
if (ipsa->ipsa_state == IPSA_STATE_LARVAL &&
- sadb_set_lpkt(ipsa, ipsec_in_mp, ns)) {
+ sadb_set_lpkt(ipsa, data_mp, ira)) {
/* Not fully baked; swap the packet under a rock until then */
IPSA_REFRELE(ipsa);
return (NULL);
}
+ /* Are the IPsec fields initialized at all? */
+ if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) {
+ ira->ira_ipsec_action = NULL;
+ ira->ira_ipsec_ah_sa = NULL;
+ ira->ira_ipsec_esp_sa = NULL;
+ }
+
/*
* Save a reference to the association so that it can
* be retrieved after execution. We free any AH SA reference
* already there (innermost SA "wins". The reference to
* the SA will also be used later when doing the policy checks.
*/
- if (ii->ipsec_in_esp_sa != NULL) {
- IPSA_REFRELE(ii->ipsec_in_esp_sa);
+ if (ira->ira_ipsec_esp_sa != NULL) {
+ IPSA_REFRELE(ira->ira_ipsec_esp_sa);
}
- ii->ipsec_in_esp_sa = ipsa;
+ ira->ira_flags |= IRAF_IPSEC_SECURE;
+ ira->ira_ipsec_esp_sa = ipsa;
- return (esph);
+ *esphp = esph;
+ return (data_mp);
}
diff --git a/usr/src/uts/common/inet/ip/ip_srcid.c b/usr/src/uts/common/inet/ip/ip_srcid.c
index 949508a796..f6507d6413 100644
--- a/usr/src/uts/common/inet/ip/ip_srcid.c
+++ b/usr/src/uts/common/inet/ip/ip_srcid.c
@@ -101,11 +101,7 @@
#include <netinet/ip_mroute.h>
#include <inet/ipclassifier.h>
-#include <net/pfkeyv2.h>
-#include <inet/ipsec_info.h>
-#include <inet/sadb.h>
#include <sys/kmem.h>
-#include <inet/ipsec_impl.h>
static uint_t srcid_nextid(ip_stack_t *);
static srcid_map_t **srcid_lookup_addr(const in6_addr_t *addr,
@@ -239,7 +235,7 @@ ip_srcid_find_id(uint_t id, in6_addr_t *addr, zoneid_t zoneid,
rw_enter(&ipst->ips_srcid_lock, RW_READER);
smpp = srcid_lookup_id(id, ipst);
smp = *smpp;
- if (smp == NULL || smp->sm_zoneid != zoneid) {
+ if (smp == NULL || (smp->sm_zoneid != zoneid && zoneid != ALL_ZONES)) {
/* Not preset */
ip1dbg(("ip_srcid_find_id: unknown %u or in wrong zone\n", id));
*addr = ipv6_all_zeros;
@@ -290,7 +286,7 @@ srcid_lookup_addr(const in6_addr_t *addr, zoneid_t zoneid, ip_stack_t *ipst)
smpp = &ipst->ips_srcid_head;
while (*smpp != NULL) {
if (IN6_ARE_ADDR_EQUAL(&(*smpp)->sm_addr, addr) &&
- zoneid == (*smpp)->sm_zoneid)
+ (zoneid == (*smpp)->sm_zoneid || zoneid == ALL_ZONES))
return (smpp);
smpp = &(*smpp)->sm_next;
}
diff --git a/usr/src/uts/common/inet/ip/ipclassifier.c b/usr/src/uts/common/inet/ip/ipclassifier.c
index 45683ec967..31fa14b4af 100644
--- a/usr/src/uts/common/inet/ip/ipclassifier.c
+++ b/usr/src/uts/common/inet/ip/ipclassifier.c
@@ -52,16 +52,12 @@
* asynchronous and the reference protects the connection from being destroyed
* before its processing is finished).
*
- * send and receive functions are currently used for TCP only. The send function
- * determines the IP entry point for the packet once it leaves TCP to be sent to
- * the destination address. The receive function is used by IP when the packet
- * should be passed for TCP processing. When a new connection is created these
- * are set to ip_output() and tcp_input() respectively. During the lifetime of
- * the connection the send and receive functions may change depending on the
- * changes in the connection state. For example, Once the connection is bound to
- * an addresse, the receive function for this connection is set to
- * tcp_conn_request(). This allows incoming SYNs to go directly into the
- * listener SYN processing function without going to tcp_input() first.
+ * conn_recv is used to pass up packets to the ULP.
+ * For TCP conn_recv changes. It is tcp_input_listener_unbound initially for
+ * a listener, and changes to tcp_input_listener as the listener has picked a
+ * good squeue. For other cases it is set to tcp_input_data.
+ *
+ * conn_recvicmp is used to pass up ICMP errors to the ULP.
*
* Classifier uses several hash tables:
*
@@ -91,8 +87,8 @@
* Connection Lookup:
* ------------------
*
- * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, zoneid, ip_stack)
- * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, zoneid, ip_stack)
+ * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, ira, ip_stack)
+ * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, ira, ip_stack)
*
* Finds connection for an incoming IPv4 or IPv6 packet. Returns NULL if
* it can't find any associated connection. If the connection is found, its
@@ -107,9 +103,12 @@
* hdr_len: The size of IP header. It is used to find TCP or UDP header in
* the packet.
*
- * zoneid: The zone in which the returned connection must be; the zoneid
- * corresponding to the ire_zoneid on the IRE located for the
- * packet's destination address.
+ * ira->ira_zoneid: The zone in which the returned connection must be; the
+ * zoneid corresponding to the ire_zoneid on the IRE located for
+ * the packet's destination address.
+ *
+ * ira->ira_flags: Contains the IRAF_TX_MAC_EXEMPTABLE and
+ * IRAF_TX_SHARED_ADDR flags
*
* For TCP connections, the lookup order is as follows:
* 5-tuple {src, dst, protocol, local port, remote port}
@@ -156,7 +155,7 @@
* any. In any event, the receiving socket must have SO_MAC_EXEMPT set and the
* receiver's label must dominate the sender's default label.
*
- * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcph_t *, int, ip_stack);
+ * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcpha_t *, int, ip_stack);
* conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t,
* ip_stack);
*
@@ -184,34 +183,26 @@
* Table Updates
* -------------
*
- * int ipcl_conn_insert(connp, protocol, src, dst, ports)
- * int ipcl_conn_insert_v6(connp, protocol, src, dst, ports, ifindex)
+ * int ipcl_conn_insert(connp);
+ * int ipcl_conn_insert_v4(connp);
+ * int ipcl_conn_insert_v6(connp);
*
* Insert 'connp' in the ipcl_conn_fanout.
* Arguements :
* connp conn_t to be inserted
- * protocol connection protocol
- * src source address
- * dst destination address
- * ports local and remote port
- * ifindex interface index for IPv6 connections
*
* Return value :
* 0 if connp was inserted
* EADDRINUSE if the connection with the same tuple
* already exists.
*
- * int ipcl_bind_insert(connp, protocol, src, lport);
- * int ipcl_bind_insert_v6(connp, protocol, src, lport);
+ * int ipcl_bind_insert(connp);
+ * int ipcl_bind_insert_v4(connp);
+ * int ipcl_bind_insert_v6(connp);
*
* Insert 'connp' in ipcl_bind_fanout.
* Arguements :
* connp conn_t to be inserted
- * protocol connection protocol
- * src source address connection wants
- * to bind to
- * lport local port connection wants to
- * bind to
*
*
* void ipcl_hash_remove(connp);
@@ -261,6 +252,8 @@
#include <netinet/icmp6.h>
#include <inet/ip.h>
+#include <inet/ip_if.h>
+#include <inet/ip_ire.h>
#include <inet/ip6.h>
#include <inet/ip_ndp.h>
#include <inet/ip_impl.h>
@@ -280,19 +273,6 @@
#include <sys/tsol/tnet.h>
#include <sys/sockio.h>
-#ifdef DEBUG
-#define IPCL_DEBUG
-#else
-#undef IPCL_DEBUG
-#endif
-
-#ifdef IPCL_DEBUG
-int ipcl_debug_level = 0;
-#define IPCL_DEBUG_LVL(level, args) \
- if (ipcl_debug_level & level) { printf args; }
-#else
-#define IPCL_DEBUG_LVL(level, args) {; }
-#endif
/* Old value for compatibility. Setable in /etc/system */
uint_t tcp_conn_hash_size = 0;
@@ -336,10 +316,8 @@ typedef union itc_s {
struct kmem_cache *tcp_conn_cache;
struct kmem_cache *ip_conn_cache;
-struct kmem_cache *ip_helper_stream_cache;
extern struct kmem_cache *sctp_conn_cache;
extern struct kmem_cache *tcp_sack_info_cache;
-extern struct kmem_cache *tcp_iphc_cache;
struct kmem_cache *udp_conn_cache;
struct kmem_cache *rawip_conn_cache;
struct kmem_cache *rts_conn_cache;
@@ -362,34 +340,6 @@ static void rawip_conn_destructor(void *, void *);
static int rts_conn_constructor(void *, void *, int);
static void rts_conn_destructor(void *, void *);
-static int ip_helper_stream_constructor(void *, void *, int);
-static void ip_helper_stream_destructor(void *, void *);
-
-boolean_t ip_use_helper_cache = B_TRUE;
-
-/*
- * Hook functions to enable cluster networking
- * On non-clustered systems these vectors must always be NULL.
- */
-extern void (*cl_inet_listen)(netstackid_t, uint8_t, sa_family_t,
- uint8_t *, in_port_t, void *);
-extern void (*cl_inet_unlisten)(netstackid_t, uint8_t, sa_family_t,
- uint8_t *, in_port_t, void *);
-
-#ifdef IPCL_DEBUG
-#define INET_NTOA_BUFSIZE 18
-
-static char *
-inet_ntoa_r(uint32_t in, char *b)
-{
- unsigned char *p;
-
- p = (unsigned char *)&in;
- (void) sprintf(b, "%d.%d.%d.%d", p[0], p[1], p[2], p[3]);
- return (b);
-}
-#endif
-
/*
* Global (for all stack instances) init routine
*/
@@ -420,15 +370,6 @@ ipcl_g_init(void)
sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE,
rts_conn_constructor, rts_conn_destructor,
NULL, NULL, NULL, 0);
-
- if (ip_use_helper_cache) {
- ip_helper_stream_cache = kmem_cache_create
- ("ip_helper_stream_cache", sizeof (ip_helper_stream_info_t),
- CACHE_ALIGN_SIZE, ip_helper_stream_constructor,
- ip_helper_stream_destructor, NULL, NULL, NULL, 0);
- } else {
- ip_helper_stream_cache = NULL;
- }
}
/*
@@ -493,10 +434,10 @@ ipcl_init(ip_stack_t *ipst)
MUTEX_DEFAULT, NULL);
}
- ipst->ips_ipcl_proto_fanout = kmem_zalloc(IPPROTO_MAX *
+ ipst->ips_ipcl_proto_fanout_v4 = kmem_zalloc(IPPROTO_MAX *
sizeof (connf_t), KM_SLEEP);
for (i = 0; i < IPPROTO_MAX; i++) {
- mutex_init(&ipst->ips_ipcl_proto_fanout[i].connf_lock, NULL,
+ mutex_init(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock, NULL,
MUTEX_DEFAULT, NULL);
}
@@ -576,11 +517,12 @@ ipcl_destroy(ip_stack_t *ipst)
ipst->ips_ipcl_bind_fanout = NULL;
for (i = 0; i < IPPROTO_MAX; i++) {
- ASSERT(ipst->ips_ipcl_proto_fanout[i].connf_head == NULL);
- mutex_destroy(&ipst->ips_ipcl_proto_fanout[i].connf_lock);
+ ASSERT(ipst->ips_ipcl_proto_fanout_v4[i].connf_head == NULL);
+ mutex_destroy(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock);
}
- kmem_free(ipst->ips_ipcl_proto_fanout, IPPROTO_MAX * sizeof (connf_t));
- ipst->ips_ipcl_proto_fanout = NULL;
+ kmem_free(ipst->ips_ipcl_proto_fanout_v4,
+ IPPROTO_MAX * sizeof (connf_t));
+ ipst->ips_ipcl_proto_fanout_v4 = NULL;
for (i = 0; i < IPPROTO_MAX; i++) {
ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head == NULL);
@@ -636,7 +578,6 @@ conn_t *
ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns)
{
conn_t *connp;
- sctp_stack_t *sctps;
struct kmem_cache *conn_cache;
switch (type) {
@@ -644,10 +585,10 @@ ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns)
if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL)
return (NULL);
sctp_conn_init(connp);
- sctps = ns->netstack_sctp;
- SCTP_G_Q_REFHOLD(sctps);
netstack_hold(ns);
connp->conn_netstack = ns;
+ connp->conn_ixa->ixa_ipst = ns->netstack_ip;
+ ipcl_globalhash_insert(connp);
return (connp);
case IPCL_TCPCONN:
@@ -681,6 +622,7 @@ ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns)
connp->conn_ref = 1;
netstack_hold(ns);
connp->conn_netstack = ns;
+ connp->conn_ixa->ixa_ipst = ns->netstack_ip;
ipcl_globalhash_insert(connp);
return (connp);
}
@@ -693,61 +635,61 @@ ipcl_conn_destroy(conn_t *connp)
ASSERT(!MUTEX_HELD(&connp->conn_lock));
ASSERT(connp->conn_ref == 0);
- ASSERT(connp->conn_ire_cache == NULL);
DTRACE_PROBE1(conn__destroy, conn_t *, connp);
- if (connp->conn_effective_cred != NULL) {
- crfree(connp->conn_effective_cred);
- connp->conn_effective_cred = NULL;
- }
-
if (connp->conn_cred != NULL) {
crfree(connp->conn_cred);
connp->conn_cred = NULL;
}
+ if (connp->conn_ht_iphc != NULL) {
+ kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated);
+ connp->conn_ht_iphc = NULL;
+ connp->conn_ht_iphc_allocated = 0;
+ connp->conn_ht_iphc_len = 0;
+ connp->conn_ht_ulp = NULL;
+ connp->conn_ht_ulp_len = 0;
+ }
+ ip_pkt_free(&connp->conn_xmit_ipp);
+
ipcl_globalhash_remove(connp);
- /* FIXME: add separate tcp_conn_free()? */
+ if (connp->conn_latch != NULL) {
+ IPLATCH_REFRELE(connp->conn_latch);
+ connp->conn_latch = NULL;
+ }
+ if (connp->conn_latch_in_policy != NULL) {
+ IPPOL_REFRELE(connp->conn_latch_in_policy);
+ connp->conn_latch_in_policy = NULL;
+ }
+ if (connp->conn_latch_in_action != NULL) {
+ IPACT_REFRELE(connp->conn_latch_in_action);
+ connp->conn_latch_in_action = NULL;
+ }
+ if (connp->conn_policy != NULL) {
+ IPPH_REFRELE(connp->conn_policy, ns);
+ connp->conn_policy = NULL;
+ }
+
+ if (connp->conn_ipsec_opt_mp != NULL) {
+ freemsg(connp->conn_ipsec_opt_mp);
+ connp->conn_ipsec_opt_mp = NULL;
+ }
+
if (connp->conn_flags & IPCL_TCPCONN) {
- tcp_t *tcp = connp->conn_tcp;
- tcp_stack_t *tcps;
-
- ASSERT(tcp != NULL);
- tcps = tcp->tcp_tcps;
- if (tcps != NULL) {
- if (connp->conn_latch != NULL) {
- IPLATCH_REFRELE(connp->conn_latch, ns);
- connp->conn_latch = NULL;
- }
- if (connp->conn_policy != NULL) {
- IPPH_REFRELE(connp->conn_policy, ns);
- connp->conn_policy = NULL;
- }
- tcp->tcp_tcps = NULL;
- TCPS_REFRELE(tcps);
- }
+ tcp_t *tcp = connp->conn_tcp;
tcp_free(tcp);
mp = tcp->tcp_timercache;
- tcp->tcp_cred = NULL;
+
+ tcp->tcp_tcps = NULL;
if (tcp->tcp_sack_info != NULL) {
bzero(tcp->tcp_sack_info, sizeof (tcp_sack_info_t));
kmem_cache_free(tcp_sack_info_cache,
tcp->tcp_sack_info);
}
- if (tcp->tcp_iphc != NULL) {
- if (tcp->tcp_hdr_grown) {
- kmem_free(tcp->tcp_iphc, tcp->tcp_iphc_len);
- } else {
- bzero(tcp->tcp_iphc, tcp->tcp_iphc_len);
- kmem_cache_free(tcp_iphc_cache, tcp->tcp_iphc);
- }
- tcp->tcp_iphc_len = 0;
- }
- ASSERT(tcp->tcp_iphc_len == 0);
/*
* tcp_rsrv_mp can be NULL if tcp_get_conn() fails to allocate
@@ -759,17 +701,15 @@ ipcl_conn_destroy(conn_t *connp)
mutex_destroy(&tcp->tcp_rsrv_mp_lock);
}
- ASSERT(connp->conn_latch == NULL);
- ASSERT(connp->conn_policy == NULL);
-
+ ipcl_conn_cleanup(connp);
+ connp->conn_flags = IPCL_TCPCONN;
if (ns != NULL) {
ASSERT(tcp->tcp_tcps == NULL);
connp->conn_netstack = NULL;
+ connp->conn_ixa->ixa_ipst = NULL;
netstack_rele(ns);
}
- ipcl_conn_cleanup(connp);
- connp->conn_flags = IPCL_TCPCONN;
bzero(tcp, sizeof (tcp_t));
tcp->tcp_timercache = mp;
@@ -777,18 +717,6 @@ ipcl_conn_destroy(conn_t *connp)
kmem_cache_free(tcp_conn_cache, connp);
return;
}
- if (connp->conn_latch != NULL) {
- IPLATCH_REFRELE(connp->conn_latch, connp->conn_netstack);
- connp->conn_latch = NULL;
- }
- if (connp->conn_policy != NULL) {
- IPPH_REFRELE(connp->conn_policy, connp->conn_netstack);
- connp->conn_policy = NULL;
- }
- if (connp->conn_ipsec_opt_mp != NULL) {
- freemsg(connp->conn_ipsec_opt_mp);
- connp->conn_ipsec_opt_mp = NULL;
- }
if (connp->conn_flags & IPCL_SCTPCONN) {
ASSERT(ns != NULL);
@@ -796,21 +724,21 @@ ipcl_conn_destroy(conn_t *connp)
return;
}
+ ipcl_conn_cleanup(connp);
if (ns != NULL) {
connp->conn_netstack = NULL;
+ connp->conn_ixa->ixa_ipst = NULL;
netstack_rele(ns);
}
- ipcl_conn_cleanup(connp);
-
/* leave conn_priv aka conn_udp, conn_icmp, etc in place. */
if (connp->conn_flags & IPCL_UDPCONN) {
connp->conn_flags = IPCL_UDPCONN;
kmem_cache_free(udp_conn_cache, connp);
} else if (connp->conn_flags & IPCL_RAWIPCONN) {
-
connp->conn_flags = IPCL_RAWIPCONN;
- connp->conn_ulp = IPPROTO_ICMP;
+ connp->conn_proto = IPPROTO_ICMP;
+ connp->conn_ixa->ixa_protocol = connp->conn_proto;
kmem_cache_free(rawip_conn_cache, connp);
} else if (connp->conn_flags & IPCL_RTSCONN) {
connp->conn_flags = IPCL_RTSCONN;
@@ -826,7 +754,6 @@ ipcl_conn_destroy(conn_t *connp)
/*
* Running in cluster mode - deregister listener information
*/
-
static void
ipcl_conn_unlisten(conn_t *connp)
{
@@ -837,12 +764,12 @@ ipcl_conn_unlisten(conn_t *connp)
sa_family_t addr_family;
uint8_t *laddrp;
- if (connp->conn_pkt_isv6) {
+ if (connp->conn_ipversion == IPV6_VERSION) {
addr_family = AF_INET6;
- laddrp = (uint8_t *)&connp->conn_bound_source_v6;
+ laddrp = (uint8_t *)&connp->conn_bound_addr_v6;
} else {
addr_family = AF_INET;
- laddrp = (uint8_t *)&connp->conn_bound_source;
+ laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
}
(*cl_inet_unlisten)(connp->conn_netstack->netstack_stackid,
IPPROTO_TCP, addr_family, laddrp, connp->conn_lport, NULL);
@@ -859,8 +786,6 @@ ipcl_conn_unlisten(conn_t *connp)
connf_t *connfp = (connp)->conn_fanout; \
ASSERT(!MUTEX_HELD(&((connp)->conn_lock))); \
if (connfp != NULL) { \
- IPCL_DEBUG_LVL(4, ("IPCL_HASH_REMOVE: connp %p", \
- (void *)(connp))); \
mutex_enter(&connfp->connf_lock); \
if ((connp)->conn_next != NULL) \
(connp)->conn_next->conn_prev = \
@@ -884,7 +809,11 @@ ipcl_conn_unlisten(conn_t *connp)
void
ipcl_hash_remove(conn_t *connp)
{
+ uint8_t protocol = connp->conn_proto;
+
IPCL_HASH_REMOVE(connp);
+ if (protocol == IPPROTO_RSVP)
+ ill_set_inputfn_all(connp->conn_netstack->netstack_ip);
}
/*
@@ -937,8 +866,6 @@ ipcl_hash_remove_locked(conn_t *connp, connf_t *connfp)
}
#define IPCL_HASH_INSERT_CONNECTED(connfp, connp) { \
- IPCL_DEBUG_LVL(8, ("IPCL_HASH_INSERT_CONNECTED: connfp %p " \
- "connp %p", (void *)(connfp), (void *)(connp))); \
IPCL_HASH_REMOVE((connp)); \
mutex_enter(&(connfp)->connf_lock); \
IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); \
@@ -947,13 +874,11 @@ ipcl_hash_remove_locked(conn_t *connp, connf_t *connfp)
#define IPCL_HASH_INSERT_BOUND(connfp, connp) { \
conn_t *pconnp = NULL, *nconnp; \
- IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_BOUND: connfp %p " \
- "connp %p", (void *)connfp, (void *)(connp))); \
IPCL_HASH_REMOVE((connp)); \
mutex_enter(&(connfp)->connf_lock); \
nconnp = (connfp)->connf_head; \
while (nconnp != NULL && \
- !_IPCL_V4_MATCH_ANY(nconnp->conn_srcv6)) { \
+ !_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6)) { \
pconnp = nconnp; \
nconnp = nconnp->conn_next; \
} \
@@ -977,16 +902,14 @@ ipcl_hash_remove_locked(conn_t *connp, connf_t *connfp)
#define IPCL_HASH_INSERT_WILDCARD(connfp, connp) { \
conn_t **list, *prev, *next; \
boolean_t isv4mapped = \
- IN6_IS_ADDR_V4MAPPED(&(connp)->conn_srcv6); \
- IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_WILDCARD: connfp %p " \
- "connp %p", (void *)(connfp), (void *)(connp))); \
+ IN6_IS_ADDR_V4MAPPED(&(connp)->conn_laddr_v6); \
IPCL_HASH_REMOVE((connp)); \
mutex_enter(&(connfp)->connf_lock); \
list = &(connfp)->connf_head; \
prev = NULL; \
while ((next = *list) != NULL) { \
if (isv4mapped && \
- IN6_IS_ADDR_UNSPECIFIED(&next->conn_srcv6) && \
+ IN6_IS_ADDR_UNSPECIFIED(&next->conn_laddr_v6) && \
connp->conn_zoneid == next->conn_zoneid) { \
(connp)->conn_next = next; \
if (prev != NULL) \
@@ -1012,44 +935,13 @@ ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp)
IPCL_HASH_INSERT_WILDCARD(connfp, connp);
}
-void
-ipcl_proto_insert(conn_t *connp, uint8_t protocol)
-{
- connf_t *connfp;
- ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
-
- ASSERT(connp != NULL);
- ASSERT((connp->conn_mac_mode == CONN_MAC_DEFAULT) ||
- protocol == IPPROTO_AH || protocol == IPPROTO_ESP);
-
- connp->conn_ulp = protocol;
-
- /* Insert it in the protocol hash */
- connfp = &ipst->ips_ipcl_proto_fanout[protocol];
- IPCL_HASH_INSERT_WILDCARD(connfp, connp);
-}
-
-void
-ipcl_proto_insert_v6(conn_t *connp, uint8_t protocol)
-{
- connf_t *connfp;
- ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
-
- ASSERT(connp != NULL);
- ASSERT((connp->conn_mac_mode == CONN_MAC_DEFAULT) ||
- protocol == IPPROTO_AH || protocol == IPPROTO_ESP);
-
- connp->conn_ulp = protocol;
-
- /* Insert it in the Bind Hash */
- connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
- IPCL_HASH_INSERT_WILDCARD(connfp, connp);
-}
-
/*
* Because the classifier is used to classify inbound packets, the destination
* address is meant to be our local tunnel address (tunnel source), and the
* source the remote tunnel address (tunnel destination).
+ *
+ * Note that conn_proto can't be used for fanout since the upper protocol
+ * can be both 41 and 4 when IPv6 and IPv4 are over the same tunnel.
*/
conn_t *
ipcl_iptun_classify_v4(ipaddr_t *src, ipaddr_t *dst, ip_stack_t *ipst)
@@ -1128,13 +1020,13 @@ ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport)
oconnp = oconnp->conn_next) {
if (oconnp->conn_lport == lport &&
oconnp->conn_zoneid == connp->conn_zoneid &&
- oconnp->conn_af_isv6 == connp->conn_af_isv6 &&
- ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) ||
- IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_srcv6) ||
- IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6) ||
- IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_srcv6)) ||
- IN6_ARE_ADDR_EQUAL(&oconnp->conn_srcv6,
- &connp->conn_srcv6))) {
+ oconnp->conn_family == connp->conn_family &&
+ ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
+ IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_laddr_v6) ||
+ IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6) ||
+ IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_laddr_v6)) ||
+ IN6_ARE_ADDR_EQUAL(&oconnp->conn_laddr_v6,
+ &connp->conn_laddr_v6))) {
break;
}
}
@@ -1142,10 +1034,10 @@ ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport)
if (oconnp != NULL)
return (EADDRNOTAVAIL);
- if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6) ||
- IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_remv6)) {
- if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) ||
- IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6)) {
+ if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) ||
+ IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
+ if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
+ IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6)) {
IPCL_HASH_INSERT_WILDCARD(connfp, connp);
} else {
IPCL_HASH_INSERT_BOUND(connfp, connp);
@@ -1157,17 +1049,18 @@ ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport)
}
static int
-ipcl_iptun_hash_insert(conn_t *connp, ipaddr_t src, ipaddr_t dst,
- ip_stack_t *ipst)
+ipcl_iptun_hash_insert(conn_t *connp, ip_stack_t *ipst)
{
connf_t *connfp;
conn_t *tconnp;
+ ipaddr_t laddr = connp->conn_laddr_v4;
+ ipaddr_t faddr = connp->conn_faddr_v4;
- connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(src, dst)];
+ connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(laddr, faddr)];
mutex_enter(&connfp->connf_lock);
for (tconnp = connfp->connf_head; tconnp != NULL;
tconnp = tconnp->conn_next) {
- if (IPCL_IPTUN_MATCH(tconnp, src, dst)) {
+ if (IPCL_IPTUN_MATCH(tconnp, laddr, faddr)) {
/* A tunnel is already bound to these addresses. */
mutex_exit(&connfp->connf_lock);
return (EADDRINUSE);
@@ -1179,17 +1072,18 @@ ipcl_iptun_hash_insert(conn_t *connp, ipaddr_t src, ipaddr_t dst,
}
static int
-ipcl_iptun_hash_insert_v6(conn_t *connp, const in6_addr_t *src,
- const in6_addr_t *dst, ip_stack_t *ipst)
+ipcl_iptun_hash_insert_v6(conn_t *connp, ip_stack_t *ipst)
{
connf_t *connfp;
conn_t *tconnp;
+ in6_addr_t *laddr = &connp->conn_laddr_v6;
+ in6_addr_t *faddr = &connp->conn_faddr_v6;
- connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(src, dst)];
+ connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(laddr, faddr)];
mutex_enter(&connfp->connf_lock);
for (tconnp = connfp->connf_head; tconnp != NULL;
tconnp = tconnp->conn_next) {
- if (IPCL_IPTUN_MATCH_V6(tconnp, src, dst)) {
+ if (IPCL_IPTUN_MATCH_V6(tconnp, laddr, faddr)) {
/* A tunnel is already bound to these addresses. */
mutex_exit(&connfp->connf_lock);
return (EADDRINUSE);
@@ -1213,12 +1107,12 @@ check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst)
connf_t *connfp;
conn_t *tconn;
- connfp = &ipst->ips_ipcl_proto_fanout[connp->conn_ulp];
+ connfp = &ipst->ips_ipcl_proto_fanout_v4[connp->conn_proto];
mutex_enter(&connfp->connf_lock);
for (tconn = connfp->connf_head; tconn != NULL;
tconn = tconn->conn_next) {
/* We don't allow v4 fallback for v6 raw socket */
- if (connp->conn_af_isv6 != tconn->conn_af_isv6)
+ if (connp->conn_family != tconn->conn_family)
continue;
/* If neither is exempt, then there's no conflict */
if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
@@ -1228,9 +1122,9 @@ check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst)
if (connp->conn_zoneid == tconn->conn_zoneid)
continue;
/* If both are bound to different specific addrs, ok */
- if (connp->conn_src != INADDR_ANY &&
- tconn->conn_src != INADDR_ANY &&
- connp->conn_src != tconn->conn_src)
+ if (connp->conn_laddr_v4 != INADDR_ANY &&
+ tconn->conn_laddr_v4 != INADDR_ANY &&
+ connp->conn_laddr_v4 != tconn->conn_laddr_v4)
continue;
/* These two conflict; fail */
break;
@@ -1245,12 +1139,12 @@ check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst)
connf_t *connfp;
conn_t *tconn;
- connfp = &ipst->ips_ipcl_proto_fanout[connp->conn_ulp];
+ connfp = &ipst->ips_ipcl_proto_fanout_v6[connp->conn_proto];
mutex_enter(&connfp->connf_lock);
for (tconn = connfp->connf_head; tconn != NULL;
tconn = tconn->conn_next) {
/* We don't allow v4 fallback for v6 raw socket */
- if (connp->conn_af_isv6 != tconn->conn_af_isv6)
+ if (connp->conn_family != tconn->conn_family)
continue;
/* If neither is exempt, then there's no conflict */
if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
@@ -1260,9 +1154,10 @@ check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst)
if (connp->conn_zoneid == tconn->conn_zoneid)
continue;
/* If both are bound to different addrs, ok */
- if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) &&
- !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_srcv6) &&
- !IN6_ARE_ADDR_EQUAL(&connp->conn_srcv6, &tconn->conn_srcv6))
+ if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) &&
+ !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_laddr_v6) &&
+ !IN6_ARE_ADDR_EQUAL(&connp->conn_laddr_v6,
+ &tconn->conn_laddr_v6))
continue;
/* These two conflict; fail */
break;
@@ -1273,28 +1168,29 @@ check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst)
/*
* (v4, v6) bind hash insertion routines
+ * The caller has already setup the conn (conn_proto, conn_laddr_v6, conn_lport)
*/
+
+int
+ipcl_bind_insert(conn_t *connp)
+{
+ if (connp->conn_ipversion == IPV6_VERSION)
+ return (ipcl_bind_insert_v6(connp));
+ else
+ return (ipcl_bind_insert_v4(connp));
+}
+
int
-ipcl_bind_insert(conn_t *connp, uint8_t protocol, ipaddr_t src, uint16_t lport)
+ipcl_bind_insert_v4(conn_t *connp)
{
connf_t *connfp;
-#ifdef IPCL_DEBUG
- char buf[INET_NTOA_BUFSIZE];
-#endif
int ret = 0;
ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
-
- ASSERT(connp);
-
- IPCL_DEBUG_LVL(64, ("ipcl_bind_insert: connp %p, src = %s, "
- "port = %d\n", (void *)connp, inet_ntoa_r(src, buf), lport));
-
- connp->conn_ulp = protocol;
- IN6_IPADDR_TO_V4MAPPED(src, &connp->conn_srcv6);
- connp->conn_lport = lport;
+ uint16_t lport = connp->conn_lport;
+ uint8_t protocol = connp->conn_proto;
if (IPCL_IS_IPTUN(connp))
- return (ipcl_iptun_hash_insert(connp, src, INADDR_ANY, ipst));
+ return (ipcl_iptun_hash_insert(connp, ipst));
switch (protocol) {
default:
@@ -1304,45 +1200,40 @@ ipcl_bind_insert(conn_t *connp, uint8_t protocol, ipaddr_t src, uint16_t lport)
/* FALLTHROUGH */
case IPPROTO_UDP:
if (protocol == IPPROTO_UDP) {
- IPCL_DEBUG_LVL(64,
- ("ipcl_bind_insert: connp %p - udp\n",
- (void *)connp));
connfp = &ipst->ips_ipcl_udp_fanout[
IPCL_UDP_HASH(lport, ipst)];
} else {
- IPCL_DEBUG_LVL(64,
- ("ipcl_bind_insert: connp %p - protocol\n",
- (void *)connp));
- connfp = &ipst->ips_ipcl_proto_fanout[protocol];
+ connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
}
- if (connp->conn_rem != INADDR_ANY) {
+ if (connp->conn_faddr_v4 != INADDR_ANY) {
IPCL_HASH_INSERT_CONNECTED(connfp, connp);
- } else if (connp->conn_src != INADDR_ANY) {
+ } else if (connp->conn_laddr_v4 != INADDR_ANY) {
IPCL_HASH_INSERT_BOUND(connfp, connp);
} else {
IPCL_HASH_INSERT_WILDCARD(connfp, connp);
}
+ if (protocol == IPPROTO_RSVP)
+ ill_set_inputfn_all(ipst);
break;
case IPPROTO_TCP:
-
/* Insert it in the Bind Hash */
ASSERT(connp->conn_zoneid != ALL_ZONES);
connfp = &ipst->ips_ipcl_bind_fanout[
IPCL_BIND_HASH(lport, ipst)];
- if (connp->conn_src != INADDR_ANY) {
+ if (connp->conn_laddr_v4 != INADDR_ANY) {
IPCL_HASH_INSERT_BOUND(connfp, connp);
} else {
IPCL_HASH_INSERT_WILDCARD(connfp, connp);
}
if (cl_inet_listen != NULL) {
- ASSERT(!connp->conn_pkt_isv6);
+ ASSERT(connp->conn_ipversion == IPV4_VERSION);
connp->conn_flags |= IPCL_CL_LISTENER;
(*cl_inet_listen)(
connp->conn_netstack->netstack_stackid,
IPPROTO_TCP, AF_INET,
- (uint8_t *)&connp->conn_bound_source, lport, NULL);
+ (uint8_t *)&connp->conn_bound_addr_v4, lport, NULL);
}
break;
@@ -1355,20 +1246,16 @@ ipcl_bind_insert(conn_t *connp, uint8_t protocol, ipaddr_t src, uint16_t lport)
}
int
-ipcl_bind_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src,
- uint16_t lport)
+ipcl_bind_insert_v6(conn_t *connp)
{
connf_t *connfp;
int ret = 0;
ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
-
- ASSERT(connp != NULL); connp->conn_ulp = protocol;
- connp->conn_srcv6 = *src;
- connp->conn_lport = lport;
+ uint16_t lport = connp->conn_lport;
+ uint8_t protocol = connp->conn_proto;
if (IPCL_IS_IPTUN(connp)) {
- return (ipcl_iptun_hash_insert_v6(connp, src, &ipv6_all_zeros,
- ipst));
+ return (ipcl_iptun_hash_insert_v6(connp, ipst));
}
switch (protocol) {
@@ -1379,21 +1266,15 @@ ipcl_bind_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src,
/* FALLTHROUGH */
case IPPROTO_UDP:
if (protocol == IPPROTO_UDP) {
- IPCL_DEBUG_LVL(128,
- ("ipcl_bind_insert_v6: connp %p - udp\n",
- (void *)connp));
connfp = &ipst->ips_ipcl_udp_fanout[
IPCL_UDP_HASH(lport, ipst)];
} else {
- IPCL_DEBUG_LVL(128,
- ("ipcl_bind_insert_v6: connp %p - protocol\n",
- (void *)connp));
connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
}
- if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6)) {
+ if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
IPCL_HASH_INSERT_CONNECTED(connfp, connp);
- } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) {
+ } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
IPCL_HASH_INSERT_BOUND(connfp, connp);
} else {
IPCL_HASH_INSERT_WILDCARD(connfp, connp);
@@ -1401,13 +1282,11 @@ ipcl_bind_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src,
break;
case IPPROTO_TCP:
- /* XXX - Need a separate table for IN6_IS_ADDR_UNSPECIFIED? */
-
/* Insert it in the Bind Hash */
ASSERT(connp->conn_zoneid != ALL_ZONES);
connfp = &ipst->ips_ipcl_bind_fanout[
IPCL_BIND_HASH(lport, ipst)];
- if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) {
+ if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
IPCL_HASH_INSERT_BOUND(connfp, connp);
} else {
IPCL_HASH_INSERT_WILDCARD(connfp, connp);
@@ -1416,13 +1295,13 @@ ipcl_bind_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src,
sa_family_t addr_family;
uint8_t *laddrp;
- if (connp->conn_pkt_isv6) {
+ if (connp->conn_ipversion == IPV6_VERSION) {
addr_family = AF_INET6;
laddrp =
- (uint8_t *)&connp->conn_bound_source_v6;
+ (uint8_t *)&connp->conn_bound_addr_v6;
} else {
addr_family = AF_INET;
- laddrp = (uint8_t *)&connp->conn_bound_source;
+ laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
}
connp->conn_flags |= IPCL_CL_LISTENER;
(*cl_inet_listen)(
@@ -1441,43 +1320,35 @@ ipcl_bind_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src,
/*
* ipcl_conn_hash insertion routines.
+ * The caller has already set conn_proto and the addresses/ports in the conn_t.
*/
+
int
-ipcl_conn_insert(conn_t *connp, uint8_t protocol, ipaddr_t src,
- ipaddr_t rem, uint32_t ports)
+ipcl_conn_insert(conn_t *connp)
+{
+ if (connp->conn_ipversion == IPV6_VERSION)
+ return (ipcl_conn_insert_v6(connp));
+ else
+ return (ipcl_conn_insert_v4(connp));
+}
+
+int
+ipcl_conn_insert_v4(conn_t *connp)
{
connf_t *connfp;
- uint16_t *up;
conn_t *tconnp;
-#ifdef IPCL_DEBUG
- char sbuf[INET_NTOA_BUFSIZE], rbuf[INET_NTOA_BUFSIZE];
-#endif
- in_port_t lport;
int ret = 0;
ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
-
- IPCL_DEBUG_LVL(256, ("ipcl_conn_insert: connp %p, src = %s, "
- "dst = %s, ports = %x, protocol = %x", (void *)connp,
- inet_ntoa_r(src, sbuf), inet_ntoa_r(rem, rbuf),
- ports, protocol));
+ uint16_t lport = connp->conn_lport;
+ uint8_t protocol = connp->conn_proto;
if (IPCL_IS_IPTUN(connp))
- return (ipcl_iptun_hash_insert(connp, src, rem, ipst));
+ return (ipcl_iptun_hash_insert(connp, ipst));
switch (protocol) {
case IPPROTO_TCP:
- if (!(connp->conn_flags & IPCL_EAGER)) {
- /*
- * for a eager connection, i.e connections which
- * have just been created, the initialization is
- * already done in ip at conn_creation time, so
- * we can skip the checks here.
- */
- IPCL_CONN_INIT(connp, protocol, src, rem, ports);
- }
-
/*
- * For tcp, we check whether the connection tuple already
+ * For TCP, we check whether the connection tuple already
* exists before allowing the connection to proceed. We
* also allow indexing on the zoneid. This is to allow
* multiple shared stack zones to have the same tcp
@@ -1486,16 +1357,15 @@ ipcl_conn_insert(conn_t *connp, uint8_t protocol, ipaddr_t src,
* doesn't have to be unique.
*/
connfp = &ipst->ips_ipcl_conn_fanout[
- IPCL_CONN_HASH(connp->conn_rem,
+ IPCL_CONN_HASH(connp->conn_faddr_v4,
connp->conn_ports, ipst)];
mutex_enter(&connfp->connf_lock);
for (tconnp = connfp->connf_head; tconnp != NULL;
tconnp = tconnp->conn_next) {
- if ((IPCL_CONN_MATCH(tconnp, connp->conn_ulp,
- connp->conn_rem, connp->conn_src,
- connp->conn_ports)) &&
- (IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid))) {
-
+ if (IPCL_CONN_MATCH(tconnp, connp->conn_proto,
+ connp->conn_faddr_v4, connp->conn_laddr_v4,
+ connp->conn_ports) &&
+ IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
/* Already have a conn. bail out */
mutex_exit(&connfp->connf_lock);
return (EADDRINUSE);
@@ -1512,6 +1382,7 @@ ipcl_conn_insert(conn_t *connp, uint8_t protocol, ipaddr_t src,
}
ASSERT(connp->conn_recv != NULL);
+ ASSERT(connp->conn_recvicmp != NULL);
IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
mutex_exit(&connfp->connf_lock);
@@ -1523,7 +1394,6 @@ ipcl_conn_insert(conn_t *connp, uint8_t protocol, ipaddr_t src,
* from the hash first.
*/
IPCL_HASH_REMOVE(connp);
- lport = htons((uint16_t)(ntohl(ports) & 0xFFFF));
ret = ipcl_sctp_hash_insert(connp, lport);
break;
@@ -1540,18 +1410,16 @@ ipcl_conn_insert(conn_t *connp, uint8_t protocol, ipaddr_t src,
/* FALLTHROUGH */
case IPPROTO_UDP:
- up = (uint16_t *)&ports;
- IPCL_CONN_INIT(connp, protocol, src, rem, ports);
if (protocol == IPPROTO_UDP) {
connfp = &ipst->ips_ipcl_udp_fanout[
- IPCL_UDP_HASH(up[1], ipst)];
+ IPCL_UDP_HASH(lport, ipst)];
} else {
- connfp = &ipst->ips_ipcl_proto_fanout[protocol];
+ connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
}
- if (connp->conn_rem != INADDR_ANY) {
+ if (connp->conn_faddr_v4 != INADDR_ANY) {
IPCL_HASH_INSERT_CONNECTED(connfp, connp);
- } else if (connp->conn_src != INADDR_ANY) {
+ } else if (connp->conn_laddr_v4 != INADDR_ANY) {
IPCL_HASH_INSERT_BOUND(connfp, connp);
} else {
IPCL_HASH_INSERT_WILDCARD(connfp, connp);
@@ -1563,25 +1431,21 @@ ipcl_conn_insert(conn_t *connp, uint8_t protocol, ipaddr_t src,
}
int
-ipcl_conn_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src,
- const in6_addr_t *rem, uint32_t ports, uint_t ifindex)
+ipcl_conn_insert_v6(conn_t *connp)
{
connf_t *connfp;
- uint16_t *up;
conn_t *tconnp;
- in_port_t lport;
int ret = 0;
ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
+ uint16_t lport = connp->conn_lport;
+ uint8_t protocol = connp->conn_proto;
+ uint_t ifindex = connp->conn_bound_if;
if (IPCL_IS_IPTUN(connp))
- return (ipcl_iptun_hash_insert_v6(connp, src, rem, ipst));
+ return (ipcl_iptun_hash_insert_v6(connp, ipst));
switch (protocol) {
case IPPROTO_TCP:
- /* Just need to insert a conn struct */
- if (!(connp->conn_flags & IPCL_EAGER)) {
- IPCL_CONN_INIT_V6(connp, protocol, *src, *rem, ports);
- }
/*
* For tcp, we check whether the connection tuple already
@@ -1593,17 +1457,18 @@ ipcl_conn_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src,
* doesn't have to be unique.
*/
connfp = &ipst->ips_ipcl_conn_fanout[
- IPCL_CONN_HASH_V6(connp->conn_remv6, connp->conn_ports,
+ IPCL_CONN_HASH_V6(connp->conn_faddr_v6, connp->conn_ports,
ipst)];
mutex_enter(&connfp->connf_lock);
for (tconnp = connfp->connf_head; tconnp != NULL;
tconnp = tconnp->conn_next) {
- if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_ulp,
- connp->conn_remv6, connp->conn_srcv6,
+ /* NOTE: need to match zoneid. Bug in onnv-gate */
+ if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_proto,
+ connp->conn_faddr_v6, connp->conn_laddr_v6,
connp->conn_ports) &&
- (tconnp->conn_tcp->tcp_bound_if == 0 ||
- tconnp->conn_tcp->tcp_bound_if == ifindex) &&
- (IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid))) {
+ (tconnp->conn_bound_if == 0 ||
+ tconnp->conn_bound_if == ifindex) &&
+ IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
/* Already have a conn. bail out */
mutex_exit(&connfp->connf_lock);
return (EADDRINUSE);
@@ -1624,7 +1489,6 @@ ipcl_conn_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src,
case IPPROTO_SCTP:
IPCL_HASH_REMOVE(connp);
- lport = htons((uint16_t)(ntohl(ports) & 0xFFFF));
ret = ipcl_sctp_hash_insert(connp, lport);
break;
@@ -1634,18 +1498,16 @@ ipcl_conn_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src,
return (EADDRINUSE);
/* FALLTHROUGH */
case IPPROTO_UDP:
- up = (uint16_t *)&ports;
- IPCL_CONN_INIT_V6(connp, protocol, *src, *rem, ports);
if (protocol == IPPROTO_UDP) {
connfp = &ipst->ips_ipcl_udp_fanout[
- IPCL_UDP_HASH(up[1], ipst)];
+ IPCL_UDP_HASH(lport, ipst)];
} else {
connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
}
- if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6)) {
+ if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
IPCL_HASH_INSERT_CONNECTED(connfp, connp);
- } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) {
+ } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
IPCL_HASH_INSERT_BOUND(connfp, connp);
} else {
IPCL_HASH_INSERT_WILDCARD(connfp, connp);
@@ -1667,8 +1529,8 @@ ipcl_conn_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src,
* zone, then label checks are omitted.
*/
conn_t *
-ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid,
- ip_stack_t *ipst)
+ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
+ ip_recv_attr_t *ira, ip_stack_t *ipst)
{
ipha_t *ipha;
connf_t *connfp, *bind_connfp;
@@ -1677,8 +1539,7 @@ ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid,
uint32_t ports;
conn_t *connp;
uint16_t *up;
- boolean_t shared_addr;
- boolean_t unlabeled;
+ zoneid_t zoneid = ira->ira_zoneid;
ipha = (ipha_t *)mp->b_rptr;
up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET);
@@ -1692,11 +1553,14 @@ ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid,
mutex_enter(&connfp->connf_lock);
for (connp = connfp->connf_head; connp != NULL;
connp = connp->conn_next) {
- if ((IPCL_CONN_MATCH(connp, protocol,
- ipha->ipha_src, ipha->ipha_dst, ports)) &&
- (IPCL_ZONE_MATCH(connp, zoneid))) {
+ if (IPCL_CONN_MATCH(connp, protocol,
+ ipha->ipha_src, ipha->ipha_dst, ports) &&
+ (connp->conn_zoneid == zoneid ||
+ connp->conn_allzones ||
+ ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
+ (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
+ (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
break;
- }
}
if (connp != NULL) {
@@ -1713,48 +1577,19 @@ ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid,
}
mutex_exit(&connfp->connf_lock);
-
lport = up[1];
- unlabeled = B_FALSE;
- /* Cred cannot be null on IPv4 */
- if (is_system_labeled()) {
- cred_t *cr = msg_getcred(mp, NULL);
- ASSERT(cr != NULL);
- unlabeled = (crgetlabel(cr)->tsl_flags &
- TSLF_UNLABELED) != 0;
- }
- shared_addr = (zoneid == ALL_ZONES);
- if (shared_addr) {
- /*
- * No need to handle exclusive-stack zones since
- * ALL_ZONES only applies to the shared stack.
- */
- zoneid = tsol_mlp_findzone(protocol, lport);
- /*
- * If no shared MLP is found, tsol_mlp_findzone returns
- * ALL_ZONES. In that case, we assume it's SLP, and
- * search for the zone based on the packet label.
- *
- * If there is such a zone, we prefer to find a
- * connection in it. Otherwise, we look for a
- * MAC-exempt connection in any zone whose label
- * dominates the default label on the packet.
- */
- if (zoneid == ALL_ZONES)
- zoneid = tsol_packet_to_zoneid(mp);
- else
- unlabeled = B_FALSE;
- }
-
bind_connfp =
&ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
mutex_enter(&bind_connfp->connf_lock);
for (connp = bind_connfp->connf_head; connp != NULL;
connp = connp->conn_next) {
if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst,
- lport) && (IPCL_ZONE_MATCH(connp, zoneid) ||
- (unlabeled && shared_addr &&
- (connp->conn_mac_mode != CONN_MAC_DEFAULT))))
+ lport) &&
+ (connp->conn_zoneid == zoneid ||
+ connp->conn_allzones ||
+ ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
+ (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
+ (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
break;
}
@@ -1762,16 +1597,17 @@ ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid,
* If the matching connection is SLP on a private address, then
* the label on the packet must match the local zone's label.
* Otherwise, it must be in the label range defined by tnrh.
- * This is ensured by tsol_receive_label.
+ * This is ensured by tsol_receive_local.
+ *
+ * Note that we don't check tsol_receive_local for
+ * the connected case.
*/
- if (connp != NULL && is_system_labeled() &&
+ if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
!tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
- shared_addr, connp)) {
- DTRACE_PROBE3(
- tx__ip__log__info__classify__tcp,
- char *,
- "connp(1) could not receive mp(2)",
- conn_t *, connp, mblk_t *, mp);
+ ira, connp)) {
+ DTRACE_PROBE3(tx__ip__log__info__classify__tcp,
+ char *, "connp(1) could not receive mp(2)",
+ conn_t *, connp, mblk_t *, mp);
connp = NULL;
}
@@ -1783,61 +1619,27 @@ ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid,
}
mutex_exit(&bind_connfp->connf_lock);
-
- IPCL_DEBUG_LVL(512,
- ("ipcl_classify: couldn't classify mp = %p\n",
- (void *)mp));
break;
case IPPROTO_UDP:
lport = up[1];
- unlabeled = B_FALSE;
- /* Cred cannot be null on IPv4 */
- if (is_system_labeled()) {
- cred_t *cr = msg_getcred(mp, NULL);
- ASSERT(cr != NULL);
- unlabeled = (crgetlabel(cr)->tsl_flags &
- TSLF_UNLABELED) != 0;
- }
- shared_addr = (zoneid == ALL_ZONES);
- if (shared_addr) {
- /*
- * No need to handle exclusive-stack zones since
- * ALL_ZONES only applies to the shared stack.
- */
- zoneid = tsol_mlp_findzone(protocol, lport);
- /*
- * If no shared MLP is found, tsol_mlp_findzone returns
- * ALL_ZONES. In that case, we assume it's SLP, and
- * search for the zone based on the packet label.
- *
- * If there is such a zone, we prefer to find a
- * connection in it. Otherwise, we look for a
- * MAC-exempt connection in any zone whose label
- * dominates the default label on the packet.
- */
- if (zoneid == ALL_ZONES)
- zoneid = tsol_packet_to_zoneid(mp);
- else
- unlabeled = B_FALSE;
- }
fport = up[0];
- IPCL_DEBUG_LVL(512, ("ipcl_udp_classify %x %x", lport, fport));
connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
mutex_enter(&connfp->connf_lock);
for (connp = connfp->connf_head; connp != NULL;
connp = connp->conn_next) {
if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst,
fport, ipha->ipha_src) &&
- (IPCL_ZONE_MATCH(connp, zoneid) ||
- (unlabeled && shared_addr &&
- (connp->conn_mac_mode != CONN_MAC_DEFAULT))))
+ (connp->conn_zoneid == zoneid ||
+ connp->conn_allzones ||
+ ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
+ (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE))))
break;
}
- if (connp != NULL && is_system_labeled() &&
+ if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
!tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
- shared_addr, connp)) {
+ ira, connp)) {
DTRACE_PROBE3(tx__ip__log__info__classify__udp,
char *, "connp(1) could not receive mp(2)",
conn_t *, connp, mblk_t *, mp);
@@ -1854,9 +1656,7 @@ ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid,
* We shouldn't come here for multicast/broadcast packets
*/
mutex_exit(&connfp->connf_lock);
- IPCL_DEBUG_LVL(512,
- ("ipcl_classify: cant find udp conn_t for ports : %x %x",
- lport, fport));
+
break;
case IPPROTO_ENCAP:
@@ -1869,26 +1669,25 @@ ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid,
}
conn_t *
-ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid,
- ip_stack_t *ipst)
+ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
+ ip_recv_attr_t *ira, ip_stack_t *ipst)
{
ip6_t *ip6h;
connf_t *connfp, *bind_connfp;
uint16_t lport;
uint16_t fport;
- tcph_t *tcph;
+ tcpha_t *tcpha;
uint32_t ports;
conn_t *connp;
uint16_t *up;
- boolean_t shared_addr;
- boolean_t unlabeled;
+ zoneid_t zoneid = ira->ira_zoneid;
ip6h = (ip6_t *)mp->b_rptr;
switch (protocol) {
case IPPROTO_TCP:
- tcph = (tcph_t *)&mp->b_rptr[hdr_len];
- up = (uint16_t *)tcph->th_lport;
+ tcpha = (tcpha_t *)&mp->b_rptr[hdr_len];
+ up = &tcpha->tha_lport;
ports = *(uint32_t *)up;
connfp =
@@ -1897,11 +1696,14 @@ ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid,
mutex_enter(&connfp->connf_lock);
for (connp = connfp->connf_head; connp != NULL;
connp = connp->conn_next) {
- if ((IPCL_CONN_MATCH_V6(connp, protocol,
- ip6h->ip6_src, ip6h->ip6_dst, ports)) &&
- (IPCL_ZONE_MATCH(connp, zoneid))) {
+ if (IPCL_CONN_MATCH_V6(connp, protocol,
+ ip6h->ip6_src, ip6h->ip6_dst, ports) &&
+ (connp->conn_zoneid == zoneid ||
+ connp->conn_allzones ||
+ ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
+ (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
+ (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
break;
- }
}
if (connp != NULL) {
@@ -1920,37 +1722,6 @@ ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid,
mutex_exit(&connfp->connf_lock);
lport = up[1];
- unlabeled = B_FALSE;
- /* Cred can be null on IPv6 */
- if (is_system_labeled()) {
- cred_t *cr = msg_getcred(mp, NULL);
-
- unlabeled = (cr != NULL &&
- crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0;
- }
- shared_addr = (zoneid == ALL_ZONES);
- if (shared_addr) {
- /*
- * No need to handle exclusive-stack zones since
- * ALL_ZONES only applies to the shared stack.
- */
- zoneid = tsol_mlp_findzone(protocol, lport);
- /*
- * If no shared MLP is found, tsol_mlp_findzone returns
- * ALL_ZONES. In that case, we assume it's SLP, and
- * search for the zone based on the packet label.
- *
- * If there is such a zone, we prefer to find a
- * connection in it. Otherwise, we look for a
- * MAC-exempt connection in any zone whose label
- * dominates the default label on the packet.
- */
- if (zoneid == ALL_ZONES)
- zoneid = tsol_packet_to_zoneid(mp);
- else
- unlabeled = B_FALSE;
- }
-
bind_connfp =
&ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
mutex_enter(&bind_connfp->connf_lock);
@@ -1958,15 +1729,17 @@ ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid,
connp = connp->conn_next) {
if (IPCL_BIND_MATCH_V6(connp, protocol,
ip6h->ip6_dst, lport) &&
- (IPCL_ZONE_MATCH(connp, zoneid) ||
- (unlabeled && shared_addr &&
- (connp->conn_mac_mode != CONN_MAC_DEFAULT))))
+ (connp->conn_zoneid == zoneid ||
+ connp->conn_allzones ||
+ ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
+ (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
+ (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
break;
}
- if (connp != NULL && is_system_labeled() &&
+ if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
!tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
- shared_addr, connp)) {
+ ira, connp)) {
DTRACE_PROBE3(tx__ip__log__info__classify__tcp6,
char *, "connp(1) could not receive mp(2)",
conn_t *, connp, mblk_t *, mp);
@@ -1977,72 +1750,33 @@ ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid,
/* Have a listner at least */
CONN_INC_REF(connp);
mutex_exit(&bind_connfp->connf_lock);
- IPCL_DEBUG_LVL(512,
- ("ipcl_classify_v6: found listner "
- "connp = %p\n", (void *)connp));
-
return (connp);
}
mutex_exit(&bind_connfp->connf_lock);
-
- IPCL_DEBUG_LVL(512,
- ("ipcl_classify_v6: couldn't classify mp = %p\n",
- (void *)mp));
break;
case IPPROTO_UDP:
up = (uint16_t *)&mp->b_rptr[hdr_len];
lport = up[1];
- unlabeled = B_FALSE;
- /* Cred can be null on IPv6 */
- if (is_system_labeled()) {
- cred_t *cr = msg_getcred(mp, NULL);
-
- unlabeled = (cr != NULL &&
- crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0;
- }
- shared_addr = (zoneid == ALL_ZONES);
- if (shared_addr) {
- /*
- * No need to handle exclusive-stack zones since
- * ALL_ZONES only applies to the shared stack.
- */
- zoneid = tsol_mlp_findzone(protocol, lport);
- /*
- * If no shared MLP is found, tsol_mlp_findzone returns
- * ALL_ZONES. In that case, we assume it's SLP, and
- * search for the zone based on the packet label.
- *
- * If there is such a zone, we prefer to find a
- * connection in it. Otherwise, we look for a
- * MAC-exempt connection in any zone whose label
- * dominates the default label on the packet.
- */
- if (zoneid == ALL_ZONES)
- zoneid = tsol_packet_to_zoneid(mp);
- else
- unlabeled = B_FALSE;
- }
-
fport = up[0];
- IPCL_DEBUG_LVL(512, ("ipcl_udp_classify_v6 %x %x", lport,
- fport));
connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
mutex_enter(&connfp->connf_lock);
for (connp = connfp->connf_head; connp != NULL;
connp = connp->conn_next) {
if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst,
fport, ip6h->ip6_src) &&
- (IPCL_ZONE_MATCH(connp, zoneid) ||
- (unlabeled && shared_addr &&
- (connp->conn_mac_mode != CONN_MAC_DEFAULT))))
+ (connp->conn_zoneid == zoneid ||
+ connp->conn_allzones ||
+ ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
+ (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
+ (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
break;
}
- if (connp != NULL && is_system_labeled() &&
+ if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
!tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
- shared_addr, connp)) {
+ ira, connp)) {
DTRACE_PROBE3(tx__ip__log__info__classify__udp6,
char *, "connp(1) could not receive mp(2)",
conn_t *, connp, mblk_t *, mp);
@@ -2059,9 +1793,6 @@ ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid,
* We shouldn't come here for multicast/broadcast packets
*/
mutex_exit(&connfp->connf_lock);
- IPCL_DEBUG_LVL(512,
- ("ipcl_classify_v6: cant find udp conn_t for ports : %x %x",
- lport, fport));
break;
case IPPROTO_ENCAP:
case IPPROTO_IPV6:
@@ -2076,125 +1807,80 @@ ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid,
* wrapper around ipcl_classify_(v4,v6) routines.
*/
conn_t *
-ipcl_classify(mblk_t *mp, zoneid_t zoneid, ip_stack_t *ipst)
+ipcl_classify(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst)
{
- uint16_t hdr_len;
- ipha_t *ipha;
- uint8_t *nexthdrp;
-
- if (MBLKL(mp) < sizeof (ipha_t))
- return (NULL);
-
- switch (IPH_HDR_VERSION(mp->b_rptr)) {
- case IPV4_VERSION:
- ipha = (ipha_t *)mp->b_rptr;
- hdr_len = IPH_HDR_LENGTH(ipha);
- return (ipcl_classify_v4(mp, ipha->ipha_protocol, hdr_len,
- zoneid, ipst));
- case IPV6_VERSION:
- if (!ip_hdr_length_nexthdr_v6(mp, (ip6_t *)mp->b_rptr,
- &hdr_len, &nexthdrp))
- return (NULL);
-
- return (ipcl_classify_v6(mp, *nexthdrp, hdr_len, zoneid, ipst));
+ if (ira->ira_flags & IRAF_IS_IPV4) {
+ return (ipcl_classify_v4(mp, ira->ira_protocol,
+ ira->ira_ip_hdr_length, ira, ipst));
+ } else {
+ return (ipcl_classify_v6(mp, ira->ira_protocol,
+ ira->ira_ip_hdr_length, ira, ipst));
}
-
- return (NULL);
}
+/*
+ * Only used to classify SCTP RAW sockets
+ */
conn_t *
-ipcl_classify_raw(mblk_t *mp, uint8_t protocol, zoneid_t zoneid,
- uint32_t ports, ipha_t *hdr, ip_stack_t *ipst)
+ipcl_classify_raw(mblk_t *mp, uint8_t protocol, uint32_t ports,
+ ipha_t *ipha, ip6_t *ip6h, ip_recv_attr_t *ira, ip_stack_t *ipst)
{
connf_t *connfp;
conn_t *connp;
in_port_t lport;
- int af;
- boolean_t shared_addr;
- boolean_t unlabeled;
+ int ipversion;
const void *dst;
+ zoneid_t zoneid = ira->ira_zoneid;
lport = ((uint16_t *)&ports)[1];
-
- unlabeled = B_FALSE;
- /* Cred can be null on IPv6 */
- if (is_system_labeled()) {
- cred_t *cr = msg_getcred(mp, NULL);
-
- unlabeled = (cr != NULL &&
- crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0;
- }
- shared_addr = (zoneid == ALL_ZONES);
- if (shared_addr) {
- /*
- * No need to handle exclusive-stack zones since ALL_ZONES
- * only applies to the shared stack.
- */
- zoneid = tsol_mlp_findzone(protocol, lport);
- /*
- * If no shared MLP is found, tsol_mlp_findzone returns
- * ALL_ZONES. In that case, we assume it's SLP, and search for
- * the zone based on the packet label.
- *
- * If there is such a zone, we prefer to find a connection in
- * it. Otherwise, we look for a MAC-exempt connection in any
- * zone whose label dominates the default label on the packet.
- */
- if (zoneid == ALL_ZONES)
- zoneid = tsol_packet_to_zoneid(mp);
- else
- unlabeled = B_FALSE;
+ if (ira->ira_flags & IRAF_IS_IPV4) {
+ dst = (const void *)&ipha->ipha_dst;
+ ipversion = IPV4_VERSION;
+ } else {
+ dst = (const void *)&ip6h->ip6_dst;
+ ipversion = IPV6_VERSION;
}
- af = IPH_HDR_VERSION(hdr);
- dst = af == IPV4_VERSION ? (const void *)&hdr->ipha_dst :
- (const void *)&((ip6_t *)hdr)->ip6_dst;
connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];
-
mutex_enter(&connfp->connf_lock);
for (connp = connfp->connf_head; connp != NULL;
connp = connp->conn_next) {
/* We don't allow v4 fallback for v6 raw socket. */
- if (af == (connp->conn_af_isv6 ? IPV4_VERSION :
- IPV6_VERSION))
+ if (ipversion != connp->conn_ipversion)
continue;
- if (connp->conn_fully_bound) {
- if (af == IPV4_VERSION) {
+ if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
+ !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
+ if (ipversion == IPV4_VERSION) {
if (!IPCL_CONN_MATCH(connp, protocol,
- hdr->ipha_src, hdr->ipha_dst, ports))
+ ipha->ipha_src, ipha->ipha_dst, ports))
continue;
} else {
if (!IPCL_CONN_MATCH_V6(connp, protocol,
- ((ip6_t *)hdr)->ip6_src,
- ((ip6_t *)hdr)->ip6_dst, ports))
+ ip6h->ip6_src, ip6h->ip6_dst, ports))
continue;
}
} else {
- if (af == IPV4_VERSION) {
+ if (ipversion == IPV4_VERSION) {
if (!IPCL_BIND_MATCH(connp, protocol,
- hdr->ipha_dst, lport))
+ ipha->ipha_dst, lport))
continue;
} else {
if (!IPCL_BIND_MATCH_V6(connp, protocol,
- ((ip6_t *)hdr)->ip6_dst, lport))
+ ip6h->ip6_dst, lport))
continue;
}
}
- if (IPCL_ZONE_MATCH(connp, zoneid) ||
- (unlabeled &&
- (connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
- shared_addr))
+ if (connp->conn_zoneid == zoneid ||
+ connp->conn_allzones ||
+ ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
+ (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
+ (ira->ira_flags & IRAF_TX_SHARED_ADDR)))
break;
}
- /*
- * If the connection is fully-bound and connection-oriented (TCP or
- * SCTP), then we've already validated the remote system's label.
- * There's no need to do it again for every packet.
- */
- if (connp != NULL && is_system_labeled() && (!connp->conn_fully_bound ||
- !(connp->conn_flags & (IPCL_TCP|IPCL_SCTPCONN))) &&
- !tsol_receive_local(mp, dst, af, shared_addr, connp)) {
+
+ if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
+ !tsol_receive_local(mp, dst, ipversion, ira, connp)) {
DTRACE_PROBE3(tx__ip__log__info__classify__rawip,
char *, "connp(1) could not receive mp(2)",
conn_t *, connp, mblk_t *, mp);
@@ -2205,22 +1891,22 @@ ipcl_classify_raw(mblk_t *mp, uint8_t protocol, zoneid_t zoneid,
goto found;
mutex_exit(&connfp->connf_lock);
- /* Try to look for a wildcard match. */
+ /* Try to look for a wildcard SCTP RAW socket match. */
connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(0, ipst)];
mutex_enter(&connfp->connf_lock);
for (connp = connfp->connf_head; connp != NULL;
connp = connp->conn_next) {
/* We don't allow v4 fallback for v6 raw socket. */
- if ((af == (connp->conn_af_isv6 ? IPV4_VERSION :
- IPV6_VERSION)) || !IPCL_ZONE_MATCH(connp, zoneid)) {
+ if (ipversion != connp->conn_ipversion)
continue;
- }
- if (af == IPV4_VERSION) {
- if (IPCL_RAW_MATCH(connp, protocol, hdr->ipha_dst))
+ if (!IPCL_ZONE_MATCH(connp, zoneid))
+ continue;
+
+ if (ipversion == IPV4_VERSION) {
+ if (IPCL_RAW_MATCH(connp, protocol, ipha->ipha_dst))
break;
} else {
- if (IPCL_RAW_MATCH_V6(connp, protocol,
- ((ip6_t *)hdr)->ip6_dst)) {
+ if (IPCL_RAW_MATCH_V6(connp, protocol, ip6h->ip6_dst)) {
break;
}
}
@@ -2253,11 +1939,23 @@ tcp_conn_constructor(void *buf, void *cdrarg, int kmflags)
mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
cv_init(&connp->conn_sq_cv, NULL, CV_DEFAULT, NULL);
- tcp->tcp_timercache = tcp_timermp_alloc(KM_NOSLEEP);
+ tcp->tcp_timercache = tcp_timermp_alloc(kmflags);
+ if (tcp->tcp_timercache == NULL)
+ return (ENOMEM);
connp->conn_tcp = tcp;
connp->conn_flags = IPCL_TCPCONN;
- connp->conn_ulp = IPPROTO_TCP;
+ connp->conn_proto = IPPROTO_TCP;
tcp->tcp_connp = connp;
+ rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
+
+ connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
+ if (connp->conn_ixa == NULL) {
+ tcp_timermp_free(tcp);
+ return (ENOMEM);
+ }
+ connp->conn_ixa->ixa_refcnt = 1;
+ connp->conn_ixa->ixa_protocol = connp->conn_proto;
+ connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
return (0);
}
@@ -2276,6 +1974,15 @@ tcp_conn_destructor(void *buf, void *cdrarg)
mutex_destroy(&connp->conn_lock);
cv_destroy(&connp->conn_cv);
cv_destroy(&connp->conn_sq_cv);
+ rw_destroy(&connp->conn_ilg_lock);
+
+ /* Can be NULL if constructor failed */
+ if (connp->conn_ixa != NULL) {
+ ASSERT(connp->conn_ixa->ixa_refcnt == 1);
+ ASSERT(connp->conn_ixa->ixa_ire == NULL);
+ ASSERT(connp->conn_ixa->ixa_nce == NULL);
+ ixa_refrele(connp->conn_ixa);
+ }
}
/* ARGSUSED */
@@ -2289,7 +1996,13 @@ ip_conn_constructor(void *buf, void *cdrarg, int kmflags)
mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
connp->conn_flags = IPCL_IPCCONN;
+ rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
+ connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
+ if (connp->conn_ixa == NULL)
+ return (ENOMEM);
+ connp->conn_ixa->ixa_refcnt = 1;
+ connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
return (0);
}
@@ -2304,6 +2017,15 @@ ip_conn_destructor(void *buf, void *cdrarg)
ASSERT(connp->conn_priv == NULL);
mutex_destroy(&connp->conn_lock);
cv_destroy(&connp->conn_cv);
+ rw_destroy(&connp->conn_ilg_lock);
+
+ /* Can be NULL if constructor failed */
+ if (connp->conn_ixa != NULL) {
+ ASSERT(connp->conn_ixa->ixa_refcnt == 1);
+ ASSERT(connp->conn_ixa->ixa_ire == NULL);
+ ASSERT(connp->conn_ixa->ixa_nce == NULL);
+ ixa_refrele(connp->conn_ixa);
+ }
}
/* ARGSUSED */
@@ -2321,8 +2043,15 @@ udp_conn_constructor(void *buf, void *cdrarg, int kmflags)
cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
connp->conn_udp = udp;
connp->conn_flags = IPCL_UDPCONN;
- connp->conn_ulp = IPPROTO_UDP;
+ connp->conn_proto = IPPROTO_UDP;
udp->udp_connp = connp;
+ rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
+ connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
+ if (connp->conn_ixa == NULL)
+ return (ENOMEM);
+ connp->conn_ixa->ixa_refcnt = 1;
+ connp->conn_ixa->ixa_protocol = connp->conn_proto;
+ connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
return (0);
}
@@ -2339,6 +2068,15 @@ udp_conn_destructor(void *buf, void *cdrarg)
ASSERT(connp->conn_udp == udp);
mutex_destroy(&connp->conn_lock);
cv_destroy(&connp->conn_cv);
+ rw_destroy(&connp->conn_ilg_lock);
+
+ /* Can be NULL if constructor failed */
+ if (connp->conn_ixa != NULL) {
+ ASSERT(connp->conn_ixa->ixa_refcnt == 1);
+ ASSERT(connp->conn_ixa->ixa_ire == NULL);
+ ASSERT(connp->conn_ixa->ixa_nce == NULL);
+ ixa_refrele(connp->conn_ixa);
+ }
}
/* ARGSUSED */
@@ -2356,8 +2094,15 @@ rawip_conn_constructor(void *buf, void *cdrarg, int kmflags)
cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
connp->conn_icmp = icmp;
connp->conn_flags = IPCL_RAWIPCONN;
- connp->conn_ulp = IPPROTO_ICMP;
+ connp->conn_proto = IPPROTO_ICMP;
icmp->icmp_connp = connp;
+ rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
+ connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
+ if (connp->conn_ixa == NULL)
+ return (ENOMEM);
+ connp->conn_ixa->ixa_refcnt = 1;
+ connp->conn_ixa->ixa_protocol = connp->conn_proto;
+ connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
return (0);
}
@@ -2374,6 +2119,15 @@ rawip_conn_destructor(void *buf, void *cdrarg)
ASSERT(connp->conn_icmp == icmp);
mutex_destroy(&connp->conn_lock);
cv_destroy(&connp->conn_cv);
+ rw_destroy(&connp->conn_ilg_lock);
+
+ /* Can be NULL if constructor failed */
+ if (connp->conn_ixa != NULL) {
+ ASSERT(connp->conn_ixa->ixa_refcnt == 1);
+ ASSERT(connp->conn_ixa->ixa_ire == NULL);
+ ASSERT(connp->conn_ixa->ixa_nce == NULL);
+ ixa_refrele(connp->conn_ixa);
+ }
}
/* ARGSUSED */
@@ -2392,6 +2146,12 @@ rts_conn_constructor(void *buf, void *cdrarg, int kmflags)
connp->conn_rts = rts;
connp->conn_flags = IPCL_RTSCONN;
rts->rts_connp = connp;
+ rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
+ connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
+ if (connp->conn_ixa == NULL)
+ return (ENOMEM);
+ connp->conn_ixa->ixa_refcnt = 1;
+ connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
return (0);
}
@@ -2408,71 +2168,35 @@ rts_conn_destructor(void *buf, void *cdrarg)
ASSERT(connp->conn_rts == rts);
mutex_destroy(&connp->conn_lock);
cv_destroy(&connp->conn_cv);
-}
+ rw_destroy(&connp->conn_ilg_lock);
-/* ARGSUSED */
-int
-ip_helper_stream_constructor(void *buf, void *cdrarg, int kmflags)
-{
- int error;
- netstack_t *ns;
- int ret;
- tcp_stack_t *tcps;
- ip_helper_stream_info_t *ip_helper_str;
- ip_stack_t *ipst;
-
- ns = netstack_find_by_cred(kcred);
- ASSERT(ns != NULL);
- tcps = ns->netstack_tcp;
- ipst = ns->netstack_ip;
- ASSERT(tcps != NULL);
- ip_helper_str = (ip_helper_stream_info_t *)buf;
-
- do {
- error = ldi_open_by_name(DEV_IP, IP_HELPER_STR, kcred,
- &ip_helper_str->iphs_handle, ipst->ips_ldi_ident);
- } while (error == EINTR);
-
- if (error == 0) {
- do {
- error = ldi_ioctl(
- ip_helper_str->iphs_handle, SIOCSQPTR,
- (intptr_t)buf, FKIOCTL, kcred, &ret);
- } while (error == EINTR);
-
- if (error != 0) {
- (void) ldi_close(
- ip_helper_str->iphs_handle, 0, kcred);
- }
+ /* Can be NULL if constructor failed */
+ if (connp->conn_ixa != NULL) {
+ ASSERT(connp->conn_ixa->ixa_refcnt == 1);
+ ASSERT(connp->conn_ixa->ixa_ire == NULL);
+ ASSERT(connp->conn_ixa->ixa_nce == NULL);
+ ixa_refrele(connp->conn_ixa);
}
-
- netstack_rele(ipst->ips_netstack);
-
- return (error);
}
-/* ARGSUSED */
-static void
-ip_helper_stream_destructor(void *buf, void *cdrarg)
-{
- ip_helper_stream_info_t *ip_helper_str = (ip_helper_stream_info_t *)buf;
-
- ip_helper_str->iphs_rq->q_ptr =
- ip_helper_str->iphs_wq->q_ptr =
- ip_helper_str->iphs_minfo;
- (void) ldi_close(ip_helper_str->iphs_handle, 0, kcred);
-}
-
-
/*
* Called as part of ipcl_conn_destroy to assert and clear any pointers
* in the conn_t.
+ *
+ * Below we list all the pointers in the conn_t as a documentation aid.
+ * The ones that we can not ASSERT to be NULL are #ifdef'ed out.
+ * If you add any pointers to the conn_t please add an ASSERT here
+ * and #ifdef it out if it can't be actually asserted to be NULL.
+ * In any case, we bzero most of the conn_t at the end of the function.
*/
void
ipcl_conn_cleanup(conn_t *connp)
{
- ASSERT(connp->conn_ire_cache == NULL);
+ ip_xmit_attr_t *ixa;
+
ASSERT(connp->conn_latch == NULL);
+ ASSERT(connp->conn_latch_in_policy == NULL);
+ ASSERT(connp->conn_latch_in_action == NULL);
#ifdef notdef
ASSERT(connp->conn_rq == NULL);
ASSERT(connp->conn_wq == NULL);
@@ -2485,18 +2209,6 @@ ipcl_conn_cleanup(conn_t *connp)
ASSERT(connp->conn_fanout == NULL);
ASSERT(connp->conn_next == NULL);
ASSERT(connp->conn_prev == NULL);
-#ifdef notdef
- /*
- * The ill and ipif pointers are not cleared before the conn_t
- * goes away since they do not hold a reference on the ill/ipif.
- * We should replace these pointers with ifindex/ipaddr_t to
- * make the code less complex.
- */
- ASSERT(connp->conn_outgoing_ill == NULL);
- ASSERT(connp->conn_incoming_ill == NULL);
- ASSERT(connp->conn_multicast_ipif == NULL);
- ASSERT(connp->conn_multicast_ill == NULL);
-#endif
ASSERT(connp->conn_oper_pending_ill == NULL);
ASSERT(connp->conn_ilg == NULL);
ASSERT(connp->conn_drain_next == NULL);
@@ -2506,10 +2218,19 @@ ipcl_conn_cleanup(conn_t *connp)
ASSERT(connp->conn_idl == NULL);
#endif
ASSERT(connp->conn_ipsec_opt_mp == NULL);
- ASSERT(connp->conn_effective_cred == NULL);
+#ifdef notdef
+ /* conn_netstack is cleared by the caller; needed by ixa_cleanup */
ASSERT(connp->conn_netstack == NULL);
+#endif
ASSERT(connp->conn_helper_info == NULL);
+ ASSERT(connp->conn_ixa != NULL);
+ ixa = connp->conn_ixa;
+ ASSERT(ixa->ixa_refcnt == 1);
+ /* Need to preserve ixa_protocol */
+ ixa_cleanup(ixa);
+ ixa->ixa_flags = 0;
+
/* Clear out the conn_t fields that are not preserved */
bzero(&connp->conn_start_clr,
sizeof (conn_t) -
@@ -2602,10 +2323,11 @@ ipcl_globalhash_remove(conn_t *connp)
/*
* Walk the list of all conn_t's in the system, calling the function provided
- * with the specified argument for each.
+ * with the specified argument for each.
* Applies to both IPv4 and IPv6.
*
- * IPCs may hold pointers to ipif/ill. To guard against stale pointers
+ * CONNs may hold pointers to ills (conn_dhcpinit_ill and
+ * conn_oper_pending_ill). To guard against stale pointers
* ipcl_walk() is called to cleanup the conn_t's, typically when an interface is
* unplumbed or removed. New conn_t's that are created while we are walking
* may be missed by this walk, because they are not necessarily inserted
@@ -2657,7 +2379,7 @@ ipcl_walk(pfv_t func, void *arg, ip_stack_t *ipst)
* (peer tcp in ESTABLISHED state).
*/
conn_t *
-ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcph_t *tcph,
+ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcpha_t *tcpha,
ip_stack_t *ipst)
{
uint32_t ports;
@@ -2675,8 +2397,8 @@ ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcph_t *tcph,
zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) ||
ipha->ipha_dst == htonl(INADDR_LOOPBACK));
- bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t));
- bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t));
+ pports[0] = tcpha->tha_fport;
+ pports[1] = tcpha->tha_lport;
connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
ports, ipst)];
@@ -2707,7 +2429,7 @@ ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcph_t *tcph,
* (peer tcp in ESTABLISHED state).
*/
conn_t *
-ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcph_t *tcph,
+ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcpha_t *tcpha,
ip_stack_t *ipst)
{
uint32_t ports;
@@ -2728,8 +2450,8 @@ ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcph_t *tcph,
zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) ||
IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst));
- bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t));
- bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t));
+ pports[0] = tcpha->tha_fport;
+ pports[1] = tcpha->tha_lport;
connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
ports, ipst)];
@@ -2738,7 +2460,7 @@ ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcph_t *tcph,
for (tconnp = connfp->connf_head; tconnp != NULL;
tconnp = tconnp->conn_next) {
- /* We skip tcp_bound_if check here as this is loopback tcp */
+ /* We skip conn_bound_if check here as this is loopback tcp */
if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
ip6h->ip6_dst, ip6h->ip6_src, ports) &&
tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
@@ -2760,7 +2482,7 @@ ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcph_t *tcph,
* Only checks for connected entries i.e. no INADDR_ANY checks.
*/
conn_t *
-ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcph_t *tcph, int min_state,
+ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcpha_t *tcpha, int min_state,
ip_stack_t *ipst)
{
uint32_t ports;
@@ -2769,8 +2491,8 @@ ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcph_t *tcph, int min_state,
conn_t *tconnp;
pports = (uint16_t *)&ports;
- bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t));
- bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t));
+ pports[0] = tcpha->tha_fport;
+ pports[1] = tcpha->tha_lport;
connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
ports, ipst)];
@@ -2823,8 +2545,8 @@ ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state,
if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
ip6h->ip6_dst, ip6h->ip6_src, ports) &&
tcp->tcp_state >= min_state &&
- (tcp->tcp_bound_if == 0 ||
- tcp->tcp_bound_if == ifindex)) {
+ (tconnp->conn_bound_if == 0 ||
+ tconnp->conn_bound_if == ifindex)) {
CONN_INC_REF(tconnp);
mutex_exit(&connfp->connf_lock);
@@ -2901,8 +2623,8 @@ ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex,
tcp = connp->conn_tcp;
if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) &&
IPCL_ZONE_MATCH(connp, zoneid) &&
- (tcp->tcp_bound_if == 0 ||
- tcp->tcp_bound_if == ifindex) &&
+ (connp->conn_bound_if == 0 ||
+ connp->conn_bound_if == ifindex) &&
tcp->tcp_listener == NULL) {
CONN_INC_REF(connp);
mutex_exit(&bind_connfp->connf_lock);
diff --git a/usr/src/uts/common/inet/ip/ipdrop.c b/usr/src/uts/common/inet/ip/ipdrop.c
index 6d08ec9d60..0f257d6cd2 100644
--- a/usr/src/uts/common/inet/ip/ipdrop.c
+++ b/usr/src/uts/common/inet/ip/ipdrop.c
@@ -29,11 +29,11 @@
#include <sys/sunddi.h>
#include <sys/kstat.h>
#include <sys/kmem.h>
+#include <sys/sdt.h>
#include <net/pfkeyv2.h>
#include <inet/common.h>
#include <inet/ip.h>
#include <inet/ip6.h>
-#include <inet/ipsec_info.h>
#include <inet/ipsec_impl.h>
#include <inet/ipdrop.h>
@@ -246,16 +246,11 @@ ip_drop_unregister(ipdropper_t *ipd)
* Actually drop a packet. Many things could happen here, but at the least,
* the packet will be freemsg()ed.
*/
-/* ARGSUSED */
void
-ip_drop_packet(mblk_t *mp, boolean_t inbound, ill_t *arriving,
- ire_t *outbound_ire, struct kstat_named *counter, ipdropper_t *who_called)
+ip_drop_packet(mblk_t *mp, boolean_t inbound, ill_t *ill,
+ struct kstat_named *counter, ipdropper_t *who_called)
{
- mblk_t *ipsec_mp = NULL;
- ipsec_in_t *ii = NULL;
- ipsec_out_t *io = NULL;
- ipsec_info_t *in;
- uint8_t vers;
+ char *str;
if (mp == NULL) {
/*
@@ -265,41 +260,7 @@ ip_drop_packet(mblk_t *mp, boolean_t inbound, ill_t *arriving,
return;
}
- if (DB_TYPE(mp) == M_CTL) {
- in = (ipsec_info_t *)mp->b_rptr;
-
- if (in->ipsec_info_type == IPSEC_IN)
- ii = (ipsec_in_t *)in;
- else if (in->ipsec_info_type == IPSEC_OUT)
- io = (ipsec_out_t *)in;
-
- /* See if this is an ICMP packet (check for v4/v6). */
- vers = (*mp->b_rptr) >> 4;
- if (vers != IPV4_VERSION && vers != IPV6_VERSION) {
- /*
- * If not, it's some other sort of M_CTL to be freed.
- * For now, treat it like an ordinary packet.
- */
- ipsec_mp = mp;
- mp = mp->b_cont;
- }
- }
-
- /* Reality checks */
- if (inbound && io != NULL)
- cmn_err(CE_WARN,
- "ip_drop_packet: inbound packet with IPSEC_OUT");
-
- if (outbound_ire != NULL && ii != NULL)
- cmn_err(CE_WARN,
- "ip_drop_packet: outbound packet with IPSEC_IN");
-
- /* At this point, mp always points to the data. */
- /*
- * Can't make the assertion yet - It could be an inbound ICMP
- * message, which is M_CTL but with data in it.
- */
- /* ASSERT(mp->b_datap->db_type == M_DATA); */
+ ASSERT(mp->b_datap->db_type == M_DATA);
/* Increment the bean counter, if available. */
if (counter != NULL) {
@@ -318,16 +279,22 @@ ip_drop_packet(mblk_t *mp, boolean_t inbound, ill_t *arriving,
break;
/* Other types we can't handle for now. */
}
-
- /* TODO? Copy out kstat name for use in logging. */
}
- /* TODO: log the packet details if logging is called for. */
+ if (counter != NULL)
+ str = counter->name;
+ else if (who_called != NULL)
+ str = who_called->ipd_name;
+ else
+ str = "Unspecified IPsec drop";
+
+ if (inbound)
+ ip_drop_input(str, mp, ill);
+ else
+ ip_drop_output(str, mp, ill);
+
/* TODO: queue the packet onto a snoop-friendly queue. */
- /* If I haven't queued the packet or some such nonsense, free it. */
- if (ipsec_mp != NULL)
- freeb(ipsec_mp);
/*
* ASSERT this isn't a b_next linked mblk chain where a
* chained dropper should be used instead
@@ -335,3 +302,50 @@ ip_drop_packet(mblk_t *mp, boolean_t inbound, ill_t *arriving,
ASSERT(mp->b_prev == NULL && mp->b_next == NULL);
freemsg(mp);
}
+
+/*
+ * This is just a convenient place for dtrace to see dropped packets.
+ */
+/*ARGSUSED*/
+void
+ip_drop_input(char *str, mblk_t *mp, ill_t *ill)
+{
+ if (mp == NULL)
+ return;
+
+ if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) {
+ ipha_t *ipha = (ipha_t *)mp->b_rptr;
+
+ DTRACE_IP7(drop__in, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
+ ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha,
+ ip6_t *, NULL, int, 0);
+ } else {
+ ip6_t *ip6h = (ip6_t *)mp->b_rptr;
+
+ DTRACE_IP7(drop__in, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
+ ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL,
+ ip6_t *, ip6h, int, 0);
+ }
+}
+
+/*ARGSUSED*/
+void
+ip_drop_output(char *str, mblk_t *mp, ill_t *ill)
+{
+ if (mp == NULL)
+ return;
+
+ if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) {
+ ipha_t *ipha = (ipha_t *)mp->b_rptr;
+
+ DTRACE_IP7(drop__out, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
+ ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha,
+ ip6_t *, NULL, int, 0);
+ } else {
+ ip6_t *ip6h = (ip6_t *)mp->b_rptr;
+
+ DTRACE_IP7(drop__out, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
+ ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL,
+ ip6_t *, ip6h, int, 0);
+ }
+}
diff --git a/usr/src/uts/common/inet/ip/ipmp.c b/usr/src/uts/common/inet/ip/ipmp.c
index ea8b4a73bb..b89171ed2b 100644
--- a/usr/src/uts/common/inet/ip/ipmp.c
+++ b/usr/src/uts/common/inet/ip/ipmp.c
@@ -22,12 +22,12 @@
* Use is subject to license terms.
*/
-#include <inet/arp.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/ip_if.h>
#include <inet/ip_ire.h>
#include <inet/ip_multi.h>
+#include <inet/ip_ndp.h>
#include <inet/ip_rts.h>
#include <inet/mi.h>
#include <net/if_types.h>
@@ -52,20 +52,6 @@
#define IPMP_GRP_HASH_SIZE 64
#define IPMP_ILL_REFRESH_TIMEOUT 120 /* seconds */
-/*
- * Templates for IPMP ARP messages.
- */
-static const arie_t ipmp_aract_template = {
- AR_IPMP_ACTIVATE,
- sizeof (arie_t), /* Name offset */
- sizeof (arie_t) /* Name length (set by ill_arp_alloc) */
-};
-
-static const arie_t ipmp_ardeact_template = {
- AR_IPMP_DEACTIVATE,
- sizeof (arie_t), /* Name offset */
- sizeof (arie_t) /* Name length (set by ill_arp_alloc) */
-};
/*
* IPMP meta-interface kstats (based on those in PSARC/1997/198).
@@ -497,7 +483,7 @@ ipmp_grp_vet_ill(ipmp_grp_t *grp, ill_t *ill)
* An ill must strictly be using ARP and/or ND for address
* resolution for it to be allowed into a group.
*/
- if (ill->ill_flags & (ILLF_NONUD | ILLF_NOARP | ILLF_XRESOLV))
+ if (ill->ill_flags & (ILLF_NONUD | ILLF_NOARP))
return (ENOTSUP);
/*
@@ -752,7 +738,7 @@ ipmp_illgrp_hold_next_ill(ipmp_illgrp_t *illg)
if (illg->ig_next_ill == NULL)
illg->ig_next_ill = list_head(&illg->ig_actif);
- if (ill_check_and_refhold(ill) == 0) {
+ if (ill_check_and_refhold(ill)) {
rw_exit(&ipst->ips_ipmp_lock);
return (ill);
}
@@ -763,17 +749,6 @@ ipmp_illgrp_hold_next_ill(ipmp_illgrp_t *illg)
}
/*
- * Return a pointer to the nominated multicast ill in `illg', or NULL if one
- * doesn't exist. Caller must be inside the IPSQ.
- */
-ill_t *
-ipmp_illgrp_cast_ill(ipmp_illgrp_t *illg)
-{
- ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
- return (illg->ig_cast_ill);
-}
-
-/*
* Return a held pointer to the nominated multicast ill in `illg', or NULL if
* one doesn't exist. Caller need not be inside the IPSQ.
*/
@@ -785,7 +760,7 @@ ipmp_illgrp_hold_cast_ill(ipmp_illgrp_t *illg)
rw_enter(&ipst->ips_ipmp_lock, RW_READER);
castill = illg->ig_cast_ill;
- if (castill != NULL && ill_check_and_refhold(castill) == 0) {
+ if (castill != NULL && ill_check_and_refhold(castill)) {
rw_exit(&ipst->ips_ipmp_lock);
return (castill);
}
@@ -794,6 +769,20 @@ ipmp_illgrp_hold_cast_ill(ipmp_illgrp_t *illg)
}
/*
+ * Callback routine for ncec_walk() that deletes `nce' if it is associated with
+ * the `(ill_t *)arg' and it is not one of the local addresses. Caller must be
+ * inside the IPSQ.
+ */
+static void
+ipmp_ncec_delete_nonlocal(ncec_t *ncec, uchar_t *arg)
+{
+ if ((ncec != NULL) && !NCE_MYADDR(ncec) &&
+ ncec->ncec_ill == (ill_t *)arg) {
+ ncec_delete(ncec);
+ }
+}
+
+/*
* Set the nominated cast ill on `illg' to `castill'. If `castill' is NULL,
* any existing nomination is removed. Caller must be inside the IPSQ.
*/
@@ -820,6 +809,14 @@ ipmp_illgrp_set_cast(ipmp_illgrp_t *illg, ill_t *castill)
*/
if (ipmp_ill->ill_dl_up)
ill_leave_multicast(ipmp_ill);
+
+ /*
+ * Delete any NCEs tied to the old nomination. We must do this
+ * last since ill_leave_multicast() may trigger IREs to be
+ * built using ig_cast_ill.
+ */
+ ncec_walk(ocastill, (pfi_t)ipmp_ncec_delete_nonlocal, ocastill,
+ ocastill->ill_ipst);
}
/*
@@ -829,16 +826,6 @@ ipmp_illgrp_set_cast(ipmp_illgrp_t *illg, ill_t *castill)
illg->ig_cast_ill = castill;
rw_exit(&ipst->ips_ipmp_lock);
- if (ocastill != NULL) {
- /*
- * Delete any IREs tied to the old nomination. We must do
- * this after the new castill is set and has reached global
- * visibility since the datapath has not been quiesced.
- */
- ire_walk_ill(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
- ill_stq_cache_delete, ocastill, ocastill);
- }
-
/*
* Enable new nominated ill (if any).
*/
@@ -855,15 +842,6 @@ ipmp_illgrp_set_cast(ipmp_illgrp_t *illg, ill_t *castill)
if (ipmp_ill->ill_dl_up)
ill_recover_multicast(ipmp_ill);
}
-
- /*
- * For IPv4, refresh our broadcast IREs. This needs to be done even
- * if there's no new nomination since ill_refresh_bcast() still must
- * update the IPMP meta-interface's broadcast IREs to point back at
- * the IPMP meta-interface itself.
- */
- if (!ipmp_ill->ill_isv6)
- ill_refresh_bcast(ipmp_ill);
}
/*
@@ -872,33 +850,33 @@ ipmp_illgrp_set_cast(ipmp_illgrp_t *illg, ill_t *castill)
* created IPMP ARP entry, or NULL on failure.
*/
ipmp_arpent_t *
-ipmp_illgrp_create_arpent(ipmp_illgrp_t *illg, mblk_t *mp, boolean_t proxyarp)
+ipmp_illgrp_create_arpent(ipmp_illgrp_t *illg, boolean_t proxyarp,
+ ipaddr_t ipaddr, uchar_t *lladdr, size_t lladdr_len, uint16_t flags)
{
- uchar_t *addrp;
- area_t *area = (area_t *)mp->b_rptr;
ipmp_arpent_t *entp, *oentp;
ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
- ASSERT(area->area_proto_addr_length == sizeof (ipaddr_t));
- if ((entp = kmem_zalloc(sizeof (ipmp_arpent_t), KM_NOSLEEP)) == NULL)
+ if ((entp = kmem_alloc(sizeof (ipmp_arpent_t) + lladdr_len,
+ KM_NOSLEEP)) == NULL)
return (NULL);
- if ((mp = copyb(mp)) == NULL) {
- kmem_free(entp, sizeof (ipmp_arpent_t));
- return (NULL);
- }
-
- DB_TYPE(mp) = M_PROTO;
- entp->ia_area_mp = mp;
- entp->ia_proxyarp = proxyarp;
- addrp = mi_offset_paramc(mp, area->area_proto_addr_offset,
- sizeof (ipaddr_t));
- bcopy(addrp, &entp->ia_ipaddr, sizeof (ipaddr_t));
-
+ /*
+ * Delete any existing ARP entry for this address.
+ */
if ((oentp = ipmp_illgrp_lookup_arpent(illg, &entp->ia_ipaddr)) != NULL)
ipmp_illgrp_destroy_arpent(illg, oentp);
+ /*
+ * Prepend the new entry.
+ */
+ entp->ia_ipaddr = ipaddr;
+ entp->ia_flags = flags;
+ entp->ia_lladdr_len = lladdr_len;
+ entp->ia_lladdr = (uchar_t *)&entp[1];
+ bcopy(lladdr, entp->ia_lladdr, lladdr_len);
+ entp->ia_proxyarp = proxyarp;
+ entp->ia_notified = B_TRUE;
list_insert_head(&illg->ig_arpent, entp);
return (entp);
}
@@ -912,8 +890,7 @@ ipmp_illgrp_destroy_arpent(ipmp_illgrp_t *illg, ipmp_arpent_t *entp)
ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
list_remove(&illg->ig_arpent, entp);
- freeb(entp->ia_area_mp);
- kmem_free(entp, sizeof (ipmp_arpent_t));
+ kmem_free(entp, sizeof (ipmp_arpent_t) + entp->ia_lladdr_len);
}
/*
@@ -957,10 +934,9 @@ ipmp_illgrp_refresh_arpent(ipmp_illgrp_t *illg)
{
ill_t *ill, *ipmp_ill = illg->ig_ipmp_ill;
uint_t paddrlen = ipmp_ill->ill_phys_addr_length;
- area_t *area;
- mblk_t *area_mp;
- uchar_t *physaddr;
ipmp_arpent_t *entp;
+ ncec_t *ncec;
+ nce_t *nce;
ASSERT(IAM_WRITER_ILL(ipmp_ill));
ASSERT(!ipmp_ill->ill_isv6);
@@ -973,11 +949,7 @@ ipmp_illgrp_refresh_arpent(ipmp_illgrp_t *illg)
continue;
}
- area = (area_t *)entp->ia_area_mp->b_rptr;
ASSERT(paddrlen == ill->ill_phys_addr_length);
- ASSERT(paddrlen == area->area_hw_addr_length);
- physaddr = mi_offset_paramc(entp->ia_area_mp,
- area->area_hw_addr_offset, paddrlen);
/*
* If this is a proxy ARP entry, we can skip notifying ARP if
@@ -985,18 +957,25 @@ ipmp_illgrp_refresh_arpent(ipmp_illgrp_t *illg)
* update the entry's hardware address before notifying ARP.
*/
if (entp->ia_proxyarp) {
- if (bcmp(ill->ill_phys_addr, physaddr, paddrlen) == 0 &&
- entp->ia_notified)
+ if (bcmp(ill->ill_phys_addr, entp->ia_lladdr,
+ paddrlen) == 0 && entp->ia_notified)
continue;
- bcopy(ill->ill_phys_addr, physaddr, paddrlen);
+ bcopy(ill->ill_phys_addr, entp->ia_lladdr, paddrlen);
}
- if ((area_mp = copyb(entp->ia_area_mp)) == NULL) {
- entp->ia_notified = B_FALSE;
+ (void) nce_lookup_then_add_v4(ipmp_ill, entp->ia_lladdr,
+ paddrlen, &entp->ia_ipaddr, entp->ia_flags, ND_UNCHANGED,
+ &nce);
+ if (nce == NULL || !entp->ia_proxyarp) {
+ if (nce != NULL)
+ nce_refrele(nce);
continue;
}
-
- putnext(ipmp_ill->ill_rq, area_mp);
+ ncec = nce->nce_common;
+ mutex_enter(&ncec->ncec_lock);
+ nce_update(ncec, ND_UNCHANGED, ill->ill_phys_addr);
+ mutex_exit(&ncec->ncec_lock);
+ nce_refrele(nce);
ipmp_illgrp_mark_arpent(illg, entp);
if ((ill = list_next(&illg->ig_actif, ill)) == NULL)
@@ -1061,16 +1040,16 @@ ipmp_illgrp_refresh_mtu(ipmp_illgrp_t *illg)
ASSERT(IAM_WRITER_ILL(ipmp_ill));
/*
- * Since ill_max_mtu can only change under ill_lock, we hold ill_lock
+ * Since ill_mtu can only change under ill_lock, we hold ill_lock
* for each ill as we iterate through the list. Any changes to the
- * ill_max_mtu will also trigger an update, so even if we missed it
+ * ill_mtu will also trigger an update, so even if we missed it
* this time around, the update will catch it.
*/
ill = list_head(&illg->ig_if);
for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) {
mutex_enter(&ill->ill_lock);
- if (mtu == 0 || ill->ill_max_mtu < mtu)
- mtu = ill->ill_max_mtu;
+ if (mtu == 0 || ill->ill_mtu < mtu)
+ mtu = ill->ill_mtu;
mutex_exit(&ill->ill_lock);
}
@@ -1171,13 +1150,12 @@ ipmp_ill_join_illgrp(ill_t *ill, ipmp_illgrp_t *illg)
* This may seem odd, but it's consistent with the application view
* that `ill' no longer exists (e.g., due to ipmp_ill_rtsaddrmsg()).
*/
+ update_conn_ill(ill, ill->ill_ipst);
if (ill->ill_isv6) {
- reset_conn_ill(ill);
reset_mrt_ill(ill);
} else {
ipif = ill->ill_ipif;
for (; ipif != NULL; ipif = ipif->ipif_next) {
- reset_conn_ipif(ipif);
reset_mrt_vif_ipif(ipif);
}
}
@@ -1206,7 +1184,7 @@ ipmp_ill_join_illgrp(ill_t *ill, ipmp_illgrp_t *illg)
ipmp_ill->ill_flags |= ILLF_COS_ENABLED;
mutex_exit(&ipmp_ill->ill_lock);
}
- ipmp_illgrp_set_mtu(illg, ill->ill_max_mtu);
+ ipmp_illgrp_set_mtu(illg, ill->ill_mtu);
} else {
ASSERT(ipmp_ill->ill_phys_addr_length ==
ill->ill_phys_addr_length);
@@ -1217,8 +1195,8 @@ ipmp_ill_join_illgrp(ill_t *ill, ipmp_illgrp_t *illg)
ipmp_ill->ill_flags &= ~ILLF_COS_ENABLED;
mutex_exit(&ipmp_ill->ill_lock);
}
- if (illg->ig_mtu > ill->ill_max_mtu)
- ipmp_illgrp_set_mtu(illg, ill->ill_max_mtu);
+ if (illg->ig_mtu > ill->ill_mtu)
+ ipmp_illgrp_set_mtu(illg, ill->ill_mtu);
}
rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
@@ -1232,12 +1210,6 @@ ipmp_ill_join_illgrp(ill_t *ill, ipmp_illgrp_t *illg)
*/
ire_walk_ill(MATCH_IRE_ILL, 0, ipmp_ill_ire_mark_testhidden, ill, ill);
- /*
- * Merge any broadcast IREs, if need be.
- */
- if (!ill->ill_isv6)
- ill_refresh_bcast(ill);
-
ipmp_ill_refresh_active(ill);
}
@@ -1301,12 +1273,6 @@ ipmp_ill_leave_illgrp(ill_t *ill)
rw_exit(&ipst->ips_ill_g_lock);
/*
- * Recreate any broadcast IREs that had been shared, if need be.
- */
- if (!ill->ill_isv6)
- ill_refresh_bcast(ill);
-
- /*
* Re-establish multicast memberships that were previously being
* handled by the IPMP meta-interface.
*/
@@ -1456,10 +1422,8 @@ static boolean_t
ipmp_ill_activate(ill_t *ill)
{
ipif_t *ipif;
- mblk_t *actmp = NULL, *deactmp = NULL;
mblk_t *linkupmp = NULL, *linkdownmp = NULL;
ipmp_grp_t *grp = ill->ill_phyint->phyint_grp;
- const char *grifname = grp->gr_ifname;
ipmp_illgrp_t *illg = ill->ill_grp;
ill_t *maxill;
ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
@@ -1478,20 +1442,6 @@ ipmp_ill_activate(ill_t *ill)
goto fail;
}
- /*
- * For IPv4, allocate the activate/deactivate messages, and tell ARP.
- */
- if (!ill->ill_isv6) {
- actmp = ill_arie_alloc(ill, grifname, &ipmp_aract_template);
- deactmp = ill_arie_alloc(ill, grifname, &ipmp_ardeact_template);
- if (actmp == NULL || deactmp == NULL)
- goto fail;
-
- ASSERT(ill->ill_ardeact_mp == NULL);
- ill->ill_ardeact_mp = deactmp;
- putnext(illg->ig_ipmp_ill->ill_rq, actmp);
- }
-
if (list_is_empty(&illg->ig_actif)) {
/*
* Now that we have an active ill, nominate it for multicast
@@ -1524,12 +1474,6 @@ ipmp_ill_activate(ill_t *ill)
ipif = ipmp_ill_unbind_ipif(maxill, NULL, B_TRUE);
ipmp_ill_bind_ipif(ill, ipif, Res_act_rebind);
}
-
- /*
- * TODO: explore whether it's advantageous to flush IRE_CACHE
- * bindings to force existing connections to be redistributed
- * to the new ill.
- */
}
/*
@@ -1542,7 +1486,7 @@ ipmp_ill_activate(ill_t *ill)
rw_exit(&ipst->ips_ipmp_lock);
/*
- * Refresh ARP entries to use `ill', if need be.
+ * Refresh static/proxy ARP entries to use `ill', if need be.
*/
if (!ill->ill_isv6)
ipmp_illgrp_refresh_arpent(illg);
@@ -1557,8 +1501,6 @@ ipmp_ill_activate(ill_t *ill)
}
return (B_TRUE);
fail:
- freemsg(actmp);
- freemsg(deactmp);
freemsg(linkupmp);
freemsg(linkdownmp);
return (B_FALSE);
@@ -1581,18 +1523,6 @@ ipmp_ill_deactivate(ill_t *ill)
ASSERT(IS_UNDER_IPMP(ill));
/*
- * Delete all IRE_CACHE entries for the group. (We cannot restrict
- * ourselves to entries with ire_stq == ill since there may be other
- * IREs that are backed by ACEs that are tied to this ill -- and thus
- * when those ACEs are deleted, the IREs will be adrift without any
- * AR_CN_ANNOUNCE notification from ARP.)
- */
- if (ill->ill_isv6)
- ire_walk_v6(ill_grp_cache_delete, ill, ALL_ZONES, ipst);
- else
- ire_walk_v4(ill_grp_cache_delete, ill, ALL_ZONES, ipst);
-
- /*
* Pull the interface out of the active list.
*/
rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
@@ -1609,6 +1539,12 @@ ipmp_ill_deactivate(ill_t *ill)
ipmp_illgrp_set_cast(illg, list_head(&illg->ig_actif));
/*
+ * Delete all nce_t entries using this ill, so that the next attempt
+ * to send data traffic will revalidate cached nce's.
+ */
+ nce_flush(ill, B_TRUE);
+
+ /*
* Unbind all of the ipifs bound to this ill, and save 'em in a list;
* we'll rebind them after we tell the resolver the ill is no longer
* active. We must do things in this order or the resolver could
@@ -1620,18 +1556,10 @@ ipmp_ill_deactivate(ill_t *ill)
ipif->ipif_bound_next = ubheadipif;
ubheadipif = ipif;
}
-
if (!ill->ill_isv6) {
- /*
- * Tell ARP `ill' is no longer active in the group.
- */
- mp = ill->ill_ardeact_mp;
- ill->ill_ardeact_mp = NULL;
- ASSERT(mp != NULL);
- putnext(illg->ig_ipmp_ill->ill_rq, mp);
/*
- * Refresh any ARP entries that had been using `ill'.
+ * Refresh static/proxy ARP entries that had been using `ill'.
*/
ipmp_illgrp_refresh_arpent(illg);
}
@@ -1649,6 +1577,20 @@ ipmp_ill_deactivate(ill_t *ill)
ipmp_ill_bind_ipif(minill, ipif, Res_act_rebind);
}
+ if (list_is_empty(&illg->ig_actif)) {
+ ill_t *ipmp_ill = illg->ig_ipmp_ill;
+
+ ncec_walk(ipmp_ill, (pfi_t)ncec_delete_per_ill,
+ (uchar_t *)ipmp_ill, ipmp_ill->ill_ipst);
+ }
+
+ /*
+ * Remove any IRE_IF_CLONE for this ill since they might have
+ * an ire_nce_cache/nce_common which refers to another ill in the group.
+ */
+ ire_walk_ill(MATCH_IRE_TYPE, IRE_IF_CLONE, ill_downi_if_clone,
+ ill, ill);
+
/*
* Finally, mark the group link down, if necessary.
*/
@@ -1725,7 +1667,7 @@ ipmp_ill_bind_ipif(ill_t *ill, ipif_t *ipif, enum ip_resolver_action act)
/*
* If necessary, tell ARP/NDP about the new mapping. Note that
- * ipif_resolver_up() cannot fail for non-XRESOLV IPv6 ills.
+ * ipif_resolver_up() cannot fail for IPv6 ills.
*/
if (act != Res_act_none) {
if (ill->ill_isv6) {
@@ -1756,15 +1698,12 @@ ipmp_ill_bind_ipif(ill_t *ill, ipif_t *ipif, enum ip_resolver_action act)
static ipif_t *
ipmp_ill_unbind_ipif(ill_t *ill, ipif_t *ipif, boolean_t notifyres)
{
- ill_t *ipmp_ill;
ipif_t *previpif;
ip_stack_t *ipst = ill->ill_ipst;
ASSERT(IAM_WRITER_ILL(ill));
ASSERT(IS_UNDER_IPMP(ill));
- ipmp_ill = ill->ill_grp->ig_ipmp_ill;
-
/*
* If necessary, find an ipif to unbind.
*/
@@ -1803,13 +1742,10 @@ ipmp_ill_unbind_ipif(ill_t *ill, ipif_t *ipif, boolean_t notifyres)
* If requested, notify the resolvers (provided we're bound).
*/
if (notifyres && ipif->ipif_bound) {
- if (ill->ill_isv6) {
+ if (ill->ill_isv6)
ipif_ndp_down(ipif);
- } else {
- ASSERT(ipif->ipif_arp_del_mp != NULL);
- putnext(ipmp_ill->ill_rq, ipif->ipif_arp_del_mp);
- ipif->ipif_arp_del_mp = NULL;
- }
+ else
+ (void) ipif_arp_down(ipif);
}
ipif->ipif_bound = B_FALSE;
@@ -1845,8 +1781,8 @@ ipmp_ill_is_active(ill_t *ill)
}
/*
- * IRE walker callback: set IRE_MARK_TESTHIDDEN on cache/interface/offsubnet
- * IREs with a source address on `ill_arg'.
+ * IRE walker callback: set ire_testhidden on IRE_HIDDEN_TYPE IREs associated
+ * with `ill_arg'.
*/
static void
ipmp_ill_ire_mark_testhidden(ire_t *ire, char *ill_arg)
@@ -1856,27 +1792,18 @@ ipmp_ill_ire_mark_testhidden(ire_t *ire, char *ill_arg)
ASSERT(IAM_WRITER_ILL(ill));
ASSERT(!IS_IPMP(ill));
- if (ire->ire_ipif->ipif_ill != ill)
+ if (ire->ire_ill != ill)
return;
- switch (ire->ire_type) {
- case IRE_HOST:
- case IRE_PREFIX:
- case IRE_DEFAULT:
- case IRE_CACHE:
- case IRE_IF_RESOLVER:
- case IRE_IF_NORESOLVER:
+ if (IRE_HIDDEN_TYPE(ire->ire_type)) {
DTRACE_PROBE1(ipmp__mark__testhidden, ire_t *, ire);
- ire->ire_marks |= IRE_MARK_TESTHIDDEN;
- break;
- default:
- break;
+ ire->ire_testhidden = B_TRUE;
}
}
/*
- * IRE walker callback: clear IRE_MARK_TESTHIDDEN if the IRE has a source
- * address on `ill_arg'.
+ * IRE walker callback: clear ire_testhidden if the IRE has a source address
+ * on `ill_arg'.
*/
static void
ipmp_ill_ire_clear_testhidden(ire_t *ire, char *ill_arg)
@@ -1886,9 +1813,9 @@ ipmp_ill_ire_clear_testhidden(ire_t *ire, char *ill_arg)
ASSERT(IAM_WRITER_ILL(ill));
ASSERT(!IS_IPMP(ill));
- if (ire->ire_ipif->ipif_ill == ill) {
+ if (ire->ire_ill == ill) {
DTRACE_PROBE1(ipmp__clear__testhidden, ire_t *, ire);
- ire->ire_marks &= ~IRE_MARK_TESTHIDDEN;
+ ire->ire_testhidden = B_FALSE;
}
}
@@ -1909,7 +1836,7 @@ ipmp_ill_hold_ipmp_ill(ill_t *ill)
rw_enter(&ipst->ips_ipmp_lock, RW_READER);
illg = ill->ill_grp;
- if (illg != NULL && ill_check_and_refhold(illg->ig_ipmp_ill) == 0) {
+ if (illg != NULL && ill_check_and_refhold(illg->ig_ipmp_ill)) {
rw_exit(&ipst->ips_ipmp_lock);
return (illg->ig_ipmp_ill);
}
@@ -2135,7 +2062,7 @@ ipmp_ipif_hold_bound_ill(const ipif_t *ipif)
rw_enter(&ipst->ips_ipmp_lock, RW_READER);
boundill = ipif->ipif_bound_ill;
- if (boundill != NULL && ill_check_and_refhold(boundill) == 0) {
+ if (boundill != NULL && ill_check_and_refhold(boundill)) {
rw_exit(&ipst->ips_ipmp_lock);
return (boundill);
}
@@ -2192,3 +2119,182 @@ ipmp_ipif_is_up_dataaddr(const ipif_t *ipif)
{
return (ipmp_ipif_is_dataaddr(ipif) && (ipif->ipif_flags & IPIF_UP));
}
+
+/*
+ * Check if `mp' contains a probe packet by verifying if the IP source address
+ * is a test address on an underlying interface `ill'. Caller need not be inside
+ * the IPSQ.
+ */
+boolean_t
+ipmp_packet_is_probe(mblk_t *mp, ill_t *ill)
+{
+ ip6_t *ip6h = (ip6_t *)mp->b_rptr;
+ ipha_t *ipha = (ipha_t *)mp->b_rptr;
+
+ ASSERT(DB_TYPE(mp) != M_CTL);
+
+ if (!IS_UNDER_IPMP(ill))
+ return (B_FALSE);
+
+ if (ill->ill_isv6) {
+ if (!IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) &&
+ ipif_lookup_testaddr_v6(ill, &ip6h->ip6_src, NULL))
+ return (B_TRUE);
+ } else {
+ if ((ipha->ipha_src != INADDR_ANY) &&
+ ipif_lookup_testaddr_v4(ill, &ipha->ipha_src, NULL))
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+/*
+ * Pick out an appropriate underlying interface for packet transmit. This
+ * function may be called from the data path, so we need to verify that the
+ * IPMP group associated with `ill' is non-null after holding the ill_g_lock.
+ * Caller need not be inside the IPSQ.
+ */
+ill_t *
+ipmp_ill_get_xmit_ill(ill_t *ill, boolean_t is_unicast)
+{
+ ill_t *xmit_ill;
+ ip_stack_t *ipst = ill->ill_ipst;
+
+ rw_enter(&ipst->ips_ill_g_lock, RW_READER);
+ if (ill->ill_grp == NULL) {
+ /*
+ * The interface was taken out of the group. Return ill itself,
+ * but take a ref so that callers will always be able to do
+ * ill_refrele(ill);
+ */
+ rw_exit(&ipst->ips_ill_g_lock);
+ ill_refhold(ill);
+ return (ill);
+ }
+ if (!is_unicast)
+ xmit_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp);
+ else
+ xmit_ill = ipmp_illgrp_hold_next_ill(ill->ill_grp);
+ rw_exit(&ipst->ips_ill_g_lock);
+ return (xmit_ill);
+}
+
+/*
+ * Flush out any nce that points at `ncec' from an underlying interface
+ */
+void
+ipmp_ncec_flush_nce(ncec_t *ncec)
+{
+ ill_t *ncec_ill = ncec->ncec_ill;
+ ill_t *ill;
+ ipmp_illgrp_t *illg;
+ ip_stack_t *ipst = ncec_ill->ill_ipst;
+ list_t dead;
+ nce_t *nce;
+
+ if (!IS_IPMP(ncec_ill))
+ return;
+
+ illg = ncec_ill->ill_grp;
+ list_create(&dead, sizeof (nce_t), offsetof(nce_t, nce_node));
+
+ rw_enter(&ipst->ips_ill_g_lock, RW_READER);
+ ill = list_head(&illg->ig_if);
+ for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) {
+ nce_fastpath_list_delete(ill, ncec, &dead);
+ }
+ rw_exit(&ipst->ips_ill_g_lock);
+
+ /*
+ * we may now nce_refrele() all dead entries since all locks have been
+ * dropped.
+ */
+ while ((nce = list_head(&dead)) != NULL) {
+ list_remove(&dead, nce);
+ nce_refrele(nce);
+ }
+ ASSERT(list_is_empty(&dead));
+ list_destroy(&dead);
+}
+
+/*
+ * For each interface in the IPMP group, if there are nce_t entries for the IP
+ * address corresponding to `ncec', then their dl_unitdata_req_t and fastpath
+ * information must be updated to match the link-layer address information in
+ * `ncec'.
+ */
+void
+ipmp_ncec_fastpath(ncec_t *ncec, ill_t *ipmp_ill)
+{
+ ill_t *ill;
+ ipmp_illgrp_t *illg = ipmp_ill->ill_grp;
+ ip_stack_t *ipst = ipmp_ill->ill_ipst;
+ nce_t *nce, *nce_next;
+ list_t replace;
+
+ ASSERT(IS_IPMP(ipmp_ill));
+
+ /*
+ * if ncec itself is not reachable, there is no use in creating nce_t
+ * entries on the underlying interfaces in the group.
+ */
+ if (!NCE_ISREACHABLE(ncec))
+ return;
+
+ list_create(&replace, sizeof (nce_t), offsetof(nce_t, nce_node));
+ rw_enter(&ipst->ips_ipmp_lock, RW_READER);
+ ill = list_head(&illg->ig_actif);
+ for (; ill != NULL; ill = list_next(&illg->ig_actif, ill)) {
+ /*
+ * For each underlying interface, we first check if there is an
+ * nce_t for the address in ncec->ncec_addr. If one exists,
+ * we should trigger nce_fastpath for that nce_t. However, the
+ * catch is that we are holding the ips_ipmp_lock to prevent
+ * changes to the IPMP group membership, so that we cannot
+ * putnext() to the driver. So we nce_delete the
+ * list nce_t entries that need to be updated into the
+ * `replace' list, and then process the `replace' list
+ * after dropping the ips_ipmp_lock.
+ */
+ mutex_enter(&ill->ill_lock);
+ for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
+ nce_next = list_next(&ill->ill_nce, nce);
+ if (!IN6_ARE_ADDR_EQUAL(&nce->nce_addr,
+ &ncec->ncec_addr)) {
+ nce = nce_next;
+ continue;
+ }
+ nce_refhold(nce);
+ nce_delete(nce);
+ list_insert_tail(&replace, nce);
+ nce = nce_next;
+ }
+ mutex_exit(&ill->ill_lock);
+ }
+ rw_exit(&ipst->ips_ipmp_lock);
+ /*
+ * `replace' now has the list of nce's on which we should be triggering
+ * nce_fastpath(). We now retrigger fastpath by setting up the nce
+ * again. The code in nce_lookup_then_add_v* ensures that nce->nce_ill
+ * is still in the group for ncec->ncec_ill
+ */
+ while ((nce = list_head(&replace)) != NULL) {
+ list_remove(&replace, nce);
+ if (ncec->ncec_ill->ill_isv6) {
+ (void) nce_lookup_then_add_v6(nce->nce_ill,
+ ncec->ncec_lladdr, ncec->ncec_lladdr_length,
+ &nce->nce_addr, ncec->ncec_flags, ND_UNCHANGED,
+ NULL);
+ } else {
+ ipaddr_t ipaddr;
+
+ IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ipaddr);
+ (void) nce_lookup_then_add_v4(nce->nce_ill,
+ ncec->ncec_lladdr, ncec->ncec_lladdr_length,
+ &ipaddr, ncec->ncec_flags, ND_UNCHANGED, NULL);
+ }
+ nce_refrele(nce);
+ }
+ ASSERT(list_is_empty(&replace));
+ list_destroy(&replace);
+}
diff --git a/usr/src/uts/common/inet/ip/ipsec_loader.c b/usr/src/uts/common/inet/ip/ipsec_loader.c
index 6609146fd1..7f5c434359 100644
--- a/usr/src/uts/common/inet/ip/ipsec_loader.c
+++ b/usr/src/uts/common/inet/ip/ipsec_loader.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -121,8 +121,6 @@ ipsec_loader(void *arg)
}
mutex_exit(&ipss->ipsec_loader_lock);
- ip_ipsec_load_complete(ipss);
-
mutex_enter(&ipss->ipsec_loader_lock);
if (!ipsec_failure) {
CALLB_CPR_EXIT(&cprinfo);
diff --git a/usr/src/uts/common/inet/ip/ipsecah.c b/usr/src/uts/common/inet/ip/ipsecah.c
index c130dac490..a511b85ff4 100644
--- a/usr/src/uts/common/inet/ip/ipsecah.c
+++ b/usr/src/uts/common/inet/ip/ipsecah.c
@@ -54,6 +54,8 @@
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/nd.h>
+#include <inet/ip_if.h>
+#include <inet/ip_ndp.h>
#include <inet/ipsec_info.h>
#include <inet/ipsec_impl.h>
#include <inet/sadb.h>
@@ -62,7 +64,6 @@
#include <inet/ipdrop.h>
#include <sys/taskq.h>
#include <sys/policy.h>
-#include <sys/iphada.h>
#include <sys/strsun.h>
#include <sys/crypto/common.h>
@@ -132,32 +133,27 @@ static ipsecahparam_t lcl_param_arr[] = {
#define AH_MSGSIZE(mp) ((mp)->b_cont != NULL ? msgdsize(mp) : MBLKL(mp))
-static ipsec_status_t ah_auth_out_done(mblk_t *);
-static ipsec_status_t ah_auth_in_done(mblk_t *);
+static mblk_t *ah_auth_out_done(mblk_t *, ip_xmit_attr_t *, ipsec_crypto_t *);
+static mblk_t *ah_auth_in_done(mblk_t *, ip_recv_attr_t *, ipsec_crypto_t *);
static mblk_t *ah_process_ip_options_v4(mblk_t *, ipsa_t *, int *, uint_t,
boolean_t, ipsecah_stack_t *);
static mblk_t *ah_process_ip_options_v6(mblk_t *, ipsa_t *, int *, uint_t,
boolean_t, ipsecah_stack_t *);
static void ah_getspi(mblk_t *, keysock_in_t *, ipsecah_stack_t *);
-static ipsec_status_t ah_inbound_accelerated(mblk_t *, boolean_t, ipsa_t *,
- uint32_t);
-static ipsec_status_t ah_outbound_accelerated_v4(mblk_t *, ipsa_t *);
-static ipsec_status_t ah_outbound_accelerated_v6(mblk_t *, ipsa_t *);
-static ipsec_status_t ah_outbound(mblk_t *);
+static void ah_inbound_restart(mblk_t *, ip_recv_attr_t *);
+
+static mblk_t *ah_outbound(mblk_t *, ip_xmit_attr_t *);
+static void ah_outbound_finish(mblk_t *, ip_xmit_attr_t *);
static int ipsecah_open(queue_t *, dev_t *, int, int, cred_t *);
static int ipsecah_close(queue_t *);
-static void ipsecah_rput(queue_t *, mblk_t *);
static void ipsecah_wput(queue_t *, mblk_t *);
static void ah_send_acquire(ipsacq_t *, mblk_t *, netstack_t *);
static boolean_t ah_register_out(uint32_t, uint32_t, uint_t, ipsecah_stack_t *,
- mblk_t *);
+ cred_t *);
static void *ipsecah_stack_init(netstackid_t stackid, netstack_t *ns);
static void ipsecah_stack_fini(netstackid_t stackid, void *arg);
-extern void (*cl_inet_getspi)(netstackid_t, uint8_t, uint8_t *, size_t,
- void *);
-
/* Setable in /etc/system */
uint32_t ah_hash_size = IPSEC_DEFAULT_HASH_SIZE;
@@ -168,7 +164,7 @@ static struct module_info info = {
};
static struct qinit rinit = {
- (pfi_t)ipsecah_rput, NULL, ipsecah_open, ipsecah_close, NULL, &info,
+ (pfi_t)putnext, NULL, ipsecah_open, ipsecah_close, NULL, &info,
NULL
};
@@ -215,9 +211,6 @@ ah_kstat_init(ipsecah_stack_t *ahstack, netstackid_t stackid)
KI(acquire_requests);
KI(bytes_expired);
KI(out_discards);
- KI(in_accelerated);
- KI(out_accelerated);
- KI(noaccel);
KI(crypto_sync);
KI(crypto_async);
KI(crypto_failures);
@@ -275,9 +268,9 @@ ah_ager(void *arg)
hrtime_t begin = gethrtime();
sadb_ager(&ahstack->ah_sadb.s_v4, ahstack->ah_pfkey_q,
- ahstack->ah_sadb.s_ip_q, ahstack->ipsecah_reap_delay, ns);
+ ahstack->ipsecah_reap_delay, ns);
sadb_ager(&ahstack->ah_sadb.s_v6, ahstack->ah_pfkey_q,
- ahstack->ah_sadb.s_ip_q, ahstack->ipsecah_reap_delay, ns);
+ ahstack->ipsecah_reap_delay, ns);
ahstack->ah_event = sadb_retimeout(begin, ahstack->ah_pfkey_q,
ah_ager, ahstack,
@@ -474,7 +467,13 @@ ipsecah_stack_fini(netstackid_t stackid, void *arg)
}
/*
- * AH module open routine. The module should be opened by keysock.
+ * AH module open routine, which is here for keysock plumbing.
+ * Keysock is pushed over {AH,ESP} which is an artifact from the Bad Old
+ * Days of export control, and fears that ESP would not be allowed
+ * to be shipped at all by default. Eventually, keysock should
+ * either access AH and ESP via modstubs or krtld dependencies, or
+ * perhaps be folded in with AH and ESP into a single IPsec/netsec
+ * module ("netsec" if PF_KEY provides more than AH/ESP keying tables).
*/
/* ARGSUSED */
static int
@@ -497,57 +496,10 @@ ipsecah_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
ahstack = ns->netstack_ipsecah;
ASSERT(ahstack != NULL);
- /*
- * ASSUMPTIONS (because I'm MT_OCEXCL):
- *
- * * I'm being pushed on top of IP for all my opens (incl. #1).
- * * Only ipsecah_open() can write into ah_sadb.s_ip_q.
- * * Because of this, I can check lazily for ah_sadb.s_ip_q.
- *
- * If these assumptions are wrong, I'm in BIG trouble...
- */
-
q->q_ptr = ahstack;
WR(q)->q_ptr = q->q_ptr;
- if (ahstack->ah_sadb.s_ip_q == NULL) {
- struct T_unbind_req *tur;
-
- ahstack->ah_sadb.s_ip_q = WR(q);
- /* Allocate an unbind... */
- ahstack->ah_ip_unbind = allocb(sizeof (struct T_unbind_req),
- BPRI_HI);
-
- /*
- * Send down T_BIND_REQ to bind IPPROTO_AH.
- * Handle the ACK here in AH.
- */
- qprocson(q);
- if (ahstack->ah_ip_unbind == NULL ||
- !sadb_t_bind_req(ahstack->ah_sadb.s_ip_q, IPPROTO_AH)) {
- if (ahstack->ah_ip_unbind != NULL) {
- freeb(ahstack->ah_ip_unbind);
- ahstack->ah_ip_unbind = NULL;
- }
- q->q_ptr = NULL;
- qprocsoff(q);
- netstack_rele(ahstack->ipsecah_netstack);
- return (ENOMEM);
- }
-
- ahstack->ah_ip_unbind->b_datap->db_type = M_PROTO;
- tur = (struct T_unbind_req *)ahstack->ah_ip_unbind->b_rptr;
- tur->PRIM_type = T_UNBIND_REQ;
- } else {
- qprocson(q);
- }
-
- /*
- * For now, there's not much I can do. I'll be getting a message
- * passed down to me from keysock (in my wput), and a T_BIND_ACK
- * up from IP (in my rput).
- */
-
+ qprocson(q);
return (0);
}
@@ -560,17 +512,6 @@ ipsecah_close(queue_t *q)
ipsecah_stack_t *ahstack = (ipsecah_stack_t *)q->q_ptr;
/*
- * If ah_sadb.s_ip_q is attached to this instance, send a
- * T_UNBIND_REQ to IP for the instance before doing
- * a qprocsoff().
- */
- if (WR(q) == ahstack->ah_sadb.s_ip_q &&
- ahstack->ah_ip_unbind != NULL) {
- putnext(WR(q), ahstack->ah_ip_unbind);
- ahstack->ah_ip_unbind = NULL;
- }
-
- /*
* Clean up q_ptr, if needed.
*/
qprocsoff(q);
@@ -585,98 +526,16 @@ ipsecah_close(queue_t *q)
(void) quntimeout(q, ahstack->ah_event);
}
- if (WR(q) == ahstack->ah_sadb.s_ip_q) {
- /*
- * If the ah_sadb.s_ip_q is attached to this instance, find
- * another. The OCEXCL outer perimeter helps us here.
- */
-
- ahstack->ah_sadb.s_ip_q = NULL;
-
- /*
- * Find a replacement queue for ah_sadb.s_ip_q.
- */
- if (ahstack->ah_pfkey_q != NULL &&
- ahstack->ah_pfkey_q != RD(q)) {
- /*
- * See if we can use the pfkey_q.
- */
- ahstack->ah_sadb.s_ip_q = WR(ahstack->ah_pfkey_q);
- }
-
- if (ahstack->ah_sadb.s_ip_q == NULL ||
- !sadb_t_bind_req(ahstack->ah_sadb.s_ip_q, IPPROTO_AH)) {
- ah1dbg(ahstack,
- ("ipsecah: Can't reassign ah_sadb.s_ip_q.\n"));
- ahstack->ah_sadb.s_ip_q = NULL;
- } else {
- ahstack->ah_ip_unbind =
- allocb(sizeof (struct T_unbind_req), BPRI_HI);
-
- if (ahstack->ah_ip_unbind != NULL) {
- struct T_unbind_req *tur;
-
- ahstack->ah_ip_unbind->b_datap->db_type =
- M_PROTO;
- tur = (struct T_unbind_req *)
- ahstack->ah_ip_unbind->b_rptr;
- tur->PRIM_type = T_UNBIND_REQ;
- }
- /* If it's NULL, I can't do much here. */
- }
- }
-
netstack_rele(ahstack->ipsecah_netstack);
return (0);
}
/*
- * AH module read put routine.
- */
-/* ARGSUSED */
-static void
-ipsecah_rput(queue_t *q, mblk_t *mp)
-{
- ipsecah_stack_t *ahstack = (ipsecah_stack_t *)q->q_ptr;
-
- ASSERT(mp->b_datap->db_type != M_CTL); /* No more IRE_DB_REQ. */
-
- switch (mp->b_datap->db_type) {
- case M_PROTO:
- case M_PCPROTO:
- /* TPI message of some sort. */
- switch (*((t_scalar_t *)mp->b_rptr)) {
- case T_BIND_ACK:
- /* We expect this. */
- ah3dbg(ahstack,
- ("Thank you IP from AH for T_BIND_ACK\n"));
- break;
- case T_ERROR_ACK:
- cmn_err(CE_WARN,
- "ipsecah: AH received T_ERROR_ACK from IP.");
- break;
- case T_OK_ACK:
- /* Probably from a (rarely sent) T_UNBIND_REQ. */
- break;
- default:
- ah1dbg(ahstack, ("Unknown M_{,PC}PROTO message.\n"));
- }
- freemsg(mp);
- break;
- default:
- /* For now, passthru message. */
- ah2dbg(ahstack, ("AH got unknown mblk type %d.\n",
- mp->b_datap->db_type));
- putnext(q, mp);
- }
-}
-
-/*
* Construct an SADB_REGISTER message with the current algorithms.
*/
static boolean_t
ah_register_out(uint32_t sequence, uint32_t pid, uint_t serial,
- ipsecah_stack_t *ahstack, mblk_t *in_mp)
+ ipsecah_stack_t *ahstack, cred_t *cr)
{
mblk_t *mp;
boolean_t rc = B_TRUE;
@@ -691,7 +550,7 @@ ah_register_out(uint32_t sequence, uint32_t pid, uint_t serial,
sadb_sens_t *sens;
size_t sens_len = 0;
sadb_ext_t *nextext;
- cred_t *sens_cr = NULL;
+ ts_label_t *sens_tsl = NULL;
/* Allocate the KEYSOCK_OUT. */
mp = sadb_keysock_out(serial);
@@ -700,11 +559,10 @@ ah_register_out(uint32_t sequence, uint32_t pid, uint_t serial,
return (B_FALSE);
}
- if (is_system_labeled() && (in_mp != NULL)) {
- sens_cr = msg_getcred(in_mp, NULL);
-
- if (sens_cr != NULL) {
- sens_len = sadb_sens_len_from_cred(sens_cr);
+ if (is_system_labeled() && (cr != NULL)) {
+ sens_tsl = crgetlabel(cr);
+ if (sens_tsl != NULL) {
+ sens_len = sadb_sens_len_from_label(sens_tsl);
allocsize += sens_len;
}
}
@@ -786,10 +644,10 @@ ah_register_out(uint32_t sequence, uint32_t pid, uint_t serial,
mutex_exit(&ipss->ipsec_alg_lock);
- if (sens_cr != NULL) {
+ if (sens_tsl != NULL) {
sens = (sadb_sens_t *)nextext;
- sadb_sens_from_cred(sens, SADB_EXT_SENSITIVITY,
- sens_cr, sens_len);
+ sadb_sens_from_label(sens, SADB_EXT_SENSITIVITY,
+ sens_tsl, sens_len);
nextext = (sadb_ext_t *)(((uint8_t *)sens) + sens_len);
}
@@ -847,40 +705,61 @@ ipsecah_algs_changed(netstack_t *ns)
/*
* Stub function that taskq_dispatch() invokes to take the mblk (in arg)
- * and put() it into AH and STREAMS again.
+ * and send it into AH and IP again.
*/
static void
inbound_task(void *arg)
{
- ah_t *ah;
- mblk_t *mp = (mblk_t *)arg;
- ipsec_in_t *ii = (ipsec_in_t *)mp->b_rptr;
- int ipsec_rc;
- netstack_t *ns;
- ipsecah_stack_t *ahstack;
-
- ns = netstack_find_by_stackid(ii->ipsec_in_stackid);
- if (ns == NULL || ns != ii->ipsec_in_ns) {
- /* Just freemsg(). */
- if (ns != NULL)
- netstack_rele(ns);
+ mblk_t *mp = (mblk_t *)arg;
+ mblk_t *async_mp;
+ ip_recv_attr_t iras;
+
+ async_mp = mp;
+ mp = async_mp->b_cont;
+ async_mp->b_cont = NULL;
+ if (!ip_recv_attr_from_mblk(async_mp, &iras)) {
+ /* The ill or ip_stack_t disappeared on us */
+ ip_drop_input("ip_recv_attr_from_mblk", mp, NULL);
freemsg(mp);
- return;
+ goto done;
}
- ahstack = ns->netstack_ipsecah;
+ ah_inbound_restart(mp, &iras);
+done:
+ ira_cleanup(&iras, B_TRUE);
+}
- ah2dbg(ahstack, ("in AH inbound_task"));
+/*
+ * Restart ESP after the SA has been added.
+ */
+static void
+ah_inbound_restart(mblk_t *mp, ip_recv_attr_t *ira)
+{
+ ah_t *ah;
+ netstack_t *ns;
+ ipsecah_stack_t *ahstack;
+
+ ns = ira->ira_ill->ill_ipst->ips_netstack;
+ ahstack = ns->netstack_ipsecah;
ASSERT(ahstack != NULL);
- ah = ipsec_inbound_ah_sa(mp, ns);
- if (ah != NULL) {
- ASSERT(ii->ipsec_in_ah_sa != NULL);
- ipsec_rc = ii->ipsec_in_ah_sa->ipsa_input_func(mp, ah);
- if (ipsec_rc == IPSEC_STATUS_SUCCESS)
- ip_fanout_proto_again(mp, NULL, NULL, NULL);
+ mp = ipsec_inbound_ah_sa(mp, ira, &ah);
+ if (mp == NULL)
+ return;
+
+ ASSERT(ah != NULL);
+ ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
+ ASSERT(ira->ira_ipsec_ah_sa != NULL);
+
+ mp = ira->ira_ipsec_ah_sa->ipsa_input_func(mp, ah, ira);
+ if (mp == NULL) {
+ /*
+ * Either it failed or is pending. In the former case
+ * ipIfStatsInDiscards was increased.
+ */
+ return;
}
- netstack_rele(ns);
+ ip_input_post_ipsec(mp, ira);
}
/*
@@ -1051,60 +930,96 @@ ah_add_sa_finish(mblk_t *mp, sadb_msg_t *samsg, keysock_in_t *ksi,
if (larval != NULL)
lpkt = sadb_clear_lpkt(larval);
- rc = sadb_common_add(ahstack->ah_sadb.s_ip_q, ahstack->ah_pfkey_q, mp,
+ rc = sadb_common_add(ahstack->ah_pfkey_q, mp,
samsg, ksi, primary, secondary, larval, clone, is_inbound,
diagnostic, ns, &ahstack->ah_sadb);
+ if (lpkt != NULL) {
+ if (rc == 0) {
+ rc = !taskq_dispatch(ah_taskq, inbound_task, lpkt,
+ TQ_NOSLEEP);
+ }
+ if (rc != 0) {
+ lpkt = ip_recv_attr_free_mblk(lpkt);
+ ip_drop_packet(lpkt, B_TRUE, NULL,
+ DROPPER(ipss, ipds_sadb_inlarval_timeout),
+ &ahstack->ah_dropper);
+ }
+ }
+
/*
* How much more stack will I create with all of these
- * ah_inbound_* and ah_outbound_*() calls?
+ * ah_outbound_*() calls?
*/
- if (rc == 0 && lpkt != NULL)
- rc = !taskq_dispatch(ah_taskq, inbound_task, lpkt, TQ_NOSLEEP);
-
- if (rc != 0) {
- ip_drop_packet(lpkt, B_TRUE, NULL, NULL,
- DROPPER(ipss, ipds_sadb_inlarval_timeout),
- &ahstack->ah_dropper);
- }
-
+ /* Handle the packets queued waiting for the SA */
while (acq_msgs != NULL) {
- mblk_t *mp = acq_msgs;
+ mblk_t *asyncmp;
+ mblk_t *data_mp;
+ ip_xmit_attr_t ixas;
+ ill_t *ill;
+ asyncmp = acq_msgs;
acq_msgs = acq_msgs->b_next;
- mp->b_next = NULL;
- if (rc == 0) {
- ipsec_out_t *io = (ipsec_out_t *)mp->b_rptr;
-
- ASSERT(ahstack->ah_sadb.s_ip_q != NULL);
- if (ipsec_outbound_sa(mp, IPPROTO_AH)) {
- io->ipsec_out_ah_done = B_TRUE;
- if (ah_outbound(mp) == IPSEC_STATUS_SUCCESS) {
- ipha_t *ipha = (ipha_t *)
- mp->b_cont->b_rptr;
- if (sq.af == AF_INET) {
- ip_wput_ipsec_out(NULL, mp,
- ipha, NULL, NULL);
- } else {
- ip6_t *ip6h = (ip6_t *)ipha;
-
- ASSERT(sq.af == AF_INET6);
-
- ip_wput_ipsec_out_v6(NULL,
- mp, ip6h, NULL, NULL);
- }
- }
- continue;
- }
+ asyncmp->b_next = NULL;
+
+ /*
+ * Extract the ip_xmit_attr_t from the first mblk.
+ * Verifies that the netstack and ill is still around; could
+ * have vanished while iked was doing its work.
+ * On succesful return we have a nce_t and the ill/ipst can't
+ * disappear until we do the nce_refrele in ixa_cleanup.
+ */
+ data_mp = asyncmp->b_cont;
+ asyncmp->b_cont = NULL;
+ if (!ip_xmit_attr_from_mblk(asyncmp, &ixas)) {
+ AH_BUMP_STAT(ahstack, out_discards);
+ ip_drop_packet(data_mp, B_FALSE, NULL,
+ DROPPER(ipss, ipds_sadb_acquire_timeout),
+ &ahstack->ah_dropper);
+ } else if (rc != 0) {
+ ill = ixas.ixa_nce->nce_ill;
+ AH_BUMP_STAT(ahstack, out_discards);
+ ip_drop_packet(data_mp, B_FALSE, ill,
+ DROPPER(ipss, ipds_sadb_acquire_timeout),
+ &ahstack->ah_dropper);
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
+ } else {
+ ah_outbound_finish(data_mp, &ixas);
}
+ ixa_cleanup(&ixas);
+ }
+
+ return (rc);
+}
+
+
+/*
+ * Process one of the queued messages (from ipsacq_mp) once the SA
+ * has been added.
+ */
+static void
+ah_outbound_finish(mblk_t *data_mp, ip_xmit_attr_t *ixa)
+{
+ netstack_t *ns = ixa->ixa_ipst->ips_netstack;
+ ipsecah_stack_t *ahstack = ns->netstack_ipsecah;
+ ipsec_stack_t *ipss = ns->netstack_ipsec;
+ ill_t *ill = ixa->ixa_nce->nce_ill;
+
+ if (!ipsec_outbound_sa(data_mp, ixa, IPPROTO_AH)) {
AH_BUMP_STAT(ahstack, out_discards);
- ip_drop_packet(mp, B_FALSE, NULL, NULL,
+ ip_drop_packet(data_mp, B_FALSE, ill,
DROPPER(ipss, ipds_sadb_acquire_timeout),
&ahstack->ah_dropper);
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
+ return;
}
- return (rc);
+ data_mp = ah_outbound(data_mp, ixa);
+ if (data_mp == NULL)
+ return;
+
+ (void) ip_output_post_ipsec(data_mp, ixa);
}
/*
@@ -1300,8 +1215,7 @@ ah_del_sa(mblk_t *mp, keysock_in_t *ksi, int *diagnostic,
}
return (sadb_purge_sa(mp, ksi,
(sin->sin_family == AF_INET6) ? &ahstack->ah_sadb.s_v6 :
- &ahstack->ah_sadb.s_v4, diagnostic, ahstack->ah_pfkey_q,
- ahstack->ah_sadb.s_ip_q));
+ &ahstack->ah_sadb.s_v4, diagnostic, ahstack->ah_pfkey_q));
}
return (sadb_delget_sa(mp, ksi, &ahstack->ah_sadb, diagnostic,
@@ -1449,7 +1363,7 @@ ah_parse_pfkey(mblk_t *mp, ipsecah_stack_t *ahstack)
* Keysock takes care of the PF_KEY bookkeeping for this.
*/
if (ah_register_out(samsg->sadb_msg_seq, samsg->sadb_msg_pid,
- ksi->ks_in_serial, ahstack, mp)) {
+ ksi->ks_in_serial, ahstack, msg_getcred(mp, NULL))) {
freemsg(mp);
} else {
/*
@@ -1534,8 +1448,7 @@ ah_keysock_no_socket(mblk_t *mp, ipsecah_stack_t *ahstack)
samsg->sadb_msg_errno = kse->ks_err_errno;
samsg->sadb_msg_len = SADB_8TO64(sizeof (*samsg));
/*
- * Use the write-side of the ah_pfkey_q, in case there is
- * no ahstack->ah_sadb.s_ip_q.
+ * Use the write-side of the ah_pfkey_q
*/
sadb_in_acquire(samsg, &ahstack->ah_sadb,
WR(ahstack->ah_pfkey_q), ahstack->ipsecah_netstack);
@@ -1825,22 +1738,15 @@ ah_age_bytes(ipsa_t *assoc, uint64_t bytes, boolean_t inbound)
* Called while holding the algorithm lock.
*/
static void
-ah_insert_prop(sadb_prop_t *prop, ipsacq_t *acqrec, uint_t combs)
+ah_insert_prop(sadb_prop_t *prop, ipsacq_t *acqrec, uint_t combs,
+ netstack_t *ns)
{
sadb_comb_t *comb = (sadb_comb_t *)(prop + 1);
- ipsec_out_t *io;
ipsec_action_t *ap;
ipsec_prot_t *prot;
- ipsecah_stack_t *ahstack;
- netstack_t *ns;
- ipsec_stack_t *ipss;
-
- io = (ipsec_out_t *)acqrec->ipsacq_mp->b_rptr;
- ASSERT(io->ipsec_out_type == IPSEC_OUT);
+ ipsecah_stack_t *ahstack = ns->netstack_ipsecah;
+ ipsec_stack_t *ipss = ns->netstack_ipsec;
- ns = io->ipsec_out_ns;
- ipss = ns->netstack_ipsec;
- ahstack = ns->netstack_ipsecah;
ASSERT(MUTEX_HELD(&ipss->ipsec_alg_lock));
prop->sadb_prop_exttype = SADB_EXT_PROPOSAL;
@@ -1851,9 +1757,9 @@ ah_insert_prop(sadb_prop_t *prop, ipsacq_t *acqrec, uint_t combs)
/*
* Based upon algorithm properties, and what-not, prioritize a
- * proposal, based on the ordering of the ah algorithms in the
- * alternatives presented in the policy rule passed down
- * through the ipsec_out_t and attached to the acquire record.
+ * proposal, based on the ordering of the AH algorithms in the
+ * alternatives in the policy rule or socket that was placed
+ * in the acquire record.
*/
for (ap = acqrec->ipsacq_act; ap != NULL;
@@ -1961,7 +1867,7 @@ ah_send_acquire(ipsacq_t *acqrec, mblk_t *extended, netstack_t *ns)
/* Insert proposal here. */
prop = (sadb_prop_t *)(((uint64_t *)samsg) + samsg->sadb_msg_len);
- ah_insert_prop(prop, acqrec, combs);
+ ah_insert_prop(prop, acqrec, combs, ns);
samsg->sadb_msg_len += prop->sadb_prop_len;
msgmp->b_wptr += SADB_64TO8(samsg->sadb_msg_len);
@@ -2117,11 +2023,12 @@ ah_getspi(mblk_t *mp, keysock_in_t *ksi, ipsecah_stack_t *ahstack)
/*
* IPv6 sends up the ICMP errors for validation and the removal of the AH
* header.
+ * If succesful, the mp has been modified to not include the AH header so
+ * that the caller can fanout to the ULP's icmp error handler.
*/
-static ipsec_status_t
-ah_icmp_error_v6(mblk_t *ipsec_mp, ipsecah_stack_t *ahstack)
+static mblk_t *
+ah_icmp_error_v6(mblk_t *mp, ip_recv_attr_t *ira, ipsecah_stack_t *ahstack)
{
- mblk_t *mp;
ip6_t *ip6h, *oip6h;
uint16_t hdr_length, ah_length;
uint8_t *nexthdrp;
@@ -2132,14 +2039,6 @@ ah_icmp_error_v6(mblk_t *ipsec_mp, ipsecah_stack_t *ahstack)
uint8_t *post_ah_ptr;
ipsec_stack_t *ipss = ahstack->ipsecah_netstack->netstack_ipsec;
- mp = ipsec_mp->b_cont;
- ASSERT(mp->b_datap->db_type == M_CTL);
-
- /*
- * Change the type to M_DATA till we finish pullups.
- */
- mp->b_datap->db_type = M_DATA;
-
/*
* Eat the cost of a pullupmsg() for now. It makes the rest of this
* code far less convoluted.
@@ -2150,10 +2049,10 @@ ah_icmp_error_v6(mblk_t *ipsec_mp, ipsecah_stack_t *ahstack)
mp->b_rptr + hdr_length + sizeof (icmp6_t) + sizeof (ip6_t) +
sizeof (ah_t) > mp->b_wptr) {
IP_AH_BUMP_STAT(ipss, in_discards);
- ip_drop_packet(ipsec_mp, B_TRUE, NULL, NULL,
+ ip_drop_packet(mp, B_TRUE, ira->ira_ill,
DROPPER(ipss, ipds_ah_nomem),
&ahstack->ah_dropper);
- return (IPSEC_STATUS_FAILED);
+ return (NULL);
}
oip6h = (ip6_t *)mp->b_rptr;
@@ -2161,10 +2060,10 @@ ah_icmp_error_v6(mblk_t *ipsec_mp, ipsecah_stack_t *ahstack)
ip6h = (ip6_t *)(icmp6 + 1);
if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &hdr_length, &nexthdrp)) {
IP_AH_BUMP_STAT(ipss, in_discards);
- ip_drop_packet(ipsec_mp, B_TRUE, NULL, NULL,
+ ip_drop_packet(mp, B_TRUE, ira->ira_ill,
DROPPER(ipss, ipds_ah_bad_v6_hdrs),
&ahstack->ah_dropper);
- return (IPSEC_STATUS_FAILED);
+ return (NULL);
}
ah = (ah_t *)((uint8_t *)ip6h + hdr_length);
@@ -2186,10 +2085,10 @@ ah_icmp_error_v6(mblk_t *ipsec_mp, ipsecah_stack_t *ahstack)
ah->ah_spi, &oip6h->ip6_src, AF_INET6,
ahstack->ipsecah_netstack);
}
- ip_drop_packet(ipsec_mp, B_TRUE, NULL, NULL,
+ ip_drop_packet(mp, B_TRUE, ira->ira_ill,
DROPPER(ipss, ipds_ah_no_sa),
&ahstack->ah_dropper);
- return (IPSEC_STATUS_FAILED);
+ return (NULL);
}
IPSA_REFRELE(assoc);
@@ -2208,10 +2107,10 @@ ah_icmp_error_v6(mblk_t *ipsec_mp, ipsecah_stack_t *ahstack)
if (post_ah_ptr > mp->b_wptr) {
IP_AH_BUMP_STAT(ipss, in_discards);
- ip_drop_packet(ipsec_mp, B_TRUE, NULL, NULL,
+ ip_drop_packet(mp, B_TRUE, ira->ira_ill,
DROPPER(ipss, ipds_ah_bad_length),
&ahstack->ah_dropper);
- return (IPSEC_STATUS_FAILED);
+ return (NULL);
}
ip6h->ip6_plen = htons(ntohs(ip6h->ip6_plen) - ah_length);
@@ -2219,20 +2118,19 @@ ah_icmp_error_v6(mblk_t *ipsec_mp, ipsecah_stack_t *ahstack)
ovbcopy(post_ah_ptr, ah,
(size_t)((uintptr_t)mp->b_wptr - (uintptr_t)post_ah_ptr));
mp->b_wptr -= ah_length;
- /* Rewhack to be an ICMP error. */
- mp->b_datap->db_type = M_CTL;
- return (IPSEC_STATUS_SUCCESS);
+ return (mp);
}
/*
* IP sends up the ICMP errors for validation and the removal of
* the AH header.
+ * If succesful, the mp has been modified to not include the AH header so
+ * that the caller can fanout to the ULP's icmp error handler.
*/
-static ipsec_status_t
-ah_icmp_error_v4(mblk_t *ipsec_mp, ipsecah_stack_t *ahstack)
+static mblk_t *
+ah_icmp_error_v4(mblk_t *mp, ip_recv_attr_t *ira, ipsecah_stack_t *ahstack)
{
- mblk_t *mp;
mblk_t *mp1;
icmph_t *icmph;
int iph_hdr_length;
@@ -2248,14 +2146,6 @@ ah_icmp_error_v4(mblk_t *ipsec_mp, ipsecah_stack_t *ahstack)
uint8_t nexthdr;
ipsec_stack_t *ipss = ahstack->ipsecah_netstack->netstack_ipsec;
- mp = ipsec_mp->b_cont;
- ASSERT(mp->b_datap->db_type == M_CTL);
-
- /*
- * Change the type to M_DATA till we finish pullups.
- */
- mp->b_datap->db_type = M_DATA;
-
oipha = ipha = (ipha_t *)mp->b_rptr;
iph_hdr_length = IPH_HDR_LENGTH(ipha);
icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
@@ -2274,10 +2164,10 @@ ah_icmp_error_v4(mblk_t *ipsec_mp, ipsecah_stack_t *ahstack)
SL_WARN | SL_ERROR,
"ICMP error: Small AH header\n");
IP_AH_BUMP_STAT(ipss, in_discards);
- ip_drop_packet(ipsec_mp, B_TRUE, NULL, NULL,
+ ip_drop_packet(mp, B_TRUE, ira->ira_ill,
DROPPER(ipss, ipds_ah_bad_length),
&ahstack->ah_dropper);
- return (IPSEC_STATUS_FAILED);
+ return (NULL);
}
icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
ipha = (ipha_t *)&icmph[1];
@@ -2304,10 +2194,10 @@ ah_icmp_error_v4(mblk_t *ipsec_mp, ipsecah_stack_t *ahstack)
ah->ah_spi, &oipha->ipha_src, AF_INET,
ahstack->ipsecah_netstack);
}
- ip_drop_packet(ipsec_mp, B_TRUE, NULL, NULL,
+ ip_drop_packet(mp, B_TRUE, ira->ira_ill,
DROPPER(ipss, ipds_ah_no_sa),
&ahstack->ah_dropper);
- return (IPSEC_STATUS_FAILED);
+ return (NULL);
}
IPSA_REFRELE(assoc);
@@ -2343,10 +2233,10 @@ ah_icmp_error_v4(mblk_t *ipsec_mp, ipsecah_stack_t *ahstack)
* We tried hard, give up now.
*/
IP_AH_BUMP_STAT(ipss, in_discards);
- ip_drop_packet(ipsec_mp, B_TRUE, NULL, NULL,
+ ip_drop_packet(mp, B_TRUE, ira->ira_ill,
DROPPER(ipss, ipds_ah_nomem),
&ahstack->ah_dropper);
- return (IPSEC_STATUS_FAILED);
+ return (NULL);
}
icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
ipha = (ipha_t *)&icmph[1];
@@ -2354,8 +2244,8 @@ ah_icmp_error_v4(mblk_t *ipsec_mp, ipsecah_stack_t *ahstack)
done:
/*
* Remove the AH header and change the protocol.
- * Don't update the spi fields in the ipsec_in
- * message as we are called just to validate the
+ * Don't update the spi fields in the ip_recv_attr_t
+ * as we are called just to validate the
* message attached to the ICMP message.
*
* If we never pulled up since all of the message
@@ -2368,14 +2258,11 @@ done:
if ((mp1 = allocb(alloc_size, BPRI_LO)) == NULL) {
IP_AH_BUMP_STAT(ipss, in_discards);
- ip_drop_packet(ipsec_mp, B_TRUE, NULL, NULL,
+ ip_drop_packet(mp, B_TRUE, ira->ira_ill,
DROPPER(ipss, ipds_ah_nomem),
&ahstack->ah_dropper);
- return (IPSEC_STATUS_FAILED);
+ return (NULL);
}
- /* ICMP errors are M_CTL messages */
- mp1->b_datap->db_type = M_CTL;
- ipsec_mp->b_cont = mp1;
bcopy(mp->b_rptr, mp1->b_rptr, alloc_size);
mp1->b_wptr += alloc_size;
@@ -2402,24 +2289,23 @@ done:
ipha->ipha_hdr_checksum = 0;
ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha);
- return (IPSEC_STATUS_SUCCESS);
+ return (mp1);
}
/*
* IP calls this to validate the ICMP errors that
* we got from the network.
*/
-ipsec_status_t
-ipsecah_icmp_error(mblk_t *mp)
+mblk_t *
+ipsecah_icmp_error(mblk_t *data_mp, ip_recv_attr_t *ira)
{
- ipsec_in_t *ii = (ipsec_in_t *)mp->b_rptr;
- netstack_t *ns = ii->ipsec_in_ns;
+ netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack;
ipsecah_stack_t *ahstack = ns->netstack_ipsecah;
- if (ii->ipsec_in_v4)
- return (ah_icmp_error_v4(mp, ahstack));
+ if (ira->ira_flags & IRAF_IS_IPV4)
+ return (ah_icmp_error_v4(data_mp, ira, ahstack));
else
- return (ah_icmp_error_v6(mp, ahstack));
+ return (ah_icmp_error_v6(data_mp, ira, ahstack));
}
static int
@@ -2546,7 +2432,7 @@ ah_fix_phdr_v6(ip6_t *ip6h, ip6_t *oip6h, boolean_t outbound,
prev_nexthdr = (uint8_t *)&ip6h->ip6_nxt;
nexthdr = oip6h->ip6_nxt;
/* Assume IP has already stripped it */
- ASSERT(nexthdr != IPPROTO_FRAGMENT && nexthdr != IPPROTO_RAW);
+ ASSERT(nexthdr != IPPROTO_FRAGMENT);
ah = NULL;
dsthdr = NULL;
for (;;) {
@@ -2741,19 +2627,19 @@ ah_finish_up(ah_t *phdr_ah, ah_t *inbound_ah, ipsa_t *assoc,
* argument is freed.
*/
static void
-ah_log_bad_auth(mblk_t *ipsec_in)
+ah_log_bad_auth(mblk_t *mp, ip_recv_attr_t *ira, ipsec_crypto_t *ic)
{
- mblk_t *mp = ipsec_in->b_cont->b_cont;
- ipsec_in_t *ii = (ipsec_in_t *)ipsec_in->b_rptr;
- boolean_t isv4 = ii->ipsec_in_v4;
- ipsa_t *assoc = ii->ipsec_in_ah_sa;
- int af;
- void *addr;
- netstack_t *ns = ii->ipsec_in_ns;
+ boolean_t isv4 = (ira->ira_flags & IRAF_IS_IPV4);
+ ipsa_t *assoc = ira->ira_ipsec_ah_sa;
+ int af;
+ void *addr;
+ netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack;
ipsecah_stack_t *ahstack = ns->netstack_ipsecah;
ipsec_stack_t *ipss = ns->netstack_ipsec;
- mp->b_rptr -= ii->ipsec_in_skip_len;
+ ASSERT(mp->b_datap->db_type == M_DATA);
+
+ mp->b_rptr -= ic->ic_skip_len;
if (isv4) {
ipha_t *ipha = (ipha_t *)mp->b_rptr;
@@ -2776,110 +2662,163 @@ ah_log_bad_auth(mblk_t *ipsec_in)
assoc->ipsa_spi, addr, af, ahstack->ipsecah_netstack);
IP_AH_BUMP_STAT(ipss, in_discards);
- ip_drop_packet(ipsec_in, B_TRUE, NULL, NULL,
+ ip_drop_packet(mp, B_TRUE, ira->ira_ill,
DROPPER(ipss, ipds_ah_bad_auth),
&ahstack->ah_dropper);
}
/*
* Kernel crypto framework callback invoked after completion of async
- * crypto requests.
+ * crypto requests for outbound packets.
*/
static void
-ah_kcf_callback(void *arg, int status)
+ah_kcf_callback_outbound(void *arg, int status)
{
- mblk_t *ipsec_mp = (mblk_t *)arg;
- ipsec_in_t *ii = (ipsec_in_t *)ipsec_mp->b_rptr;
- boolean_t is_inbound = (ii->ipsec_in_type == IPSEC_IN);
- netstackid_t stackid;
- netstack_t *ns, *ns_arg;
+ mblk_t *mp = (mblk_t *)arg;
+ mblk_t *async_mp;
+ netstack_t *ns;
ipsec_stack_t *ipss;
ipsecah_stack_t *ahstack;
- ipsec_out_t *io = (ipsec_out_t *)ii;
+ mblk_t *data_mp;
+ ip_xmit_attr_t ixas;
+ ipsec_crypto_t *ic;
+ ill_t *ill;
- ASSERT(ipsec_mp->b_cont != NULL);
+ /*
+ * First remove the ipsec_crypto_t mblk
+ * Note that we need to ipsec_free_crypto_data(mp) once done with ic.
+ */
+ async_mp = ipsec_remove_crypto_data(mp, &ic);
+ ASSERT(async_mp != NULL);
- if (is_inbound) {
- stackid = ii->ipsec_in_stackid;
- ns_arg = ii->ipsec_in_ns;
+ /*
+ * Extract the ip_xmit_attr_t from the first mblk.
+	 * Verifies that the netstack and ill are still around; they could
+	 * have vanished while kEF was doing its work.
+	 * On successful return we have a nce_t and the ill/ipst can't
+	 * disappear until we do the nce_refrele in ixa_cleanup.
+ */
+ data_mp = async_mp->b_cont;
+ async_mp->b_cont = NULL;
+ if (!ip_xmit_attr_from_mblk(async_mp, &ixas)) {
+ /* Disappeared on us - no ill/ipst for MIB */
+ if (ixas.ixa_nce != NULL) {
+ ill = ixas.ixa_nce->nce_ill;
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
+ ip_drop_output("ipIfStatsOutDiscards", data_mp, ill);
+ }
+ freemsg(data_mp);
+ goto done;
+ }
+ ns = ixas.ixa_ipst->ips_netstack;
+ ahstack = ns->netstack_ipsecah;
+ ipss = ns->netstack_ipsec;
+ ill = ixas.ixa_nce->nce_ill;
+
+ if (status == CRYPTO_SUCCESS) {
+ data_mp = ah_auth_out_done(data_mp, &ixas, ic);
+ if (data_mp == NULL)
+ goto done;
+
+ (void) ip_output_post_ipsec(data_mp, &ixas);
} else {
- stackid = io->ipsec_out_stackid;
- ns_arg = io->ipsec_out_ns;
+ /* Outbound shouldn't see invalid MAC */
+ ASSERT(status != CRYPTO_INVALID_MAC);
+
+ ah1dbg(ahstack,
+ ("ah_kcf_callback_outbound: crypto failed with 0x%x\n",
+ status));
+ AH_BUMP_STAT(ahstack, crypto_failures);
+ AH_BUMP_STAT(ahstack, out_discards);
+
+ ip_drop_packet(data_mp, B_FALSE, ill,
+ DROPPER(ipss, ipds_ah_crypto_failed),
+ &ahstack->ah_dropper);
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
}
+done:
+ ixa_cleanup(&ixas);
+ (void) ipsec_free_crypto_data(mp);
+}
+
+/*
+ * Kernel crypto framework callback invoked after completion of async
+ * crypto requests for inbound packets.
+ */
+static void
+ah_kcf_callback_inbound(void *arg, int status)
+{
+ mblk_t *mp = (mblk_t *)arg;
+ mblk_t *async_mp;
+ netstack_t *ns;
+ ipsec_stack_t *ipss;
+ ipsecah_stack_t *ahstack;
+ mblk_t *data_mp;
+ ip_recv_attr_t iras;
+ ipsec_crypto_t *ic;
+
/*
- * Verify that the netstack is still around; could have vanished
- * while kEf was doing its work.
+ * First remove the ipsec_crypto_t mblk
+ * Note that we need to ipsec_free_crypto_data(mp) once done with ic.
*/
- ns = netstack_find_by_stackid(stackid);
- if (ns == NULL || ns != ns_arg) {
- /* Disappeared on us */
- if (ns != NULL)
- netstack_rele(ns);
- freemsg(ipsec_mp);
- return;
- }
+ async_mp = ipsec_remove_crypto_data(mp, &ic);
+ ASSERT(async_mp != NULL);
+ /*
+	 * Extract the ip_recv_attr_t from the first mblk.
+	 * Verifies that the netstack and ill are still around; they could
+	 * have vanished while kEF was doing its work.
+ */
+ data_mp = async_mp->b_cont;
+ async_mp->b_cont = NULL;
+ if (!ip_recv_attr_from_mblk(async_mp, &iras)) {
+ /* The ill or ip_stack_t disappeared on us */
+ ip_drop_input("ip_recv_attr_from_mblk", data_mp, NULL);
+ freemsg(data_mp);
+ goto done;
+ }
+ ns = iras.ira_ill->ill_ipst->ips_netstack;
ahstack = ns->netstack_ipsecah;
ipss = ns->netstack_ipsec;
if (status == CRYPTO_SUCCESS) {
- if (is_inbound) {
- if (ah_auth_in_done(ipsec_mp) != IPSEC_STATUS_SUCCESS) {
- netstack_rele(ns);
- return;
- }
- /* finish IPsec processing */
- ip_fanout_proto_again(ipsec_mp, NULL, NULL, NULL);
- } else {
- ipha_t *ipha;
+ data_mp = ah_auth_in_done(data_mp, &iras, ic);
+ if (data_mp == NULL)
+ goto done;
- if (ah_auth_out_done(ipsec_mp) !=
- IPSEC_STATUS_SUCCESS) {
- netstack_rele(ns);
- return;
- }
-
- /* finish IPsec processing */
- ipha = (ipha_t *)ipsec_mp->b_cont->b_rptr;
- if (IPH_HDR_VERSION(ipha) == IP_VERSION) {
- ip_wput_ipsec_out(NULL, ipsec_mp, ipha, NULL,
- NULL);
- } else {
- ip6_t *ip6h = (ip6_t *)ipha;
- ip_wput_ipsec_out_v6(NULL, ipsec_mp, ip6h,
- NULL, NULL);
- }
- }
+ /* finish IPsec processing */
+ ip_input_post_ipsec(data_mp, &iras);
} else if (status == CRYPTO_INVALID_MAC) {
- ah_log_bad_auth(ipsec_mp);
+ ah_log_bad_auth(data_mp, &iras, ic);
} else {
- ah1dbg(ahstack, ("ah_kcf_callback: crypto failed with 0x%x\n",
+ ah1dbg(ahstack,
+ ("ah_kcf_callback_inbound: crypto failed with 0x%x\n",
status));
AH_BUMP_STAT(ahstack, crypto_failures);
- if (is_inbound)
- IP_AH_BUMP_STAT(ipss, in_discards);
- else
- AH_BUMP_STAT(ahstack, out_discards);
- ip_drop_packet(ipsec_mp, is_inbound, NULL, NULL,
+ IP_AH_BUMP_STAT(ipss, in_discards);
+ ip_drop_packet(data_mp, B_TRUE, iras.ira_ill,
DROPPER(ipss, ipds_ah_crypto_failed),
&ahstack->ah_dropper);
+ BUMP_MIB(iras.ira_ill->ill_ip_mib, ipIfStatsInDiscards);
}
- netstack_rele(ns);
+done:
+ ira_cleanup(&iras, B_TRUE);
+ (void) ipsec_free_crypto_data(mp);
}
/*
* Invoked on kernel crypto failure during inbound and outbound processing.
*/
static void
-ah_crypto_failed(mblk_t *mp, boolean_t is_inbound, int kef_rc,
- ipsecah_stack_t *ahstack)
+ah_crypto_failed(mblk_t *data_mp, boolean_t is_inbound, int kef_rc,
+ ill_t *ill, ipsecah_stack_t *ahstack)
{
ipsec_stack_t *ipss = ahstack->ipsecah_netstack->netstack_ipsec;
ah1dbg(ahstack, ("crypto failed for %s AH with 0x%x\n",
is_inbound ? "inbound" : "outbound", kef_rc));
- ip_drop_packet(mp, is_inbound, NULL, NULL,
+ ip_drop_packet(data_mp, is_inbound, ill,
DROPPER(ipss, ipds_ah_crypto_failed),
&ahstack->ah_dropper);
AH_BUMP_STAT(ahstack, crypto_failures);
@@ -2893,14 +2832,14 @@ ah_crypto_failed(mblk_t *mp, boolean_t is_inbound, int kef_rc,
* Helper macros for the ah_submit_req_{inbound,outbound}() functions.
*/
-#define AH_INIT_CALLREQ(_cr, _ipss) { \
- (_cr)->cr_flag = CRYPTO_SKIP_REQID|CRYPTO_RESTRICTED; \
- if ((_ipss)->ipsec_algs_exec_mode[IPSEC_ALG_AUTH] == \
- IPSEC_ALGS_EXEC_ASYNC) \
- (_cr)->cr_flag |= CRYPTO_ALWAYS_QUEUE; \
- (_cr)->cr_callback_arg = ipsec_mp; \
- (_cr)->cr_callback_func = ah_kcf_callback; \
-}
+/*
+ * A statement-equivalent macro, _cr MUST point to a modifiable
+ * crypto_call_req_t.
+ */
+#define AH_INIT_CALLREQ(_cr, _mp, _callback) \
+ (_cr)->cr_flag = CRYPTO_SKIP_REQID|CRYPTO_ALWAYS_QUEUE; \
+ (_cr)->cr_callback_arg = (_mp); \
+ (_cr)->cr_callback_func = (_callback)
#define AH_INIT_CRYPTO_DATA(data, msglen, mblk) { \
(data)->cd_format = CRYPTO_DATA_MBLK; \
@@ -2920,124 +2859,185 @@ ah_crypto_failed(mblk_t *mp, boolean_t is_inbound, int kef_rc,
/*
* Submit an inbound packet for processing by the crypto framework.
*/
-static ipsec_status_t
-ah_submit_req_inbound(mblk_t *ipsec_mp, size_t skip_len, uint32_t ah_offset,
- ipsa_t *assoc)
+static mblk_t *
+ah_submit_req_inbound(mblk_t *phdr_mp, ip_recv_attr_t *ira,
+ size_t skip_len, uint32_t ah_offset, ipsa_t *assoc)
{
int kef_rc;
- mblk_t *phdr_mp;
- crypto_call_req_t call_req;
- ipsec_in_t *ii = (ipsec_in_t *)ipsec_mp->b_rptr;
+ mblk_t *mp;
+ crypto_call_req_t call_req, *callrp;
uint_t icv_len = assoc->ipsa_mac_len;
crypto_ctx_template_t ctx_tmpl;
- netstack_t *ns = ii->ipsec_in_ns;
- ipsecah_stack_t *ahstack = ns->netstack_ipsecah;
- ipsec_stack_t *ipss = ns->netstack_ipsec;
+ ipsecah_stack_t *ahstack;
+ ipsec_crypto_t *ic, icstack;
+ boolean_t force = (assoc->ipsa_flags & IPSA_F_ASYNC);
+
+ ahstack = ira->ira_ill->ill_ipst->ips_netstack->netstack_ipsecah;
- phdr_mp = ipsec_mp->b_cont;
ASSERT(phdr_mp != NULL);
- ASSERT(ii->ipsec_in_type == IPSEC_IN);
+ ASSERT(phdr_mp->b_datap->db_type == M_DATA);
+
+ if (force) {
+ /* We are doing asynch; allocate mblks to hold state */
+ if ((mp = ip_recv_attr_to_mblk(ira)) == NULL ||
+ (mp = ipsec_add_crypto_data(mp, &ic)) == NULL) {
+ BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards", phdr_mp,
+ ira->ira_ill);
+ freemsg(phdr_mp);
+ return (NULL);
+ }
- /*
- * In case kEF queues and calls back, make sure we have the
- * netstackid_t for verification that the IP instance is still around
- * in esp_kcf_callback().
- */
- ASSERT(ii->ipsec_in_stackid == ns->netstack_stackid);
+ linkb(mp, phdr_mp);
+ callrp = &call_req;
+ AH_INIT_CALLREQ(callrp, mp, ah_kcf_callback_inbound);
+ } else {
+ /*
+ * If we know we are going to do sync then ipsec_crypto_t
+ * should be on the stack.
+ */
+ ic = &icstack;
+ bzero(ic, sizeof (*ic));
+ callrp = NULL;
+ }
/* init arguments for the crypto framework */
- AH_INIT_CRYPTO_DATA(&ii->ipsec_in_crypto_data, AH_MSGSIZE(phdr_mp),
+ AH_INIT_CRYPTO_DATA(&ic->ic_crypto_data, AH_MSGSIZE(phdr_mp),
phdr_mp);
- AH_INIT_CRYPTO_MAC(&ii->ipsec_in_crypto_mac, icv_len,
+ AH_INIT_CRYPTO_MAC(&ic->ic_crypto_mac, icv_len,
(char *)phdr_mp->b_cont->b_rptr - skip_len + ah_offset +
sizeof (ah_t));
- AH_INIT_CALLREQ(&call_req, ipss);
-
- ii->ipsec_in_skip_len = skip_len;
+ ic->ic_skip_len = skip_len;
IPSEC_CTX_TMPL(assoc, ipsa_authtmpl, IPSEC_ALG_AUTH, ctx_tmpl);
/* call KEF to do the MAC operation */
kef_rc = crypto_mac_verify(&assoc->ipsa_amech,
- &ii->ipsec_in_crypto_data, &assoc->ipsa_kcfauthkey, ctx_tmpl,
- &ii->ipsec_in_crypto_mac, &call_req);
+ &ic->ic_crypto_data, &assoc->ipsa_kcfauthkey, ctx_tmpl,
+ &ic->ic_crypto_mac, callrp);
switch (kef_rc) {
case CRYPTO_SUCCESS:
AH_BUMP_STAT(ahstack, crypto_sync);
- return (ah_auth_in_done(ipsec_mp));
+ phdr_mp = ah_auth_in_done(phdr_mp, ira, ic);
+ if (force) {
+ /* Free mp after we are done with ic */
+ mp = ipsec_free_crypto_data(mp);
+ (void) ip_recv_attr_free_mblk(mp);
+ }
+ return (phdr_mp);
case CRYPTO_QUEUED:
- /* ah_kcf_callback() will be invoked on completion */
+ /* ah_kcf_callback_inbound() will be invoked on completion */
AH_BUMP_STAT(ahstack, crypto_async);
- return (IPSEC_STATUS_PENDING);
+ return (NULL);
case CRYPTO_INVALID_MAC:
+ /* Free mp after we are done with ic */
AH_BUMP_STAT(ahstack, crypto_sync);
- ah_log_bad_auth(ipsec_mp);
- return (IPSEC_STATUS_FAILED);
+ BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards);
+ ah_log_bad_auth(phdr_mp, ira, ic);
+ /* phdr_mp was passed to ip_drop_packet */
+ if (force) {
+ mp = ipsec_free_crypto_data(mp);
+ (void) ip_recv_attr_free_mblk(mp);
+ }
+ return (NULL);
}
- ah_crypto_failed(ipsec_mp, B_TRUE, kef_rc, ahstack);
- return (IPSEC_STATUS_FAILED);
+ if (force) {
+ mp = ipsec_free_crypto_data(mp);
+ phdr_mp = ip_recv_attr_free_mblk(mp);
+ }
+ BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards);
+ ah_crypto_failed(phdr_mp, B_TRUE, kef_rc, ira->ira_ill, ahstack);
+ /* phdr_mp was passed to ip_drop_packet */
+ return (NULL);
}
/*
* Submit an outbound packet for processing by the crypto framework.
*/
-static ipsec_status_t
-ah_submit_req_outbound(mblk_t *ipsec_mp, size_t skip_len, ipsa_t *assoc)
+static mblk_t *
+ah_submit_req_outbound(mblk_t *phdr_mp, ip_xmit_attr_t *ixa,
+ size_t skip_len, ipsa_t *assoc)
{
int kef_rc;
- mblk_t *phdr_mp;
- crypto_call_req_t call_req;
- ipsec_out_t *io = (ipsec_out_t *)ipsec_mp->b_rptr;
+ mblk_t *mp;
+ crypto_call_req_t call_req, *callrp;
uint_t icv_len = assoc->ipsa_mac_len;
- netstack_t *ns = io->ipsec_out_ns;
- ipsecah_stack_t *ahstack = ns->netstack_ipsecah;
- ipsec_stack_t *ipss = ns->netstack_ipsec;
+ ipsecah_stack_t *ahstack;
+ ipsec_crypto_t *ic, icstack;
+ ill_t *ill = ixa->ixa_nce->nce_ill;
+ boolean_t force = (assoc->ipsa_flags & IPSA_F_ASYNC);
- phdr_mp = ipsec_mp->b_cont;
- ASSERT(phdr_mp != NULL);
- ASSERT(io->ipsec_out_type == IPSEC_OUT);
+ ahstack = ill->ill_ipst->ips_netstack->netstack_ipsecah;
- /*
- * In case kEF queues and calls back, keep netstackid_t for
- * verification that the IP instance is still around in
- * ah_kcf_callback().
- */
- io->ipsec_out_stackid = ns->netstack_stackid;
+ ASSERT(phdr_mp != NULL);
+ ASSERT(phdr_mp->b_datap->db_type == M_DATA);
+
+ if (force) {
+ /* We are doing asynch; allocate mblks to hold state */
+ if ((mp = ip_xmit_attr_to_mblk(ixa)) == NULL ||
+ (mp = ipsec_add_crypto_data(mp, &ic)) == NULL) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
+ ip_drop_output("ipIfStatsOutDiscards", phdr_mp, ill);
+ freemsg(phdr_mp);
+ return (NULL);
+ }
+ linkb(mp, phdr_mp);
+ callrp = &call_req;
+ AH_INIT_CALLREQ(callrp, mp, ah_kcf_callback_outbound);
+ } else {
+ /*
+ * If we know we are going to do sync then ipsec_crypto_t
+ * should be on the stack.
+ */
+ ic = &icstack;
+ bzero(ic, sizeof (*ic));
+ callrp = NULL;
+ }
/* init arguments for the crypto framework */
- AH_INIT_CRYPTO_DATA(&io->ipsec_out_crypto_data, AH_MSGSIZE(phdr_mp),
+ AH_INIT_CRYPTO_DATA(&ic->ic_crypto_data, AH_MSGSIZE(phdr_mp),
phdr_mp);
- AH_INIT_CRYPTO_MAC(&io->ipsec_out_crypto_mac, icv_len,
+ AH_INIT_CRYPTO_MAC(&ic->ic_crypto_mac, icv_len,
(char *)phdr_mp->b_wptr);
- AH_INIT_CALLREQ(&call_req, ipss);
+ ic->ic_skip_len = skip_len;
- io->ipsec_out_skip_len = skip_len;
-
- ASSERT(io->ipsec_out_ah_sa != NULL);
+ ASSERT(ixa->ixa_ipsec_ah_sa != NULL);
/* call KEF to do the MAC operation */
- kef_rc = crypto_mac(&assoc->ipsa_amech, &io->ipsec_out_crypto_data,
+ kef_rc = crypto_mac(&assoc->ipsa_amech, &ic->ic_crypto_data,
&assoc->ipsa_kcfauthkey, assoc->ipsa_authtmpl,
- &io->ipsec_out_crypto_mac, &call_req);
+ &ic->ic_crypto_mac, callrp);
switch (kef_rc) {
case CRYPTO_SUCCESS:
AH_BUMP_STAT(ahstack, crypto_sync);
- return (ah_auth_out_done(ipsec_mp));
+ phdr_mp = ah_auth_out_done(phdr_mp, ixa, ic);
+ if (force) {
+ /* Free mp after we are done with ic */
+ mp = ipsec_free_crypto_data(mp);
+ (void) ip_xmit_attr_free_mblk(mp);
+ }
+ return (phdr_mp);
case CRYPTO_QUEUED:
- /* ah_kcf_callback() will be invoked on completion */
+ /* ah_kcf_callback_outbound() will be invoked on completion */
AH_BUMP_STAT(ahstack, crypto_async);
- return (IPSEC_STATUS_PENDING);
+ return (NULL);
}
- ah_crypto_failed(ipsec_mp, B_FALSE, kef_rc, ahstack);
- return (IPSEC_STATUS_FAILED);
+ if (force) {
+ mp = ipsec_free_crypto_data(mp);
+ phdr_mp = ip_xmit_attr_free_mblk(mp);
+ }
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
+ ah_crypto_failed(phdr_mp, B_FALSE, kef_rc, NULL, ahstack);
+ /* phdr_mp was passed to ip_drop_packet */
+ return (NULL);
}
/*
@@ -3056,7 +3056,6 @@ ah_process_ip_options_v6(mblk_t *mp, ipsa_t *assoc, int *length_to_skip,
uint_t ah_align_sz;
uint_t ah_offset;
int hdr_size;
- ipsec_stack_t *ipss = ahstack->ipsecah_netstack->netstack_ipsec;
/*
* Allocate space for the authentication data also. It is
@@ -3135,9 +3134,6 @@ ah_process_ip_options_v6(mblk_t *mp, ipsa_t *assoc, int *length_to_skip,
ah_offset = ah_fix_phdr_v6(ip6h, oip6h, outbound, B_FALSE);
if (ah_offset == 0) {
- ip_drop_packet(phdr_mp, !outbound, NULL, NULL,
- DROPPER(ipss, ipds_ah_bad_v6_hdrs),
- &ahstack->ah_dropper);
return (NULL);
}
}
@@ -3375,65 +3371,67 @@ ah_hdr:
/*
* Authenticate an outbound datagram. This function is called
* whenever IP sends an outbound datagram that needs authentication.
+ * Returns a modified packet if done. Returns NULL if error or queued.
+ * If error return then ipIfStatsOutDiscards has been increased.
*/
-static ipsec_status_t
-ah_outbound(mblk_t *ipsec_out)
+static mblk_t *
+ah_outbound(mblk_t *data_mp, ip_xmit_attr_t *ixa)
{
- mblk_t *mp;
mblk_t *phdr_mp;
- ipsec_out_t *oi;
ipsa_t *assoc;
int length_to_skip;
uint_t ah_align_sz;
uint_t age_bytes;
- netstack_t *ns;
- ipsec_stack_t *ipss;
- ipsecah_stack_t *ahstack;
+ netstack_t *ns = ixa->ixa_ipst->ips_netstack;
+ ipsecah_stack_t *ahstack = ns->netstack_ipsecah;
+ ipsec_stack_t *ipss = ns->netstack_ipsec;
+ ill_t *ill = ixa->ixa_nce->nce_ill;
+ boolean_t need_refrele = B_FALSE;
/*
* Construct the chain of mblks
*
- * IPSEC_OUT->PSEUDO_HDR->DATA
+ * PSEUDO_HDR->DATA
*
* one by one.
*/
- ASSERT(ipsec_out->b_datap->db_type == M_CTL);
-
- ASSERT(MBLKL(ipsec_out) >= sizeof (ipsec_info_t));
-
- mp = ipsec_out->b_cont;
- oi = (ipsec_out_t *)ipsec_out->b_rptr;
- ns = oi->ipsec_out_ns;
- ipss = ns->netstack_ipsec;
- ahstack = ns->netstack_ipsecah;
-
AH_BUMP_STAT(ahstack, out_requests);
- ASSERT(mp->b_datap->db_type == M_DATA);
+ ASSERT(data_mp->b_datap->db_type == M_DATA);
- assoc = oi->ipsec_out_ah_sa;
+ assoc = ixa->ixa_ipsec_ah_sa;
ASSERT(assoc != NULL);
/*
* Get the outer IP header in shape to escape this system..
*/
- if (is_system_labeled() && (assoc->ipsa_ocred != NULL)) {
- int whack;
-
- mblk_setcred(mp, assoc->ipsa_ocred, NOPID);
- if (oi->ipsec_out_v4)
- whack = sadb_whack_label(&mp, assoc);
- else
- whack = sadb_whack_label_v6(&mp, assoc);
- if (whack != 0) {
- ip_drop_packet(ipsec_out, B_FALSE, NULL,
- NULL, DROPPER(ipss, ipds_ah_nomem),
+ if (is_system_labeled() && (assoc->ipsa_otsl != NULL)) {
+ /*
+ * Need to update packet with any CIPSO option and update
+ * ixa_tsl to capture the new label.
+ * We allocate a separate ixa for that purpose.
+ */
+ ixa = ip_xmit_attr_duplicate(ixa);
+ if (ixa == NULL) {
+ ip_drop_packet(data_mp, B_FALSE, ill,
+ DROPPER(ipss, ipds_ah_nomem),
&ahstack->ah_dropper);
- return (IPSEC_STATUS_FAILED);
+ return (NULL);
+ }
+ need_refrele = B_TRUE;
+
+ label_hold(assoc->ipsa_otsl);
+ ip_xmit_attr_replace_tsl(ixa, assoc->ipsa_otsl);
+
+ data_mp = sadb_whack_label(data_mp, assoc, ixa,
+ DROPPER(ipss, ipds_ah_nomem), &ahstack->ah_dropper);
+ if (data_mp == NULL) {
+ /* Packet dropped by sadb_whack_label */
+ ixa_refrele(ixa);
+ return (NULL);
}
- ipsec_out->b_cont = mp;
}
/*
@@ -3441,14 +3439,14 @@ ah_outbound(mblk_t *ipsec_out)
* adding the AH header, ICV, and padding to the packet.
*/
- if (oi->ipsec_out_v4) {
- ipha_t *ipha = (ipha_t *)mp->b_rptr;
+ if (ixa->ixa_flags & IXAF_IS_IPV4) {
+ ipha_t *ipha = (ipha_t *)data_mp->b_rptr;
ah_align_sz = P2ALIGN(assoc->ipsa_mac_len +
IPV4_PADDING_ALIGN - 1, IPV4_PADDING_ALIGN);
age_bytes = ntohs(ipha->ipha_length) + sizeof (ah_t) +
ah_align_sz;
} else {
- ip6_t *ip6h = (ip6_t *)mp->b_rptr;
+ ip6_t *ip6h = (ip6_t *)data_mp->b_rptr;
ah_align_sz = P2ALIGN(assoc->ipsa_mac_len +
IPV6_PADDING_ALIGN - 1, IPV6_PADDING_ALIGN);
age_bytes = sizeof (ip6_t) + ntohs(ip6h->ip6_plen) +
@@ -3461,8 +3459,12 @@ ah_outbound(mblk_t *ipsec_out)
"AH association 0x%x, dst %s had bytes expire.\n",
ntohl(assoc->ipsa_spi), assoc->ipsa_dstaddr, AF_INET,
ahstack->ipsecah_netstack);
- freemsg(ipsec_out);
- return (IPSEC_STATUS_FAILED);
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
+ ip_drop_output("ipIfStatsOutDiscards", data_mp, ill);
+ freemsg(data_mp);
+ if (need_refrele)
+ ixa_refrele(ixa);
+ return (NULL);
}
/*
@@ -3470,64 +3472,59 @@ ah_outbound(mblk_t *ipsec_out)
* (AH is computing the checksum over the outer label).
*/
- if (oi->ipsec_out_is_capab_ill) {
- ah3dbg(ahstack, ("ah_outbound: pkt can be accelerated\n"));
- if (oi->ipsec_out_v4)
- return (ah_outbound_accelerated_v4(ipsec_out, assoc));
- else
- return (ah_outbound_accelerated_v6(ipsec_out, assoc));
- }
- AH_BUMP_STAT(ahstack, noaccel);
-
/*
* Insert pseudo header:
- * IPSEC_INFO -> [IP, ULP] => IPSEC_INFO -> [IP, AH, ICV] -> ULP
+ * [IP, ULP] => [IP, AH, ICV] -> ULP
*/
- if (oi->ipsec_out_v4) {
- phdr_mp = ah_process_ip_options_v4(mp, assoc, &length_to_skip,
- assoc->ipsa_mac_len, B_TRUE, ahstack);
+ if (ixa->ixa_flags & IXAF_IS_IPV4) {
+ phdr_mp = ah_process_ip_options_v4(data_mp, assoc,
+ &length_to_skip, assoc->ipsa_mac_len, B_TRUE, ahstack);
} else {
- phdr_mp = ah_process_ip_options_v6(mp, assoc, &length_to_skip,
- assoc->ipsa_mac_len, B_TRUE, ahstack);
+ phdr_mp = ah_process_ip_options_v6(data_mp, assoc,
+ &length_to_skip, assoc->ipsa_mac_len, B_TRUE, ahstack);
}
if (phdr_mp == NULL) {
AH_BUMP_STAT(ahstack, out_discards);
- ip_drop_packet(ipsec_out, B_FALSE, NULL, NULL,
+ ip_drop_packet(data_mp, B_FALSE, ixa->ixa_nce->nce_ill,
DROPPER(ipss, ipds_ah_bad_v4_opts),
&ahstack->ah_dropper);
- return (IPSEC_STATUS_FAILED);
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
+ if (need_refrele)
+ ixa_refrele(ixa);
+ return (NULL);
}
- ipsec_out->b_cont = phdr_mp;
- phdr_mp->b_cont = mp;
- mp->b_rptr += length_to_skip;
+ phdr_mp->b_cont = data_mp;
+ data_mp->b_rptr += length_to_skip;
+ data_mp = phdr_mp;
/*
- * At this point ipsec_out points to the IPSEC_OUT, new_mp
- * points to an mblk containing the pseudo header (IP header,
+ * At this point data_mp points to
+ * an mblk containing the pseudo header (IP header,
* AH header, and ICV with mutable fields zero'ed out).
* mp points to the mblk containing the ULP data. The original
- * IP header is kept before the ULP data in mp.
+ * IP header is kept before the ULP data in data_mp.
*/
/* submit MAC request to KCF */
- return (ah_submit_req_outbound(ipsec_out, length_to_skip, assoc));
+ data_mp = ah_submit_req_outbound(data_mp, ixa, length_to_skip, assoc);
+ if (need_refrele)
+ ixa_refrele(ixa);
+ return (data_mp);
}
-static ipsec_status_t
-ah_inbound(mblk_t *ipsec_in_mp, void *arg)
+static mblk_t *
+ah_inbound(mblk_t *data_mp, void *arg, ip_recv_attr_t *ira)
{
- mblk_t *data_mp = ipsec_in_mp->b_cont;
- ipsec_in_t *ii = (ipsec_in_t *)ipsec_in_mp->b_rptr;
- ah_t *ah = (ah_t *)arg;
- ipsa_t *assoc = ii->ipsec_in_ah_sa;
- int length_to_skip;
- int ah_length;
- mblk_t *phdr_mp;
- uint32_t ah_offset;
- netstack_t *ns = ii->ipsec_in_ns;
+ ah_t *ah = (ah_t *)arg;
+ ipsa_t *assoc = ira->ira_ipsec_ah_sa;
+ int length_to_skip;
+ int ah_length;
+ mblk_t *phdr_mp;
+ uint32_t ah_offset;
+ netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack;
ipsecah_stack_t *ahstack = ns->netstack_ipsecah;
ipsec_stack_t *ipss = ns->netstack_ipsec;
@@ -3547,10 +3544,11 @@ ah_inbound(mblk_t *ipsec_in_mp, void *arg)
if (!sadb_replay_peek(assoc, ah->ah_replay)) {
AH_BUMP_STAT(ahstack, replay_early_failures);
IP_AH_BUMP_STAT(ipss, in_discards);
- ip_drop_packet(ipsec_in_mp, B_TRUE, NULL, NULL,
+ ip_drop_packet(data_mp, B_TRUE, ira->ira_ill,
DROPPER(ipss, ipds_ah_early_replay),
&ahstack->ah_dropper);
- return (IPSEC_STATUS_FAILED);
+ BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards);
+ return (NULL);
}
/*
@@ -3561,19 +3559,6 @@ ah_inbound(mblk_t *ipsec_in_mp, void *arg)
ah_offset = (uchar_t *)ah - data_mp->b_rptr;
/*
- * Has this packet already been processed by a hardware
- * IPsec accelerator?
- */
- if (ii->ipsec_in_accelerated) {
- ah3dbg(ahstack,
- ("ah_inbound_v6: pkt processed by ill=%d isv6=%d\n",
- ii->ipsec_in_ill_index, !ii->ipsec_in_v4));
- return (ah_inbound_accelerated(ipsec_in_mp, ii->ipsec_in_v4,
- assoc, ah_offset));
- }
- AH_BUMP_STAT(ahstack, noaccel);
-
- /*
* We need to pullup until the ICV before we call
* ah_process_ip_options_v6.
*/
@@ -3590,18 +3575,19 @@ ah_inbound(mblk_t *ipsec_in_mp, void *arg)
SL_WARN | SL_ERROR,
"ah_inbound: Small AH header\n");
IP_AH_BUMP_STAT(ipss, in_discards);
- ip_drop_packet(ipsec_in_mp, B_TRUE, NULL, NULL,
+ ip_drop_packet(data_mp, B_TRUE, ira->ira_ill,
DROPPER(ipss, ipds_ah_nomem),
&ahstack->ah_dropper);
- return (IPSEC_STATUS_FAILED);
+ BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards);
+ return (NULL);
}
}
/*
* Insert pseudo header:
- * IPSEC_INFO -> [IP, ULP] => IPSEC_INFO -> [IP, AH, ICV] -> ULP
+ * [IP, ULP] => [IP, AH, ICV] -> ULP
*/
- if (ii->ipsec_in_v4) {
+ if (ira->ira_flags & IRAF_IS_IPV4) {
phdr_mp = ah_process_ip_options_v4(data_mp, assoc,
&length_to_skip, assoc->ipsa_mac_len, B_FALSE, ahstack);
} else {
@@ -3611,483 +3597,33 @@ ah_inbound(mblk_t *ipsec_in_mp, void *arg)
if (phdr_mp == NULL) {
IP_AH_BUMP_STAT(ipss, in_discards);
- ip_drop_packet(ipsec_in_mp, B_TRUE, NULL, NULL,
- (ii->ipsec_in_v4 ?
+ ip_drop_packet(data_mp, B_TRUE, ira->ira_ill,
+ ((ira->ira_flags & IRAF_IS_IPV4) ?
DROPPER(ipss, ipds_ah_bad_v4_opts) :
DROPPER(ipss, ipds_ah_bad_v6_hdrs)),
&ahstack->ah_dropper);
- return (IPSEC_STATUS_FAILED);
+ BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards);
+ return (NULL);
}
- ipsec_in_mp->b_cont = phdr_mp;
phdr_mp->b_cont = data_mp;
data_mp->b_rptr += length_to_skip;
+ data_mp = phdr_mp;
/* submit request to KCF */
- return (ah_submit_req_inbound(ipsec_in_mp, length_to_skip, ah_offset,
+ return (ah_submit_req_inbound(data_mp, ira, length_to_skip, ah_offset,
assoc));
}
/*
- * ah_inbound_accelerated:
- * Called from ah_inbound() to process IPsec packets that have been
- * accelerated by hardware.
- *
- * Basically does what ah_auth_in_done() with some changes since
- * no pseudo-headers are involved, i.e. the passed message is a
- * IPSEC_INFO->DATA.
- *
- * It is assumed that only packets that have been successfully
- * processed by the adapter come here.
- *
- * 1. get algorithm structure corresponding to association
- * 2. calculate pointers to authentication header and ICV
- * 3. compare ICV in AH header with ICV in data attributes
- * 3.1 if different:
- * 3.1.1 generate error
- * 3.1.2 discard message
- * 3.2 if ICV matches:
- * 3.2.1 check replay
- * 3.2.2 remove AH header
- * 3.2.3 age SA byte
- * 3.2.4 send to IP
- */
-ipsec_status_t
-ah_inbound_accelerated(mblk_t *ipsec_in, boolean_t isv4, ipsa_t *assoc,
- uint32_t ah_offset)
-{
- mblk_t *mp;
- ipha_t *ipha;
- ah_t *ah;
- ipsec_in_t *ii;
- uint32_t icv_len;
- uint32_t align_len;
- uint32_t age_bytes;
- ip6_t *ip6h;
- uint8_t *in_icv;
- mblk_t *hada_mp;
- uint32_t next_hdr;
- da_ipsec_t *hada;
- kstat_named_t *counter;
- ipsecah_stack_t *ahstack;
- netstack_t *ns;
- ipsec_stack_t *ipss;
-
- ii = (ipsec_in_t *)ipsec_in->b_rptr;
- ns = ii->ipsec_in_ns;
- ahstack = ns->netstack_ipsecah;
- ipss = ns->netstack_ipsec;
-
- mp = ipsec_in->b_cont;
- hada_mp = ii->ipsec_in_da;
- ASSERT(hada_mp != NULL);
- hada = (da_ipsec_t *)hada_mp->b_rptr;
-
- AH_BUMP_STAT(ahstack, in_accelerated);
-
- /*
- * We only support one level of decapsulation in hardware, so
- * nuke the pointer.
- */
- ii->ipsec_in_da = NULL;
- ii->ipsec_in_accelerated = B_FALSE;
-
- /*
- * Extract ICV length from attributes M_CTL and sanity check
- * its value. We allow the mblk to be smaller than da_ipsec_t
- * for a small ICV, as long as the entire ICV fits within the mblk.
- * Also ensures that the ICV length computed by Provider
- * corresponds to the ICV length of the algorithm specified by the SA.
- */
- icv_len = hada->da_icv_len;
- if ((icv_len != assoc->ipsa_mac_len) ||
- (icv_len > DA_ICV_MAX_LEN) || (MBLKL(hada_mp) <
- (sizeof (da_ipsec_t) - DA_ICV_MAX_LEN + icv_len))) {
- ah0dbg(("ah_inbound_accelerated: "
- "ICV len (%u) incorrect or mblk too small (%u)\n",
- icv_len, (uint32_t)(MBLKL(hada_mp))));
- counter = DROPPER(ipss, ipds_ah_bad_length);
- goto ah_in_discard;
- }
- ASSERT(icv_len != 0);
-
- /* compute the padded AH ICV len */
- if (isv4) {
- ipha = (ipha_t *)mp->b_rptr;
- align_len = (icv_len + IPV4_PADDING_ALIGN - 1) &
- -IPV4_PADDING_ALIGN;
- } else {
- ip6h = (ip6_t *)mp->b_rptr;
- align_len = (icv_len + IPV6_PADDING_ALIGN - 1) &
- -IPV6_PADDING_ALIGN;
- }
-
- ah = (ah_t *)(mp->b_rptr + ah_offset);
- in_icv = (uint8_t *)ah + sizeof (ah_t);
-
- /* compare ICV in AH header vs ICV computed by adapter */
- if (bcmp(hada->da_icv, in_icv, icv_len)) {
- int af;
- void *addr;
-
- if (isv4) {
- addr = &ipha->ipha_dst;
- af = AF_INET;
- } else {
- addr = &ip6h->ip6_dst;
- af = AF_INET6;
- }
-
- /*
- * Log the event. Don't print to the console, block
- * potential denial-of-service attack.
- */
- AH_BUMP_STAT(ahstack, bad_auth);
- ipsec_assocfailure(info.mi_idnum, 0, 0, SL_ERROR | SL_WARN,
- "AH Authentication failed spi %x, dst_addr %s",
- assoc->ipsa_spi, addr, af, ahstack->ipsecah_netstack);
- counter = DROPPER(ipss, ipds_ah_bad_auth);
- goto ah_in_discard;
- }
-
- ah3dbg(ahstack, ("AH succeeded, checking replay\n"));
- AH_BUMP_STAT(ahstack, good_auth);
-
- if (!sadb_replay_check(assoc, ah->ah_replay)) {
- int af;
- void *addr;
-
- if (isv4) {
- addr = &ipha->ipha_dst;
- af = AF_INET;
- } else {
- addr = &ip6h->ip6_dst;
- af = AF_INET6;
- }
-
- /*
- * Log the event. As of now we print out an event.
- * Do not print the replay failure number, or else
- * syslog cannot collate the error messages. Printing
- * the replay number that failed (or printing to the
- * console) opens a denial-of-service attack.
- */
- AH_BUMP_STAT(ahstack, replay_failures);
- ipsec_assocfailure(info.mi_idnum, 0, 0,
- SL_ERROR | SL_WARN,
- "Replay failed for AH spi %x, dst_addr %s",
- assoc->ipsa_spi, addr, af, ahstack->ipsecah_netstack);
- counter = DROPPER(ipss, ipds_ah_replay);
- goto ah_in_discard;
- }
-
- /*
- * Remove AH header. We do this by copying everything before
- * the AH header onto the AH header+ICV.
- */
- /* overwrite AH with what was preceeding it (IP header) */
- next_hdr = ah->ah_nexthdr;
- ovbcopy(mp->b_rptr, mp->b_rptr + sizeof (ah_t) + align_len,
- ah_offset);
- mp->b_rptr += sizeof (ah_t) + align_len;
- if (isv4) {
- /* adjust IP header next protocol */
- ipha = (ipha_t *)mp->b_rptr;
- ipha->ipha_protocol = next_hdr;
-
- age_bytes = ipha->ipha_length;
-
- /* adjust length in IP header */
- ipha->ipha_length -= (sizeof (ah_t) + align_len);
-
- /* recalculate checksum */
- ipha->ipha_hdr_checksum = 0;
- ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha);
- } else {
- /* adjust IP header next protocol */
- ip6h = (ip6_t *)mp->b_rptr;
- ip6h->ip6_nxt = next_hdr;
-
- age_bytes = sizeof (ip6_t) + ntohs(ip6h->ip6_plen) +
- sizeof (ah_t);
-
- /* adjust length in IP header */
- ip6h->ip6_plen = htons(ntohs(ip6h->ip6_plen) -
- (sizeof (ah_t) + align_len));
- }
-
- /* age SA */
- if (!ah_age_bytes(assoc, age_bytes, B_TRUE)) {
- /* The ipsa has hit hard expiration, LOG and AUDIT. */
- ipsec_assocfailure(info.mi_idnum, 0, 0,
- SL_ERROR | SL_WARN,
- "AH Association 0x%x, dst %s had bytes expire.\n",
- assoc->ipsa_spi, assoc->ipsa_dstaddr,
- AF_INET, ahstack->ipsecah_netstack);
- AH_BUMP_STAT(ahstack, bytes_expired);
- counter = DROPPER(ipss, ipds_ah_bytes_expire);
- goto ah_in_discard;
- }
-
- freeb(hada_mp);
- return (IPSEC_STATUS_SUCCESS);
-
-ah_in_discard:
- IP_AH_BUMP_STAT(ipss, in_discards);
- freeb(hada_mp);
- ip_drop_packet(ipsec_in, B_TRUE, NULL, NULL, counter,
- &ahstack->ah_dropper);
- return (IPSEC_STATUS_FAILED);
-}
-
-/*
- * ah_outbound_accelerated_v4:
- * Called from ah_outbound_v4() and once it is determined that the
- * packet is elligible for hardware acceleration.
- *
- * We proceed as follows:
- * 1. allocate and initialize attributes mblk
- * 2. mark IPSEC_OUT to indicate that pkt is accelerated
- * 3. insert AH header
- */
-static ipsec_status_t
-ah_outbound_accelerated_v4(mblk_t *ipsec_mp, ipsa_t *assoc)
-{
- mblk_t *mp, *new_mp;
- ipsec_out_t *oi;
- uint_t ah_data_sz; /* ICV length, algorithm dependent */
- uint_t ah_align_sz; /* ICV length + padding */
- uint32_t v_hlen_tos_len; /* from original IP header */
- ipha_t *oipha; /* original IP header */
- ipha_t *nipha; /* new IP header */
- uint_t option_length = 0;
- uint_t new_hdr_len; /* new header length */
- uint_t iphdr_length;
- ah_t *ah_hdr; /* ptr to AH header */
- netstack_t *ns;
- ipsec_stack_t *ipss;
- ipsecah_stack_t *ahstack;
-
- oi = (ipsec_out_t *)ipsec_mp->b_rptr;
- ns = oi->ipsec_out_ns;
- ipss = ns->netstack_ipsec;
- ahstack = ns->netstack_ipsecah;
-
- mp = ipsec_mp->b_cont;
-
- AH_BUMP_STAT(ahstack, out_accelerated);
-
- oipha = (ipha_t *)mp->b_rptr;
- v_hlen_tos_len = ((uint32_t *)oipha)[0];
-
- /* mark packet as being accelerated in IPSEC_OUT */
- ASSERT(oi->ipsec_out_accelerated == B_FALSE);
- oi->ipsec_out_accelerated = B_TRUE;
-
- /* calculate authentication data length, i.e. ICV + padding */
- ah_data_sz = assoc->ipsa_mac_len;
- ah_align_sz = (ah_data_sz + IPV4_PADDING_ALIGN - 1) &
- -IPV4_PADDING_ALIGN;
-
- /*
- * Insert pseudo header:
- * IPSEC_INFO -> [IP, ULP] => IPSEC_INFO -> [IP, AH, ICV] -> ULP
- */
-
- /* IP + AH + authentication + padding data length */
- new_hdr_len = IP_SIMPLE_HDR_LENGTH + sizeof (ah_t) + ah_align_sz;
- if (V_HLEN != IP_SIMPLE_HDR_VERSION) {
- option_length = oipha->ipha_version_and_hdr_length -
- (uint8_t)((IP_VERSION << 4) +
- IP_SIMPLE_HDR_LENGTH_IN_WORDS);
- option_length <<= 2;
- new_hdr_len += option_length;
- }
-
- /* allocate pseudo-header mblk */
- if ((new_mp = allocb(new_hdr_len, BPRI_HI)) == NULL) {
- /* IPsec kstats: bump bean counter here */
- ip_drop_packet(ipsec_mp, B_FALSE, NULL, NULL,
- DROPPER(ipss, ipds_ah_nomem),
- &ahstack->ah_dropper);
- return (IPSEC_STATUS_FAILED);
- }
-
- new_mp->b_cont = mp;
- ipsec_mp->b_cont = new_mp;
- new_mp->b_wptr += new_hdr_len;
-
- /* copy original IP header to new header */
- bcopy(mp->b_rptr, new_mp->b_rptr, IP_SIMPLE_HDR_LENGTH +
- option_length);
-
- /* update IP header */
- nipha = (ipha_t *)new_mp->b_rptr;
- nipha->ipha_protocol = IPPROTO_AH;
- iphdr_length = ntohs(nipha->ipha_length);
- iphdr_length += sizeof (ah_t) + ah_align_sz;
- nipha->ipha_length = htons(iphdr_length);
- nipha->ipha_hdr_checksum = 0;
- nipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(nipha);
-
- /* skip original IP header in mp */
- mp->b_rptr += IP_SIMPLE_HDR_LENGTH + option_length;
-
- /* initialize AH header */
- ah_hdr = (ah_t *)(new_mp->b_rptr + IP_SIMPLE_HDR_LENGTH +
- option_length);
- ah_hdr->ah_nexthdr = oipha->ipha_protocol;
- if (!ah_finish_up(ah_hdr, NULL, assoc, ah_data_sz, ah_align_sz,
- ahstack)) {
- /* Only way this fails is if outbound replay counter wraps. */
- ip_drop_packet(ipsec_mp, B_FALSE, NULL, NULL,
- DROPPER(ipss, ipds_ah_replay),
- &ahstack->ah_dropper);
- return (IPSEC_STATUS_FAILED);
- }
-
- return (IPSEC_STATUS_SUCCESS);
-}
-
-/*
- * ah_outbound_accelerated_v6:
- *
- * Called from ah_outbound_v6() once it is determined that the packet
- * is eligible for hardware acceleration.
- *
- * We proceed as follows:
- * 1. allocate and initialize attributes mblk
- * 2. mark IPSEC_OUT to indicate that pkt is accelerated
- * 3. insert AH header
- */
-static ipsec_status_t
-ah_outbound_accelerated_v6(mblk_t *ipsec_mp, ipsa_t *assoc)
-{
- mblk_t *mp, *phdr_mp;
- ipsec_out_t *oi;
- uint_t ah_data_sz; /* ICV length, algorithm dependent */
- uint_t ah_align_sz; /* ICV length + padding */
- ip6_t *oip6h; /* original IP header */
- ip6_t *ip6h; /* new IP header */
- uint_t option_length = 0;
- uint_t hdr_size;
- uint_t ah_offset;
- ah_t *ah_hdr; /* ptr to AH header */
- netstack_t *ns;
- ipsec_stack_t *ipss;
- ipsecah_stack_t *ahstack;
-
- oi = (ipsec_out_t *)ipsec_mp->b_rptr;
- ns = oi->ipsec_out_ns;
- ipss = ns->netstack_ipsec;
- ahstack = ns->netstack_ipsecah;
-
- mp = ipsec_mp->b_cont;
-
- AH_BUMP_STAT(ahstack, out_accelerated);
-
- oip6h = (ip6_t *)mp->b_rptr;
-
- /* mark packet as being accelerated in IPSEC_OUT */
- ASSERT(oi->ipsec_out_accelerated == B_FALSE);
- oi->ipsec_out_accelerated = B_TRUE;
-
- /* calculate authentication data length, i.e. ICV + padding */
- ah_data_sz = assoc->ipsa_mac_len;
- ah_align_sz = (ah_data_sz + IPV4_PADDING_ALIGN - 1) &
- -IPV4_PADDING_ALIGN;
-
- ASSERT(ah_align_sz >= ah_data_sz);
-
- hdr_size = ipsec_ah_get_hdr_size_v6(mp, B_FALSE);
- option_length = hdr_size - IPV6_HDR_LEN;
-
- /* This was not included in ipsec_ah_get_hdr_size_v6() */
- hdr_size += (sizeof (ah_t) + ah_align_sz);
-
- if ((phdr_mp = allocb(hdr_size, BPRI_HI)) == NULL) {
- ip_drop_packet(ipsec_mp, B_FALSE, NULL, NULL,
- DROPPER(ipss, ipds_ah_nomem),
- &ahstack->ah_dropper);
- return (IPSEC_STATUS_FAILED);
- }
- phdr_mp->b_wptr += hdr_size;
-
- /*
- * Form the basic IP header first. We always assign every bit
- * of the v6 basic header, so a separate bzero is unneeded.
- */
- ip6h = (ip6_t *)phdr_mp->b_rptr;
- ip6h->ip6_vcf = oip6h->ip6_vcf;
- ip6h->ip6_hlim = oip6h->ip6_hlim;
- ip6h->ip6_src = oip6h->ip6_src;
- ip6h->ip6_dst = oip6h->ip6_dst;
- /*
- * Include the size of AH and authentication data.
- * This is how our recipient would compute the
- * authentication data. Look at what we do in the
- * inbound case below.
- */
- ip6h->ip6_plen = htons(ntohs(oip6h->ip6_plen) + sizeof (ah_t) +
- ah_align_sz);
-
- /*
- * Insert pseudo header:
- * IPSEC_INFO -> [IP6, LLH, ULP] =>
- * IPSEC_INFO -> [IP, LLH, AH, ICV] -> ULP
- */
-
- if (option_length == 0) {
- /* Form the AH header */
- ip6h->ip6_nxt = IPPROTO_AH;
- ((ah_t *)(ip6h + 1))->ah_nexthdr = oip6h->ip6_nxt;
- ah_offset = IPV6_HDR_LEN;
- } else {
- ip6h->ip6_nxt = oip6h->ip6_nxt;
- /* option_length does not include the AH header's size */
- ah_offset = ah_fix_phdr_v6(ip6h, oip6h, B_TRUE, B_FALSE);
- if (ah_offset == 0) {
- freemsg(phdr_mp);
- ip_drop_packet(ipsec_mp, B_FALSE, NULL, NULL,
- DROPPER(ipss, ipds_ah_bad_v6_hdrs),
- &ahstack->ah_dropper);
- return (IPSEC_STATUS_FAILED);
- }
- }
-
- phdr_mp->b_cont = mp;
- ipsec_mp->b_cont = phdr_mp;
-
- /* skip original IP header in mp */
- mp->b_rptr += IPV6_HDR_LEN + option_length;
-
- /* initialize AH header */
- ah_hdr = (ah_t *)(phdr_mp->b_rptr + IPV6_HDR_LEN + option_length);
- ah_hdr->ah_nexthdr = oip6h->ip6_nxt;
-
- if (!ah_finish_up(((ah_t *)((uint8_t *)ip6h + ah_offset)), NULL,
- assoc, ah_data_sz, ah_align_sz, ahstack)) {
- /* Only way this fails is if outbound replay counter wraps. */
- ip_drop_packet(ipsec_mp, B_FALSE, NULL, NULL,
- DROPPER(ipss, ipds_ah_replay),
- &ahstack->ah_dropper);
- return (IPSEC_STATUS_FAILED);
- }
-
- return (IPSEC_STATUS_SUCCESS);
-}
-
-/*
* Invoked after processing of an inbound packet by the
* kernel crypto framework. Called by ah_submit_req() for a sync request,
* or by the kcf callback for an async request.
- * Returns IPSEC_STATUS_SUCCESS on success, IPSEC_STATUS_FAILED on failure.
- * On failure, the mblk chain ipsec_in is freed by this function.
+ * Returns NULL if the mblk chain is consumed.
*/
-static ipsec_status_t
-ah_auth_in_done(mblk_t *ipsec_in)
+static mblk_t *
+ah_auth_in_done(mblk_t *phdr_mp, ip_recv_attr_t *ira, ipsec_crypto_t *ic)
{
- mblk_t *phdr_mp;
ipha_t *ipha;
uint_t ah_offset = 0;
mblk_t *mp;
@@ -4096,41 +3632,36 @@ ah_auth_in_done(mblk_t *ipsec_in)
uint32_t length;
uint32_t *dest32;
uint8_t *dest;
- ipsec_in_t *ii;
boolean_t isv4;
ip6_t *ip6h;
uint_t icv_len;
ipsa_t *assoc;
kstat_named_t *counter;
- netstack_t *ns;
- ipsecah_stack_t *ahstack;
- ipsec_stack_t *ipss;
-
- ii = (ipsec_in_t *)ipsec_in->b_rptr;
- ns = ii->ipsec_in_ns;
- ahstack = ns->netstack_ipsecah;
- ipss = ns->netstack_ipsec;
+ netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack;
+ ipsecah_stack_t *ahstack = ns->netstack_ipsecah;
+ ipsec_stack_t *ipss = ns->netstack_ipsec;
- isv4 = ii->ipsec_in_v4;
- assoc = ii->ipsec_in_ah_sa;
- icv_len = (uint_t)ii->ipsec_in_crypto_mac.cd_raw.iov_len;
+ isv4 = (ira->ira_flags & IRAF_IS_IPV4);
+ assoc = ira->ira_ipsec_ah_sa;
+ icv_len = (uint_t)ic->ic_crypto_mac.cd_raw.iov_len;
- phdr_mp = ipsec_in->b_cont;
if (phdr_mp == NULL) {
- ip_drop_packet(ipsec_in, B_TRUE, NULL, NULL,
+ ip_drop_packet(phdr_mp, B_TRUE, ira->ira_ill,
DROPPER(ipss, ipds_ah_nomem),
&ahstack->ah_dropper);
- return (IPSEC_STATUS_FAILED);
+ BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards);
+ return (NULL);
}
mp = phdr_mp->b_cont;
if (mp == NULL) {
- ip_drop_packet(ipsec_in, B_TRUE, NULL, NULL,
+ ip_drop_packet(phdr_mp, B_TRUE, ira->ira_ill,
DROPPER(ipss, ipds_ah_nomem),
&ahstack->ah_dropper);
- return (IPSEC_STATUS_FAILED);
+ BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards);
+ return (NULL);
}
- mp->b_rptr -= ii->ipsec_in_skip_len;
+ mp->b_rptr -= ic->ic_skip_len;
ah_set_usetime(assoc, B_TRUE);
@@ -4256,8 +3787,7 @@ ah_auth_in_done(mblk_t *ipsec_in)
while (*nexthdr != IPPROTO_AH) {
whereptr += hdrlen;
/* Assume IP has already stripped it */
- ASSERT(*nexthdr != IPPROTO_FRAGMENT &&
- *nexthdr != IPPROTO_RAW);
+ ASSERT(*nexthdr != IPPROTO_FRAGMENT);
switch (*nexthdr) {
case IPPROTO_HOPOPTS:
hbhhdr = (ip6_hbh_t *)whereptr;
@@ -4292,20 +3822,18 @@ ah_auth_in_done(mblk_t *ipsec_in)
while (--dest >= mp->b_rptr)
*dest = *(dest - newpos);
}
- ipsec_in->b_cont = mp;
- phdr_mp->b_cont = NULL;
- /*
- * If a db_credp exists in phdr_mp, it must also exist in mp.
- */
- ASSERT(DB_CRED(phdr_mp) == NULL ||
- msg_getcred(mp, NULL) != NULL);
freeb(phdr_mp);
/*
* If SA is labelled, use its label, else inherit the label
*/
- if (is_system_labeled() && (assoc->ipsa_cred != NULL)) {
- mblk_setcred(mp, assoc->ipsa_cred, NOPID);
+ if (is_system_labeled() && (assoc->ipsa_tsl != NULL)) {
+ if (!ip_recv_attr_replace_label(ira, assoc->ipsa_tsl)) {
+ ip_drop_packet(mp, B_TRUE, ira->ira_ill,
+ DROPPER(ipss, ipds_ah_nomem), &ahstack->ah_dropper);
+ BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards);
+ return (NULL);
+ }
}
if (assoc->ipsa_state == IPSA_STATE_IDLE) {
@@ -4313,17 +3841,18 @@ ah_auth_in_done(mblk_t *ipsec_in)
* Cluster buffering case. Tell caller that we're
* handling the packet.
*/
- sadb_buf_pkt(assoc, ipsec_in, ns);
- return (IPSEC_STATUS_PENDING);
+ sadb_buf_pkt(assoc, mp, ira);
+ return (NULL);
}
- return (IPSEC_STATUS_SUCCESS);
+ return (mp);
ah_in_discard:
IP_AH_BUMP_STAT(ipss, in_discards);
- ip_drop_packet(ipsec_in, B_TRUE, NULL, NULL, counter,
+ ip_drop_packet(phdr_mp, B_TRUE, ira->ira_ill, counter,
&ahstack->ah_dropper);
- return (IPSEC_STATUS_FAILED);
+ BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards);
+ return (NULL);
}
/*
@@ -4332,49 +3861,37 @@ ah_in_discard:
* executed syncrhonously, or by the KEF callback for a request
* executed asynchronously.
*/
-static ipsec_status_t
-ah_auth_out_done(mblk_t *ipsec_out)
+static mblk_t *
+ah_auth_out_done(mblk_t *phdr_mp, ip_xmit_attr_t *ixa, ipsec_crypto_t *ic)
{
- mblk_t *phdr_mp;
mblk_t *mp;
int align_len;
uint32_t hdrs_length;
uchar_t *ptr;
uint32_t length;
boolean_t isv4;
- ipsec_out_t *io;
size_t icv_len;
- netstack_t *ns;
- ipsec_stack_t *ipss;
- ipsecah_stack_t *ahstack;
-
- io = (ipsec_out_t *)ipsec_out->b_rptr;
- ns = io->ipsec_out_ns;
- ipss = ns->netstack_ipsec;
- ahstack = ns->netstack_ipsecah;
+ netstack_t *ns = ixa->ixa_ipst->ips_netstack;
+ ipsecah_stack_t *ahstack = ns->netstack_ipsecah;
+ ipsec_stack_t *ipss = ns->netstack_ipsec;
+ ill_t *ill = ixa->ixa_nce->nce_ill;
- isv4 = io->ipsec_out_v4;
- icv_len = io->ipsec_out_crypto_mac.cd_raw.iov_len;
-
- phdr_mp = ipsec_out->b_cont;
- if (phdr_mp == NULL) {
- ip_drop_packet(ipsec_out, B_FALSE, NULL, NULL,
- DROPPER(ipss, ipds_ah_nomem),
- &ahstack->ah_dropper);
- return (IPSEC_STATUS_FAILED);
- }
+ isv4 = (ixa->ixa_flags & IXAF_IS_IPV4);
+ icv_len = ic->ic_crypto_mac.cd_raw.iov_len;
mp = phdr_mp->b_cont;
if (mp == NULL) {
- ip_drop_packet(ipsec_out, B_FALSE, NULL, NULL,
+ ip_drop_packet(phdr_mp, B_FALSE, ill,
DROPPER(ipss, ipds_ah_nomem),
&ahstack->ah_dropper);
- return (IPSEC_STATUS_FAILED);
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
+ return (NULL);
}
- mp->b_rptr -= io->ipsec_out_skip_len;
+ mp->b_rptr -= ic->ic_skip_len;
- ASSERT(io->ipsec_out_ah_sa != NULL);
- ah_set_usetime(io->ipsec_out_ah_sa, B_FALSE);
+ ASSERT(ixa->ixa_flags & IXAF_IPSEC_SECURE);
+ ASSERT(ixa->ixa_ipsec_ah_sa != NULL);
+ ah_set_usetime(ixa->ixa_ipsec_ah_sa, B_FALSE);
if (isv4) {
ipha_t *ipha;
@@ -4454,7 +3971,7 @@ ah_auth_out_done(mblk_t *ipsec_out)
freeb(mp);
}
- return (IPSEC_STATUS_SUCCESS);
+ return (phdr_mp);
}
/* Refactor me */
@@ -4464,16 +3981,18 @@ ah_auth_out_done(mblk_t *ipsec_out)
*/
void
ipsecah_in_assocfailure(mblk_t *mp, char level, ushort_t sl, char *fmt,
- uint32_t spi, void *addr, int af, ipsecah_stack_t *ahstack)
+ uint32_t spi, void *addr, int af, ip_recv_attr_t *ira)
{
- ipsec_stack_t *ipss = ahstack->ipsecah_netstack->netstack_ipsec;
+ netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack;
+ ipsecah_stack_t *ahstack = ns->netstack_ipsecah;
+ ipsec_stack_t *ipss = ns->netstack_ipsec;
if (ahstack->ipsecah_log_unknown_spi) {
ipsec_assocfailure(info.mi_idnum, 0, level, sl, fmt, spi,
addr, af, ahstack->ipsecah_netstack);
}
- ip_drop_packet(mp, B_TRUE, NULL, NULL,
+ ip_drop_packet(mp, B_TRUE, ira->ira_ill,
DROPPER(ipss, ipds_ah_no_sa),
&ahstack->ah_dropper);
}
diff --git a/usr/src/uts/common/inet/ip/ipsecesp.c b/usr/src/uts/common/inet/ip/ipsecesp.c
index 089e23e937..8af449384f 100644
--- a/usr/src/uts/common/inet/ip/ipsecesp.c
+++ b/usr/src/uts/common/inet/ip/ipsecesp.c
@@ -53,6 +53,8 @@
#include <inet/ip.h>
#include <inet/ip_impl.h>
#include <inet/ip6.h>
+#include <inet/ip_if.h>
+#include <inet/ip_ndp.h>
#include <inet/sadb.h>
#include <inet/ipsec_info.h>
#include <inet/ipsec_impl.h>
@@ -67,8 +69,6 @@
#include <sys/taskq.h>
#include <sys/note.h>
-#include <sys/iphada.h>
-
#include <sys/tsol/tnet.h>
/*
@@ -130,26 +130,23 @@ static ipsecespparam_t lcl_param_arr[] = {
static int ipsecesp_open(queue_t *, dev_t *, int, int, cred_t *);
static int ipsecesp_close(queue_t *);
-static void ipsecesp_rput(queue_t *, mblk_t *);
static void ipsecesp_wput(queue_t *, mblk_t *);
static void *ipsecesp_stack_init(netstackid_t stackid, netstack_t *ns);
static void ipsecesp_stack_fini(netstackid_t stackid, void *arg);
static void esp_send_acquire(ipsacq_t *, mblk_t *, netstack_t *);
static void esp_prepare_udp(netstack_t *, mblk_t *, ipha_t *);
-static ipsec_status_t esp_outbound_accelerated(mblk_t *, uint_t);
-static ipsec_status_t esp_inbound_accelerated(mblk_t *, mblk_t *,
- boolean_t, ipsa_t *);
+static void esp_outbound_finish(mblk_t *, ip_xmit_attr_t *);
+static void esp_inbound_restart(mblk_t *, ip_recv_attr_t *);
static boolean_t esp_register_out(uint32_t, uint32_t, uint_t,
- ipsecesp_stack_t *, mblk_t *);
+ ipsecesp_stack_t *, cred_t *);
static boolean_t esp_strip_header(mblk_t *, boolean_t, uint32_t,
kstat_named_t **, ipsecesp_stack_t *);
-static ipsec_status_t esp_submit_req_inbound(mblk_t *, ipsa_t *, uint_t);
-static ipsec_status_t esp_submit_req_outbound(mblk_t *, ipsa_t *, uchar_t *,
- uint_t);
-extern void (*cl_inet_getspi)(netstackid_t, uint8_t, uint8_t *, size_t,
- void *);
+static mblk_t *esp_submit_req_inbound(mblk_t *, ip_recv_attr_t *,
+ ipsa_t *, uint_t);
+static mblk_t *esp_submit_req_outbound(mblk_t *, ip_xmit_attr_t *,
+ ipsa_t *, uchar_t *, uint_t);
/* Setable in /etc/system */
uint32_t esp_hash_size = IPSEC_DEFAULT_HASH_SIZE;
@@ -159,7 +156,7 @@ static struct module_info info = {
};
static struct qinit rinit = {
- (pfi_t)ipsecesp_rput, NULL, ipsecesp_open, ipsecesp_close, NULL, &info,
+ (pfi_t)putnext, NULL, ipsecesp_open, ipsecesp_close, NULL, &info,
NULL
};
@@ -201,9 +198,6 @@ typedef struct esp_kstats_s {
kstat_named_t esp_stat_acquire_requests;
kstat_named_t esp_stat_bytes_expired;
kstat_named_t esp_stat_out_discards;
- kstat_named_t esp_stat_in_accelerated;
- kstat_named_t esp_stat_out_accelerated;
- kstat_named_t esp_stat_noaccel;
kstat_named_t esp_stat_crypto_sync;
kstat_named_t esp_stat_crypto_async;
kstat_named_t esp_stat_crypto_failures;
@@ -266,9 +260,6 @@ esp_kstat_init(ipsecesp_stack_t *espstack, netstackid_t stackid)
KI(acquire_requests);
KI(bytes_expired);
KI(out_discards);
- KI(in_accelerated);
- KI(out_accelerated);
- KI(noaccel);
KI(crypto_sync);
KI(crypto_async);
KI(crypto_failures);
@@ -384,9 +375,9 @@ esp_ager(void *arg)
hrtime_t begin = gethrtime();
sadb_ager(&espstack->esp_sadb.s_v4, espstack->esp_pfkey_q,
- espstack->esp_sadb.s_ip_q, espstack->ipsecesp_reap_delay, ns);
+ espstack->ipsecesp_reap_delay, ns);
sadb_ager(&espstack->esp_sadb.s_v6, espstack->esp_pfkey_q,
- espstack->esp_sadb.s_ip_q, espstack->ipsecesp_reap_delay, ns);
+ espstack->ipsecesp_reap_delay, ns);
espstack->esp_event = sadb_retimeout(begin, espstack->esp_pfkey_q,
esp_ager, espstack,
@@ -583,7 +574,13 @@ ipsecesp_stack_fini(netstackid_t stackid, void *arg)
}
/*
- * ESP module open routine.
+ * ESP module open routine, which is here for keysock plumbing.
+ * Keysock is pushed over {AH,ESP} which is an artifact from the Bad Old
+ * Days of export control, and fears that ESP would not be allowed
+ * to be shipped at all by default. Eventually, keysock should
+ * either access AH and ESP via modstubs or krtld dependencies, or
+ * perhaps be folded in with AH and ESP into a single IPsec/netsec
+ * module ("netsec" if PF_KEY provides more than AH/ESP keying tables).
*/
/* ARGSUSED */
static int
@@ -606,56 +603,10 @@ ipsecesp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
espstack = ns->netstack_ipsecesp;
ASSERT(espstack != NULL);
- /*
- * ASSUMPTIONS (because I'm MT_OCEXCL):
- *
- * * I'm being pushed on top of IP for all my opens (incl. #1).
- * * Only ipsecesp_open() can write into esp_sadb.s_ip_q.
- * * Because of this, I can check lazily for esp_sadb.s_ip_q.
- *
- * If these assumptions are wrong, I'm in BIG trouble...
- */
-
q->q_ptr = espstack;
WR(q)->q_ptr = q->q_ptr;
- if (espstack->esp_sadb.s_ip_q == NULL) {
- struct T_unbind_req *tur;
-
- espstack->esp_sadb.s_ip_q = WR(q);
- /* Allocate an unbind... */
- espstack->esp_ip_unbind = allocb(sizeof (struct T_unbind_req),
- BPRI_HI);
-
- /*
- * Send down T_BIND_REQ to bind IPPROTO_ESP.
- * Handle the ACK here in ESP.
- */
- qprocson(q);
- if (espstack->esp_ip_unbind == NULL ||
- !sadb_t_bind_req(espstack->esp_sadb.s_ip_q, IPPROTO_ESP)) {
- if (espstack->esp_ip_unbind != NULL) {
- freeb(espstack->esp_ip_unbind);
- espstack->esp_ip_unbind = NULL;
- }
- q->q_ptr = NULL;
- netstack_rele(espstack->ipsecesp_netstack);
- return (ENOMEM);
- }
-
- espstack->esp_ip_unbind->b_datap->db_type = M_PROTO;
- tur = (struct T_unbind_req *)espstack->esp_ip_unbind->b_rptr;
- tur->PRIM_type = T_UNBIND_REQ;
- } else {
- qprocson(q);
- }
-
- /*
- * For now, there's not much I can do. I'll be getting a message
- * passed down to me from keysock (in my wput), and a T_BIND_ACK
- * up from IP (in my rput).
- */
-
+ qprocson(q);
return (0);
}
@@ -668,17 +619,6 @@ ipsecesp_close(queue_t *q)
ipsecesp_stack_t *espstack = (ipsecesp_stack_t *)q->q_ptr;
/*
- * If esp_sadb.s_ip_q is attached to this instance, send a
- * T_UNBIND_REQ to IP for the instance before doing
- * a qprocsoff().
- */
- if (WR(q) == espstack->esp_sadb.s_ip_q &&
- espstack->esp_ip_unbind != NULL) {
- putnext(WR(q), espstack->esp_ip_unbind);
- espstack->esp_ip_unbind = NULL;
- }
-
- /*
* Clean up q_ptr, if needed.
*/
qprocsoff(q);
@@ -693,45 +633,6 @@ ipsecesp_close(queue_t *q)
(void) quntimeout(q, espstack->esp_event);
}
- if (WR(q) == espstack->esp_sadb.s_ip_q) {
- /*
- * If the esp_sadb.s_ip_q is attached to this instance, find
- * another. The OCEXCL outer perimeter helps us here.
- */
- espstack->esp_sadb.s_ip_q = NULL;
-
- /*
- * Find a replacement queue for esp_sadb.s_ip_q.
- */
- if (espstack->esp_pfkey_q != NULL &&
- espstack->esp_pfkey_q != RD(q)) {
- /*
- * See if we can use the pfkey_q.
- */
- espstack->esp_sadb.s_ip_q = WR(espstack->esp_pfkey_q);
- }
-
- if (espstack->esp_sadb.s_ip_q == NULL ||
- !sadb_t_bind_req(espstack->esp_sadb.s_ip_q, IPPROTO_ESP)) {
- esp1dbg(espstack, ("ipsecesp: Can't reassign ip_q.\n"));
- espstack->esp_sadb.s_ip_q = NULL;
- } else {
- espstack->esp_ip_unbind =
- allocb(sizeof (struct T_unbind_req), BPRI_HI);
-
- if (espstack->esp_ip_unbind != NULL) {
- struct T_unbind_req *tur;
-
- espstack->esp_ip_unbind->b_datap->db_type =
- M_PROTO;
- tur = (struct T_unbind_req *)
- espstack->esp_ip_unbind->b_rptr;
- tur->PRIM_type = T_UNBIND_REQ;
- }
- /* If it's NULL, I can't do much here. */
- }
- }
-
netstack_rele(espstack->ipsecesp_netstack);
return (0);
}
@@ -834,26 +735,27 @@ esp_age_bytes(ipsa_t *assoc, uint64_t bytes, boolean_t inbound)
/*
* Do incoming NAT-T manipulations for packet.
+ * Returns NULL if the mblk chain is consumed.
*/
-static ipsec_status_t
+static mblk_t *
esp_fix_natt_checksums(mblk_t *data_mp, ipsa_t *assoc)
{
ipha_t *ipha = (ipha_t *)data_mp->b_rptr;
- tcpha_t *tcph;
+ tcpha_t *tcpha;
udpha_t *udpha;
/* Initialize to our inbound cksum adjustment... */
uint32_t sum = assoc->ipsa_inbound_cksum;
switch (ipha->ipha_protocol) {
case IPPROTO_TCP:
- tcph = (tcpha_t *)(data_mp->b_rptr +
+ tcpha = (tcpha_t *)(data_mp->b_rptr +
IPH_HDR_LENGTH(ipha));
#define DOWN_SUM(x) (x) = ((x) & 0xFFFF) + ((x) >> 16)
- sum += ~ntohs(tcph->tha_sum) & 0xFFFF;
+ sum += ~ntohs(tcpha->tha_sum) & 0xFFFF;
DOWN_SUM(sum);
DOWN_SUM(sum);
- tcph->tha_sum = ~htons(sum);
+ tcpha->tha_sum = ~htons(sum);
break;
case IPPROTO_UDP:
udpha = (udpha_t *)(data_mp->b_rptr + IPH_HDR_LENGTH(ipha));
@@ -876,7 +778,7 @@ esp_fix_natt_checksums(mblk_t *data_mp, ipsa_t *assoc)
*/
break;
}
- return (IPSEC_STATUS_SUCCESS);
+ return (data_mp);
}
@@ -968,10 +870,11 @@ esp_strip_header(mblk_t *data_mp, boolean_t isv4, uint32_t ivlen,
if (ip6h->ip6_nxt == IPPROTO_ESP) {
ip6h->ip6_nxt = nexthdr;
} else {
- ip6_pkt_t ipp;
+ ip_pkt_t ipp;
bzero(&ipp, sizeof (ipp));
- (void) ip_find_hdr_v6(data_mp, ip6h, &ipp, NULL);
+ (void) ip_find_hdr_v6(data_mp, ip6h, B_FALSE, &ipp,
+ NULL);
if (ipp.ipp_dstopts != NULL) {
ipp.ipp_dstopts->ip6d_nxt = nexthdr;
} else if (ipp.ipp_rthdr != NULL) {
@@ -1227,16 +1130,14 @@ esp_set_usetime(ipsa_t *assoc, boolean_t inbound)
/*
* Handle ESP inbound data for IPv4 and IPv6.
* On success returns B_TRUE, on failure returns B_FALSE and frees the
- * mblk chain ipsec_in_mp.
+ * mblk chain data_mp.
*/
-ipsec_status_t
-esp_inbound(mblk_t *ipsec_in_mp, void *arg)
+mblk_t *
+esp_inbound(mblk_t *data_mp, void *arg, ip_recv_attr_t *ira)
{
- mblk_t *data_mp = ipsec_in_mp->b_cont;
- ipsec_in_t *ii = (ipsec_in_t *)ipsec_in_mp->b_rptr;
esph_t *esph = (esph_t *)arg;
- ipsa_t *ipsa = ii->ipsec_in_esp_sa;
- netstack_t *ns = ii->ipsec_in_ns;
+ ipsa_t *ipsa = ira->ira_ipsec_esp_sa;
+ netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack;
ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
ipsec_stack_t *ipss = ns->netstack_ipsec;
@@ -1254,36 +1155,18 @@ esp_inbound(mblk_t *ipsec_in_mp, void *arg)
if (!sadb_replay_peek(ipsa, esph->esph_replay)) {
ESP_BUMP_STAT(espstack, replay_early_failures);
IP_ESP_BUMP_STAT(ipss, in_discards);
- /*
- * TODO: Extract inbound interface from the IPSEC_IN
- * message's ii->ipsec_in_rill_index.
- */
- ip_drop_packet(ipsec_in_mp, B_TRUE, NULL, NULL,
+ ip_drop_packet(data_mp, B_TRUE, ira->ira_ill,
DROPPER(ipss, ipds_esp_early_replay),
&espstack->esp_dropper);
- return (IPSEC_STATUS_FAILED);
+ BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards);
+ return (NULL);
}
/*
- * Has this packet already been processed by a hardware
- * IPsec accelerator?
- */
- if (ii->ipsec_in_accelerated) {
- ipsec_status_t rv;
- esp3dbg(espstack,
- ("esp_inbound: pkt processed by ill=%d isv6=%d\n",
- ii->ipsec_in_ill_index, !ii->ipsec_in_v4));
- rv = esp_inbound_accelerated(ipsec_in_mp,
- data_mp, ii->ipsec_in_v4, ipsa);
- return (rv);
- }
- ESP_BUMP_STAT(espstack, noaccel);
-
- /*
* Adjust the IP header's payload length to reflect the removal
* of the ICV.
*/
- if (!ii->ipsec_in_v4) {
+ if (!(ira->ira_flags & IRAF_IS_IPV4)) {
ip6_t *ip6h = (ip6_t *)data_mp->b_rptr;
ip6h->ip6_plen = htons(ntohs(ip6h->ip6_plen) -
ipsa->ipsa_mac_len);
@@ -1294,7 +1177,7 @@ esp_inbound(mblk_t *ipsec_in_mp, void *arg)
}
/* submit the request to the crypto framework */
- return (esp_submit_req_inbound(ipsec_in_mp, ipsa,
+ return (esp_submit_req_inbound(data_mp, ira, ipsa,
(uint8_t *)esph - data_mp->b_rptr));
}
@@ -1303,21 +1186,15 @@ esp_inbound(mblk_t *ipsec_in_mp, void *arg)
* Called while holding the algorithm lock.
*/
static void
-esp_insert_prop(sadb_prop_t *prop, ipsacq_t *acqrec, uint_t combs)
+esp_insert_prop(sadb_prop_t *prop, ipsacq_t *acqrec, uint_t combs,
+ netstack_t *ns)
{
sadb_comb_t *comb = (sadb_comb_t *)(prop + 1);
- ipsec_out_t *io;
ipsec_action_t *ap;
ipsec_prot_t *prot;
- netstack_t *ns;
- ipsecesp_stack_t *espstack;
- ipsec_stack_t *ipss;
+ ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
+ ipsec_stack_t *ipss = ns->netstack_ipsec;
- io = (ipsec_out_t *)acqrec->ipsacq_mp->b_rptr;
- ASSERT(io->ipsec_out_type == IPSEC_OUT);
- ns = io->ipsec_out_ns;
- espstack = ns->netstack_ipsecesp;
- ipss = ns->netstack_ipsec;
ASSERT(MUTEX_HELD(&ipss->ipsec_alg_lock));
prop->sadb_prop_exttype = SADB_EXT_PROPOSAL;
@@ -1327,9 +1204,10 @@ esp_insert_prop(sadb_prop_t *prop, ipsacq_t *acqrec, uint_t combs)
prop->sadb_prop_replay = espstack->ipsecesp_replay_size;
/*
- * Based upon algorithm properties, and what-not, prioritize
- * a proposal. If the IPSEC_OUT message has an algorithm specified,
- * use it first and foremost.
+ * Based upon algorithm properties, and what-not, prioritize a
+ * proposal, based on the ordering of the ESP algorithms in the
+ * alternatives in the policy rule or socket that was placed
+ * in the acquire record.
*
* For each action in policy list
* Add combination. If I've hit limit, return.
@@ -1456,7 +1334,7 @@ esp_send_acquire(ipsacq_t *acqrec, mblk_t *extended, netstack_t *ns)
/* Insert proposal here. */
prop = (sadb_prop_t *)(((uint64_t *)samsg) + samsg->sadb_msg_len);
- esp_insert_prop(prop, acqrec, combs);
+ esp_insert_prop(prop, acqrec, combs, ns);
samsg->sadb_msg_len += prop->sadb_prop_len;
msgmp->b_wptr += SADB_64TO8(samsg->sadb_msg_len);
@@ -1756,13 +1634,11 @@ esp_port_freshness(uint32_t ports, ipsa_t *assoc)
* If authentication was performed on the packet, this function is called
* only if the authentication succeeded.
* On success returns B_TRUE, on failure returns B_FALSE and frees the
- * mblk chain ipsec_in_mp.
+ * mblk chain data_mp.
*/
-static ipsec_status_t
-esp_in_done(mblk_t *ipsec_in_mp)
+static mblk_t *
+esp_in_done(mblk_t *data_mp, ip_recv_attr_t *ira, ipsec_crypto_t *ic)
{
- ipsec_in_t *ii = (ipsec_in_t *)ipsec_in_mp->b_rptr;
- mblk_t *data_mp;
ipsa_t *assoc;
uint_t espstart;
uint32_t ivlen = 0;
@@ -1770,11 +1646,11 @@ esp_in_done(mblk_t *ipsec_in_mp)
esph_t *esph;
kstat_named_t *counter;
boolean_t is_natt;
- netstack_t *ns = ii->ipsec_in_ns;
+ netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack;
ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
ipsec_stack_t *ipss = ns->netstack_ipsec;
- assoc = ii->ipsec_in_esp_sa;
+ assoc = ira->ira_ipsec_esp_sa;
ASSERT(assoc != NULL);
is_natt = ((assoc->ipsa_flags & IPSA_F_NATT) != 0);
@@ -1782,26 +1658,25 @@ esp_in_done(mblk_t *ipsec_in_mp)
/* get the pointer to the ESP header */
if (assoc->ipsa_encr_alg == SADB_EALG_NULL) {
/* authentication-only ESP */
- espstart = ii->ipsec_in_crypto_data.cd_offset;
- processed_len = ii->ipsec_in_crypto_data.cd_length;
+ espstart = ic->ic_crypto_data.cd_offset;
+ processed_len = ic->ic_crypto_data.cd_length;
} else {
/* encryption present */
ivlen = assoc->ipsa_iv_len;
if (assoc->ipsa_auth_alg == SADB_AALG_NONE) {
/* encryption-only ESP */
- espstart = ii->ipsec_in_crypto_data.cd_offset -
+ espstart = ic->ic_crypto_data.cd_offset -
sizeof (esph_t) - assoc->ipsa_iv_len;
- processed_len = ii->ipsec_in_crypto_data.cd_length +
+ processed_len = ic->ic_crypto_data.cd_length +
ivlen;
} else {
/* encryption with authentication */
- espstart = ii->ipsec_in_crypto_dual_data.dd_offset1;
- processed_len = ii->ipsec_in_crypto_dual_data.dd_len2 +
+ espstart = ic->ic_crypto_dual_data.dd_offset1;
+ processed_len = ic->ic_crypto_dual_data.dd_len2 +
ivlen;
}
}
- data_mp = ipsec_in_mp->b_cont;
esph = (esph_t *)(data_mp->b_rptr + espstart);
if (assoc->ipsa_auth_alg != IPSA_AALG_NONE ||
@@ -1840,8 +1715,11 @@ esp_in_done(mblk_t *ipsec_in_mp)
goto drop_and_bail;
}
- if (is_natt)
- esp_port_freshness(ii->ipsec_in_esp_udp_ports, assoc);
+ if (is_natt) {
+ ASSERT(ira->ira_flags & IRAF_ESP_UDP_PORTS);
+ ASSERT(ira->ira_esp_udp_ports != 0);
+ esp_port_freshness(ira->ira_esp_udp_ports, assoc);
+ }
}
esp_set_usetime(assoc, B_TRUE);
@@ -1863,44 +1741,41 @@ esp_in_done(mblk_t *ipsec_in_mp)
* spews "branch, predict taken" code for this.
*/
- if (esp_strip_header(data_mp, ii->ipsec_in_v4, ivlen, &counter,
- espstack)) {
-
- if (is_system_labeled()) {
- cred_t *cr = assoc->ipsa_cred;
+ if (esp_strip_header(data_mp, (ira->ira_flags & IRAF_IS_IPV4),
+ ivlen, &counter, espstack)) {
- if (cr != NULL) {
- mblk_setcred(data_mp, cr, NOPID);
+ if (is_system_labeled() && assoc->ipsa_tsl != NULL) {
+ if (!ip_recv_attr_replace_label(ira, assoc->ipsa_tsl)) {
+ ip_drop_packet(data_mp, B_TRUE, ira->ira_ill,
+ DROPPER(ipss, ipds_ah_nomem),
+ &espstack->esp_dropper);
+ BUMP_MIB(ira->ira_ill->ill_ip_mib,
+ ipIfStatsInDiscards);
+ return (NULL);
}
-
}
if (is_natt)
return (esp_fix_natt_checksums(data_mp, assoc));
- ASSERT(!is_system_labeled() || (DB_CRED(data_mp) != NULL));
-
if (assoc->ipsa_state == IPSA_STATE_IDLE) {
/*
* Cluster buffering case. Tell caller that we're
* handling the packet.
*/
- sadb_buf_pkt(assoc, ipsec_in_mp, ns);
- return (IPSEC_STATUS_PENDING);
+ sadb_buf_pkt(assoc, data_mp, ira);
+ return (NULL);
}
- return (IPSEC_STATUS_SUCCESS);
+ return (data_mp);
}
esp1dbg(espstack, ("esp_in_done: esp_strip_header() failed\n"));
drop_and_bail:
IP_ESP_BUMP_STAT(ipss, in_discards);
- /*
- * TODO: Extract inbound interface from the IPSEC_IN message's
- * ii->ipsec_in_rill_index.
- */
- ip_drop_packet(ipsec_in_mp, B_TRUE, NULL, NULL, counter,
+ ip_drop_packet(data_mp, B_TRUE, ira->ira_ill, counter,
&espstack->esp_dropper);
- return (IPSEC_STATUS_FAILED);
+ BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards);
+ return (NULL);
}
/*
@@ -1908,11 +1783,10 @@ drop_and_bail:
* argument is freed.
*/
static void
-esp_log_bad_auth(mblk_t *ipsec_in)
+esp_log_bad_auth(mblk_t *mp, ip_recv_attr_t *ira)
{
- ipsec_in_t *ii = (ipsec_in_t *)ipsec_in->b_rptr;
- ipsa_t *assoc = ii->ipsec_in_esp_sa;
- netstack_t *ns = ii->ipsec_in_ns;
+ ipsa_t *assoc = ira->ira_ipsec_esp_sa;
+ netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack;
ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
ipsec_stack_t *ipss = ns->netstack_ipsec;
@@ -1928,11 +1802,7 @@ esp_log_bad_auth(mblk_t *ipsec_in)
espstack->ipsecesp_netstack);
IP_ESP_BUMP_STAT(ipss, in_discards);
- /*
- * TODO: Extract inbound interface from the IPSEC_IN
- * message's ii->ipsec_in_rill_index.
- */
- ip_drop_packet(ipsec_in, B_TRUE, NULL, NULL,
+ ip_drop_packet(mp, B_TRUE, ira->ira_ill,
DROPPER(ipss, ipds_esp_bad_auth),
&espstack->esp_dropper);
}
@@ -1944,148 +1814,205 @@ esp_log_bad_auth(mblk_t *ipsec_in)
* Returns B_TRUE if the AH processing was not needed or if it was
* performed successfully. Returns B_FALSE and consumes the passed mblk
* if AH processing was required but could not be performed.
+ *
+ * Returns data_mp unless data_mp was consumed/queued.
*/
-static boolean_t
-esp_do_outbound_ah(mblk_t *ipsec_mp)
+static mblk_t *
+esp_do_outbound_ah(mblk_t *data_mp, ip_xmit_attr_t *ixa)
{
- ipsec_out_t *io = (ipsec_out_t *)ipsec_mp->b_rptr;
- ipsec_status_t ipsec_rc;
ipsec_action_t *ap;
- ap = io->ipsec_out_act;
+ ap = ixa->ixa_ipsec_action;
if (ap == NULL) {
- ipsec_policy_t *pp = io->ipsec_out_policy;
+ ipsec_policy_t *pp = ixa->ixa_ipsec_policy;
ap = pp->ipsp_act;
}
if (!ap->ipa_want_ah)
- return (B_TRUE);
+ return (data_mp);
- ASSERT(io->ipsec_out_ah_done == B_FALSE);
-
- if (io->ipsec_out_ah_sa == NULL) {
- if (!ipsec_outbound_sa(ipsec_mp, IPPROTO_AH)) {
- sadb_acquire(ipsec_mp, io, B_TRUE, B_FALSE);
- return (B_FALSE);
+ /*
+ * Normally the AH SA would have already been put in place
+ * but it could have been flushed so we need to look for it.
+ */
+ if (ixa->ixa_ipsec_ah_sa == NULL) {
+ if (!ipsec_outbound_sa(data_mp, ixa, IPPROTO_AH)) {
+ sadb_acquire(data_mp, ixa, B_TRUE, B_FALSE);
+ return (NULL);
}
}
- ASSERT(io->ipsec_out_ah_sa != NULL);
+ ASSERT(ixa->ixa_ipsec_ah_sa != NULL);
- io->ipsec_out_ah_done = B_TRUE;
- ipsec_rc = io->ipsec_out_ah_sa->ipsa_output_func(ipsec_mp);
- return (ipsec_rc == IPSEC_STATUS_SUCCESS);
+ data_mp = ixa->ixa_ipsec_ah_sa->ipsa_output_func(data_mp, ixa);
+ return (data_mp);
}
/*
* Kernel crypto framework callback invoked after completion of async
- * crypto requests.
+ * crypto requests for outbound packets.
*/
static void
-esp_kcf_callback(void *arg, int status)
+esp_kcf_callback_outbound(void *arg, int status)
{
- mblk_t *ipsec_mp = (mblk_t *)arg;
- ipsec_in_t *ii = (ipsec_in_t *)ipsec_mp->b_rptr;
- ipsec_out_t *io = (ipsec_out_t *)ipsec_mp->b_rptr;
- boolean_t is_inbound = (ii->ipsec_in_type == IPSEC_IN);
- netstackid_t stackid;
- netstack_t *ns, *ns_arg;
- ipsecesp_stack_t *espstack;
+ mblk_t *mp = (mblk_t *)arg;
+ mblk_t *async_mp;
+ netstack_t *ns;
ipsec_stack_t *ipss;
+ ipsecesp_stack_t *espstack;
+ mblk_t *data_mp;
+ ip_xmit_attr_t ixas;
+ ipsec_crypto_t *ic;
+ ill_t *ill;
- ASSERT(ipsec_mp->b_cont != NULL);
+ /*
+ * First remove the ipsec_crypto_t mblk
+ * Note that we need to ipsec_free_crypto_data(mp) once done with ic.
+ */
+ async_mp = ipsec_remove_crypto_data(mp, &ic);
+ ASSERT(async_mp != NULL);
- if (is_inbound) {
- stackid = ii->ipsec_in_stackid;
- ns_arg = ii->ipsec_in_ns;
+ /*
+ * Extract the ip_xmit_attr_t from the first mblk.
+ * Verifies that the netstack and ill is still around; could
+ * have vanished while kEF was doing its work.
+ * On successful return we have a nce_t and the ill/ipst can't
+ * disappear until we do the nce_refrele in ixa_cleanup.
+ */
+ data_mp = async_mp->b_cont;
+ async_mp->b_cont = NULL;
+ if (!ip_xmit_attr_from_mblk(async_mp, &ixas)) {
+ /* Disappeared on us - no ill/ipst for MIB */
+ /* We have nowhere to do stats since ixa_ipst could be NULL */
+ if (ixas.ixa_nce != NULL) {
+ ill = ixas.ixa_nce->nce_ill;
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
+ ip_drop_output("ipIfStatsOutDiscards", data_mp, ill);
+ }
+ freemsg(data_mp);
+ goto done;
+ }
+ ns = ixas.ixa_ipst->ips_netstack;
+ espstack = ns->netstack_ipsecesp;
+ ipss = ns->netstack_ipsec;
+ ill = ixas.ixa_nce->nce_ill;
+
+ if (status == CRYPTO_SUCCESS) {
+ /*
+ * If a ICV was computed, it was stored by the
+ * crypto framework at the end of the packet.
+ */
+ ipha_t *ipha = (ipha_t *)data_mp->b_rptr;
+
+ esp_set_usetime(ixas.ixa_ipsec_esp_sa, B_FALSE);
+ /* NAT-T packet. */
+ if (IPH_HDR_VERSION(ipha) == IP_VERSION &&
+ ipha->ipha_protocol == IPPROTO_UDP)
+ esp_prepare_udp(ns, data_mp, ipha);
+
+ /* do AH processing if needed */
+ data_mp = esp_do_outbound_ah(data_mp, &ixas);
+ if (data_mp == NULL)
+ goto done;
+
+ (void) ip_output_post_ipsec(data_mp, &ixas);
} else {
- stackid = io->ipsec_out_stackid;
- ns_arg = io->ipsec_out_ns;
+ /* Outbound shouldn't see invalid MAC */
+ ASSERT(status != CRYPTO_INVALID_MAC);
+
+ esp1dbg(espstack,
+ ("esp_kcf_callback_outbound: crypto failed with 0x%x\n",
+ status));
+ ESP_BUMP_STAT(espstack, crypto_failures);
+ ESP_BUMP_STAT(espstack, out_discards);
+ ip_drop_packet(data_mp, B_FALSE, ill,
+ DROPPER(ipss, ipds_esp_crypto_failed),
+ &espstack->esp_dropper);
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
}
+done:
+ ixa_cleanup(&ixas);
+ (void) ipsec_free_crypto_data(mp);
+}
+
+/*
+ * Kernel crypto framework callback invoked after completion of async
+ * crypto requests for inbound packets.
+ */
+static void
+esp_kcf_callback_inbound(void *arg, int status)
+{
+ mblk_t *mp = (mblk_t *)arg;
+ mblk_t *async_mp;
+ netstack_t *ns;
+ ipsecesp_stack_t *espstack;
+ ipsec_stack_t *ipss;
+ mblk_t *data_mp;
+ ip_recv_attr_t iras;
+ ipsec_crypto_t *ic;
/*
- * Verify that the netstack is still around; could have vanished
- * while kEf was doing its work.
+ * First remove the ipsec_crypto_t mblk
+ * Note that we need to ipsec_free_crypto_data(mp) once done with ic.
*/
- ns = netstack_find_by_stackid(stackid);
- if (ns == NULL || ns != ns_arg) {
- /* Disappeared on us */
- if (ns != NULL)
- netstack_rele(ns);
- freemsg(ipsec_mp);
- return;
+ async_mp = ipsec_remove_crypto_data(mp, &ic);
+ ASSERT(async_mp != NULL);
+
+ /*
+ * Extract the ip_recv_attr_t from the first mblk.
+ * Verifies that the netstack and ill is still around; could
+ * have vanished while kEf was doing its work.
+ */
+ data_mp = async_mp->b_cont;
+ async_mp->b_cont = NULL;
+ if (!ip_recv_attr_from_mblk(async_mp, &iras)) {
+ /* The ill or ip_stack_t disappeared on us */
+ ip_drop_input("ip_recv_attr_from_mblk", data_mp, NULL);
+ freemsg(data_mp);
+ goto done;
}
+ ns = iras.ira_ill->ill_ipst->ips_netstack;
espstack = ns->netstack_ipsecesp;
ipss = ns->netstack_ipsec;
if (status == CRYPTO_SUCCESS) {
- if (is_inbound) {
- if (esp_in_done(ipsec_mp) != IPSEC_STATUS_SUCCESS) {
- netstack_rele(ns);
- return;
- }
- /* finish IPsec processing */
- ip_fanout_proto_again(ipsec_mp, NULL, NULL, NULL);
- } else {
- /*
- * If a ICV was computed, it was stored by the
- * crypto framework at the end of the packet.
- */
- ipha_t *ipha = (ipha_t *)ipsec_mp->b_cont->b_rptr;
-
- esp_set_usetime(io->ipsec_out_esp_sa, B_FALSE);
- /* NAT-T packet. */
- if (ipha->ipha_protocol == IPPROTO_UDP)
- esp_prepare_udp(ns, ipsec_mp->b_cont, ipha);
-
- /* do AH processing if needed */
- if (!esp_do_outbound_ah(ipsec_mp)) {
- netstack_rele(ns);
- return;
- }
- /* finish IPsec processing */
- if (IPH_HDR_VERSION(ipha) == IP_VERSION) {
- ip_wput_ipsec_out(NULL, ipsec_mp, ipha, NULL,
- NULL);
- } else {
- ip6_t *ip6h = (ip6_t *)ipha;
- ip_wput_ipsec_out_v6(NULL, ipsec_mp, ip6h,
- NULL, NULL);
- }
- }
+ data_mp = esp_in_done(data_mp, &iras, ic);
+ if (data_mp == NULL)
+ goto done;
+ /* finish IPsec processing */
+ ip_input_post_ipsec(data_mp, &iras);
} else if (status == CRYPTO_INVALID_MAC) {
- esp_log_bad_auth(ipsec_mp);
-
+ esp_log_bad_auth(data_mp, &iras);
} else {
esp1dbg(espstack,
("esp_kcf_callback: crypto failed with 0x%x\n",
status));
ESP_BUMP_STAT(espstack, crypto_failures);
- if (is_inbound)
- IP_ESP_BUMP_STAT(ipss, in_discards);
- else
- ESP_BUMP_STAT(espstack, out_discards);
- ip_drop_packet(ipsec_mp, is_inbound, NULL, NULL,
+ IP_ESP_BUMP_STAT(ipss, in_discards);
+ ip_drop_packet(data_mp, B_TRUE, iras.ira_ill,
DROPPER(ipss, ipds_esp_crypto_failed),
&espstack->esp_dropper);
+ BUMP_MIB(iras.ira_ill->ill_ip_mib, ipIfStatsInDiscards);
}
- netstack_rele(ns);
+done:
+ ira_cleanup(&iras, B_TRUE);
+ (void) ipsec_free_crypto_data(mp);
}
/*
* Invoked on crypto framework failure during inbound and outbound processing.
*/
static void
-esp_crypto_failed(mblk_t *mp, boolean_t is_inbound, int kef_rc,
- ipsecesp_stack_t *espstack)
+esp_crypto_failed(mblk_t *data_mp, boolean_t is_inbound, int kef_rc,
+ ill_t *ill, ipsecesp_stack_t *espstack)
{
ipsec_stack_t *ipss = espstack->ipsecesp_netstack->netstack_ipsec;
esp1dbg(espstack, ("crypto failed for %s ESP with 0x%x\n",
is_inbound ? "inbound" : "outbound", kef_rc));
- ip_drop_packet(mp, is_inbound, NULL, NULL,
+ ip_drop_packet(data_mp, is_inbound, ill,
DROPPER(ipss, ipds_esp_crypto_failed),
&espstack->esp_dropper);
ESP_BUMP_STAT(espstack, crypto_failures);
@@ -2095,11 +2022,14 @@ esp_crypto_failed(mblk_t *mp, boolean_t is_inbound, int kef_rc,
ESP_BUMP_STAT(espstack, out_discards);
}
-#define ESP_INIT_CALLREQ(_cr) { \
- (_cr)->cr_flag = CRYPTO_SKIP_REQID|CRYPTO_RESTRICTED; \
- (_cr)->cr_callback_arg = ipsec_mp; \
- (_cr)->cr_callback_func = esp_kcf_callback; \
-}
+/*
+ * A statement-equivalent macro, _cr MUST point to a modifiable
+ * crypto_call_req_t.
+ */
+#define ESP_INIT_CALLREQ(_cr, _mp, _callback) \
+ (_cr)->cr_flag = CRYPTO_SKIP_REQID|CRYPTO_ALWAYS_QUEUE; \
+ (_cr)->cr_callback_arg = (_mp); \
+ (_cr)->cr_callback_func = (_callback)
#define ESP_INIT_CRYPTO_MAC(mac, icvlen, icvbuf) { \
(mac)->cd_format = CRYPTO_DATA_RAW; \
@@ -2132,44 +2062,45 @@ esp_crypto_failed(mblk_t *mp, boolean_t is_inbound, int kef_rc,
(data)->dd_offset2 = off2; \
}
-static ipsec_status_t
-esp_submit_req_inbound(mblk_t *ipsec_mp, ipsa_t *assoc, uint_t esph_offset)
+/*
+ * Returns data_mp if successfully completed the request. Returns
+ * NULL if it failed (and increments InDiscards) or if it is pending.
+ */
+static mblk_t *
+esp_submit_req_inbound(mblk_t *esp_mp, ip_recv_attr_t *ira,
+ ipsa_t *assoc, uint_t esph_offset)
{
- ipsec_in_t *ii = (ipsec_in_t *)ipsec_mp->b_rptr;
- boolean_t do_auth;
uint_t auth_offset, msg_len, auth_len;
- crypto_call_req_t call_req;
- mblk_t *esp_mp;
+ crypto_call_req_t call_req, *callrp;
+ mblk_t *mp;
esph_t *esph_ptr;
- int kef_rc = CRYPTO_FAILED;
+ int kef_rc;
uint_t icv_len = assoc->ipsa_mac_len;
crypto_ctx_template_t auth_ctx_tmpl;
- boolean_t do_encr;
+ boolean_t do_auth, do_encr, force;
uint_t encr_offset, encr_len;
uint_t iv_len = assoc->ipsa_iv_len;
crypto_ctx_template_t encr_ctx_tmpl;
- netstack_t *ns = ii->ipsec_in_ns;
- ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
- ipsec_stack_t *ipss = ns->netstack_ipsec;
+ ipsec_crypto_t *ic, icstack;
uchar_t *iv_ptr;
-
- ASSERT(ii->ipsec_in_type == IPSEC_IN);
-
- /*
- * In case kEF queues and calls back, keep netstackid_t for
- * verification that the IP instance is still around in
- * esp_kcf_callback().
- */
- ASSERT(ii->ipsec_in_stackid == ns->netstack_stackid);
+ netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack;
+ ipsec_stack_t *ipss = ns->netstack_ipsec;
+ ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
do_auth = assoc->ipsa_auth_alg != SADB_AALG_NONE;
do_encr = assoc->ipsa_encr_alg != SADB_EALG_NULL;
+ force = (assoc->ipsa_flags & IPSA_F_ASYNC);
+
+#ifdef IPSEC_LATENCY_TEST
+ kef_rc = CRYPTO_SUCCESS;
+#else
+ kef_rc = CRYPTO_FAILED;
+#endif
/*
* An inbound packet is of the form:
- * IPSEC_IN -> [IP,options,ESP,IV,data,ICV,pad]
+ * [IP,options,ESP,IV,data,ICV,pad]
*/
- esp_mp = ipsec_mp->b_cont;
esph_ptr = (esph_t *)(esp_mp->b_rptr + esph_offset);
iv_ptr = (uchar_t *)(esph_ptr + 1);
/* Packet length starting at IP header ending after ESP ICV. */
@@ -2178,8 +2109,6 @@ esp_submit_req_inbound(mblk_t *ipsec_mp, ipsa_t *assoc, uint_t esph_offset)
encr_offset = esph_offset + sizeof (esph_t) + iv_len;
encr_len = msg_len - encr_offset;
- ESP_INIT_CALLREQ(&call_req);
-
/*
* Counter mode algs need a nonce. This is setup in sadb_common_add().
* If for some reason we are using a SA which does not have a nonce
@@ -2187,23 +2116,40 @@ esp_submit_req_inbound(mblk_t *ipsec_mp, ipsa_t *assoc, uint_t esph_offset)
*/
if ((assoc->ipsa_flags & IPSA_F_COUNTERMODE) &&
(assoc->ipsa_nonce == NULL)) {
- ip_drop_packet(ipsec_mp, B_FALSE, NULL, NULL,
+ ip_drop_packet(esp_mp, B_TRUE, ira->ira_ill,
DROPPER(ipss, ipds_esp_nomem), &espstack->esp_dropper);
- return (IPSEC_STATUS_FAILED);
+ return (NULL);
}
- if (do_auth) {
- /* force asynchronous processing? */
- if (ipss->ipsec_algs_exec_mode[IPSEC_ALG_AUTH] ==
- IPSEC_ALGS_EXEC_ASYNC)
- call_req.cr_flag |= CRYPTO_ALWAYS_QUEUE;
+ if (force) {
+ /* We are doing asynch; allocate mblks to hold state */
+ if ((mp = ip_recv_attr_to_mblk(ira)) == NULL ||
+ (mp = ipsec_add_crypto_data(mp, &ic)) == NULL) {
+ BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards", esp_mp,
+ ira->ira_ill);
+ return (NULL);
+ }
+ linkb(mp, esp_mp);
+ callrp = &call_req;
+ ESP_INIT_CALLREQ(callrp, mp, esp_kcf_callback_inbound);
+ } else {
+ /*
+ * If we know we are going to do sync then ipsec_crypto_t
+ * should be on the stack.
+ */
+ ic = &icstack;
+ bzero(ic, sizeof (*ic));
+ callrp = NULL;
+ }
+ if (do_auth) {
/* authentication context template */
IPSEC_CTX_TMPL(assoc, ipsa_authtmpl, IPSEC_ALG_AUTH,
auth_ctx_tmpl);
/* ICV to be verified */
- ESP_INIT_CRYPTO_MAC(&ii->ipsec_in_crypto_mac,
+ ESP_INIT_CRYPTO_MAC(&ic->ic_crypto_mac,
icv_len, esp_mp->b_wptr - icv_len);
/* authentication starts at the ESP header */
@@ -2212,79 +2158,90 @@ esp_submit_req_inbound(mblk_t *ipsec_mp, ipsa_t *assoc, uint_t esph_offset)
if (!do_encr) {
/* authentication only */
/* initialize input data argument */
- ESP_INIT_CRYPTO_DATA(&ii->ipsec_in_crypto_data,
+ ESP_INIT_CRYPTO_DATA(&ic->ic_crypto_data,
esp_mp, auth_offset, auth_len);
/* call the crypto framework */
kef_rc = crypto_mac_verify(&assoc->ipsa_amech,
- &ii->ipsec_in_crypto_data,
+ &ic->ic_crypto_data,
&assoc->ipsa_kcfauthkey, auth_ctx_tmpl,
- &ii->ipsec_in_crypto_mac, &call_req);
+ &ic->ic_crypto_mac, callrp);
}
}
if (do_encr) {
- /* force asynchronous processing? */
- if (ipss->ipsec_algs_exec_mode[IPSEC_ALG_ENCR] ==
- IPSEC_ALGS_EXEC_ASYNC)
- call_req.cr_flag |= CRYPTO_ALWAYS_QUEUE;
-
/* encryption template */
IPSEC_CTX_TMPL(assoc, ipsa_encrtmpl, IPSEC_ALG_ENCR,
encr_ctx_tmpl);
/* Call the nonce update function. Also passes in IV */
(assoc->ipsa_noncefunc)(assoc, (uchar_t *)esph_ptr, encr_len,
- iv_ptr, &ii->ipsec_in_cmm, &ii->ipsec_in_crypto_data);
+ iv_ptr, &ic->ic_cmm, &ic->ic_crypto_data);
if (!do_auth) {
/* decryption only */
/* initialize input data argument */
- ESP_INIT_CRYPTO_DATA(&ii->ipsec_in_crypto_data,
+ ESP_INIT_CRYPTO_DATA(&ic->ic_crypto_data,
esp_mp, encr_offset, encr_len);
/* call the crypto framework */
kef_rc = crypto_decrypt((crypto_mechanism_t *)
- &ii->ipsec_in_cmm, &ii->ipsec_in_crypto_data,
+ &ic->ic_cmm, &ic->ic_crypto_data,
&assoc->ipsa_kcfencrkey, encr_ctx_tmpl,
- NULL, &call_req);
+ NULL, callrp);
}
}
if (do_auth && do_encr) {
/* dual operation */
/* initialize input data argument */
- ESP_INIT_CRYPTO_DUAL_DATA(&ii->ipsec_in_crypto_dual_data,
+ ESP_INIT_CRYPTO_DUAL_DATA(&ic->ic_crypto_dual_data,
esp_mp, auth_offset, auth_len,
encr_offset, encr_len - icv_len);
/* specify IV */
- ii->ipsec_in_crypto_dual_data.dd_miscdata = (char *)iv_ptr;
+ ic->ic_crypto_dual_data.dd_miscdata = (char *)iv_ptr;
/* call the framework */
kef_rc = crypto_mac_verify_decrypt(&assoc->ipsa_amech,
- &assoc->ipsa_emech, &ii->ipsec_in_crypto_dual_data,
+ &assoc->ipsa_emech, &ic->ic_crypto_dual_data,
&assoc->ipsa_kcfauthkey, &assoc->ipsa_kcfencrkey,
- auth_ctx_tmpl, encr_ctx_tmpl, &ii->ipsec_in_crypto_mac,
- NULL, &call_req);
+ auth_ctx_tmpl, encr_ctx_tmpl, &ic->ic_crypto_mac,
+ NULL, callrp);
}
switch (kef_rc) {
case CRYPTO_SUCCESS:
ESP_BUMP_STAT(espstack, crypto_sync);
- return (esp_in_done(ipsec_mp));
+ esp_mp = esp_in_done(esp_mp, ira, ic);
+ if (force) {
+ /* Free mp after we are done with ic */
+ mp = ipsec_free_crypto_data(mp);
+ (void) ip_recv_attr_free_mblk(mp);
+ }
+ return (esp_mp);
case CRYPTO_QUEUED:
- /* esp_kcf_callback() will be invoked on completion */
+ /* esp_kcf_callback_inbound() will be invoked on completion */
ESP_BUMP_STAT(espstack, crypto_async);
- return (IPSEC_STATUS_PENDING);
+ return (NULL);
case CRYPTO_INVALID_MAC:
+ if (force) {
+ mp = ipsec_free_crypto_data(mp);
+ esp_mp = ip_recv_attr_free_mblk(mp);
+ }
ESP_BUMP_STAT(espstack, crypto_sync);
- esp_log_bad_auth(ipsec_mp);
- return (IPSEC_STATUS_FAILED);
+ BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards);
+ esp_log_bad_auth(esp_mp, ira);
+ /* esp_mp was passed to ip_drop_packet */
+ return (NULL);
}
- esp_crypto_failed(ipsec_mp, B_TRUE, kef_rc, espstack);
- return (IPSEC_STATUS_FAILED);
+ mp = ipsec_free_crypto_data(mp);
+ esp_mp = ip_recv_attr_free_mblk(mp);
+ BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards);
+ esp_crypto_failed(esp_mp, B_TRUE, kef_rc, ira->ira_ill, espstack);
+ /* esp_mp was passed to ip_drop_packet */
+ return (NULL);
}
/*
@@ -2293,6 +2250,9 @@ esp_submit_req_inbound(mblk_t *ipsec_mp, ipsa_t *assoc, uint_t esph_offset)
* uses mblk-insertion to insert the UDP header.
* TODO - If there is an easy way to prep a packet for HW checksums, make
* it happen here.
+ * Note that this is used both before calling ip_output_simple and
+ * in the esp datapath. The former could use IXAF_SET_ULP_CKSUM but not the
+ * latter.
*/
static void
esp_prepare_udp(netstack_t *ns, mblk_t *mp, ipha_t *ipha)
@@ -2313,7 +2273,7 @@ esp_prepare_udp(netstack_t *ns, mblk_t *mp, ipha_t *ipha)
/* arr points to the IP header. */
arr = (uint16_t *)ipha;
IP_STAT(ns->netstack_ip, ip_out_sw_cksum);
- IP_STAT_UPDATE(ns->netstack_ip, ip_udp_out_sw_cksum_bytes,
+ IP_STAT_UPDATE(ns->netstack_ip, ip_out_sw_cksum_bytes,
ntohs(htons(ipha->ipha_length) - hlen));
/* arr[6-9] are the IP addresses. */
cksum = IP_UDP_CSUM_COMP + arr[6] + arr[7] + arr[8] + arr[9] +
@@ -2336,41 +2296,45 @@ esp_prepare_udp(netstack_t *ns, mblk_t *mp, ipha_t *ipha)
static void
actually_send_keepalive(void *arg)
{
- mblk_t *ipsec_mp = (mblk_t *)arg;
- ipsec_out_t *io = (ipsec_out_t *)ipsec_mp->b_rptr;
- ipha_t *ipha;
- netstack_t *ns;
-
- ASSERT(DB_TYPE(ipsec_mp) == M_CTL);
- ASSERT(io->ipsec_out_type == IPSEC_OUT);
- ASSERT(ipsec_mp->b_cont != NULL);
- ASSERT(DB_TYPE(ipsec_mp->b_cont) == M_DATA);
-
- ns = netstack_find_by_stackid(io->ipsec_out_stackid);
- if (ns == NULL || ns != io->ipsec_out_ns) {
- /* Just freemsg(). */
- if (ns != NULL)
- netstack_rele(ns);
- freemsg(ipsec_mp);
+ mblk_t *mp = (mblk_t *)arg;
+ ip_xmit_attr_t ixas;
+ netstack_t *ns;
+ netstackid_t stackid;
+
+ stackid = (netstackid_t)(uintptr_t)mp->b_prev;
+ mp->b_prev = NULL;
+ ns = netstack_find_by_stackid(stackid);
+ if (ns == NULL) {
+ /* Disappeared */
+ ip_drop_output("ipIfStatsOutDiscards", mp, NULL);
+ freemsg(mp);
return;
}
- ipha = (ipha_t *)ipsec_mp->b_cont->b_rptr;
- ip_wput_ipsec_out(NULL, ipsec_mp, ipha, NULL, NULL);
+ bzero(&ixas, sizeof (ixas));
+ ixas.ixa_zoneid = ALL_ZONES;
+ ixas.ixa_cred = kcred;
+ ixas.ixa_cpid = NOPID;
+ ixas.ixa_tsl = NULL;
+ ixas.ixa_ipst = ns->netstack_ip;
+ /* No ULP checksum; done by esp_prepare_udp */
+ ixas.ixa_flags = IXAF_IS_IPV4 | IXAF_NO_IPSEC;
+
+ (void) ip_output_simple(mp, &ixas);
+ ixa_cleanup(&ixas);
netstack_rele(ns);
}
/*
- * Send a one-byte UDP NAT-T keepalive. Construct an IPSEC_OUT too that'll
- * get fed into esp_send_udp/ip_wput_ipsec_out.
+ * Send a one-byte UDP NAT-T keepalive.
*/
void
ipsecesp_send_keepalive(ipsa_t *assoc)
{
- mblk_t *mp = NULL, *ipsec_mp = NULL;
- ipha_t *ipha;
- udpha_t *udpha;
- ipsec_out_t *io;
+ mblk_t *mp;
+ ipha_t *ipha;
+ udpha_t *udpha;
+ netstack_t *ns = assoc->ipsa_netstack;
ASSERT(MUTEX_NOT_HELD(&assoc->ipsa_lock));
@@ -2399,85 +2363,78 @@ ipsecesp_send_keepalive(ipsa_t *assoc)
mp->b_wptr = (uint8_t *)(udpha + 1);
*(mp->b_wptr++) = 0xFF;
- ipsec_mp = ipsec_alloc_ipsec_out(assoc->ipsa_netstack);
- if (ipsec_mp == NULL) {
- freeb(mp);
- return;
- }
- ipsec_mp->b_cont = mp;
- io = (ipsec_out_t *)ipsec_mp->b_rptr;
- io->ipsec_out_zoneid =
- netstackid_to_zoneid(assoc->ipsa_netstack->netstack_stackid);
- io->ipsec_out_stackid = assoc->ipsa_netstack->netstack_stackid;
+ esp_prepare_udp(ns, mp, ipha);
- esp_prepare_udp(assoc->ipsa_netstack, mp, ipha);
/*
* We're holding an isaf_t bucket lock, so pawn off the actual
* packet transmission to another thread. Just in case syncq
* processing causes a same-bucket packet to be processed.
*/
- if (taskq_dispatch(esp_taskq, actually_send_keepalive, ipsec_mp,
+ mp->b_prev = (mblk_t *)(uintptr_t)ns->netstack_stackid;
+
+ if (taskq_dispatch(esp_taskq, actually_send_keepalive, mp,
TQ_NOSLEEP) == 0) {
/* Assume no memory if taskq_dispatch() fails. */
- ip_drop_packet(ipsec_mp, B_FALSE, NULL, NULL,
- DROPPER(assoc->ipsa_netstack->netstack_ipsec,
- ipds_esp_nomem),
- &assoc->ipsa_netstack->netstack_ipsecesp->esp_dropper);
+ mp->b_prev = NULL;
+ ip_drop_packet(mp, B_FALSE, NULL,
+ DROPPER(ns->netstack_ipsec, ipds_esp_nomem),
+ &ns->netstack_ipsecesp->esp_dropper);
}
}
-static ipsec_status_t
-esp_submit_req_outbound(mblk_t *ipsec_mp, ipsa_t *assoc, uchar_t *icv_buf,
- uint_t payload_len)
+/*
+ * Returns mp if successfully completed the request. Returns
+ * NULL if it failed (and increments InDiscards) or if it is pending.
+ */
+static mblk_t *
+esp_submit_req_outbound(mblk_t *data_mp, ip_xmit_attr_t *ixa, ipsa_t *assoc,
+ uchar_t *icv_buf, uint_t payload_len)
{
- ipsec_out_t *io = (ipsec_out_t *)ipsec_mp->b_rptr;
uint_t auth_len;
- crypto_call_req_t call_req;
- mblk_t *esp_mp, *data_mp, *ip_mp;
+ crypto_call_req_t call_req, *callrp;
+ mblk_t *esp_mp;
esph_t *esph_ptr;
+ mblk_t *mp;
int kef_rc = CRYPTO_FAILED;
uint_t icv_len = assoc->ipsa_mac_len;
crypto_ctx_template_t auth_ctx_tmpl;
- boolean_t do_auth;
- boolean_t do_encr;
+ boolean_t do_auth, do_encr, force;
uint_t iv_len = assoc->ipsa_iv_len;
crypto_ctx_template_t encr_ctx_tmpl;
boolean_t is_natt = ((assoc->ipsa_flags & IPSA_F_NATT) != 0);
size_t esph_offset = (is_natt ? UDPH_SIZE : 0);
- netstack_t *ns = io->ipsec_out_ns;
+ netstack_t *ns = ixa->ixa_ipst->ips_netstack;
ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
+ ipsec_crypto_t *ic, icstack;
+ uchar_t *iv_ptr;
+ crypto_data_t *cd_ptr = NULL;
+ ill_t *ill = ixa->ixa_nce->nce_ill;
ipsec_stack_t *ipss = ns->netstack_ipsec;
- uchar_t *iv_ptr;
- crypto_data_t *cd_ptr = NULL;
esp3dbg(espstack, ("esp_submit_req_outbound:%s",
is_natt ? "natt" : "not natt"));
- ASSERT(io->ipsec_out_type == IPSEC_OUT);
-
- /*
- * In case kEF queues and calls back, keep netstackid_t for
- * verification that the IP instance is still around in
- * esp_kcf_callback().
- */
- io->ipsec_out_stackid = ns->netstack_stackid;
-
do_encr = assoc->ipsa_encr_alg != SADB_EALG_NULL;
do_auth = assoc->ipsa_auth_alg != SADB_AALG_NONE;
+ force = (assoc->ipsa_flags & IPSA_F_ASYNC);
+
+#ifdef IPSEC_LATENCY_TEST
+ kef_rc = CRYPTO_SUCCESS;
+#else
+ kef_rc = CRYPTO_FAILED;
+#endif
/*
* Outbound IPsec packets are of the form:
- * IPSEC_OUT -> [IP,options] -> [ESP,IV] -> [data] -> [pad,ICV]
+ * [IP,options] -> [ESP,IV] -> [data] -> [pad,ICV]
* unless it's NATT, then it's
- * IPSEC_OUT -> [IP,options] -> [udp][ESP,IV] -> [data] -> [pad,ICV]
+ * [IP,options] -> [udp][ESP,IV] -> [data] -> [pad,ICV]
* Get a pointer to the mblk containing the ESP header.
*/
- ip_mp = ipsec_mp->b_cont;
- esp_mp = ipsec_mp->b_cont->b_cont;
- ASSERT(ip_mp != NULL && esp_mp != NULL);
+ ASSERT(data_mp->b_cont != NULL);
+ esp_mp = data_mp->b_cont;
esph_ptr = (esph_t *)(esp_mp->b_rptr + esph_offset);
iv_ptr = (uchar_t *)(esph_ptr + 1);
- data_mp = ipsec_mp->b_cont->b_cont->b_cont;
/*
* Combined mode algs need a nonce. This is setup in sadb_common_add().
@@ -2486,25 +2443,42 @@ esp_submit_req_outbound(mblk_t *ipsec_mp, ipsa_t *assoc, uchar_t *icv_buf,
*/
if ((assoc->ipsa_flags & IPSA_F_COUNTERMODE) &&
(assoc->ipsa_nonce == NULL)) {
- ip_drop_packet(ipsec_mp, B_FALSE, NULL, NULL,
+ ip_drop_packet(data_mp, B_FALSE, NULL,
DROPPER(ipss, ipds_esp_nomem), &espstack->esp_dropper);
- return (IPSEC_STATUS_FAILED);
+ return (NULL);
}
- ESP_INIT_CALLREQ(&call_req);
+ if (force) {
+ /* We are doing asynch; allocate mblks to hold state */
+ if ((mp = ip_xmit_attr_to_mblk(ixa)) == NULL ||
+ (mp = ipsec_add_crypto_data(mp, &ic)) == NULL) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
+ ip_drop_output("ipIfStatsOutDiscards", data_mp, ill);
+ freemsg(data_mp);
+ return (NULL);
+ }
+
+ linkb(mp, data_mp);
+ callrp = &call_req;
+ ESP_INIT_CALLREQ(callrp, mp, esp_kcf_callback_outbound);
+ } else {
+ /*
+ * If we know we are going to do sync then ipsec_crypto_t
+ * should be on the stack.
+ */
+ ic = &icstack;
+ bzero(ic, sizeof (*ic));
+ callrp = NULL;
+ }
- if (do_auth) {
- /* force asynchronous processing? */
- if (ipss->ipsec_algs_exec_mode[IPSEC_ALG_AUTH] ==
- IPSEC_ALGS_EXEC_ASYNC)
- call_req.cr_flag |= CRYPTO_ALWAYS_QUEUE;
+ if (do_auth) {
/* authentication context template */
IPSEC_CTX_TMPL(assoc, ipsa_authtmpl, IPSEC_ALG_AUTH,
auth_ctx_tmpl);
/* where to store the computed mac */
- ESP_INIT_CRYPTO_MAC(&io->ipsec_out_crypto_mac,
+ ESP_INIT_CRYPTO_MAC(&ic->ic_crypto_mac,
icv_len, icv_buf);
/* authentication starts at the ESP header */
@@ -2512,35 +2486,30 @@ esp_submit_req_outbound(mblk_t *ipsec_mp, ipsa_t *assoc, uchar_t *icv_buf,
if (!do_encr) {
/* authentication only */
/* initialize input data argument */
- ESP_INIT_CRYPTO_DATA(&io->ipsec_out_crypto_data,
+ ESP_INIT_CRYPTO_DATA(&ic->ic_crypto_data,
esp_mp, esph_offset, auth_len);
/* call the crypto framework */
kef_rc = crypto_mac(&assoc->ipsa_amech,
- &io->ipsec_out_crypto_data,
+ &ic->ic_crypto_data,
&assoc->ipsa_kcfauthkey, auth_ctx_tmpl,
- &io->ipsec_out_crypto_mac, &call_req);
+ &ic->ic_crypto_mac, callrp);
}
}
if (do_encr) {
- /* force asynchronous processing? */
- if (ipss->ipsec_algs_exec_mode[IPSEC_ALG_ENCR] ==
- IPSEC_ALGS_EXEC_ASYNC)
- call_req.cr_flag |= CRYPTO_ALWAYS_QUEUE;
-
/* encryption context template */
IPSEC_CTX_TMPL(assoc, ipsa_encrtmpl, IPSEC_ALG_ENCR,
encr_ctx_tmpl);
/* Call the nonce update function. */
(assoc->ipsa_noncefunc)(assoc, (uchar_t *)esph_ptr, payload_len,
- iv_ptr, &io->ipsec_out_cmm, &io->ipsec_out_crypto_data);
+ iv_ptr, &ic->ic_cmm, &ic->ic_crypto_data);
if (!do_auth) {
/* encryption only, skip mblk that contains ESP hdr */
/* initialize input data argument */
- ESP_INIT_CRYPTO_DATA(&io->ipsec_out_crypto_data,
- data_mp, 0, payload_len);
+ ESP_INIT_CRYPTO_DATA(&ic->ic_crypto_data,
+ esp_mp->b_cont, 0, payload_len);
/*
* For combined mode ciphers, the ciphertext is the same
@@ -2556,20 +2525,19 @@ esp_submit_req_outbound(mblk_t *ipsec_mp, ipsa_t *assoc, uchar_t *icv_buf,
* for the cipher to use.
*/
if (assoc->ipsa_flags & IPSA_F_COMBINED) {
- bcopy(&io->ipsec_out_crypto_data,
- &io->ipsec_out_crypto_mac,
+ bcopy(&ic->ic_crypto_data,
+ &ic->ic_crypto_mac,
sizeof (crypto_data_t));
- io->ipsec_out_crypto_mac.cd_length =
+ ic->ic_crypto_mac.cd_length =
payload_len + icv_len;
- cd_ptr = &io->ipsec_out_crypto_mac;
+ cd_ptr = &ic->ic_crypto_mac;
}
/* call the crypto framework */
kef_rc = crypto_encrypt((crypto_mechanism_t *)
- &io->ipsec_out_cmm,
- &io->ipsec_out_crypto_data,
+ &ic->ic_cmm, &ic->ic_crypto_data,
&assoc->ipsa_kcfencrkey, encr_ctx_tmpl,
- cd_ptr, &call_req);
+ cd_ptr, callrp);
}
}
@@ -2584,49 +2552,58 @@ esp_submit_req_outbound(mblk_t *ipsec_mp, ipsa_t *assoc, uchar_t *icv_buf,
* the authentication at the ESP header, i.e. use an
* authentication offset of zero.
*/
- ESP_INIT_CRYPTO_DUAL_DATA(&io->ipsec_out_crypto_dual_data,
+ ESP_INIT_CRYPTO_DUAL_DATA(&ic->ic_crypto_dual_data,
esp_mp, MBLKL(esp_mp), payload_len, esph_offset, auth_len);
/* specify IV */
- io->ipsec_out_crypto_dual_data.dd_miscdata = (char *)iv_ptr;
+ ic->ic_crypto_dual_data.dd_miscdata = (char *)iv_ptr;
/* call the framework */
kef_rc = crypto_encrypt_mac(&assoc->ipsa_emech,
&assoc->ipsa_amech, NULL,
&assoc->ipsa_kcfencrkey, &assoc->ipsa_kcfauthkey,
encr_ctx_tmpl, auth_ctx_tmpl,
- &io->ipsec_out_crypto_dual_data,
- &io->ipsec_out_crypto_mac, &call_req);
+ &ic->ic_crypto_dual_data,
+ &ic->ic_crypto_mac, callrp);
}
switch (kef_rc) {
case CRYPTO_SUCCESS:
ESP_BUMP_STAT(espstack, crypto_sync);
esp_set_usetime(assoc, B_FALSE);
+ if (force) {
+ mp = ipsec_free_crypto_data(mp);
+ data_mp = ip_xmit_attr_free_mblk(mp);
+ }
if (is_natt)
- esp_prepare_udp(ns, ipsec_mp->b_cont,
- (ipha_t *)ipsec_mp->b_cont->b_rptr);
- return (IPSEC_STATUS_SUCCESS);
+ esp_prepare_udp(ns, data_mp, (ipha_t *)data_mp->b_rptr);
+ return (data_mp);
case CRYPTO_QUEUED:
- /* esp_kcf_callback() will be invoked on completion */
+ /* esp_kcf_callback_outbound() will be invoked on completion */
ESP_BUMP_STAT(espstack, crypto_async);
- return (IPSEC_STATUS_PENDING);
+ return (NULL);
}
- esp_crypto_failed(ipsec_mp, B_FALSE, kef_rc, espstack);
- return (IPSEC_STATUS_FAILED);
+ if (force) {
+ mp = ipsec_free_crypto_data(mp);
+ data_mp = ip_xmit_attr_free_mblk(mp);
+ }
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
+ esp_crypto_failed(data_mp, B_FALSE, kef_rc, NULL, espstack);
+ /* data_mp was passed to ip_drop_packet */
+ return (NULL);
}
/*
* Handle outbound IPsec processing for IPv4 and IPv6
- * On success returns B_TRUE, on failure returns B_FALSE and frees the
- * mblk chain ipsec_in_mp.
+ *
+ * Returns data_mp if successfully completed the request. Returns
+ * NULL if it failed (and increments InDiscards) or if it is pending.
*/
-static ipsec_status_t
-esp_outbound(mblk_t *mp)
+static mblk_t *
+esp_outbound(mblk_t *data_mp, ip_xmit_attr_t *ixa)
{
- mblk_t *ipsec_out_mp, *data_mp, *espmp, *tailmp;
- ipsec_out_t *io;
+ mblk_t *espmp, *tailmp;
ipha_t *ipha;
ip6_t *ip6h;
esph_t *esph_ptr, *iv_ptr;
@@ -2640,17 +2617,11 @@ esp_outbound(mblk_t *mp)
uchar_t *icv_buf;
udpha_t *udpha;
boolean_t is_natt = B_FALSE;
- netstack_t *ns;
- ipsecesp_stack_t *espstack;
- ipsec_stack_t *ipss;
-
- ipsec_out_mp = mp;
- data_mp = ipsec_out_mp->b_cont;
-
- io = (ipsec_out_t *)ipsec_out_mp->b_rptr;
- ns = io->ipsec_out_ns;
- espstack = ns->netstack_ipsecesp;
- ipss = ns->netstack_ipsec;
+ netstack_t *ns = ixa->ixa_ipst->ips_netstack;
+ ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
+ ipsec_stack_t *ipss = ns->netstack_ipsec;
+ ill_t *ill = ixa->ixa_nce->nce_ill;
+ boolean_t need_refrele = B_FALSE;
ESP_BUMP_STAT(espstack, out_requests);
@@ -2662,65 +2633,73 @@ esp_outbound(mblk_t *mp)
* we might as well make use of msgpullup() and get the mblk into one
* contiguous piece!
*/
- ipsec_out_mp->b_cont = msgpullup(data_mp, -1);
- if (ipsec_out_mp->b_cont == NULL) {
+ tailmp = msgpullup(data_mp, -1);
+ if (tailmp == NULL) {
esp0dbg(("esp_outbound: msgpullup() failed, "
"dropping packet.\n"));
- ipsec_out_mp->b_cont = data_mp;
- /*
- * TODO: Find the outbound IRE for this packet and
- * pass it to ip_drop_packet().
- */
- ip_drop_packet(ipsec_out_mp, B_FALSE, NULL, NULL,
+ ip_drop_packet(data_mp, B_FALSE, ill,
DROPPER(ipss, ipds_esp_nomem),
&espstack->esp_dropper);
- return (IPSEC_STATUS_FAILED);
- } else {
- freemsg(data_mp);
- data_mp = ipsec_out_mp->b_cont;
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
+ return (NULL);
}
+ freemsg(data_mp);
+ data_mp = tailmp;
- assoc = io->ipsec_out_esp_sa;
+ assoc = ixa->ixa_ipsec_esp_sa;
ASSERT(assoc != NULL);
/*
* Get the outer IP header in shape to escape this system..
*/
- if (is_system_labeled() && (assoc->ipsa_ocred != NULL)) {
- int whack;
-
- mblk_setcred(data_mp, assoc->ipsa_ocred, NOPID);
- if (io->ipsec_out_v4)
- whack = sadb_whack_label(&data_mp, assoc);
- else
- whack = sadb_whack_label_v6(&data_mp, assoc);
- if (whack != 0) {
- ip_drop_packet(ipsec_out_mp, B_FALSE, NULL,
- NULL, DROPPER(ipss, ipds_esp_nomem),
+ if (is_system_labeled() && (assoc->ipsa_otsl != NULL)) {
+ /*
+ * Need to update packet with any CIPSO option and update
+ * ixa_tsl to capture the new label.
+ * We allocate a separate ixa for that purpose.
+ */
+ ixa = ip_xmit_attr_duplicate(ixa);
+ if (ixa == NULL) {
+ ip_drop_packet(data_mp, B_FALSE, ill,
+ DROPPER(ipss, ipds_esp_nomem),
&espstack->esp_dropper);
- return (IPSEC_STATUS_FAILED);
+ return (NULL);
}
- ipsec_out_mp->b_cont = data_mp;
- }
+ need_refrele = B_TRUE;
+ label_hold(assoc->ipsa_otsl);
+ ip_xmit_attr_replace_tsl(ixa, assoc->ipsa_otsl);
+
+ data_mp = sadb_whack_label(data_mp, assoc, ixa,
+ DROPPER(ipss, ipds_esp_nomem), &espstack->esp_dropper);
+ if (data_mp == NULL) {
+ /* Packet dropped by sadb_whack_label */
+ ixa_refrele(ixa);
+ return (NULL);
+ }
+ }
/*
* Reality check....
*/
ipha = (ipha_t *)data_mp->b_rptr; /* So we can call esp_acquire(). */
- if (io->ipsec_out_v4) {
+ if (ixa->ixa_flags & IXAF_IS_IPV4) {
+ ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);
+
af = AF_INET;
divpoint = IPH_HDR_LENGTH(ipha);
datalen = ntohs(ipha->ipha_length) - divpoint;
nhp = (uint8_t *)&ipha->ipha_protocol;
} else {
- ip6_pkt_t ipp;
+ ip_pkt_t ipp;
+
+ ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION);
af = AF_INET6;
ip6h = (ip6_t *)ipha;
bzero(&ipp, sizeof (ipp));
- divpoint = ip_find_hdr_v6(data_mp, ip6h, &ipp, NULL);
+ divpoint = ip_find_hdr_v6(data_mp, ip6h, B_FALSE, &ipp, NULL);
if (ipp.ipp_dstopts != NULL &&
ipp.ipp_dstopts->ip6d_nxt != IPPROTO_ROUTING) {
/*
@@ -2795,28 +2774,26 @@ esp_outbound(mblk_t *mp)
*/
if (!esp_age_bytes(assoc, datalen + padlen + iv_len + 2, B_FALSE)) {
- /*
- * TODO: Find the outbound IRE for this packet and
- * pass it to ip_drop_packet().
- */
- ip_drop_packet(mp, B_FALSE, NULL, NULL,
+ ip_drop_packet(data_mp, B_FALSE, ill,
DROPPER(ipss, ipds_esp_bytes_expire),
&espstack->esp_dropper);
- return (IPSEC_STATUS_FAILED);
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
+ if (need_refrele)
+ ixa_refrele(ixa);
+ return (NULL);
}
espmp = allocb(esplen, BPRI_HI);
if (espmp == NULL) {
ESP_BUMP_STAT(espstack, out_discards);
esp1dbg(espstack, ("esp_outbound: can't allocate espmp.\n"));
- /*
- * TODO: Find the outbound IRE for this packet and
- * pass it to ip_drop_packet().
- */
- ip_drop_packet(mp, B_FALSE, NULL, NULL,
+ ip_drop_packet(data_mp, B_FALSE, ill,
DROPPER(ipss, ipds_esp_nomem),
&espstack->esp_dropper);
- return (IPSEC_STATUS_FAILED);
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
+ if (need_refrele)
+ ixa_refrele(ixa);
+ return (NULL);
}
espmp->b_wptr += esplen;
esph_ptr = (esph_t *)espmp->b_rptr;
@@ -2853,14 +2830,13 @@ esp_outbound(mblk_t *mp)
ESP_BUMP_STAT(espstack, out_discards);
sadb_replay_delete(assoc);
- /*
- * TODO: Find the outbound IRE for this packet and
- * pass it to ip_drop_packet().
- */
- ip_drop_packet(mp, B_FALSE, NULL, NULL,
+ ip_drop_packet(data_mp, B_FALSE, ill,
DROPPER(ipss, ipds_esp_replay),
&espstack->esp_dropper);
- return (IPSEC_STATUS_FAILED);
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
+ if (need_refrele)
+ ixa_refrele(ixa);
+ return (NULL);
}
iv_ptr = (esph_ptr + 1);
@@ -2887,9 +2863,11 @@ esp_outbound(mblk_t *mp)
*/
if (!update_iv((uint8_t *)iv_ptr, espstack->esp_pfkey_q, assoc,
espstack)) {
- ip_drop_packet(mp, B_FALSE, NULL, NULL,
+ ip_drop_packet(data_mp, B_FALSE, ill,
DROPPER(ipss, ipds_esp_iv_wrap), &espstack->esp_dropper);
- return (IPSEC_STATUS_FAILED);
+ if (need_refrele)
+ ixa_refrele(ixa);
+ return (NULL);
}
/* Fix the IP header. */
@@ -2898,7 +2876,7 @@ esp_outbound(mblk_t *mp)
protocol = *nhp;
- if (io->ipsec_out_v4) {
+ if (ixa->ixa_flags & IXAF_IS_IPV4) {
ipha->ipha_length = htons(ntohs(ipha->ipha_length) + adj);
if (is_natt) {
*nhp = IPPROTO_UDP;
@@ -2922,15 +2900,14 @@ esp_outbound(mblk_t *mp)
if (!esp_insert_esp(data_mp, espmp, divpoint, espstack)) {
ESP_BUMP_STAT(espstack, out_discards);
/* NOTE: esp_insert_esp() only fails if there's no memory. */
- /*
- * TODO: Find the outbound IRE for this packet and
- * pass it to ip_drop_packet().
- */
- ip_drop_packet(mp, B_FALSE, NULL, NULL,
+ ip_drop_packet(data_mp, B_FALSE, ill,
DROPPER(ipss, ipds_esp_nomem),
&espstack->esp_dropper);
freeb(espmp);
- return (IPSEC_STATUS_FAILED);
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
+ if (need_refrele)
+ ixa_refrele(ixa);
+ return (NULL);
}
/* Append padding (and leave room for ICV). */
@@ -2941,14 +2918,13 @@ esp_outbound(mblk_t *mp)
if (tailmp->b_cont == NULL) {
ESP_BUMP_STAT(espstack, out_discards);
esp0dbg(("esp_outbound: Can't allocate tailmp.\n"));
- /*
- * TODO: Find the outbound IRE for this packet and
- * pass it to ip_drop_packet().
- */
- ip_drop_packet(mp, B_FALSE, NULL, NULL,
+ ip_drop_packet(data_mp, B_FALSE, ill,
DROPPER(ipss, ipds_esp_nomem),
&espstack->esp_dropper);
- return (IPSEC_STATUS_FAILED);
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
+ if (need_refrele)
+ ixa_refrele(ixa);
+ return (NULL);
}
tailmp = tailmp->b_cont;
}
@@ -2968,29 +2944,6 @@ esp_outbound(mblk_t *mp)
esp2dbg(espstack, (dump_msg(data_mp)));
/*
- * The packet is eligible for hardware acceleration if the
- * following conditions are satisfied:
- *
- * 1. the packet will not be fragmented
- * 2. the provider supports the algorithms specified by SA
- * 3. there is no pending control message being exchanged
- * 4. snoop is not attached
- * 5. the destination address is not a multicast address
- *
- * All five of these conditions are checked by IP prior to
- * sending the packet to ESP.
- *
- * But We, and We Alone, can, nay MUST check if the packet
- * is over NATT, and then disqualify it from hardware
- * acceleration.
- */
-
- if (io->ipsec_out_is_capab_ill && !(assoc->ipsa_flags & IPSA_F_NATT)) {
- return (esp_outbound_accelerated(ipsec_out_mp, mac_len));
- }
- ESP_BUMP_STAT(espstack, noaccel);
-
- /*
* Okay. I've set up the pre-encryption ESP. Let's do it!
*/
@@ -3002,32 +2955,23 @@ esp_outbound(mblk_t *mp)
icv_buf = NULL;
}
- return (esp_submit_req_outbound(ipsec_out_mp, assoc, icv_buf,
- datalen + padlen + 2));
+ data_mp = esp_submit_req_outbound(data_mp, ixa, assoc, icv_buf,
+ datalen + padlen + 2);
+ if (need_refrele)
+ ixa_refrele(ixa);
+ return (data_mp);
}
/*
* IP calls this to validate the ICMP errors that
* we got from the network.
*/
-ipsec_status_t
-ipsecesp_icmp_error(mblk_t *ipsec_mp)
+mblk_t *
+ipsecesp_icmp_error(mblk_t *data_mp, ip_recv_attr_t *ira)
{
- ipsec_in_t *ii = (ipsec_in_t *)ipsec_mp->b_rptr;
- boolean_t is_inbound = (ii->ipsec_in_type == IPSEC_IN);
- netstack_t *ns;
- ipsecesp_stack_t *espstack;
- ipsec_stack_t *ipss;
-
- if (is_inbound) {
- ns = ii->ipsec_in_ns;
- } else {
- ipsec_out_t *io = (ipsec_out_t *)ipsec_mp->b_rptr;
-
- ns = io->ipsec_out_ns;
- }
- espstack = ns->netstack_ipsecesp;
- ipss = ns->netstack_ipsec;
+ netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack;
+ ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
+ ipsec_stack_t *ipss = ns->netstack_ipsec;
/*
* Unless we get an entire packet back, this function is useless.
@@ -3044,55 +2988,10 @@ ipsecesp_icmp_error(mblk_t *ipsec_mp)
* very small, we discard here.
*/
IP_ESP_BUMP_STAT(ipss, in_discards);
- ip_drop_packet(ipsec_mp, B_TRUE, NULL, NULL,
+ ip_drop_packet(data_mp, B_TRUE, ira->ira_ill,
DROPPER(ipss, ipds_esp_icmp),
&espstack->esp_dropper);
- return (IPSEC_STATUS_FAILED);
-}
-
-/*
- * ESP module read put routine.
- */
-/* ARGSUSED */
-static void
-ipsecesp_rput(queue_t *q, mblk_t *mp)
-{
- ipsecesp_stack_t *espstack = (ipsecesp_stack_t *)q->q_ptr;
-
- ASSERT(mp->b_datap->db_type != M_CTL); /* No more IRE_DB_REQ. */
-
- switch (mp->b_datap->db_type) {
- case M_PROTO:
- case M_PCPROTO:
- /* TPI message of some sort. */
- switch (*((t_scalar_t *)mp->b_rptr)) {
- case T_BIND_ACK:
- esp3dbg(espstack,
- ("Thank you IP from ESP for T_BIND_ACK\n"));
- break;
- case T_ERROR_ACK:
- cmn_err(CE_WARN,
- "ipsecesp: ESP received T_ERROR_ACK from IP.");
- /*
- * Make esp_sadb.s_ip_q NULL, and in the
- * future, perhaps try again.
- */
- espstack->esp_sadb.s_ip_q = NULL;
- break;
- case T_OK_ACK:
- /* Probably from a (rarely sent) T_UNBIND_REQ. */
- break;
- default:
- esp0dbg(("Unknown M_{,PC}PROTO message.\n"));
- }
- freemsg(mp);
- break;
- default:
- /* For now, passthru message. */
- esp2dbg(espstack, ("ESP got unknown mblk type %d.\n",
- mp->b_datap->db_type));
- putnext(q, mp);
- }
+ return (NULL);
}
/*
@@ -3102,7 +3001,7 @@ ipsecesp_rput(queue_t *q, mblk_t *mp)
*/
static boolean_t
esp_register_out(uint32_t sequence, uint32_t pid, uint_t serial,
- ipsecesp_stack_t *espstack, mblk_t *in_mp)
+ ipsecesp_stack_t *espstack, cred_t *cr)
{
mblk_t *pfkey_msg_mp, *keysock_out_mp;
sadb_msg_t *samsg;
@@ -3121,7 +3020,7 @@ esp_register_out(uint32_t sequence, uint32_t pid, uint_t serial,
sadb_sens_t *sens;
size_t sens_len = 0;
sadb_ext_t *nextext;
- cred_t *sens_cr = NULL;
+ ts_label_t *sens_tsl = NULL;
/* Allocate the KEYSOCK_OUT. */
keysock_out_mp = sadb_keysock_out(serial);
@@ -3130,11 +3029,10 @@ esp_register_out(uint32_t sequence, uint32_t pid, uint_t serial,
return (B_FALSE);
}
- if (is_system_labeled() && (in_mp != NULL)) {
- sens_cr = msg_getcred(in_mp, NULL);
-
- if (sens_cr != NULL) {
- sens_len = sadb_sens_len_from_cred(sens_cr);
+ if (is_system_labeled() && (cr != NULL)) {
+ sens_tsl = crgetlabel(cr);
+ if (sens_tsl != NULL) {
+ sens_len = sadb_sens_len_from_label(sens_tsl);
allocsize += sens_len;
}
}
@@ -3268,10 +3166,10 @@ esp_register_out(uint32_t sequence, uint32_t pid, uint_t serial,
mutex_exit(&ipss->ipsec_alg_lock);
- if (sens_cr != NULL) {
+ if (sens_tsl != NULL) {
sens = (sadb_sens_t *)nextext;
- sadb_sens_from_cred(sens, SADB_EXT_SENSITIVITY,
- sens_cr, sens_len);
+ sadb_sens_from_label(sens, SADB_EXT_SENSITIVITY,
+ sens_tsl, sens_len);
nextext = (sadb_ext_t *)(((uint8_t *)sens) + sens_len);
}
@@ -3336,40 +3234,61 @@ ipsecesp_algs_changed(netstack_t *ns)
/*
* Stub function that taskq_dispatch() invokes to take the mblk (in arg)
- * and put() it into AH and STREAMS again.
+ * and send() it into ESP and IP again.
*/
static void
inbound_task(void *arg)
{
- esph_t *esph;
- mblk_t *mp = (mblk_t *)arg;
- ipsec_in_t *ii = (ipsec_in_t *)mp->b_rptr;
- netstack_t *ns;
- ipsecesp_stack_t *espstack;
- int ipsec_rc;
-
- ns = netstack_find_by_stackid(ii->ipsec_in_stackid);
- if (ns == NULL || ns != ii->ipsec_in_ns) {
- /* Just freemsg(). */
- if (ns != NULL)
- netstack_rele(ns);
+ mblk_t *mp = (mblk_t *)arg;
+ mblk_t *async_mp;
+ ip_recv_attr_t iras;
+
+ async_mp = mp;
+ mp = async_mp->b_cont;
+ async_mp->b_cont = NULL;
+ if (!ip_recv_attr_from_mblk(async_mp, &iras)) {
+ /* The ill or ip_stack_t disappeared on us */
+ ip_drop_input("ip_recv_attr_from_mblk", mp, NULL);
freemsg(mp);
- return;
+ goto done;
}
- espstack = ns->netstack_ipsecesp;
+ esp_inbound_restart(mp, &iras);
+done:
+ ira_cleanup(&iras, B_TRUE);
+}
+
+/*
+ * Restart ESP after the SA has been added.
+ */
+static void
+esp_inbound_restart(mblk_t *mp, ip_recv_attr_t *ira)
+{
+ esph_t *esph;
+ netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack;
+ ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
esp2dbg(espstack, ("in ESP inbound_task"));
ASSERT(espstack != NULL);
- esph = ipsec_inbound_esp_sa(mp, ns);
- if (esph != NULL) {
- ASSERT(ii->ipsec_in_esp_sa != NULL);
- ipsec_rc = ii->ipsec_in_esp_sa->ipsa_input_func(mp, esph);
- if (ipsec_rc == IPSEC_STATUS_SUCCESS)
- ip_fanout_proto_again(mp, NULL, NULL, NULL);
+ mp = ipsec_inbound_esp_sa(mp, ira, &esph);
+ if (mp == NULL)
+ return;
+
+ ASSERT(esph != NULL);
+ ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
+ ASSERT(ira->ira_ipsec_esp_sa != NULL);
+
+ mp = ira->ira_ipsec_esp_sa->ipsa_input_func(mp, esph, ira);
+ if (mp == NULL) {
+ /*
+ * Either it failed or is pending. In the former case
+ * ipIfStatsInDiscards was increased.
+ */
+ return;
}
- netstack_rele(ns);
+
+ ip_input_post_ipsec(mp, ira);
}
/*
@@ -3533,17 +3452,21 @@ esp_add_sa_finish(mblk_t *mp, sadb_msg_t *samsg, keysock_in_t *ksi,
if (larval != NULL)
lpkt = sadb_clear_lpkt(larval);
- rc = sadb_common_add(espstack->esp_sadb.s_ip_q, espstack->esp_pfkey_q,
+ rc = sadb_common_add(espstack->esp_pfkey_q,
mp, samsg, ksi, primary, secondary, larval, clone, is_inbound,
diagnostic, espstack->ipsecesp_netstack, &espstack->esp_sadb);
- if (rc == 0 && lpkt != NULL)
- rc = !taskq_dispatch(esp_taskq, inbound_task, lpkt, TQ_NOSLEEP);
-
- if (rc != 0) {
- ip_drop_packet(lpkt, B_TRUE, NULL, NULL,
- DROPPER(ipss, ipds_sadb_inlarval_timeout),
- &espstack->esp_dropper);
+ if (lpkt != NULL) {
+ if (rc == 0) {
+ rc = !taskq_dispatch(esp_taskq, inbound_task,
+ lpkt, TQ_NOSLEEP);
+ }
+ if (rc != 0) {
+ lpkt = ip_recv_attr_free_mblk(lpkt);
+ ip_drop_packet(lpkt, B_TRUE, NULL,
+ DROPPER(ipss, ipds_sadb_inlarval_timeout),
+ &espstack->esp_dropper);
+ }
}
/*
@@ -3551,45 +3474,78 @@ esp_add_sa_finish(mblk_t *mp, sadb_msg_t *samsg, keysock_in_t *ksi,
* esp_outbound() calls?
*/
+ /* Handle the packets queued waiting for the SA */
while (acq_msgs != NULL) {
- mblk_t *mp = acq_msgs;
+ mblk_t *asyncmp;
+ mblk_t *data_mp;
+ ip_xmit_attr_t ixas;
+ ill_t *ill;
+ asyncmp = acq_msgs;
acq_msgs = acq_msgs->b_next;
- mp->b_next = NULL;
- if (rc == 0) {
- if (ipsec_outbound_sa(mp, IPPROTO_ESP)) {
- ((ipsec_out_t *)(mp->b_rptr))->
- ipsec_out_esp_done = B_TRUE;
- if (esp_outbound(mp) == IPSEC_STATUS_SUCCESS) {
- ipha_t *ipha;
-
- /* do AH processing if needed */
- if (!esp_do_outbound_ah(mp))
- continue;
-
- ipha = (ipha_t *)mp->b_cont->b_rptr;
-
- /* finish IPsec processing */
- if (IPH_HDR_VERSION(ipha) ==
- IP_VERSION) {
- ip_wput_ipsec_out(NULL, mp,
- ipha, NULL, NULL);
- } else {
- ip6_t *ip6h = (ip6_t *)ipha;
- ip_wput_ipsec_out_v6(NULL,
- mp, ip6h, NULL, NULL);
- }
- }
- continue;
- }
+ asyncmp->b_next = NULL;
+
+ /*
+ * Extract the ip_xmit_attr_t from the first mblk.
+ * Verifies that the netstack and ill is still around; could
+ * have vanished while iked was doing its work.
+	 * On successful return we have a nce_t and the ill/ipst can't
+ * disappear until we do the nce_refrele in ixa_cleanup.
+ */
+ data_mp = asyncmp->b_cont;
+ asyncmp->b_cont = NULL;
+ if (!ip_xmit_attr_from_mblk(asyncmp, &ixas)) {
+ ESP_BUMP_STAT(espstack, out_discards);
+ ip_drop_packet(data_mp, B_FALSE, NULL,
+ DROPPER(ipss, ipds_sadb_acquire_timeout),
+ &espstack->esp_dropper);
+ } else if (rc != 0) {
+ ill = ixas.ixa_nce->nce_ill;
+ ESP_BUMP_STAT(espstack, out_discards);
+ ip_drop_packet(data_mp, B_FALSE, ill,
+ DROPPER(ipss, ipds_sadb_acquire_timeout),
+ &espstack->esp_dropper);
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
+ } else {
+ esp_outbound_finish(data_mp, &ixas);
}
+ ixa_cleanup(&ixas);
+ }
+
+ return (rc);
+}
+
+/*
+ * Process one of the queued messages (from ipsacq_mp) once the SA
+ * has been added.
+ */
+static void
+esp_outbound_finish(mblk_t *data_mp, ip_xmit_attr_t *ixa)
+{
+ netstack_t *ns = ixa->ixa_ipst->ips_netstack;
+ ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
+ ipsec_stack_t *ipss = ns->netstack_ipsec;
+ ill_t *ill = ixa->ixa_nce->nce_ill;
+
+ if (!ipsec_outbound_sa(data_mp, ixa, IPPROTO_ESP)) {
ESP_BUMP_STAT(espstack, out_discards);
- ip_drop_packet(mp, B_FALSE, NULL, NULL,
+ ip_drop_packet(data_mp, B_FALSE, ill,
DROPPER(ipss, ipds_sadb_acquire_timeout),
&espstack->esp_dropper);
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
+ return;
}
- return (rc);
+ data_mp = esp_outbound(data_mp, ixa);
+ if (data_mp == NULL)
+ return;
+
+ /* do AH processing if needed */
+ data_mp = esp_do_outbound_ah(data_mp, ixa);
+ if (data_mp == NULL)
+ return;
+
+ (void) ip_output_post_ipsec(data_mp, ixa);
}
/*
@@ -3674,11 +3630,13 @@ esp_add_sa(mblk_t *mp, keysock_in_t *ksi, int *diagnostic, netstack_t *ns)
return (EINVAL);
}
+#ifndef IPSEC_LATENCY_TEST
if (assoc->sadb_sa_encrypt == SADB_EALG_NULL &&
assoc->sadb_sa_auth == SADB_AALG_NONE) {
*diagnostic = SADB_X_DIAGNOSTIC_BAD_AALG;
return (EINVAL);
}
+#endif
if (assoc->sadb_sa_flags & ~espstack->esp_sadb.s_addflags) {
*diagnostic = SADB_X_DIAGNOSTIC_BAD_SAFLAGS;
@@ -3734,7 +3692,11 @@ esp_add_sa(mblk_t *mp, keysock_in_t *ksi, int *diagnostic, netstack_t *ns)
/*
* First locate the authentication algorithm.
*/
+#ifdef IPSEC_LATENCY_TEST
+ if (akey != NULL && assoc->sadb_sa_auth != SADB_AALG_NONE) {
+#else
if (akey != NULL) {
+#endif
ipsec_alginfo_t *aalg;
aalg = ipss->ipsec_alglists[IPSEC_ALG_AUTH]
@@ -3883,7 +3845,7 @@ esp_del_sa(mblk_t *mp, keysock_in_t *ksi, int *diagnostic,
return (sadb_purge_sa(mp, ksi,
(sin->sin_family == AF_INET6) ? &espstack->esp_sadb.s_v6 :
&espstack->esp_sadb.s_v4, diagnostic,
- espstack->esp_pfkey_q, espstack->esp_sadb.s_ip_q));
+ espstack->esp_pfkey_q));
}
return (sadb_delget_sa(mp, ksi, &espstack->esp_sadb, diagnostic,
@@ -4024,7 +3986,7 @@ esp_parse_pfkey(mblk_t *mp, ipsecesp_stack_t *espstack)
* Keysock takes care of the PF_KEY bookkeeping for this.
*/
if (esp_register_out(samsg->sadb_msg_seq, samsg->sadb_msg_pid,
- ksi->ks_in_serial, espstack, mp)) {
+ ksi->ks_in_serial, espstack, msg_getcred(mp, NULL))) {
freemsg(mp);
} else {
/*
@@ -4109,8 +4071,7 @@ esp_keysock_no_socket(mblk_t *mp, ipsecesp_stack_t *espstack)
samsg->sadb_msg_errno = kse->ks_err_errno;
samsg->sadb_msg_len = SADB_8TO64(sizeof (*samsg));
/*
- * Use the write-side of the esp_pfkey_q, in case there is
- * no esp_sadb.s_ip_q.
+ * Use the write-side of the esp_pfkey_q
*/
sadb_in_acquire(samsg, &espstack->esp_sadb,
WR(espstack->esp_pfkey_q), espstack->ipsecesp_netstack);
@@ -4197,236 +4158,23 @@ ipsecesp_wput(queue_t *q, mblk_t *mp)
}
/*
- * Process an outbound ESP packet that can be accelerated by a IPsec
- * hardware acceleration capable Provider.
- * The caller already inserted and initialized the ESP header.
- * This function allocates a tagging M_CTL, and adds room at the end
- * of the packet to hold the ICV if authentication is needed.
- *
- * On success returns B_TRUE, on failure returns B_FALSE and frees the
- * mblk chain ipsec_out.
- */
-static ipsec_status_t
-esp_outbound_accelerated(mblk_t *ipsec_out, uint_t icv_len)
-{
- ipsec_out_t *io;
- mblk_t *lastmp;
- netstack_t *ns;
- ipsecesp_stack_t *espstack;
- ipsec_stack_t *ipss;
-
- io = (ipsec_out_t *)ipsec_out->b_rptr;
- ns = io->ipsec_out_ns;
- espstack = ns->netstack_ipsecesp;
- ipss = ns->netstack_ipsec;
-
- ESP_BUMP_STAT(espstack, out_accelerated);
-
- /* mark packet as being accelerated in IPSEC_OUT */
- ASSERT(io->ipsec_out_accelerated == B_FALSE);
- io->ipsec_out_accelerated = B_TRUE;
-
- /*
- * add room at the end of the packet for the ICV if needed
- */
- if (icv_len > 0) {
- /* go to last mblk */
- lastmp = ipsec_out; /* For following while loop. */
- do {
- lastmp = lastmp->b_cont;
- } while (lastmp->b_cont != NULL);
-
- /* if not enough available room, allocate new mblk */
- if ((lastmp->b_wptr + icv_len) > lastmp->b_datap->db_lim) {
- lastmp->b_cont = allocb(icv_len, BPRI_HI);
- if (lastmp->b_cont == NULL) {
- ESP_BUMP_STAT(espstack, out_discards);
- ip_drop_packet(ipsec_out, B_FALSE, NULL, NULL,
- DROPPER(ipss, ipds_esp_nomem),
- &espstack->esp_dropper);
- return (IPSEC_STATUS_FAILED);
- }
- lastmp = lastmp->b_cont;
- }
- lastmp->b_wptr += icv_len;
- }
-
- return (IPSEC_STATUS_SUCCESS);
-}
-
-/*
- * Process an inbound accelerated ESP packet.
- * On success returns B_TRUE, on failure returns B_FALSE and frees the
- * mblk chain ipsec_in.
- */
-static ipsec_status_t
-esp_inbound_accelerated(mblk_t *ipsec_in, mblk_t *data_mp, boolean_t isv4,
- ipsa_t *assoc)
-{
- ipsec_in_t *ii = (ipsec_in_t *)ipsec_in->b_rptr;
- mblk_t *hada_mp;
- uint32_t icv_len = 0;
- da_ipsec_t *hada;
- ipha_t *ipha;
- ip6_t *ip6h;
- kstat_named_t *counter;
- netstack_t *ns = ii->ipsec_in_ns;
- ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
- ipsec_stack_t *ipss = ns->netstack_ipsec;
-
- ESP_BUMP_STAT(espstack, in_accelerated);
-
- hada_mp = ii->ipsec_in_da;
- ASSERT(hada_mp != NULL);
- hada = (da_ipsec_t *)hada_mp->b_rptr;
-
- /*
- * We only support one level of decapsulation in hardware, so
- * nuke the pointer.
- */
- ii->ipsec_in_da = NULL;
- ii->ipsec_in_accelerated = B_FALSE;
-
- if (assoc->ipsa_auth_alg != IPSA_AALG_NONE) {
- /*
- * ESP with authentication. We expect the Provider to have
- * computed the ICV and placed it in the hardware acceleration
- * data attributes.
- *
- * Extract ICV length from attributes M_CTL and sanity check
- * its value. We allow the mblk to be smaller than da_ipsec_t
- * for a small ICV, as long as the entire ICV fits within the
- * mblk.
- *
- * Also ensures that the ICV length computed by Provider
- * corresponds to the ICV length of the agorithm specified by
- * the SA.
- */
- icv_len = hada->da_icv_len;
- if ((icv_len != assoc->ipsa_mac_len) ||
- (icv_len > DA_ICV_MAX_LEN) || (MBLKL(hada_mp) <
- (sizeof (da_ipsec_t) - DA_ICV_MAX_LEN + icv_len))) {
- esp0dbg(("esp_inbound_accelerated: "
- "ICV len (%u) incorrect or mblk too small (%u)\n",
- icv_len, (uint32_t)(MBLKL(hada_mp))));
- counter = DROPPER(ipss, ipds_esp_bad_auth);
- goto esp_in_discard;
- }
- }
-
- /* get pointers to IP header */
- if (isv4) {
- ipha = (ipha_t *)data_mp->b_rptr;
- } else {
- ip6h = (ip6_t *)data_mp->b_rptr;
- }
-
- /*
- * Compare ICV in ESP packet vs ICV computed by adapter.
- * We also remove the ICV from the end of the packet since
- * it will no longer be needed.
- *
- * Assume that esp_inbound() already ensured that the pkt
- * was in one mblk.
- */
- ASSERT(data_mp->b_cont == NULL);
- data_mp->b_wptr -= icv_len;
- /* adjust IP header */
- if (isv4)
- ipha->ipha_length = htons(ntohs(ipha->ipha_length) - icv_len);
- else
- ip6h->ip6_plen = htons(ntohs(ip6h->ip6_plen) - icv_len);
- if (icv_len && bcmp(hada->da_icv, data_mp->b_wptr, icv_len)) {
- int af;
- void *addr;
-
- if (isv4) {
- addr = &ipha->ipha_dst;
- af = AF_INET;
- } else {
- addr = &ip6h->ip6_dst;
- af = AF_INET6;
- }
-
- /*
- * Log the event. Don't print to the console, block
- * potential denial-of-service attack.
- */
- ESP_BUMP_STAT(espstack, bad_auth);
- ipsec_assocfailure(info.mi_idnum, 0, 0, SL_ERROR | SL_WARN,
- "ESP Authentication failed spi %x, dst_addr %s",
- assoc->ipsa_spi, addr, af, espstack->ipsecesp_netstack);
- counter = DROPPER(ipss, ipds_esp_bad_auth);
- goto esp_in_discard;
- }
-
- esp3dbg(espstack, ("esp_inbound_accelerated: ESP authentication "
- "succeeded, checking replay\n"));
-
- ipsec_in->b_cont = data_mp;
-
- /*
- * Remove ESP header and padding from packet.
- */
- if (!esp_strip_header(data_mp, ii->ipsec_in_v4, assoc->ipsa_iv_len,
- &counter, espstack)) {
- esp1dbg(espstack, ("esp_inbound_accelerated: "
- "esp_strip_header() failed\n"));
- goto esp_in_discard;
- }
-
- freeb(hada_mp);
-
- if (is_system_labeled() && (assoc->ipsa_cred != NULL))
- mblk_setcred(data_mp, assoc->ipsa_cred, NOPID);
-
- /*
- * Account for usage..
- */
- if (!esp_age_bytes(assoc, msgdsize(data_mp), B_TRUE)) {
- /* The ipsa has hit hard expiration, LOG and AUDIT. */
- ESP_BUMP_STAT(espstack, bytes_expired);
- IP_ESP_BUMP_STAT(ipss, in_discards);
- ipsec_assocfailure(info.mi_idnum, 0, 0, SL_ERROR | SL_WARN,
- "ESP association 0x%x, dst %s had bytes expire.\n",
- assoc->ipsa_spi, assoc->ipsa_dstaddr, assoc->ipsa_addrfam,
- espstack->ipsecesp_netstack);
- ip_drop_packet(ipsec_in, B_TRUE, NULL, NULL,
- DROPPER(ipss, ipds_esp_bytes_expire),
- &espstack->esp_dropper);
- return (IPSEC_STATUS_FAILED);
- }
-
- /* done processing the packet */
- return (IPSEC_STATUS_SUCCESS);
-
-esp_in_discard:
- IP_ESP_BUMP_STAT(ipss, in_discards);
- freeb(hada_mp);
-
- ipsec_in->b_cont = data_mp; /* For ip_drop_packet()'s sake... */
- ip_drop_packet(ipsec_in, B_TRUE, NULL, NULL, counter,
- &espstack->esp_dropper);
-
- return (IPSEC_STATUS_FAILED);
-}
-
-/*
* Wrapper to allow IP to trigger an ESP association failure message
* during inbound SA selection.
*/
void
ipsecesp_in_assocfailure(mblk_t *mp, char level, ushort_t sl, char *fmt,
- uint32_t spi, void *addr, int af, ipsecesp_stack_t *espstack)
+ uint32_t spi, void *addr, int af, ip_recv_attr_t *ira)
{
- ipsec_stack_t *ipss = espstack->ipsecesp_netstack->netstack_ipsec;
+ netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack;
+ ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
+ ipsec_stack_t *ipss = ns->netstack_ipsec;
if (espstack->ipsecesp_log_unknown_spi) {
ipsec_assocfailure(info.mi_idnum, 0, level, sl, fmt, spi,
addr, af, espstack->ipsecesp_netstack);
}
- ip_drop_packet(mp, B_TRUE, NULL, NULL,
+ ip_drop_packet(mp, B_TRUE, ira->ira_ill,
DROPPER(ipss, ipds_esp_no_sa),
&espstack->esp_dropper);
}
diff --git a/usr/src/uts/common/inet/ip/keysock.c b/usr/src/uts/common/inet/ip/keysock.c
index ca82eeece0..855af28bb2 100644
--- a/usr/src/uts/common/inet/ip/keysock.c
+++ b/usr/src/uts/common/inet/ip/keysock.c
@@ -852,7 +852,7 @@ keysock_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
int
keysock_opt_set(queue_t *q, uint_t mgmt_flags, int level,
int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp,
- uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, mblk_t *mblk)
+ uchar_t *outvalp, void *thisdg_attrs, cred_t *cr)
{
int *i1 = (int *)invalp, errno = 0;
keysock_t *ks = (keysock_t *)q->q_ptr;
@@ -936,11 +936,9 @@ keysock_wput_other(queue_t *q, mblk_t *mp)
}
if (((union T_primitives *)mp->b_rptr)->type ==
T_SVR4_OPTMGMT_REQ) {
- (void) svr4_optcom_req(q, mp, cr,
- &keysock_opt_obj, B_FALSE);
+ svr4_optcom_req(q, mp, cr, &keysock_opt_obj);
} else {
- (void) tpi_optcom_req(q, mp, cr,
- &keysock_opt_obj, B_FALSE);
+ tpi_optcom_req(q, mp, cr, &keysock_opt_obj);
}
break;
case T_DATA_REQ:
diff --git a/usr/src/uts/common/inet/ip/keysock_opt_data.c b/usr/src/uts/common/inet/ip/keysock_opt_data.c
index d8d9f1d0ad..4dee663d42 100644
--- a/usr/src/uts/common/inet/ip/keysock_opt_data.c
+++ b/usr/src/uts/common/inet/ip/keysock_opt_data.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 1996-1998,2001-2003 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/stream.h>
#define _SUN_TPI_VERSION 1
@@ -51,11 +48,11 @@
*/
opdes_t keysock_opt_arr[] = {
- { SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+ { SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
(t_uscalar_t)sizeof (int), 0 },
- { SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+ { SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
(t_uscalar_t)sizeof (int), 0 },
- { SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+ { SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
(t_uscalar_t)sizeof (int), 0 },
};
@@ -88,7 +85,6 @@ optdb_obj_t keysock_opt_obj = {
NULL, /* KEYSOCK default value function pointer */
keysock_opt_get, /* KEYSOCK get function pointer */
keysock_opt_set, /* KEYSOCK set function pointer */
- B_TRUE, /* KEYSOCK is tpi provider */
KEYSOCK_OPT_ARR_CNT, /* KEYSOCK option database count of entries */
keysock_opt_arr, /* KEYSOCK option database */
KEYSOCK_VALID_LEVELS_CNT, /* KEYSOCK valid level count of entries */
diff --git a/usr/src/uts/common/inet/ip/rts.c b/usr/src/uts/common/inet/ip/rts.c
index ce3ac6faca..d5a1d84395 100644
--- a/usr/src/uts/common/inet/ip/rts.c
+++ b/usr/src/uts/common/inet/ip/rts.c
@@ -72,7 +72,6 @@
* Addresses are assigned to interfaces.
* ICMP redirects are processed and a IRE_HOST/RTF_DYNAMIC is installed.
* No route is found while sending a packet.
- * When TCP requests IP to remove an IRE_CACHE of a troubled destination.
*
* Since all we do is reformat the messages between routing socket and
* ioctl forms, no synchronization is necessary in this module; all
@@ -113,7 +112,8 @@ static rtsparam_t lcl_param_arr[] = {
static void rts_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error,
int sys_error);
-static void rts_input(void *, mblk_t *, void *);
+static void rts_input(void *, mblk_t *, void *, ip_recv_attr_t *);
+static void rts_icmp_input(void *, mblk_t *, void *, ip_recv_attr_t *);
static mblk_t *rts_ioctl_alloc(mblk_t *data);
static int rts_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr);
static boolean_t rts_param_register(IDP *ndp, rtsparam_t *rtspa, int cnt);
@@ -211,28 +211,28 @@ rts_common_close(queue_t *q, conn_t *connp)
if (!IPCL_IS_NONSTR(connp)) {
qprocsoff(q);
+ }
- /*
- * Now we are truly single threaded on this stream, and can
- * delete the things hanging off the connp, and finally the
- * connp.
- * We removed this connp from the fanout list, it cannot be
- * accessed thru the fanouts, and we already waited for the
- * conn_ref to drop to 0. We are already in close, so
- * there cannot be any other thread from the top. qprocsoff
- * has completed, and service has completed or won't run in
- * future.
- */
+ /*
+ * Now we are truly single threaded on this stream, and can
+ * delete the things hanging off the connp, and finally the connp.
+ * We removed this connp from the fanout list, it cannot be
+ * accessed thru the fanouts, and we already waited for the
+ * conn_ref to drop to 0. We are already in close, so
+ * there cannot be any other thread from the top. qprocsoff
+ * has completed, and service has completed or won't run in
+ * future.
+ */
+ ASSERT(connp->conn_ref == 1);
+
+ if (!IPCL_IS_NONSTR(connp)) {
inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
} else {
ip_free_helper_stream(connp);
}
- ASSERT(connp->conn_ref == 1);
-
connp->conn_ref--;
ipcl_conn_destroy(connp);
-
return (0);
}
@@ -256,7 +256,6 @@ rts_stream_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
{
conn_t *connp;
dev_t conn_dev;
- rts_stack_t *rtss;
rts_t *rts;
/* If the stream is already open, return immediately. */
@@ -266,7 +265,6 @@ rts_stream_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
if (sflag == MODOPEN)
return (EINVAL);
-
/*
* Since RTS is not used so heavily, allocating from the small
* arena should be sufficient.
@@ -278,44 +276,31 @@ rts_stream_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
connp = rts_open(flag, credp);
ASSERT(connp != NULL);
-
*devp = makedevice(getemajor(*devp), (minor_t)conn_dev);
rts = connp->conn_rts;
-
rw_enter(&rts->rts_rwlock, RW_WRITER);
connp->conn_dev = conn_dev;
connp->conn_minor_arena = ip_minor_arena_sa;
- /*
- * Initialize the rts_t structure for this stream.
- */
q->q_ptr = connp;
WR(q)->q_ptr = connp;
connp->conn_rq = q;
connp->conn_wq = WR(q);
- rtss = rts->rts_rtss;
- q->q_hiwat = rtss->rtss_recv_hiwat;
- WR(q)->q_hiwat = rtss->rtss_xmit_hiwat;
- WR(q)->q_lowat = rtss->rtss_xmit_lowat;
-
-
+ WR(q)->q_hiwat = connp->conn_sndbuf;
+ WR(q)->q_lowat = connp->conn_sndlowat;
mutex_enter(&connp->conn_lock);
connp->conn_state_flags &= ~CONN_INCIPIENT;
mutex_exit(&connp->conn_lock);
-
- qprocson(q);
rw_exit(&rts->rts_rwlock);
- /*
- * Indicate the down IP module that this is a routing socket
- * client by sending an RTS IOCTL without any user data. Although
- * this is just a notification message (without any real routing
- * request), we pass in any credential for correctness sake.
- */
+
+ /* Indicate to IP that this is a routing socket client */
ip_rts_register(connp);
+ qprocson(q);
+
return (0);
}
@@ -352,22 +337,38 @@ rts_open(int flag, cred_t *credp)
*/
netstack_rele(ns);
-
rw_enter(&rts->rts_rwlock, RW_WRITER);
ASSERT(connp->conn_rts == rts);
ASSERT(rts->rts_connp == connp);
+ connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_ULP_CKSUM;
+ /* conn_allzones can not be set this early, hence no IPCL_ZONEID */
+ connp->conn_ixa->ixa_zoneid = zoneid;
connp->conn_zoneid = zoneid;
connp->conn_flow_cntrld = B_FALSE;
- connp->conn_ulp_labeled = is_system_labeled();
-
rts->rts_rtss = rtss;
- rts->rts_xmit_hiwat = rtss->rtss_xmit_hiwat;
+
+ connp->conn_rcvbuf = rtss->rtss_recv_hiwat;
+ connp->conn_sndbuf = rtss->rtss_xmit_hiwat;
+ connp->conn_sndlowat = rtss->rtss_xmit_lowat;
+ connp->conn_rcvlowat = rts_mod_info.mi_lowat;
+
+ connp->conn_family = PF_ROUTE;
+ connp->conn_so_type = SOCK_RAW;
+ /* SO_PROTOTYPE is always sent down by sockfs setting conn_proto */
connp->conn_recv = rts_input;
+ connp->conn_recvicmp = rts_icmp_input;
+
crhold(credp);
connp->conn_cred = credp;
+ connp->conn_cpid = curproc->p_pid;
+ /* Cache things in ixa without an extra refhold */
+ connp->conn_ixa->ixa_cred = connp->conn_cred;
+ connp->conn_ixa->ixa_cpid = connp->conn_cpid;
+ if (is_system_labeled())
+ connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred);
/*
* rts sockets start out as bound and connected
@@ -429,7 +430,6 @@ rts_tpi_bind(queue_t *q, mblk_t *mp)
{
conn_t *connp = Q_TO_CONN(q);
rts_t *rts = connp->conn_rts;
- mblk_t *mp1;
struct T_bind_req *tbr;
if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
@@ -444,16 +444,6 @@ rts_tpi_bind(queue_t *q, mblk_t *mp)
rts_err_ack(q, mp, TOUTSTATE, 0);
return;
}
- /*
- * Reallocate the message to make sure we have enough room for an
- * address and the protocol type.
- */
- mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin_t), 1);
- if (mp1 == NULL) {
- rts_err_ack(q, mp, TSYSERR, ENOMEM);
- return;
- }
- mp = mp1;
tbr = (struct T_bind_req *)mp->b_rptr;
if (tbr->ADDR_length != 0) {
(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
@@ -465,6 +455,7 @@ rts_tpi_bind(queue_t *q, mblk_t *mp)
tbr->ADDR_offset = (t_scalar_t)sizeof (struct T_bind_req);
tbr->ADDR_length = 0;
tbr->PRIM_type = T_BIND_ACK;
+ mp->b_datap->db_type = M_PCPROTO;
rts->rts_state = TS_IDLE;
qreply(q, mp);
}
@@ -545,70 +536,30 @@ static int
rts_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
{
rts_t *rts = connp->conn_rts;
- int *i1 = (int *)ptr;
+ conn_opt_arg_t coas;
+ int retval;
ASSERT(RW_READ_HELD(&rts->rts_rwlock));
switch (level) {
- case SOL_SOCKET:
- switch (name) {
- case SO_DEBUG:
- *i1 = rts->rts_debug;
- break;
- case SO_REUSEADDR:
- *i1 = rts->rts_reuseaddr;
- break;
- case SO_TYPE:
- *i1 = SOCK_RAW;
- break;
- /*
- * The following three items are available here,
- * but are only meaningful to IP.
- */
- case SO_DONTROUTE:
- *i1 = rts->rts_dontroute;
- break;
- case SO_USELOOPBACK:
- *i1 = rts->rts_useloopback;
- break;
- case SO_BROADCAST:
- *i1 = rts->rts_broadcast;
- break;
- case SO_PROTOTYPE:
- *i1 = rts->rts_proto;
- break;
- /*
- * The following two items can be manipulated,
- * but changing them should do nothing.
- */
- case SO_SNDBUF:
- ASSERT(rts->rts_xmit_hiwat <= INT_MAX);
- *i1 = (int)(rts->rts_xmit_hiwat);
- break;
- case SO_RCVBUF:
- ASSERT(rts->rts_recv_hiwat <= INT_MAX);
- *i1 = (int)(rts->rts_recv_hiwat);
- break;
- case SO_DOMAIN:
- *i1 = PF_ROUTE;
- break;
- default:
- return (-1);
- }
- break;
+ /* do this in conn_opt_get? */
case SOL_ROUTE:
switch (name) {
case RT_AWARE:
mutex_enter(&connp->conn_lock);
- *i1 = connp->conn_rtaware;
+ *(int *)ptr = connp->conn_rtaware;
mutex_exit(&connp->conn_lock);
- break;
+ return (0);
}
break;
- default:
- return (-1);
}
- return ((int)sizeof (int));
+ coas.coa_connp = connp;
+ coas.coa_ixa = connp->conn_ixa;
+ coas.coa_ipp = &connp->conn_xmit_ipp;
+ mutex_enter(&connp->conn_lock);
+ retval = conn_opt_get(&coas, level, name, ptr);
+ mutex_exit(&connp->conn_lock);
+ return (retval);
}
/* ARGSUSED */
@@ -620,6 +571,12 @@ rts_do_opt_set(conn_t *connp, int level, int name, uint_t inlen,
int *i1 = (int *)invalp;
rts_t *rts = connp->conn_rts;
rts_stack_t *rtss = rts->rts_rtss;
+ int error;
+ conn_opt_arg_t coas;
+
+ coas.coa_connp = connp;
+ coas.coa_ixa = connp->conn_ixa;
+ coas.coa_ipp = &connp->conn_xmit_ipp;
ASSERT(RW_WRITE_HELD(&rts->rts_rwlock));
@@ -638,38 +595,6 @@ rts_do_opt_set(conn_t *connp, int level, int name, uint_t inlen,
switch (level) {
case SOL_SOCKET:
switch (name) {
- case SO_REUSEADDR:
- if (!checkonly) {
- rts->rts_reuseaddr = *i1 ? 1 : 0;
- connp->conn_reuseaddr = *i1 ? 1 : 0;
- }
- break; /* goto sizeof (int) option return */
- case SO_DEBUG:
- if (!checkonly)
- rts->rts_debug = *i1 ? 1 : 0;
- break; /* goto sizeof (int) option return */
- /*
- * The following three items are available here,
- * but are only meaningful to IP.
- */
- case SO_DONTROUTE:
- if (!checkonly) {
- rts->rts_dontroute = *i1 ? 1 : 0;
- connp->conn_dontroute = *i1 ? 1 : 0;
- }
- break; /* goto sizeof (int) option return */
- case SO_USELOOPBACK:
- if (!checkonly) {
- rts->rts_useloopback = *i1 ? 1 : 0;
- connp->conn_loopback = *i1 ? 1 : 0;
- }
- break; /* goto sizeof (int) option return */
- case SO_BROADCAST:
- if (!checkonly) {
- rts->rts_broadcast = *i1 ? 1 : 0;
- connp->conn_broadcast = *i1 ? 1 : 0;
- }
- break; /* goto sizeof (int) option return */
case SO_PROTOTYPE:
/*
* Routing socket applications that call socket() with
@@ -678,13 +603,15 @@ rts_do_opt_set(conn_t *connp, int level, int name, uint_t inlen,
* down the SO_PROTOTYPE and rts_queue_input()
* implements the filtering.
*/
- if (*i1 != AF_INET && *i1 != AF_INET6)
+ if (*i1 != AF_INET && *i1 != AF_INET6) {
+ *outlenp = 0;
return (EPROTONOSUPPORT);
- if (!checkonly) {
- rts->rts_proto = *i1;
- connp->conn_proto = *i1;
}
- break; /* goto sizeof (int) option return */
+ if (!checkonly)
+ connp->conn_proto = *i1;
+ *outlenp = inlen;
+ return (0);
+
/*
* The following two items can be manipulated,
* but changing them should do nothing.
@@ -694,36 +621,13 @@ rts_do_opt_set(conn_t *connp, int level, int name, uint_t inlen,
*outlenp = 0;
return (ENOBUFS);
}
- if (!checkonly) {
- rts->rts_xmit_hiwat = *i1;
- if (!IPCL_IS_NONSTR(connp))
- connp->conn_wq->q_hiwat = *i1;
- }
break; /* goto sizeof (int) option return */
case SO_RCVBUF:
if (*i1 > rtss->rtss_max_buf) {
*outlenp = 0;
return (ENOBUFS);
}
- if (!checkonly) {
- rts->rts_recv_hiwat = *i1;
- rw_exit(&rts->rts_rwlock);
- (void) proto_set_rx_hiwat(connp->conn_rq, connp,
- *i1);
- rw_enter(&rts->rts_rwlock, RW_WRITER);
- }
-
break; /* goto sizeof (int) option return */
- case SO_RCVTIMEO:
- case SO_SNDTIMEO:
- /*
- * Pass these two options in order for third part
- * protocol usage. Here just return directly.
- */
- return (0);
- default:
- *outlenp = 0;
- return (EINVAL);
}
break;
case SOL_ROUTE:
@@ -734,15 +638,17 @@ rts_do_opt_set(conn_t *connp, int level, int name, uint_t inlen,
connp->conn_rtaware = *i1;
mutex_exit(&connp->conn_lock);
}
- break; /* goto sizeof (int) option return */
- default:
- *outlenp = 0;
- return (EINVAL);
+ *outlenp = inlen;
+ return (0);
}
break;
- default:
+ }
+ /* Serialized setsockopt since we are D_MTQPAIR */
+ error = conn_opt_set(&coas, level, name, inlen, invalp,
+ checkonly, cr);
+ if (error != 0) {
*outlenp = 0;
- return (EINVAL);
+ return (error);
}
/*
* Common case of return from an option that is sizeof (int)
@@ -832,7 +738,7 @@ rts_tpi_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
int
rts_tpi_opt_set(queue_t *q, uint_t optset_context, int level,
int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp,
- uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, mblk_t *mblk)
+ uchar_t *outvalp, void *thisdg_attrs, cred_t *cr)
{
conn_t *connp = Q_TO_CONN(q);
int error;
@@ -1009,10 +915,6 @@ err_ret:
* consumes the message or passes it downstream; it never queues a
* a message. The data messages that go down are wrapped in an IOCTL
* message.
- *
- * FIXME? Should we call IP rts_request directly? Could punt on returning
- * errno in the case when it defers processing due to
- * IPIF_CHANGING/ILL_CHANGING???
*/
static void
rts_wput(queue_t *q, mblk_t *mp)
@@ -1057,7 +959,7 @@ rts_wput(queue_t *q, mblk_t *mp)
}
return;
}
- ip_output(connp, mp1, q, IP_WPUT);
+ ip_wput_nondata(q, mp1);
}
@@ -1120,11 +1022,9 @@ rts_wput_other(queue_t *q, mblk_t *mp)
}
if (((union T_primitives *)rptr)->type ==
T_SVR4_OPTMGMT_REQ) {
- (void) svr4_optcom_req(q, mp, cr,
- &rts_opt_obj, B_TRUE);
+ svr4_optcom_req(q, mp, cr, &rts_opt_obj);
} else {
- (void) tpi_optcom_req(q, mp, cr,
- &rts_opt_obj, B_TRUE);
+ tpi_optcom_req(q, mp, cr, &rts_opt_obj);
}
return;
case O_T_CONN_RES:
@@ -1168,7 +1068,7 @@ rts_wput_other(queue_t *q, mblk_t *mp)
default:
break;
}
- ip_output(connp, mp, q, IP_WPUT);
+ ip_wput_nondata(q, mp);
}
/*
@@ -1177,7 +1077,6 @@ rts_wput_other(queue_t *q, mblk_t *mp)
static void
rts_wput_iocdata(queue_t *q, mblk_t *mp)
{
- conn_t *connp = Q_TO_CONN(q);
struct sockaddr *rtsaddr;
mblk_t *mp1;
STRUCT_HANDLE(strbuf, sb);
@@ -1188,7 +1087,7 @@ rts_wput_iocdata(queue_t *q, mblk_t *mp)
case TI_GETPEERNAME:
break;
default:
- ip_output(connp, mp, q, IP_WPUT);
+ ip_wput_nondata(q, mp);
return;
}
switch (mi_copy_state(q, mp, &mp1)) {
@@ -1233,9 +1132,12 @@ rts_wput_iocdata(queue_t *q, mblk_t *mp)
mi_copyout(q, mp);
}
+/*
+ * IP passes up a NULL ira.
+ */
/*ARGSUSED2*/
static void
-rts_input(void *arg1, mblk_t *mp, void *arg2)
+rts_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
{
conn_t *connp = (conn_t *)arg1;
rts_t *rts = connp->conn_rts;
@@ -1248,27 +1150,17 @@ rts_input(void *arg1, mblk_t *mp, void *arg2)
case M_IOCACK:
case M_IOCNAK:
iocp = (struct iocblk *)mp->b_rptr;
- if (IPCL_IS_NONSTR(connp)) {
- ASSERT(rts->rts_flag & (RTS_REQ_PENDING));
- mutex_enter(&rts->rts_send_mutex);
- rts->rts_flag &= ~RTS_REQ_INPROG;
+ ASSERT(!IPCL_IS_NONSTR(connp));
+ if (rts->rts_flag & (RTS_WPUT_PENDING)) {
+ rts->rts_flag &= ~RTS_WPUT_PENDING;
rts->rts_error = iocp->ioc_error;
- cv_signal(&rts->rts_io_cv);
- mutex_exit(&rts->rts_send_mutex);
+ /*
+ * Tell rts_wvw/qwait that we are done.
+ * Note: there is no qwait_wakeup() we can use.
+ */
+ qenable(connp->conn_rq);
freemsg(mp);
return;
- } else {
- if (rts->rts_flag & (RTS_WPUT_PENDING)) {
- rts->rts_flag &= ~RTS_WPUT_PENDING;
- rts->rts_error = iocp->ioc_error;
- /*
- * Tell rts_wvw/qwait that we are done.
- * Note: there is no qwait_wakeup() we can use.
- */
- qenable(connp->conn_rq);
- freemsg(mp);
- return;
- }
}
break;
case M_DATA:
@@ -1316,6 +1208,12 @@ rts_input(void *arg1, mblk_t *mp, void *arg2)
}
}
+/*ARGSUSED*/
+static void
+rts_icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
+{
+ freemsg(mp);
+}
void
rts_ddi_g_init(void)
@@ -1427,11 +1325,6 @@ int
rts_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr,
socklen_t *addrlen, cred_t *cr)
{
- conn_t *connp = (conn_t *)proto_handle;
- rts_t *rts = connp->conn_rts;
-
- ASSERT(rts != NULL);
-
bzero(addr, sizeof (struct sockaddr));
addr->sa_family = AF_ROUTE;
*addrlen = sizeof (struct sockaddr);
@@ -1444,7 +1337,11 @@ int
rts_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *addr,
socklen_t *addrlen, cred_t *cr)
{
- return (EOPNOTSUPP);
+ bzero(addr, sizeof (struct sockaddr));
+ addr->sa_family = AF_ROUTE;
+ *addrlen = sizeof (struct sockaddr);
+
+ return (0);
}
static int
@@ -1461,7 +1358,6 @@ rts_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len,
rts_opt_obj.odb_opt_des_arr,
rts_opt_obj.odb_opt_arr_cnt,
- rts_opt_obj.odb_topmost_tpiprovider,
B_FALSE, B_TRUE, cr);
if (error != 0) {
if (error < 0)
@@ -1473,25 +1369,20 @@ rts_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
rw_enter(&rts->rts_rwlock, RW_READER);
len = rts_opt_get(connp, level, option_name, optvalp_buf);
rw_exit(&rts->rts_rwlock);
-
- if (len < 0) {
- /*
- * Pass on to IP
- */
- error = ip_get_options(connp, level, option_name,
- optvalp, optlen, cr);
- } else {
- /*
- * update optlen and copy option value
- */
- t_uscalar_t size = MIN(len, *optlen);
- bcopy(optvalp_buf, optvalp, size);
- bcopy(&size, optlen, sizeof (size));
- error = 0;
+ if (len == -1) {
+ kmem_free(optvalp_buf, max_optbuf_len);
+ return (EINVAL);
}
+ /*
+ * update optlen and copy option value
+ */
+ t_uscalar_t size = MIN(len, *optlen);
+
+ bcopy(optvalp_buf, optvalp, size);
+ bcopy(&size, optlen, sizeof (size));
kmem_free(optvalp_buf, max_optbuf_len);
- return (error);
+ return (0);
}
static int
@@ -1505,7 +1396,6 @@ rts_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
error = proto_opt_check(level, option_name, optlen, NULL,
rts_opt_obj.odb_opt_des_arr,
rts_opt_obj.odb_opt_arr_cnt,
- rts_opt_obj.odb_topmost_tpiprovider,
B_TRUE, B_FALSE, cr);
if (error != 0) {
@@ -1530,9 +1420,7 @@ static int
rts_send(sock_lower_handle_t proto_handle, mblk_t *mp,
struct nmsghdr *msg, cred_t *cr)
{
- mblk_t *mp1;
conn_t *connp = (conn_t *)proto_handle;
- rts_t *rts = connp->conn_rts;
rt_msghdr_t *rtm;
int error;
@@ -1546,65 +1434,19 @@ rts_send(sock_lower_handle_t proto_handle, mblk_t *mp,
*/
if ((mp->b_wptr - mp->b_rptr) < sizeof (rt_msghdr_t)) {
if (!pullupmsg(mp, sizeof (rt_msghdr_t))) {
- rts->rts_error = EINVAL;
freemsg(mp);
- return (rts->rts_error);
+ return (EINVAL);
}
}
rtm = (rt_msghdr_t *)mp->b_rptr;
rtm->rtm_pid = curproc->p_pid;
- mp1 = rts_ioctl_alloc(mp);
- if (mp1 == NULL) {
- ASSERT(rts != NULL);
- freemsg(mp);
- return (ENOMEM);
- }
-
/*
- * Allow only one outstanding request(ioctl) at any given time
+ * We are not constrained by the ioctl interface and
+ * ip_rts_request_common processing requests synchronously hence
+ * we can send them down concurrently.
*/
- mutex_enter(&rts->rts_send_mutex);
- while (rts->rts_flag & RTS_REQ_PENDING) {
- int ret;
-
- ret = cv_wait_sig(&rts->rts_send_cv, &rts->rts_send_mutex);
- if (ret <= 0) {
- mutex_exit(&rts->rts_send_mutex);
- freemsg(mp);
- return (EINTR);
- }
- }
-
- rts->rts_flag |= RTS_REQ_PENDING;
-
- rts->rts_flag |= RTS_REQ_INPROG;
-
- mutex_exit(&rts->rts_send_mutex);
-
- CONN_INC_REF(connp);
-
- error = ip_rts_request_common(rts->rts_connp->conn_wq, mp1, connp, cr);
-
- mutex_enter(&rts->rts_send_mutex);
- if (error == EINPROGRESS) {
- ASSERT(rts->rts_flag & RTS_REQ_INPROG);
- if (rts->rts_flag & RTS_REQ_INPROG) {
- /*
- * Once the request has been issued we wait for
- * completion
- */
- cv_wait(&rts->rts_io_cv, &rts->rts_send_mutex);
- error = rts->rts_error;
- }
- }
-
- ASSERT((error != 0) || !(rts->rts_flag & RTS_REQ_INPROG));
- ASSERT(MUTEX_HELD(&rts->rts_send_mutex));
-
- rts->rts_flag &= ~(RTS_REQ_PENDING | RTS_REQ_INPROG);
- cv_signal(&rts->rts_send_cv);
- mutex_exit(&rts->rts_send_mutex);
+ error = ip_rts_request_common(mp, connp, cr);
return (error);
}
@@ -1614,8 +1456,6 @@ rts_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
uint_t *smodep, int *errorp, int flags, cred_t *credp)
{
conn_t *connp;
- rts_t *rts;
- rts_stack_t *rtss;
if (family != AF_ROUTE || type != SOCK_RAW ||
(proto != 0 && proto != AF_INET && proto != AF_INET6)) {
@@ -1627,25 +1467,7 @@ rts_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
ASSERT(connp != NULL);
connp->conn_flags |= IPCL_NONSTR;
- rts = connp->conn_rts;
- rtss = rts->rts_rtss;
-
- rts->rts_xmit_hiwat = rtss->rtss_xmit_hiwat;
- rts->rts_xmit_lowat = rtss->rtss_xmit_lowat;
- rts->rts_recv_hiwat = rtss->rtss_recv_hiwat;
- rts->rts_recv_lowat = rts_mod_info.mi_lowat;
-
- ASSERT(rtss->rtss_ldi_ident != NULL);
-
- *errorp = ip_create_helper_stream(connp, rtss->rtss_ldi_ident);
- if (*errorp != 0) {
-#ifdef DEBUG
- cmn_err(CE_CONT, "rts_create: create of IP helper stream"
- " failed\n");
-#endif
- (void) rts_close((sock_lower_handle_t)connp, 0, credp);
- return (NULL);
- }
+ connp->conn_proto = proto;
mutex_enter(&connp->conn_lock);
connp->conn_state_flags &= ~CONN_INCIPIENT;
@@ -1663,8 +1485,6 @@ rts_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle,
sock_upcalls_t *sock_upcalls, int flags, cred_t *cr)
{
conn_t *connp = (conn_t *)proto_handle;
- rts_t *rts = connp->conn_rts;
- rts_stack_t *rtss = rts->rts_rtss;
struct sock_proto_props sopp;
connp->conn_upcalls = sock_upcalls;
@@ -1673,8 +1493,8 @@ rts_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle,
sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT |
SOCKOPT_MAXBLK | SOCKOPT_MAXPSZ | SOCKOPT_MINPSZ;
sopp.sopp_wroff = 0;
- sopp.sopp_rxhiwat = rtss->rtss_recv_hiwat;
- sopp.sopp_rxlowat = rts_mod_info.mi_lowat;
+ sopp.sopp_rxhiwat = connp->conn_rcvbuf;
+ sopp.sopp_rxlowat = connp->conn_rcvlowat;
sopp.sopp_maxblk = INFPSZ;
sopp.sopp_maxpsz = rts_mod_info.mi_maxpsz;
sopp.sopp_minpsz = (rts_mod_info.mi_minpsz == 1) ? 0 :
@@ -1689,12 +1509,7 @@ rts_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle,
(*connp->conn_upcalls->su_connected)
(connp->conn_upper_handle, 0, NULL, -1);
- /*
- * Indicate the down IP module that this is a routing socket
- * client by sending an RTS IOCTL without any user data. Although
- * this is just a notification message (without any real routing
- * request), we pass in any credential for correctness sake.
- */
+ /* Indicate to IP that this is a routing socket client */
ip_rts_register(connp);
}
@@ -1743,6 +1558,27 @@ rts_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
conn_t *connp = (conn_t *)proto_handle;
int error;
+ /*
+ * If we don't have a helper stream then create one.
+ * ip_create_helper_stream takes care of locking the conn_t,
+ * so this check for NULL is just a performance optimization.
+ */
+ if (connp->conn_helper_info == NULL) {
+ rts_stack_t *rtss = connp->conn_rts->rts_rtss;
+
+ ASSERT(rtss->rtss_ldi_ident != NULL);
+
+ /*
+ * Create a helper stream for non-STREAMS socket.
+ */
+ error = ip_create_helper_stream(connp, rtss->rtss_ldi_ident);
+ if (error != 0) {
+ ip0dbg(("rts_ioctl: create of IP helper stream "
+ "failed %d\n", error));
+ return (error);
+ }
+ }
+
switch (cmd) {
case ND_SET:
case ND_GET:
diff --git a/usr/src/uts/common/inet/ip/rts_opt_data.c b/usr/src/uts/common/inet/ip/rts_opt_data.c
index 8a96edb668..1dd64a0317 100644
--- a/usr/src/uts/common/inet/ip/rts_opt_data.c
+++ b/usr/src/uts/common/inet/ip/rts_opt_data.c
@@ -40,6 +40,7 @@
#include <inet/optcom.h>
#include <inet/rts_impl.h>
+#include <inet/rts_impl.h>
/*
* Table of all known options handled on a RTS protocol stack.
*
@@ -49,21 +50,21 @@
*/
opdes_t rts_opt_arr[] = {
-{ SO_DEBUG, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
-{ SO_DONTROUTE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
-{ SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int),
+{ SO_DEBUG, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SO_DONTROUTE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
0 },
-{ SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
-{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
-{ SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
-{ SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
-{ SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
-{ SO_SNDTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
+{ SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SO_SNDTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
sizeof (struct timeval), 0 },
-{ SO_RCVTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ SO_RCVTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
sizeof (struct timeval), 0 },
-{ SO_PROTOTYPE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
-{ SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
+{ SO_PROTOTYPE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
{ RT_AWARE, SOL_ROUTE, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
};
@@ -98,9 +99,8 @@ uint_t rts_max_optsize; /* initialized in _init() */
optdb_obj_t rts_opt_obj = {
rts_opt_default, /* RTS default value function pointer */
- rts_tpi_opt_get, /* RTS get function pointer */
- rts_tpi_opt_set, /* RTS set function pointer */
- B_TRUE, /* RTS is tpi provider */
+ rts_tpi_opt_get, /* RTS get function pointer */
+ rts_tpi_opt_set, /* RTS set function pointer */
RTS_OPT_ARR_CNT, /* RTS option database count of entries */
rts_opt_arr, /* RTS option database */
RTS_VALID_LEVELS_CNT, /* RTS valid level count of entries */
diff --git a/usr/src/uts/common/inet/ip/sadb.c b/usr/src/uts/common/inet/ip/sadb.c
index 784b3b08aa..5ae4f6da8e 100644
--- a/usr/src/uts/common/inet/ip/sadb.c
+++ b/usr/src/uts/common/inet/ip/sadb.c
@@ -59,7 +59,6 @@
#include <inet/ipsecesp.h>
#include <sys/random.h>
#include <sys/dlpi.h>
-#include <sys/iphada.h>
#include <sys/strsun.h>
#include <sys/strsubr.h>
#include <inet/ip_if.h>
@@ -77,15 +76,13 @@
static mblk_t *sadb_extended_acquire(ipsec_selector_t *, ipsec_policy_t *,
ipsec_action_t *, boolean_t, uint32_t, uint32_t, sadb_sens_t *,
netstack_t *);
-static void sadb_ill_df(ill_t *, mblk_t *, isaf_t *, int, boolean_t);
-static ipsa_t *sadb_torch_assoc(isaf_t *, ipsa_t *, boolean_t, mblk_t **);
-static void sadb_drain_torchq(queue_t *, mblk_t *);
+static ipsa_t *sadb_torch_assoc(isaf_t *, ipsa_t *);
static void sadb_destroy_acqlist(iacqf_t **, uint_t, boolean_t,
netstack_t *);
static void sadb_destroy(sadb_t *, netstack_t *);
static mblk_t *sadb_sa2msg(ipsa_t *, sadb_msg_t *);
-static cred_t *sadb_cred_from_sens(sadb_sens_t *, uint64_t *);
-static sadb_sens_t *sadb_make_sens_ext(cred_t *cr, int *len);
+static ts_label_t *sadb_label_from_sens(sadb_sens_t *, uint64_t *);
+static sadb_sens_t *sadb_make_sens_ext(ts_label_t *tsl, int *len);
static time_t sadb_add_time(time_t, uint64_t);
static void lifetime_fuzz(ipsa_t *);
@@ -96,12 +93,6 @@ static void destroy_ipsa_pair(ipsap_t *);
static int update_pairing(ipsap_t *, ipsa_query_t *, keysock_in_t *, int *);
static void ipsa_set_replay(ipsa_t *ipsa, uint32_t offset);
-extern void (*cl_inet_getspi)(netstackid_t stack_id, uint8_t protocol,
- uint8_t *ptr, size_t len, void *args);
-extern int (*cl_inet_checkspi)(netstackid_t stack_id, uint8_t protocol,
- uint32_t spi, void *args);
-extern void (*cl_inet_deletespi)(netstackid_t stack_id, uint8_t protocol,
- uint32_t spi, void *args);
/*
* ipsacq_maxpackets is defined here to make it tunable
* from /etc/system.
@@ -269,6 +260,7 @@ static void
sadb_freeassoc(ipsa_t *ipsa)
{
ipsec_stack_t *ipss = ipsa->ipsa_netstack->netstack_ipsec;
+ mblk_t *asyncmp, *mp;
ASSERT(ipss != NULL);
ASSERT(MUTEX_NOT_HELD(&ipsa->ipsa_lock));
@@ -276,20 +268,24 @@ sadb_freeassoc(ipsa_t *ipsa)
ASSERT(ipsa->ipsa_next == NULL);
ASSERT(ipsa->ipsa_ptpn == NULL);
+
+ asyncmp = sadb_clear_lpkt(ipsa);
+ if (asyncmp != NULL) {
+ mp = ip_recv_attr_free_mblk(asyncmp);
+ ip_drop_packet(mp, B_TRUE, NULL,
+ DROPPER(ipss, ipds_sadb_inlarval_timeout),
+ &ipss->ipsec_sadb_dropper);
+ }
mutex_enter(&ipsa->ipsa_lock);
- /* Don't call sadb_clear_lpkt() since we hold the ipsa_lock anyway. */
- ip_drop_packet(ipsa->ipsa_lpkt, B_TRUE, NULL, NULL,
- DROPPER(ipss, ipds_sadb_inlarval_timeout),
- &ipss->ipsec_sadb_dropper);
- if (ipsa->ipsa_cred != NULL) {
- crfree(ipsa->ipsa_cred);
- ipsa->ipsa_cred = NULL;
+ if (ipsa->ipsa_tsl != NULL) {
+ label_rele(ipsa->ipsa_tsl);
+ ipsa->ipsa_tsl = NULL;
}
- if (ipsa->ipsa_ocred != NULL) {
- crfree(ipsa->ipsa_ocred);
- ipsa->ipsa_ocred = NULL;
+ if (ipsa->ipsa_otsl != NULL) {
+ label_rele(ipsa->ipsa_otsl);
+ ipsa->ipsa_otsl = NULL;
}
ipsec_destroy_ctx_tmpl(ipsa, IPSEC_ALG_AUTH);
@@ -712,336 +708,6 @@ sadb_walker(isaf_t *table, uint_t numentries,
}
/*
- * From the given SA, construct a dl_ct_ipsec_key and
- * a dl_ct_ipsec structures to be sent to the adapter as part
- * of a DL_CONTROL_REQ.
- *
- * ct_sa must point to the storage allocated for the key
- * structure and must be followed by storage allocated
- * for the SA information that must be sent to the driver
- * as part of the DL_CONTROL_REQ request.
- *
- * The is_inbound boolean indicates whether the specified
- * SA is part of an inbound SA table.
- *
- * Returns B_TRUE if the corresponding SA must be passed to
- * a provider, B_FALSE otherwise; frees *mp if it returns B_FALSE.
- */
-static boolean_t
-sadb_req_from_sa(ipsa_t *sa, mblk_t *mp, boolean_t is_inbound)
-{
- dl_ct_ipsec_key_t *keyp;
- dl_ct_ipsec_t *sap;
- void *ct_sa = mp->b_wptr;
-
- ASSERT(MUTEX_HELD(&sa->ipsa_lock));
-
- keyp = (dl_ct_ipsec_key_t *)(ct_sa);
- sap = (dl_ct_ipsec_t *)(keyp + 1);
-
- IPSECHW_DEBUG(IPSECHW_CAPAB, ("sadb_req_from_sa: "
- "is_inbound = %d\n", is_inbound));
-
- /* initialize flag */
- sap->sadb_sa_flags = 0;
- if (is_inbound) {
- sap->sadb_sa_flags |= DL_CT_IPSEC_INBOUND;
- /*
- * If an inbound SA has a peer, then mark it has being
- * an outbound SA as well.
- */
- if (sa->ipsa_haspeer)
- sap->sadb_sa_flags |= DL_CT_IPSEC_OUTBOUND;
- } else {
- /*
- * If an outbound SA has a peer, then don't send it,
- * since we will send the copy from the inbound table.
- */
- if (sa->ipsa_haspeer) {
- freemsg(mp);
- return (B_FALSE);
- }
- sap->sadb_sa_flags |= DL_CT_IPSEC_OUTBOUND;
- }
-
- keyp->dl_key_spi = sa->ipsa_spi;
- bcopy(sa->ipsa_dstaddr, keyp->dl_key_dest_addr,
- DL_CTL_IPSEC_ADDR_LEN);
- keyp->dl_key_addr_family = sa->ipsa_addrfam;
-
- sap->sadb_sa_auth = sa->ipsa_auth_alg;
- sap->sadb_sa_encrypt = sa->ipsa_encr_alg;
-
- sap->sadb_key_len_a = sa->ipsa_authkeylen;
- sap->sadb_key_bits_a = sa->ipsa_authkeybits;
- bcopy(sa->ipsa_authkey,
- sap->sadb_key_data_a, sap->sadb_key_len_a);
-
- sap->sadb_key_len_e = sa->ipsa_encrkeylen;
- sap->sadb_key_bits_e = sa->ipsa_encrkeybits;
- bcopy(sa->ipsa_encrkey,
- sap->sadb_key_data_e, sap->sadb_key_len_e);
-
- mp->b_wptr += sizeof (dl_ct_ipsec_t) + sizeof (dl_ct_ipsec_key_t);
- return (B_TRUE);
-}
-
-/*
- * Called from AH or ESP to format a message which will be used to inform
- * IPsec-acceleration-capable ills of a SADB change.
- * (It is not possible to send the message to IP directly from this function
- * since the SA, if any, is locked during the call).
- *
- * dl_operation: DL_CONTROL_REQ operation (add, delete, update, etc)
- * sa_type: identifies whether the operation applies to AH or ESP
- * (must be one of SADB_SATYPE_AH or SADB_SATYPE_ESP)
- * sa: Pointer to an SA. Must be non-NULL and locked
- * for ADD, DELETE, GET, and UPDATE operations.
- * This function returns an mblk chain that must be passed to IP
- * for forwarding to the IPsec capable providers.
- */
-mblk_t *
-sadb_fmt_sa_req(uint_t dl_operation, uint_t sa_type, ipsa_t *sa,
- boolean_t is_inbound)
-{
- mblk_t *mp;
- dl_control_req_t *ctrl;
- boolean_t need_key = B_FALSE;
- mblk_t *ctl_mp = NULL;
- ipsec_ctl_t *ctl;
-
- /*
- * 1 allocate and initialize DL_CONTROL_REQ M_PROTO
- * 2 if a key is needed for the operation
- * 2.1 initialize key
- * 2.2 if a full SA is needed for the operation
- * 2.2.1 initialize full SA info
- * 3 return message; caller will call ill_ipsec_capab_send_all()
- * to send the resulting message to IPsec capable ills.
- */
-
- ASSERT(sa_type == SADB_SATYPE_AH || sa_type == SADB_SATYPE_ESP);
-
- /*
- * Allocate DL_CONTROL_REQ M_PROTO
- * We allocate room for the SA even if it's not needed
- * by some of the operations (for example flush)
- */
- mp = allocb(sizeof (dl_control_req_t) +
- sizeof (dl_ct_ipsec_key_t) + sizeof (dl_ct_ipsec_t), BPRI_HI);
- if (mp == NULL)
- return (NULL);
- mp->b_datap->db_type = M_PROTO;
-
- /* initialize dl_control_req_t */
- ctrl = (dl_control_req_t *)mp->b_wptr;
- ctrl->dl_primitive = DL_CONTROL_REQ;
- ctrl->dl_operation = dl_operation;
- ctrl->dl_type = sa_type == SADB_SATYPE_AH ? DL_CT_IPSEC_AH :
- DL_CT_IPSEC_ESP;
- ctrl->dl_key_offset = sizeof (dl_control_req_t);
- ctrl->dl_key_length = sizeof (dl_ct_ipsec_key_t);
- ctrl->dl_data_offset = sizeof (dl_control_req_t) +
- sizeof (dl_ct_ipsec_key_t);
- ctrl->dl_data_length = sizeof (dl_ct_ipsec_t);
- mp->b_wptr += sizeof (dl_control_req_t);
-
- if ((dl_operation == DL_CO_SET) || (dl_operation == DL_CO_DELETE)) {
- ASSERT(sa != NULL);
- ASSERT(MUTEX_HELD(&sa->ipsa_lock));
-
- need_key = B_TRUE;
-
- /*
- * Initialize key and SA data. Note that for some
- * operations the SA data is ignored by the provider
- * (delete, etc.)
- */
- if (!sadb_req_from_sa(sa, mp, is_inbound))
- return (NULL);
- }
-
- /* construct control message */
- ctl_mp = allocb(sizeof (ipsec_ctl_t), BPRI_HI);
- if (ctl_mp == NULL) {
- cmn_err(CE_WARN, "sadb_fmt_sa_req: allocb failed\n");
- freemsg(mp);
- return (NULL);
- }
-
- ctl_mp->b_datap->db_type = M_CTL;
- ctl_mp->b_wptr += sizeof (ipsec_ctl_t);
- ctl_mp->b_cont = mp;
-
- ctl = (ipsec_ctl_t *)ctl_mp->b_rptr;
- ctl->ipsec_ctl_type = IPSEC_CTL;
- ctl->ipsec_ctl_len = sizeof (ipsec_ctl_t);
- ctl->ipsec_ctl_sa_type = sa_type;
-
- if (need_key) {
- /*
- * Keep an additional reference on SA, since it will be
- * needed by IP to send control messages corresponding
- * to that SA from its perimeter. IP will do a
- * IPSA_REFRELE when done with the request.
- */
- ASSERT(MUTEX_HELD(&sa->ipsa_lock));
- IPSA_REFHOLD(sa);
- ctl->ipsec_ctl_sa = sa;
- } else
- ctl->ipsec_ctl_sa = NULL;
-
- return (ctl_mp);
-}
-
-
-/*
- * Called by sadb_ill_download() to dump the entries for a specific
- * fanout table. For each SA entry in the table passed as argument,
- * use mp as a template and constructs a full DL_CONTROL message, and
- * call ill_dlpi_send(), provided by IP, to send the resulting
- * messages to the ill.
- */
-static void
-sadb_ill_df(ill_t *ill, mblk_t *mp, isaf_t *fanout, int num_entries,
- boolean_t is_inbound)
-{
- ipsa_t *walker;
- mblk_t *nmp, *salist;
- int i, error = 0;
- ip_stack_t *ipst = ill->ill_ipst;
- netstack_t *ns = ipst->ips_netstack;
-
- IPSECHW_DEBUG(IPSECHW_SADB, ("sadb_ill_df: fanout at 0x%p ne=%d\n",
- (void *)fanout, num_entries));
- /*
- * For each IPSA hash bucket do:
- * - Hold the mutex
- * - Walk each entry, sending a corresponding request to IP
- * for it.
- */
- ASSERT(mp->b_datap->db_type == M_PROTO);
-
- for (i = 0; i < num_entries; i++) {
- mutex_enter(&fanout[i].isaf_lock);
- salist = NULL;
-
- for (walker = fanout[i].isaf_ipsa; walker != NULL;
- walker = walker->ipsa_next) {
- IPSECHW_DEBUG(IPSECHW_SADB,
- ("sadb_ill_df: sending SA to ill via IP \n"));
- /*
- * Duplicate the template mp passed and
- * complete DL_CONTROL_REQ data.
- * To be more memory efficient, we could use
- * dupb() for the M_CTL and copyb() for the M_PROTO
- * as the M_CTL, since the M_CTL is the same for
- * every SA entry passed down to IP for the same ill.
- *
- * Note that copymsg/copyb ensure that the new mblk
- * is at least as large as the source mblk even if it's
- * not using all its storage -- therefore, nmp
- * has trailing space for sadb_req_from_sa to add
- * the SA-specific bits.
- */
- mutex_enter(&walker->ipsa_lock);
- if (ipsec_capab_match(ill,
- ill->ill_phyint->phyint_ifindex, ill->ill_isv6,
- walker, ns)) {
- nmp = copymsg(mp);
- if (nmp == NULL) {
- IPSECHW_DEBUG(IPSECHW_SADB,
- ("sadb_ill_df: alloc error\n"));
- error = ENOMEM;
- mutex_exit(&walker->ipsa_lock);
- break;
- }
- if (sadb_req_from_sa(walker, nmp, is_inbound)) {
- nmp->b_next = salist;
- salist = nmp;
- }
- }
- mutex_exit(&walker->ipsa_lock);
- }
- mutex_exit(&fanout[i].isaf_lock);
- while (salist != NULL) {
- nmp = salist;
- salist = nmp->b_next;
- nmp->b_next = NULL;
- ill_dlpi_send(ill, nmp);
- }
- if (error != 0)
- break; /* out of for loop. */
- }
-}
-
-/*
- * Called by ill_ipsec_capab_add(). Sends a copy of the SADB of
- * the type specified by sa_type to the specified ill.
- *
- * We call for each fanout table defined by the SADB (one per
- * protocol). sadb_ill_df() finally calls ill_dlpi_send() for
- * each SADB entry in order to send a corresponding DL_CONTROL_REQ
- * message to the ill.
- */
-void
-sadb_ill_download(ill_t *ill, uint_t sa_type)
-{
- mblk_t *protomp; /* prototype message */
- dl_control_req_t *ctrl;
- sadbp_t *spp;
- sadb_t *sp;
- int dlt;
- ip_stack_t *ipst = ill->ill_ipst;
- netstack_t *ns = ipst->ips_netstack;
-
- ASSERT(sa_type == SADB_SATYPE_AH || sa_type == SADB_SATYPE_ESP);
-
- /*
- * Allocate and initialize prototype answer. A duplicate for
- * each SA is sent down to the interface.
- */
-
- /* DL_CONTROL_REQ M_PROTO mblk_t */
- protomp = allocb(sizeof (dl_control_req_t) +
- sizeof (dl_ct_ipsec_key_t) + sizeof (dl_ct_ipsec_t), BPRI_HI);
- if (protomp == NULL)
- return;
- protomp->b_datap->db_type = M_PROTO;
-
- dlt = (sa_type == SADB_SATYPE_AH) ? DL_CT_IPSEC_AH : DL_CT_IPSEC_ESP;
- if (sa_type == SADB_SATYPE_ESP) {
- ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
-
- spp = &espstack->esp_sadb;
- } else {
- ipsecah_stack_t *ahstack = ns->netstack_ipsecah;
-
- spp = &ahstack->ah_sadb;
- }
-
- ctrl = (dl_control_req_t *)protomp->b_wptr;
- ctrl->dl_primitive = DL_CONTROL_REQ;
- ctrl->dl_operation = DL_CO_SET;
- ctrl->dl_type = dlt;
- ctrl->dl_key_offset = sizeof (dl_control_req_t);
- ctrl->dl_key_length = sizeof (dl_ct_ipsec_key_t);
- ctrl->dl_data_offset = sizeof (dl_control_req_t) +
- sizeof (dl_ct_ipsec_key_t);
- ctrl->dl_data_length = sizeof (dl_ct_ipsec_t);
- protomp->b_wptr += sizeof (dl_control_req_t);
-
- /*
- * then for each SADB entry, we fill out the dl_ct_ipsec_key_t
- * and dl_ct_ipsec_t
- */
- sp = ill->ill_isv6 ? &(spp->s_v6) : &(spp->s_v4);
- sadb_ill_df(ill, protomp, sp->sdb_of, sp->sdb_hashsize, B_FALSE);
- sadb_ill_df(ill, protomp, sp->sdb_if, sp->sdb_hashsize, B_TRUE);
- freemsg(protomp);
-}
-
-/*
* Call me to free up a security association fanout. Use the forever
* variable to indicate freeing up the SAs (forever == B_FALSE, e.g.
* an SADB_FLUSH message), or destroying everything (forever == B_TRUE,
@@ -1119,30 +785,11 @@ sadb_destroy(sadb_t *sp, netstack_t *ns)
ASSERT(sp->sdb_acq == NULL);
}
-static void
-sadb_send_flush_req(sadbp_t *spp)
-{
- mblk_t *ctl_mp;
-
- /*
- * we've been unplumbed, or never were plumbed; don't go there.
- */
- if (spp->s_ip_q == NULL)
- return;
-
- /* have IP send a flush msg to the IPsec accelerators */
- ctl_mp = sadb_fmt_sa_req(DL_CO_FLUSH, spp->s_satype, NULL, B_TRUE);
- if (ctl_mp != NULL)
- putnext(spp->s_ip_q, ctl_mp);
-}
-
void
sadbp_flush(sadbp_t *spp, netstack_t *ns)
{
sadb_flush(&spp->s_v4, ns);
sadb_flush(&spp->s_v6, ns);
-
- sadb_send_flush_req(spp);
}
void
@@ -1151,7 +798,6 @@ sadbp_destroy(sadbp_t *spp, netstack_t *ns)
sadb_destroy(&spp->s_v4, ns);
sadb_destroy(&spp->s_v6, ns);
- sadb_send_flush_req(spp);
if (spp->s_satype == SADB_SATYPE_AH) {
ipsec_stack_t *ipss = ns->netstack_ipsec;
@@ -1259,11 +905,11 @@ sadb_cloneassoc(ipsa_t *ipsa)
/* bzero and initialize locks, in case *_init() allocates... */
mutex_init(&newbie->ipsa_lock, NULL, MUTEX_DEFAULT, NULL);
- if (newbie->ipsa_cred != NULL)
- crhold(newbie->ipsa_cred);
+ if (newbie->ipsa_tsl != NULL)
+ label_hold(newbie->ipsa_tsl);
- if (newbie->ipsa_ocred != NULL)
- crhold(newbie->ipsa_ocred);
+ if (newbie->ipsa_otsl != NULL)
+ label_hold(newbie->ipsa_otsl);
/*
* While somewhat dain-bramaged, the most graceful way to
@@ -1554,14 +1200,14 @@ sadb_sa2msg(ipsa_t *ipsa, sadb_msg_t *samsg)
encr = B_FALSE;
}
- if (ipsa->ipsa_cred != NULL) {
- senslen = sadb_sens_len_from_cred(ipsa->ipsa_cred);
+ if (ipsa->ipsa_tsl != NULL) {
+ senslen = sadb_sens_len_from_label(ipsa->ipsa_tsl);
alloclen += senslen;
sensinteg = B_TRUE;
}
- if (ipsa->ipsa_ocred != NULL) {
- osenslen = sadb_sens_len_from_cred(ipsa->ipsa_ocred);
+ if (ipsa->ipsa_otsl != NULL) {
+ osenslen = sadb_sens_len_from_label(ipsa->ipsa_otsl);
alloclen += osenslen;
osensinteg = B_TRUE;
}
@@ -1792,8 +1438,8 @@ sadb_sa2msg(ipsa_t *ipsa, sadb_msg_t *samsg)
if (sensinteg) {
sens = (sadb_sens_t *)walker;
- sadb_sens_from_cred(sens, SADB_EXT_SENSITIVITY,
- ipsa->ipsa_cred, senslen);
+ sadb_sens_from_label(sens, SADB_EXT_SENSITIVITY,
+ ipsa->ipsa_tsl, senslen);
walker = (sadb_ext_t *)((uint64_t *)walker +
walker->sadb_ext_len);
@@ -1802,8 +1448,8 @@ sadb_sa2msg(ipsa_t *ipsa, sadb_msg_t *samsg)
if (osensinteg) {
sens = (sadb_sens_t *)walker;
- sadb_sens_from_cred(sens, SADB_X_EXT_OUTER_SENS,
- ipsa->ipsa_ocred, osenslen);
+ sadb_sens_from_label(sens, SADB_X_EXT_OUTER_SENS,
+ ipsa->ipsa_otsl, osenslen);
if (ipsa->ipsa_mac_exempt)
sens->sadb_x_sens_flags = SADB_X_SENS_IMPLICIT;
@@ -2123,7 +1769,6 @@ sadb_addrcheck(queue_t *pfkey_q, mblk_t *mp, sadb_ext_t *ext, uint_t serial,
sadb_address_t *addr = (sadb_address_t *)ext;
struct sockaddr_in *sin;
struct sockaddr_in6 *sin6;
- ire_t *ire;
int diagnostic, type;
boolean_t normalized = B_FALSE;
@@ -2249,18 +1894,12 @@ bail:
/*
* At this point, we're a unicast IPv6 address.
*
- * A ctable lookup for local is sufficient here. If we're
- * local, return KS_IN_ADDR_ME, otherwise KS_IN_ADDR_NOTME.
- *
* XXX Zones alert -> me/notme decision needs to be tempered
* by what zone we're in when we go to zone-aware IPsec.
*/
- ire = ire_ctable_lookup_v6(&sin6->sin6_addr, NULL,
- IRE_LOCAL, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE,
- ns->netstack_ip);
- if (ire != NULL) {
+ if (ip_type_v6(&sin6->sin6_addr, ns->netstack_ip) ==
+ IRE_LOCAL) {
/* Hey hey, it's local. */
- IRE_REFRELE(ire);
return (KS_IN_ADDR_ME);
}
} else {
@@ -2272,23 +1911,17 @@ bail:
/*
* At this point we're a unicast or broadcast IPv4 address.
*
- * Lookup on the ctable for IRE_BROADCAST or IRE_LOCAL.
- * A NULL return value is NOTME, otherwise, look at the
- * returned ire for broadcast or not and return accordingly.
+ * Check if the address is IRE_BROADCAST or IRE_LOCAL.
*
* XXX Zones alert -> me/notme decision needs to be tempered
* by what zone we're in when we go to zone-aware IPsec.
*/
- ire = ire_ctable_lookup(sin->sin_addr.s_addr, 0,
- IRE_LOCAL | IRE_BROADCAST, NULL, ALL_ZONES, NULL,
- MATCH_IRE_TYPE, ns->netstack_ip);
- if (ire != NULL) {
- /* Check for local or broadcast */
- type = ire->ire_type;
- IRE_REFRELE(ire);
- ASSERT(type == IRE_LOCAL || type == IRE_BROADCAST);
- return ((type == IRE_LOCAL) ? KS_IN_ADDR_ME :
- KS_IN_ADDR_MBCAST);
+ type = ip_type_v4(sin->sin_addr.s_addr, ns->netstack_ip);
+ switch (type) {
+ case IRE_LOCAL:
+ return (KS_IN_ADDR_ME);
+ case IRE_BROADCAST:
+ return (KS_IN_ADDR_MBCAST);
}
}
@@ -2763,7 +2396,6 @@ struct sadb_purge_state
ipsa_query_t sq;
boolean_t inbnd;
uint8_t sadb_sa_state;
- mblk_t *mq;
};
static void
@@ -2785,7 +2417,7 @@ sadb_purge_cb(isaf_t *head, ipsa_t *entry, void *cookie)
sadb_delete_cluster(entry);
}
entry->ipsa_state = IPSA_STATE_DEAD;
- (void) sadb_torch_assoc(head, entry, ps->inbnd, &ps->mq);
+ (void) sadb_torch_assoc(head, entry);
}
/*
@@ -2794,15 +2426,13 @@ sadb_purge_cb(isaf_t *head, ipsa_t *entry, void *cookie)
*/
int
sadb_purge_sa(mblk_t *mp, keysock_in_t *ksi, sadb_t *sp,
- int *diagnostic, queue_t *pfkey_q, queue_t *ip_q)
+ int *diagnostic, queue_t *pfkey_q)
{
struct sadb_purge_state ps;
int error = sadb_form_query(ksi, 0,
IPSA_Q_SRC|IPSA_Q_DST|IPSA_Q_SRCID|IPSA_Q_DSTID|IPSA_Q_KMC,
&ps.sq, diagnostic);
- ps.mq = NULL;
-
if (error != 0)
return (error);
@@ -2819,9 +2449,6 @@ sadb_purge_sa(mblk_t *mp, keysock_in_t *ksi, sadb_t *sp,
ps.inbnd = B_FALSE;
sadb_walker(sp->sdb_of, sp->sdb_hashsize, sadb_purge_cb, &ps);
- if (ps.mq != NULL)
- sadb_drain_torchq(ip_q, ps.mq);
-
ASSERT(mp->b_cont != NULL);
sadb_pfkey_echo(pfkey_q, mp, (sadb_msg_t *)mp->b_cont->b_rptr, ksi,
NULL);
@@ -2870,12 +2497,11 @@ sadb_delpair_state_one(isaf_t *head, ipsa_t *entry, void *cookie)
}
entry->ipsa_state = IPSA_STATE_DEAD;
- (void) sadb_torch_assoc(head, entry, B_FALSE, &ps->mq);
+ (void) sadb_torch_assoc(head, entry);
if (peer_assoc != NULL) {
mutex_enter(&peer_assoc->ipsa_lock);
peer_assoc->ipsa_state = IPSA_STATE_DEAD;
- (void) sadb_torch_assoc(inbound_bucket, peer_assoc,
- B_FALSE, &ps->mq);
+ (void) sadb_torch_assoc(inbound_bucket, peer_assoc);
}
mutex_exit(&inbound_bucket->isaf_lock);
}
@@ -2889,7 +2515,6 @@ sadb_delpair_state(mblk_t *mp, keysock_in_t *ksi, sadbp_t *spp,
int error;
ps.sq.spp = spp; /* XXX param */
- ps.mq = NULL;
error = sadb_form_query(ksi, IPSA_Q_DST|IPSA_Q_SRC,
IPSA_Q_SRC|IPSA_Q_DST|IPSA_Q_SRCID|IPSA_Q_DSTID|IPSA_Q_KMC,
@@ -2902,9 +2527,6 @@ sadb_delpair_state(mblk_t *mp, keysock_in_t *ksi, sadbp_t *spp,
sadb_walker(ps.sq.sp->sdb_of, ps.sq.sp->sdb_hashsize,
sadb_delpair_state_one, &ps);
- if (ps.mq != NULL)
- sadb_drain_torchq(pfkey_q, ps.mq);
-
ASSERT(mp->b_cont != NULL);
sadb_pfkey_echo(pfkey_q, mp, (sadb_msg_t *)mp->b_cont->b_rptr,
ksi, NULL);
@@ -2921,7 +2543,6 @@ sadb_delget_sa(mblk_t *mp, keysock_in_t *ksi, sadbp_t *spp,
ipsa_query_t sq;
ipsa_t *echo_target = NULL;
ipsap_t ipsapp;
- mblk_t *torchq = NULL;
uint_t error = 0;
if (sadb_msg_type == SADB_X_DELPAIR_STATE)
@@ -2965,7 +2586,7 @@ sadb_delget_sa(mblk_t *mp, keysock_in_t *ksi, sadbp_t *spp,
}
ipsapp.ipsap_sa_ptr->ipsa_state = IPSA_STATE_DEAD;
(void) sadb_torch_assoc(ipsapp.ipsap_bucket,
- ipsapp.ipsap_sa_ptr, B_FALSE, &torchq);
+ ipsapp.ipsap_sa_ptr);
/*
* sadb_torch_assoc() releases the ipsa_lock
* and calls sadb_unlinkassoc() which does a
@@ -2984,7 +2605,7 @@ sadb_delget_sa(mblk_t *mp, keysock_in_t *ksi, sadbp_t *spp,
ipsapp.ipsap_psa_ptr->ipsa_state =
IPSA_STATE_DEAD;
(void) sadb_torch_assoc(ipsapp.ipsap_pbucket,
- ipsapp.ipsap_psa_ptr, B_FALSE, &torchq);
+ ipsapp.ipsap_psa_ptr);
} else {
/*
* Only half of the "pair" has been deleted.
@@ -3004,9 +2625,6 @@ sadb_delget_sa(mblk_t *mp, keysock_in_t *ksi, sadbp_t *spp,
mutex_exit(&ipsapp.ipsap_pbucket->isaf_lock);
}
- if (torchq != NULL)
- sadb_drain_torchq(spp->s_ip_q, torchq);
-
ASSERT(mp->b_cont != NULL);
if (error == 0)
@@ -3269,7 +2887,7 @@ sadb_nat_calculations(ipsa_t *newbie, sadb_address_t *natt_loc_ext,
* case here.
*/
int
-sadb_common_add(queue_t *ip_q, queue_t *pfkey_q, mblk_t *mp, sadb_msg_t *samsg,
+sadb_common_add(queue_t *pfkey_q, mblk_t *mp, sadb_msg_t *samsg,
keysock_in_t *ksi, isaf_t *primary, isaf_t *secondary,
ipsa_t *newbie, boolean_t clone, boolean_t is_inbound, int *diagnostic,
netstack_t *ns, sadbp_t *spp)
@@ -3313,11 +2931,11 @@ sadb_common_add(queue_t *ip_q, queue_t *pfkey_q, mblk_t *mp, sadb_msg_t *samsg,
int error = 0;
boolean_t isupdate = (newbie != NULL);
uint32_t *src_addr_ptr, *dst_addr_ptr, *isrc_addr_ptr, *idst_addr_ptr;
- mblk_t *ctl_mp = NULL;
ipsec_stack_t *ipss = ns->netstack_ipsec;
ip_stack_t *ipst = ns->netstack_ip;
ipsec_alginfo_t *alg;
int rcode;
+ boolean_t async = B_FALSE;
init_ipsa_pair(&ipsapp);
@@ -3549,7 +3167,14 @@ sadb_common_add(queue_t *ip_q, queue_t *pfkey_q, mblk_t *mp, sadb_msg_t *samsg,
newbie->ipsa_authtmpl = NULL;
newbie->ipsa_encrtmpl = NULL;
+#ifdef IPSEC_LATENCY_TEST
+ if (akey != NULL && newbie->ipsa_auth_alg != SADB_AALG_NONE) {
+#else
if (akey != NULL) {
+#endif
+ async = (ipss->ipsec_algs_exec_mode[IPSEC_ALG_AUTH] ==
+ IPSEC_ALGS_EXEC_ASYNC);
+
newbie->ipsa_authkeybits = akey->sadb_key_bits;
newbie->ipsa_authkeylen = SADB_1TO8(akey->sadb_key_bits);
/* In case we have to round up to the next byte... */
@@ -3604,6 +3229,8 @@ sadb_common_add(queue_t *ip_q, queue_t *pfkey_q, mblk_t *mp, sadb_msg_t *samsg,
if (ekey != NULL) {
mutex_enter(&ipss->ipsec_alg_lock);
+ async = async || (ipss->ipsec_algs_exec_mode[IPSEC_ALG_ENCR] ==
+ IPSEC_ALGS_EXEC_ASYNC);
alg = ipss->ipsec_alglists[IPSEC_ALG_ENCR]
[newbie->ipsa_encr_alg];
@@ -3757,6 +3384,9 @@ sadb_common_add(queue_t *ip_q, queue_t *pfkey_q, mblk_t *mp, sadb_msg_t *samsg,
}
}
+ if (async)
+ newbie->ipsa_flags |= IPSA_F_ASYNC;
+
/*
* Ptrs to processing functions.
*/
@@ -3812,7 +3442,7 @@ sadb_common_add(queue_t *ip_q, queue_t *pfkey_q, mblk_t *mp, sadb_msg_t *samsg,
if (sens != NULL) {
uint64_t *bitmap = (uint64_t *)(sens + 1);
- newbie->ipsa_cred = sadb_cred_from_sens(sens, bitmap);
+ newbie->ipsa_tsl = sadb_label_from_sens(sens, bitmap);
}
/*
@@ -3820,41 +3450,55 @@ sadb_common_add(queue_t *ip_q, queue_t *pfkey_q, mblk_t *mp, sadb_msg_t *samsg,
*/
if (osens != NULL) {
uint64_t *bitmap = (uint64_t *)(osens + 1);
- cred_t *cred, *effective_cred;
+ ts_label_t *tsl, *effective_tsl;
uint32_t *peer_addr_ptr;
+ zoneid_t zoneid = GLOBAL_ZONEID;
+ zone_t *zone;
peer_addr_ptr = is_inbound ? src_addr_ptr : dst_addr_ptr;
- cred = sadb_cred_from_sens(osens, bitmap);
+ tsl = sadb_label_from_sens(osens, bitmap);
newbie->ipsa_mac_exempt = CONN_MAC_DEFAULT;
if (osens->sadb_x_sens_flags & SADB_X_SENS_IMPLICIT) {
newbie->ipsa_mac_exempt = CONN_MAC_IMPLICIT;
}
- error = tsol_check_dest(cred, peer_addr_ptr,
+ error = tsol_check_dest(tsl, peer_addr_ptr,
(af == AF_INET6)?IPV6_VERSION:IPV4_VERSION,
- newbie->ipsa_mac_exempt, &effective_cred);
+ newbie->ipsa_mac_exempt, B_TRUE, &effective_tsl);
if (error != 0) {
- crfree(cred);
+ label_rele(tsl);
mutex_exit(&newbie->ipsa_lock);
goto error;
}
- if (effective_cred != NULL) {
- crfree(cred);
- cred = effective_cred;
+ if (effective_tsl != NULL) {
+ label_rele(tsl);
+ tsl = effective_tsl;
}
- newbie->ipsa_ocred = cred;
+ newbie->ipsa_otsl = tsl;
+
+ zone = zone_find_by_label(tsl);
+ if (zone != NULL) {
+ zoneid = zone->zone_id;
+ zone_rele(zone);
+ }
+ /*
+ * For exclusive stacks we set the zoneid to zero to operate
+ * as if in the global zone for tsol_compute_label_v4/v6
+ */
+ if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID)
+ zoneid = GLOBAL_ZONEID;
if (af == AF_INET6) {
- error = tsol_compute_label_v6(cred,
+ error = tsol_compute_label_v6(tsl, zoneid,
(in6_addr_t *)peer_addr_ptr,
newbie->ipsa_opt_storage, ipst);
} else {
- error = tsol_compute_label(cred, *peer_addr_ptr,
- newbie->ipsa_opt_storage, ipst);
+ error = tsol_compute_label_v4(tsl, zoneid,
+ *peer_addr_ptr, newbie->ipsa_opt_storage, ipst);
}
if (error != 0) {
mutex_exit(&newbie->ipsa_lock);
@@ -3916,9 +3560,6 @@ sadb_common_add(queue_t *ip_q, queue_t *pfkey_q, mblk_t *mp, sadb_msg_t *samsg,
mutex_enter(&primary->isaf_lock);
}
- IPSECHW_DEBUG(IPSECHW_SADB, ("sadb_common_add: spi = 0x%x\n",
- newbie->ipsa_spi));
-
/*
* sadb_insertassoc() doesn't increment the reference
* count. We therefore have to increment the
@@ -3938,10 +3579,6 @@ sadb_common_add(queue_t *ip_q, queue_t *pfkey_q, mblk_t *mp, sadb_msg_t *samsg,
mutex_enter(&newbie->ipsa_lock);
error = sadb_insertassoc(newbie, primary);
- if (error == 0) {
- ctl_mp = sadb_fmt_sa_req(DL_CO_SET, newbie->ipsa_type, newbie,
- is_inbound);
- }
mutex_exit(&newbie->ipsa_lock);
if (error != 0) {
@@ -3982,13 +3619,6 @@ sadb_common_add(queue_t *ip_q, queue_t *pfkey_q, mblk_t *mp, sadb_msg_t *samsg,
ASSERT(MUTEX_NOT_HELD(&newbie->ipsa_lock));
ASSERT(newbie_clone == NULL ||
(MUTEX_NOT_HELD(&newbie_clone->ipsa_lock)));
- /*
- * If hardware acceleration could happen, send it.
- */
- if (ctl_mp != NULL) {
- putnext(ip_q, ctl_mp);
- ctl_mp = NULL;
- }
error_unlock:
@@ -4037,8 +3667,6 @@ error:
if (newbie_clone != NULL) {
IPSA_REFRELE(newbie_clone);
}
- if (ctl_mp != NULL)
- freemsg(ctl_mp);
if (error == 0) {
/*
@@ -4315,37 +3943,12 @@ sadb_age_bytes(queue_t *pfkey_q, ipsa_t *assoc, uint64_t bytes,
}
/*
- * Push one or more DL_CO_DELETE messages queued up by
- * sadb_torch_assoc down to the underlying driver now that it's a
- * convenient time for it (i.e., ipsa bucket locks not held).
- */
-static void
-sadb_drain_torchq(queue_t *q, mblk_t *mp)
-{
- while (mp != NULL) {
- mblk_t *next = mp->b_next;
- mp->b_next = NULL;
- if (q != NULL)
- putnext(q, mp);
- else
- freemsg(mp);
- mp = next;
- }
-}
-
-/*
* "Torch" an individual SA. Returns NULL, so it can be tail-called from
* sadb_age_assoc().
- *
- * If SA is hardware-accelerated, and we can't allocate the mblk
- * containing the DL_CO_DELETE, just return; it will remain in the
- * table and be swept up by sadb_ager() in a subsequent pass.
*/
static ipsa_t *
-sadb_torch_assoc(isaf_t *head, ipsa_t *sa, boolean_t inbnd, mblk_t **mq)
+sadb_torch_assoc(isaf_t *head, ipsa_t *sa)
{
- mblk_t *mp;
-
ASSERT(MUTEX_HELD(&head->isaf_lock));
ASSERT(MUTEX_HELD(&sa->ipsa_lock));
ASSERT(sa->ipsa_state == IPSA_STATE_DEAD);
@@ -4355,15 +3958,6 @@ sadb_torch_assoc(isaf_t *head, ipsa_t *sa, boolean_t inbnd, mblk_t **mq)
*/
head->isaf_gen++;
- if (sa->ipsa_flags & IPSA_F_HW) {
- mp = sadb_fmt_sa_req(DL_CO_DELETE, sa->ipsa_type, sa, inbnd);
- if (mp == NULL) {
- mutex_exit(&sa->ipsa_lock);
- return (NULL);
- }
- mp->b_next = *mq;
- *mq = mp;
- }
mutex_exit(&sa->ipsa_lock);
sadb_unlinkassoc(sa);
@@ -4404,7 +3998,7 @@ sadb_idle_activities(ipsa_t *assoc, time_t delta, boolean_t inbound)
*/
static ipsa_t *
sadb_age_assoc(isaf_t *head, queue_t *pfkey_q, ipsa_t *assoc,
- time_t current, int reap_delay, boolean_t inbound, mblk_t **mq)
+ time_t current, int reap_delay, boolean_t inbound)
{
ipsa_t *retval = NULL;
boolean_t dropped_mutex = B_FALSE;
@@ -4419,7 +4013,7 @@ sadb_age_assoc(isaf_t *head, queue_t *pfkey_q, ipsa_t *assoc,
(assoc->ipsa_hardexpiretime != 0))) &&
(assoc->ipsa_hardexpiretime <= current)) {
assoc->ipsa_state = IPSA_STATE_DEAD;
- return (sadb_torch_assoc(head, assoc, inbound, mq));
+ return (sadb_torch_assoc(head, assoc));
}
/*
@@ -4433,7 +4027,7 @@ sadb_age_assoc(isaf_t *head, queue_t *pfkey_q, ipsa_t *assoc,
if (assoc->ipsa_hardexpiretime != 0 &&
assoc->ipsa_hardexpiretime <= current) {
if (assoc->ipsa_state == IPSA_STATE_DEAD)
- return (sadb_torch_assoc(head, assoc, inbound, mq));
+ return (sadb_torch_assoc(head, assoc));
if (inbound) {
sadb_delete_cluster(assoc);
@@ -4516,8 +4110,7 @@ sadb_age_assoc(isaf_t *head, queue_t *pfkey_q, ipsa_t *assoc,
* the second time sadb_ager() runs.
*/
void
-sadb_ager(sadb_t *sp, queue_t *pfkey_q, queue_t *ip_q, int reap_delay,
- netstack_t *ns)
+sadb_ager(sadb_t *sp, queue_t *pfkey_q, int reap_delay, netstack_t *ns)
{
int i;
isaf_t *bucket;
@@ -4527,7 +4120,6 @@ sadb_ager(sadb_t *sp, queue_t *pfkey_q, queue_t *ip_q, int reap_delay,
templist_t *haspeerlist, *newbie;
/* Snapshot current time now. */
time_t current = gethrestime_sec();
- mblk_t *mq = NULL;
haspeerlist = NULL;
/*
@@ -4559,7 +4151,7 @@ sadb_ager(sadb_t *sp, queue_t *pfkey_q, queue_t *ip_q, int reap_delay,
assoc = spare) {
spare = assoc->ipsa_next;
if (sadb_age_assoc(bucket, pfkey_q, assoc, current,
- reap_delay, B_TRUE, &mq) != NULL) {
+ reap_delay, B_TRUE) != NULL) {
/*
* Put SA's which have a peer or SA's which
* are paired on a list for processing after
@@ -4585,10 +4177,6 @@ sadb_ager(sadb_t *sp, queue_t *pfkey_q, queue_t *ip_q, int reap_delay,
mutex_exit(&bucket->isaf_lock);
}
- if (mq != NULL) {
- sadb_drain_torchq(ip_q, mq);
- mq = NULL;
- }
age_pair_peer_list(haspeerlist, sp, B_FALSE);
haspeerlist = NULL;
@@ -4600,7 +4188,7 @@ sadb_ager(sadb_t *sp, queue_t *pfkey_q, queue_t *ip_q, int reap_delay,
assoc = spare) {
spare = assoc->ipsa_next;
if (sadb_age_assoc(bucket, pfkey_q, assoc, current,
- reap_delay, B_FALSE, &mq) != NULL) {
+ reap_delay, B_FALSE) != NULL) {
/*
* sadb_age_assoc() increments the refcnt,
* effectively doing an IPSA_REFHOLD().
@@ -4621,10 +4209,6 @@ sadb_ager(sadb_t *sp, queue_t *pfkey_q, queue_t *ip_q, int reap_delay,
}
mutex_exit(&bucket->isaf_lock);
}
- if (mq != NULL) {
- sadb_drain_torchq(ip_q, mq);
- mq = NULL;
- }
age_pair_peer_list(haspeerlist, sp, B_TRUE);
@@ -5227,7 +4811,7 @@ update_pairing(ipsap_t *ipsapp, ipsa_query_t *sq, keysock_in_t *ksi,
static ipsacq_t *
sadb_checkacquire(iacqf_t *bucket, ipsec_action_t *ap, ipsec_policy_t *pp,
uint32_t *src, uint32_t *dst, uint32_t *isrc, uint32_t *idst,
- uint64_t unique_id, cred_t *cr)
+ uint64_t unique_id, ts_label_t *tsl)
{
ipsacq_t *walker;
sa_family_t fam;
@@ -5257,7 +4841,7 @@ sadb_checkacquire(iacqf_t *bucket, ipsec_action_t *ap, ipsec_policy_t *pp,
(pp == walker->ipsacq_policy) &&
/* XXX do deep compares of ap/pp? */
(unique_id == walker->ipsacq_unique_id) &&
- (ipsec_label_match(cr, walker->ipsacq_cred)))
+ (ipsec_label_match(tsl, walker->ipsacq_tsl)))
break; /* everything matched */
mutex_exit(&walker->ipsacq_lock);
}
@@ -5272,31 +4856,32 @@ sadb_checkacquire(iacqf_t *bucket, ipsec_action_t *ap, ipsec_policy_t *pp,
* send the acquire up..
*
* In cases where we need both AH and ESP, add the SA to the ESP ACQUIRE
- * list. The ah_add_sa_finish() routines can look at the packet's ipsec_out_t
- * and handle this case specially.
+ * list. The ah_add_sa_finish() routines can look at the packet's attached
+ * attributes and handle this case specially.
*/
void
-sadb_acquire(mblk_t *mp, ipsec_out_t *io, boolean_t need_ah, boolean_t need_esp)
+sadb_acquire(mblk_t *datamp, ip_xmit_attr_t *ixa, boolean_t need_ah,
+ boolean_t need_esp)
{
+ mblk_t *asyncmp;
sadbp_t *spp;
sadb_t *sp;
ipsacq_t *newbie;
iacqf_t *bucket;
- mblk_t *datamp = mp->b_cont;
mblk_t *extended;
ipha_t *ipha = (ipha_t *)datamp->b_rptr;
ip6_t *ip6h = (ip6_t *)datamp->b_rptr;
uint32_t *src, *dst, *isrc, *idst;
- ipsec_policy_t *pp = io->ipsec_out_policy;
- ipsec_action_t *ap = io->ipsec_out_act;
+ ipsec_policy_t *pp = ixa->ixa_ipsec_policy;
+ ipsec_action_t *ap = ixa->ixa_ipsec_action;
sa_family_t af;
int hashoffset;
uint32_t seq;
uint64_t unique_id = 0;
ipsec_selector_t sel;
- boolean_t tunnel_mode = io->ipsec_out_tunnel;
- cred_t *cr = NULL;
- netstack_t *ns = io->ipsec_out_ns;
+ boolean_t tunnel_mode = (ixa->ixa_flags & IXAF_IPSEC_TUNNEL) != 0;
+ ts_label_t *tsl = NULL;
+ netstack_t *ns = ixa->ixa_ipst->ips_netstack;
ipsec_stack_t *ipss = ns->netstack_ipsec;
sadb_sens_t *sens = NULL;
int sens_len;
@@ -5315,12 +4900,10 @@ sadb_acquire(mblk_t *mp, ipsec_out_t *io, boolean_t need_ah, boolean_t need_esp)
spp = &ahstack->ah_sadb;
}
- sp = io->ipsec_out_v4 ? &spp->s_v4 : &spp->s_v6;
-
- ASSERT(mp->b_cont != NULL);
+ sp = (ixa->ixa_flags & IXAF_IS_IPV4) ? &spp->s_v4 : &spp->s_v6;
if (is_system_labeled())
- cr = msg_getcred(mp->b_cont, NULL);
+ tsl = ixa->ixa_tsl;
if (ap == NULL)
ap = pp->ipsp_act;
@@ -5328,7 +4911,7 @@ sadb_acquire(mblk_t *mp, ipsec_out_t *io, boolean_t need_ah, boolean_t need_esp)
ASSERT(ap != NULL);
if (ap->ipa_act.ipa_apply.ipp_use_unique || tunnel_mode)
- unique_id = SA_FORM_UNIQUE_ID(io);
+ unique_id = SA_FORM_UNIQUE_ID(ixa);
/*
* Set up an ACQUIRE record.
@@ -5345,14 +4928,14 @@ sadb_acquire(mblk_t *mp, ipsec_out_t *io, boolean_t need_ah, boolean_t need_esp)
dst = (uint32_t *)&ipha->ipha_dst;
af = AF_INET;
hashoffset = OUTBOUND_HASH_V4(sp, ipha->ipha_dst);
- ASSERT(io->ipsec_out_v4 == B_TRUE);
+ ASSERT(ixa->ixa_flags & IXAF_IS_IPV4);
} else {
ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION);
src = (uint32_t *)&ip6h->ip6_src;
dst = (uint32_t *)&ip6h->ip6_dst;
af = AF_INET6;
hashoffset = OUTBOUND_HASH_V6(sp, ip6h->ip6_dst);
- ASSERT(io->ipsec_out_v4 == B_FALSE);
+ ASSERT(!(ixa->ixa_flags & IXAF_IS_IPV4));
}
if (tunnel_mode) {
@@ -5363,14 +4946,14 @@ sadb_acquire(mblk_t *mp, ipsec_out_t *io, boolean_t need_ah, boolean_t need_esp)
* with self-encapsulated protection. Until we better
* support this, drop the packet.
*/
- ip_drop_packet(mp, B_FALSE, NULL, NULL,
+ ip_drop_packet(datamp, B_FALSE, NULL,
DROPPER(ipss, ipds_spd_got_selfencap),
&ipss->ipsec_spd_dropper);
return;
}
/* Snag inner addresses. */
- isrc = io->ipsec_out_insrc;
- idst = io->ipsec_out_indst;
+ isrc = ixa->ixa_ipsec_insrc;
+ idst = ixa->ixa_ipsec_indst;
} else {
isrc = idst = NULL;
}
@@ -5382,7 +4965,7 @@ sadb_acquire(mblk_t *mp, ipsec_out_t *io, boolean_t need_ah, boolean_t need_esp)
bucket = &(sp->sdb_acq[hashoffset]);
mutex_enter(&bucket->iacqf_lock);
newbie = sadb_checkacquire(bucket, ap, pp, src, dst, isrc, idst,
- unique_id, cr);
+ unique_id, tsl);
if (newbie == NULL) {
/*
@@ -5391,7 +4974,7 @@ sadb_acquire(mblk_t *mp, ipsec_out_t *io, boolean_t need_ah, boolean_t need_esp)
newbie = kmem_zalloc(sizeof (*newbie), KM_NOSLEEP);
if (newbie == NULL) {
mutex_exit(&bucket->iacqf_lock);
- ip_drop_packet(mp, B_FALSE, NULL, NULL,
+ ip_drop_packet(datamp, B_FALSE, NULL,
DROPPER(ipss, ipds_sadb_acquire_nomem),
&ipss->ipsec_sadb_dropper);
return;
@@ -5433,11 +5016,30 @@ sadb_acquire(mblk_t *mp, ipsec_out_t *io, boolean_t need_ah, boolean_t need_esp)
*/
ASSERT(MUTEX_HELD(&newbie->ipsacq_lock));
- mp->b_next = NULL;
+ /*
+ * Make the ip_xmit_attr_t into something we can queue.
+ * If no memory it frees datamp.
+ */
+ asyncmp = ip_xmit_attr_to_mblk(ixa);
+ if (asyncmp != NULL)
+ linkb(asyncmp, datamp);
+
/* Queue up packet. Use b_next. */
- if (newbie->ipsacq_numpackets == 0) {
+
+ if (asyncmp == NULL) {
+ /* Statistics for allocation failure */
+ if (ixa->ixa_flags & IXAF_IS_IPV4) {
+ BUMP_MIB(&ixa->ixa_ipst->ips_ip_mib,
+ ipIfStatsOutDiscards);
+ } else {
+ BUMP_MIB(&ixa->ixa_ipst->ips_ip6_mib,
+ ipIfStatsOutDiscards);
+ }
+ ip_drop_output("No memory for asyncmp", datamp, NULL);
+ freemsg(datamp);
+ } else if (newbie->ipsacq_numpackets == 0) {
/* First one. */
- newbie->ipsacq_mp = mp;
+ newbie->ipsacq_mp = asyncmp;
newbie->ipsacq_numpackets = 1;
newbie->ipsacq_expire = gethrestime_sec();
/*
@@ -5448,28 +5050,28 @@ sadb_acquire(mblk_t *mp, ipsec_out_t *io, boolean_t need_ah, boolean_t need_esp)
newbie->ipsacq_seq = seq;
newbie->ipsacq_addrfam = af;
- newbie->ipsacq_srcport = io->ipsec_out_src_port;
- newbie->ipsacq_dstport = io->ipsec_out_dst_port;
- newbie->ipsacq_icmp_type = io->ipsec_out_icmp_type;
- newbie->ipsacq_icmp_code = io->ipsec_out_icmp_code;
+ newbie->ipsacq_srcport = ixa->ixa_ipsec_src_port;
+ newbie->ipsacq_dstport = ixa->ixa_ipsec_dst_port;
+ newbie->ipsacq_icmp_type = ixa->ixa_ipsec_icmp_type;
+ newbie->ipsacq_icmp_code = ixa->ixa_ipsec_icmp_code;
if (tunnel_mode) {
- newbie->ipsacq_inneraddrfam = io->ipsec_out_inaf;
- newbie->ipsacq_proto = io->ipsec_out_inaf == AF_INET6 ?
+ newbie->ipsacq_inneraddrfam = ixa->ixa_ipsec_inaf;
+ newbie->ipsacq_proto = ixa->ixa_ipsec_inaf == AF_INET6 ?
IPPROTO_IPV6 : IPPROTO_ENCAP;
- newbie->ipsacq_innersrcpfx = io->ipsec_out_insrcpfx;
- newbie->ipsacq_innerdstpfx = io->ipsec_out_indstpfx;
+ newbie->ipsacq_innersrcpfx = ixa->ixa_ipsec_insrcpfx;
+ newbie->ipsacq_innerdstpfx = ixa->ixa_ipsec_indstpfx;
IPSA_COPY_ADDR(newbie->ipsacq_innersrc,
- io->ipsec_out_insrc, io->ipsec_out_inaf);
+ ixa->ixa_ipsec_insrc, ixa->ixa_ipsec_inaf);
IPSA_COPY_ADDR(newbie->ipsacq_innerdst,
- io->ipsec_out_indst, io->ipsec_out_inaf);
+ ixa->ixa_ipsec_indst, ixa->ixa_ipsec_inaf);
} else {
- newbie->ipsacq_proto = io->ipsec_out_proto;
+ newbie->ipsacq_proto = ixa->ixa_ipsec_proto;
}
newbie->ipsacq_unique_id = unique_id;
- if (cr != NULL) {
- crhold(cr);
- newbie->ipsacq_cred = cr;
+ if (ixa->ixa_tsl != NULL) {
+ label_hold(ixa->ixa_tsl);
+ newbie->ipsacq_tsl = ixa->ixa_tsl;
}
} else {
/* Scan to the end of the list & insert. */
@@ -5477,13 +5079,16 @@ sadb_acquire(mblk_t *mp, ipsec_out_t *io, boolean_t need_ah, boolean_t need_esp)
while (lastone->b_next != NULL)
lastone = lastone->b_next;
- lastone->b_next = mp;
+ lastone->b_next = asyncmp;
if (newbie->ipsacq_numpackets++ == ipsacq_maxpackets) {
newbie->ipsacq_numpackets = ipsacq_maxpackets;
lastone = newbie->ipsacq_mp;
newbie->ipsacq_mp = lastone->b_next;
lastone->b_next = NULL;
- ip_drop_packet(lastone, B_FALSE, NULL, NULL,
+
+ /* Freeing the async message */
+ lastone = ip_xmit_attr_free_mblk(lastone);
+ ip_drop_packet(lastone, B_FALSE, NULL,
DROPPER(ipss, ipds_sadb_acquire_toofull),
&ipss->ipsec_sadb_dropper);
} else {
@@ -5518,17 +5123,17 @@ sadb_acquire(mblk_t *mp, ipsec_out_t *io, boolean_t need_ah, boolean_t need_esp)
* opportunities here in failure cases.
*/
(void) memset(&sel, 0, sizeof (sel));
- sel.ips_isv4 = io->ipsec_out_v4;
+ sel.ips_isv4 = (ixa->ixa_flags & IXAF_IS_IPV4) != 0;
if (tunnel_mode) {
- sel.ips_protocol = (io->ipsec_out_inaf == AF_INET) ?
+ sel.ips_protocol = (ixa->ixa_ipsec_inaf == AF_INET) ?
IPPROTO_ENCAP : IPPROTO_IPV6;
} else {
- sel.ips_protocol = io->ipsec_out_proto;
- sel.ips_local_port = io->ipsec_out_src_port;
- sel.ips_remote_port = io->ipsec_out_dst_port;
+ sel.ips_protocol = ixa->ixa_ipsec_proto;
+ sel.ips_local_port = ixa->ixa_ipsec_src_port;
+ sel.ips_remote_port = ixa->ixa_ipsec_dst_port;
}
- sel.ips_icmp_type = io->ipsec_out_icmp_type;
- sel.ips_icmp_code = io->ipsec_out_icmp_code;
+ sel.ips_icmp_type = ixa->ixa_ipsec_icmp_type;
+ sel.ips_icmp_code = ixa->ixa_ipsec_icmp_code;
sel.ips_is_icmp_inv_acq = 0;
if (af == AF_INET) {
sel.ips_local_addr_v4 = ipha->ipha_src;
@@ -5542,13 +5147,13 @@ sadb_acquire(mblk_t *mp, ipsec_out_t *io, boolean_t need_ah, boolean_t need_esp)
if (extended == NULL)
goto punt_extended;
- if (cr != NULL) {
+ if (ixa->ixa_tsl != NULL) {
/*
* XXX MLS correct condition here?
* XXX MLS other credential attributes in acquire?
* XXX malloc failure? don't fall back to original?
*/
- sens = sadb_make_sens_ext(cr, &sens_len);
+ sens = sadb_make_sens_ext(ixa->ixa_tsl, &sens_len);
if (sens == NULL) {
freeb(extended);
@@ -5585,13 +5190,13 @@ punt_extended:
void
sadb_destroy_acquire(ipsacq_t *acqrec, netstack_t *ns)
{
- mblk_t *mp;
+ mblk_t *mp;
ipsec_stack_t *ipss = ns->netstack_ipsec;
ASSERT(MUTEX_HELD(acqrec->ipsacq_linklock));
if (acqrec->ipsacq_policy != NULL) {
- IPPOL_REFRELE(acqrec->ipsacq_policy, ns);
+ IPPOL_REFRELE(acqrec->ipsacq_policy);
}
if (acqrec->ipsacq_act != NULL) {
IPACT_REFRELE(acqrec->ipsacq_act);
@@ -5602,9 +5207,9 @@ sadb_destroy_acquire(ipsacq_t *acqrec, netstack_t *ns)
if (acqrec->ipsacq_next != NULL)
acqrec->ipsacq_next->ipsacq_ptpn = acqrec->ipsacq_ptpn;
- if (acqrec->ipsacq_cred) {
- crfree(acqrec->ipsacq_cred);
- acqrec->ipsacq_cred = NULL;
+ if (acqrec->ipsacq_tsl != NULL) {
+ label_rele(acqrec->ipsacq_tsl);
+ acqrec->ipsacq_tsl = NULL;
}
/*
@@ -5618,7 +5223,9 @@ sadb_destroy_acquire(ipsacq_t *acqrec, netstack_t *ns)
mp = acqrec->ipsacq_mp;
acqrec->ipsacq_mp = mp->b_next;
mp->b_next = NULL;
- ip_drop_packet(mp, B_FALSE, NULL, NULL,
+ /* Freeing the async message */
+ mp = ip_xmit_attr_free_mblk(mp);
+ ip_drop_packet(mp, B_FALSE, NULL,
DROPPER(ipss, ipds_sadb_acquire_timeout),
&ipss->ipsec_sadb_dropper);
}
@@ -5795,24 +5402,23 @@ sadb_action_to_ecomb(uint8_t *start, uint8_t *limit, ipsec_action_t *act,
/* ARGSUSED */
int
-sadb_sens_len_from_cred(cred_t *cr)
+sadb_sens_len_from_label(ts_label_t *tsl)
{
int baselen = sizeof (sadb_sens_t) + _C_LEN * 4;
return (roundup(baselen, sizeof (uint64_t)));
}
void
-sadb_sens_from_cred(sadb_sens_t *sens, int exttype, cred_t *cr, int senslen)
+sadb_sens_from_label(sadb_sens_t *sens, int exttype, ts_label_t *tsl,
+ int senslen)
{
uint8_t *bitmap;
bslabel_t *sl;
- ts_label_t *tsl;
/* LINTED */
ASSERT((_C_LEN & 1) == 0);
ASSERT((senslen & 7) == 0);
- tsl = crgetlabel(cr);
sl = label2bslabel(tsl);
sens->sadb_sens_exttype = exttype;
@@ -5830,14 +5436,14 @@ sadb_sens_from_cred(sadb_sens_t *sens, int exttype, cred_t *cr, int senslen)
}
static sadb_sens_t *
-sadb_make_sens_ext(cred_t *cr, int *len)
+sadb_make_sens_ext(ts_label_t *tsl, int *len)
{
/* XXX allocation failure? */
- int sens_len = sadb_sens_len_from_cred(cr);
+ int sens_len = sadb_sens_len_from_label(tsl);
sadb_sens_t *sens = kmem_alloc(sens_len, KM_SLEEP);
- sadb_sens_from_cred(sens, SADB_EXT_SENSITIVITY, cr, sens_len);
+ sadb_sens_from_label(sens, SADB_EXT_SENSITIVITY, tsl, sens_len);
*len = sens_len;
@@ -5849,12 +5455,12 @@ sadb_make_sens_ext(cred_t *cr, int *len)
* With a special designated "not a label" cred_t ?
*/
/* ARGSUSED */
-cred_t *
-sadb_cred_from_sens(sadb_sens_t *sens, uint64_t *bitmap)
+ts_label_t *
+sadb_label_from_sens(sadb_sens_t *sens, uint64_t *bitmap)
{
int bitmap_len = SADB_64TO8(sens->sadb_sens_sens_len);
bslabel_t sl;
- cred_t *cr;
+ ts_label_t *tsl;
if (sens->sadb_sens_integ_level != 0)
return (NULL);
@@ -5868,13 +5474,13 @@ sadb_cred_from_sens(sadb_sens_t *sens, uint64_t *bitmap)
bcopy(bitmap, &((_bslabel_impl_t *)&sl)->compartments,
bitmap_len);
- cr = newcred_from_bslabel(&sl, sens->sadb_sens_dpd, KM_NOSLEEP);
- if (cr == NULL)
- return (cr);
+ tsl = labelalloc(&sl, sens->sadb_sens_dpd, KM_NOSLEEP);
+ if (tsl == NULL)
+ return (NULL);
if (sens->sadb_x_sens_flags & SADB_X_SENS_UNLABELED)
- crgetlabel(cr)->tsl_flags |= TSLF_UNLABELED;
- return (cr);
+ tsl->tsl_flags |= TSLF_UNLABELED;
+ return (tsl);
}
/* End XXX label-library-leakage */
@@ -6359,12 +5965,13 @@ sadb_getspi(keysock_in_t *ksi, uint32_t master_spi, int *diagnostic,
*
* Caller frees the message, so we don't have to here.
*
- * NOTE: The ip_q parameter may be used in the future for ACQUIRE
+ * NOTE: The pfkey_q parameter may be used in the future for ACQUIRE
* failures.
*/
/* ARGSUSED */
void
-sadb_in_acquire(sadb_msg_t *samsg, sadbp_t *sp, queue_t *ip_q, netstack_t *ns)
+sadb_in_acquire(sadb_msg_t *samsg, sadbp_t *sp, queue_t *pfkey_q,
+ netstack_t *ns)
{
int i;
ipsacq_t *acqrec;
@@ -6624,36 +6231,6 @@ sadb_replay_delete(ipsa_t *assoc)
}
/*
- * Given a queue that presumably points to IP, send a T_BIND_REQ for _proto_
- * down. The caller will handle the T_BIND_ACK locally.
- */
-boolean_t
-sadb_t_bind_req(queue_t *q, int proto)
-{
- struct T_bind_req *tbr;
- mblk_t *mp;
-
- mp = allocb_cred(sizeof (struct T_bind_req) + 1, kcred, NOPID);
- if (mp == NULL) {
- /* cmn_err(CE_WARN, */
- /* "sadb_t_bind_req(%d): couldn't allocate mblk\n", proto); */
- return (B_FALSE);
- }
- mp->b_datap->db_type = M_PCPROTO;
- tbr = (struct T_bind_req *)mp->b_rptr;
- mp->b_wptr += sizeof (struct T_bind_req);
- tbr->PRIM_type = T_BIND_REQ;
- tbr->ADDR_length = 0;
- tbr->ADDR_offset = 0;
- tbr->CONIND_number = 0;
- *mp->b_wptr = (uint8_t)proto;
- mp->b_wptr++;
-
- putnext(q, mp);
- return (B_TRUE);
-}
-
-/*
* Special front-end to ipsec_rl_strlog() dealing with SA failure.
* this is designed to take only a format string with "* %x * %s *", so
* that "spi" is printed first, then "addr" is converted using inet_pton().
@@ -6676,7 +6253,6 @@ ipsec_assocfailure(short mid, short sid, char level, ushort_t sl, char *fmt,
/*
* Fills in a reference to the policy, if any, from the conn, in *ppp
- * Releases a reference to the passed conn_t.
*/
static void
ipsec_conn_pol(ipsec_selector_t *sel, conn_t *connp, ipsec_policy_t **ppp)
@@ -6684,15 +6260,14 @@ ipsec_conn_pol(ipsec_selector_t *sel, conn_t *connp, ipsec_policy_t **ppp)
ipsec_policy_t *pp;
ipsec_latch_t *ipl = connp->conn_latch;
- if ((ipl != NULL) && (ipl->ipl_out_policy != NULL)) {
- pp = ipl->ipl_out_policy;
+ if ((ipl != NULL) && (connp->conn_ixa->ixa_ipsec_policy != NULL)) {
+ pp = connp->conn_ixa->ixa_ipsec_policy;
IPPOL_REFHOLD(pp);
} else {
- pp = ipsec_find_policy(IPSEC_TYPE_OUTBOUND, connp, NULL, sel,
+ pp = ipsec_find_policy(IPSEC_TYPE_OUTBOUND, connp, sel,
connp->conn_netstack);
}
*ppp = pp;
- CONN_DEC_REF(connp);
}
/*
@@ -6753,6 +6328,7 @@ ipsec_udp_pol(ipsec_selector_t *sel, ipsec_policy_t **ppp, ip_stack_t *ipst)
mutex_exit(&connfp->connf_lock);
ipsec_conn_pol(sel, connp, ppp);
+ CONN_DEC_REF(connp);
}
static conn_t *
@@ -6866,6 +6442,7 @@ ipsec_tcp_pol(ipsec_selector_t *sel, ipsec_policy_t **ppp, ip_stack_t *ipst)
}
ipsec_conn_pol(sel, connp, ppp);
+ CONN_DEC_REF(connp);
}
static void
@@ -6895,21 +6472,27 @@ ipsec_sctp_pol(ipsec_selector_t *sel, ipsec_policy_t **ppp,
pptr[0] = sel->ips_remote_port;
pptr[1] = sel->ips_local_port;
+ /*
+ * For labeled systems, there's no need to check the
+ * label here. It's known to be good as we checked
+ * before allowing the connection to become bound.
+ */
if (sel->ips_isv4) {
in6_addr_t src, dst;
IN6_IPADDR_TO_V4MAPPED(sel->ips_remote_addr_v4, &dst);
IN6_IPADDR_TO_V4MAPPED(sel->ips_local_addr_v4, &src);
connp = sctp_find_conn(&dst, &src, ports, ALL_ZONES,
- ipst->ips_netstack->netstack_sctp);
+ 0, ipst->ips_netstack->netstack_sctp);
} else {
connp = sctp_find_conn(&sel->ips_remote_addr_v6,
&sel->ips_local_addr_v6, ports, ALL_ZONES,
- ipst->ips_netstack->netstack_sctp);
+ 0, ipst->ips_netstack->netstack_sctp);
}
if (connp == NULL)
return;
ipsec_conn_pol(sel, connp, ppp);
+ CONN_DEC_REF(connp);
}
/*
@@ -6985,7 +6568,7 @@ ipsec_get_inverse_acquire_sel(ipsec_selector_t *sel, sadb_address_t *srcext,
static int
ipsec_tun_pol(ipsec_selector_t *sel, ipsec_policy_t **ppp,
sadb_address_t *innsrcext, sadb_address_t *inndstext, ipsec_tun_pol_t *itp,
- int *diagnostic, netstack_t *ns)
+ int *diagnostic)
{
int err;
ipsec_policy_head_t *polhead;
@@ -7045,8 +6628,7 @@ ipsec_tun_pol(ipsec_selector_t *sel, ipsec_policy_t **ppp,
polhead = itp->itp_policy;
ASSERT(polhead != NULL);
rw_enter(&polhead->iph_lock, RW_READER);
- *ppp = ipsec_find_policy_head(NULL, polhead,
- IPSEC_TYPE_INBOUND, sel, ns);
+ *ppp = ipsec_find_policy_head(NULL, polhead, IPSEC_TYPE_INBOUND, sel);
rw_exit(&polhead->iph_lock);
/*
@@ -7059,6 +6641,10 @@ ipsec_tun_pol(ipsec_selector_t *sel, ipsec_policy_t **ppp,
return (0);
}
+/*
+ * For sctp conn_faddr is the primary address, hence this is of limited
+ * use for sctp.
+ */
static void
ipsec_oth_pol(ipsec_selector_t *sel, ipsec_policy_t **ppp,
ip_stack_t *ipst)
@@ -7068,7 +6654,7 @@ ipsec_oth_pol(ipsec_selector_t *sel, ipsec_policy_t **ppp,
conn_t *connp;
if (isv4) {
- connfp = &ipst->ips_ipcl_proto_fanout[sel->ips_protocol];
+ connfp = &ipst->ips_ipcl_proto_fanout_v4[sel->ips_protocol];
} else {
connfp = &ipst->ips_ipcl_proto_fanout_v6[sel->ips_protocol];
}
@@ -7076,17 +6662,20 @@ ipsec_oth_pol(ipsec_selector_t *sel, ipsec_policy_t **ppp,
mutex_enter(&connfp->connf_lock);
for (connp = connfp->connf_head; connp != NULL;
connp = connp->conn_next) {
- if (!((isv4 && !((connp->conn_src == 0 ||
- connp->conn_src == sel->ips_local_addr_v4) &&
- (connp->conn_rem == 0 ||
- connp->conn_rem == sel->ips_remote_addr_v4))) ||
- (!isv4 && !((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) ||
- IN6_ARE_ADDR_EQUAL(&connp->conn_srcv6,
- &sel->ips_local_addr_v6)) &&
- (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6) ||
- IN6_ARE_ADDR_EQUAL(&connp->conn_remv6,
- &sel->ips_remote_addr_v6)))))) {
- break;
+ if (isv4) {
+ if ((connp->conn_laddr_v4 == INADDR_ANY ||
+ connp->conn_laddr_v4 == sel->ips_local_addr_v4) &&
+ (connp->conn_faddr_v4 == INADDR_ANY ||
+ connp->conn_faddr_v4 == sel->ips_remote_addr_v4))
+ break;
+ } else {
+ if ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
+ IN6_ARE_ADDR_EQUAL(&connp->conn_laddr_v6,
+ &sel->ips_local_addr_v6)) &&
+ (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) ||
+ IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6,
+ &sel->ips_remote_addr_v6)))
+ break;
}
}
if (connp == NULL) {
@@ -7098,6 +6687,7 @@ ipsec_oth_pol(ipsec_selector_t *sel, ipsec_policy_t **ppp,
mutex_exit(&connfp->connf_lock);
ipsec_conn_pol(sel, connp, ppp);
+ CONN_DEC_REF(connp);
}
/*
@@ -7245,7 +6835,7 @@ ipsec_construct_inverse_acquire(sadb_msg_t *samsg, sadb_ext_t *extv[],
isel.ips_isv4 = (sel.ips_protocol == IPPROTO_ENCAP);
} /* Else isel is initialized by ipsec_tun_pol(). */
err = ipsec_tun_pol(&isel, &pp, innsrcext, inndstext, itp,
- &diagnostic, ns);
+ &diagnostic);
/*
* NOTE: isel isn't used for now, but in RFC 430x IPsec, it
* may be.
@@ -7263,8 +6853,7 @@ ipsec_construct_inverse_acquire(sadb_msg_t *samsg, sadb_ext_t *extv[],
* look in the global policy.
*/
if (pp == NULL) {
- pp = ipsec_find_policy(IPSEC_TYPE_OUTBOUND, NULL, NULL, &sel,
- ns);
+ pp = ipsec_find_policy(IPSEC_TYPE_OUTBOUND, NULL, &sel, ns);
if (pp == NULL) {
/* There's no global policy. */
err = ENOENT;
@@ -7282,7 +6871,7 @@ ipsec_construct_inverse_acquire(sadb_msg_t *samsg, sadb_ext_t *extv[],
(itp != NULL && (itp->itp_flags & ITPF_P_TUNNEL)),
samsg->sadb_msg_seq, samsg->sadb_msg_pid, sens, ns);
if (pp != NULL) {
- IPPOL_REFRELE(pp, ns);
+ IPPOL_REFRELE(pp);
}
ASSERT(err == 0 && diagnostic == 0);
if (retmp == NULL)
@@ -7306,37 +6895,49 @@ bail:
/*
* sadb_set_lpkt: Return TRUE if we can swap in a value to ipsa->ipsa_lpkt and
* freemsg the previous value. Return FALSE if we lost the race and the SA is
- * in a non-LARVAL state. free clue: ip_drop_packet(NULL) is safe.
+ * in a non-LARVAL state. We also return FALSE if we can't allocate the attrmp.
*/
boolean_t
-sadb_set_lpkt(ipsa_t *ipsa, mblk_t *npkt, netstack_t *ns)
+sadb_set_lpkt(ipsa_t *ipsa, mblk_t *npkt, ip_recv_attr_t *ira)
{
- mblk_t *opkt;
+ mblk_t *opkt;
+ netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack;
ipsec_stack_t *ipss = ns->netstack_ipsec;
boolean_t is_larval;
- /*
- * Check the packet's netstack id in case we go asynch with a
- * taskq_dispatch.
- */
- ASSERT(((ipsec_in_t *)npkt->b_rptr)->ipsec_in_type == IPSEC_IN);
- ASSERT(((ipsec_in_t *)npkt->b_rptr)->ipsec_in_stackid ==
- ns->netstack_stackid);
-
mutex_enter(&ipsa->ipsa_lock);
is_larval = (ipsa->ipsa_state == IPSA_STATE_LARVAL);
if (is_larval) {
- opkt = ipsa->ipsa_lpkt;
- ipsa->ipsa_lpkt = npkt;
+ mblk_t *attrmp;
+
+ attrmp = ip_recv_attr_to_mblk(ira);
+ if (attrmp == NULL) {
+ ill_t *ill = ira->ira_ill;
+
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards", npkt, ill);
+ freemsg(npkt);
+ opkt = NULL;
+ is_larval = B_FALSE;
+ } else {
+ ASSERT(attrmp->b_cont == NULL);
+ attrmp->b_cont = npkt;
+ npkt = attrmp;
+ opkt = ipsa->ipsa_lpkt;
+ ipsa->ipsa_lpkt = npkt;
+ }
} else {
/* We lost the race. */
opkt = NULL;
}
mutex_exit(&ipsa->ipsa_lock);
- ip_drop_packet(opkt, B_TRUE, NULL, NULL,
- DROPPER(ipss, ipds_sadb_inlarval_replace),
- &ipss->ipsec_sadb_dropper);
+ if (opkt != NULL) {
+ opkt = ip_recv_attr_free_mblk(opkt);
+ ip_drop_packet(opkt, B_TRUE, ira->ira_ill,
+ DROPPER(ipss, ipds_sadb_inlarval_replace),
+ &ipss->ipsec_sadb_dropper);
+ }
return (is_larval);
}
@@ -7353,7 +6954,6 @@ sadb_clear_lpkt(ipsa_t *ipsa)
opkt = ipsa->ipsa_lpkt;
ipsa->ipsa_lpkt = NULL;
mutex_exit(&ipsa->ipsa_lock);
-
return (opkt);
}
@@ -7361,18 +6961,18 @@ sadb_clear_lpkt(ipsa_t *ipsa)
* Buffer a packet that's in IDLE state as set by Solaris Clustering.
*/
void
-sadb_buf_pkt(ipsa_t *ipsa, mblk_t *bpkt, netstack_t *ns)
+sadb_buf_pkt(ipsa_t *ipsa, mblk_t *bpkt, ip_recv_attr_t *ira)
{
+ netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack;
ipsec_stack_t *ipss = ns->netstack_ipsec;
- extern void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t,
- sa_family_t, in6_addr_t, in6_addr_t, void *);
in6_addr_t *srcaddr = (in6_addr_t *)(&ipsa->ipsa_srcaddr);
in6_addr_t *dstaddr = (in6_addr_t *)(&ipsa->ipsa_dstaddr);
+ mblk_t *mp;
ASSERT(ipsa->ipsa_state == IPSA_STATE_IDLE);
if (cl_inet_idlesa == NULL) {
- ip_drop_packet(bpkt, B_TRUE, NULL, NULL,
+ ip_drop_packet(bpkt, B_TRUE, ira->ira_ill,
DROPPER(ipss, ipds_sadb_inidle_overflow),
&ipss->ipsec_sadb_dropper);
return;
@@ -7382,13 +6982,14 @@ sadb_buf_pkt(ipsa_t *ipsa, mblk_t *bpkt, netstack_t *ns)
(ipsa->ipsa_type == SADB_SATYPE_AH) ? IPPROTO_AH : IPPROTO_ESP,
ipsa->ipsa_spi, ipsa->ipsa_addrfam, *srcaddr, *dstaddr, NULL);
- /*
- * Check the packet's netstack id in case we go asynch with a
- * taskq_dispatch.
- */
- ASSERT(((ipsec_in_t *)bpkt->b_rptr)->ipsec_in_type == IPSEC_IN);
- ASSERT(((ipsec_in_t *)bpkt->b_rptr)->ipsec_in_stackid ==
- ns->netstack_stackid);
+ mp = ip_recv_attr_to_mblk(ira);
+ if (mp == NULL) {
+ ip_drop_packet(bpkt, B_TRUE, ira->ira_ill,
+ DROPPER(ipss, ipds_sadb_inidle_overflow),
+ &ipss->ipsec_sadb_dropper);
+ return;
+ }
+ linkb(mp, bpkt);
mutex_enter(&ipsa->ipsa_lock);
ipsa->ipsa_mblkcnt++;
@@ -7399,16 +7000,17 @@ sadb_buf_pkt(ipsa_t *ipsa, mblk_t *bpkt, netstack_t *ns)
ipsa->ipsa_bpkt_tail = bpkt;
if (ipsa->ipsa_mblkcnt > SADB_MAX_IDLEPKTS) {
mblk_t *tmp;
+
tmp = ipsa->ipsa_bpkt_head;
ipsa->ipsa_bpkt_head = ipsa->ipsa_bpkt_head->b_next;
- ip_drop_packet(tmp, B_TRUE, NULL, NULL,
+ tmp = ip_recv_attr_free_mblk(tmp);
+ ip_drop_packet(tmp, B_TRUE, NULL,
DROPPER(ipss, ipds_sadb_inidle_overflow),
&ipss->ipsec_sadb_dropper);
ipsa->ipsa_mblkcnt --;
}
}
mutex_exit(&ipsa->ipsa_lock);
-
}
/*
@@ -7419,30 +7021,28 @@ void
sadb_clear_buf_pkt(void *ipkt)
{
mblk_t *tmp, *buf_pkt;
- netstack_t *ns;
- ipsec_in_t *ii;
+ ip_recv_attr_t iras;
buf_pkt = (mblk_t *)ipkt;
- ii = (ipsec_in_t *)buf_pkt->b_rptr;
- ASSERT(ii->ipsec_in_type == IPSEC_IN);
- ns = netstack_find_by_stackid(ii->ipsec_in_stackid);
- if (ns != NULL && ns != ii->ipsec_in_ns) {
- netstack_rele(ns);
- ns = NULL; /* For while-loop below. */
- }
-
while (buf_pkt != NULL) {
+ mblk_t *data_mp;
+
tmp = buf_pkt->b_next;
buf_pkt->b_next = NULL;
- if (ns != NULL)
- ip_fanout_proto_again(buf_pkt, NULL, NULL, NULL);
- else
- freemsg(buf_pkt);
+
+ data_mp = buf_pkt->b_cont;
+ buf_pkt->b_cont = NULL;
+ if (!ip_recv_attr_from_mblk(buf_pkt, &iras)) {
+ /* The ill or ip_stack_t disappeared on us. */
+ ip_drop_input("ip_recv_attr_from_mblk", data_mp, NULL);
+ freemsg(data_mp);
+ } else {
+ ip_input_post_ipsec(data_mp, &iras);
+ }
+ ira_cleanup(&iras, B_TRUE);
buf_pkt = tmp;
}
- if (ns != NULL)
- netstack_rele(ns);
}
/*
* Walker callback used by sadb_alg_update() to free/create crypto
@@ -7454,6 +7054,8 @@ struct sadb_update_alg_state {
ipsec_algtype_t alg_type;
uint8_t alg_id;
boolean_t is_added;
+ boolean_t async_auth;
+ boolean_t async_encr;
};
static void
@@ -7470,6 +7072,15 @@ sadb_alg_update_cb(isaf_t *head, ipsa_t *entry, void *cookie)
mutex_enter(&entry->ipsa_lock);
+ if ((entry->ipsa_encr_alg != SADB_EALG_NONE && entry->ipsa_encr_alg !=
+ SADB_EALG_NULL && update_state->async_encr) ||
+ (entry->ipsa_auth_alg != SADB_AALG_NONE &&
+ update_state->async_auth)) {
+ entry->ipsa_flags |= IPSA_F_ASYNC;
+ } else {
+ entry->ipsa_flags &= ~IPSA_F_ASYNC;
+ }
+
switch (update_state->alg_type) {
case IPSEC_ALG_AUTH:
if (entry->ipsa_auth_alg == update_state->alg_id)
@@ -7511,8 +7122,11 @@ sadb_alg_update_cb(isaf_t *head, ipsa_t *entry, void *cookie)
}
/*
- * Invoked by IP when an software crypto provider has been updated.
- * The type and id of the corresponding algorithm is passed as argument.
+ * Invoked by IP when an software crypto provider has been updated, or if
+ * the crypto synchrony changes. The type and id of the corresponding
+ * algorithm is passed as argument. The type is set to ALL in the case of
+ * a synchrony change.
+ *
* is_added is B_TRUE if the provider was added, B_FALSE if it was
* removed. The function updates the SADB and free/creates the
* context templates associated with SAs if needed.
@@ -7529,12 +7143,17 @@ sadb_alg_update(ipsec_algtype_t alg_type, uint8_t alg_id, boolean_t is_added,
struct sadb_update_alg_state update_state;
ipsecah_stack_t *ahstack = ns->netstack_ipsecah;
ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
+ ipsec_stack_t *ipss = ns->netstack_ipsec;
update_state.alg_type = alg_type;
update_state.alg_id = alg_id;
update_state.is_added = is_added;
+ update_state.async_auth = ipss->ipsec_algs_exec_mode[IPSEC_ALG_AUTH] ==
+ IPSEC_ALGS_EXEC_ASYNC;
+ update_state.async_encr = ipss->ipsec_algs_exec_mode[IPSEC_ALG_ENCR] ==
+ IPSEC_ALGS_EXEC_ASYNC;
- if (alg_type == IPSEC_ALG_AUTH) {
+ if (alg_type == IPSEC_ALG_AUTH || alg_type == IPSEC_ALG_ALL) {
/* walk the AH tables only for auth. algorithm changes */
SADB_ALG_UPDATE_WALK(ahstack->ah_sadb.s_v4, sdb_of);
SADB_ALG_UPDATE_WALK(ahstack->ah_sadb.s_v4, sdb_if);
@@ -7693,15 +7312,15 @@ ipsec_check_key(crypto_mech_type_t mech_type, sadb_key_t *sadb_key,
*
* This is inelegant and really could use refactoring.
*/
-int
-sadb_whack_label(mblk_t **mpp, ipsa_t *assoc)
+mblk_t *
+sadb_whack_label_v4(mblk_t *mp, ipsa_t *assoc, kstat_named_t *counter,
+ ipdropper_t *dropper)
{
int delta;
int plen;
dblk_t *db;
int hlen;
uint8_t *opt_storage = assoc->ipsa_opt_storage;
- mblk_t *mp = *mpp;
ipha_t *ipha = (ipha_t *)mp->b_rptr;
plen = ntohs(ipha->ipha_length);
@@ -7731,8 +7350,10 @@ sadb_whack_label(mblk_t **mpp, ipsa_t *assoc)
new_mp = allocb_tmpl(hlen + copylen +
(mp->b_rptr - mp->b_datap->db_base), mp);
- if (new_mp == NULL)
- return (ENOMEM);
+ if (new_mp == NULL) {
+ ip_drop_packet(mp, B_FALSE, NULL, counter, dropper);
+ return (NULL);
+ }
/* keep the bias */
new_mp->b_rptr += mp->b_rptr - mp->b_datap->db_base;
@@ -7743,7 +7364,7 @@ sadb_whack_label(mblk_t **mpp, ipsa_t *assoc)
new_mp->b_cont = mp->b_cont;
freeb(mp);
}
- *mpp = mp = new_mp;
+ mp = new_mp;
ipha = (ipha_t *)mp->b_rptr;
}
@@ -7768,11 +7389,12 @@ sadb_whack_label(mblk_t **mpp, ipsa_t *assoc)
ipha->ipha_length = htons(plen);
- return (0);
+ return (mp);
}
-int
-sadb_whack_label_v6(mblk_t **mpp, ipsa_t *assoc)
+mblk_t *
+sadb_whack_label_v6(mblk_t *mp, ipsa_t *assoc, kstat_named_t *counter,
+ ipdropper_t *dropper)
{
int delta;
int plen;
@@ -7780,7 +7402,6 @@ sadb_whack_label_v6(mblk_t **mpp, ipsa_t *assoc)
int hlen;
uint8_t *opt_storage = assoc->ipsa_opt_storage;
uint_t sec_opt_len; /* label option length not including type, len */
- mblk_t *mp = *mpp;
ip6_t *ip6h = (ip6_t *)mp->b_rptr;
plen = ntohs(ip6h->ip6_plen);
@@ -7818,8 +7439,10 @@ sadb_whack_label_v6(mblk_t **mpp, ipsa_t *assoc)
copylen = hdr_len;
new_mp = allocb_tmpl(hlen + copylen +
(mp->b_rptr - mp->b_datap->db_base), mp);
- if (new_mp == NULL)
- return (ENOMEM);
+ if (new_mp == NULL) {
+ ip_drop_packet(mp, B_FALSE, NULL, counter, dropper);
+ return (NULL);
+ }
/* keep the bias */
new_mp->b_rptr += mp->b_rptr - mp->b_datap->db_base;
@@ -7830,7 +7453,7 @@ sadb_whack_label_v6(mblk_t **mpp, ipsa_t *assoc)
new_mp->b_cont = mp->b_cont;
freeb(mp);
}
- *mpp = mp = new_mp;
+ mp = new_mp;
ip6h = (ip6_t *)mp->b_rptr;
}
@@ -7856,10 +7479,46 @@ sadb_whack_label_v6(mblk_t **mpp, ipsa_t *assoc)
ip6h->ip6_plen = htons(plen);
- return (0);
+ return (mp);
}
+/* Whack the labels and update ip_xmit_attr_t as needed */
+mblk_t *
+sadb_whack_label(mblk_t *mp, ipsa_t *assoc, ip_xmit_attr_t *ixa,
+ kstat_named_t *counter, ipdropper_t *dropper)
+{
+ int adjust;
+ int iplen;
+ if (ixa->ixa_flags & IXAF_IS_IPV4) {
+ ipha_t *ipha = (ipha_t *)mp->b_rptr;
+
+ ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);
+ iplen = ntohs(ipha->ipha_length);
+ mp = sadb_whack_label_v4(mp, assoc, counter, dropper);
+ if (mp == NULL)
+ return (NULL);
+
+ ipha = (ipha_t *)mp->b_rptr;
+ ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);
+ adjust = (int)ntohs(ipha->ipha_length) - iplen;
+ } else {
+ ip6_t *ip6h = (ip6_t *)mp->b_rptr;
+
+ ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION);
+ iplen = ntohs(ip6h->ip6_plen);
+ mp = sadb_whack_label_v6(mp, assoc, counter, dropper);
+ if (mp == NULL)
+ return (NULL);
+
+ ip6h = (ip6_t *)mp->b_rptr;
+ ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION);
+ adjust = (int)ntohs(ip6h->ip6_plen) - iplen;
+ }
+ ixa->ixa_pktlen += adjust;
+ ixa->ixa_ip_hdr_length += adjust;
+ return (mp);
+}
/*
* If this is an outgoing SA then add some fuzz to the
@@ -7969,7 +7628,7 @@ age_pair_peer_list(templist_t *haspeerlist, sadb_t *sp, boolean_t outbound)
*((ipaddr_t *)&dying->
ipsa_srcaddr));
}
- bucket = &(sp->sdb_of[outhash]);
+ bucket = &(sp->sdb_of[outhash]);
}
mutex_enter(&bucket->isaf_lock);
diff --git a/usr/src/uts/common/inet/ip/spd.c b/usr/src/uts/common/inet/ip/spd.c
index 37a9f47432..e6903cefc2 100644
--- a/usr/src/uts/common/inet/ip/spd.c
+++ b/usr/src/uts/common/inet/ip/spd.c
@@ -37,6 +37,7 @@
#include <sys/strsubr.h>
#include <sys/strsun.h>
#include <sys/strlog.h>
+#include <sys/strsun.h>
#include <sys/cmn_err.h>
#include <sys/zone.h>
@@ -59,7 +60,6 @@
#include <net/pfkeyv2.h>
#include <net/pfpolicy.h>
-#include <inet/ipsec_info.h>
#include <inet/sadb.h>
#include <inet/ipsec_impl.h>
@@ -75,16 +75,8 @@
static void ipsec_update_present_flags(ipsec_stack_t *);
static ipsec_act_t *ipsec_act_wildcard_expand(ipsec_act_t *, uint_t *,
netstack_t *);
-static void ipsec_out_free(void *);
-static void ipsec_in_free(void *);
-static mblk_t *ipsec_attach_global_policy(mblk_t **, conn_t *,
- ipsec_selector_t *, netstack_t *);
-static mblk_t *ipsec_apply_global_policy(mblk_t *, conn_t *,
- ipsec_selector_t *, netstack_t *);
static mblk_t *ipsec_check_ipsecin_policy(mblk_t *, ipsec_policy_t *,
- ipha_t *, ip6_t *, uint64_t, netstack_t *);
-static void ipsec_in_release_refs(ipsec_in_t *);
-static void ipsec_out_release_refs(ipsec_out_t *);
+ ipha_t *, ip6_t *, uint64_t, ip_recv_attr_t *, netstack_t *);
static void ipsec_action_free_table(ipsec_action_t *);
static void ipsec_action_reclaim(void *);
static void ipsec_action_reclaim_stack(netstack_t *);
@@ -105,9 +97,9 @@ typedef enum { SELRET_NOMEM, SELRET_BADPKT, SELRET_SUCCESS, SELRET_TUNFRAG}
static selret_t ipsec_init_inbound_sel(ipsec_selector_t *, mblk_t *,
ipha_t *, ip6_t *, uint8_t);
-static boolean_t ipsec_check_ipsecin_action(struct ipsec_in_s *, mblk_t *,
+static boolean_t ipsec_check_ipsecin_action(ip_recv_attr_t *, mblk_t *,
struct ipsec_action_s *, ipha_t *ipha, ip6_t *ip6h, const char **,
- kstat_named_t **);
+ kstat_named_t **, netstack_t *);
static void ipsec_unregister_prov_update(void);
static void ipsec_prov_update_callback_stack(uint32_t, void *, netstack_t *);
static boolean_t ipsec_compare_action(ipsec_policy_t *, ipsec_policy_t *);
@@ -117,15 +109,13 @@ static void ipsec_kstat_destroy(ipsec_stack_t *);
static int ipsec_free_tables(ipsec_stack_t *);
static int tunnel_compare(const void *, const void *);
static void ipsec_freemsg_chain(mblk_t *);
-static void ip_drop_packet_chain(mblk_t *, boolean_t, ill_t *, ire_t *,
+static void ip_drop_packet_chain(mblk_t *, boolean_t, ill_t *,
struct kstat_named *, ipdropper_t *);
static boolean_t ipsec_kstat_init(ipsec_stack_t *);
static void ipsec_kstat_destroy(ipsec_stack_t *);
static int ipsec_free_tables(ipsec_stack_t *);
static int tunnel_compare(const void *, const void *);
static void ipsec_freemsg_chain(mblk_t *);
-static void ip_drop_packet_chain(mblk_t *, boolean_t, ill_t *, ire_t *,
- struct kstat_named *, ipdropper_t *);
/*
* Selector hash table is statically sized at module load time.
@@ -150,16 +140,15 @@ static crypto_notify_handle_t prov_update_handle = NULL;
static kmem_cache_t *ipsec_action_cache;
static kmem_cache_t *ipsec_sel_cache;
static kmem_cache_t *ipsec_pol_cache;
-static kmem_cache_t *ipsec_info_cache;
/* Frag cache prototypes */
-static void ipsec_fragcache_clean(ipsec_fragcache_t *);
+static void ipsec_fragcache_clean(ipsec_fragcache_t *, ipsec_stack_t *);
static ipsec_fragcache_entry_t *fragcache_delentry(int,
- ipsec_fragcache_entry_t *, ipsec_fragcache_t *);
+ ipsec_fragcache_entry_t *, ipsec_fragcache_t *, ipsec_stack_t *);
boolean_t ipsec_fragcache_init(ipsec_fragcache_t *);
-void ipsec_fragcache_uninit(ipsec_fragcache_t *);
-mblk_t *ipsec_fragcache_add(ipsec_fragcache_t *, mblk_t *, mblk_t *, int,
- ipsec_stack_t *);
+void ipsec_fragcache_uninit(ipsec_fragcache_t *, ipsec_stack_t *ipss);
+mblk_t *ipsec_fragcache_add(ipsec_fragcache_t *, mblk_t *, mblk_t *,
+ int, ipsec_stack_t *);
int ipsec_hdr_pullup_needed = 0;
int ipsec_weird_null_inbound_policy = 0;
@@ -240,23 +229,28 @@ ipsec_freemsg_chain(mblk_t *mp)
ASSERT(mp->b_prev == NULL);
mpnext = mp->b_next;
mp->b_next = NULL;
- freemsg(mp); /* Always works, even if NULL */
+ freemsg(mp);
mp = mpnext;
}
}
-/* ip_drop all messages in an mblk chain */
+/*
+ * ip_drop all messages in an mblk chain
+ * Can handle a b_next chain of ip_recv_attr_t mblks, or just a b_next chain
+ * of data.
+ */
static void
-ip_drop_packet_chain(mblk_t *mp, boolean_t inbound, ill_t *arriving,
- ire_t *outbound_ire, struct kstat_named *counter, ipdropper_t *who_called)
+ip_drop_packet_chain(mblk_t *mp, boolean_t inbound, ill_t *ill,
+ struct kstat_named *counter, ipdropper_t *who_called)
{
mblk_t *mpnext;
while (mp != NULL) {
ASSERT(mp->b_prev == NULL);
mpnext = mp->b_next;
mp->b_next = NULL;
- ip_drop_packet(mp, inbound, arriving, outbound_ire, counter,
- who_called);
+ if (ip_recv_attr_is_mblk(mp))
+ mp = ip_recv_attr_free_mblk(mp);
+ ip_drop_packet(mp, inbound, ill, counter, who_called);
mp = mpnext;
}
}
@@ -287,7 +281,7 @@ ipsec_policy_cmpbyid(const void *a, const void *b)
* ipsl_sel (selector set), so an entry with a NULL ipsp_sel is not
* actually in-tree but rather a template node being used in
* an avl_find query; see ipsec_policy_delete(). This gives us
- * a placeholder in the ordering just before the the first entry with
+ * a placeholder in the ordering just before the first entry with
* a key >= the one we're looking for, so we can walk forward from
* that point to get the remaining entries with the same id.
*/
@@ -443,7 +437,6 @@ ipsec_policy_g_destroy(void)
kmem_cache_destroy(ipsec_action_cache);
kmem_cache_destroy(ipsec_sel_cache);
kmem_cache_destroy(ipsec_pol_cache);
- kmem_cache_destroy(ipsec_info_cache);
ipsec_unregister_prov_update();
@@ -693,9 +686,6 @@ ipsec_policy_g_init(void)
ipsec_pol_cache = kmem_cache_create("ipsec_policy",
sizeof (ipsec_policy_t), _POINTER_ALIGNMENT, NULL, NULL,
NULL, NULL, NULL, 0);
- ipsec_info_cache = kmem_cache_create("ipsec_info",
- sizeof (ipsec_info_t), _POINTER_ALIGNMENT, NULL, NULL,
- NULL, NULL, NULL, 0);
/*
* We want to be informed each time a stack is created or
@@ -920,6 +910,7 @@ ipsec_copy_policy(const ipsec_policy_t *src)
src->ipsp_sel->ipsl_refs++;
HASH_NULL(dst, ipsp_hash);
+ dst->ipsp_netstack = src->ipsp_netstack;
dst->ipsp_refs = 1;
dst->ipsp_sel = src->ipsp_sel;
dst->ipsp_act = src->ipsp_act;
@@ -1469,7 +1460,7 @@ ipsec_req_from_conn(conn_t *connp, ipsec_req_t *req, int af)
bzero(req, sizeof (*req));
- mutex_enter(&connp->conn_lock);
+ ASSERT(MUTEX_HELD(&connp->conn_lock));
ipl = connp->conn_latch;
/*
@@ -1478,20 +1469,20 @@ ipsec_req_from_conn(conn_t *connp, ipsec_req_t *req, int af)
* look at configured policy.
*/
if (ipl != NULL) {
- if (ipl->ipl_in_action != NULL) {
- rv = ipsec_req_from_act(ipl->ipl_in_action, req);
+ if (connp->conn_latch_in_action != NULL) {
+ rv = ipsec_req_from_act(connp->conn_latch_in_action,
+ req);
goto done;
}
- if (ipl->ipl_in_policy != NULL) {
- rv = ipsec_req_from_act(ipl->ipl_in_policy->ipsp_act,
- req);
+ if (connp->conn_latch_in_policy != NULL) {
+ rv = ipsec_req_from_act(
+ connp->conn_latch_in_policy->ipsp_act, req);
goto done;
}
}
if (connp->conn_policy != NULL)
rv = ipsec_req_from_head(connp->conn_policy, req, af);
done:
- mutex_exit(&connp->conn_lock);
return (rv);
}
@@ -1502,66 +1493,18 @@ ipsec_actvec_free(ipsec_act_t *act, uint_t nact)
}
/*
- * When outbound policy is not cached, look it up the hard way and attach
- * an ipsec_out_t to the packet..
- */
-static mblk_t *
-ipsec_attach_global_policy(mblk_t **mp, conn_t *connp, ipsec_selector_t *sel,
- netstack_t *ns)
-{
- ipsec_policy_t *p;
-
- p = ipsec_find_policy(IPSEC_TYPE_OUTBOUND, connp, NULL, sel, ns);
-
- if (p == NULL)
- return (NULL);
- return (ipsec_attach_ipsec_out(mp, connp, p, sel->ips_protocol, ns));
-}
-
-/*
- * We have an ipsec_out already, but don't have cached policy; fill it in
- * with the right actions.
- */
-static mblk_t *
-ipsec_apply_global_policy(mblk_t *ipsec_mp, conn_t *connp,
- ipsec_selector_t *sel, netstack_t *ns)
-{
- ipsec_out_t *io;
- ipsec_policy_t *p;
-
- ASSERT(ipsec_mp->b_datap->db_type == M_CTL);
- ASSERT(ipsec_mp->b_cont->b_datap->db_type == M_DATA);
-
- io = (ipsec_out_t *)ipsec_mp->b_rptr;
-
- if (io->ipsec_out_policy == NULL) {
- p = ipsec_find_policy(IPSEC_TYPE_OUTBOUND, connp, io, sel, ns);
- io->ipsec_out_policy = p;
- }
- return (ipsec_mp);
-}
-
-
-/*
* Consumes a reference to ipsp.
*/
static mblk_t *
-ipsec_check_loopback_policy(mblk_t *first_mp, boolean_t mctl_present,
+ipsec_check_loopback_policy(mblk_t *data_mp, ip_recv_attr_t *ira,
ipsec_policy_t *ipsp)
{
- mblk_t *ipsec_mp;
- ipsec_in_t *ii;
- netstack_t *ns;
-
- if (!mctl_present)
- return (first_mp);
+ if (!(ira->ira_flags & IRAF_IPSEC_SECURE))
+ return (data_mp);
- ipsec_mp = first_mp;
+ ASSERT(ira->ira_flags & IRAF_LOOPBACK);
- ii = (ipsec_in_t *)ipsec_mp->b_rptr;
- ns = ii->ipsec_in_ns;
- ASSERT(ii->ipsec_in_loopback);
- IPPOL_REFRELE(ipsp, ns);
+ IPPOL_REFRELE(ipsp);
/*
* We should do an actual policy check here. Revisit this
@@ -1569,7 +1512,7 @@ ipsec_check_loopback_policy(mblk_t *first_mp, boolean_t mctl_present,
* get there.)
*/
- return (first_mp);
+ return (data_mp);
}
/*
@@ -1577,20 +1520,19 @@ ipsec_check_loopback_policy(mblk_t *first_mp, boolean_t mctl_present,
* expected by the SAs it traversed on the way in.
*/
static boolean_t
-ipsec_check_ipsecin_unique(ipsec_in_t *ii, const char **reason,
- kstat_named_t **counter, uint64_t pkt_unique)
+ipsec_check_ipsecin_unique(ip_recv_attr_t *ira, const char **reason,
+ kstat_named_t **counter, uint64_t pkt_unique, netstack_t *ns)
{
uint64_t ah_mask, esp_mask;
ipsa_t *ah_assoc;
ipsa_t *esp_assoc;
- netstack_t *ns = ii->ipsec_in_ns;
ipsec_stack_t *ipss = ns->netstack_ipsec;
- ASSERT(ii->ipsec_in_secure);
- ASSERT(!ii->ipsec_in_loopback);
+ ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
+ ASSERT(!(ira->ira_flags & IRAF_LOOPBACK));
- ah_assoc = ii->ipsec_in_ah_sa;
- esp_assoc = ii->ipsec_in_esp_sa;
+ ah_assoc = ira->ira_ipsec_ah_sa;
+ esp_assoc = ira->ira_ipsec_esp_sa;
ASSERT((ah_assoc != NULL) || (esp_assoc != NULL));
ah_mask = (ah_assoc != NULL) ? ah_assoc->ipsa_unique_mask : 0;
@@ -1621,30 +1563,30 @@ ipsec_check_ipsecin_unique(ipsec_in_t *ii, const char **reason,
}
static boolean_t
-ipsec_check_ipsecin_action(ipsec_in_t *ii, mblk_t *mp, ipsec_action_t *ap,
- ipha_t *ipha, ip6_t *ip6h, const char **reason, kstat_named_t **counter)
+ipsec_check_ipsecin_action(ip_recv_attr_t *ira, mblk_t *mp, ipsec_action_t *ap,
+ ipha_t *ipha, ip6_t *ip6h, const char **reason, kstat_named_t **counter,
+ netstack_t *ns)
{
boolean_t ret = B_TRUE;
ipsec_prot_t *ipp;
ipsa_t *ah_assoc;
ipsa_t *esp_assoc;
boolean_t decaps;
- netstack_t *ns = ii->ipsec_in_ns;
ipsec_stack_t *ipss = ns->netstack_ipsec;
ASSERT((ipha == NULL && ip6h != NULL) ||
(ip6h == NULL && ipha != NULL));
- if (ii->ipsec_in_loopback) {
+ if (ira->ira_flags & IRAF_LOOPBACK) {
/*
* Besides accepting pointer-equivalent actions, we also
* accept any ICMP errors we generated for ourselves,
* regardless of policy. If we do not wish to make this
* assumption in the future, check here, and where
- * icmp_loopback is initialized in ip.c and ip6.c. (Look for
- * ipsec_out_icmp_loopback.)
+ * IXAF_TRUSTED_ICMP is initialized in ip.c and ip6.c.
*/
- if (ap == ii->ipsec_in_action || ii->ipsec_in_icmp_loopback)
+ if (ap == ira->ira_ipsec_action ||
+ (ira->ira_flags & IRAF_TRUSTED_ICMP))
return (B_TRUE);
/* Deep compare necessary here?? */
@@ -1652,12 +1594,13 @@ ipsec_check_ipsecin_action(ipsec_in_t *ii, mblk_t *mp, ipsec_action_t *ap,
*reason = "loopback policy mismatch";
return (B_FALSE);
}
- ASSERT(!ii->ipsec_in_icmp_loopback);
+ ASSERT(!(ira->ira_flags & IRAF_TRUSTED_ICMP));
+ ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
- ah_assoc = ii->ipsec_in_ah_sa;
- esp_assoc = ii->ipsec_in_esp_sa;
+ ah_assoc = ira->ira_ipsec_ah_sa;
+ esp_assoc = ira->ira_ipsec_esp_sa;
- decaps = ii->ipsec_in_decaps;
+ decaps = (ira->ira_flags & IRAF_IPSEC_DECAPS);
switch (ap->ipa_act.ipa_type) {
case IPSEC_ACT_DISCARD:
@@ -1744,10 +1687,10 @@ ipsec_check_ipsecin_action(ipsec_in_t *ii, mblk_t *mp, ipsec_action_t *ap,
}
}
} else if (esp_assoc != NULL) {
- /*
- * Don't allow this. Check IPSEC NOTE above
- * ip_fanout_proto().
- */
+ /*
+ * Don't allow this. Check IPSEC NOTE above
+ * ip_fanout_proto().
+ */
*counter = DROPPER(ipss, ipds_spd_got_esp);
*reason = "unexpected ESP";
ret = B_FALSE;
@@ -1777,17 +1720,18 @@ ipsec_check_ipsecin_action(ipsec_in_t *ii, mblk_t *mp, ipsec_action_t *ap,
ret = B_FALSE;
break;
}
- if (ii->ipsec_in_action != NULL) {
+ if (ira->ira_ipsec_action != NULL) {
/*
* This can happen if we do a double policy-check on
* a packet
* XXX XXX should fix this case!
*/
- IPACT_REFRELE(ii->ipsec_in_action);
+ IPACT_REFRELE(ira->ira_ipsec_action);
}
- ASSERT(ii->ipsec_in_action == NULL);
+ ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
+ ASSERT(ira->ira_ipsec_action == NULL);
IPACT_REFHOLD(ap);
- ii->ipsec_in_action = ap;
+ ira->ira_ipsec_action = ap;
break; /* from switch */
}
return (ret);
@@ -1818,9 +1762,9 @@ static uint64_t
conn_to_unique(conn_t *connp, mblk_t *data_mp, ipha_t *ipha, ip6_t *ip6h)
{
ipsec_selector_t sel;
- uint8_t ulp = connp->conn_ulp;
+ uint8_t ulp = connp->conn_proto;
- ASSERT(connp->conn_latch->ipl_in_policy != NULL);
+ ASSERT(connp->conn_latch_in_policy != NULL);
if ((ulp == IPPROTO_TCP || ulp == IPPROTO_UDP || ulp == IPPROTO_SCTP) &&
(connp->conn_fport == 0 || connp->conn_lport == 0)) {
@@ -1839,46 +1783,51 @@ conn_to_unique(conn_t *connp, mblk_t *data_mp, ipha_t *ipha, ip6_t *ip6h)
SELRET_SUCCESS) {
ASSERT(sel.ips_local_port == connp->conn_lport);
ASSERT(sel.ips_remote_port == connp->conn_fport);
- ASSERT(sel.ips_protocol == connp->conn_ulp);
+ ASSERT(sel.ips_protocol == connp->conn_proto);
}
- ASSERT(connp->conn_ulp != 0);
+ ASSERT(connp->conn_proto != 0);
#endif
return (SA_UNIQUE_ID(connp->conn_fport, connp->conn_lport, ulp, 0));
}
/*
- * Called to check policy on a latched connection, both from this file
- * and from tcp.c
+ * Called to check policy on a latched connection.
+ * Note that we don't dereference conn_latch or conn_ihere since the conn might
+ * be closing. The caller passes a held ipsec_latch_t instead.
*/
-boolean_t
-ipsec_check_ipsecin_latch(ipsec_in_t *ii, mblk_t *mp, ipsec_latch_t *ipl,
- ipha_t *ipha, ip6_t *ip6h, const char **reason, kstat_named_t **counter,
- conn_t *connp)
+static boolean_t
+ipsec_check_ipsecin_latch(ip_recv_attr_t *ira, mblk_t *mp, ipsec_latch_t *ipl,
+ ipsec_action_t *ap, ipha_t *ipha, ip6_t *ip6h, const char **reason,
+ kstat_named_t **counter, conn_t *connp, netstack_t *ns)
{
- netstack_t *ns = ii->ipsec_in_ns;
ipsec_stack_t *ipss = ns->netstack_ipsec;
ASSERT(ipl->ipl_ids_latched == B_TRUE);
+ ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
- if (!ii->ipsec_in_loopback) {
+ if (!(ira->ira_flags & IRAF_LOOPBACK)) {
/*
* Over loopback, there aren't real security associations,
* so there are neither identities nor "unique" values
* for us to check the packet against.
*/
- if ((ii->ipsec_in_ah_sa != NULL) &&
- (!spd_match_inbound_ids(ipl, ii->ipsec_in_ah_sa))) {
- *counter = DROPPER(ipss, ipds_spd_ah_badid);
- *reason = "AH identity mismatch";
- return (B_FALSE);
+ if (ira->ira_ipsec_ah_sa != NULL) {
+ if (!spd_match_inbound_ids(ipl,
+ ira->ira_ipsec_ah_sa)) {
+ *counter = DROPPER(ipss, ipds_spd_ah_badid);
+ *reason = "AH identity mismatch";
+ return (B_FALSE);
+ }
}
- if ((ii->ipsec_in_esp_sa != NULL) &&
- (!spd_match_inbound_ids(ipl, ii->ipsec_in_esp_sa))) {
- *counter = DROPPER(ipss, ipds_spd_esp_badid);
- *reason = "ESP identity mismatch";
- return (B_FALSE);
+ if (ira->ira_ipsec_esp_sa != NULL) {
+ if (!spd_match_inbound_ids(ipl,
+ ira->ira_ipsec_esp_sa)) {
+ *counter = DROPPER(ipss, ipds_spd_esp_badid);
+ *reason = "ESP identity mismatch";
+ return (B_FALSE);
+ }
}
/*
@@ -1886,14 +1835,13 @@ ipsec_check_ipsecin_latch(ipsec_in_t *ii, mblk_t *mp, ipsec_latch_t *ipl,
* In DEBUG kernels (see conn_to_unique()'s implementation),
* verify this even if it REALLY slows things down.
*/
- if (!ipsec_check_ipsecin_unique(ii, reason, counter,
- conn_to_unique(connp, mp, ipha, ip6h))) {
+ if (!ipsec_check_ipsecin_unique(ira, reason, counter,
+ conn_to_unique(connp, mp, ipha, ip6h), ns)) {
return (B_FALSE);
}
}
-
- return (ipsec_check_ipsecin_action(ii, mp, ipl->ipl_in_action,
- ipha, ip6h, reason, counter));
+ return (ipsec_check_ipsecin_action(ira, mp, ap, ipha, ip6h, reason,
+ counter, ns));
}
/*
@@ -1903,52 +1851,48 @@ ipsec_check_ipsecin_latch(ipsec_in_t *ii, mblk_t *mp, ipsec_latch_t *ipl,
* Called from ipsec_check_global_policy, and ipsec_check_inbound_policy.
*
* Consumes a reference to ipsp.
+ * Returns the mblk if ok.
*/
static mblk_t *
-ipsec_check_ipsecin_policy(mblk_t *first_mp, ipsec_policy_t *ipsp,
- ipha_t *ipha, ip6_t *ip6h, uint64_t pkt_unique, netstack_t *ns)
+ipsec_check_ipsecin_policy(mblk_t *data_mp, ipsec_policy_t *ipsp,
+ ipha_t *ipha, ip6_t *ip6h, uint64_t pkt_unique, ip_recv_attr_t *ira,
+ netstack_t *ns)
{
- ipsec_in_t *ii;
ipsec_action_t *ap;
const char *reason = "no policy actions found";
- mblk_t *data_mp, *ipsec_mp;
- ipsec_stack_t *ipss = ns->netstack_ipsec;
ip_stack_t *ipst = ns->netstack_ip;
+ ipsec_stack_t *ipss = ns->netstack_ipsec;
kstat_named_t *counter;
counter = DROPPER(ipss, ipds_spd_got_secure);
- data_mp = first_mp->b_cont;
- ipsec_mp = first_mp;
-
ASSERT(ipsp != NULL);
ASSERT((ipha == NULL && ip6h != NULL) ||
(ip6h == NULL && ipha != NULL));
- ii = (ipsec_in_t *)ipsec_mp->b_rptr;
+ if (ira->ira_flags & IRAF_LOOPBACK)
+ return (ipsec_check_loopback_policy(data_mp, ira, ipsp));
- if (ii->ipsec_in_loopback)
- return (ipsec_check_loopback_policy(first_mp, B_TRUE, ipsp));
- ASSERT(ii->ipsec_in_type == IPSEC_IN);
- ASSERT(ii->ipsec_in_secure);
+ ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
- if (ii->ipsec_in_action != NULL) {
+ if (ira->ira_ipsec_action != NULL) {
/*
* this can happen if we do a double policy-check on a packet
* Would be nice to be able to delete this test..
*/
- IPACT_REFRELE(ii->ipsec_in_action);
+ IPACT_REFRELE(ira->ira_ipsec_action);
}
- ASSERT(ii->ipsec_in_action == NULL);
+ ASSERT(ira->ira_ipsec_action == NULL);
- if (!SA_IDS_MATCH(ii->ipsec_in_ah_sa, ii->ipsec_in_esp_sa)) {
+ if (!SA_IDS_MATCH(ira->ira_ipsec_ah_sa, ira->ira_ipsec_esp_sa)) {
reason = "inbound AH and ESP identities differ";
counter = DROPPER(ipss, ipds_spd_ahesp_diffid);
goto drop;
}
- if (!ipsec_check_ipsecin_unique(ii, &reason, &counter, pkt_unique))
+ if (!ipsec_check_ipsecin_unique(ira, &reason, &counter, pkt_unique,
+ ns))
goto drop;
/*
@@ -1957,21 +1901,21 @@ ipsec_check_ipsecin_policy(mblk_t *first_mp, ipsec_policy_t *ipsp,
*/
for (ap = ipsp->ipsp_act; ap != NULL; ap = ap->ipa_next) {
- if (ipsec_check_ipsecin_action(ii, data_mp, ap,
- ipha, ip6h, &reason, &counter)) {
+ if (ipsec_check_ipsecin_action(ira, data_mp, ap,
+ ipha, ip6h, &reason, &counter, ns)) {
BUMP_MIB(&ipst->ips_ip_mib, ipsecInSucceeded);
- IPPOL_REFRELE(ipsp, ns);
- return (first_mp);
+ IPPOL_REFRELE(ipsp);
+ return (data_mp);
}
}
drop:
ipsec_rl_strlog(ns, IP_MOD_ID, 0, 0, SL_ERROR|SL_WARN|SL_CONSOLE,
"ipsec inbound policy mismatch: %s, packet dropped\n",
reason);
- IPPOL_REFRELE(ipsp, ns);
- ASSERT(ii->ipsec_in_action == NULL);
+ IPPOL_REFRELE(ipsp);
+ ASSERT(ira->ira_ipsec_action == NULL);
BUMP_MIB(&ipst->ips_ip_mib, ipsecInFailed);
- ip_drop_packet(first_mp, B_TRUE, NULL, NULL, counter,
+ ip_drop_packet(data_mp, B_TRUE, NULL, counter,
&ipss->ipsec_spd_dropper);
return (NULL);
}
@@ -2075,7 +2019,7 @@ ipsec_find_policy_chain(ipsec_policy_t *best, ipsec_policy_t *chain,
*/
ipsec_policy_t *
ipsec_find_policy_head(ipsec_policy_t *best, ipsec_policy_head_t *head,
- int direction, ipsec_selector_t *sel, netstack_t *ns)
+ int direction, ipsec_selector_t *sel)
{
ipsec_policy_t *curbest;
ipsec_policy_root_t *root;
@@ -2121,7 +2065,7 @@ ipsec_find_policy_head(ipsec_policy_t *best, ipsec_policy_head_t *head,
IPPOL_REFHOLD(curbest);
if (best != NULL) {
- IPPOL_REFRELE(best, ns);
+ IPPOL_REFRELE(best);
}
}
@@ -2139,20 +2083,17 @@ ipsec_find_policy_head(ipsec_policy_t *best, ipsec_policy_head_t *head,
* reference when done.
*/
ipsec_policy_t *
-ipsec_find_policy(int direction, conn_t *connp, ipsec_out_t *io,
- ipsec_selector_t *sel, netstack_t *ns)
+ipsec_find_policy(int direction, const conn_t *connp, ipsec_selector_t *sel,
+ netstack_t *ns)
{
ipsec_policy_t *p;
ipsec_stack_t *ipss = ns->netstack_ipsec;
p = ipsec_find_policy_head(NULL, &ipss->ipsec_system_policy,
- direction, sel, ns);
+ direction, sel);
if ((connp != NULL) && (connp->conn_policy != NULL)) {
p = ipsec_find_policy_head(p, connp->conn_policy,
- direction, sel, ns);
- } else if ((io != NULL) && (io->ipsec_out_polhead != NULL)) {
- p = ipsec_find_policy_head(p, io->ipsec_out_polhead,
- direction, sel, ns);
+ direction, sel);
}
return (p);
@@ -2172,21 +2113,16 @@ ipsec_find_policy(int direction, conn_t *connp, ipsec_out_t *io,
* floor.
*/
mblk_t *
-ipsec_check_global_policy(mblk_t *first_mp, conn_t *connp,
- ipha_t *ipha, ip6_t *ip6h, boolean_t mctl_present, netstack_t *ns)
+ipsec_check_global_policy(mblk_t *data_mp, conn_t *connp,
+ ipha_t *ipha, ip6_t *ip6h, ip_recv_attr_t *ira, netstack_t *ns)
{
ipsec_policy_t *p;
ipsec_selector_t sel;
- mblk_t *data_mp, *ipsec_mp;
boolean_t policy_present;
kstat_named_t *counter;
- ipsec_in_t *ii = NULL;
uint64_t pkt_unique;
- ipsec_stack_t *ipss = ns->netstack_ipsec;
ip_stack_t *ipst = ns->netstack_ip;
-
- data_mp = mctl_present ? first_mp->b_cont : first_mp;
- ipsec_mp = mctl_present ? first_mp : NULL;
+ ipsec_stack_t *ipss = ns->netstack_ipsec;
sel.ips_is_icmp_inv_acq = 0;
@@ -2203,13 +2139,7 @@ ipsec_check_global_policy(mblk_t *first_mp, conn_t *connp,
* No global policy and no per-socket policy;
* just pass it back (but we shouldn't get here in that case)
*/
- return (first_mp);
- }
-
- if (ipsec_mp != NULL) {
- ASSERT(ipsec_mp->b_datap->db_type == M_CTL);
- ii = (ipsec_in_t *)(ipsec_mp->b_rptr);
- ASSERT(ii->ipsec_in_type == IPSEC_IN);
+ return (data_mp);
}
/*
@@ -2217,32 +2147,11 @@ ipsec_check_global_policy(mblk_t *first_mp, conn_t *connp,
* Otherwise consult system policy.
*/
if ((connp != NULL) && (connp->conn_latch != NULL)) {
- p = connp->conn_latch->ipl_in_policy;
+ p = connp->conn_latch_in_policy;
if (p != NULL) {
IPPOL_REFHOLD(p);
}
/*
- * The caller may have mistakenly assigned an ip6i_t as the
- * ip6h for this packet, so take that corner-case into
- * account.
- */
- if (ip6h != NULL && ip6h->ip6_nxt == IPPROTO_RAW) {
- ip6h++;
- /* First check for bizarro split-mblk headers. */
- if ((uintptr_t)ip6h > (uintptr_t)data_mp->b_wptr ||
- ((uintptr_t)ip6h) + sizeof (ip6_t) >
- (uintptr_t)data_mp->b_wptr) {
- ipsec_log_policy_failure(IPSEC_POLICY_MISMATCH,
- "ipsec_check_global_policy", ipha, ip6h,
- B_TRUE, ns);
- counter = DROPPER(ipss, ipds_spd_nomem);
- goto fail;
- }
- /* Next, see if ip6i is at the end of an mblk. */
- if (ip6h == (ip6_t *)data_mp->b_wptr)
- ip6h = (ip6_t *)data_mp->b_cont->b_rptr;
- }
- /*
* Fudge sel for UNIQUE_ID setting below.
*/
pkt_unique = conn_to_unique(connp, data_mp, ipha, ip6h);
@@ -2271,20 +2180,19 @@ ipsec_check_global_policy(mblk_t *first_mp, conn_t *connp,
* local policy alone.
*/
- p = ipsec_find_policy(IPSEC_TYPE_INBOUND, connp, NULL, &sel,
- ns);
+ p = ipsec_find_policy(IPSEC_TYPE_INBOUND, connp, &sel, ns);
pkt_unique = SA_UNIQUE_ID(sel.ips_remote_port,
sel.ips_local_port, sel.ips_protocol, 0);
}
if (p == NULL) {
- if (ipsec_mp == NULL) {
+ if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) {
/*
* We have no policy; default to succeeding.
* XXX paranoid system design doesn't do this.
*/
BUMP_MIB(&ipst->ips_ip_mib, ipsecInSucceeded);
- return (first_mp);
+ return (data_mp);
} else {
counter = DROPPER(ipss, ipds_spd_got_secure);
ipsec_log_policy_failure(IPSEC_POLICY_NOT_NEEDED,
@@ -2293,16 +2201,16 @@ ipsec_check_global_policy(mblk_t *first_mp, conn_t *connp,
goto fail;
}
}
- if ((ii != NULL) && (ii->ipsec_in_secure)) {
- return (ipsec_check_ipsecin_policy(ipsec_mp, p, ipha, ip6h,
- pkt_unique, ns));
+ if (ira->ira_flags & IRAF_IPSEC_SECURE) {
+ return (ipsec_check_ipsecin_policy(data_mp, p, ipha, ip6h,
+ pkt_unique, ira, ns));
}
if (p->ipsp_act->ipa_allow_clear) {
BUMP_MIB(&ipst->ips_ip_mib, ipsecInSucceeded);
- IPPOL_REFRELE(p, ns);
- return (first_mp);
+ IPPOL_REFRELE(p);
+ return (data_mp);
}
- IPPOL_REFRELE(p, ns);
+ IPPOL_REFRELE(p);
/*
* If we reach here, we will drop the packet because it failed the
* global policy check because the packet was cleartext, and it
@@ -2313,7 +2221,7 @@ ipsec_check_global_policy(mblk_t *first_mp, conn_t *connp,
counter = DROPPER(ipss, ipds_spd_got_clear);
fail:
- ip_drop_packet(first_mp, B_TRUE, NULL, NULL, counter,
+ ip_drop_packet(data_mp, B_TRUE, NULL, counter,
&ipss->ipsec_spd_dropper);
BUMP_MIB(&ipst->ips_ip_mib, ipsecInFailed);
return (NULL);
@@ -2435,7 +2343,7 @@ ipsec_inbound_accept_clear(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h)
case ICMP_FRAGMENTATION_NEEDED:
/*
* Be in sync with icmp_inbound, where we have
- * already set ire_max_frag.
+ * already set dce_pmtu.
*/
#ifdef FRAGCACHE_DEBUG
cmn_err(CE_WARN, "ICMP frag needed\n");
@@ -2496,27 +2404,44 @@ ipsec_latch_ids(ipsec_latch_t *ipl, ipsid_t *local, ipsid_t *remote)
}
void
-ipsec_latch_inbound(ipsec_latch_t *ipl, ipsec_in_t *ii)
+ipsec_latch_inbound(conn_t *connp, ip_recv_attr_t *ira)
{
ipsa_t *sa;
+ ipsec_latch_t *ipl = connp->conn_latch;
if (!ipl->ipl_ids_latched) {
ipsid_t *local = NULL;
ipsid_t *remote = NULL;
- if (!ii->ipsec_in_loopback) {
- if (ii->ipsec_in_esp_sa != NULL)
- sa = ii->ipsec_in_esp_sa;
+ if (!(ira->ira_flags & IRAF_LOOPBACK)) {
+ ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
+ if (ira->ira_ipsec_esp_sa != NULL)
+ sa = ira->ira_ipsec_esp_sa;
else
- sa = ii->ipsec_in_ah_sa;
+ sa = ira->ira_ipsec_ah_sa;
ASSERT(sa != NULL);
local = sa->ipsa_dst_cid;
remote = sa->ipsa_src_cid;
}
ipsec_latch_ids(ipl, local, remote);
}
- ipl->ipl_in_action = ii->ipsec_in_action;
- IPACT_REFHOLD(ipl->ipl_in_action);
+ if (ira->ira_flags & IRAF_IPSEC_SECURE) {
+ if (connp->conn_latch_in_action != NULL) {
+ /*
+ * Previously cached action. This is probably
+ * harmless, but in DEBUG kernels, check for
+ * action equality.
+ *
+ * Preserve the existing action to preserve latch
+ * invariance.
+ */
+ ASSERT(connp->conn_latch_in_action ==
+ ira->ira_ipsec_action);
+ return;
+ }
+ connp->conn_latch_in_action = ira->ira_ipsec_action;
+ IPACT_REFHOLD(connp->conn_latch_in_action);
+ }
}
/*
@@ -2527,27 +2452,25 @@ ipsec_latch_inbound(ipsec_latch_t *ipl, ipsec_in_t *ii)
* see also ipsec_check_ipsecin_latch() and ipsec_check_global_policy()
*/
mblk_t *
-ipsec_check_inbound_policy(mblk_t *first_mp, conn_t *connp,
- ipha_t *ipha, ip6_t *ip6h, boolean_t mctl_present)
+ipsec_check_inbound_policy(mblk_t *mp, conn_t *connp,
+ ipha_t *ipha, ip6_t *ip6h, ip_recv_attr_t *ira)
{
- ipsec_in_t *ii;
- boolean_t ret;
- mblk_t *mp = mctl_present ? first_mp->b_cont : first_mp;
- mblk_t *ipsec_mp = mctl_present ? first_mp : NULL;
- ipsec_latch_t *ipl;
- uint64_t unique_id;
+ boolean_t ret;
+ ipsec_latch_t *ipl;
+ ipsec_action_t *ap;
+ uint64_t unique_id;
ipsec_stack_t *ipss;
ip_stack_t *ipst;
netstack_t *ns;
ipsec_policy_head_t *policy_head;
+ ipsec_policy_t *p = NULL;
ASSERT(connp != NULL);
ns = connp->conn_netstack;
ipss = ns->netstack_ipsec;
ipst = ns->netstack_ip;
- if (ipsec_mp == NULL) {
-clear:
+ if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) {
/*
* This is the case where the incoming datagram is
* cleartext and we need to see whether this client
@@ -2559,49 +2482,49 @@ clear:
mutex_enter(&connp->conn_lock);
if (connp->conn_state_flags & CONN_CONDEMNED) {
mutex_exit(&connp->conn_lock);
- ip_drop_packet(first_mp, B_TRUE, NULL,
- NULL, DROPPER(ipss, ipds_spd_got_clear),
+ ip_drop_packet(mp, B_TRUE, NULL,
+ DROPPER(ipss, ipds_spd_got_clear),
&ipss->ipsec_spd_dropper);
BUMP_MIB(&ipst->ips_ip_mib, ipsecInFailed);
return (NULL);
}
- if ((ipl = connp->conn_latch) != NULL) {
+ if (connp->conn_latch != NULL) {
/* Hold a reference in case the conn is closing */
- IPLATCH_REFHOLD(ipl);
+ p = connp->conn_latch_in_policy;
+ if (p != NULL)
+ IPPOL_REFHOLD(p);
mutex_exit(&connp->conn_lock);
/*
* Policy is cached in the conn.
*/
- if ((ipl->ipl_in_policy != NULL) &&
- (!ipl->ipl_in_policy->ipsp_act->ipa_allow_clear)) {
+ if (p != NULL && !p->ipsp_act->ipa_allow_clear) {
ret = ipsec_inbound_accept_clear(mp,
ipha, ip6h);
if (ret) {
BUMP_MIB(&ipst->ips_ip_mib,
ipsecInSucceeded);
- IPLATCH_REFRELE(ipl, ns);
- return (first_mp);
+ IPPOL_REFRELE(p);
+ return (mp);
} else {
ipsec_log_policy_failure(
IPSEC_POLICY_MISMATCH,
"ipsec_check_inbound_policy", ipha,
ip6h, B_FALSE, ns);
- ip_drop_packet(first_mp, B_TRUE, NULL,
- NULL,
+ ip_drop_packet(mp, B_TRUE, NULL,
DROPPER(ipss, ipds_spd_got_clear),
&ipss->ipsec_spd_dropper);
BUMP_MIB(&ipst->ips_ip_mib,
ipsecInFailed);
- IPLATCH_REFRELE(ipl, ns);
+ IPPOL_REFRELE(p);
return (NULL);
}
} else {
BUMP_MIB(&ipst->ips_ip_mib, ipsecInSucceeded);
- IPLATCH_REFRELE(ipl, ns);
- return (first_mp);
+ if (p != NULL)
+ IPPOL_REFRELE(p);
+ return (mp);
}
} else {
- uchar_t db_type;
policy_head = connp->conn_policy;
/* Hold a reference in case the conn is closing */
@@ -2611,50 +2534,22 @@ clear:
/*
* As this is a non-hardbound connection we need
* to look at both per-socket policy and global
- * policy. As this is cleartext, mark the mp as
- * M_DATA in case if it is an ICMP error being
- * reported before calling ipsec_check_global_policy
- * so that it does not mistake it for IPSEC_IN.
+ * policy.
*/
- db_type = mp->b_datap->db_type;
- mp->b_datap->db_type = M_DATA;
- first_mp = ipsec_check_global_policy(first_mp, connp,
- ipha, ip6h, mctl_present, ns);
+ mp = ipsec_check_global_policy(mp, connp,
+ ipha, ip6h, ira, ns);
if (policy_head != NULL)
IPPH_REFRELE(policy_head, ns);
- if (first_mp != NULL)
- mp->b_datap->db_type = db_type;
- return (first_mp);
+ return (mp);
}
}
- /*
- * If it is inbound check whether the attached message
- * is secure or not. We have a special case for ICMP,
- * where we have a IPSEC_IN message and the attached
- * message is not secure. See icmp_inbound_error_fanout
- * for details.
- */
- ASSERT(ipsec_mp != NULL);
- ASSERT(ipsec_mp->b_datap->db_type == M_CTL);
- ii = (ipsec_in_t *)ipsec_mp->b_rptr;
-
- if (!ii->ipsec_in_secure)
- goto clear;
-
- /*
- * mp->b_cont could be either a M_CTL message
- * for icmp errors being sent up or a M_DATA message.
- */
- ASSERT(mp->b_datap->db_type == M_CTL || mp->b_datap->db_type == M_DATA);
-
- ASSERT(ii->ipsec_in_type == IPSEC_IN);
mutex_enter(&connp->conn_lock);
/* Connection is closing */
if (connp->conn_state_flags & CONN_CONDEMNED) {
mutex_exit(&connp->conn_lock);
- ip_drop_packet(first_mp, B_TRUE, NULL,
- NULL, DROPPER(ipss, ipds_spd_got_clear),
+ ip_drop_packet(mp, B_TRUE, NULL,
+ DROPPER(ipss, ipds_spd_got_clear),
&ipss->ipsec_spd_dropper);
BUMP_MIB(&ipst->ips_ip_mib, ipsecInFailed);
return (NULL);
@@ -2679,58 +2574,64 @@ clear:
* policy. It will check against conn or global
* depending on whichever is stronger.
*/
- retmp = ipsec_check_global_policy(first_mp, connp,
- ipha, ip6h, mctl_present, ns);
+ retmp = ipsec_check_global_policy(mp, connp,
+ ipha, ip6h, ira, ns);
if (policy_head != NULL)
IPPH_REFRELE(policy_head, ns);
return (retmp);
}
IPLATCH_REFHOLD(ipl);
+ /* Hold reference on conn_latch_in_action in case conn is closing */
+ ap = connp->conn_latch_in_action;
+ if (ap != NULL)
+ IPACT_REFHOLD(ap);
mutex_exit(&connp->conn_lock);
- if (ipl->ipl_in_action != NULL) {
+ if (ap != NULL) {
/* Policy is cached & latched; fast(er) path */
const char *reason;
kstat_named_t *counter;
- if (ipsec_check_ipsecin_latch(ii, mp, ipl,
- ipha, ip6h, &reason, &counter, connp)) {
+ if (ipsec_check_ipsecin_latch(ira, mp, ipl, ap,
+ ipha, ip6h, &reason, &counter, connp, ns)) {
BUMP_MIB(&ipst->ips_ip_mib, ipsecInSucceeded);
- IPLATCH_REFRELE(ipl, ns);
- return (first_mp);
+ IPLATCH_REFRELE(ipl);
+ IPACT_REFRELE(ap);
+ return (mp);
}
ipsec_rl_strlog(ns, IP_MOD_ID, 0, 0,
SL_ERROR|SL_WARN|SL_CONSOLE,
"ipsec inbound policy mismatch: %s, packet dropped\n",
reason);
- ip_drop_packet(first_mp, B_TRUE, NULL, NULL, counter,
+ ip_drop_packet(mp, B_TRUE, NULL, counter,
&ipss->ipsec_spd_dropper);
BUMP_MIB(&ipst->ips_ip_mib, ipsecInFailed);
- IPLATCH_REFRELE(ipl, ns);
+ IPLATCH_REFRELE(ipl);
+ IPACT_REFRELE(ap);
return (NULL);
- } else if (ipl->ipl_in_policy == NULL) {
+ }
+ if ((p = connp->conn_latch_in_policy) == NULL) {
ipsec_weird_null_inbound_policy++;
- IPLATCH_REFRELE(ipl, ns);
- return (first_mp);
+ IPLATCH_REFRELE(ipl);
+ return (mp);
}
unique_id = conn_to_unique(connp, mp, ipha, ip6h);
- IPPOL_REFHOLD(ipl->ipl_in_policy);
- first_mp = ipsec_check_ipsecin_policy(first_mp, ipl->ipl_in_policy,
- ipha, ip6h, unique_id, ns);
+ IPPOL_REFHOLD(p);
+ mp = ipsec_check_ipsecin_policy(mp, p, ipha, ip6h, unique_id, ira, ns);
/*
* NOTE: ipsecIn{Failed,Succeeeded} bumped by
* ipsec_check_ipsecin_policy().
*/
- if (first_mp != NULL)
- ipsec_latch_inbound(ipl, ii);
- IPLATCH_REFRELE(ipl, ns);
- return (first_mp);
+ if (mp != NULL)
+ ipsec_latch_inbound(connp, ira);
+ IPLATCH_REFRELE(ipl);
+ return (mp);
}
/*
- * Handle all sorts of cases like tunnel-mode, ICMP, and ip6i prepending.
+ * Handle all sorts of cases like tunnel-mode and ICMP.
*/
static int
prepended_length(mblk_t *mp, uintptr_t hptr)
@@ -2779,19 +2680,24 @@ prepended_length(mblk_t *mp, uintptr_t hptr)
* should put this packet in a fragment-gathering queue.
* Only returned if SEL_TUNNEL_MODE and SEL_PORT_POLICY
* is set.
+ *
+ * Note that ipha/ip6h can be in a different mblk (mp->b_cont) in the case
+ * of tunneled packets.
+ * Also, mp->b_rptr can be an ICMP error where ipha/ip6h is the packet in
+ * error past the ICMP error.
*/
static selret_t
ipsec_init_inbound_sel(ipsec_selector_t *sel, mblk_t *mp, ipha_t *ipha,
ip6_t *ip6h, uint8_t sel_flags)
{
uint16_t *ports;
- int outer_hdr_len = 0; /* For ICMP, tunnel-mode, or ip6i cases... */
+ int outer_hdr_len = 0; /* For ICMP or tunnel-mode cases... */
ushort_t hdr_len;
mblk_t *spare_mp = NULL;
uint8_t *nexthdrp, *transportp;
uint8_t nexthdr;
uint8_t icmp_proto;
- ip6_pkt_t ipp;
+ ip_pkt_t ipp;
boolean_t port_policy_present = (sel_flags & SEL_PORT_POLICY);
boolean_t is_icmp = (sel_flags & SEL_IS_ICMP);
boolean_t tunnel_mode = (sel_flags & SEL_TUNNEL_MODE);
@@ -2802,44 +2708,14 @@ ipsec_init_inbound_sel(ipsec_selector_t *sel, mblk_t *mp, ipha_t *ipha,
if (ip6h != NULL) {
outer_hdr_len = prepended_length(mp, (uintptr_t)ip6h);
-
nexthdr = ip6h->ip6_nxt;
-
- /*
- * The caller may have mistakenly assigned an ip6i_t as the
- * ip6h for this packet, so take that corner-case into
- * account.
- */
- if (nexthdr == IPPROTO_RAW) {
- ip6h++;
- /* First check for bizarro split-mblk headers. */
- if ((uintptr_t)ip6h > (uintptr_t)mp->b_wptr ||
- ((uintptr_t)ip6h) + sizeof (ip6_t) >
- (uintptr_t)mp->b_wptr) {
- return (SELRET_BADPKT);
- }
- /* Next, see if ip6i is at the end of an mblk. */
- if (ip6h == (ip6_t *)mp->b_wptr)
- ip6h = (ip6_t *)mp->b_cont->b_rptr;
-
- nexthdr = ip6h->ip6_nxt;
-
- /*
- * Finally, if we haven't adjusted for ip6i, do so
- * now. ip6i_t structs are prepended, so an ICMP
- * or tunnel packet would just be overwritten.
- */
- if (outer_hdr_len == 0)
- outer_hdr_len = sizeof (ip6i_t);
- }
-
icmp_proto = IPPROTO_ICMPV6;
sel->ips_isv4 = B_FALSE;
sel->ips_local_addr_v6 = ip6h->ip6_dst;
sel->ips_remote_addr_v6 = ip6h->ip6_src;
bzero(&ipp, sizeof (ipp));
- (void) ip_find_hdr_v6(mp, ip6h, &ipp, NULL);
+ (void) ip_find_hdr_v6(mp, ip6h, B_FALSE, &ipp, NULL);
switch (nexthdr) {
case IPPROTO_HOPOPTS:
@@ -2852,7 +2728,6 @@ ipsec_init_inbound_sel(ipsec_selector_t *sel, mblk_t *mp, ipha_t *ipha,
*/
if ((spare_mp = msgpullup(mp, -1)) == NULL)
return (SELRET_NOMEM);
-
if (!ip_hdr_length_nexthdr_v6(spare_mp,
(ip6_t *)(spare_mp->b_rptr + outer_hdr_len),
&hdr_len, &nexthdrp)) {
@@ -2930,6 +2805,10 @@ ipsec_init_inbound_sel(ipsec_selector_t *sel, mblk_t *mp, ipha_t *ipha,
return (SELRET_SUCCESS);
}
+/*
+ * This is called with a b_next chain of messages from the fragcache code,
+ * hence it needs to discard a chain on error.
+ */
static boolean_t
ipsec_init_outbound_ports(ipsec_selector_t *sel, mblk_t *mp, ipha_t *ipha,
ip6_t *ip6h, int outer_hdr_len, ipsec_stack_t *ipss)
@@ -2967,7 +2846,7 @@ ipsec_init_outbound_ports(ipsec_selector_t *sel, mblk_t *mp, ipha_t *ipha,
&hdr_len, &nexthdrp)) {
/* Always works, even if NULL. */
ipsec_freemsg_chain(spare_mp);
- ip_drop_packet_chain(mp, B_FALSE, NULL, NULL,
+ ip_drop_packet_chain(mp, B_FALSE, NULL,
DROPPER(ipss, ipds_spd_nomem),
&ipss->ipsec_spd_dropper);
return (B_FALSE);
@@ -3005,7 +2884,7 @@ ipsec_init_outbound_ports(ipsec_selector_t *sel, mblk_t *mp, ipha_t *ipha,
*/
if (spare_mp == NULL &&
(spare_mp = msgpullup(mp, -1)) == NULL) {
- ip_drop_packet_chain(mp, B_FALSE, NULL, NULL,
+ ip_drop_packet_chain(mp, B_FALSE, NULL,
DROPPER(ipss, ipds_spd_nomem),
&ipss->ipsec_spd_dropper);
return (B_FALSE);
@@ -3029,13 +2908,68 @@ ipsec_init_outbound_ports(ipsec_selector_t *sel, mblk_t *mp, ipha_t *ipha,
}
/*
+ * Prepend an mblk with a ipsec_crypto_t to the message chain.
+ * Frees the argument and returns NULL should the allocation fail.
+ * Returns the pointer to the crypto data part.
+ */
+mblk_t *
+ipsec_add_crypto_data(mblk_t *data_mp, ipsec_crypto_t **icp)
+{
+ mblk_t *mp;
+
+ mp = allocb(sizeof (ipsec_crypto_t), BPRI_MED);
+ if (mp == NULL) {
+ freemsg(data_mp);
+ return (NULL);
+ }
+ bzero(mp->b_rptr, sizeof (ipsec_crypto_t));
+ mp->b_wptr += sizeof (ipsec_crypto_t);
+ mp->b_cont = data_mp;
+ mp->b_datap->db_type = M_EVENT; /* For ASSERT */
+ *icp = (ipsec_crypto_t *)mp->b_rptr;
+ return (mp);
+}
+
+/*
+ * Remove what was prepended above. Return b_cont and a pointer to the
+ * crypto data.
+ * The caller must call ipsec_free_crypto_data for mblk once it is done
+ * with the crypto data.
+ */
+mblk_t *
+ipsec_remove_crypto_data(mblk_t *crypto_mp, ipsec_crypto_t **icp)
+{
+ ASSERT(crypto_mp->b_datap->db_type == M_EVENT);
+ ASSERT(MBLKL(crypto_mp) == sizeof (ipsec_crypto_t));
+
+ *icp = (ipsec_crypto_t *)crypto_mp->b_rptr;
+ return (crypto_mp->b_cont);
+}
+
+/*
+ * Free what was prepended above. Return b_cont.
+ */
+mblk_t *
+ipsec_free_crypto_data(mblk_t *crypto_mp)
+{
+ mblk_t *mp;
+
+ ASSERT(crypto_mp->b_datap->db_type == M_EVENT);
+ ASSERT(MBLKL(crypto_mp) == sizeof (ipsec_crypto_t));
+
+ mp = crypto_mp->b_cont;
+ freeb(crypto_mp);
+ return (mp);
+}
+
+/*
* Create an ipsec_action_t based on the way an inbound packet was protected.
* Used to reflect traffic back to a sender.
*
* We don't bother interning the action into the hash table.
*/
ipsec_action_t *
-ipsec_in_to_out_action(ipsec_in_t *ii)
+ipsec_in_to_out_action(ip_recv_attr_t *ira)
{
ipsa_t *ah_assoc, *esp_assoc;
uint_t auth_alg = 0, encr_alg = 0, espa_alg = 0;
@@ -3057,10 +2991,12 @@ ipsec_in_to_out_action(ipsec_in_t *ii)
*/
ap->ipa_act.ipa_type = IPSEC_ACT_APPLY;
ap->ipa_act.ipa_log = 0;
- ah_assoc = ii->ipsec_in_ah_sa;
+ ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
+
+ ah_assoc = ira->ira_ipsec_ah_sa;
ap->ipa_act.ipa_apply.ipp_use_ah = (ah_assoc != NULL);
- esp_assoc = ii->ipsec_in_esp_sa;
+ esp_assoc = ira->ira_ipsec_esp_sa;
ap->ipa_act.ipa_apply.ipp_use_esp = (esp_assoc != NULL);
if (esp_assoc != NULL) {
@@ -3074,7 +3010,8 @@ ipsec_in_to_out_action(ipsec_in_t *ii)
ap->ipa_act.ipa_apply.ipp_encr_alg = (uint8_t)encr_alg;
ap->ipa_act.ipa_apply.ipp_auth_alg = (uint8_t)auth_alg;
ap->ipa_act.ipa_apply.ipp_esp_auth_alg = (uint8_t)espa_alg;
- ap->ipa_act.ipa_apply.ipp_use_se = ii->ipsec_in_decaps;
+ ap->ipa_act.ipa_apply.ipp_use_se =
+ !!(ira->ira_flags & IRAF_IPSEC_DECAPS);
unique = B_FALSE;
if (esp_assoc != NULL) {
@@ -3104,7 +3041,7 @@ ipsec_in_to_out_action(ipsec_in_t *ii)
ap->ipa_act.ipa_apply.ipp_use_unique = unique;
ap->ipa_want_unique = unique;
ap->ipa_allow_clear = B_FALSE;
- ap->ipa_want_se = ii->ipsec_in_decaps;
+ ap->ipa_want_se = !!(ira->ira_flags & IRAF_IPSEC_DECAPS);
ap->ipa_want_ah = (ah_assoc != NULL);
ap->ipa_want_esp = (esp_assoc != NULL);
@@ -3500,13 +3437,14 @@ ipsec_sel_rel(ipsec_sel_t **spp, netstack_t *ns)
* Free a policy rule which we know is no longer being referenced.
*/
void
-ipsec_policy_free(ipsec_policy_t *ipp, netstack_t *ns)
+ipsec_policy_free(ipsec_policy_t *ipp)
{
ASSERT(ipp->ipsp_refs == 0);
ASSERT(ipp->ipsp_sel != NULL);
ASSERT(ipp->ipsp_act != NULL);
+ ASSERT(ipp->ipsp_netstack != NULL);
- ipsec_sel_rel(&ipp->ipsp_sel, ns);
+ ipsec_sel_rel(&ipp->ipsp_sel, ipp->ipsp_netstack);
IPACT_REFRELE(ipp->ipsp_act);
kmem_cache_free(ipsec_pol_cache, ipp);
}
@@ -3544,6 +3482,7 @@ ipsec_policy_create(ipsec_selkey_t *keys, const ipsec_act_t *a,
HASH_NULL(ipp, ipsp_hash);
+ ipp->ipsp_netstack = ns; /* Needed for ipsec_policy_free */
ipp->ipsp_refs = 1; /* caller's reference */
ipp->ipsp_sel = sp;
ipp->ipsp_act = ap;
@@ -3613,7 +3552,7 @@ ipsec_policy_delete(ipsec_policy_head_t *php, ipsec_selkey_t *keys, int dir,
continue;
}
- IPPOL_UNCHAIN(php, ip, ns);
+ IPPOL_UNCHAIN(php, ip);
php->iph_gen++;
ipsec_update_present_flags(ns->netstack_ipsec);
@@ -3664,7 +3603,7 @@ ipsec_policy_delete_index(ipsec_policy_head_t *php, uint64_t policy_index,
break;
}
- IPPOL_UNCHAIN(php, ip, ns);
+ IPPOL_UNCHAIN(php, ip);
found = B_TRUE;
}
@@ -3897,8 +3836,7 @@ ipsec_enter_policy(ipsec_policy_head_t *php, ipsec_policy_t *ipp, int direction,
}
static void
-ipsec_ipr_flush(ipsec_policy_head_t *php, ipsec_policy_root_t *ipr,
- netstack_t *ns)
+ipsec_ipr_flush(ipsec_policy_head_t *php, ipsec_policy_root_t *ipr)
{
ipsec_policy_t *ip, *nip;
int af, chain, nchain;
@@ -3906,7 +3844,7 @@ ipsec_ipr_flush(ipsec_policy_head_t *php, ipsec_policy_root_t *ipr,
for (af = 0; af < IPSEC_NAF; af++) {
for (ip = ipr->ipr_nonhash[af]; ip != NULL; ip = nip) {
nip = ip->ipsp_hash.hash_next;
- IPPOL_UNCHAIN(php, ip, ns);
+ IPPOL_UNCHAIN(php, ip);
}
ipr->ipr_nonhash[af] = NULL;
}
@@ -3916,7 +3854,7 @@ ipsec_ipr_flush(ipsec_policy_head_t *php, ipsec_policy_root_t *ipr,
for (ip = ipr->ipr_hash[chain].hash_head; ip != NULL;
ip = nip) {
nip = ip->ipsp_hash.hash_next;
- IPPOL_UNCHAIN(php, ip, ns);
+ IPPOL_UNCHAIN(php, ip);
}
ipr->ipr_hash[chain].hash_head = NULL;
}
@@ -3954,8 +3892,9 @@ ipsec_polhead_flush(ipsec_policy_head_t *php, netstack_t *ns)
ASSERT(RW_WRITE_HELD(&php->iph_lock));
for (dir = 0; dir < IPSEC_NTYPES; dir++)
- ipsec_ipr_flush(php, &php->iph_root[dir], ns);
+ ipsec_ipr_flush(php, &php->iph_root[dir]);
+ php->iph_gen++;
ipsec_update_present_flags(ns->netstack_ipsec);
}
@@ -4066,727 +4005,219 @@ ipsec_polhead_split(ipsec_policy_head_t *php, netstack_t *ns)
*
* NOTE2: This function is called by cleartext cases, so it needs to be
* in IP proper.
+ *
+ * Note: the caller has moved other parts of ira into ixa already.
*/
boolean_t
-ipsec_in_to_out(mblk_t *ipsec_mp, ipha_t *ipha, ip6_t *ip6h, zoneid_t zoneid)
-{
- ipsec_in_t *ii;
- ipsec_out_t *io;
- boolean_t v4;
- mblk_t *mp;
- boolean_t secure;
- uint_t ifindex;
+ipsec_in_to_out(ip_recv_attr_t *ira, ip_xmit_attr_t *ixa, mblk_t *data_mp,
+ ipha_t *ipha, ip6_t *ip6h)
+{
ipsec_selector_t sel;
- ipsec_action_t *reflect_action = NULL;
- netstack_t *ns;
-
- ASSERT(ipsec_mp->b_datap->db_type == M_CTL);
+ ipsec_action_t *reflect_action = NULL;
+ netstack_t *ns = ixa->ixa_ipst->ips_netstack;
bzero((void*)&sel, sizeof (sel));
- ii = (ipsec_in_t *)ipsec_mp->b_rptr;
-
- mp = ipsec_mp->b_cont;
- ASSERT(mp != NULL);
-
- if (ii->ipsec_in_action != NULL) {
+ if (ira->ira_ipsec_action != NULL) {
/* transfer reference.. */
- reflect_action = ii->ipsec_in_action;
- ii->ipsec_in_action = NULL;
- } else if (!ii->ipsec_in_loopback)
- reflect_action = ipsec_in_to_out_action(ii);
- secure = ii->ipsec_in_secure;
- ifindex = ii->ipsec_in_ill_index;
- ns = ii->ipsec_in_ns;
- v4 = ii->ipsec_in_v4;
-
- ipsec_in_release_refs(ii); /* No netstack_rele/hold needed */
-
- /*
- * Use the global zone's id if we don't have a specific zone
- * identified. This is likely to happen when the received packet's
- * destination is a Trusted Extensions all-zones address. We did
- * not copy the zoneid from ii->ipsec_in_zone id because that
- * information represents the zoneid we started input processing
- * with. The caller should have a better idea of which zone the
- * received packet was destined for.
- */
-
- if (zoneid == ALL_ZONES)
- zoneid = GLOBAL_ZONEID;
+ reflect_action = ira->ira_ipsec_action;
+ ira->ira_ipsec_action = NULL;
+ } else if (!(ira->ira_flags & IRAF_LOOPBACK))
+ reflect_action = ipsec_in_to_out_action(ira);
/*
* The caller is going to send the datagram out which might
- * go on the wire or delivered locally through ip_wput_local.
+ * go on the wire or delivered locally through ire_send_local.
*
* 1) If it goes out on the wire, new associations will be
* obtained.
- * 2) If it is delivered locally, ip_wput_local will convert
- * this IPSEC_OUT to a IPSEC_IN looking at the requests.
+ * 2) If it is delivered locally, ire_send_local will convert
+ * this ip_xmit_attr_t back to a ip_recv_attr_t looking at the
+ * requests.
*/
+ ixa->ixa_ipsec_action = reflect_action;
- io = (ipsec_out_t *)ipsec_mp->b_rptr;
- bzero(io, sizeof (ipsec_out_t));
- io->ipsec_out_type = IPSEC_OUT;
- io->ipsec_out_len = sizeof (ipsec_out_t);
- io->ipsec_out_frtn.free_func = ipsec_out_free;
- io->ipsec_out_frtn.free_arg = (char *)io;
- io->ipsec_out_act = reflect_action;
-
- if (!ipsec_init_outbound_ports(&sel, mp, ipha, ip6h, 0,
- ns->netstack_ipsec))
+ if (!ipsec_init_outbound_ports(&sel, data_mp, ipha, ip6h, 0,
+ ns->netstack_ipsec)) {
+ /* Note: data_mp already consumed and ip_drop_packet done */
return (B_FALSE);
-
- io->ipsec_out_src_port = sel.ips_local_port;
- io->ipsec_out_dst_port = sel.ips_remote_port;
- io->ipsec_out_proto = sel.ips_protocol;
- io->ipsec_out_icmp_type = sel.ips_icmp_type;
- io->ipsec_out_icmp_code = sel.ips_icmp_code;
+ }
+ ixa->ixa_ipsec_src_port = sel.ips_local_port;
+ ixa->ixa_ipsec_dst_port = sel.ips_remote_port;
+ ixa->ixa_ipsec_proto = sel.ips_protocol;
+ ixa->ixa_ipsec_icmp_type = sel.ips_icmp_type;
+ ixa->ixa_ipsec_icmp_code = sel.ips_icmp_code;
/*
* Don't use global policy for this, as we want
* to use the same protection that was applied to the inbound packet.
+ * Thus we set IXAF_NO_IPSEC if it arrived in the clear to make
+ * it be sent in the clear.
*/
- io->ipsec_out_use_global_policy = B_FALSE;
- io->ipsec_out_proc_begin = B_FALSE;
- io->ipsec_out_secure = secure;
- io->ipsec_out_v4 = v4;
- io->ipsec_out_ill_index = ifindex;
- io->ipsec_out_zoneid = zoneid;
- io->ipsec_out_ns = ns; /* No netstack_hold */
+ if (ira->ira_flags & IRAF_IPSEC_SECURE)
+ ixa->ixa_flags |= IXAF_IPSEC_SECURE;
+ else
+ ixa->ixa_flags |= IXAF_NO_IPSEC;
return (B_TRUE);
}
-mblk_t *
-ipsec_in_tag(mblk_t *mp, mblk_t *cont, netstack_t *ns)
-{
- ipsec_in_t *ii = (ipsec_in_t *)mp->b_rptr;
- ipsec_in_t *nii;
- mblk_t *nmp;
- frtn_t nfrtn;
- ipsec_stack_t *ipss = ns->netstack_ipsec;
-
- ASSERT(ii->ipsec_in_type == IPSEC_IN);
- ASSERT(ii->ipsec_in_len == sizeof (ipsec_in_t));
-
- nmp = ipsec_in_alloc(ii->ipsec_in_v4, ns);
- if (nmp == NULL) {
- ip_drop_packet_chain(cont, B_FALSE, NULL, NULL,
- DROPPER(ipss, ipds_spd_nomem),
- &ipss->ipsec_spd_dropper);
- return (NULL);
- }
-
- ASSERT(nmp->b_datap->db_type == M_CTL);
- ASSERT(nmp->b_wptr == (nmp->b_rptr + sizeof (ipsec_info_t)));
-
- /*
- * Bump refcounts.
- */
- if (ii->ipsec_in_ah_sa != NULL)
- IPSA_REFHOLD(ii->ipsec_in_ah_sa);
- if (ii->ipsec_in_esp_sa != NULL)
- IPSA_REFHOLD(ii->ipsec_in_esp_sa);
- if (ii->ipsec_in_policy != NULL)
- IPPH_REFHOLD(ii->ipsec_in_policy);
-
- /*
- * Copy everything, but preserve the free routine provided by
- * ipsec_in_alloc().
- */
- nii = (ipsec_in_t *)nmp->b_rptr;
- nfrtn = nii->ipsec_in_frtn;
- bcopy(ii, nii, sizeof (*ii));
- nii->ipsec_in_frtn = nfrtn;
-
- nmp->b_cont = cont;
-
- return (nmp);
-}
-
-mblk_t *
-ipsec_out_tag(mblk_t *mp, mblk_t *cont, netstack_t *ns)
-{
- ipsec_out_t *io = (ipsec_out_t *)mp->b_rptr;
- ipsec_out_t *nio;
- mblk_t *nmp;
- frtn_t nfrtn;
- ipsec_stack_t *ipss = ns->netstack_ipsec;
-
- ASSERT(io->ipsec_out_type == IPSEC_OUT);
- ASSERT(io->ipsec_out_len == sizeof (ipsec_out_t));
-
- nmp = ipsec_alloc_ipsec_out(ns);
- if (nmp == NULL) {
- ip_drop_packet_chain(cont, B_FALSE, NULL, NULL,
- DROPPER(ipss, ipds_spd_nomem),
- &ipss->ipsec_spd_dropper);
- return (NULL);
- }
- ASSERT(nmp->b_datap->db_type == M_CTL);
- ASSERT(nmp->b_wptr == (nmp->b_rptr + sizeof (ipsec_info_t)));
-
- /*
- * Bump refcounts.
- */
- if (io->ipsec_out_ah_sa != NULL)
- IPSA_REFHOLD(io->ipsec_out_ah_sa);
- if (io->ipsec_out_esp_sa != NULL)
- IPSA_REFHOLD(io->ipsec_out_esp_sa);
- if (io->ipsec_out_polhead != NULL)
- IPPH_REFHOLD(io->ipsec_out_polhead);
- if (io->ipsec_out_policy != NULL)
- IPPOL_REFHOLD(io->ipsec_out_policy);
- if (io->ipsec_out_act != NULL)
- IPACT_REFHOLD(io->ipsec_out_act);
- if (io->ipsec_out_latch != NULL)
- IPLATCH_REFHOLD(io->ipsec_out_latch);
- if (io->ipsec_out_cred != NULL)
- crhold(io->ipsec_out_cred);
-
- /*
- * Copy everything, but preserve the free routine provided by
- * ipsec_alloc_ipsec_out().
- */
- nio = (ipsec_out_t *)nmp->b_rptr;
- nfrtn = nio->ipsec_out_frtn;
- bcopy(io, nio, sizeof (*io));
- nio->ipsec_out_frtn = nfrtn;
-
- nmp->b_cont = cont;
-
- return (nmp);
-}
-
-static void
-ipsec_out_release_refs(ipsec_out_t *io)
+void
+ipsec_out_release_refs(ip_xmit_attr_t *ixa)
{
- netstack_t *ns = io->ipsec_out_ns;
-
- ASSERT(io->ipsec_out_type == IPSEC_OUT);
- ASSERT(io->ipsec_out_len == sizeof (ipsec_out_t));
- ASSERT(io->ipsec_out_ns != NULL);
+ if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE))
+ return;
- /* Note: IPSA_REFRELE is multi-line macro */
- if (io->ipsec_out_ah_sa != NULL)
- IPSA_REFRELE(io->ipsec_out_ah_sa);
- if (io->ipsec_out_esp_sa != NULL)
- IPSA_REFRELE(io->ipsec_out_esp_sa);
- if (io->ipsec_out_polhead != NULL)
- IPPH_REFRELE(io->ipsec_out_polhead, ns);
- if (io->ipsec_out_policy != NULL)
- IPPOL_REFRELE(io->ipsec_out_policy, ns);
- if (io->ipsec_out_act != NULL)
- IPACT_REFRELE(io->ipsec_out_act);
- if (io->ipsec_out_cred != NULL) {
- crfree(io->ipsec_out_cred);
- io->ipsec_out_cred = NULL;
+ if (ixa->ixa_ipsec_ah_sa != NULL) {
+ IPSA_REFRELE(ixa->ixa_ipsec_ah_sa);
+ ixa->ixa_ipsec_ah_sa = NULL;
}
- if (io->ipsec_out_latch) {
- IPLATCH_REFRELE(io->ipsec_out_latch, ns);
- io->ipsec_out_latch = NULL;
+ if (ixa->ixa_ipsec_esp_sa != NULL) {
+ IPSA_REFRELE(ixa->ixa_ipsec_esp_sa);
+ ixa->ixa_ipsec_esp_sa = NULL;
}
-}
-
-static void
-ipsec_out_free(void *arg)
-{
- ipsec_out_t *io = (ipsec_out_t *)arg;
- ipsec_out_release_refs(io);
- kmem_cache_free(ipsec_info_cache, arg);
-}
-
-static void
-ipsec_in_release_refs(ipsec_in_t *ii)
-{
- netstack_t *ns = ii->ipsec_in_ns;
-
- ASSERT(ii->ipsec_in_ns != NULL);
-
- /* Note: IPSA_REFRELE is multi-line macro */
- if (ii->ipsec_in_ah_sa != NULL)
- IPSA_REFRELE(ii->ipsec_in_ah_sa);
- if (ii->ipsec_in_esp_sa != NULL)
- IPSA_REFRELE(ii->ipsec_in_esp_sa);
- if (ii->ipsec_in_policy != NULL)
- IPPH_REFRELE(ii->ipsec_in_policy, ns);
- if (ii->ipsec_in_da != NULL) {
- freeb(ii->ipsec_in_da);
- ii->ipsec_in_da = NULL;
+ if (ixa->ixa_ipsec_policy != NULL) {
+ IPPOL_REFRELE(ixa->ixa_ipsec_policy);
+ ixa->ixa_ipsec_policy = NULL;
}
-}
-
-static void
-ipsec_in_free(void *arg)
-{
- ipsec_in_t *ii = (ipsec_in_t *)arg;
- ipsec_in_release_refs(ii);
- kmem_cache_free(ipsec_info_cache, arg);
-}
-
-/*
- * This is called only for outbound datagrams if the datagram needs to
- * go out secure. A NULL mp can be passed to get an ipsec_out. This
- * facility is used by ip_unbind.
- *
- * NOTE : o As the data part could be modified by ipsec_out_process etc.
- * we can't make it fast by calling a dup.
- */
-mblk_t *
-ipsec_alloc_ipsec_out(netstack_t *ns)
-{
- mblk_t *ipsec_mp;
- ipsec_out_t *io = kmem_cache_alloc(ipsec_info_cache, KM_NOSLEEP);
-
- if (io == NULL)
- return (NULL);
-
- bzero(io, sizeof (ipsec_out_t));
-
- io->ipsec_out_type = IPSEC_OUT;
- io->ipsec_out_len = sizeof (ipsec_out_t);
- io->ipsec_out_frtn.free_func = ipsec_out_free;
- io->ipsec_out_frtn.free_arg = (char *)io;
-
- /*
- * Set the zoneid to ALL_ZONES which is used as an invalid value. Code
- * using ipsec_out_zoneid should assert that the zoneid has been set to
- * a sane value.
- */
- io->ipsec_out_zoneid = ALL_ZONES;
- io->ipsec_out_ns = ns; /* No netstack_hold */
-
- ipsec_mp = desballoc((uint8_t *)io, sizeof (ipsec_info_t), BPRI_HI,
- &io->ipsec_out_frtn);
- if (ipsec_mp == NULL) {
- ipsec_out_free(io);
-
- return (NULL);
+ if (ixa->ixa_ipsec_action != NULL) {
+ IPACT_REFRELE(ixa->ixa_ipsec_action);
+ ixa->ixa_ipsec_action = NULL;
}
- ipsec_mp->b_datap->db_type = M_CTL;
- ipsec_mp->b_wptr = ipsec_mp->b_rptr + sizeof (ipsec_info_t);
-
- return (ipsec_mp);
-}
-
-/*
- * Attach an IPSEC_OUT; use pol for policy if it is non-null.
- * Otherwise initialize using conn.
- *
- * If pol is non-null, we consume a reference to it.
- */
-mblk_t *
-ipsec_attach_ipsec_out(mblk_t **mp, conn_t *connp, ipsec_policy_t *pol,
- uint8_t proto, netstack_t *ns)
-{
- mblk_t *ipsec_mp;
- ipsec_stack_t *ipss = ns->netstack_ipsec;
-
- ASSERT((pol != NULL) || (connp != NULL));
-
- ipsec_mp = ipsec_alloc_ipsec_out(ns);
- if (ipsec_mp == NULL) {
- ipsec_rl_strlog(ns, IP_MOD_ID, 0, 0, SL_ERROR|SL_NOTE,
- "ipsec_attach_ipsec_out: Allocation failure\n");
- ip_drop_packet(*mp, B_FALSE, NULL, NULL,
- DROPPER(ipss, ipds_spd_nomem),
- &ipss->ipsec_spd_dropper);
- *mp = NULL;
- return (NULL);
+ if (ixa->ixa_ipsec_latch) {
+ IPLATCH_REFRELE(ixa->ixa_ipsec_latch);
+ ixa->ixa_ipsec_latch = NULL;
}
- ipsec_mp->b_cont = *mp;
- /*
- * If *mp is NULL, ipsec_init_ipsec_out() won't/should not be using it.
- */
- return (ipsec_init_ipsec_out(ipsec_mp, mp, connp, pol, proto, ns));
+ /* Clear the soft references to the SAs */
+ ixa->ixa_ipsec_ref[0].ipsr_sa = NULL;
+ ixa->ixa_ipsec_ref[0].ipsr_bucket = NULL;
+ ixa->ixa_ipsec_ref[0].ipsr_gen = 0;
+ ixa->ixa_ipsec_ref[1].ipsr_sa = NULL;
+ ixa->ixa_ipsec_ref[1].ipsr_bucket = NULL;
+ ixa->ixa_ipsec_ref[1].ipsr_gen = 0;
+ ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
}
-/*
- * Initialize the IPSEC_OUT (ipsec_mp) using pol if it is non-null.
- * Otherwise initialize using conn.
- *
- * If pol is non-null, we consume a reference to it.
- */
-mblk_t *
-ipsec_init_ipsec_out(mblk_t *ipsec_mp, mblk_t **mp, conn_t *connp,
- ipsec_policy_t *pol, uint8_t proto, netstack_t *ns)
+void
+ipsec_in_release_refs(ip_recv_attr_t *ira)
{
- ipsec_out_t *io;
- ipsec_policy_t *p;
- ipha_t *ipha;
- ip6_t *ip6h;
- ipsec_stack_t *ipss = ns->netstack_ipsec;
-
- ASSERT(ipsec_mp->b_cont == *mp);
-
- ASSERT((pol != NULL) || (connp != NULL));
-
- ASSERT(ipsec_mp->b_datap->db_type == M_CTL);
- ASSERT(ipsec_mp->b_wptr == (ipsec_mp->b_rptr + sizeof (ipsec_info_t)));
- io = (ipsec_out_t *)ipsec_mp->b_rptr;
- ASSERT(io->ipsec_out_type == IPSEC_OUT);
- ASSERT(io->ipsec_out_len == sizeof (ipsec_out_t));
- io->ipsec_out_latch = NULL;
- /*
- * Set the zoneid when we have the connp.
- * Otherwise, we're called from ip_wput_attach_policy() who will take
- * care of setting the zoneid.
- */
- if (connp != NULL)
- io->ipsec_out_zoneid = connp->conn_zoneid;
-
- io->ipsec_out_ns = ns; /* No netstack_hold */
-
- if (*mp != NULL) {
- ipha = (ipha_t *)(*mp)->b_rptr;
- if (IPH_HDR_VERSION(ipha) == IP_VERSION) {
- io->ipsec_out_v4 = B_TRUE;
- ip6h = NULL;
- } else {
- io->ipsec_out_v4 = B_FALSE;
- ip6h = (ip6_t *)ipha;
- ipha = NULL;
- }
- } else {
- ASSERT(connp != NULL && connp->conn_policy_cached);
- ip6h = NULL;
- ipha = NULL;
- io->ipsec_out_v4 = !connp->conn_pkt_isv6;
- }
-
- p = NULL;
-
- /*
- * Take latched policies over global policy. Check here again for
- * this, in case we had conn_latch set while the packet was flying
- * around in IP.
- */
- if (connp != NULL && connp->conn_latch != NULL) {
- ASSERT(ns == connp->conn_netstack);
- p = connp->conn_latch->ipl_out_policy;
- io->ipsec_out_latch = connp->conn_latch;
- IPLATCH_REFHOLD(connp->conn_latch);
- if (p != NULL) {
- IPPOL_REFHOLD(p);
- }
- io->ipsec_out_src_port = connp->conn_lport;
- io->ipsec_out_dst_port = connp->conn_fport;
- io->ipsec_out_icmp_type = io->ipsec_out_icmp_code = 0;
- if (pol != NULL)
- IPPOL_REFRELE(pol, ns);
- } else if (pol != NULL) {
- ipsec_selector_t sel;
-
- bzero((void*)&sel, sizeof (sel));
-
- p = pol;
- /*
- * conn does not have the port information. Get
- * it from the packet.
- */
+ if (!(ira->ira_flags & IRAF_IPSEC_SECURE))
+ return;
- if (!ipsec_init_outbound_ports(&sel, *mp, ipha, ip6h, 0,
- ns->netstack_ipsec)) {
- /* Callee did ip_drop_packet() on *mp. */
- *mp = NULL;
- freeb(ipsec_mp);
- return (NULL);
- }
- io->ipsec_out_src_port = sel.ips_local_port;
- io->ipsec_out_dst_port = sel.ips_remote_port;
- io->ipsec_out_icmp_type = sel.ips_icmp_type;
- io->ipsec_out_icmp_code = sel.ips_icmp_code;
+ if (ira->ira_ipsec_ah_sa != NULL) {
+ IPSA_REFRELE(ira->ira_ipsec_ah_sa);
+ ira->ira_ipsec_ah_sa = NULL;
}
-
- io->ipsec_out_proto = proto;
- io->ipsec_out_use_global_policy = B_TRUE;
- io->ipsec_out_secure = (p != NULL);
- io->ipsec_out_policy = p;
-
- if (p == NULL) {
- if (connp->conn_policy != NULL) {
- io->ipsec_out_secure = B_TRUE;
- ASSERT(io->ipsec_out_latch == NULL);
- ASSERT(io->ipsec_out_use_global_policy == B_TRUE);
- io->ipsec_out_need_policy = B_TRUE;
- ASSERT(io->ipsec_out_polhead == NULL);
- IPPH_REFHOLD(connp->conn_policy);
- io->ipsec_out_polhead = connp->conn_policy;
- }
- } else {
- /* Handle explicit drop action. */
- if (p->ipsp_act->ipa_act.ipa_type == IPSEC_ACT_DISCARD ||
- p->ipsp_act->ipa_act.ipa_type == IPSEC_ACT_REJECT) {
- ip_drop_packet(ipsec_mp, B_FALSE, NULL, NULL,
- DROPPER(ipss, ipds_spd_explicit),
- &ipss->ipsec_spd_dropper);
- *mp = NULL;
- ipsec_mp = NULL;
- }
+ if (ira->ira_ipsec_esp_sa != NULL) {
+ IPSA_REFRELE(ira->ira_ipsec_esp_sa);
+ ira->ira_ipsec_esp_sa = NULL;
}
-
- return (ipsec_mp);
+ ira->ira_flags &= ~IRAF_IPSEC_SECURE;
}
/*
- * Allocate an IPSEC_IN mblk. This will be prepended to an inbound datagram
- * and keep track of what-if-any IPsec processing will be applied to the
- * datagram.
- */
-mblk_t *
-ipsec_in_alloc(boolean_t isv4, netstack_t *ns)
-{
- mblk_t *ipsec_in;
- ipsec_in_t *ii = kmem_cache_alloc(ipsec_info_cache, KM_NOSLEEP);
-
- if (ii == NULL)
- return (NULL);
-
- bzero(ii, sizeof (ipsec_info_t));
- ii->ipsec_in_type = IPSEC_IN;
- ii->ipsec_in_len = sizeof (ipsec_in_t);
-
- ii->ipsec_in_v4 = isv4;
- ii->ipsec_in_secure = B_TRUE;
- ii->ipsec_in_ns = ns; /* No netstack_hold */
- ii->ipsec_in_stackid = ns->netstack_stackid;
-
- ii->ipsec_in_frtn.free_func = ipsec_in_free;
- ii->ipsec_in_frtn.free_arg = (char *)ii;
-
- ii->ipsec_in_zoneid = ALL_ZONES; /* default for received packets */
-
- ipsec_in = desballoc((uint8_t *)ii, sizeof (ipsec_info_t), BPRI_HI,
- &ii->ipsec_in_frtn);
- if (ipsec_in == NULL) {
- ip1dbg(("ipsec_in_alloc: IPSEC_IN allocation failure.\n"));
- ipsec_in_free(ii);
- return (NULL);
- }
-
- ipsec_in->b_datap->db_type = M_CTL;
- ipsec_in->b_wptr += sizeof (ipsec_info_t);
-
- return (ipsec_in);
-}
-
-/*
- * This is called from ip_wput_local when a packet which needs
- * security is looped back, to convert the IPSEC_OUT to a IPSEC_IN
- * before fanout, where the policy check happens. In most of the
- * cases, IPSEC processing has *never* been done. There is one case
- * (ip_wput_ire_fragmentit -> ip_wput_frag -> icmp_frag_needed) where
- * the packet is destined for localhost, IPSEC processing has already
- * been done.
+ * This is called from ire_send_local when a packet
+ * is looped back. We setup the ip_recv_attr_t "borrowing" the references
+ * held by the callers.
+ * Note that we don't do any IPsec but we carry the actions and IPSEC flags
+ * across so that the fanout policy checks see that IPsec was applied.
*
- * Future: This could happen after SA selection has occurred for
- * outbound.. which will tell us who the src and dst identities are..
- * Then it's just a matter of splicing the ah/esp SA pointers from the
- * ipsec_out_t to the ipsec_in_t.
+ * The caller should do ipsec_in_release_refs() on the ira by calling
+ * ira_cleanup().
*/
void
-ipsec_out_to_in(mblk_t *ipsec_mp)
+ipsec_out_to_in(ip_xmit_attr_t *ixa, ill_t *ill, ip_recv_attr_t *ira)
{
- ipsec_in_t *ii;
- ipsec_out_t *io;
ipsec_policy_t *pol;
ipsec_action_t *act;
- boolean_t v4, icmp_loopback;
- zoneid_t zoneid;
- netstack_t *ns;
- ASSERT(ipsec_mp->b_datap->db_type == M_CTL);
+ /* Non-IPsec operations */
+ ira->ira_free_flags = 0;
+ ira->ira_zoneid = ixa->ixa_zoneid;
+ ira->ira_cred = ixa->ixa_cred;
+ ira->ira_cpid = ixa->ixa_cpid;
+ ira->ira_tsl = ixa->ixa_tsl;
+ ira->ira_ill = ira->ira_rill = ill;
+ ira->ira_flags = ixa->ixa_flags & IAF_MASK;
+ ira->ira_no_loop_zoneid = ixa->ixa_no_loop_zoneid;
+ ira->ira_pktlen = ixa->ixa_pktlen;
+ ira->ira_ip_hdr_length = ixa->ixa_ip_hdr_length;
+ ira->ira_protocol = ixa->ixa_protocol;
+ ira->ira_mhip = NULL;
+
+ ira->ira_flags |= IRAF_LOOPBACK | IRAF_L2SRC_LOOPBACK;
+
+ ira->ira_sqp = ixa->ixa_sqp;
+ ira->ira_ring = NULL;
+
+ ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex;
+ ira->ira_rifindex = ira->ira_ruifindex;
+
+ if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE))
+ return;
- io = (ipsec_out_t *)ipsec_mp->b_rptr;
+ ira->ira_flags |= IRAF_IPSEC_SECURE;
- v4 = io->ipsec_out_v4;
- zoneid = io->ipsec_out_zoneid;
- icmp_loopback = io->ipsec_out_icmp_loopback;
- ns = io->ipsec_out_ns;
+ ira->ira_ipsec_ah_sa = NULL;
+ ira->ira_ipsec_esp_sa = NULL;
- act = io->ipsec_out_act;
+ act = ixa->ixa_ipsec_action;
if (act == NULL) {
- pol = io->ipsec_out_policy;
+ pol = ixa->ixa_ipsec_policy;
if (pol != NULL) {
act = pol->ipsp_act;
IPACT_REFHOLD(act);
}
}
- io->ipsec_out_act = NULL;
-
- ipsec_out_release_refs(io); /* No netstack_rele/hold needed */
-
- ii = (ipsec_in_t *)ipsec_mp->b_rptr;
- bzero(ii, sizeof (ipsec_in_t));
- ii->ipsec_in_type = IPSEC_IN;
- ii->ipsec_in_len = sizeof (ipsec_in_t);
- ii->ipsec_in_loopback = B_TRUE;
- ii->ipsec_in_ns = ns; /* No netstack_hold */
-
- ii->ipsec_in_frtn.free_func = ipsec_in_free;
- ii->ipsec_in_frtn.free_arg = (char *)ii;
- ii->ipsec_in_action = act;
- ii->ipsec_in_zoneid = zoneid;
-
- /*
- * In most of the cases, we can't look at the ipsec_out_XXX_sa
- * because this never went through IPSEC processing. So, look at
- * the requests and infer whether it would have gone through
- * IPSEC processing or not. Initialize the "done" fields with
- * the requests. The possible values for "done" fields are :
- *
- * 1) zero, indicates that a particular preference was never
- * requested.
- * 2) non-zero, indicates that it could be IPSEC_PREF_REQUIRED/
- * IPSEC_PREF_NEVER. If IPSEC_REQ_DONE is set, it means that
- * IPSEC processing has been completed.
- */
- ii->ipsec_in_secure = B_TRUE;
- ii->ipsec_in_v4 = v4;
- ii->ipsec_in_icmp_loopback = icmp_loopback;
+ ixa->ixa_ipsec_action = NULL;
+ ira->ira_ipsec_action = act;
}
/*
- * Consults global policy to see whether this datagram should
- * go out secure. If so it attaches a ipsec_mp in front and
- * returns.
+ * Consults global policy and per-socket policy to see whether this datagram
+ * should go out secure. If so it updates the ip_xmit_attr_t
+ * Should not be used when connecting, since then we want to latch the policy.
+ *
+ * If connp is NULL we just look at the global policy.
+ *
+ * Returns NULL if the packet was dropped, in which case the MIB has
+ * been incremented and ip_drop_packet done.
*/
mblk_t *
-ip_wput_attach_policy(mblk_t *ipsec_mp, ipha_t *ipha, ip6_t *ip6h, ire_t *ire,
- conn_t *connp, boolean_t unspec_src, zoneid_t zoneid)
+ip_output_attach_policy(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h,
+ const conn_t *connp, ip_xmit_attr_t *ixa)
{
- mblk_t *mp;
- ipsec_out_t *io = NULL;
ipsec_selector_t sel;
- uint_t ill_index;
- boolean_t conn_dontroutex;
- boolean_t conn_multicast_loopx;
- boolean_t policy_present;
- ip_stack_t *ipst = ire->ire_ipst;
+ boolean_t policy_present;
+ ip_stack_t *ipst = ixa->ixa_ipst;
netstack_t *ns = ipst->ips_netstack;
ipsec_stack_t *ipss = ns->netstack_ipsec;
+ ipsec_policy_t *p;
+ ixa->ixa_ipsec_policy_gen = ipss->ipsec_system_policy.iph_gen;
ASSERT((ipha != NULL && ip6h == NULL) ||
(ip6h != NULL && ipha == NULL));
- bzero((void*)&sel, sizeof (sel));
-
if (ipha != NULL)
policy_present = ipss->ipsec_outbound_v4_policy_present;
else
policy_present = ipss->ipsec_outbound_v6_policy_present;
- /*
- * Fast Path to see if there is any policy.
- */
- if (!policy_present) {
- if (ipsec_mp->b_datap->db_type == M_CTL) {
- io = (ipsec_out_t *)ipsec_mp->b_rptr;
- if (!io->ipsec_out_secure) {
- /*
- * If there is no global policy and ip_wput
- * or ip_wput_multicast has attached this mp
- * for multicast case, free the ipsec_mp and
- * return the original mp.
- */
- mp = ipsec_mp->b_cont;
- freeb(ipsec_mp);
- ipsec_mp = mp;
- io = NULL;
- }
- ASSERT(io == NULL || !io->ipsec_out_tunnel);
- }
- if (((io == NULL) || (io->ipsec_out_polhead == NULL)) &&
- ((connp == NULL) || (connp->conn_policy == NULL)))
- return (ipsec_mp);
- }
- ill_index = 0;
- conn_multicast_loopx = conn_dontroutex = B_FALSE;
- mp = ipsec_mp;
- if (ipsec_mp->b_datap->db_type == M_CTL) {
- mp = ipsec_mp->b_cont;
- /*
- * This is a connection where we have some per-socket
- * policy or ip_wput has attached an ipsec_mp for
- * the multicast datagram.
- */
- io = (ipsec_out_t *)ipsec_mp->b_rptr;
- if (!io->ipsec_out_secure) {
- /*
- * This ipsec_mp was allocated in ip_wput or
- * ip_wput_multicast so that we will know the
- * value of ill_index, conn_dontroute,
- * conn_multicast_loop in the multicast case if
- * we inherit global policy here.
- */
- ill_index = io->ipsec_out_ill_index;
- conn_dontroutex = io->ipsec_out_dontroute;
- conn_multicast_loopx = io->ipsec_out_multicast_loop;
- freeb(ipsec_mp);
- ipsec_mp = mp;
- io = NULL;
- }
- ASSERT(io == NULL || !io->ipsec_out_tunnel);
- }
+ if (!policy_present && (connp == NULL || connp->conn_policy == NULL))
+ return (mp);
+
+ bzero((void*)&sel, sizeof (sel));
if (ipha != NULL) {
- sel.ips_local_addr_v4 = (ipha->ipha_src != 0 ?
- ipha->ipha_src : ire->ire_src_addr);
+ sel.ips_local_addr_v4 = ipha->ipha_src;
sel.ips_remote_addr_v4 = ip_get_dst(ipha);
- sel.ips_protocol = (uint8_t)ipha->ipha_protocol;
sel.ips_isv4 = B_TRUE;
} else {
- ushort_t hdr_len;
- uint8_t *nexthdrp;
- boolean_t is_fragment;
-
sel.ips_isv4 = B_FALSE;
- if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src)) {
- if (!unspec_src)
- sel.ips_local_addr_v6 = ire->ire_src_addr_v6;
- } else {
- sel.ips_local_addr_v6 = ip6h->ip6_src;
- }
-
- sel.ips_remote_addr_v6 = ip_get_dst_v6(ip6h, mp, &is_fragment);
- if (is_fragment) {
- /*
- * It's a packet fragment for a packet that
- * we have already processed (since IPsec processing
- * is done before fragmentation), so we don't
- * have to do policy checks again. Fragments can
- * come back to us for processing if they have
- * been queued up due to flow control.
- */
- if (ipsec_mp->b_datap->db_type == M_CTL) {
- mp = ipsec_mp->b_cont;
- freeb(ipsec_mp);
- ipsec_mp = mp;
- }
- return (ipsec_mp);
- }
-
- /* IPv6 common-case. */
- sel.ips_protocol = ip6h->ip6_nxt;
- switch (ip6h->ip6_nxt) {
- case IPPROTO_TCP:
- case IPPROTO_UDP:
- case IPPROTO_SCTP:
- case IPPROTO_ICMPV6:
- break;
- default:
- if (!ip_hdr_length_nexthdr_v6(mp, ip6h,
- &hdr_len, &nexthdrp)) {
- BUMP_MIB(&ipst->ips_ip6_mib,
- ipIfStatsOutDiscards);
- freemsg(ipsec_mp); /* Not IPsec-related drop. */
- return (NULL);
- }
- sel.ips_protocol = *nexthdrp;
- break;
- }
+ sel.ips_local_addr_v6 = ip6h->ip6_src;
+ sel.ips_remote_addr_v6 = ip_get_dst_v6(ip6h, mp, NULL);
}
+ sel.ips_protocol = ixa->ixa_protocol;
if (!ipsec_init_outbound_ports(&sel, mp, ipha, ip6h, 0, ipss)) {
if (ipha != NULL) {
@@ -4794,65 +4225,36 @@ ip_wput_attach_policy(mblk_t *ipsec_mp, ipha_t *ipha, ip6_t *ip6h, ire_t *ire,
} else {
BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards);
}
-
- /* Callee dropped the packet. */
+ /* Note: mp already consumed and ip_drop_packet done */
return (NULL);
}
- if (io != NULL) {
- /*
- * We seem to have some local policy (we already have
- * an ipsec_out). Look at global policy and see
- * whether we have to inherit or not.
- */
- io->ipsec_out_need_policy = B_FALSE;
- ipsec_mp = ipsec_apply_global_policy(ipsec_mp, connp,
- &sel, ns);
- ASSERT((io->ipsec_out_policy != NULL) ||
- (io->ipsec_out_act != NULL));
- ASSERT(io->ipsec_out_need_policy == B_FALSE);
- return (ipsec_mp);
+ ASSERT(ixa->ixa_ipsec_policy == NULL);
+ p = ipsec_find_policy(IPSEC_TYPE_OUTBOUND, connp, &sel, ns);
+ ixa->ixa_ipsec_policy = p;
+ if (p != NULL) {
+ ixa->ixa_flags |= IXAF_IPSEC_SECURE;
+ if (connp == NULL || connp->conn_policy == NULL)
+ ixa->ixa_flags |= IXAF_IPSEC_GLOBAL_POLICY;
+ } else {
+ ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
}
- /*
- * We pass in a pointer to a pointer because mp can become
- * NULL due to allocation failures or explicit drops. Callers
- * of this function should assume a NULL mp means the packet
- * was dropped.
- */
- ipsec_mp = ipsec_attach_global_policy(&mp, connp, &sel, ns);
- if (ipsec_mp == NULL)
- return (mp);
/*
* Copy the right port information.
*/
- ASSERT(ipsec_mp->b_datap->db_type == M_CTL);
- io = (ipsec_out_t *)ipsec_mp->b_rptr;
-
- ASSERT(io->ipsec_out_need_policy == B_FALSE);
- ASSERT((io->ipsec_out_policy != NULL) ||
- (io->ipsec_out_act != NULL));
- io->ipsec_out_src_port = sel.ips_local_port;
- io->ipsec_out_dst_port = sel.ips_remote_port;
- io->ipsec_out_icmp_type = sel.ips_icmp_type;
- io->ipsec_out_icmp_code = sel.ips_icmp_code;
- /*
- * Set ill_index, conn_dontroute and conn_multicast_loop
- * for multicast datagrams.
- */
- io->ipsec_out_ill_index = ill_index;
- io->ipsec_out_dontroute = conn_dontroutex;
- io->ipsec_out_multicast_loop = conn_multicast_loopx;
-
- if (zoneid == ALL_ZONES)
- zoneid = GLOBAL_ZONEID;
- io->ipsec_out_zoneid = zoneid;
- return (ipsec_mp);
+ ixa->ixa_ipsec_src_port = sel.ips_local_port;
+ ixa->ixa_ipsec_dst_port = sel.ips_remote_port;
+ ixa->ixa_ipsec_icmp_type = sel.ips_icmp_type;
+ ixa->ixa_ipsec_icmp_code = sel.ips_icmp_code;
+ ixa->ixa_ipsec_proto = sel.ips_protocol;
+ return (mp);
}
/*
* When appropriate, this function caches inbound and outbound policy
- * for this connection.
+ * for this connection. The outbound policy is stored in conn_ixa.
+ * Note that it can not be used for SCTP since conn_faddr isn't set for SCTP.
*
* XXX need to work out more details about per-interface policy and
* caching here!
@@ -4866,20 +4268,38 @@ ipsec_conn_cache_policy(conn_t *connp, boolean_t isv4)
netstack_t *ns = connp->conn_netstack;
ipsec_stack_t *ipss = ns->netstack_ipsec;
+ connp->conn_ixa->ixa_ipsec_policy_gen =
+ ipss->ipsec_system_policy.iph_gen;
/*
* There is no policy latching for ICMP sockets because we can't
* decide on which policy to use until we see the packet and get
* type/code selectors.
*/
- if (connp->conn_ulp == IPPROTO_ICMP ||
- connp->conn_ulp == IPPROTO_ICMPV6) {
+ if (connp->conn_proto == IPPROTO_ICMP ||
+ connp->conn_proto == IPPROTO_ICMPV6) {
connp->conn_in_enforce_policy =
connp->conn_out_enforce_policy = B_TRUE;
if (connp->conn_latch != NULL) {
- IPLATCH_REFRELE(connp->conn_latch, ns);
+ IPLATCH_REFRELE(connp->conn_latch);
connp->conn_latch = NULL;
}
- connp->conn_flags |= IPCL_CHECK_POLICY;
+ if (connp->conn_latch_in_policy != NULL) {
+ IPPOL_REFRELE(connp->conn_latch_in_policy);
+ connp->conn_latch_in_policy = NULL;
+ }
+ if (connp->conn_latch_in_action != NULL) {
+ IPACT_REFRELE(connp->conn_latch_in_action);
+ connp->conn_latch_in_action = NULL;
+ }
+ if (connp->conn_ixa->ixa_ipsec_policy != NULL) {
+ IPPOL_REFRELE(connp->conn_ixa->ixa_ipsec_policy);
+ connp->conn_ixa->ixa_ipsec_policy = NULL;
+ }
+ if (connp->conn_ixa->ixa_ipsec_action != NULL) {
+ IPACT_REFRELE(connp->conn_ixa->ixa_ipsec_action);
+ connp->conn_ixa->ixa_ipsec_action = NULL;
+ }
+ connp->conn_ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
return (0);
}
@@ -4898,38 +4318,57 @@ ipsec_conn_cache_policy(conn_t *connp, boolean_t isv4)
return (ENOMEM);
}
- sel.ips_protocol = connp->conn_ulp;
+ bzero((void*)&sel, sizeof (sel));
+
+ sel.ips_protocol = connp->conn_proto;
sel.ips_local_port = connp->conn_lport;
sel.ips_remote_port = connp->conn_fport;
sel.ips_is_icmp_inv_acq = 0;
sel.ips_isv4 = isv4;
if (isv4) {
- sel.ips_local_addr_v4 = connp->conn_src;
- sel.ips_remote_addr_v4 = connp->conn_rem;
+ sel.ips_local_addr_v4 = connp->conn_laddr_v4;
+ sel.ips_remote_addr_v4 = connp->conn_faddr_v4;
} else {
- sel.ips_local_addr_v6 = connp->conn_srcv6;
- sel.ips_remote_addr_v6 = connp->conn_remv6;
+ sel.ips_local_addr_v6 = connp->conn_laddr_v6;
+ sel.ips_remote_addr_v6 = connp->conn_faddr_v6;
}
- p = ipsec_find_policy(IPSEC_TYPE_INBOUND, connp, NULL, &sel,
- ns);
- if (connp->conn_latch->ipl_in_policy != NULL)
- IPPOL_REFRELE(connp->conn_latch->ipl_in_policy, ns);
- connp->conn_latch->ipl_in_policy = p;
+ p = ipsec_find_policy(IPSEC_TYPE_INBOUND, connp, &sel, ns);
+ if (connp->conn_latch_in_policy != NULL)
+ IPPOL_REFRELE(connp->conn_latch_in_policy);
+ connp->conn_latch_in_policy = p;
connp->conn_in_enforce_policy = (p != NULL);
- p = ipsec_find_policy(IPSEC_TYPE_OUTBOUND, connp, NULL, &sel,
- ns);
- if (connp->conn_latch->ipl_out_policy != NULL)
- IPPOL_REFRELE(connp->conn_latch->ipl_out_policy, ns);
- connp->conn_latch->ipl_out_policy = p;
+ p = ipsec_find_policy(IPSEC_TYPE_OUTBOUND, connp, &sel, ns);
+ if (connp->conn_ixa->ixa_ipsec_policy != NULL)
+ IPPOL_REFRELE(connp->conn_ixa->ixa_ipsec_policy);
+ connp->conn_ixa->ixa_ipsec_policy = p;
connp->conn_out_enforce_policy = (p != NULL);
-
+ if (p != NULL) {
+ connp->conn_ixa->ixa_flags |= IXAF_IPSEC_SECURE;
+ if (connp->conn_policy == NULL) {
+ connp->conn_ixa->ixa_flags |=
+ IXAF_IPSEC_GLOBAL_POLICY;
+ }
+ } else {
+ connp->conn_ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
+ }
/* Clear the latched actions too, in case we're recaching. */
- if (connp->conn_latch->ipl_out_action != NULL)
- IPACT_REFRELE(connp->conn_latch->ipl_out_action);
- if (connp->conn_latch->ipl_in_action != NULL)
- IPACT_REFRELE(connp->conn_latch->ipl_in_action);
+ if (connp->conn_ixa->ixa_ipsec_action != NULL) {
+ IPACT_REFRELE(connp->conn_ixa->ixa_ipsec_action);
+ connp->conn_ixa->ixa_ipsec_action = NULL;
+ }
+ if (connp->conn_latch_in_action != NULL) {
+ IPACT_REFRELE(connp->conn_latch_in_action);
+ connp->conn_latch_in_action = NULL;
+ }
+ connp->conn_ixa->ixa_ipsec_src_port = sel.ips_local_port;
+ connp->conn_ixa->ixa_ipsec_dst_port = sel.ips_remote_port;
+ connp->conn_ixa->ixa_ipsec_icmp_type = sel.ips_icmp_type;
+ connp->conn_ixa->ixa_ipsec_icmp_code = sel.ips_icmp_code;
+ connp->conn_ixa->ixa_ipsec_proto = sel.ips_protocol;
+ } else {
+ connp->conn_ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
}
/*
@@ -4945,28 +4384,125 @@ ipsec_conn_cache_policy(conn_t *connp, boolean_t isv4)
* global policy (because conn_policy_cached is already set).
*/
connp->conn_policy_cached = B_TRUE;
- if (connp->conn_in_enforce_policy)
- connp->conn_flags |= IPCL_CHECK_POLICY;
return (0);
}
+/*
+ * When appropriate, this function caches outbound policy for faddr/fport.
+ * It is used when we are not connected i.e., when we can not latch the
+ * policy.
+ */
void
-iplatch_free(ipsec_latch_t *ipl, netstack_t *ns)
-{
- if (ipl->ipl_out_policy != NULL)
- IPPOL_REFRELE(ipl->ipl_out_policy, ns);
- if (ipl->ipl_in_policy != NULL)
- IPPOL_REFRELE(ipl->ipl_in_policy, ns);
- if (ipl->ipl_in_action != NULL)
- IPACT_REFRELE(ipl->ipl_in_action);
- if (ipl->ipl_out_action != NULL)
- IPACT_REFRELE(ipl->ipl_out_action);
+ipsec_cache_outbound_policy(const conn_t *connp, const in6_addr_t *v6src,
+ const in6_addr_t *v6dst, in_port_t dstport, ip_xmit_attr_t *ixa)
+{
+ boolean_t isv4 = (ixa->ixa_flags & IXAF_IS_IPV4) != 0;
+ boolean_t global_policy_present;
+ netstack_t *ns = connp->conn_netstack;
+ ipsec_stack_t *ipss = ns->netstack_ipsec;
+
+ ixa->ixa_ipsec_policy_gen = ipss->ipsec_system_policy.iph_gen;
+
+ /*
+ * There is no policy caching for ICMP sockets because we can't
+ * decide on which policy to use until we see the packet and get
+ * type/code selectors.
+ */
+ if (connp->conn_proto == IPPROTO_ICMP ||
+ connp->conn_proto == IPPROTO_ICMPV6) {
+ ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
+ if (ixa->ixa_ipsec_policy != NULL) {
+ IPPOL_REFRELE(ixa->ixa_ipsec_policy);
+ ixa->ixa_ipsec_policy = NULL;
+ }
+ if (ixa->ixa_ipsec_action != NULL) {
+ IPACT_REFRELE(ixa->ixa_ipsec_action);
+ ixa->ixa_ipsec_action = NULL;
+ }
+ return;
+ }
+
+ global_policy_present = isv4 ?
+ (ipss->ipsec_outbound_v4_policy_present ||
+ ipss->ipsec_inbound_v4_policy_present) :
+ (ipss->ipsec_outbound_v6_policy_present ||
+ ipss->ipsec_inbound_v6_policy_present);
+
+ if ((connp->conn_policy != NULL) || global_policy_present) {
+ ipsec_selector_t sel;
+ ipsec_policy_t *p;
+
+ bzero((void*)&sel, sizeof (sel));
+
+ sel.ips_protocol = connp->conn_proto;
+ sel.ips_local_port = connp->conn_lport;
+ sel.ips_remote_port = dstport;
+ sel.ips_is_icmp_inv_acq = 0;
+ sel.ips_isv4 = isv4;
+ if (isv4) {
+ IN6_V4MAPPED_TO_IPADDR(v6src, sel.ips_local_addr_v4);
+ IN6_V4MAPPED_TO_IPADDR(v6dst, sel.ips_remote_addr_v4);
+ } else {
+ sel.ips_local_addr_v6 = *v6src;
+ sel.ips_remote_addr_v6 = *v6dst;
+ }
+
+ p = ipsec_find_policy(IPSEC_TYPE_OUTBOUND, connp, &sel, ns);
+ if (ixa->ixa_ipsec_policy != NULL)
+ IPPOL_REFRELE(ixa->ixa_ipsec_policy);
+ ixa->ixa_ipsec_policy = p;
+ if (p != NULL) {
+ ixa->ixa_flags |= IXAF_IPSEC_SECURE;
+ if (connp->conn_policy == NULL)
+ ixa->ixa_flags |= IXAF_IPSEC_GLOBAL_POLICY;
+ } else {
+ ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
+ }
+ /* Clear the latched actions too, in case we're recaching. */
+ if (ixa->ixa_ipsec_action != NULL) {
+ IPACT_REFRELE(ixa->ixa_ipsec_action);
+ ixa->ixa_ipsec_action = NULL;
+ }
+
+ ixa->ixa_ipsec_src_port = sel.ips_local_port;
+ ixa->ixa_ipsec_dst_port = sel.ips_remote_port;
+ ixa->ixa_ipsec_icmp_type = sel.ips_icmp_type;
+ ixa->ixa_ipsec_icmp_code = sel.ips_icmp_code;
+ ixa->ixa_ipsec_proto = sel.ips_protocol;
+ } else {
+ ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
+ if (ixa->ixa_ipsec_policy != NULL) {
+ IPPOL_REFRELE(ixa->ixa_ipsec_policy);
+ ixa->ixa_ipsec_policy = NULL;
+ }
+ if (ixa->ixa_ipsec_action != NULL) {
+ IPACT_REFRELE(ixa->ixa_ipsec_action);
+ ixa->ixa_ipsec_action = NULL;
+ }
+ }
+}
+
+/*
+ * Returns B_FALSE if the policy has gone stale.
+ */
+boolean_t
+ipsec_outbound_policy_current(ip_xmit_attr_t *ixa)
+{
+ ipsec_stack_t *ipss = ixa->ixa_ipst->ips_netstack->netstack_ipsec;
+
+ if (!(ixa->ixa_flags & IXAF_IPSEC_GLOBAL_POLICY))
+ return (B_TRUE);
+
+ return (ixa->ixa_ipsec_policy_gen == ipss->ipsec_system_policy.iph_gen);
+}
+
+void
+iplatch_free(ipsec_latch_t *ipl)
+{
if (ipl->ipl_local_cid != NULL)
IPSID_REFRELE(ipl->ipl_local_cid);
if (ipl->ipl_remote_cid != NULL)
IPSID_REFRELE(ipl->ipl_remote_cid);
- if (ipl->ipl_local_id != NULL)
- crfree(ipl->ipl_local_id);
mutex_destroy(&ipl->ipl_lock);
kmem_free(ipl, sizeof (*ipl));
}
@@ -5622,18 +5158,19 @@ ipsec_unregister_prov_update(void)
* SAs are available. If there's no per-tunnel policy, or a match comes back
* with no match, then still return the packet and have global policy take
* a crack at it in IP.
+ * This updates the ip_xmit_attr with the IPsec policy.
*
* Remember -> we can be forwarding packets. Keep that in mind w.r.t.
* inner-packet contents.
*/
mblk_t *
ipsec_tun_outbound(mblk_t *mp, iptun_t *iptun, ipha_t *inner_ipv4,
- ip6_t *inner_ipv6, ipha_t *outer_ipv4, ip6_t *outer_ipv6, int outer_hdr_len)
+ ip6_t *inner_ipv6, ipha_t *outer_ipv4, ip6_t *outer_ipv6, int outer_hdr_len,
+ ip_xmit_attr_t *ixa)
{
ipsec_policy_head_t *polhead;
ipsec_selector_t sel;
- mblk_t *ipsec_mp, *ipsec_mp_head, *nmp;
- ipsec_out_t *io;
+ mblk_t *nmp;
boolean_t is_fragment;
ipsec_policy_t *pol;
ipsec_tun_pol_t *itp = iptun->iptun_itp;
@@ -5644,6 +5181,15 @@ ipsec_tun_outbound(mblk_t *mp, iptun_t *iptun, ipha_t *inner_ipv4,
outer_ipv4 != NULL && outer_ipv6 == NULL);
/* We take care of inners in a bit. */
+ /* Are the IPsec fields initialized at all? */
+ if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE)) {
+ ASSERT(ixa->ixa_ipsec_policy == NULL);
+ ASSERT(ixa->ixa_ipsec_latch == NULL);
+ ASSERT(ixa->ixa_ipsec_action == NULL);
+ ASSERT(ixa->ixa_ipsec_ah_sa == NULL);
+ ASSERT(ixa->ixa_ipsec_esp_sa == NULL);
+ }
+
ASSERT(itp != NULL && (itp->itp_flags & ITPF_P_ACTIVE));
polhead = itp->itp_policy;
@@ -5675,7 +5221,7 @@ ipsec_tun_outbound(mblk_t *mp, iptun_t *iptun, ipha_t *inner_ipv4,
if (mp->b_cont != NULL) {
nmp = msgpullup(mp, -1);
if (nmp == NULL) {
- ip_drop_packet(mp, B_FALSE, NULL, NULL,
+ ip_drop_packet(mp, B_FALSE, NULL,
DROPPER(ipss, ipds_spd_nomem),
&ipss->ipsec_spd_dropper);
return (NULL);
@@ -5734,8 +5280,8 @@ ipsec_tun_outbound(mblk_t *mp, iptun_t *iptun, ipha_t *inner_ipv4,
ip6h = (ip6_t *)mp->b_rptr;
if (!ip_hdr_length_nexthdr_v6(mp, ip6h,
&ip6_hdr_length, &v6_proto_p)) {
- ip_drop_packet_chain(mp, B_FALSE,
- NULL, NULL, DROPPER(ipss,
+ ip_drop_packet_chain(mp, B_FALSE, NULL,
+ DROPPER(ipss,
ipds_spd_malformed_packet),
&ipss->ipsec_spd_dropper);
return (NULL);
@@ -5761,8 +5307,8 @@ ipsec_tun_outbound(mblk_t *mp, iptun_t *iptun, ipha_t *inner_ipv4,
sel.ips_remote_addr_v6 = inner_ipv6->ip6_dst;
if (!ip_hdr_length_nexthdr_v6(mp,
inner_ipv6, &ip6_hdr_length, &v6_proto_p)) {
- ip_drop_packet_chain(mp, B_FALSE,
- NULL, NULL, DROPPER(ipss,
+ ip_drop_packet_chain(mp, B_FALSE, NULL,
+ DROPPER(ipss,
ipds_spd_malformed_frag),
&ipss->ipsec_spd_dropper);
return (NULL);
@@ -5802,8 +5348,7 @@ ipsec_tun_outbound(mblk_t *mp, iptun_t *iptun, ipha_t *inner_ipv4,
/* Success so far! */
}
rw_enter(&polhead->iph_lock, RW_READER);
- pol = ipsec_find_policy_head(NULL, polhead, IPSEC_TYPE_OUTBOUND,
- &sel, ns);
+ pol = ipsec_find_policy_head(NULL, polhead, IPSEC_TYPE_OUTBOUND, &sel);
rw_exit(&polhead->iph_lock);
if (pol == NULL) {
/*
@@ -5825,7 +5370,7 @@ ipsec_tun_outbound(mblk_t *mp, iptun_t *iptun, ipha_t *inner_ipv4,
cmn_err(CE_WARN, "ipsec_tun_outbound(): No matching tunnel "
"per-port policy\n");
#endif
- ip_drop_packet_chain(mp, B_FALSE, NULL, NULL,
+ ip_drop_packet_chain(mp, B_FALSE, NULL,
DROPPER(ipss, ipds_spd_explicit),
&ipss->ipsec_spd_dropper);
return (NULL);
@@ -5835,101 +5380,65 @@ ipsec_tun_outbound(mblk_t *mp, iptun_t *iptun, ipha_t *inner_ipv4,
cmn_err(CE_WARN, "Having matching tunnel per-port policy\n");
#endif
- /* Construct an IPSEC_OUT message. */
- ipsec_mp = ipsec_mp_head = ipsec_alloc_ipsec_out(ns);
- if (ipsec_mp == NULL) {
- IPPOL_REFRELE(pol, ns);
- ip_drop_packet(mp, B_FALSE, NULL, NULL,
- DROPPER(ipss, ipds_spd_nomem),
- &ipss->ipsec_spd_dropper);
- return (NULL);
- }
- ipsec_mp->b_cont = mp;
- io = (ipsec_out_t *)ipsec_mp->b_rptr;
- IPPH_REFHOLD(polhead);
/*
- * NOTE: free() function of ipsec_out mblk will release polhead and
- * pol references.
+ * NOTE: ixa_cleanup() function will release pol references.
*/
- io->ipsec_out_polhead = polhead;
- io->ipsec_out_policy = pol;
+ ixa->ixa_ipsec_policy = pol;
/*
* NOTE: There is a subtle difference between iptun_zoneid and
* iptun_connp->conn_zoneid explained in iptun_conn_create(). When
* interacting with the ip module, we must use conn_zoneid.
*/
- io->ipsec_out_zoneid = iptun->iptun_connp->conn_zoneid;
- io->ipsec_out_v4 = (outer_ipv4 != NULL);
- io->ipsec_out_secure = B_TRUE;
+ ixa->ixa_zoneid = iptun->iptun_connp->conn_zoneid;
+
+ ASSERT((outer_ipv4 != NULL) ? (ixa->ixa_flags & IXAF_IS_IPV4) :
+ !(ixa->ixa_flags & IXAF_IS_IPV4));
+ ASSERT(ixa->ixa_ipsec_policy != NULL);
+ ixa->ixa_flags |= IXAF_IPSEC_SECURE;
if (!(itp->itp_flags & ITPF_P_TUNNEL)) {
/* Set up transport mode for tunnelled packets. */
- io->ipsec_out_proto = (inner_ipv4 != NULL) ? IPPROTO_ENCAP :
+ ixa->ixa_ipsec_proto = (inner_ipv4 != NULL) ? IPPROTO_ENCAP :
IPPROTO_IPV6;
- return (ipsec_mp);
+ return (mp);
}
/* Fill in tunnel-mode goodies here. */
- io->ipsec_out_tunnel = B_TRUE;
+ ixa->ixa_flags |= IXAF_IPSEC_TUNNEL;
/* XXX Do I need to fill in all of the goodies here? */
if (inner_ipv4) {
- io->ipsec_out_inaf = AF_INET;
- io->ipsec_out_insrc[0] =
+ ixa->ixa_ipsec_inaf = AF_INET;
+ ixa->ixa_ipsec_insrc[0] =
pol->ipsp_sel->ipsl_key.ipsl_local.ipsad_v4;
- io->ipsec_out_indst[0] =
+ ixa->ixa_ipsec_indst[0] =
pol->ipsp_sel->ipsl_key.ipsl_remote.ipsad_v4;
} else {
- io->ipsec_out_inaf = AF_INET6;
- io->ipsec_out_insrc[0] =
+ ixa->ixa_ipsec_inaf = AF_INET6;
+ ixa->ixa_ipsec_insrc[0] =
pol->ipsp_sel->ipsl_key.ipsl_local.ipsad_v6.s6_addr32[0];
- io->ipsec_out_insrc[1] =
+ ixa->ixa_ipsec_insrc[1] =
pol->ipsp_sel->ipsl_key.ipsl_local.ipsad_v6.s6_addr32[1];
- io->ipsec_out_insrc[2] =
+ ixa->ixa_ipsec_insrc[2] =
pol->ipsp_sel->ipsl_key.ipsl_local.ipsad_v6.s6_addr32[2];
- io->ipsec_out_insrc[3] =
+ ixa->ixa_ipsec_insrc[3] =
pol->ipsp_sel->ipsl_key.ipsl_local.ipsad_v6.s6_addr32[3];
- io->ipsec_out_indst[0] =
+ ixa->ixa_ipsec_indst[0] =
pol->ipsp_sel->ipsl_key.ipsl_remote.ipsad_v6.s6_addr32[0];
- io->ipsec_out_indst[1] =
+ ixa->ixa_ipsec_indst[1] =
pol->ipsp_sel->ipsl_key.ipsl_remote.ipsad_v6.s6_addr32[1];
- io->ipsec_out_indst[2] =
+ ixa->ixa_ipsec_indst[2] =
pol->ipsp_sel->ipsl_key.ipsl_remote.ipsad_v6.s6_addr32[2];
- io->ipsec_out_indst[3] =
+ ixa->ixa_ipsec_indst[3] =
pol->ipsp_sel->ipsl_key.ipsl_remote.ipsad_v6.s6_addr32[3];
}
- io->ipsec_out_insrcpfx = pol->ipsp_sel->ipsl_key.ipsl_local_pfxlen;
- io->ipsec_out_indstpfx = pol->ipsp_sel->ipsl_key.ipsl_remote_pfxlen;
+ ixa->ixa_ipsec_insrcpfx = pol->ipsp_sel->ipsl_key.ipsl_local_pfxlen;
+ ixa->ixa_ipsec_indstpfx = pol->ipsp_sel->ipsl_key.ipsl_remote_pfxlen;
/* NOTE: These are used for transport mode too. */
- io->ipsec_out_src_port = pol->ipsp_sel->ipsl_key.ipsl_lport;
- io->ipsec_out_dst_port = pol->ipsp_sel->ipsl_key.ipsl_rport;
- io->ipsec_out_proto = pol->ipsp_sel->ipsl_key.ipsl_proto;
+ ixa->ixa_ipsec_src_port = pol->ipsp_sel->ipsl_key.ipsl_lport;
+ ixa->ixa_ipsec_dst_port = pol->ipsp_sel->ipsl_key.ipsl_rport;
+ ixa->ixa_ipsec_proto = pol->ipsp_sel->ipsl_key.ipsl_proto;
- /*
- * The mp pointer still valid
- * Add ipsec_out to each fragment.
- * The fragment head already has one
- */
- nmp = mp->b_next;
- mp->b_next = NULL;
- mp = nmp;
- ASSERT(ipsec_mp != NULL);
- while (mp != NULL) {
- nmp = mp->b_next;
- ipsec_mp->b_next = ipsec_out_tag(ipsec_mp_head, mp, ns);
- if (ipsec_mp->b_next == NULL) {
- ip_drop_packet_chain(ipsec_mp_head, B_FALSE, NULL, NULL,
- DROPPER(ipss, ipds_spd_nomem),
- &ipss->ipsec_spd_dropper);
- ip_drop_packet_chain(mp, B_FALSE, NULL, NULL,
- DROPPER(ipss, ipds_spd_nomem),
- &ipss->ipsec_spd_dropper);
- return (NULL);
- }
- ipsec_mp = ipsec_mp->b_next;
- mp->b_next = NULL;
- mp = nmp;
- }
- return (ipsec_mp_head);
+ return (mp);
}
/*
@@ -5937,16 +5446,28 @@ ipsec_tun_outbound(mblk_t *mp, iptun_t *iptun, ipha_t *inner_ipv4,
* calls ip_drop_packet() for me on NULL returns.
*/
mblk_t *
-ipsec_check_ipsecin_policy_reasm(mblk_t *ipsec_mp, ipsec_policy_t *pol,
+ipsec_check_ipsecin_policy_reasm(mblk_t *attr_mp, ipsec_policy_t *pol,
ipha_t *inner_ipv4, ip6_t *inner_ipv6, uint64_t pkt_unique, netstack_t *ns)
{
- /* Assume ipsec_mp is a chain of b_next-linked IPSEC_IN M_CTLs. */
+ /* Assume attr_mp is a chain of b_next-linked ip_recv_attr mblk. */
mblk_t *data_chain = NULL, *data_tail = NULL;
- mblk_t *ii_next;
-
- while (ipsec_mp != NULL) {
- ii_next = ipsec_mp->b_next;
- ipsec_mp->b_next = NULL; /* No tripping asserts. */
+ mblk_t *next;
+ mblk_t *data_mp;
+ ip_recv_attr_t iras;
+
+ while (attr_mp != NULL) {
+ ASSERT(ip_recv_attr_is_mblk(attr_mp));
+ next = attr_mp->b_next;
+ attr_mp->b_next = NULL; /* No tripping asserts. */
+
+ data_mp = attr_mp->b_cont;
+ attr_mp->b_cont = NULL;
+ if (!ip_recv_attr_from_mblk(attr_mp, &iras)) {
+ /* The ill or ip_stack_t disappeared on us */
+ freemsg(data_mp); /* ip_drop_packet?? */
+ ira_cleanup(&iras, B_TRUE);
+ goto fail;
+ }
/*
* Need IPPOL_REFHOLD(pol) for extras because
@@ -5954,67 +5475,67 @@ ipsec_check_ipsecin_policy_reasm(mblk_t *ipsec_mp, ipsec_policy_t *pol,
*/
IPPOL_REFHOLD(pol);
- if (ipsec_check_ipsecin_policy(ipsec_mp, pol, inner_ipv4,
- inner_ipv6, pkt_unique, ns) != NULL) {
- if (data_tail == NULL) {
- /* First one */
- data_chain = data_tail = ipsec_mp->b_cont;
- } else {
- data_tail->b_next = ipsec_mp->b_cont;
- data_tail = data_tail->b_next;
- }
- freeb(ipsec_mp);
+ data_mp = ipsec_check_ipsecin_policy(data_mp, pol, inner_ipv4,
+ inner_ipv6, pkt_unique, &iras, ns);
+ ira_cleanup(&iras, B_TRUE);
+
+ if (data_mp == NULL)
+ goto fail;
+
+ if (data_tail == NULL) {
+ /* First one */
+ data_chain = data_tail = data_mp;
} else {
- /*
- * ipsec_check_ipsecin_policy() freed ipsec_mp
- * already. Need to get rid of any extra pol
- * references, and any remaining bits as well.
- */
- IPPOL_REFRELE(pol, ns);
- ipsec_freemsg_chain(data_chain);
- ipsec_freemsg_chain(ii_next); /* ipdrop stats? */
- return (NULL);
+ data_tail->b_next = data_mp;
+ data_tail = data_mp;
}
- ipsec_mp = ii_next;
+ attr_mp = next;
}
/*
* One last release because either the loop bumped it up, or we never
* called ipsec_check_ipsecin_policy().
*/
- IPPOL_REFRELE(pol, ns);
+ IPPOL_REFRELE(pol);
/* data_chain is ready for return to tun module. */
return (data_chain);
-}
+fail:
+ /*
+ * Need to get rid of any extra pol
+ * references, and any remaining bits as well.
+ */
+ IPPOL_REFRELE(pol);
+ ipsec_freemsg_chain(data_chain);
+ ipsec_freemsg_chain(next); /* ipdrop stats? */
+ return (NULL);
+}
/*
- * Returns B_TRUE if the inbound packet passed an IPsec policy check. Returns
- * B_FALSE if it failed or if it is a fragment needing its friends before a
+ * Return a message if the inbound packet passed an IPsec policy check. Returns
+ * NULL if it failed or if it is a fragment needing its friends before a
* policy check can be performed.
*
- * Expects a non-NULL *data_mp, an optional ipsec_mp, and a non-NULL polhead.
- * data_mp may be reassigned with a b_next chain of packets if fragments
+ * Expects a non-NULL data_mp, and a non-NULL polhead.
+ * The returned mblk may be a b_next chain of packets if fragments
* neeeded to be collected for a proper policy check.
*
- * Always frees ipsec_mp, but only frees data_mp if returns B_FALSE. This
- * function calls ip_drop_packet() on data_mp if need be.
+ * This function calls ip_drop_packet() on data_mp if need be.
*
* NOTE: outer_hdr_len is signed. If it's a negative value, the caller
* is inspecting an ICMP packet.
*/
-boolean_t
-ipsec_tun_inbound(mblk_t *ipsec_mp, mblk_t **data_mp, ipsec_tun_pol_t *itp,
+mblk_t *
+ipsec_tun_inbound(ip_recv_attr_t *ira, mblk_t *data_mp, ipsec_tun_pol_t *itp,
ipha_t *inner_ipv4, ip6_t *inner_ipv6, ipha_t *outer_ipv4,
ip6_t *outer_ipv6, int outer_hdr_len, netstack_t *ns)
{
ipsec_policy_head_t *polhead;
ipsec_selector_t sel;
- mblk_t *message = (ipsec_mp == NULL) ? *data_mp : ipsec_mp;
ipsec_policy_t *pol;
uint16_t tmpport;
selret_t rc;
- boolean_t retval, port_policy_present, is_icmp, global_present;
+ boolean_t port_policy_present, is_icmp, global_present;
in6_addr_t tmpaddr;
ipaddr_t tmp4;
uint8_t flags, *inner_hdr;
@@ -6032,7 +5553,6 @@ ipsec_tun_inbound(mblk_t *ipsec_mp, mblk_t **data_mp, ipsec_tun_pol_t *itp,
ASSERT(inner_ipv4 != NULL && inner_ipv6 == NULL ||
inner_ipv4 == NULL && inner_ipv6 != NULL);
- ASSERT(message == *data_mp || message->b_cont == *data_mp);
if (outer_hdr_len < 0) {
outer_hdr_len = (-outer_hdr_len);
@@ -6042,6 +5562,8 @@ ipsec_tun_inbound(mblk_t *ipsec_mp, mblk_t **data_mp, ipsec_tun_pol_t *itp,
}
if (itp != NULL && (itp->itp_flags & ITPF_P_ACTIVE)) {
+ mblk_t *mp = data_mp;
+
polhead = itp->itp_policy;
/*
* We need to perform full Tunnel-Mode enforcement,
@@ -6061,53 +5583,66 @@ ipsec_tun_inbound(mblk_t *ipsec_mp, mblk_t **data_mp, ipsec_tun_pol_t *itp,
flags = ((port_policy_present ? SEL_PORT_POLICY : SEL_NONE) |
(is_icmp ? SEL_IS_ICMP : SEL_NONE) | SEL_TUNNEL_MODE);
- rc = ipsec_init_inbound_sel(&sel, *data_mp, inner_ipv4,
+ rc = ipsec_init_inbound_sel(&sel, data_mp, inner_ipv4,
inner_ipv6, flags);
switch (rc) {
case SELRET_NOMEM:
- ip_drop_packet(message, B_TRUE, NULL, NULL,
+ ip_drop_packet(data_mp, B_TRUE, NULL,
DROPPER(ipss, ipds_spd_nomem),
&ipss->ipsec_spd_dropper);
- return (B_FALSE);
+ return (NULL);
case SELRET_TUNFRAG:
/*
* At this point, if we're cleartext, we don't want
* to go there.
*/
- if (ipsec_mp == NULL) {
- ip_drop_packet(*data_mp, B_TRUE, NULL, NULL,
+ if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) {
+ ip_drop_packet(data_mp, B_TRUE, NULL,
DROPPER(ipss, ipds_spd_got_clear),
&ipss->ipsec_spd_dropper);
- *data_mp = NULL;
- return (B_FALSE);
+ return (NULL);
+ }
+ /*
+ * If we need to queue the packet. First we
+ * get an mblk with the attributes. ipsec_fragcache_add
+ * will prepend that to the queued data and return
+ * a list of b_next messages each of which starts with
+ * the attribute mblk.
+ */
+ mp = ip_recv_attr_to_mblk(ira);
+ if (mp == NULL) {
+ ip_drop_packet(data_mp, B_TRUE, NULL,
+ DROPPER(ipss, ipds_spd_nomem),
+ &ipss->ipsec_spd_dropper);
+ return (NULL);
}
- ASSERT(((ipsec_in_t *)ipsec_mp->b_rptr)->
- ipsec_in_secure);
- message = ipsec_fragcache_add(&itp->itp_fragcache,
- ipsec_mp, *data_mp, outer_hdr_len, ipss);
+ mp = ipsec_fragcache_add(&itp->itp_fragcache,
+ mp, data_mp, outer_hdr_len, ipss);
- if (message == NULL) {
+ if (mp == NULL) {
/*
* Data is cached, fragment chain is not
- * complete. I consume ipsec_mp and data_mp
+ * complete.
*/
- return (B_FALSE);
+ return (NULL);
}
/*
* If we get here, we have a full fragment chain.
* Reacquire headers and selectors from first fragment.
*/
- inner_hdr = message->b_cont->b_rptr;
+ ASSERT(ip_recv_attr_is_mblk(mp));
+ data_mp = mp->b_cont;
+ inner_hdr = data_mp->b_rptr;
if (outer_ipv4 != NULL) {
inner_hdr += IPH_HDR_LENGTH(
- (ipha_t *)message->b_cont->b_rptr);
+ (ipha_t *)data_mp->b_rptr);
} else {
- inner_hdr += ip_hdr_length_v6(message->b_cont,
- (ip6_t *)message->b_cont->b_rptr);
+ inner_hdr += ip_hdr_length_v6(data_mp,
+ (ip6_t *)data_mp->b_rptr);
}
- ASSERT(inner_hdr <= message->b_cont->b_wptr);
+ ASSERT(inner_hdr <= data_mp->b_wptr);
if (inner_ipv4 != NULL) {
inner_ipv4 = (ipha_t *)inner_hdr;
@@ -6121,7 +5656,7 @@ ipsec_tun_inbound(mblk_t *ipsec_mp, mblk_t **data_mp, ipsec_tun_pol_t *itp,
* Use SEL_TUNNEL_MODE to take into account the outer
* header. Use SEL_POST_FRAG so we always get ports.
*/
- rc = ipsec_init_inbound_sel(&sel, message->b_cont,
+ rc = ipsec_init_inbound_sel(&sel, data_mp,
inner_ipv4, inner_ipv6,
SEL_TUNNEL_MODE | SEL_POST_FRAG);
switch (rc) {
@@ -6132,17 +5667,15 @@ ipsec_tun_inbound(mblk_t *ipsec_mp, mblk_t **data_mp, ipsec_tun_pol_t *itp,
*/
break;
case SELRET_NOMEM:
- ip_drop_packet_chain(message, B_TRUE,
- NULL, NULL,
+ ip_drop_packet_chain(mp, B_TRUE, NULL,
DROPPER(ipss, ipds_spd_nomem),
&ipss->ipsec_spd_dropper);
- return (B_FALSE);
+ return (NULL);
case SELRET_BADPKT:
- ip_drop_packet_chain(message, B_TRUE,
- NULL, NULL,
+ ip_drop_packet_chain(mp, B_TRUE, NULL,
DROPPER(ipss, ipds_spd_malformed_frag),
&ipss->ipsec_spd_dropper);
- return (B_FALSE);
+ return (NULL);
case SELRET_TUNFRAG:
cmn_err(CE_WARN, "(TUNFRAG on 2nd call...)");
/* FALLTHRU */
@@ -6151,7 +5684,7 @@ ipsec_tun_inbound(mblk_t *ipsec_mp, mblk_t **data_mp, ipsec_tun_pol_t *itp,
" returns bizarro 0x%x", rc);
/* Guaranteed panic! */
ASSERT(rc == SELRET_NOMEM);
- return (B_FALSE);
+ return (NULL);
}
/* FALLTHRU */
case SELRET_SUCCESS:
@@ -6174,7 +5707,7 @@ ipsec_tun_inbound(mblk_t *ipsec_mp, mblk_t **data_mp, ipsec_tun_pol_t *itp,
"ipsec_init_inbound_sel() returns bizarro 0x%x",
rc);
ASSERT(rc == SELRET_NOMEM); /* Guaranteed panic! */
- return (B_FALSE);
+ return (NULL);
}
if (is_icmp) {
@@ -6192,42 +5725,54 @@ ipsec_tun_inbound(mblk_t *ipsec_mp, mblk_t **data_mp, ipsec_tun_pol_t *itp,
/* find_policy_head() */
rw_enter(&polhead->iph_lock, RW_READER);
pol = ipsec_find_policy_head(NULL, polhead, IPSEC_TYPE_INBOUND,
- &sel, ns);
+ &sel);
rw_exit(&polhead->iph_lock);
if (pol != NULL) {
- if (ipsec_mp == NULL ||
- !((ipsec_in_t *)ipsec_mp->b_rptr)->
- ipsec_in_secure) {
- retval = pol->ipsp_act->ipa_allow_clear;
- if (!retval) {
+ uint64_t pkt_unique;
+
+ if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) {
+ if (!pol->ipsp_act->ipa_allow_clear) {
/*
* XXX should never get here with
* tunnel reassembled fragments?
*/
- ASSERT(message->b_next == NULL);
- ip_drop_packet(message, B_TRUE, NULL,
- NULL,
+ ASSERT(mp == data_mp);
+ ip_drop_packet(data_mp, B_TRUE, NULL,
DROPPER(ipss, ipds_spd_got_clear),
&ipss->ipsec_spd_dropper);
- } else if (ipsec_mp != NULL) {
- freeb(ipsec_mp);
+ IPPOL_REFRELE(pol);
+ return (NULL);
+ } else {
+ IPPOL_REFRELE(pol);
+ return (mp);
}
-
- IPPOL_REFRELE(pol, ns);
- return (retval);
}
+ pkt_unique = SA_UNIQUE_ID(sel.ips_remote_port,
+ sel.ips_local_port,
+ (inner_ipv4 == NULL) ? IPPROTO_IPV6 :
+ IPPROTO_ENCAP, sel.ips_protocol);
+
/*
* NOTE: The following releases pol's reference and
* calls ip_drop_packet() for me on NULL returns.
*
* "sel" is still good here, so let's use it!
*/
- *data_mp = ipsec_check_ipsecin_policy_reasm(message,
- pol, inner_ipv4, inner_ipv6, SA_UNIQUE_ID(
- sel.ips_remote_port, sel.ips_local_port,
- (inner_ipv4 == NULL) ? IPPROTO_IPV6 :
- IPPROTO_ENCAP, sel.ips_protocol), ns);
- return (*data_mp != NULL);
+ if (data_mp == mp) {
+ /* A single packet without attributes */
+ data_mp = ipsec_check_ipsecin_policy(data_mp,
+ pol, inner_ipv4, inner_ipv6, pkt_unique,
+ ira, ns);
+ } else {
+ /*
+ * We pass in the b_next chain of attr_mp's
+ * and get back a b_next chain of data_mp's.
+ */
+ data_mp = ipsec_check_ipsecin_policy_reasm(mp,
+ pol, inner_ipv4, inner_ipv6, pkt_unique,
+ ns);
+ }
+ return (data_mp);
}
/*
@@ -6237,11 +5782,10 @@ ipsec_tun_inbound(mblk_t *ipsec_mp, mblk_t **data_mp, ipsec_tun_pol_t *itp,
* a new-style tunnel-mode tunnel.
*/
if ((itp->itp_flags & ITPF_P_TUNNEL) && !is_icmp) {
- ip_drop_packet_chain(message, B_TRUE, NULL,
- NULL,
+ ip_drop_packet_chain(data_mp, B_TRUE, NULL,
DROPPER(ipss, ipds_spd_explicit),
&ipss->ipsec_spd_dropper);
- return (B_FALSE);
+ return (NULL);
}
}
@@ -6251,24 +5795,22 @@ ipsec_tun_inbound(mblk_t *ipsec_mp, mblk_t **data_mp, ipsec_tun_pol_t *itp,
* tunnel-mode tunnel, which either returns with a pass, or gets
* hit by the ip_drop_packet_chain() call right above here.
*/
+ ASSERT(data_mp->b_next == NULL);
/* If no per-tunnel security, check global policy now. */
- if (ipsec_mp != NULL && !global_present) {
- if (((ipsec_in_t *)(ipsec_mp->b_rptr))->
- ipsec_in_icmp_loopback) {
+ if ((ira->ira_flags & IRAF_IPSEC_SECURE) && !global_present) {
+ if (ira->ira_flags & IRAF_TRUSTED_ICMP) {
/*
- * This is an ICMP message with an ipsec_mp
- * attached. We should accept it.
+ * This is an ICMP message that was geenrated locally.
+ * We should accept it.
*/
- if (ipsec_mp != NULL)
- freeb(ipsec_mp);
- return (B_TRUE);
+ return (data_mp);
}
- ip_drop_packet(ipsec_mp, B_TRUE, NULL, NULL,
+ ip_drop_packet(data_mp, B_TRUE, NULL,
DROPPER(ipss, ipds_spd_got_secure),
&ipss->ipsec_spd_dropper);
- return (B_FALSE);
+ return (NULL);
}
if (is_icmp) {
@@ -6294,11 +5836,10 @@ ipsec_tun_inbound(mblk_t *ipsec_mp, mblk_t **data_mp, ipsec_tun_pol_t *itp,
}
}
- /* NOTE: Frees message if it returns NULL. */
- if (ipsec_check_global_policy(message, NULL, outer_ipv4, outer_ipv6,
- (ipsec_mp != NULL), ns) == NULL) {
- return (B_FALSE);
- }
+ data_mp = ipsec_check_global_policy(data_mp, NULL, outer_ipv4,
+ outer_ipv6, ira, ns);
+ if (data_mp == NULL)
+ return (NULL);
if (is_icmp) {
/* Set things back to normal. */
@@ -6314,14 +5855,11 @@ ipsec_tun_inbound(mblk_t *ipsec_mp, mblk_t **data_mp, ipsec_tun_pol_t *itp,
}
}
- if (ipsec_mp != NULL)
- freeb(ipsec_mp);
-
/*
* At this point, we pretend it's a cleartext accepted
* packet.
*/
- return (B_TRUE);
+ return (data_mp);
}
/*
@@ -6365,7 +5903,7 @@ itp_unlink(ipsec_tun_pol_t *node, netstack_t *ns)
rw_enter(&ipss->ipsec_tunnel_policy_lock, RW_WRITER);
ipss->ipsec_tunnel_policy_gen++;
- ipsec_fragcache_uninit(&node->itp_fragcache);
+ ipsec_fragcache_uninit(&node->itp_fragcache, ipss);
avl_remove(&ipss->ipsec_tunnel_policies, node);
rw_exit(&ipss->ipsec_tunnel_policy_lock);
ITP_REFRELE(node, ns);
@@ -6615,7 +6153,7 @@ ipsec_fragcache_init(ipsec_fragcache_t *frag)
}
void
-ipsec_fragcache_uninit(ipsec_fragcache_t *frag)
+ipsec_fragcache_uninit(ipsec_fragcache_t *frag, ipsec_stack_t *ipss)
{
ipsec_fragcache_entry_t *fep;
int i;
@@ -6627,7 +6165,7 @@ ipsec_fragcache_uninit(ipsec_fragcache_t *frag)
fep = (frag->itpf_ptr)[i];
while (fep != NULL) {
/* Returned fep is next in chain or NULL */
- fep = fragcache_delentry(i, fep, frag);
+ fep = fragcache_delentry(i, fep, frag, ipss);
}
}
/*
@@ -6658,10 +6196,12 @@ ipsec_fragcache_uninit(ipsec_fragcache_t *frag)
/*
* Add a fragment to the fragment cache. Consumes mp if NULL is returned.
* Returns mp if a whole fragment has been assembled, NULL otherwise
+ * The returned mp could be a b_next chain of fragments.
+ *
+ * The iramp argument is set on inbound; NULL if outbound.
*/
-
mblk_t *
-ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp,
+ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *iramp, mblk_t *mp,
int outer_hdr_len, ipsec_stack_t *ipss)
{
boolean_t is_v4;
@@ -6672,7 +6212,7 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp,
uint8_t v6_proto;
uint8_t *v6_proto_p;
uint16_t ip6_hdr_length;
- ip6_pkt_t ipp;
+ ip_pkt_t ipp;
ip6_frag_t *fraghdr;
ipsec_fragcache_entry_t *fep;
int i;
@@ -6680,10 +6220,7 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp,
int firstbyte, lastbyte;
int offset;
int last;
- boolean_t inbound = (ipsec_mp != NULL);
- mblk_t *first_mp = inbound ? ipsec_mp : mp;
-
- ASSERT(first_mp == mp || first_mp->b_cont == mp);
+ boolean_t inbound = (iramp != NULL);
/*
* You're on the slow path, so insure that every packet in the
@@ -6692,14 +6229,14 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp,
if (mp->b_cont != NULL) {
nmp = msgpullup(mp, -1);
if (nmp == NULL) {
- ip_drop_packet(first_mp, inbound, NULL, NULL,
+ ip_drop_packet(mp, inbound, NULL,
DROPPER(ipss, ipds_spd_nomem),
&ipss->ipsec_spd_dropper);
+ if (inbound)
+ (void) ip_recv_attr_free_mblk(iramp);
return (NULL);
}
freemsg(mp);
- if (ipsec_mp != NULL)
- ipsec_mp->b_cont = nmp;
mp = nmp;
}
@@ -6721,9 +6258,11 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp,
* If it fails we have a malformed packet
*/
mutex_exit(&frag->itpf_lock);
- ip_drop_packet(first_mp, inbound, NULL, NULL,
+ ip_drop_packet(mp, inbound, NULL,
DROPPER(ipss, ipds_spd_malformed_packet),
&ipss->ipsec_spd_dropper);
+ if (inbound)
+ (void) ip_recv_attr_free_mblk(iramp);
return (NULL);
} else {
v6_proto = *v6_proto_p;
@@ -6731,16 +6270,18 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp,
bzero(&ipp, sizeof (ipp));
- (void) ip_find_hdr_v6(mp, ip6h, &ipp, NULL);
+ (void) ip_find_hdr_v6(mp, ip6h, B_FALSE, &ipp, NULL);
if (!(ipp.ipp_fields & IPPF_FRAGHDR)) {
/*
* We think this is a fragment, but didn't find
* a fragment header. Something is wrong.
*/
mutex_exit(&frag->itpf_lock);
- ip_drop_packet(first_mp, inbound, NULL, NULL,
+ ip_drop_packet(mp, inbound, NULL,
DROPPER(ipss, ipds_spd_malformed_frag),
&ipss->ipsec_spd_dropper);
+ if (inbound)
+ (void) ip_recv_attr_free_mblk(iramp);
return (NULL);
}
fraghdr = ipp.ipp_fraghdr;
@@ -6759,7 +6300,7 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp,
*/
itpf_time = gethrestime_sec();
if (itpf_time >= frag->itpf_expire_hint)
- ipsec_fragcache_clean(frag);
+ ipsec_fragcache_clean(frag, ipss);
/* Lookup to see if there is an existing entry */
@@ -6814,11 +6355,13 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp,
/* check for bogus fragments and delete the entry */
if (firstbyte > 0 && firstbyte <= 8) {
if (fep != NULL)
- (void) fragcache_delentry(i, fep, frag);
+ (void) fragcache_delentry(i, fep, frag, ipss);
mutex_exit(&frag->itpf_lock);
- ip_drop_packet(first_mp, inbound, NULL, NULL,
+ ip_drop_packet(mp, inbound, NULL,
DROPPER(ipss, ipds_spd_malformed_frag),
&ipss->ipsec_spd_dropper);
+ if (inbound)
+ (void) ip_recv_attr_free_mblk(iramp);
return (NULL);
}
@@ -6826,12 +6369,14 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp,
if (fep == NULL) {
if (frag->itpf_freelist == NULL) {
/* see if there is some space */
- ipsec_fragcache_clean(frag);
+ ipsec_fragcache_clean(frag, ipss);
if (frag->itpf_freelist == NULL) {
mutex_exit(&frag->itpf_lock);
- ip_drop_packet(first_mp, inbound, NULL, NULL,
+ ip_drop_packet(mp, inbound, NULL,
DROPPER(ipss, ipds_spd_nomem),
&ipss->ipsec_spd_dropper);
+ if (inbound)
+ (void) ip_recv_attr_free_mblk(iramp);
return (NULL);
}
}
@@ -6879,7 +6424,7 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp,
ipha_t *niph;
ipha_t *oniph;
ip6_t *nip6h;
- ip6_pkt_t nipp;
+ ip_pkt_t nipp;
ip6_frag_t *nfraghdr;
uint16_t nip6_hdr_length;
uint8_t *nv6_proto_p;
@@ -6929,14 +6474,17 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp,
if (!ip_hdr_length_nexthdr_v6(ndata_mp, nip6h,
&nip6_hdr_length, &nv6_proto_p)) {
mutex_exit(&frag->itpf_lock);
- ip_drop_packet_chain(nmp, inbound, NULL, NULL,
+ ip_drop_packet_chain(nmp, inbound, NULL,
DROPPER(ipss, ipds_spd_malformed_frag),
&ipss->ipsec_spd_dropper);
ipsec_freemsg_chain(ndata_mp);
+ if (inbound)
+ (void) ip_recv_attr_free_mblk(iramp);
return (NULL);
}
bzero(&nipp, sizeof (nipp));
- (void) ip_find_hdr_v6(ndata_mp, nip6h, &nipp, NULL);
+ (void) ip_find_hdr_v6(ndata_mp, nip6h, B_FALSE, &nipp,
+ NULL);
nfraghdr = nipp.ipp_fraghdr;
nfirstbyte = ntohs(nfraghdr->ip6f_offlg &
IP6F_OFF_MASK);
@@ -6968,11 +6516,13 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp,
if (bcmp(data, ndata, MIN(lastbyte, nlastbyte) -
firstbyte)) {
/* Overlapping data does not match */
- (void) fragcache_delentry(i, fep, frag);
+ (void) fragcache_delentry(i, fep, frag, ipss);
mutex_exit(&frag->itpf_lock);
- ip_drop_packet(first_mp, inbound, NULL, NULL,
+ ip_drop_packet(mp, inbound, NULL,
DROPPER(ipss, ipds_spd_overlap_frag),
&ipss->ipsec_spd_dropper);
+ if (inbound)
+ (void) ip_recv_attr_free_mblk(iramp);
return (NULL);
}
/* Part of defense for jolt2.c fragmentation attack */
@@ -6987,9 +6537,11 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp,
* ---------- ------
*/
mutex_exit(&frag->itpf_lock);
- ip_drop_packet(first_mp, inbound, NULL, NULL,
+ ip_drop_packet(mp, inbound, NULL,
DROPPER(ipss, ipds_spd_evil_frag),
&ipss->ipsec_spd_dropper);
+ if (inbound)
+ (void) ip_recv_attr_free_mblk(iramp);
return (NULL);
}
@@ -7027,12 +6579,17 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp,
if (bcmp(data, ndata, MIN(lastbyte, nlastbyte)
- nfirstbyte)) {
/* Overlap mismatch */
- (void) fragcache_delentry(i, fep, frag);
+ (void) fragcache_delentry(i, fep, frag,
+ ipss);
mutex_exit(&frag->itpf_lock);
- ip_drop_packet(first_mp, inbound, NULL,
- NULL, DROPPER(ipss,
+ ip_drop_packet(mp, inbound, NULL,
+ DROPPER(ipss,
ipds_spd_overlap_frag),
&ipss->ipsec_spd_dropper);
+ if (inbound) {
+ (void) ip_recv_attr_free_mblk(
+ iramp);
+ }
return (NULL);
}
}
@@ -7046,21 +6603,31 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp,
prevmp = nmp;
}
- first_mp->b_next = nmp;
+ /* Prepend the attributes before we link it in */
+ if (iramp != NULL) {
+ ASSERT(iramp->b_cont == NULL);
+ iramp->b_cont = mp;
+ mp = iramp;
+ iramp = NULL;
+ }
+ mp->b_next = nmp;
if (prevmp == NULL) {
- fep->itpfe_fraglist = first_mp;
+ fep->itpfe_fraglist = mp;
} else {
- prevmp->b_next = first_mp;
+ prevmp->b_next = mp;
}
if (last)
fep->itpfe_last = 1;
/* Part of defense for jolt2.c fragmentation attack */
if (++(fep->itpfe_depth) > IPSEC_MAX_FRAGS) {
- (void) fragcache_delentry(i, fep, frag);
+ (void) fragcache_delentry(i, fep, frag, ipss);
mutex_exit(&frag->itpf_lock);
- ip_drop_packet(first_mp, inbound, NULL, NULL,
+ if (inbound)
+ mp = ip_recv_attr_free_mblk(mp);
+
+ ip_drop_packet(mp, inbound, NULL,
DROPPER(ipss, ipds_spd_max_frags),
&ipss->ipsec_spd_dropper);
return (NULL);
@@ -7078,7 +6645,7 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp,
#ifdef FRAGCACHE_DEBUG
cmn_err(CE_WARN, "Last fragment cached.\n");
- cmn_err(CE_WARN, "mp = %p, first_mp = %p.\n", mp, first_mp);
+ cmn_err(CE_WARN, "mp = %p\n", mp);
#endif
offset = 0;
@@ -7118,14 +6685,15 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp,
if (!ip_hdr_length_nexthdr_v6(data_mp, ip6h,
&ip6_hdr_length, &v6_proto_p)) {
mutex_exit(&frag->itpf_lock);
- ip_drop_packet_chain(mp, inbound, NULL, NULL,
+ ip_drop_packet_chain(mp, inbound, NULL,
DROPPER(ipss, ipds_spd_malformed_frag),
&ipss->ipsec_spd_dropper);
return (NULL);
}
v6_proto = *v6_proto_p;
bzero(&ipp, sizeof (ipp));
- (void) ip_find_hdr_v6(data_mp, ip6h, &ipp, NULL);
+ (void) ip_find_hdr_v6(data_mp, ip6h, B_FALSE, &ipp,
+ NULL);
fraghdr = ipp.ipp_fraghdr;
firstbyte = ntohs(fraghdr->ip6f_offlg &
IP6F_OFF_MASK);
@@ -7163,7 +6731,7 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp,
(!is_v4 && !(fraghdr->ip6f_offlg & IP6F_MORE_FRAG))) {
mp = fep->itpfe_fraglist;
fep->itpfe_fraglist = NULL;
- (void) fragcache_delentry(i, fep, frag);
+ (void) fragcache_delentry(i, fep, frag, ipss);
mutex_exit(&frag->itpf_lock);
if ((is_v4 && (firstbyte + ntohs(iph->ipha_length) >
@@ -7171,7 +6739,7 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp,
ntohs(ip6h->ip6_plen) > 65535))) {
/* It is an invalid "ping-o-death" packet */
/* Discard it */
- ip_drop_packet_chain(mp, inbound, NULL, NULL,
+ ip_drop_packet_chain(mp, inbound, NULL,
DROPPER(ipss, ipds_spd_evil_frag),
&ipss->ipsec_spd_dropper);
return (NULL);
@@ -7181,7 +6749,7 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp,
"mp->b_next = %p", mp, mp->b_next);
#endif
/*
- * For inbound case, mp has ipsec_in b_next'd chain
+ * For inbound case, mp has attrmp b_next'd chain
* For outbound case, it is just data mp chain
*/
return (mp);
@@ -7202,7 +6770,7 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp,
}
static void
-ipsec_fragcache_clean(ipsec_fragcache_t *frag)
+ipsec_fragcache_clean(ipsec_fragcache_t *frag, ipsec_stack_t *ipss)
{
ipsec_fragcache_entry_t *fep;
int i;
@@ -7221,7 +6789,7 @@ ipsec_fragcache_clean(ipsec_fragcache_t *frag)
while (fep) {
if (fep->itpfe_exp < itpf_time) {
/* found */
- fep = fragcache_delentry(i, fep, frag);
+ fep = fragcache_delentry(i, fep, frag, ipss);
} else {
if (fep->itpfe_exp < earlyexp) {
earlyfep = fep;
@@ -7237,12 +6805,12 @@ ipsec_fragcache_clean(ipsec_fragcache_t *frag)
/* if (!found) */
if (frag->itpf_freelist == NULL)
- (void) fragcache_delentry(earlyi, earlyfep, frag);
+ (void) fragcache_delentry(earlyi, earlyfep, frag, ipss);
}
static ipsec_fragcache_entry_t *
fragcache_delentry(int slot, ipsec_fragcache_entry_t *fep,
- ipsec_fragcache_t *frag)
+ ipsec_fragcache_t *frag, ipsec_stack_t *ipss)
{
ipsec_fragcache_entry_t *targp;
ipsec_fragcache_entry_t *nextp = fep->itpfe_next;
@@ -7250,7 +6818,12 @@ fragcache_delentry(int slot, ipsec_fragcache_entry_t *fep,
ASSERT(MUTEX_HELD(&frag->itpf_lock));
/* Free up any fragment list still in cache entry */
- ipsec_freemsg_chain(fep->itpfe_fraglist);
+ if (fep->itpfe_fraglist != NULL) {
+ ip_drop_packet_chain(fep->itpfe_fraglist,
+ ip_recv_attr_is_mblk(fep->itpfe_fraglist), NULL,
+ DROPPER(ipss, ipds_spd_nomem), &ipss->ipsec_spd_dropper);
+ }
+ fep->itpfe_fraglist = NULL;
targp = (frag->itpf_ptr)[slot];
ASSERT(targp != 0);
diff --git a/usr/src/uts/common/inet/ip/spdsock.c b/usr/src/uts/common/inet/ip/spdsock.c
index e15d23fdd8..1b25af4a97 100644
--- a/usr/src/uts/common/inet/ip/spdsock.c
+++ b/usr/src/uts/common/inet/ip/spdsock.c
@@ -58,7 +58,6 @@
#include <inet/nd.h>
#include <inet/ip_if.h>
#include <inet/optcom.h>
-#include <inet/ipsec_info.h>
#include <inet/ipsec_impl.h>
#include <inet/spdsock.h>
#include <inet/sadb.h>
@@ -1150,9 +1149,8 @@ spdsock_addrule(queue_t *q, ipsec_policy_head_t *iph, mblk_t *mp,
fail:
rw_exit(&iph->iph_lock);
- while ((--rulep) >= &rules[0]) {
- IPPOL_REFRELE(rulep->pol, spds->spds_netstack);
- }
+ while ((--rulep) >= &rules[0])
+ IPPOL_REFRELE(rulep->pol);
ipsec_actvec_free(actp, nact);
fail2:
if (itp != NULL) {
@@ -2519,8 +2517,8 @@ error:
* be invoked either once IPsec is loaded on a cached request, or
* when a request is received while IPsec is loaded.
*/
-static void
-spdsock_do_updatealg(spd_ext_t *extv[], int *diag, spd_stack_t *spds)
+static int
+spdsock_do_updatealg(spd_ext_t *extv[], spd_stack_t *spds)
{
struct spd_ext_actions *actp;
struct spd_attribute *attr, *endattr;
@@ -2529,17 +2527,15 @@ spdsock_do_updatealg(spd_ext_t *extv[], int *diag, spd_stack_t *spds)
ipsec_algtype_t alg_type = 0;
boolean_t skip_alg = B_TRUE, doing_proto = B_FALSE;
uint_t i, cur_key, cur_block, algid;
+ int diag = -1;
- *diag = -1;
ASSERT(MUTEX_HELD(&spds->spds_alg_lock));
/* parse the message, building the list of algorithms */
actp = (struct spd_ext_actions *)extv[SPD_EXT_ACTION];
- if (actp == NULL) {
- *diag = SPD_DIAGNOSTIC_NO_ACTION_EXT;
- return;
- }
+ if (actp == NULL)
+ return (SPD_DIAGNOSTIC_NO_ACTION_EXT);
start = (uint64_t *)actp;
end = (start + actp->spd_actions_len);
@@ -2583,7 +2579,7 @@ spdsock_do_updatealg(spd_ext_t *extv[], int *diag, spd_stack_t *spds)
ss1dbg(spds, ("spdsock_do_updatealg: "
"invalid alg id %d\n",
attr->spd_attr_value));
- *diag = SPD_DIAGNOSTIC_ALG_ID_RANGE;
+ diag = SPD_DIAGNOSTIC_ALG_ID_RANGE;
goto bail;
}
alg->alg_id = attr->spd_attr_value;
@@ -2623,7 +2619,7 @@ spdsock_do_updatealg(spd_ext_t *extv[], int *diag, spd_stack_t *spds)
cur_key >= alg->alg_nkey_sizes) {
ss1dbg(spds, ("spdsock_do_updatealg: "
"too many key sizes\n"));
- *diag = SPD_DIAGNOSTIC_ALG_NUM_KEY_SIZES;
+ diag = SPD_DIAGNOSTIC_ALG_NUM_KEY_SIZES;
goto bail;
}
alg->alg_key_sizes[cur_key++] = attr->spd_attr_value;
@@ -2659,7 +2655,7 @@ spdsock_do_updatealg(spd_ext_t *extv[], int *diag, spd_stack_t *spds)
cur_block >= alg->alg_nblock_sizes) {
ss1dbg(spds, ("spdsock_do_updatealg: "
"too many block sizes\n"));
- *diag = SPD_DIAGNOSTIC_ALG_NUM_BLOCK_SIZES;
+ diag = SPD_DIAGNOSTIC_ALG_NUM_BLOCK_SIZES;
goto bail;
}
alg->alg_block_sizes[cur_block++] =
@@ -2686,7 +2682,7 @@ spdsock_do_updatealg(spd_ext_t *extv[], int *diag, spd_stack_t *spds)
cur_block >= alg->alg_nparams) {
ss1dbg(spds, ("spdsock_do_updatealg: "
"too many params\n"));
- *diag = SPD_DIAGNOSTIC_ALG_NUM_BLOCK_SIZES;
+ diag = SPD_DIAGNOSTIC_ALG_NUM_BLOCK_SIZES;
goto bail;
}
/*
@@ -2703,7 +2699,7 @@ spdsock_do_updatealg(spd_ext_t *extv[], int *diag, spd_stack_t *spds)
if (attr->spd_attr_value > CRYPTO_MAX_MECH_NAME) {
ss1dbg(spds, ("spdsock_do_updatealg: "
"mech name too long\n"));
- *diag = SPD_DIAGNOSTIC_ALG_MECH_NAME_LEN;
+ diag = SPD_DIAGNOSTIC_ALG_MECH_NAME_LEN;
goto bail;
}
mech_name = (char *)(attr + 1);
@@ -2751,6 +2747,7 @@ bail:
for (algid = 0; algid < IPSEC_MAX_ALGS; algid++)
if (spds->spds_algs[alg_type][algid] != NULL)
ipsec_alg_free(spds->spds_algs[alg_type][algid]);
+ return (diag);
}
/*
@@ -2803,9 +2800,12 @@ spdsock_updatealg(queue_t *q, mblk_t *mp, spd_ext_t *extv[])
int diag;
mutex_enter(&spds->spds_alg_lock);
- spdsock_do_updatealg(extv, &diag, spds);
- mutex_exit(&spds->spds_alg_lock);
+ diag = spdsock_do_updatealg(extv, spds);
if (diag == -1) {
+ /* Keep the lock held while we walk the SA tables. */
+ sadb_alg_update(IPSEC_ALG_ALL, 0, 0,
+ spds->spds_netstack);
+ mutex_exit(&spds->spds_alg_lock);
spd_echo(q, mp);
if (audit_active) {
cred_t *cr;
@@ -2817,6 +2817,7 @@ spdsock_updatealg(queue_t *q, mblk_t *mp, spd_ext_t *extv[])
cpid);
}
} else {
+ mutex_exit(&spds->spds_alg_lock);
spdsock_diag(q, mp, diag);
if (audit_active) {
cred_t *cr;
@@ -3117,10 +3118,7 @@ spdsock_update_pending_algs(netstack_t *ns)
mutex_enter(&spds->spds_alg_lock);
if (spds->spds_algs_pending) {
- int diag;
-
- spdsock_do_updatealg(spds->spds_extv_algs, &diag,
- spds);
+ (void) spdsock_do_updatealg(spds->spds_extv_algs, spds);
spds->spds_algs_pending = B_FALSE;
}
mutex_exit(&spds->spds_alg_lock);
@@ -3265,7 +3263,7 @@ spdsock_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
int
spdsock_opt_set(queue_t *q, uint_t mgmt_flags, int level, int name,
uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
- void *thisdg_attrs, cred_t *cr, mblk_t *mblk)
+ void *thisdg_attrs, cred_t *cr)
{
int *i1 = (int *)invalp;
spdsock_t *ss = (spdsock_t *)q->q_ptr;
@@ -3337,11 +3335,9 @@ spdsock_wput_other(queue_t *q, mblk_t *mp)
}
if (((union T_primitives *)mp->b_rptr)->type ==
T_SVR4_OPTMGMT_REQ) {
- (void) svr4_optcom_req(q, mp, cr,
- &spdsock_opt_obj, B_FALSE);
+ svr4_optcom_req(q, mp, cr, &spdsock_opt_obj);
} else {
- (void) tpi_optcom_req(q, mp, cr,
- &spdsock_opt_obj, B_FALSE);
+ tpi_optcom_req(q, mp, cr, &spdsock_opt_obj);
}
break;
case T_DATA_REQ:
diff --git a/usr/src/uts/common/inet/ip/spdsock_opt_data.c b/usr/src/uts/common/inet/ip/spdsock_opt_data.c
index df797bb37a..c5438f29cc 100644
--- a/usr/src/uts/common/inet/ip/spdsock_opt_data.c
+++ b/usr/src/uts/common/inet/ip/spdsock_opt_data.c
@@ -20,12 +20,10 @@
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/stream.h>
#define _SUN_TPI_VERSION 1
@@ -53,9 +51,9 @@
*/
opdes_t spdsock_opt_arr[] = {
- { SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_PASSNEXT,
+ { SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, 0,
(t_uscalar_t)sizeof (int), 0 },
- { SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_PASSNEXT,
+ { SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, 0,
(t_uscalar_t)sizeof (int), 0 },
};
@@ -88,7 +86,6 @@ optdb_obj_t spdsock_opt_obj = {
NULL, /* SPDSOCK default value function pointer */
spdsock_opt_get, /* SPDSOCK get function pointer */
spdsock_opt_set, /* SPDSOCK set function pointer */
- B_TRUE, /* SPDSOCK is tpi provider */
SPDSOCK_OPT_ARR_CNT, /* SPDSOCK option database count of entries */
spdsock_opt_arr, /* SPDSOCK option database */
SPDSOCK_VALID_LEVELS_CNT, /* SPDSOCK valid level count of entries */
diff --git a/usr/src/uts/common/inet/ip/tn_ipopt.c b/usr/src/uts/common/inet/ip/tn_ipopt.c
index 359b8d4623..1ce050ec69 100644
--- a/usr/src/uts/common/inet/ip/tn_ipopt.c
+++ b/usr/src/uts/common/inet/ip/tn_ipopt.c
@@ -271,38 +271,40 @@ tsol_get_option_v6(mblk_t *mp, tsol_ip_label_t *label_type, uchar_t **buffer)
* tsol_check_dest()
*
* This routine verifies if a destination is allowed to recieve messages
- * based on the message cred's security label. If any adjustments to
- * the cred are needed due to the connection's MAC mode or
- * the destination's ability to receive labels, an "effective cred"
- * will be returned.
+ * based on the security label. If any adjustments to the label are needed
+ * due to the connection's MAC mode or the destination's ability
+ * to receive labels, an "effective label" will be returned.
+ *
+ * zone_is_global is set if the actual zoneid is global. That is, it is
+ * not set for an exclusive-IP zone.
*
- * On successful return, effective_cred will point to the new creds needed
- * or will be NULL if new creds aren't needed. On error, effective_cred
- * is NULL.
+ * On successful return, effective_tsl will point to the new label needed
+ * or will be NULL if a new label isn't needed. On error, effective_tsl will
+ * point to NULL.
*
* Returns:
- * 0 Have or constructed appropriate credentials
- * EHOSTUNREACH The credentials failed the remote host accreditation
+ * 0 Label (was|is now) correct
+ * EHOSTUNREACH The label failed the remote host accreditation
* ENOMEM Memory allocation failure
*/
int
-tsol_check_dest(const cred_t *credp, const void *dst, uchar_t version,
- uint_t mac_mode, cred_t **effective_cred)
+tsol_check_dest(const ts_label_t *tsl, const void *dst,
+ uchar_t version, uint_t mac_mode, boolean_t zone_is_global,
+ ts_label_t **effective_tsl)
{
- ts_label_t *tsl, *newtsl = NULL;
+ ts_label_t *newtsl = NULL;
tsol_tpc_t *dst_rhtp;
- zoneid_t zoneid;
- if (effective_cred != NULL)
- *effective_cred = NULL;
+ if (effective_tsl != NULL)
+ *effective_tsl = NULL;
ASSERT(version == IPV4_VERSION ||
(version == IPV6_VERSION &&
!IN6_IS_ADDR_V4MAPPED((in6_addr_t *)dst)));
/* Always pass kernel level communication (NULL label) */
- if ((tsl = crgetlabel(credp)) == NULL) {
+ if (tsl == NULL) {
DTRACE_PROBE2(tx__tnopt__log__info__labeling__mac__allownull,
- char *, "destination ip(1) with null cred was passed",
+ char *, "destination ip(1) with null label was passed",
ipaddr_t, dst);
return (0);
}
@@ -358,9 +360,8 @@ tsol_check_dest(const cred_t *credp, const void *dst, uchar_t version,
}
if (!blequal(&dst_rhtp->tpc_tp.tp_def_label,
&tsl->tsl_label)) {
- zoneid = crgetzoneid(credp);
if (mac_mode != CONN_MAC_AWARE ||
- !(zoneid == GLOBAL_ZONEID ||
+ !(zone_is_global ||
bldominates(&tsl->tsl_label,
&dst_rhtp->tpc_tp.tp_def_label))) {
DTRACE_PROBE4(
@@ -438,51 +439,43 @@ tsol_check_dest(const cred_t *credp, const void *dst, uchar_t version,
}
/*
- * Generate a new cred if we modified the security label or
- * label flags.
+ * Return the new label.
*/
if (newtsl != NULL) {
- if (effective_cred != NULL) {
- *effective_cred = copycred_from_tslabel(credp,
- newtsl, KM_NOSLEEP);
- }
- label_rele(newtsl);
- if (effective_cred != NULL && *effective_cred == NULL) {
- TPC_RELE(dst_rhtp);
- return (ENOMEM);
- }
+ if (effective_tsl != NULL)
+ *effective_tsl = newtsl;
+ else
+ label_rele(newtsl);
}
TPC_RELE(dst_rhtp);
return (0);
}
/*
- * tsol_compute_label()
+ * tsol_compute_label_v4()
*
* This routine computes the IP label that should be on a packet based on the
* connection and destination information.
*
+ * The zoneid is the IP zoneid (i.e., GLOBAL_ZONEID for exlusive-IP zones).
+ *
* Returns:
* 0 Fetched label
* EHOSTUNREACH No route to destination
* EINVAL Label cannot be computed
*/
int
-tsol_compute_label(const cred_t *credp, ipaddr_t dst, uchar_t *opt_storage,
- ip_stack_t *ipst)
+tsol_compute_label_v4(const ts_label_t *tsl, zoneid_t zoneid, ipaddr_t dst,
+ uchar_t *opt_storage, ip_stack_t *ipst)
{
uint_t sec_opt_len;
- ts_label_t *tsl;
- ire_t *ire, *sire = NULL;
- tsol_ire_gw_secattr_t *attrp;
- zoneid_t zoneid, ip_zoneid;
-
- ASSERT(credp != NULL);
+ ire_t *ire;
+ tsol_ire_gw_secattr_t *attrp = NULL;
if (opt_storage != NULL)
opt_storage[IPOPT_OLEN] = 0;
- if ((tsl = crgetlabel(credp)) == NULL)
+ if (tsl == NULL)
return (0);
/* always pass multicast */
@@ -493,67 +486,44 @@ tsol_compute_label(const cred_t *credp, ipaddr_t dst, uchar_t *opt_storage,
return (0);
if (tsl->tsl_flags & TSLF_UNLABELED) {
-
/*
* The destination is unlabeled. Only add a label if the
* destination is not a broadcast/local/loopback address,
* the destination is not on the same subnet, and the
* next-hop gateway is labeled.
- *
- * For exclusive stacks we set the zoneid to zero
- * to operate as if we are in the global zone for
- * IRE lookups.
*/
- zoneid = crgetzoneid(credp);
- if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID)
- ip_zoneid = GLOBAL_ZONEID;
- else
- ip_zoneid = zoneid;
-
- ire = ire_cache_lookup(dst, ip_zoneid, tsl, ipst);
-
- if (ire != NULL && (ire->ire_type & (IRE_BROADCAST | IRE_LOCAL |
- IRE_LOOPBACK | IRE_INTERFACE)) != 0) {
- IRE_REFRELE(ire);
- return (0);
- } else if (ire == NULL) {
- ire = ire_ftable_lookup(dst, 0, 0, 0, NULL, &sire,
- ip_zoneid, 0, tsl, (MATCH_IRE_RECURSIVE |
- MATCH_IRE_DEFAULT | MATCH_IRE_SECATTR), ipst);
- }
-
- /* no route to destination */
- if (ire == NULL) {
+ ire = ire_route_recursive_v4(dst, 0, NULL, zoneid, tsl,
+ MATCH_IRE_SECATTR, B_TRUE, 0, ipst, NULL, &attrp, NULL);
+ ASSERT(ire != NULL);
+ if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
+ /* no route to destination */
+ ire_refrele(ire);
DTRACE_PROBE3(
tx__tnopt__log__info__labeling__routedst__v4,
char *, "No route to unlabeled dest ip(1) with "
- "creds(2).", ipaddr_t, dst, cred_t *, credp);
+ "with label(2).", ipaddr_t, dst, ts_label_t *, tsl);
return (EHOSTUNREACH);
}
+ if (ire->ire_type & (IRE_BROADCAST | IRE_LOCAL | IRE_LOOPBACK |
+ IRE_INTERFACE)) {
+ ire_refrele(ire);
+ return (0);
+ }
/*
- * Prefix IRE from f-table lookup means that the destination
- * is not directly connected; check the next-hop attributes.
+ * ire_route_recursive gives us the first attrp it finds
+ * in the recursive lookup.
*/
- if (sire != NULL) {
- ASSERT(ire != NULL);
- IRE_REFRELE(ire);
- ire = sire;
- }
-
/*
* Return now if next hop gateway is unlabeled. There is
* no need to generate a CIPSO option for this message.
*/
- attrp = ire->ire_gw_secattr;
if (attrp == NULL || attrp->igsa_rhc == NULL ||
attrp->igsa_rhc->rhc_tpc->tpc_tp.host_type == UNLABELED) {
- IRE_REFRELE(ire);
+ ire_refrele(ire);
return (0);
}
-
- IRE_REFRELE(ire);
-
+ ire_refrele(ire);
}
/* compute the CIPSO option */
@@ -562,8 +532,8 @@ tsol_compute_label(const cred_t *credp, ipaddr_t dst, uchar_t *opt_storage,
if (sec_opt_len == 0) {
DTRACE_PROBE3(tx__tnopt__log__error__labeling__lostops__v4,
- char *, "options lack length for dest ip(1) with creds(2).",
- ipaddr_t, dst, cred_t *, credp);
+ char *, "options lack length for dest ip(1) with label(2).",
+ ipaddr_t, dst, ts_label_t *, tsl);
return (EINVAL);
}
@@ -575,6 +545,9 @@ tsol_compute_label(const cred_t *credp, ipaddr_t dst, uchar_t *opt_storage,
* header, move the 'buflen' bytes back to fill the gap, and return the number
* of bytes removed (as zero or negative number). Assumes that the headers are
* sane.
+ *
+ * Note that tsol_remove_secopt does not adjust ipha_length but
+ * tsol_remove_secopt_v6 does adjust ip6_plen.
*/
int
tsol_remove_secopt(ipha_t *ipha, int buflen)
@@ -659,6 +632,9 @@ tsol_remove_secopt(ipha_t *ipha, int buflen)
* option cannot be inserted. (Note that negative return values are possible
* when noops must be compressed, and that only -1 indicates error. Successful
* return value is always evenly divisible by 4, by definition.)
+ *
+ * Note that tsol_prepend_option does not adjust ipha_length but
+ * tsol_prepend_option_v6 does adjust ip6_plen.
*/
int
tsol_prepend_option(uchar_t *optbuf, ipha_t *ipha, int buflen)
@@ -810,28 +786,39 @@ tsol_prepend_option(uchar_t *optbuf, ipha_t *ipha, int buflen)
}
/*
- * tsol_check_label()
+ * tsol_check_label_v4()
*
* This routine computes the IP label that should be on the packet based on the
- * connection and destination information. If the label is there, it returns
- * zero, so the caller knows that the label is syncronized, and further calls
- * are not required. If the label isn't right, then the right one is inserted.
+ * connection and destination information. It's called by the IP forwarding
+ * logic and by ip_output_simple. The ULPs generate the labels before calling
+ * conn_ip_output. If any adjustments to
+ * the label are needed due to the connection's MAC-exempt status or
+ * the destination's ability to receive labels, an "effective label"
+ * will be returned.
*
* The packet's header is clear before entering IPsec's engine.
*
+ * The zoneid is the IP zoneid (i.e., GLOBAL_ZONEID for exlusive-IP zones).
+ * zone_is_global is set if the actual zoneid is global.
+ *
+ * On successful return, effective_tslp will point to the new label needed
+ * or will be NULL if a new label isn't needed. On error, effective_tsl will
+ * point to NULL.
+ *
* Returns:
- * 0 Label on packet (was|is now) correct
+ * 0 Label on (was|is now) correct
* EACCES The packet failed the remote host accreditation.
* ENOMEM Memory allocation failure.
* EINVAL Label cannot be computed
*/
int
-tsol_check_label(const cred_t *credp, mblk_t **mpp, uint_t mac_mode,
- ip_stack_t *ipst, pid_t pid)
+tsol_check_label_v4(const ts_label_t *tsl, zoneid_t zoneid, mblk_t **mpp,
+ uint_t mac_mode, boolean_t zone_is_global, ip_stack_t *ipst,
+ ts_label_t **effective_tslp)
{
mblk_t *mp = *mpp;
ipha_t *ipha;
- cred_t *effective_cred = NULL;
+ ts_label_t *effective_tsl = NULL;
uchar_t opt_storage[IP_MAX_OPT_LENGTH];
uint_t hlen;
uint_t sec_opt_len;
@@ -839,19 +826,18 @@ tsol_check_label(const cred_t *credp, mblk_t **mpp, uint_t mac_mode,
int delta_remove = 0, delta_add, adjust;
int retv;
+ *effective_tslp = NULL;
opt_storage[IPOPT_OPTVAL] = 0;
ipha = (ipha_t *)mp->b_rptr;
/*
* Verify the destination is allowed to receive packets at
- * the security label of the message data. check_dest()
- * may create a new effective cred with a modified label
- * or label flags. Apply any such cred to the message block
- * for use in future routing decisions.
+ * the security label of the message data. tsol_check_dest()
+ * may create a new effective label or label flags.
*/
- retv = tsol_check_dest(credp, &ipha->ipha_dst, IPV4_VERSION,
- mac_mode, &effective_cred);
+ retv = tsol_check_dest(tsl, &ipha->ipha_dst, IPV4_VERSION,
+ mac_mode, zone_is_global, &effective_tsl);
if (retv != 0)
return (retv);
@@ -859,16 +845,15 @@ tsol_check_label(const cred_t *credp, mblk_t **mpp, uint_t mac_mode,
* Calculate the security label to be placed in the text
* of the message (if any).
*/
- if (effective_cred != NULL) {
- if ((retv = tsol_compute_label(effective_cred,
+ if (effective_tsl != NULL) {
+ if ((retv = tsol_compute_label_v4(effective_tsl, zoneid,
ipha->ipha_dst, opt_storage, ipst)) != 0) {
- crfree(effective_cred);
+ label_rele(effective_tsl);
return (retv);
}
- mblk_setcred(mp, effective_cred, pid);
- crfree(effective_cred);
+ *effective_tslp = effective_tsl;
} else {
- if ((retv = tsol_compute_label(credp,
+ if ((retv = tsol_compute_label_v4(tsl, zoneid,
ipha->ipha_dst, opt_storage, ipst)) != 0) {
return (retv);
}
@@ -890,10 +875,6 @@ tsol_check_label(const cred_t *credp, mblk_t **mpp, uint_t mac_mode,
return (0);
}
- if (msg_getcred(mp, NULL) == NULL) {
- mblk_setcred(mp, (cred_t *)credp, NOPID);
- }
-
/*
* If there is an option there, then it must be the wrong one; delete.
*/
@@ -918,8 +899,13 @@ tsol_check_label(const cred_t *credp, mblk_t **mpp, uint_t mac_mode,
copylen = 256;
new_mp = allocb_tmpl(hlen + copylen +
(mp->b_rptr - mp->b_datap->db_base), mp);
- if (new_mp == NULL)
+ if (new_mp == NULL) {
+ if (effective_tsl != NULL) {
+ label_rele(effective_tsl);
+ *effective_tslp = NULL;
+ }
return (ENOMEM);
+ }
/* keep the bias */
new_mp->b_rptr += mp->b_rptr - mp->b_datap->db_base;
@@ -948,6 +934,10 @@ tsol_check_label(const cred_t *credp, mblk_t **mpp, uint_t mac_mode,
return (0);
param_prob:
+ if (effective_tsl != NULL) {
+ label_rele(effective_tsl);
+ *effective_tslp = NULL;
+ }
return (EINVAL);
}
@@ -972,19 +962,17 @@ param_prob:
* i.e starting from the IP6OPT_LS but not including the pad at the end.
* The user must prepend two octets (either padding or next header / length)
* and append padding out to the next 8 octet boundary.
+ *
+ * The zoneid is the IP zoneid (i.e., GLOBAL_ZONEID for exlusive-IP zones).
*/
int
-tsol_compute_label_v6(const cred_t *credp, const in6_addr_t *dst,
- uchar_t *opt_storage, ip_stack_t *ipst)
+tsol_compute_label_v6(const ts_label_t *tsl, zoneid_t zoneid,
+ const in6_addr_t *dst, uchar_t *opt_storage, ip_stack_t *ipst)
{
- ts_label_t *tsl;
uint_t sec_opt_len;
uint32_t doi;
- zoneid_t zoneid, ip_zoneid;
- ire_t *ire, *sire;
- tsol_ire_gw_secattr_t *attrp;
-
- ASSERT(credp != NULL);
+ ire_t *ire;
+ tsol_ire_gw_secattr_t *attrp = NULL;
if (ip6opt_ls == 0)
return (EINVAL);
@@ -992,15 +980,13 @@ tsol_compute_label_v6(const cred_t *credp, const in6_addr_t *dst,
if (opt_storage != NULL)
opt_storage[IPOPT_OLEN] = 0;
- if ((tsl = crgetlabel(credp)) == NULL)
+ if (tsl == NULL)
return (0);
/* Always pass multicast */
if (IN6_IS_ADDR_MULTICAST(dst))
return (0);
- zoneid = crgetzoneid(credp);
-
/*
* Fill in a V6 label. If a new format is added here, make certain
* that the maximum size of this label is reflected in sys/tsol/tnet.h
@@ -1012,62 +998,41 @@ tsol_compute_label_v6(const cred_t *credp, const in6_addr_t *dst,
if (tsl->tsl_flags & TSLF_UNLABELED) {
/*
* The destination is unlabeled. Only add a label if the
- * destination is not broadcast/local/loopback address,
+ * destination is not a broadcast/local/loopback address,
* the destination is not on the same subnet, and the
* next-hop gateway is labeled.
- *
- * For exclusive stacks we set the zoneid to zero to
- * operate as if we are in the global zone when
- * performing IRE lookups and conn_t comparisons.
*/
- if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID)
- ip_zoneid = GLOBAL_ZONEID;
- else
- ip_zoneid = zoneid;
-
- sire = NULL;
- ire = ire_cache_lookup_v6(dst, ip_zoneid, tsl, ipst);
-
- if (ire != NULL && (ire->ire_type & (IRE_LOCAL |
- IRE_LOOPBACK | IRE_INTERFACE)) != 0) {
- IRE_REFRELE(ire);
- return (0);
- } else if (ire == NULL) {
- ire = ire_ftable_lookup_v6(dst, NULL, NULL, 0, NULL,
- &sire, ip_zoneid, 0, tsl, (MATCH_IRE_RECURSIVE |
- MATCH_IRE_DEFAULT | MATCH_IRE_SECATTR), ipst);
- }
-
- /* no route to destination */
- if (ire == NULL) {
+ ire = ire_route_recursive_v6(dst, 0, NULL, zoneid, tsl,
+ MATCH_IRE_SECATTR, B_TRUE, 0, ipst, NULL, &attrp, NULL);
+ ASSERT(ire != NULL);
+ if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
+ /* no route to destination */
+ ire_refrele(ire);
DTRACE_PROBE3(
tx__tnopt__log__info__labeling__routedst__v6,
char *, "No route to unlabeled dest ip6(1) with "
- "creds(2).", in6_addr_t *, dst, cred_t *, credp);
+ "label(2).", in6_addr_t *, dst, ts_label_t *, tsl);
return (EHOSTUNREACH);
}
-
+ if (ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK |
+ IRE_INTERFACE)) {
+ ire_refrele(ire);
+ return (0);
+ }
/*
- * Prefix IRE from f-table lookup means that the destination
- * is not directly connected; check the next-hop attributes.
+ * ire_route_recursive gives us the first attrp it finds
+ * in the recursive lookup.
*/
- if (sire != NULL) {
- ASSERT(ire != NULL);
- IRE_REFRELE(ire);
- ire = sire;
- }
-
/*
* Return now if next hop gateway is unlabeled. There is
* no need to generate a CIPSO option for this message.
*/
- attrp = ire->ire_gw_secattr;
if (attrp == NULL || attrp->igsa_rhc == NULL ||
attrp->igsa_rhc->rhc_tpc->tpc_tp.host_type == UNLABELED) {
- IRE_REFRELE(ire);
+ ire_refrele(ire);
return (0);
}
- IRE_REFRELE(ire);
+ ire_refrele(ire);
}
/* compute the CIPSO option */
@@ -1079,7 +1044,7 @@ tsol_compute_label_v6(const cred_t *credp, const in6_addr_t *dst,
if (sec_opt_len == 0) {
DTRACE_PROBE3(tx__tnopt__log__error__labeling__lostops__v6,
char *, "options lack length for dest ip6(1) with "
- "creds(2).", in6_addr_t *, dst, cred_t *, credp);
+ "label(2).", in6_addr_t *, dst, ts_label_t *, tsl);
return (EINVAL);
}
@@ -1188,6 +1153,9 @@ tsol_find_secopt_v6(
* Header and data following the label option that is deleted are copied
* (i.e. slid backward) to the right position, and returns the number
* of bytes removed (as zero or negative number.)
+ *
+ * Note that tsol_remove_secopt does not adjust ipha_length but
+ * tsol_remove_secopt_v6 does adjust ip6_plen.
*/
int
tsol_remove_secopt_v6(ip6_t *ip6h, int buflen)
@@ -1286,6 +1254,9 @@ tsol_remove_secopt_v6(ip6_t *ip6h, int buflen)
* extra option being added. Header and data following the position where
* the label option is inserted are copied (i.e. slid forward) to the right
* position.
+ *
+ * Note that tsol_prepend_option does not adjust ipha_length but
+ * tsol_prepend_option_v6 does adjust ip6_plen.
*/
int
tsol_prepend_option_v6(uchar_t *optbuf, ip6_t *ip6h, int buflen)
@@ -1368,22 +1339,36 @@ tsol_prepend_option_v6(uchar_t *optbuf, ip6_t *ip6h, int buflen)
* tsol_check_label_v6()
*
* This routine computes the IP label that should be on the packet based on the
- * connection and destination information. It's called only by the IP
- * forwarding logic, because all internal modules atop IP know how to generate
- * their own labels.
+ * connection and destination information. It's called by the IP forwarding
+ * logic and by ip_output_simple. The ULPs generate the labels before calling
+ * conn_ip_output. If any adjustments to
+ * the label are needed due to the connection's MAC-exempt status or
+ * the destination's ability to receive labels, an "effective label"
+ * will be returned.
+ *
+ * The packet's header is clear before entering IPsec's engine.
+ *
+ * The zoneid is the IP zoneid (i.e., GLOBAL_ZONEID for exlusive-IP zones).
+ * zone_is_global is set if the actual zoneid is global.
+ *
+ * On successful return, effective_tslp will point to the new label needed
+ * or will be NULL if a new label isn't needed. On error, effective_tsl will
+ * point to NULL.
*
* Returns:
- * 0 Label on packet was already correct
+ * 0 Label on (was|is now) correct
* EACCES The packet failed the remote host accreditation.
* ENOMEM Memory allocation failure.
+ * EINVAL Label cannot be computed
*/
int
-tsol_check_label_v6(const cred_t *credp, mblk_t **mpp, uint_t mode,
- ip_stack_t *ipst, pid_t pid)
+tsol_check_label_v6(const ts_label_t *tsl, zoneid_t zoneid, mblk_t **mpp,
+ uint_t mac_mode, boolean_t zone_is_global, ip_stack_t *ipst,
+ ts_label_t **effective_tslp)
{
mblk_t *mp = *mpp;
ip6_t *ip6h;
- cred_t *effective_cred;
+ ts_label_t *effective_tsl = NULL;
/*
* Label option length is limited to IP_MAX_OPT_LENGTH for
* symmetry with IPv4. Can be relaxed if needed
@@ -1399,16 +1384,16 @@ tsol_check_label_v6(const cred_t *credp, mblk_t **mpp, uint_t mode,
uint_t hbhlen;
boolean_t hbh_needed;
+ *effective_tslp = NULL;
+
/*
* Verify the destination is allowed to receive packets at
- * the security label of the message data. check_dest()
- * may create a new effective cred with a modified label
- * or label flags. Apply any such cred to the message block
- * for use in future routing decisions.
+ * the security label of the message data. tsol_check_dest()
+ * may create a new effective label or label flags.
*/
ip6h = (ip6_t *)mp->b_rptr;
- retv = tsol_check_dest(credp, &ip6h->ip6_dst, IPV6_VERSION,
- mode, &effective_cred);
+ retv = tsol_check_dest(tsl, &ip6h->ip6_dst, IPV6_VERSION,
+ mac_mode, zone_is_global, &effective_tsl);
if (retv != 0)
return (retv);
@@ -1416,16 +1401,15 @@ tsol_check_label_v6(const cred_t *credp, mblk_t **mpp, uint_t mode,
* Calculate the security label to be placed in the text
* of the message (if any).
*/
- if (effective_cred != NULL) {
- if ((retv = tsol_compute_label_v6(effective_cred,
+ if (effective_tsl != NULL) {
+ if ((retv = tsol_compute_label_v6(effective_tsl, zoneid,
&ip6h->ip6_dst, opt_storage, ipst)) != 0) {
- crfree(effective_cred);
+ label_rele(effective_tsl);
return (retv);
}
- mblk_setcred(mp, effective_cred, pid);
- crfree(effective_cred);
+ *effective_tslp = effective_tsl;
} else {
- if ((retv = tsol_compute_label_v6(credp,
+ if ((retv = tsol_compute_label_v6(tsl, zoneid,
&ip6h->ip6_dst, opt_storage, ipst)) != 0)
return (retv);
}
@@ -1457,10 +1441,6 @@ tsol_check_label_v6(const cred_t *credp, mblk_t **mpp, uint_t mode,
return (0);
}
- if (msg_getcred(mp, NULL) == NULL) {
- mblk_setcred(mp, (cred_t *)credp, NOPID);
- }
-
if (secopt != NULL && sec_opt_len != 0 &&
(bcmp(opt_storage, secopt, sec_opt_len + 2) == 0)) {
/* The packet has the correct label already */
@@ -1499,8 +1479,13 @@ tsol_check_label_v6(const cred_t *credp, mblk_t **mpp, uint_t mode,
copylen = hdr_len;
new_mp = allocb_tmpl(hlen + copylen +
(mp->b_rptr - mp->b_datap->db_base), mp);
- if (new_mp == NULL)
+ if (new_mp == NULL) {
+ if (effective_tsl != NULL) {
+ label_rele(effective_tsl);
+ *effective_tslp = NULL;
+ }
return (ENOMEM);
+ }
/* keep the bias */
new_mp->b_rptr += mp->b_rptr - mp->b_datap->db_base;
@@ -1522,208 +1507,13 @@ tsol_check_label_v6(const cred_t *credp, mblk_t **mpp, uint_t mode,
ASSERT(mp->b_wptr + delta_add <= DB_LIM(mp));
mp->b_wptr += delta_add;
+ /* tsol_prepend_option_v6 has adjusted ip6_plen */
return (0);
param_prob:
- return (EINVAL);
-}
-
-/*
- * Update the given IPv6 "sticky options" structure to contain the provided
- * label, which is encoded as an IPv6 option. Existing label is removed if
- * necessary, and storage is allocated/freed/resized.
- *
- * Returns 0 on success, errno on failure.
- */
-int
-tsol_update_sticky(ip6_pkt_t *ipp, uint_t *labellen, const uchar_t *labelopt)
-{
- int rawlen, optlen, newlen;
- uchar_t *newopts;
-
- /*
- * rawlen is the size of the IPv6 label to be inserted from labelopt.
- * optlen is the total length of that option, including any necessary
- * headers and padding. newlen is the new size of the total hop-by-hop
- * options buffer, including user options.
- */
- ASSERT(*labellen <= ipp->ipp_hopoptslen);
- ASSERT((ipp->ipp_hopopts == NULL && ipp->ipp_hopoptslen == 0) ||
- (ipp->ipp_hopopts != NULL && ipp->ipp_hopoptslen != 0));
-
- if ((rawlen = labelopt[1]) != 0) {
- rawlen += 2; /* add in header size */
- optlen = (2 + rawlen + 7) & ~7;
- } else {
- optlen = 0;
- }
- newlen = ipp->ipp_hopoptslen + optlen - *labellen;
- if (newlen == 0 && ipp->ipp_hopopts != NULL) {
- /* Deleting all existing hop-by-hop options */
- kmem_free(ipp->ipp_hopopts, ipp->ipp_hopoptslen);
- ipp->ipp_hopopts = NULL;
- ipp->ipp_fields &= ~IPPF_HOPOPTS;
- } else if (optlen != *labellen) {
- /* If the label not same size as last time, then reallocate */
- if (newlen > IP6_MAX_OPT_LENGTH)
- return (EHOSTUNREACH);
- newopts = kmem_alloc(newlen, KM_NOSLEEP);
- if (newopts == NULL)
- return (ENOMEM);
- /*
- * If the user has hop-by-hop stickyoptions set, then copy his
- * options in after the security label.
- */
- if (ipp->ipp_hopoptslen > *labellen) {
- bcopy(ipp->ipp_hopopts + *labellen, newopts + optlen,
- ipp->ipp_hopoptslen - *labellen);
- /*
- * Stomp out any header gunk here - this was the
- * previous next-header and option length field.
- */
- newopts[optlen] = IP6OPT_PADN;
- newopts[optlen + 1] = 0;
- }
- if (ipp->ipp_hopopts != NULL)
- kmem_free(ipp->ipp_hopopts, ipp->ipp_hopoptslen);
- ipp->ipp_hopopts = (ip6_hbh_t *)newopts;
- }
- ipp->ipp_hopoptslen = newlen;
- *labellen = optlen;
-
- newopts = (uchar_t *)ipp->ipp_hopopts;
-
- /* If there are any options, then fix up reported length */
- if (newlen > 0) {
- newopts[1] = (newlen + 7) / 8 - 1;
- ipp->ipp_fields |= IPPF_HOPOPTS;
- }
-
- /* If there's a label, then insert it now */
- if (optlen > 0) {
- /* skip next-header and length fields */
- newopts += 2;
- bcopy(labelopt, newopts, rawlen);
- newopts += rawlen;
- /* make sure padding comes out right */
- optlen -= 2 + rawlen;
- if (optlen == 1) {
- newopts[0] = IP6OPT_PAD1;
- } else if (optlen > 1) {
- newopts[0] = IP6OPT_PADN;
- optlen -= 2;
- newopts[1] = optlen;
- if (optlen > 0)
- bzero(newopts + 2, optlen);
- }
- }
- return (0);
-}
-
-int
-tsol_update_options(uchar_t **opts, uint_t *totlen, uint_t *labellen,
- const uchar_t *labelopt)
-{
- int optlen, newlen;
- uchar_t *newopts;
-
- optlen = (labelopt[IPOPT_OLEN] + 3) & ~3;
- newlen = *totlen + optlen - *labellen;
- if (optlen > *labellen) {
- if (newlen > IP_MAX_OPT_LENGTH)
- return (EHOSTUNREACH);
- newopts = (uchar_t *)mi_alloc(newlen, BPRI_HI);
- if (newopts == NULL)
- return (ENOMEM);
- if (*totlen > *labellen) {
- bcopy(*opts + *labellen, newopts + optlen,
- *totlen - *labellen);
- }
- if (*opts != NULL)
- mi_free((char *)*opts);
- *opts = newopts;
- } else if (optlen < *labellen) {
- if (newlen == 0 && *opts != NULL) {
- mi_free((char *)*opts);
- *opts = NULL;
- }
- if (*totlen > *labellen) {
- ovbcopy(*opts + *labellen, *opts + optlen,
- *totlen - *labellen);
- }
- }
- *totlen = newlen;
- *labellen = optlen;
- if (optlen > 0) {
- newopts = *opts;
- bcopy(labelopt, newopts, optlen);
- /* check if there are user-supplied options that follow */
- if (optlen < newlen) {
- /* compute amount of embedded alignment needed */
- optlen -= newopts[IPOPT_OLEN];
- newopts += newopts[IPOPT_OLEN];
- while (--optlen >= 0)
- *newopts++ = IPOPT_NOP;
- } else if (optlen != newopts[IPOPT_OLEN]) {
- /*
- * The label option is the only option and it is
- * not a multiple of 4 bytes.
- */
- optlen -= newopts[IPOPT_OLEN];
- newopts += newopts[IPOPT_OLEN];
- while (--optlen >= 0)
- *newopts++ = IPOPT_EOL;
- }
+ if (effective_tsl != NULL) {
+ label_rele(effective_tsl);
+ *effective_tslp = NULL;
}
- return (0);
-}
-
-/*
- * This does the bulk of the processing for setting IPPROTO_IP {T_,}IP_OPTIONS.
- */
-boolean_t
-tsol_option_set(uchar_t **opts, uint_t *optlen, uint_t labellen,
- const uchar_t *useropts, uint_t userlen)
-{
- int newlen;
- uchar_t *newopts;
-
- newlen = userlen + labellen;
- if (newlen > *optlen) {
- /* need more room */
- newopts = (uchar_t *)mi_alloc(newlen, BPRI_HI);
- if (newopts == NULL)
- return (B_FALSE);
- /*
- * The supplied *opts can't be NULL in this case,
- * since there's an existing label.
- */
- if (labellen > 0)
- bcopy(*opts, newopts, labellen);
- if (*opts != NULL)
- mi_free((char *)*opts);
- *opts = newopts;
- }
-
- if (newlen == 0) {
- /* special case -- no remaining IP options at all */
- if (*opts != NULL) {
- mi_free((char *)*opts);
- *opts = NULL;
- }
- } else if (userlen > 0) {
- /* merge in the user's options */
- newopts = *opts;
- if (labellen > 0) {
- int extra = labellen - newopts[IPOPT_OLEN];
-
- newopts += newopts[IPOPT_OLEN];
- while (--extra >= 0)
- *newopts++ = IPOPT_NOP;
- }
- bcopy(useropts, newopts, userlen);
- }
-
- *optlen = newlen;
- return (B_TRUE);
+ return (EINVAL);
}
diff --git a/usr/src/uts/common/inet/ip/tnet.c b/usr/src/uts/common/inet/ip/tnet.c
index 1e5c0eb170..262d5bc339 100644
--- a/usr/src/uts/common/inet/ip/tnet.c
+++ b/usr/src/uts/common/inet/ip/tnet.c
@@ -133,16 +133,7 @@ int tsol_strict_error;
* - A set of route-related attributes that only get set for prefix
* IREs. If this is non-NULL, the prefix IRE has been associated
* with a set of gateway security attributes by way of route add/
- * change functionality. This field stays NULL for IRE_CACHEs.
- *
- * igsa_gcgrp
- *
- * - Group of gc's which only gets set for IRE_CACHEs. Each of the gc
- * points to a gcdb record that contains the security attributes
- * used to perform the credential checks of the packet which uses
- * the IRE. If the group is not empty, the list of gc's can be
- * traversed starting at gcgrp_head. This field stays NULL for
- * prefix IREs.
+ * change functionality.
*/
static kmem_cache_t *ire_gw_secattr_cache;
@@ -223,7 +214,6 @@ ire_gw_secattr_constructor(void *buf, void *cdrarg, int kmflags)
attrp->igsa_rhc = NULL;
attrp->igsa_gc = NULL;
- attrp->igsa_gcgrp = NULL;
return (0);
}
@@ -257,14 +247,9 @@ ire_gw_secattr_free(tsol_ire_gw_secattr_t *attrp)
GC_REFRELE(attrp->igsa_gc);
attrp->igsa_gc = NULL;
}
- if (attrp->igsa_gcgrp != NULL) {
- GCGRP_REFRELE(attrp->igsa_gcgrp);
- attrp->igsa_gcgrp = NULL;
- }
ASSERT(attrp->igsa_rhc == NULL);
ASSERT(attrp->igsa_gc == NULL);
- ASSERT(attrp->igsa_gcgrp == NULL);
kmem_cache_free(ire_gw_secattr_cache, attrp);
}
@@ -387,9 +372,6 @@ rtsa_validate(const struct rtsa_s *rp)
/*
* A brief explanation of the reference counting scheme:
*
- * Prefix IREs have a non-NULL igsa_gc and a NULL igsa_gcgrp;
- * IRE_CACHEs have it vice-versa.
- *
* Apart from dynamic references due to to reference holds done
* actively by threads, we have the following references:
*
@@ -402,8 +384,6 @@ rtsa_validate(const struct rtsa_s *rp)
* to the gc_refcnt.
*
* gcgrp_refcnt:
- * - An IRE_CACHE that points to an igsa_gcgrp contributes a reference
- * to the gcgrp_refcnt of the associated tsol_gcgrp_t.
* - Every tsol_gc_t in the chain headed by tsol_gcgrp_t contributes
* a reference to the gcgrp_refcnt.
*/
@@ -613,7 +593,6 @@ gcgrp_inactive(tsol_gcgrp_t *gcgrp)
mod_hash_t *hashp;
ASSERT(MUTEX_HELD(&gcgrp_lock));
- ASSERT(!RW_LOCK_HELD(&gcgrp->gcgrp_rwlock));
ASSERT(gcgrp != NULL && gcgrp->gcgrp_refcnt == 0);
ASSERT(gcgrp->gcgrp_head == NULL && gcgrp->gcgrp_count == 0);
@@ -686,21 +665,21 @@ cipso_to_sl(const uchar_t *option, bslabel_t *sl)
}
/*
- * If present, parse a CIPSO label in the incoming packet and
- * construct a ts_label_t that reflects the CIPSO label and attach it
- * to the dblk cred. Later as the mblk flows up through the stack any
+ * If present, parse the CIPSO label in the incoming packet and
+ * construct a ts_label_t that reflects the CIPSO label and put it in
+ * the ip_recv_attr_t. Later as the packet flows up through the stack any
* code that needs to examine the packet label can inspect the label
- * from the dblk cred. This function is called right in ip_rput for
- * all packets, i.e. locally destined and to be forwarded packets. The
- * forwarding path needs to examine the label to determine how to
- * forward the packet.
+ * from the ira_tsl. This function is
+ * called right in ip_input for all packets, i.e. locally destined and
+ * to be forwarded packets. The forwarding path needs to examine the label
+ * to determine how to forward the packet.
*
* This routine pulls all message text up into the first mblk.
* For IPv4, only the first 20 bytes of the IP header are guaranteed
* to exist. For IPv6, only the IPv6 header is guaranteed to exist.
*/
boolean_t
-tsol_get_pkt_label(mblk_t *mp, int version)
+tsol_get_pkt_label(mblk_t *mp, int version, ip_recv_attr_t *ira)
{
tsol_tpc_t *src_rhtp = NULL;
uchar_t *opt_ptr = NULL;
@@ -713,7 +692,6 @@ tsol_get_pkt_label(mblk_t *mp, int version)
const void *src;
const ip6_t *ip6h;
cred_t *credp;
- pid_t cpid;
int proto;
ASSERT(DB_TYPE(mp) == M_DATA);
@@ -846,28 +824,37 @@ tsol_get_pkt_label(mblk_t *mp, int version)
return (B_FALSE);
}
- /* Make sure no other thread is messing with this mblk */
- ASSERT(DB_REF(mp) == 1);
- /* Preserve db_cpid */
- credp = msg_extractcred(mp, &cpid);
- if (credp == NULL) {
+ if (ira->ira_cred == NULL) {
credp = newcred_from_bslabel(&sl, doi, KM_NOSLEEP);
+ if (credp == NULL)
+ return (B_FALSE);
} else {
cred_t *newcr;
- newcr = copycred_from_bslabel(credp, &sl, doi,
+ newcr = copycred_from_bslabel(ira->ira_cred, &sl, doi,
KM_NOSLEEP);
- crfree(credp);
+ if (newcr == NULL)
+ return (B_FALSE);
+ if (ira->ira_free_flags & IRA_FREE_CRED) {
+ crfree(ira->ira_cred);
+ ira->ira_free_flags &= ~IRA_FREE_CRED;
+ ira->ira_cred = NULL;
+ }
credp = newcr;
}
- if (credp == NULL)
- return (B_FALSE);
- crgetlabel(credp)->tsl_flags |= label_flags;
-
- mblk_setcred(mp, credp, cpid);
- crfree(credp); /* mblk has ref on cred */
+ /*
+ * Put the label in ira_tsl for convenience, while keeping
+ * the cred in ira_cred for getpeerucred which is used to get
+ * labels with TX.
+ * Note: no explicit refcnt/free_flag for ira_tsl. The free_flag
+ * for IRA_FREE_CRED is sufficient for both.
+ */
+ ira->ira_tsl = crgetlabel(credp);
+ ira->ira_cred = credp;
+ ira->ira_free_flags |= IRA_FREE_CRED;
+ ira->ira_tsl->tsl_flags |= label_flags;
return (B_TRUE);
}
@@ -878,25 +865,25 @@ tsol_get_pkt_label(mblk_t *mp, int version)
*/
boolean_t
tsol_receive_local(const mblk_t *mp, const void *addr, uchar_t version,
- boolean_t shared_addr, const conn_t *connp)
+ ip_recv_attr_t *ira, const conn_t *connp)
{
const cred_t *credp;
ts_label_t *plabel, *conn_plabel;
tsol_tpc_t *tp;
boolean_t retv;
const bslabel_t *label, *conn_label;
+ boolean_t shared_addr = (ira->ira_flags & IRAF_TX_SHARED_ADDR);
/*
- * The cases in which this can happen are:
- * - IPv6 Router Alert, where ip_rput_data_v6 deliberately skips
- * over the label attachment process.
- * - MLD output looped-back to ourselves.
- * - IPv4 Router Discovery, where tsol_get_pkt_label intentionally
- * avoids the labeling process.
- * We trust that all valid paths in the code set the cred pointer when
- * needed.
+ * tsol_get_pkt_label intentionally avoids the labeling process for:
+ * - IPv6 router and neighbor discovery as well as redirects.
+ * - MLD packets. (Anything between ICMPv6 code 130 and 138.)
+ * - IGMP packets.
+ * - IPv4 router discovery.
+ * In those cases ira_cred is NULL.
*/
- if ((credp = msg_getcred(mp, NULL)) == NULL)
+ credp = ira->ira_cred;
+ if (credp == NULL)
return (B_TRUE);
/*
@@ -904,17 +891,18 @@ tsol_receive_local(const mblk_t *mp, const void *addr, uchar_t version,
* same zoneid as the selected destination, then no checks are
* necessary. Membership in the zone is enough proof. This is
* intended to be a hot path through this function.
+ * Note: Using crgetzone here is ok since the peer is local.
*/
if (!crisremote(credp) &&
crgetzone(credp) == crgetzone(connp->conn_cred))
return (B_TRUE);
- plabel = crgetlabel(credp);
+ plabel = ira->ira_tsl;
conn_plabel = crgetlabel(connp->conn_cred);
ASSERT(plabel != NULL && conn_plabel != NULL);
label = label2bslabel(plabel);
- conn_label = label2bslabel(crgetlabel(connp->conn_cred));
+ conn_label = label2bslabel(conn_plabel);
/*
@@ -954,12 +942,8 @@ tsol_receive_local(const mblk_t *mp, const void *addr, uchar_t version,
blequal(label, conn_label))
return (B_TRUE);
- /*
- * conn_zoneid is global for an exclusive stack, thus we use
- * conn_cred to get the zoneid
- */
if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) ||
- (crgetzoneid(connp->conn_cred) != GLOBAL_ZONEID &&
+ (!connp->conn_zone_is_global &&
(plabel->tsl_doi != conn_plabel->tsl_doi ||
!bldominates(conn_label, label)))) {
DTRACE_PROBE3(
@@ -1046,16 +1030,13 @@ tsol_receive_local(const mblk_t *mp, const void *addr, uchar_t version,
}
boolean_t
-tsol_can_accept_raw(mblk_t *mp, boolean_t check_host)
+tsol_can_accept_raw(mblk_t *mp, ip_recv_attr_t *ira, boolean_t check_host)
{
ts_label_t *plabel = NULL;
tsol_tpc_t *src_rhtp, *dst_rhtp;
boolean_t retv;
- cred_t *credp;
- credp = msg_getcred(mp, NULL);
- if (credp != NULL)
- plabel = crgetlabel(credp);
+ plabel = ira->ira_tsl;
/* We are bootstrapping or the internal template was never deleted */
if (plabel == NULL)
@@ -1144,7 +1125,7 @@ tsol_can_accept_raw(mblk_t *mp, boolean_t check_host)
* TSLF_UNLABELED flag is sufficient.
*/
boolean_t
-tsol_can_reply_error(const mblk_t *mp)
+tsol_can_reply_error(const mblk_t *mp, ip_recv_attr_t *ira)
{
ts_label_t *plabel = NULL;
tsol_tpc_t *rhtp;
@@ -1152,7 +1133,6 @@ tsol_can_reply_error(const mblk_t *mp)
const ip6_t *ip6h;
boolean_t retv;
bslabel_t *pktbs;
- cred_t *credp;
/* Caller must pull up at least the IP header */
ASSERT(MBLKL(mp) >= (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION ?
@@ -1161,9 +1141,7 @@ tsol_can_reply_error(const mblk_t *mp)
if (!tsol_strict_error)
return (B_TRUE);
- credp = msg_getcred(mp, NULL);
- if (credp != NULL)
- plabel = crgetlabel(credp);
+ plabel = ira->ira_tsl;
/* We are bootstrapping or the internal template was never deleted */
if (plabel == NULL)
@@ -1227,33 +1205,30 @@ tsol_can_reply_error(const mblk_t *mp)
}
/*
- * Finds the zone associated with the given packet. Returns GLOBAL_ZONEID if
- * the zone cannot be located.
+ * Finds the zone associated with the receive attributes. Returns GLOBAL_ZONEID
+ * if the zone cannot be located.
*
* This is used by the classifier when the packet matches an ALL_ZONES IRE, and
* there's no MLP defined.
*
* Note that we assume that this is only invoked in the ALL_ZONES case.
- * Handling other cases would require handle exclusive stack zones where either
+ * Handling other cases would require handling exclusive IP zones where either
* this routine or the callers would have to map from
* the zoneid (zone->zone_id) to what IP uses in conn_zoneid etc.
*/
zoneid_t
-tsol_packet_to_zoneid(const mblk_t *mp)
+tsol_attr_to_zoneid(const ip_recv_attr_t *ira)
{
- cred_t *cr = msg_getcred(mp, NULL);
zone_t *zone;
ts_label_t *label;
- if (cr != NULL) {
- if ((label = crgetlabel(cr)) != NULL) {
- zone = zone_find_by_label(label);
- if (zone != NULL) {
- zoneid_t zoneid = zone->zone_id;
+ if ((label = ira->ira_tsl) != NULL) {
+ zone = zone_find_by_label(label);
+ if (zone != NULL) {
+ zoneid_t zoneid = zone->zone_id;
- zone_rele(zone);
- return (zoneid);
- }
+ zone_rele(zone);
+ return (zoneid);
}
}
return (GLOBAL_ZONEID);
@@ -1273,7 +1248,7 @@ tsol_ire_match_gwattr(ire_t *ire, const ts_label_t *tsl)
/* Not in Trusted mode or IRE is local/loopback/broadcast/interface */
if (!is_system_labeled() ||
(ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK | IRE_BROADCAST |
- IRE_INTERFACE)))
+ IRE_IF_ALL | IRE_MULTICAST | IRE_NOROUTE)))
goto done;
/*
@@ -1304,29 +1279,16 @@ tsol_ire_match_gwattr(ire_t *ire, const ts_label_t *tsl)
mutex_enter(&attrp->igsa_lock);
/*
- * Depending on the IRE type (prefix vs. cache), we seek the group
+ * We seek the group
* structure which contains all security credentials of the gateway.
- * A prefix IRE is associated with at most one gateway credential,
- * while a cache IRE is associated with every credentials that the
- * gateway has.
+ * An offline IRE is associated with at most one gateway credential.
*/
- if ((gc = attrp->igsa_gc) != NULL) { /* prefix */
+ if ((gc = attrp->igsa_gc) != NULL) {
gcgrp = gc->gc_grp;
ASSERT(gcgrp != NULL);
rw_enter(&gcgrp->gcgrp_rwlock, RW_READER);
- } else if ((gcgrp = attrp->igsa_gcgrp) != NULL) { /* cache */
- rw_enter(&gcgrp->gcgrp_rwlock, RW_READER);
- gc = gcgrp->gcgrp_head;
- if (gc == NULL) {
- /* gc group is empty, so the drop lock now */
- ASSERT(gcgrp->gcgrp_count == 0);
- rw_exit(&gcgrp->gcgrp_rwlock);
- gcgrp = NULL;
- }
- }
-
- if (gcgrp != NULL)
GCGRP_REFHOLD(gcgrp);
+ }
if ((gw_rhc = attrp->igsa_rhc) != NULL) {
/*
@@ -1354,12 +1316,11 @@ tsol_ire_match_gwattr(ire_t *ire, const ts_label_t *tsl)
ASSERT(ga->ga_af == AF_INET6);
paddr = &ga->ga_addr;
}
- } else if (ire->ire_ipversion == IPV6_VERSION &&
- !IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6)) {
- paddr = &ire->ire_gateway_addr_v6;
- } else if (ire->ire_ipversion == IPV4_VERSION &&
- ire->ire_gateway_addr != INADDR_ANY) {
- paddr = &ire->ire_gateway_addr;
+ } else if (ire->ire_type & IRE_OFFLINK) {
+ if (ire->ire_ipversion == IPV6_VERSION)
+ paddr = &ire->ire_gateway_addr_v6;
+ else if (ire->ire_ipversion == IPV4_VERSION)
+ paddr = &ire->ire_gateway_addr;
}
/* We've found a gateway address to do the template lookup */
@@ -1408,6 +1369,7 @@ tsol_ire_match_gwattr(ire_t *ire, const ts_label_t *tsl)
}
if (gc != NULL) {
+
tsol_gcdb_t *gcdb;
/*
* In the case of IRE_CACHE we've got one or more gateway
@@ -1418,18 +1380,9 @@ tsol_ire_match_gwattr(ire_t *ire, const ts_label_t *tsl)
* just the route itself, so the loop is executed only once.
*/
ASSERT(gcgrp != NULL);
- do {
- gcdb = gc->gc_db;
- if (tsl->tsl_doi == gcdb->gcdb_doi &&
- _blinrange(&tsl->tsl_label, &gcdb->gcdb_slrange))
- break;
- if (ire->ire_type == IRE_CACHE)
- gc = gc->gc_next;
- else
- gc = NULL;
- } while (gc != NULL);
-
- if (gc == NULL) {
+ gcdb = gc->gc_db;
+ if (tsl->tsl_doi != gcdb->gcdb_doi ||
+ !_blinrange(&tsl->tsl_label, &gcdb->gcdb_slrange)) {
DTRACE_PROBE3(
tx__ip__log__drop__irematch__nogcmatched,
char *, "ire(1), tsl(2): all gc failed match",
@@ -1493,12 +1446,13 @@ done:
/*
* Performs label accreditation checks for packet forwarding.
+ * Add or remove a CIPSO option as needed.
*
* Returns a pointer to the modified mblk if allowed for forwarding,
* or NULL if the packet must be dropped.
*/
mblk_t *
-tsol_ip_forward(ire_t *ire, mblk_t *mp)
+tsol_ip_forward(ire_t *ire, mblk_t *mp, const ip_recv_attr_t *ira)
{
tsol_ire_gw_secattr_t *attrp = NULL;
ipha_t *ipha;
@@ -1516,11 +1470,14 @@ tsol_ip_forward(ire_t *ire, mblk_t *mp)
boolean_t need_tpc_rele = B_FALSE;
ipaddr_t *gw;
ip_stack_t *ipst = ire->ire_ipst;
- cred_t *credp;
- pid_t pid;
+ int err;
+ ts_label_t *effective_tsl = NULL;
ASSERT(ire != NULL && mp != NULL);
- ASSERT(ire->ire_stq != NULL);
+ /*
+ * Note that the ire is the first one found, i.e., an IRE_OFFLINK if
+ * the destination is offlink.
+ */
af = (ire->ire_ipversion == IPV4_VERSION) ? AF_INET : AF_INET6;
@@ -1530,16 +1487,6 @@ tsol_ip_forward(ire_t *ire, mblk_t *mp)
psrc = &ipha->ipha_src;
pdst = &ipha->ipha_dst;
proto = ipha->ipha_protocol;
-
- /*
- * off_link is TRUE if destination not directly reachable.
- * Surya note: we avoid creation of per-dst IRE_CACHE entries
- * for forwarded packets, so we set off_link to be TRUE
- * if the packet dst is different from the ire_addr of
- * the ire for the nexthop.
- */
- off_link = ((ipha->ipha_dst != ire->ire_addr) ||
- (ire->ire_gateway_addr != INADDR_ANY));
if (!tsol_get_option_v4(mp, &label_type, &opt_ptr))
return (NULL);
} else {
@@ -1561,14 +1508,15 @@ tsol_ip_forward(ire_t *ire, mblk_t *mp)
}
proto = *nexthdrp;
}
-
- /* destination not directly reachable? */
- off_link = !IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6);
if (!tsol_get_option_v6(mp, &label_type, &opt_ptr))
return (NULL);
}
+ /*
+ * off_link is TRUE if destination not directly reachable.
+ */
+ off_link = (ire->ire_type & IRE_OFFLINK);
- if ((tsl = msg_getlabel(mp)) == NULL)
+ if ((tsl = ira->ira_tsl) == NULL)
return (mp);
if (tsl->tsl_flags & TSLF_IMPLICIT_IN) {
@@ -1611,11 +1559,7 @@ tsol_ip_forward(ire_t *ire, mblk_t *mp)
attrp = ire->ire_gw_secattr;
gw_rhtp = attrp->igsa_rhc->rhc_tpc;
} else {
- /*
- * use the ire_addr if this is the IRE_CACHE of nexthop
- */
- gw = (ire->ire_gateway_addr == NULL? &ire->ire_addr :
- &ire->ire_gateway_addr);
+ gw = &ire->ire_gateway_addr;
gw_rhtp = find_tpc(gw, ire->ire_ipversion, B_FALSE);
need_tpc_rele = B_TRUE;
}
@@ -1702,7 +1646,13 @@ tsol_ip_forward(ire_t *ire, mblk_t *mp)
/* adjust is negative */
ASSERT((mp->b_wptr + adjust) >= mp->b_rptr);
mp->b_wptr += adjust;
-
+ /*
+ * Note that caller adjusts ira_pktlen and
+ * ira_ip_hdr_length
+ *
+ * For AF_INET6 note that tsol_remove_secopt_v6
+ * adjusted ip6_plen.
+ */
if (af == AF_INET) {
ipha = (ipha_t *)mp->b_rptr;
iplen = ntohs(ipha->ipha_length) + adjust;
@@ -1729,17 +1679,34 @@ tsol_ip_forward(ire_t *ire, mblk_t *mp)
(!off_link || gw_rhtp->tpc_tp.host_type == UNLABELED))
goto keep_label;
-
- credp = msg_getcred(mp, &pid);
- if ((af == AF_INET &&
- tsol_check_label(credp, &mp, CONN_MAC_DEFAULT, ipst, pid) != 0) ||
- (af == AF_INET6 &&
- tsol_check_label_v6(credp, &mp, CONN_MAC_DEFAULT, ipst,
- pid) != 0)) {
+ /*
+ * Since we are forwarding packets we use GLOBAL_ZONEID for
+ * the IRE lookup in tsol_check_label.
+ * Since mac_exempt is false the zoneid isn't used for anything
+ * but the IRE lookup, hence we set zone_is_global to false.
+ */
+ if (af == AF_INET) {
+ err = tsol_check_label_v4(tsl, GLOBAL_ZONEID, &mp,
+ CONN_MAC_DEFAULT, B_FALSE, ipst, &effective_tsl);
+ } else {
+ err = tsol_check_label_v6(tsl, GLOBAL_ZONEID, &mp,
+ CONN_MAC_DEFAULT, B_FALSE, ipst, &effective_tsl);
+ }
+ if (err != 0) {
+ BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
+ ip_drop_output("tsol_check_label", mp, NULL);
+ freemsg(mp);
mp = NULL;
goto keep_label;
}
+ /*
+ * The effective_tsl must never affect the routing decision, hence
+ * we ignore it here.
+ */
+ if (effective_tsl != NULL)
+ label_rele(effective_tsl);
+
if (af == AF_INET) {
ipha = (ipha_t *)mp->b_rptr;
ipha->ipha_hdr_checksum = 0;
@@ -1885,13 +1852,13 @@ tsol_rtsa_init(rt_msghdr_t *rtm, tsol_rtsecattr_t *sp, caddr_t cp)
}
int
-tsol_ire_init_gwattr(ire_t *ire, uchar_t ipversion, tsol_gc_t *gc,
- tsol_gcgrp_t *gcgrp)
+tsol_ire_init_gwattr(ire_t *ire, uchar_t ipversion, tsol_gc_t *gc)
{
tsol_ire_gw_secattr_t *attrp;
boolean_t exists = B_FALSE;
in_addr_t ga_addr4;
void *paddr = NULL;
+ tsol_gcgrp_t *gcgrp = NULL;
ASSERT(ire != NULL);
@@ -1917,20 +1884,16 @@ tsol_ire_init_gwattr(ire_t *ire, uchar_t ipversion, tsol_gc_t *gc,
if (attrp->igsa_gc != NULL)
GC_REFRELE(attrp->igsa_gc);
- if (attrp->igsa_gcgrp != NULL)
- GCGRP_REFRELE(attrp->igsa_gcgrp);
}
ASSERT(!exists || MUTEX_HELD(&attrp->igsa_lock));
/*
* References already held by caller and we keep them;
- * note that both gc and gcgrp may be set to NULL to
- * clear out igsa_gc and igsa_gcgrp, respectively.
+ * note that gc may be set to NULL to clear out igsa_gc.
*/
attrp->igsa_gc = gc;
- attrp->igsa_gcgrp = gcgrp;
- if (gcgrp == NULL && gc != NULL) {
+ if (gc != NULL) {
gcgrp = gc->gc_grp;
ASSERT(gcgrp != NULL);
}
@@ -1955,12 +1918,11 @@ tsol_ire_init_gwattr(ire_t *ire, uchar_t ipversion, tsol_gc_t *gc,
ASSERT(ga->ga_af == AF_INET6);
paddr = &ga->ga_addr;
}
- } else if (ipversion == IPV6_VERSION &&
- !IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6)) {
- paddr = &ire->ire_gateway_addr_v6;
- } else if (ipversion == IPV4_VERSION &&
- ire->ire_gateway_addr != INADDR_ANY) {
- paddr = &ire->ire_gateway_addr;
+ } else if (ire->ire_type & IRE_OFFLINK) {
+ if (ipversion == IPV6_VERSION)
+ paddr = &ire->ire_gateway_addr_v6;
+ else if (ipversion == IPV4_VERSION)
+ paddr = &ire->ire_gateway_addr;
}
/*
@@ -1990,7 +1952,7 @@ tsol_ire_init_gwattr(ire_t *ire, uchar_t ipversion, tsol_gc_t *gc,
* If we can't figure out what it is, then return mlptSingle. That's actually
* an error case.
*
- * The callers are assume to pass in zone->zone_id and not the zoneid that
+ * The callers are assumed to pass in zone->zone_id and not the zoneid that
* is stored in a conn_t (since the latter will be GLOBAL_ZONEID in an
* exclusive stack zone).
*/
@@ -2022,23 +1984,28 @@ tsol_mlp_addr_type(zoneid_t zoneid, uchar_t version, const void *addr,
version = IPV4_VERSION;
}
+ /* Check whether the IRE_LOCAL (or ipif) is ALL_ZONES */
if (version == IPV4_VERSION) {
in4 = *(const in_addr_t *)addr;
if ((in4 == INADDR_ANY) || CLASSD(in4)) {
return (mlptBoth);
}
- ire = ire_cache_lookup(in4, ip_zoneid, NULL, ipst);
+ ire = ire_ftable_lookup_v4(in4, 0, 0, IRE_LOCAL|IRE_LOOPBACK,
+ NULL, ip_zoneid, NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY,
+ 0, ipst, NULL);
} else {
if (IN6_IS_ADDR_UNSPECIFIED((const in6_addr_t *)addr) ||
IN6_IS_ADDR_MULTICAST((const in6_addr_t *)addr)) {
return (mlptBoth);
}
- ire = ire_cache_lookup_v6(addr, ip_zoneid, NULL, ipst);
+ ire = ire_ftable_lookup_v6(addr, 0, 0, IRE_LOCAL|IRE_LOOPBACK,
+ NULL, ip_zoneid, NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY,
+ 0, ipst, NULL);
}
/*
* If we can't find the IRE, then we have to behave exactly like
- * ip_bind_laddr{,_v6}. That means looking up the IPIF so that users
- * can bind to addresses on "down" interfaces.
+ * ip_laddr_verify_{v4,v6}. That means looking up the IPIF so that
+ * users can bind to addresses on "down" interfaces.
*
* If we can't find that either, then the bind is going to fail, so
* just give up. Note that there's a miniscule chance that the address
@@ -2047,10 +2014,10 @@ tsol_mlp_addr_type(zoneid_t zoneid, uchar_t version, const void *addr,
if (ire == NULL) {
if (version == IPV4_VERSION)
ipif = ipif_lookup_addr(*(const in_addr_t *)addr, NULL,
- ip_zoneid, NULL, NULL, NULL, NULL, ipst);
+ ip_zoneid, ipst);
else
ipif = ipif_lookup_addr_v6((const in6_addr_t *)addr,
- NULL, ip_zoneid, NULL, NULL, NULL, NULL, ipst);
+ NULL, ip_zoneid, ipst);
if (ipif == NULL) {
return (mlptSingle);
}
diff --git a/usr/src/uts/common/inet/ip2mac_impl.h b/usr/src/uts/common/inet/ip2mac_impl.h
index 19d0931441..9a09e14487 100644
--- a/usr/src/uts/common/inet/ip2mac_impl.h
+++ b/usr/src/uts/common/inet/ip2mac_impl.h
@@ -37,10 +37,10 @@ extern "C" {
#ifdef _KERNEL
-extern void nce_cb_dispatch(nce_t *);
-extern void nce_ip2mac_response(ip2mac_t *, nce_t *);
-extern void nce_cb_refhold_locked(nce_t *);
-extern void nce_cb_refrele(nce_t *);
+extern void ncec_cb_dispatch(ncec_t *);
+extern void ncec_ip2mac_response(ip2mac_t *, ncec_t *);
+extern void ncec_cb_refhold_locked(ncec_t *);
+extern void ncec_cb_refrele(ncec_t *);
#endif /* _KERNEL */
diff --git a/usr/src/uts/common/inet/ip6.h b/usr/src/uts/common/inet/ip6.h
index 5408ab9e55..10c6c81ba2 100644
--- a/usr/src/uts/common/inet/ip6.h
+++ b/usr/src/uts/common/inet/ip6.h
@@ -57,105 +57,12 @@ typedef enum {
IP6_SCOPE_GLOBAL
} in6addr_scope_t;
-#ifdef _KERNEL
+/* From RFC 3542 - setting for IPV6_USE_MIN_MTU socket option */
+#define IPV6_USE_MIN_MTU_MULTICAST -1 /* Default */
+#define IPV6_USE_MIN_MTU_NEVER 0
+#define IPV6_USE_MIN_MTU_ALWAYS 1
-/*
- * Private header used between the transports and IP to carry the content
- * of the options IPV6_PKTINFO/IPV6_RECVPKTINFO (the interface index only)
- * and IPV6_NEXTHOP.
- * Also used to specify that raw sockets do not want the UDP/TCP transport
- * checksums calculated in IP (akin to IP_HDR_INCLUDED) and provide for
- * IPV6_CHECKSUM on the transmit side (using ip6i_checksum_off).
- *
- * When this header is used it must be the first header in the packet i.e.
- * before the real ip6 header. The use of a next header value of 255
- * (IPPROTO_RAW) in this header indicates its presence. Note that
- * ip6_nxt = IPPROTO_RAW indicates that "this" header is ip6_info - the
- * next header is always IPv6.
- *
- * Note that ip6i_nexthop is at the same offset as ip6_dst so that
- * this header can be kept in the packet while the it passes through
- * ip_newroute* and the ndp code. Those routines will use ip6_dst for
- * resolution.
- *
- * Implementation offset assumptions about ip6_info_t and ip6_t fields
- * and their alignments shown in figure below
- *
- * ip6_info (Private headers from transports to IP) header below
- * _______________________________________________________________ _ _ _ _ _
- * | .... | ip6i_nxt (255)| ......................|ip6i_nexthop| ...ip6_t.
- * --------------------------------------------------------------- - - - - -
- * ^ ^
- * <---- >| same offset for {ip6i_nxt,ip6_nxt} ^
- * ^ ^
- * <------^-------------------------------------->| same offset for
- * ^ ^ {ip6i_nxthop,ip6_dst}
- * _______________________________________________________________ _ _ _
- * | .... | ip6_nxt | ......................|ip6_dst | .other hdrs...
- * --------------------------------------------------------------- - - -
- * ip6_t (IPv6 protocol) header above
- */
-struct ip6_info {
- union {
- struct ip6_info_ctl {
- uint32_t ip6i_un1_flow;
- uint16_t ip6i_un1_plen; /* payload length */
- uint8_t ip6i_un1_nxt; /* next header */
- uint8_t ip6i_un1_hlim; /* hop limit */
- } ip6i_un1;
- } ip6i_ctlun;
- int ip6i_flags; /* See below */
- int ip6i_ifindex;
- int ip6i_checksum_off;
- int ip6i_pad;
- in6_addr_t ip6i_nexthop; /* Same offset as ip6_dst */
-};
-typedef struct ip6_info ip6i_t;
-
-#define ip6i_flow ip6i_ctlun.ip6i_un1.ip6i_un1_flow
-#define ip6i_vcf ip6i_flow /* Version, class, flow */
-#define ip6i_nxt ip6i_ctlun.ip6i_un1.ip6i_un1_nxt
-#define ip6i_hops ip6i_ctlun.ip6i_un1.ip6i_un1_hlim
-
-/* ip6_info flags */
-#define IP6I_IFINDEX 0x1 /* ip6i_ifindex is set (to nonzero value) */
-#define IP6I_NEXTHOP 0x2 /* ip6i_nexthop is different than ip6_dst */
-#define IP6I_NO_ULP_CKSUM 0x4
- /*
- * Do not generate TCP/UDP/SCTP transport checksum.
- * Used by raw sockets. Does not affect the
- * generation of transport checksums for ICMPv6
- * since such packets always arrive through
- * a raw socket.
- */
-#define IP6I_UNSPEC_SRC 0x8
- /* Used to carry conn_unspec_src through ip_newroute* */
-#define IP6I_RAW_CHECKSUM 0x10
- /* Compute checksum and stuff in ip6i_checksum_off */
-#define IP6I_VERIFY_SRC 0x20 /* Verify ip6_src. Used when IPV6_PKTINFO */
-#define IP6I_IPMP_PROBE 0x40 /* IPMP (in.mpathd) probe packet */
- /* 0x80 - 0x100 available */
-#define IP6I_DONTFRAG 0x200 /* Don't fragment this packet */
-#define IP6I_HOPLIMIT 0x400 /* hoplimit has been set by the sender */
-
-/*
- * These constants refer to the IPV6_USE_MIN_MTU API. The
- * actually values used in the API are these values shifted down
- * 10 bits minus 2 [-1, 1]. 0 (-2 after conversion) is considered
- * the same as the default (-1). IP6I_API_USE_MIN_MTU(f, x) returns
- * the flags field updated with min mtu. IP6I_USE_MIN_MTU_API takes the
- * field and returns the API value (+ the -2 value).
- */
-#define IP6I_USE_MIN_MTU_UNICAST 0x400
-#define IP6I_USE_MIN_MTU_ALWAYS 0x800
-#define IP6I_USE_MIN_MTU_NEVER 0xC00
-#define IP6I_USE_MIN_MTU_API(x) ((((x) & 0xC00) >> 10) - 2)
-#define IP6I_API_USE_MIN_MTU(f, x) (((f) & ~0xC00) &\
- ((((x) + 2) & 0x3) << 11))
-#define IPV6_USE_MIN_MTU_DEFAULT -2
-#define IPV6_USE_MIN_MTU_UNICAST -1
-#define IPV6_USE_MIN_MTU_ALWAYS 0
-#define IPV6_USE_MIN_MTU_NEVER 1
+#ifdef _KERNEL
/* Extract the scope from a multicast address */
#ifdef _BIG_ENDIAN
@@ -195,28 +102,18 @@ typedef struct ip6_info ip6i_t;
#define MIN_EHDR_LEN 8
#define MAX_EHDR_LEN 2048
-/*
- * The high-order bit of the version field is used by the transports to
- * indicate a reachability confirmation to IP.
- */
-#define IP_FORWARD_PROG_BIT 0x8
-
#ifdef _BIG_ENDIAN
#define IPV6_DEFAULT_VERS_AND_FLOW 0x60000000
#define IPV6_VERS_AND_FLOW_MASK 0xF0000000
-#define IP_FORWARD_PROG ((uint32_t)IP_FORWARD_PROG_BIT << 28)
#define V6_MCAST 0xFF000000
#define V6_LINKLOCAL 0xFE800000
#define IPV6_FLOW_TCLASS(x) (((x) & IPV6_FLOWINFO_TCLASS) >> 20)
#define IPV6_TCLASS_FLOW(f, c) (((f) & ~IPV6_FLOWINFO_TCLASS) |\
((c) << 20))
-
#else
#define IPV6_DEFAULT_VERS_AND_FLOW 0x00000060
#define IPV6_VERS_AND_FLOW_MASK 0x000000F0
-#define IP_FORWARD_PROG ((uint32_t)IP_FORWARD_PROG_BIT << 4)
-
#define V6_MCAST 0x000000FF
#define V6_LINKLOCAL 0x000080FE
@@ -328,71 +225,66 @@ extern const in6_addr_t ipv6_unspecified_group;
* FUNCTION PROTOTYPES
*/
-struct ipsec_out_s;
-
extern void convert2ascii(char *buf, const in6_addr_t *addr);
extern char *inet_ntop(int, const void *, char *, int);
extern int inet_pton(int, char *, void *);
-extern void icmp_time_exceeded_v6(queue_t *, mblk_t *, uint8_t,
- boolean_t, boolean_t, zoneid_t, ip_stack_t *);
-extern void icmp_unreachable_v6(queue_t *, mblk_t *, uint8_t,
- boolean_t, boolean_t, zoneid_t, ip_stack_t *);
-extern void icmp_inbound_error_fanout_v6(queue_t *, mblk_t *, ip6_t *,
- icmp6_t *, ill_t *, ill_t *, boolean_t, zoneid_t);
-extern boolean_t conn_wantpacket_v6(conn_t *, ill_t *, ip6_t *, int, zoneid_t);
-extern mblk_t *ip_add_info_v6(mblk_t *, ill_t *, const in6_addr_t *);
+extern void icmp_param_problem_nexthdr_v6(mblk_t *, boolean_t,
+ ip_recv_attr_t *);
+extern void icmp_pkt2big_v6(mblk_t *, uint32_t, boolean_t,
+ ip_recv_attr_t *);
+extern void icmp_time_exceeded_v6(mblk_t *, uint8_t, boolean_t,
+ ip_recv_attr_t *);
+extern void icmp_unreachable_v6(mblk_t *, uint8_t, boolean_t,
+ ip_recv_attr_t *);
+extern mblk_t *icmp_inbound_v6(mblk_t *, ip_recv_attr_t *);
+extern void icmp_inbound_error_fanout_v6(mblk_t *, icmp6_t *,
+ ip_recv_attr_t *);
+extern void icmp_update_out_mib_v6(ill_t *, icmp6_t *);
+
+extern boolean_t conn_wantpacket_v6(conn_t *, ip_recv_attr_t *, ip6_t *);
+
extern in6addr_scope_t ip_addr_scope_v6(const in6_addr_t *);
-extern mblk_t *ip_bind_v6(queue_t *, mblk_t *, conn_t *, ip6_pkt_t *);
-extern void ip_build_hdrs_v6(uchar_t *, uint_t, ip6_pkt_t *, uint8_t);
-extern int ip_fanout_send_icmp_v6(queue_t *, mblk_t *, uint_t,
- uint_t, uint8_t, uint_t, boolean_t, zoneid_t, ip_stack_t *);
-extern int ip_find_hdr_v6(mblk_t *, ip6_t *, ip6_pkt_t *, uint8_t *);
-extern in6_addr_t ip_get_dst_v6(ip6_t *, mblk_t *, boolean_t *);
+extern void ip_build_hdrs_v6(uchar_t *, uint_t, const ip_pkt_t *, uint8_t,
+ uint32_t);
+extern void ip_fanout_udp_multi_v6(mblk_t *, ip6_t *, uint16_t, uint16_t,
+ ip_recv_attr_t *);
+extern void ip_fanout_send_icmp_v6(mblk_t *, uint_t, uint8_t,
+ ip_recv_attr_t *);
+extern void ip_fanout_proto_v6(mblk_t *, ip6_t *, ip_recv_attr_t *);
+extern int ip_find_hdr_v6(mblk_t *, ip6_t *, boolean_t, ip_pkt_t *,
+ uint8_t *);
+extern in6_addr_t ip_get_dst_v6(ip6_t *, const mblk_t *, boolean_t *);
extern ip6_rthdr_t *ip_find_rthdr_v6(ip6_t *, uint8_t *);
-extern int ip_hdr_complete_v6(ip6_t *, zoneid_t, ip_stack_t *);
extern boolean_t ip_hdr_length_nexthdr_v6(mblk_t *, ip6_t *,
uint16_t *, uint8_t **);
extern int ip_hdr_length_v6(mblk_t *, ip6_t *);
-extern int ip_check_v6_mblk(mblk_t *, ill_t *);
extern uint32_t ip_massage_options_v6(ip6_t *, ip6_rthdr_t *, netstack_t *);
-extern void ip_wput_frag_v6(mblk_t *, ire_t *, uint_t, conn_t *, int, int);
-extern void ip_wput_ipsec_out_v6(queue_t *, mblk_t *, ip6_t *, ill_t *,
- ire_t *);
-extern int ip_total_hdrs_len_v6(ip6_pkt_t *);
+extern void ip_forward_xmit_v6(nce_t *, mblk_t *, ip6_t *, ip_recv_attr_t *,
+ uint32_t, uint32_t);
+extern mblk_t *ip_fraghdr_add_v6(mblk_t *, uint32_t, ip_xmit_attr_t *);
+extern int ip_fragment_v6(mblk_t *, nce_t *, iaflags_t, uint_t, uint32_t,
+ uint32_t, zoneid_t, zoneid_t, pfirepostfrag_t postfragfn,
+ uintptr_t *ixa_cookie);
+extern int ip_process_options_v6(mblk_t *, ip6_t *,
+ uint8_t *, uint_t, uint8_t, ip_recv_attr_t *);
+extern void ip_process_rthdr(mblk_t *, ip6_t *, ip6_rthdr_t *,
+ ip_recv_attr_t *);
+extern int ip_total_hdrs_len_v6(const ip_pkt_t *);
+extern mblk_t *ipsec_early_ah_v6(mblk_t *, ip_recv_attr_t *);
extern int ipsec_ah_get_hdr_size_v6(mblk_t *, boolean_t);
-extern void ip_wput_v6(queue_t *, mblk_t *);
-extern void ip_wput_local_v6(queue_t *, ill_t *, ip6_t *, mblk_t *,
- ire_t *, int, zoneid_t);
-extern void ip_output_v6(void *, mblk_t *, void *, int);
-extern void ip_xmit_v6(mblk_t *, ire_t *, uint_t, conn_t *, int,
- struct ipsec_out_s *);
+extern void ip_send_potential_redirect_v6(mblk_t *, ip6_t *, ire_t *,
+ ip_recv_attr_t *);
extern void ip_rput_v6(queue_t *, mblk_t *);
-extern void ip_rput_data_v6(queue_t *, ill_t *, mblk_t *, ip6_t *,
- uint_t, mblk_t *, mblk_t *);
-extern void mld_input(queue_t *, mblk_t *, ill_t *);
+extern mblk_t *mld_input(mblk_t *, ip_recv_attr_t *);
extern void mld_joingroup(ilm_t *);
extern void mld_leavegroup(ilm_t *);
extern void mld_timeout_handler(void *);
extern void pr_addr_dbg(char *, int, const void *);
-extern int ip_multirt_apply_membership_v6(int (*fn)(conn_t *, boolean_t,
- const in6_addr_t *, int, mcast_record_t, const in6_addr_t *, mblk_t *),
- ire_t *, conn_t *, boolean_t, const in6_addr_t *, mcast_record_t,
- const in6_addr_t *, mblk_t *);
-extern void ip_newroute_ipif_v6(queue_t *, mblk_t *, ipif_t *,
- const in6_addr_t *, const in6_addr_t *, int, zoneid_t);
-extern void ip_newroute_v6(queue_t *, mblk_t *, const in6_addr_t *,
- const in6_addr_t *, ill_t *, zoneid_t, ip_stack_t *);
extern void *ip6_kstat_init(netstackid_t, ip6_stat_t *);
extern void ip6_kstat_fini(netstackid_t, kstat_t *);
-extern size_t ip6_get_src_preferences(conn_t *, uint32_t *);
-extern int ip6_set_src_preferences(conn_t *, uint32_t);
-extern int ip6_set_pktinfo(cred_t *, conn_t *, struct in6_pktinfo *);
-extern int ip_proto_bind_laddr_v6(conn_t *, mblk_t **, uint8_t,
- const in6_addr_t *, uint16_t, boolean_t);
-extern int ip_proto_bind_connected_v6(conn_t *, mblk_t **,
- uint8_t, in6_addr_t *, uint16_t, const in6_addr_t *, ip6_pkt_t *,
- uint16_t, boolean_t, boolean_t, cred_t *);
+extern size_t ip6_get_src_preferences(ip_xmit_attr_t *, uint32_t *);
+extern int ip6_set_src_preferences(ip_xmit_attr_t *, uint32_t);
#endif /* _KERNEL */
diff --git a/usr/src/uts/common/inet/ip_arp.h b/usr/src/uts/common/inet/ip_arp.h
new file mode 100644
index 0000000000..2cb7e7a05a
--- /dev/null
+++ b/usr/src/uts/common/inet/ip_arp.h
@@ -0,0 +1,136 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _IP_ARP_H
+#define _IP_ARP_H
+
+/*
+ * Data-structures and functions related to the IP STREAMS queue that handles
+ * packets with the SAP set to 0x806 (ETHERTYPE_ARP).
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/types.h>
+#include <inet/ip.h>
+#include <inet/ip_ndp.h>
+#include <sys/stream.h>
+
+#ifdef _KERNEL
+extern struct streamtab dummymodinfo;
+
+struct arl_ill_common_s;
+/*
+ * The arl_s structure tracks the state of the associated ARP stream.
+ */
+typedef struct arl_s {
+ queue_t *arl_rq;
+ queue_t *arl_wq;
+ ip_stack_t *arl_ipst;
+ zoneid_t arl_zoneid;
+ cred_t *arl_credp;
+ ip_m_t arl_media;
+ struct arl_ill_common_s *arl_common;
+ int arl_muxid;
+ uint_t arl_ppa;
+ t_uscalar_t arl_sap;
+ t_uscalar_t arl_sap_length;
+ uint_t arl_phys_addr_length;
+ char *arl_name;
+ int arl_name_length;
+ t_uscalar_t arl_mactype;
+#define arl_first_mp_to_free arl_dlpi_deferred
+ mblk_t *arl_dlpi_deferred;
+ mblk_t *arl_unbind_mp;
+ mblk_t *arl_detach_mp;
+#define arl_last_mp_to_free arl_detach_mp
+ uint_t arl_state_flags;
+ uint_t
+ arl_needs_attach:1,
+ arl_dlpi_style_set:1,
+ arl_pad_to_bit_31:30;
+ uint_t arl_refcnt;
+ kcondvar_t arl_cv;
+ t_uscalar_t arl_dlpi_pending;
+ kmutex_t arl_lock;
+ int arl_error;
+} arl_t;
+
+/*
+ * The arl_ill_common_t structure is a super-structure that contains pointers
+ * to a pair of matching ill_t, arl_t structures. Given an arl_t (or
+ * ill_t) the corresponding ill_t (or arl_t) must be obtained by
+ * synchronizing on the ai_lock, and ensuring that the desired ill/arl
+ * pointer is non-null, not condemned. The arl_ill_common_t is allocated in
+ * arl_init() and freed only when both the ill_t and the arl_t structures
+ * become NULL.
+ * Lock hierarchy: the ai_lock must be taken before the ill_lock or arl_lock.
+ */
+
+typedef struct arl_ill_common_s {
+ kmutex_t ai_lock;
+ ill_t *ai_ill;
+ arl_t *ai_arl;
+ kcondvar_t ai_ill_unplumb_done; /* sent from ip_modclose() */
+} arl_ill_common_t;
+
+extern boolean_t arp_no_defense;
+
+extern struct module_info arp_mod_info;
+extern int arp_ll_up(ill_t *);
+extern int arp_ll_down(ill_t *);
+extern boolean_t arp_announce(ncec_t *);
+extern boolean_t arp_probe(ncec_t *);
+extern int arp_request(ncec_t *, in_addr_t, ill_t *);
+extern void arp_failure(mblk_t *, ip_recv_attr_t *);
+extern int arl_wait_for_info_ack(arl_t *);
+extern int arl_init(queue_t *, arl_t *);
+extern void arl_set_muxid(ill_t *, int);
+extern int arl_get_muxid(ill_t *);
+extern void arp_send_replumb_conf(ill_t *);
+extern void arp_unbind_complete(ill_t *);
+extern ill_t *arl_to_ill(arl_t *);
+#endif
+
+#define ARP_RETRANS_TIMER 500 /* time in milliseconds */
+
+/* The following are arl_state_flags */
+#define ARL_LL_SUBNET_PENDING 0x01 /* Waiting for DL_INFO_ACK from drv */
+#define ARL_CONDEMNED 0x02 /* No more new ref's to the ILL */
+#define ARL_DL_UNBIND_IN_PROGRESS 0x04 /* UNBIND_REQ is sent */
+#define ARL_LL_BIND_PENDING 0x0020 /* BIND sent */
+#define ARL_LL_UP 0x0040 /* BIND acked */
+#define ARL_LL_DOWN 0x0080
+#define ARL_LL_UNBOUND 0x0100 /* UNBIND acked */
+#define ARL_LL_REPLUMBING 0x0200 /* replumb in progress */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _IP_ARP_H */
diff --git a/usr/src/uts/common/inet/ip_ftable.h b/usr/src/uts/common/inet/ip_ftable.h
index 6a3a05183b..d8fa9e566d 100644
--- a/usr/src/uts/common/inet/ip_ftable.h
+++ b/usr/src/uts/common/inet/ip_ftable.h
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -56,7 +56,7 @@ struct rt_entry {
*
* The comment below (and for other netstack_t references) refers
* to the fact that we only do netstack_hold in particular cases,
- * such as the references from open streams (ill_t and conn_t's
+ * such as the references from open endpoints (ill_t and conn_t's
* pointers). Internally within IP we rely on IP's ability to cleanup e.g.
* ire_t's when an ill goes away.
*/
@@ -74,26 +74,8 @@ int rtfunc(struct radix_node *, void *);
typedef struct rt_entry rt_t;
typedef struct rtfuncarg rtf_t;
-/* For ire_forward() */
-enum ire_forward_action {
- Forward_ok, /* OK to use this IRE to forward */
- Forward_check_multirt, /* CGTP multirt check required */
- Forward_ret_icmp_err, /* Callers to return an ICMP error */
- Forward_blackhole /* Packet is silently discarded */
-};
-
struct ts_label_s;
-extern ire_t *ire_ftable_lookup(ipaddr_t, ipaddr_t, ipaddr_t, int,
- const ipif_t *, ire_t **, zoneid_t, uint32_t,
- const struct ts_label_s *, int, ip_stack_t *);
-extern ire_t *ire_lookup_multi(ipaddr_t, zoneid_t, ip_stack_t *);
-extern ire_t *ipif_lookup_multi_ire(ipif_t *, ipaddr_t);
extern void ire_delete_host_redirects(ipaddr_t, ip_stack_t *);
-extern ire_t *ire_ihandle_lookup_onlink(ire_t *);
-extern ire_t *ire_forward(ipaddr_t, enum ire_forward_action *, ire_t *,
- ire_t *, const struct ts_label_s *, ip_stack_t *);
-extern ire_t *ire_forward_simple(ipaddr_t, enum ire_forward_action *,
- ip_stack_t *);
extern irb_t *ire_get_bucket(ire_t *);
extern uint_t ifindex_lookup(const struct sockaddr *, zoneid_t);
extern int ipfil_sendpkt(const struct sockaddr *, mblk_t *, uint_t, zoneid_t);
diff --git a/usr/src/uts/common/inet/ip_if.h b/usr/src/uts/common/inet/ip_if.h
index 80a9f74e88..d081d9256b 100644
--- a/usr/src/uts/common/inet/ip_if.h
+++ b/usr/src/uts/common/inet/ip_if.h
@@ -80,12 +80,12 @@ extern "C" {
#define IFF_PHYINTINST_FLAGS (IFF_DEBUG|IFF_NOTRAILERS|IFF_NOARP| \
IFF_MULTICAST|IFF_ROUTER|IFF_NONUD|IFF_NORTEXCH|IFF_IPV4|IFF_IPV6| \
- IFF_XRESOLV|IFF_COS_ENABLED)
+ IFF_COS_ENABLED|IFF_FIXEDMTU)
#define IFF_LOGINT_FLAGS (IFF_UP|IFF_BROADCAST|IFF_POINTOPOINT| \
IFF_UNNUMBERED|IFF_DHCPRUNNING|IFF_PRIVATE|IFF_NOXMIT|IFF_NOLOCAL| \
IFF_DEPRECATED|IFF_ADDRCONF|IFF_ANYCAST|IFF_NOFAILOVER| \
- IFF_PREFERRED|IFF_TEMPORARY|IFF_FIXEDMTU|IFF_DUPLICATE)
+ IFF_PREFERRED|IFF_TEMPORARY|IFF_DUPLICATE)
#define PHYI_LOOPBACK IFF_LOOPBACK /* is a loopback net */
#define PHYI_RUNNING IFF_RUNNING /* resources allocated */
@@ -109,8 +109,8 @@ extern "C" {
#define ILLF_NORTEXCH IFF_NORTEXCH /* No routing info exchange */
#define ILLF_IPV4 IFF_IPV4 /* IPv4 interface */
#define ILLF_IPV6 IFF_IPV6 /* IPv6 interface */
-#define ILLF_XRESOLV IFF_XRESOLV /* IPv6 external resolver */
#define ILLF_COS_ENABLED IFF_COS_ENABLED /* Is CoS marking supported */
+#define ILLF_FIXEDMTU IFF_FIXEDMTU /* set with SIOCSLIFMTU */
#define IPIF_UP IFF_UP /* interface is up */
#define IPIF_BROADCAST IFF_BROADCAST /* broadcast address valid */
@@ -126,7 +126,6 @@ extern "C" {
#define IPIF_NOFAILOVER IFF_NOFAILOVER /* No failover on NIC failure */
#define IPIF_PREFERRED IFF_PREFERRED /* Prefer as source address */
#define IPIF_TEMPORARY IFF_TEMPORARY /* RFC3041 */
-#define IPIF_FIXEDMTU IFF_FIXEDMTU /* set with SIOCSLIFMTU */
#define IPIF_DUPLICATE IFF_DUPLICATE /* address is in use */
#ifdef DEBUG
@@ -135,6 +134,12 @@ extern "C" {
#define ILL_MAC_PERIM_HELD(ill)
#endif
+/*
+ * match flags for ipif_lookup_addr_common* functions
+ */
+#define IPIF_MATCH_ILLGRP 0x00000001
+#define IPIF_MATCH_NONDUP 0x00000002
+
/* for ipif_resolver_up */
enum ip_resolver_action {
Res_act_initial, /* initial address establishment */
@@ -143,134 +148,144 @@ enum ip_resolver_action {
Res_act_none /* do nothing */
};
-extern mblk_t *ill_arp_alloc(ill_t *, const uchar_t *, caddr_t);
-extern mblk_t *ipif_area_alloc(ipif_t *, uint_t);
-extern mblk_t *ipif_ared_alloc(ipif_t *);
-extern mblk_t *ill_ared_alloc(ill_t *, ipaddr_t);
-extern mblk_t *ill_arie_alloc(ill_t *, const char *, const void *);
-extern boolean_t ill_dlpi_pending(ill_t *, t_uscalar_t);
+extern int ill_add_ires(ill_t *);
+extern void ill_delete_ires(ill_t *);
extern void ill_dlpi_done(ill_t *, t_uscalar_t);
+extern boolean_t ill_dlpi_pending(ill_t *, t_uscalar_t);
+extern void ill_dlpi_dispatch(ill_t *, mblk_t *);
extern void ill_dlpi_send(ill_t *, mblk_t *);
extern void ill_dlpi_send_deferred(ill_t *);
+extern void ill_dlpi_queue(ill_t *, mblk_t *);
+extern void ill_dlpi_send_queued(ill_t *);
+extern void ill_mcast_queue(ill_t *, mblk_t *);
+extern void ill_mcast_send_queued(ill_t *);
+extern void ill_mcast_timer_start(ip_stack_t *);
extern void ill_capability_done(ill_t *);
extern mblk_t *ill_dlur_gen(uchar_t *, uint_t, t_uscalar_t, t_scalar_t);
/* NOTE: Keep unmodified ill_lookup_on_ifindex for ipp for now */
-extern ill_t *ill_lookup_on_ifindex_global_instance(uint_t, boolean_t,
- queue_t *, mblk_t *, ipsq_func_t, int *);
-extern ill_t *ill_lookup_on_ifindex(uint_t, boolean_t, queue_t *, mblk_t *,
- ipsq_func_t, int *, ip_stack_t *);
-extern ill_t *ill_lookup_on_name(char *, boolean_t,
- boolean_t, queue_t *, mblk_t *, ipsq_func_t, int *, boolean_t *,
+extern ill_t *ill_lookup_on_ifindex_global_instance(uint_t, boolean_t);
+extern ill_t *ill_lookup_on_ifindex(uint_t, boolean_t, ip_stack_t *);
+extern ill_t *ill_lookup_on_ifindex_zoneid(uint_t, zoneid_t, boolean_t,
ip_stack_t *);
+extern ill_t *ill_lookup_on_name(char *, boolean_t,
+ boolean_t, boolean_t *, ip_stack_t *);
+extern boolean_t ip_ifindex_valid(uint_t, boolean_t, ip_stack_t *);
extern uint_t ill_get_next_ifindex(uint_t, boolean_t, ip_stack_t *);
extern uint_t ill_get_ifindex_by_name(char *, ip_stack_t *);
-extern void ill_grp_cache_delete(ire_t *, char *);
-extern void ill_ipif_cache_delete(ire_t *, char *);
-extern void ill_stq_cache_delete(ire_t *, char *);
+extern uint_t ill_get_upper_ifindex(const ill_t *);
extern void ill_delete(ill_t *);
extern void ill_delete_tail(ill_t *);
extern int ill_dl_phys(ill_t *, ipif_t *, mblk_t *, queue_t *);
-extern int ill_dls_info(struct sockaddr_dl *, const ipif_t *);
+extern int ill_dls_info(struct sockaddr_dl *, const ill_t *);
extern void ill_fastpath_ack(ill_t *, mblk_t *);
-extern void ill_fastpath_nack(ill_t *);
extern int ill_fastpath_probe(ill_t *, mblk_t *);
-extern void ill_fastpath_flush(ill_t *);
extern int ill_forward_set(ill_t *, boolean_t);
extern void ill_frag_prune(ill_t *, uint_t);
extern void ill_frag_free_pkts(ill_t *, ipfb_t *, ipf_t *, int);
extern time_t ill_frag_timeout(ill_t *, time_t);
extern int ill_init(queue_t *, ill_t *);
-extern void ill_refresh_bcast(ill_t *);
extern void ill_restart_dad(ill_t *, boolean_t);
extern void ill_setdefaulttoken(ill_t *);
extern void ill_setdesttoken(ill_t *);
+extern void ill_set_inputfn(ill_t *);
+extern void ill_set_inputfn_all(ip_stack_t *);
extern int ill_set_phys_addr(ill_t *, mblk_t *);
extern int ill_replumb(ill_t *, mblk_t *);
extern void ill_set_ndmp(ill_t *, mblk_t *, uint_t, uint_t);
-extern mblk_t *ill_pending_mp_get(ill_t *, conn_t **, uint_t);
-extern boolean_t ill_pending_mp_add(ill_t *, conn_t *, mblk_t *);
extern boolean_t ill_is_freeable(ill_t *ill);
extern void ill_refhold(ill_t *);
extern void ill_refhold_locked(ill_t *);
-extern int ill_check_and_refhold(ill_t *);
+extern boolean_t ill_check_and_refhold(ill_t *);
extern void ill_refrele(ill_t *);
extern boolean_t ill_waiter_inc(ill_t *);
extern void ill_waiter_dcr(ill_t *);
extern void ill_trace_ref(ill_t *);
extern void ill_untrace_ref(ill_t *);
+extern void ill_downi(ire_t *, char *);
+extern void ill_downi_if_clone(ire_t *, char *);
extern boolean_t ill_down_start(queue_t *, mblk_t *);
+extern ill_t *ill_lookup_group_v4(ipaddr_t, zoneid_t,
+ ip_stack_t *, boolean_t *, ipaddr_t *);
extern ill_t *ill_lookup_group_v6(const in6_addr_t *, zoneid_t,
- ip_stack_t *);
+ ip_stack_t *, boolean_t *, in6_addr_t *);
extern void ill_capability_ack(ill_t *, mblk_t *);
extern void ill_capability_probe(ill_t *);
extern void ill_capability_reset(ill_t *, boolean_t);
extern void ill_taskq_dispatch(ip_stack_t *);
-extern void ill_mtu_change(ire_t *, char *);
+extern void ill_get_name(const ill_t *, char *, int);
+extern void ill_group_cleanup(ill_t *);
extern int ill_up_ipifs(ill_t *, queue_t *, mblk_t *);
+extern void ip_update_source_selection(ip_stack_t *);
extern uint_t ill_appaddr_cnt(const ill_t *);
extern uint_t ill_ptpaddr_cnt(const ill_t *);
+extern uint_t ill_admupaddr_cnt(const ill_t *);
+
+extern ill_t *ill_lookup_multicast(ip_stack_t *, zoneid_t, boolean_t);
+extern void ill_save_ire(ill_t *, ire_t *);
+extern void ill_remove_saved_ire(ill_t *, ire_t *);
+extern int ill_recover_saved_ire(ill_t *);
extern void ip_interface_cleanup(ip_stack_t *);
extern void ipif_get_name(const ipif_t *, char *, int);
extern ipif_t *ipif_getby_indexes(uint_t, uint_t, boolean_t, ip_stack_t *);
extern void ipif_init(ip_stack_t *);
-extern ipif_t *ipif_lookup_addr(ipaddr_t, ill_t *, zoneid_t, queue_t *,
- mblk_t *, ipsq_func_t, int *, ip_stack_t *);
-extern boolean_t ip_addr_exists(ipaddr_t, zoneid_t, ip_stack_t *);
+extern ipif_t *ipif_lookup_addr(ipaddr_t, ill_t *, zoneid_t, ip_stack_t *);
+extern ipif_t *ipif_lookup_addr_exact(ipaddr_t, ill_t *, ip_stack_t *);
+extern ipif_t *ipif_lookup_addr_nondup(ipaddr_t, ill_t *, zoneid_t,
+ ip_stack_t *);
extern ipif_t *ipif_lookup_addr_v6(const in6_addr_t *, ill_t *, zoneid_t,
- queue_t *, mblk_t *, ipsq_func_t, int *, ip_stack_t *);
-extern boolean_t ip_addr_exists_v6(const in6_addr_t *, zoneid_t,
ip_stack_t *);
extern ipif_t *ipif_lookup_addr_exact_v6(const in6_addr_t *, ill_t *,
ip_stack_t *);
+extern ipif_t *ipif_lookup_addr_nondup_v6(const in6_addr_t *, ill_t *,
+ zoneid_t, ip_stack_t *);
extern zoneid_t ipif_lookup_addr_zoneid(ipaddr_t, ill_t *, ip_stack_t *);
extern zoneid_t ipif_lookup_addr_zoneid_v6(const in6_addr_t *, ill_t *,
ip_stack_t *);
-extern ipif_t *ipif_lookup_group(ipaddr_t, zoneid_t, ip_stack_t *);
-extern ipif_t *ipif_lookup_group_v6(const in6_addr_t *, zoneid_t,
- ip_stack_t *);
-extern ipif_t *ipif_lookup_interface(ipaddr_t, ipaddr_t,
- queue_t *, mblk_t *, ipsq_func_t, int *, ip_stack_t *);
-extern ipif_t *ipif_lookup_multicast(ip_stack_t *, zoneid_t, boolean_t);
+extern ipif_t *ipif_lookup_interface(ipaddr_t, ipaddr_t, ip_stack_t *);
extern ipif_t *ipif_lookup_remote(ill_t *, ipaddr_t, zoneid_t);
-extern ipif_t *ipif_lookup_onlink_addr(ipaddr_t, zoneid_t, ip_stack_t *);
-extern ipif_t *ipif_lookup_seqid(ill_t *, uint_t);
-extern boolean_t ipif_lookup_zoneid(ill_t *, zoneid_t, int, ipif_t **);
-extern ipif_t *ipif_select_source(ill_t *, ipaddr_t, zoneid_t);
-extern boolean_t ipif_usesrc_avail(ill_t *, zoneid_t);
+extern boolean_t ipif_lookup_testaddr_v6(ill_t *, const in6_addr_t *,
+ ipif_t **);
+extern boolean_t ipif_lookup_testaddr_v4(ill_t *, const in_addr_t *,
+ ipif_t **);
+extern ipif_t *ipif_select_source_v4(ill_t *, ipaddr_t, zoneid_t, boolean_t,
+ boolean_t *);
+extern boolean_t ipif_zone_avail(uint_t, boolean_t, zoneid_t, ip_stack_t *);
+extern ipif_t *ipif_good_addr(ill_t *, zoneid_t);
+extern int ip_select_source_v4(ill_t *, ipaddr_t, ipaddr_t, ipaddr_t,
+ zoneid_t, ip_stack_t *, ipaddr_t *, uint32_t *, uint64_t *);
extern void ipif_refhold(ipif_t *);
extern void ipif_refhold_locked(ipif_t *);
extern void ipif_refrele(ipif_t *);
extern void ipif_all_down_tail(ipsq_t *, queue_t *, mblk_t *, void *);
-extern void ipif_resolver_down(ipif_t *);
extern int ipif_resolver_up(ipif_t *, enum ip_resolver_action);
-extern int ipif_arp_setup_multicast(ipif_t *, mblk_t **);
extern int ipif_down(ipif_t *, queue_t *, mblk_t *);
-extern void ipif_down_tail(ipif_t *);
+extern int ipif_down_tail(ipif_t *);
extern void ipif_multicast_down(ipif_t *);
extern void ipif_multicast_up(ipif_t *);
extern void ipif_ndp_down(ipif_t *);
extern int ipif_ndp_up(ipif_t *, boolean_t);
-extern int ipif_ndp_setup_multicast(ipif_t *, struct nce_s **);
extern int ipif_up_done(ipif_t *);
extern int ipif_up_done_v6(ipif_t *);
extern void ipif_up_notify(ipif_t *);
-extern void ipif_update_other_ipifs_v6(ipif_t *);
-extern void ipif_recreate_interface_routes_v6(ipif_t *, ipif_t *);
-extern void ill_update_source_selection(ill_t *);
extern ipif_t *ipif_select_source_v6(ill_t *, const in6_addr_t *, boolean_t,
- uint32_t, zoneid_t);
+ uint32_t, zoneid_t, boolean_t, boolean_t *);
+extern int ip_select_source_v6(ill_t *, const in6_addr_t *,
+ const in6_addr_t *, zoneid_t, ip_stack_t *, uint_t, uint32_t, in6_addr_t *,
+ uint32_t *, uint64_t *);
extern boolean_t ipif_cant_setlinklocal(ipif_t *);
extern void ipif_setlinklocal(ipif_t *);
extern void ipif_setdestlinklocal(ipif_t *);
-extern ipif_t *ipif_lookup_on_ifindex(uint_t, boolean_t, zoneid_t, queue_t *,
- mblk_t *, ipsq_func_t, int *, ip_stack_t *);
+extern ipif_t *ipif_lookup_on_ifindex(uint_t, boolean_t, zoneid_t,
+ ip_stack_t *);
extern ipif_t *ipif_get_next_ipif(ipif_t *curr, ill_t *ill);
extern void ipif_ill_refrele_tail(ill_t *ill);
+extern void ipif_nce_down(ipif_t *ipif);
+extern int ipif_arp_down(ipif_t *ipif);
extern void ipif_mask_reply(ipif_t *);
extern int ipif_up(ipif_t *, queue_t *, mblk_t *);
@@ -290,7 +305,7 @@ extern void qwriter_ip(ill_t *, queue_t *, mblk_t *, ipsq_func_t, int,
boolean_t);
typedef int ip_extract_func_t(queue_t *, mblk_t *, const ip_ioctl_cmd_t *,
- cmd_info_t *, ipsq_func_t);
+ cmd_info_t *);
extern ip_extract_func_t ip_extract_arpreq, ip_extract_lifreq;
@@ -298,16 +313,14 @@ extern int ip_addr_availability_check(ipif_t *);
extern void ip_ll_subnet_defaults(ill_t *, mblk_t *);
extern int ip_rt_add(ipaddr_t, ipaddr_t, ipaddr_t, ipaddr_t, int,
- ipif_t *, ire_t **, boolean_t, queue_t *, mblk_t *, ipsq_func_t,
- struct rtsa_s *, ip_stack_t *);
+ ill_t *, ire_t **, boolean_t, struct rtsa_s *, ip_stack_t *, zoneid_t);
extern int ip_rt_add_v6(const in6_addr_t *, const in6_addr_t *,
- const in6_addr_t *, const in6_addr_t *, int, ipif_t *, ire_t **,
- queue_t *, mblk_t *, ipsq_func_t, struct rtsa_s *, ip_stack_t *ipst);
+ const in6_addr_t *, const in6_addr_t *, int, ill_t *, ire_t **,
+ struct rtsa_s *, ip_stack_t *, zoneid_t);
extern int ip_rt_delete(ipaddr_t, ipaddr_t, ipaddr_t, uint_t, int,
- ipif_t *, boolean_t, queue_t *, mblk_t *, ipsq_func_t, ip_stack_t *);
+ ill_t *, boolean_t, ip_stack_t *, zoneid_t);
extern int ip_rt_delete_v6(const in6_addr_t *, const in6_addr_t *,
- const in6_addr_t *, uint_t, int, ipif_t *, queue_t *, mblk_t *,
- ipsq_func_t, ip_stack_t *);
+ const in6_addr_t *, uint_t, int, ill_t *, ip_stack_t *, zoneid_t);
extern int ip_siocdelndp_v6(ipif_t *, sin_t *, queue_t *, mblk_t *,
ip_ioctl_cmd_t *, void *);
extern int ip_siocqueryndp_v6(ipif_t *, sin_t *, queue_t *, mblk_t *,
@@ -454,11 +467,12 @@ extern int ip_sioctl_get_lifsrcof(ipif_t *, sin_t *, queue_t *,
extern void ip_sioctl_copyin_resume(ipsq_t *, queue_t *, mblk_t *, void *);
extern void ip_sioctl_copyin_setup(queue_t *, mblk_t *);
-extern void ip_sioctl_iocack(ipsq_t *, queue_t *, mblk_t *, void *);
extern ip_ioctl_cmd_t *ip_sioctl_lookup(int);
-
-extern void conn_delete_ire(conn_t *, caddr_t);
-extern boolean_t phyint_exists(uint_t, ip_stack_t *);
+extern void ipif_delete_ires_v4(ipif_t *);
+extern void ipif_delete_ires_v6(ipif_t *);
+extern int ipif_arp_up(ipif_t *, enum ip_resolver_action, boolean_t);
+extern void ipif_dup_recovery(void *);
+extern void ipif_do_recovery(ipif_t *);
/*
* Notes on reference tracing on ill, ipif, ire, nce data structures:
diff --git a/usr/src/uts/common/inet/ip_impl.h b/usr/src/uts/common/inet/ip_impl.h
index 5f9d674e17..694f7a63b0 100644
--- a/usr/src/uts/common/inet/ip_impl.h
+++ b/usr/src/uts/common/inet/ip_impl.h
@@ -50,10 +50,12 @@ extern "C" {
#define IP_HDR_CSUM_TTL_ADJUST 256
#define IP_TCP_CSUM_COMP IPPROTO_TCP
#define IP_UDP_CSUM_COMP IPPROTO_UDP
+#define IP_ICMPV6_CSUM_COMP IPPROTO_ICMPV6
#else
#define IP_HDR_CSUM_TTL_ADJUST 1
#define IP_TCP_CSUM_COMP (IPPROTO_TCP << 8)
#define IP_UDP_CSUM_COMP (IPPROTO_UDP << 8)
+#define IP_ICMPV6_CSUM_COMP (IPPROTO_ICMPV6 << 8)
#endif
#define TCP_CHECKSUM_OFFSET 16
@@ -62,240 +64,20 @@ extern "C" {
#define UDP_CHECKSUM_OFFSET 6
#define UDP_CHECKSUM_SIZE 2
+#define ICMPV6_CHECKSUM_OFFSET 2
+#define ICMPV6_CHECKSUM_SIZE 2
+
#define IPH_TCPH_CHECKSUMP(ipha, hlen) \
((uint16_t *)(((uchar_t *)(ipha)) + ((hlen) + TCP_CHECKSUM_OFFSET)))
#define IPH_UDPH_CHECKSUMP(ipha, hlen) \
((uint16_t *)(((uchar_t *)(ipha)) + ((hlen) + UDP_CHECKSUM_OFFSET)))
+#define IPH_ICMPV6_CHECKSUMP(ipha, hlen) \
+ ((uint16_t *)(((uchar_t *)(ipha)) + ((hlen) + ICMPV6_CHECKSUM_OFFSET)))
+
#define ILL_HCKSUM_CAPABLE(ill) \
(((ill)->ill_capabilities & ILL_CAPAB_HCKSUM) != 0)
-/*
- * Macro that performs software checksum calculation on the IP header.
- */
-#define IP_HDR_CKSUM(ipha, sum, v_hlen_tos_len, ttl_protocol) { \
- (sum) += (ttl_protocol) + (ipha)->ipha_ident + \
- ((v_hlen_tos_len) >> 16) + \
- ((v_hlen_tos_len) & 0xFFFF) + \
- (ipha)->ipha_fragment_offset_and_flags; \
- (sum) = (((sum) & 0xFFFF) + ((sum) >> 16)); \
- (sum) = ~((sum) + ((sum) >> 16)); \
- (ipha)->ipha_hdr_checksum = (uint16_t)(sum); \
-}
-
-#define IS_IP_HDR_HWCKSUM(ipsec, mp, ill) \
- ((!ipsec) && (DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM) && \
- ILL_HCKSUM_CAPABLE(ill) && dohwcksum)
-
-/*
- * This macro acts as a wrapper around IP_CKSUM_XMIT_FAST, and it performs
- * several checks on the IRE and ILL (among other things) in order to see
- * whether or not hardware checksum offload is allowed for the outgoing
- * packet. It assumes that the caller has held a reference to the IRE.
- */
-#define IP_CKSUM_XMIT(ill, ire, mp, ihp, up, proto, start, end, \
- max_frag, ipsec_len, pseudo) { \
- uint32_t _hck_flags; \
- /* \
- * We offload checksum calculation to hardware when IPsec isn't \
- * present and if fragmentation isn't required. We also check \
- * if M_DATA fastpath is safe to be used on the corresponding \
- * IRE; this check is performed without grabbing ire_lock but \
- * instead by holding a reference to it. This is sufficient \
- * for IRE_CACHE; for IRE_BROADCAST on non-Ethernet links, the \
- * DL_NOTE_FASTPATH_FLUSH indication could come up from the \
- * driver and trigger the IRE (hence fp_mp) deletion. This is \
- * why only IRE_CACHE type is eligible for offload. \
- * \
- * The presense of IP options also forces the network stack to \
- * calculate the checksum in software. This is because: \
- * \
- * Wrap around: certain partial-checksum NICs (eri, ce) limit \
- * the size of "start offset" width to 6-bit. This effectively \
- * sets the largest value of the offset to 64-bytes, starting \
- * from the MAC header. When the cumulative MAC and IP headers \
- * exceed such limit, the offset will wrap around. This causes \
- * the checksum to be calculated at the wrong place. \
- * \
- * IPv4 source routing: none of the full-checksum capable NICs \
- * is capable of correctly handling the IPv4 source-routing \
- * option for purposes of calculating the pseudo-header; the \
- * actual destination is different from the destination in the \
- * header which is that of the next-hop. (This case may not be \
- * true for NICs which can parse IPv6 extension headers, but \
- * we choose to simplify the implementation by not offloading \
- * checksum when they are present.) \
- * \
- */ \
- if ((ill) != NULL && ILL_HCKSUM_CAPABLE(ill) && \
- !((ire)->ire_flags & RTF_MULTIRT) && \
- (!((ire)->ire_type & IRE_BROADCAST) || \
- (ill)->ill_type == IFT_ETHER) && \
- (ipsec_len) == 0 && \
- (((ire)->ire_ipversion == IPV4_VERSION && \
- (start) == IP_SIMPLE_HDR_LENGTH && \
- ((ire)->ire_nce != NULL && \
- (ire)->ire_nce->nce_fp_mp != NULL && \
- MBLKHEAD(mp) >= MBLKL((ire)->ire_nce->nce_fp_mp))) || \
- ((ire)->ire_ipversion == IPV6_VERSION && \
- (start) == IPV6_HDR_LEN && \
- (ire)->ire_nce->nce_fp_mp != NULL && \
- MBLKHEAD(mp) >= MBLKL((ire)->ire_nce->nce_fp_mp))) && \
- (max_frag) >= (uint_t)((end) + (ipsec_len)) && \
- dohwcksum) { \
- _hck_flags = (ill)->ill_hcksum_capab->ill_hcksum_txflags; \
- } else { \
- _hck_flags = 0; \
- } \
- IP_CKSUM_XMIT_FAST((ire)->ire_ipversion, _hck_flags, mp, ihp, \
- up, proto, start, end, pseudo); \
-}
-
-/*
- * Based on the device capabilities, this macro either marks an outgoing
- * packet with hardware checksum offload information or calculate the
- * checksum in software. If the latter is performed, the checksum field
- * of the dblk is cleared; otherwise it will be non-zero and contain the
- * necessary flag(s) for the driver.
- */
-#define IP_CKSUM_XMIT_FAST(ipver, hck_flags, mp, ihp, up, proto, start, \
- end, pseudo) { \
- uint32_t _sum; \
- /* \
- * Underlying interface supports hardware checksum offload for \
- * the payload; leave the payload checksum for the hardware to \
- * calculate. N.B: We only need to set up checksum info on the \
- * first mblk. \
- */ \
- DB_CKSUMFLAGS(mp) = 0; \
- if (((ipver) == IPV4_VERSION && \
- ((hck_flags) & HCKSUM_INET_FULL_V4)) || \
- ((ipver) == IPV6_VERSION && \
- ((hck_flags) & HCKSUM_INET_FULL_V6))) { \
- /* \
- * Hardware calculates pseudo-header, header and the \
- * payload checksums, so clear the checksum field in \
- * the protocol header. \
- */ \
- *(up) = 0; \
- DB_CKSUMFLAGS(mp) |= HCK_FULLCKSUM; \
- } else if ((hck_flags) & HCKSUM_INET_PARTIAL) { \
- /* \
- * Partial checksum offload has been enabled. Fill \
- * the checksum field in the protocl header with the \
- * pseudo-header checksum value. \
- */ \
- _sum = ((proto) == IPPROTO_UDP) ? \
- IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP; \
- _sum += *(up) + (pseudo); \
- _sum = (_sum & 0xFFFF) + (_sum >> 16); \
- *(up) = (_sum & 0xFFFF) + (_sum >> 16); \
- /* \
- * Offsets are relative to beginning of IP header. \
- */ \
- DB_CKSUMSTART(mp) = (start); \
- DB_CKSUMSTUFF(mp) = ((proto) == IPPROTO_UDP) ? \
- (start) + UDP_CHECKSUM_OFFSET : \
- (start) + TCP_CHECKSUM_OFFSET; \
- DB_CKSUMEND(mp) = (end); \
- DB_CKSUMFLAGS(mp) |= HCK_PARTIALCKSUM; \
- } else { \
- /* \
- * Software checksumming. \
- */ \
- _sum = ((proto) == IPPROTO_UDP) ? \
- IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP; \
- _sum += (pseudo); \
- _sum = IP_CSUM(mp, start, _sum); \
- *(up) = (uint16_t)(((proto) == IPPROTO_UDP) ? \
- (_sum ? _sum : ~_sum) : _sum); \
- } \
- /* \
- * Hardware supports IP header checksum offload; clear the \
- * contents of IP header checksum field as expected by NIC. \
- * Do this only if we offloaded either full or partial sum. \
- */ \
- if ((ipver) == IPV4_VERSION && DB_CKSUMFLAGS(mp) != 0 && \
- ((hck_flags) & HCKSUM_IPHDRCKSUM)) { \
- DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM; \
- ((ipha_t *)(ihp))->ipha_hdr_checksum = 0; \
- } \
-}
-
-/*
- * Macro to inspect the checksum of a fully-reassembled incoming datagram.
- */
-#define IP_CKSUM_RECV_REASS(hck_flags, off, pseudo, sum, err) { \
- (err) = B_FALSE; \
- if ((hck_flags) & HCK_FULLCKSUM) { \
- /* \
- * The sum of all fragment checksums should \
- * result in -0 (0xFFFF) or otherwise invalid. \
- */ \
- if ((sum) != 0xFFFF) \
- (err) = B_TRUE; \
- } else if ((hck_flags) & HCK_PARTIALCKSUM) { \
- (sum) += (pseudo); \
- (sum) = ((sum) & 0xFFFF) + ((sum) >> 16); \
- (sum) = ((sum) & 0xFFFF) + ((sum) >> 16); \
- if (~(sum) & 0xFFFF) \
- (err) = B_TRUE; \
- } else if (((sum) = IP_CSUM(mp, off, pseudo)) != 0) { \
- (err) = B_TRUE; \
- } \
-}
-
-/*
- * This macro inspects an incoming packet to see if the checksum value
- * contained in it is valid; if the hardware has provided the information,
- * the value is verified, otherwise it performs software checksumming.
- * The checksum value is returned to caller.
- */
-#define IP_CKSUM_RECV(hck_flags, sum, cksum_start, ulph_off, mp, mp1, err) { \
- int32_t _len; \
- \
- (err) = B_FALSE; \
- if ((hck_flags) & HCK_FULLCKSUM) { \
- /* \
- * Full checksum has been computed by the hardware \
- * and has been attached. If the driver wants us to \
- * verify the correctness of the attached value, in \
- * order to protect against faulty hardware, compare \
- * it against -0 (0xFFFF) to see if it's valid. \
- */ \
- (sum) = DB_CKSUM16(mp); \
- if (!((hck_flags) & HCK_FULLCKSUM_OK) && (sum) != 0xFFFF) \
- (err) = B_TRUE; \
- } else if (((hck_flags) & HCK_PARTIALCKSUM) && \
- ((mp1) == NULL || (mp1)->b_cont == NULL) && \
- (ulph_off) >= DB_CKSUMSTART(mp) && \
- ((_len = (ulph_off) - DB_CKSUMSTART(mp)) & 1) == 0) { \
- uint32_t _adj; \
- /* \
- * Partial checksum has been calculated by hardware \
- * and attached to the packet; in addition, any \
- * prepended extraneous data is even byte aligned, \
- * and there are at most two mblks associated with \
- * the packet. If any such data exists, we adjust \
- * the checksum; also take care any postpended data. \
- */ \
- IP_ADJCKSUM_PARTIAL(cksum_start, mp, mp1, _len, _adj); \
- /* \
- * One's complement subtract extraneous checksum \
- */ \
- (sum) += DB_CKSUM16(mp); \
- if (_adj >= (sum)) \
- (sum) = ~(_adj - (sum)) & 0xFFFF; \
- else \
- (sum) -= _adj; \
- (sum) = ((sum) & 0xFFFF) + ((int)(sum) >> 16); \
- (sum) = ((sum) & 0xFFFF) + ((int)(sum) >> 16); \
- if (~(sum) & 0xFFFF) \
- (err) = B_TRUE; \
- } else if (((sum) = IP_CSUM(mp, ulph_off, sum)) != 0) { \
- (err) = B_TRUE; \
- } \
-}
/*
* Macro to adjust a given checksum value depending on any prepended
@@ -338,98 +120,37 @@ extern "C" {
} \
}
-#define ILL_MDT_CAPABLE(ill) \
- (((ill)->ill_capabilities & ILL_CAPAB_MDT) != 0)
-
-/*
- * ioctl identifier and structure for Multidata Transmit update
- * private M_CTL communication from IP to ULP.
- */
-#define MDT_IOC_INFO_UPDATE (('M' << 8) + 1020)
-
-typedef struct ip_mdt_info_s {
- uint_t mdt_info_id; /* MDT_IOC_INFO_UPDATE */
- ill_mdt_capab_t mdt_capab; /* ILL MDT capabilities */
-} ip_mdt_info_t;
+#define IS_SIMPLE_IPH(ipha) \
+ ((ipha)->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION)
/*
- * Macro that determines whether or not a given ILL is allowed for MDT.
+ * Currently supported flags for LSO.
*/
-#define ILL_MDT_USABLE(ill) \
- (ILL_MDT_CAPABLE(ill) && \
- ill->ill_mdt_capab != NULL && \
- ill->ill_mdt_capab->ill_mdt_version == MDT_VERSION_2 && \
- ill->ill_mdt_capab->ill_mdt_on != 0)
+#define LSO_BASIC_TCP_IPV4 DLD_LSO_BASIC_TCP_IPV4
+#define LSO_BASIC_TCP_IPV6 DLD_LSO_BASIC_TCP_IPV6
-#define ILL_LSO_CAPABLE(ill) \
- (((ill)->ill_capabilities & ILL_CAPAB_DLD_LSO) != 0)
+#define ILL_LSO_CAPABLE(ill) \
+ (((ill)->ill_capabilities & ILL_CAPAB_LSO) != 0)
-/*
- * ioctl identifier and structure for Large Segment Offload
- * private M_CTL communication from IP to ULP.
- */
-#define LSO_IOC_INFO_UPDATE (('L' << 24) + ('S' << 16) + ('O' << 8))
-
-typedef struct ip_lso_info_s {
- uint_t lso_info_id; /* LSO_IOC_INFO_UPDATE */
- ill_lso_capab_t lso_capab; /* ILL LSO capabilities */
-} ip_lso_info_t;
-
-/*
- * Macro that determines whether or not a given ILL is allowed for LSO.
- */
#define ILL_LSO_USABLE(ill) \
(ILL_LSO_CAPABLE(ill) && \
- ill->ill_lso_capab != NULL && \
- ill->ill_lso_capab->ill_lso_on != 0)
+ ill->ill_lso_capab != NULL)
-#define ILL_LSO_TCP_USABLE(ill) \
+#define ILL_LSO_TCP_IPV4_USABLE(ill) \
(ILL_LSO_USABLE(ill) && \
- ill->ill_lso_capab->ill_lso_flags & DLD_LSO_TX_BASIC_TCP_IPV4)
+ ill->ill_lso_capab->ill_lso_flags & LSO_BASIC_TCP_IPV4)
-/*
- * Macro that determines whether or not a given CONN may be considered
- * for fast path prior to proceeding further with LSO or Multidata.
- */
-#define CONN_IS_LSO_MD_FASTPATH(connp) \
- ((connp)->conn_dontroute == 0 && /* SO_DONTROUTE */ \
- !((connp)->conn_nexthop_set) && /* IP_NEXTHOP */ \
- (connp)->conn_outgoing_ill == NULL) /* IP{V6}_BOUND_IF */
-
-/* Definitions for fragmenting IP packets using MDT. */
-
-/*
- * Smaller and private version of pdescinfo_t used specifically for IP,
- * which allows for only a single payload span per packet.
- */
-typedef struct ip_pdescinfo_s PDESCINFO_STRUCT(2) ip_pdescinfo_t;
+#define ILL_LSO_TCP_IPV6_USABLE(ill) \
+ (ILL_LSO_USABLE(ill) && \
+ ill->ill_lso_capab->ill_lso_flags & LSO_BASIC_TCP_IPV6)
-/*
- * Macro version of ip_can_frag_mdt() which avoids the function call if we
- * only examine a single message block.
- */
-#define IP_CAN_FRAG_MDT(mp, hdr_len, len) \
- (((mp)->b_cont == NULL) ? \
- (MBLKL(mp) >= ((hdr_len) + ip_wput_frag_mdt_min)) : \
- ip_can_frag_mdt((mp), (hdr_len), (len)))
+#define ILL_ZCOPY_CAPABLE(ill) \
+ (((ill)->ill_capabilities & ILL_CAPAB_ZEROCOPY) != 0)
-/*
- * Macro that determines whether or not a given IPC requires
- * outbound IPSEC processing.
- */
-#define CONN_IPSEC_OUT_ENCAPSULATED(connp) \
- ((connp)->conn_out_enforce_policy || \
- ((connp)->conn_latch != NULL && \
- (connp)->conn_latch->ipl_out_policy != NULL))
+#define ILL_ZCOPY_USABLE(ill) \
+ (ILL_ZCOPY_CAPABLE(ill) && (ill->ill_zerocopy_capab != NULL) && \
+ (ill->ill_zerocopy_capab->ill_zerocopy_flags != 0))
-/*
- * Macro that checks whether or not a particular UDP conn is
- * flow-controlling on the read-side.
- *
- * Note that this check is done after the conn is found in
- * the UDP fanout table.
- */
-#define CONN_UDP_FLOWCTLD(connp) !canputnext((connp)->conn_rq)
/* Macro that follows definitions of flags for mac_tx() (see mac_client.h) */
#define IP_DROP_ON_NO_DESC 0x01 /* Equivalent to MAC_DROP_ON_NO_DESC */
@@ -437,74 +158,7 @@ typedef struct ip_pdescinfo_s PDESCINFO_STRUCT(2) ip_pdescinfo_t;
#define ILL_DIRECT_CAPABLE(ill) \
(((ill)->ill_capabilities & ILL_CAPAB_DLD_DIRECT) != 0)
-#define ILL_SEND_TX(ill, ire, hint, mp, flag, connp) { \
- if (ILL_DIRECT_CAPABLE(ill) && DB_TYPE(mp) == M_DATA) { \
- ill_dld_direct_t *idd; \
- uintptr_t cookie; \
- conn_t *udp_connp = (conn_t *)connp; \
- \
- idd = &(ill)->ill_dld_capab->idc_direct; \
- /* \
- * Send the packet directly to DLD, where it \
- * may be queued depending on the availability \
- * of transmit resources at the media layer. \
- * Ignore the returned value for the time being \
- * In future, we may want to take this into \
- * account and flow control the TCP. \
- */ \
- cookie = idd->idd_tx_df(idd->idd_tx_dh, mp, \
- (uintptr_t)(hint), flag); \
- \
- /* \
- * non-NULL cookie indicates flow control situation \
- * and the cookie itself identifies this specific \
- * Tx ring that is blocked. This cookie is used to \
- * block the UDP conn that is sending packets over \
- * this specific Tx ring. \
- */ \
- if ((cookie != NULL) && (udp_connp != NULL) && \
- (udp_connp->conn_ulp == IPPROTO_UDP)) { \
- idl_tx_list_t *idl_txl; \
- ip_stack_t *ipst; \
- \
- /* \
- * Flow controlled. \
- */ \
- DTRACE_PROBE2(ill__send__tx__cookie, \
- uintptr_t, cookie, conn_t *, udp_connp); \
- ipst = udp_connp->conn_netstack->netstack_ip; \
- idl_txl = \
- &ipst->ips_idl_tx_list[IDLHASHINDEX(cookie)];\
- mutex_enter(&idl_txl->txl_lock); \
- if (udp_connp->conn_direct_blocked || \
- (idd->idd_tx_fctl_df(idd->idd_tx_fctl_dh, \
- cookie) == 0)) { \
- DTRACE_PROBE1(ill__tx__not__blocked, \
- boolean, \
- udp_connp->conn_direct_blocked); \
- } else if (idl_txl->txl_cookie != NULL && \
- idl_txl->txl_cookie != cookie) { \
- udp_t *udp = udp_connp->conn_udp; \
- udp_stack_t *us = udp->udp_us; \
- \
- DTRACE_PROBE2(ill__send__tx__collision, \
- uintptr_t, cookie, \
- uintptr_t, idl_txl->txl_cookie); \
- UDP_STAT(us, udp_cookie_coll); \
- } else { \
- udp_connp->conn_direct_blocked = B_TRUE;\
- idl_txl->txl_cookie = cookie; \
- conn_drain_insert(udp_connp, idl_txl); \
- DTRACE_PROBE1(ill__send__tx__insert, \
- conn_t *, udp_connp); \
- } \
- mutex_exit(&idl_txl->txl_lock); \
- } \
- } else { \
- putnext((ire)->ire_stq, mp); \
- } \
-}
-
+/* This macro is used by the mac layer */
#define MBLK_RX_FANOUT_SLOWPATH(mp, ipha) \
(DB_TYPE(mp) != M_DATA || DB_REF(mp) != 1 || !OK_32PTR(ipha) || \
(((uchar_t *)ipha + IP_SIMPLE_HDR_LENGTH) >= (mp)->b_wptr))
@@ -520,13 +174,11 @@ typedef struct ip_pdescinfo_s PDESCINFO_STRUCT(2) ip_pdescinfo_t;
netstackid_to_zoneid((ipst)->ips_netstack->netstack_stackid) : \
(zoneid))
-extern int ip_wput_frag_mdt_min;
-extern boolean_t ip_can_frag_mdt(mblk_t *, ssize_t, ssize_t);
-extern mblk_t *ip_prepend_zoneid(mblk_t *, zoneid_t, ip_stack_t *);
extern void ill_flow_enable(void *, ip_mac_tx_cookie_t);
-extern zoneid_t ip_get_zoneid_v4(ipaddr_t, mblk_t *, ip_stack_t *, zoneid_t);
+extern zoneid_t ip_get_zoneid_v4(ipaddr_t, mblk_t *, ip_recv_attr_t *,
+ zoneid_t);
extern zoneid_t ip_get_zoneid_v6(in6_addr_t *, mblk_t *, const ill_t *,
- ip_stack_t *, zoneid_t);
+ ip_recv_attr_t *, zoneid_t);
/*
* flag passed in by IP based protocols to get a private ip stream with
@@ -542,8 +194,6 @@ extern zoneid_t ip_get_zoneid_v6(in6_addr_t *, mblk_t *, const ill_t *,
#define DEV_IP "/devices/pseudo/ip@0:ip"
#define DEV_IP6 "/devices/pseudo/ip6@0:ip6"
-extern struct kmem_cache *ip_helper_stream_cache;
-
#endif /* _KERNEL */
#ifdef __cplusplus
diff --git a/usr/src/uts/common/inet/ip_ire.h b/usr/src/uts/common/inet/ip_ire.h
index f4882f7640..d4dfd9c97e 100644
--- a/usr/src/uts/common/inet/ip_ire.h
+++ b/usr/src/uts/common/inet/ip_ire.h
@@ -68,106 +68,26 @@ extern "C" {
((addr).s6_addr8[14] & (mask).s6_addr8[14]) ^ \
((addr).s6_addr8[15] & (mask).s6_addr8[15])) & ((table_size) - 1))
+#define IRE_HIDDEN_TYPE(ire_type) ((ire_type) & \
+ (IRE_HOST | IRE_PREFIX | IRE_DEFAULT | IRE_IF_ALL | IRE_BROADCAST))
+
/*
* match parameter definitions for IRE lookup routines.
*/
#define MATCH_IRE_DSTONLY 0x0000 /* Match just the address */
#define MATCH_IRE_TYPE 0x0001 /* Match IRE type */
-#define MATCH_IRE_SRC 0x0002 /* Match IRE source address */
-#define MATCH_IRE_MASK 0x0004 /* Match IRE mask */
-#define MATCH_IRE_WQ 0x0008 /* Match IRE ire_stq to write queue */
-#define MATCH_IRE_GW 0x0010 /* Match IRE gateway */
-#define MATCH_IRE_IPIF 0x0020 /* Match IRE ipif */
-#define MATCH_IRE_RECURSIVE 0x0040 /* Do recursive lookup if necessary */
-#define MATCH_IRE_DEFAULT 0x0080 /* Return default route if no route */
- /* found. */
-#define MATCH_IRE_RJ_BHOLE 0x0100 /* During lookup if we hit an ire */
- /* with RTF_REJECT or RTF_BLACKHOLE, */
- /* return the ire. No recursive */
- /* lookup should be done. */
-#define MATCH_IRE_IHANDLE 0x0200 /* Match IRE on ihandle */
-#define MATCH_IRE_MARK_TESTHIDDEN 0x0400 /* Match IRE_MARK_TESTHIDDEN IREs */
-
-/*
- * MATCH_IRE_PARENT is used whenever we unconditionally want to get the
- * parent IRE (sire) while recursively searching IREs for an offsubnet
- * destination. With this flag, even if no IRE_CACHETABLE or IRE_INTERFACE
- * is found to help resolving IRE_OFFSUBNET in lookup routines, the
- * IRE_OFFSUBNET sire, if any, is returned to the caller.
- */
-/* UNUSED 0x0800 */
-#define MATCH_IRE_ILL 0x1000 /* Match IRE on the ill */
-
-#define MATCH_IRE_PARENT 0x2000 /* Match parent ire, if any, */
- /* even if ire is not matched. */
-#define MATCH_IRE_ZONEONLY 0x4000 /* Match IREs in specified zone, ie */
+#define MATCH_IRE_MASK 0x0002 /* Match IRE mask */
+#define MATCH_IRE_SHORTERMASK 0x0004 /* A mask shorter than the argument */
+#define MATCH_IRE_GW 0x0008 /* Match IRE gateway */
+#define MATCH_IRE_ILL 0x0010 /* Match IRE on the ill */
+#define MATCH_IRE_ZONEONLY 0x0020 /* Match IREs in specified zone, ie */
/* don't match IRE_LOCALs from other */
/* zones or shared IREs */
-#define MATCH_IRE_MARK_PRIVATE_ADDR 0x8000 /* Match IRE ire_marks with */
- /* IRE_MARK_PRIVATE_ADDR. */
-#define MATCH_IRE_SECATTR 0x10000 /* Match gateway security attributes */
-#define MATCH_IRE_COMPLETE 0x20000 /* ire_ftable_lookup() can return */
- /* IRE_CACHE entry only if it is */
- /* ND_REACHABLE */
+#define MATCH_IRE_SECATTR 0x0040 /* Match gateway security attributes */
+#define MATCH_IRE_TESTHIDDEN 0x0080 /* Match ire_testhidden IREs */
-/*
- * Any ire to nce association is long term, and
- * the refhold and refrele may be done by different
- * threads. So all cases of making or breaking ire to
- * nce association should all effectively use the NOTR variants.
- * To understand the *effectively* part read on.
- *
- * ndp_lookup() and ndp_add_v4()/ndp_add_v6() implicitly do
- * NCE_REFHOLD. So wherever we make ire to nce association after
- * calling these functions, we effectively want to end up with
- * NCE_REFHOLD_NOTR. We call this macro to achieve this effect. This
- * macro changes a NCE_REFHOLD to a NCE_REFHOLD_NOTR. The macro's
- * NCE_REFRELE cancels off ndp_lookup[ndp_add]'s implicit NCE_REFHOLD,
- * and what you are left with is a NCE_REFHOLD_NOTR
- */
-#define NCE_REFHOLD_TO_REFHOLD_NOTR(nce) { \
- NCE_REFHOLD_NOTR(nce); \
- NCE_REFRELE(nce); \
-}
-
-/*
- * find the next ire_t entry in the ire_next chain starting at ire
- * that is not CONDEMNED. ire is set to NULL if we reach the end of the list.
- * Caller must hold the ire_bucket lock.
- */
+#define MAX_IRE_RECURSION 4 /* Max IREs in ire_route_recursive */
-#define IRE_FIND_NEXT_ORIGIN(ire) { \
- while ((ire) != NULL && ((ire)->ire_marks & IRE_MARK_CONDEMNED))\
- (ire) = (ire)->ire_next; \
-}
-
-
-/* Structure for ire_cache_count() */
-typedef struct {
- int icc_total; /* Total number of IRE_CACHE */
- int icc_unused; /* # off/no PMTU unused since last reclaim */
- int icc_offlink; /* # offlink without PMTU information */
- int icc_pmtu; /* # offlink with PMTU information */
- int icc_onlink; /* # onlink */
-} ire_cache_count_t;
-
-/*
- * Structure for ire_cache_reclaim(). Each field is a fraction i.e. 1 meaning
- * reclaim all, N meaning reclaim 1/Nth of all entries, 0 meaning reclaim none.
- *
- * The comment below (and for other netstack_t references) refers
- * to the fact that we only do netstack_hold in particular cases,
- * such as the references from open streams (ill_t and conn_t's
- * pointers). Internally within IP we rely on IP's ability to cleanup e.g.
- * ire_t's when an ill goes away.
- */
-typedef struct {
- int icr_unused; /* Fraction for unused since last reclaim */
- int icr_offlink; /* Fraction for offlink without PMTU info */
- int icr_pmtu; /* Fraction for offlink with PMTU info */
- int icr_onlink; /* Fraction for onlink */
- ip_stack_t *icr_ipst; /* Does not have a netstack_hold */
-} ire_cache_reclaim_t;
/*
* We use atomics so that we get an accurate accounting on the ires.
@@ -176,180 +96,250 @@ typedef struct {
#define BUMP_IRE_STATS(ire_stats, x) atomic_add_64(&(ire_stats).x, 1)
#ifdef _KERNEL
-/*
- * Structure for passing args for the IRE cache lookup functions.
- */
-typedef struct ire_ctable_args_s {
- void *ict_addr;
- void *ict_gateway;
- int ict_type;
- const ipif_t *ict_ipif;
- zoneid_t ict_zoneid;
- const ts_label_t *ict_tsl;
- int ict_flags;
- ip_stack_t *ict_ipst;
- queue_t *ict_wq;
-} ire_ctable_args_t;
-
struct ts_label_s;
struct nce_s;
+/*
+ * structure for passing args between ire_ftable_lookup and ire_find_best_route
+ */
+typedef struct ire_ftable_args_s {
+ in6_addr_t ift_addr_v6;
+ in6_addr_t ift_mask_v6;
+ in6_addr_t ift_gateway_v6;
+#define ift_addr V4_PART_OF_V6(ift_addr_v6)
+#define ift_mask V4_PART_OF_V6(ift_mask_v6)
+#define ift_gateway V4_PART_OF_V6(ift_gateway_v6)
+ int ift_type;
+ const ill_t *ift_ill;
+ zoneid_t ift_zoneid;
+ const ts_label_t *ift_tsl;
+ int ift_flags;
+ ire_t *ift_best_ire;
+} ire_ftable_args_t;
extern ipaddr_t ip_plen_to_mask(uint_t);
extern in6_addr_t *ip_plen_to_mask_v6(uint_t, in6_addr_t *);
extern int ip_ire_advise(queue_t *, mblk_t *, cred_t *);
extern int ip_ire_delete(queue_t *, mblk_t *, cred_t *);
-extern boolean_t ip_ire_clookup_and_delete(ipaddr_t, ipif_t *, ip_stack_t *);
-extern void ip_ire_clookup_and_delete_v6(const in6_addr_t *,
- ip_stack_t *);
-
-extern void ip_ire_req(queue_t *, mblk_t *);
+extern void ip_ire_reclaim(void *);
extern int ip_mask_to_plen(ipaddr_t);
extern int ip_mask_to_plen_v6(const in6_addr_t *);
-extern ire_t *ipif_to_ire(const ipif_t *);
-extern ire_t *ipif_to_ire_v6(const ipif_t *);
-
-extern int ire_add(ire_t **, queue_t *, mblk_t *, ipsq_func_t, boolean_t);
-extern void ire_add_then_send(queue_t *, ire_t *, mblk_t *);
-extern int ire_add_v6(ire_t **, queue_t *, mblk_t *, ipsq_func_t);
-extern int ire_atomic_start(irb_t *irb_ptr, ire_t *ire, queue_t *q,
- mblk_t *mp, ipsq_func_t func);
+extern ire_t *ire_add(ire_t *);
+extern ire_t *ire_add_v6(ire_t *);
+extern int ire_atomic_start(irb_t *irb_ptr, ire_t *ire);
extern void ire_atomic_end(irb_t *irb_ptr, ire_t *ire);
-extern void ire_cache_count(ire_t *, char *);
-extern ire_t *ire_cache_lookup(ipaddr_t, zoneid_t,
- const struct ts_label_s *, ip_stack_t *);
-extern ire_t *ire_cache_lookup_simple(ipaddr_t, ip_stack_t *);
-extern ire_t *ire_cache_lookup_v6(const in6_addr_t *, zoneid_t,
- const struct ts_label_s *, ip_stack_t *);
-extern void ire_cache_reclaim(ire_t *, char *);
-
-extern ire_t *ire_create_mp(uchar_t *, uchar_t *, uchar_t *, uchar_t *,
- uint_t, struct nce_s *, queue_t *, queue_t *, ushort_t, ipif_t *, ipaddr_t,
- uint32_t, uint32_t, uint32_t, const iulp_t *, tsol_gc_t *, tsol_gcgrp_t *,
- ip_stack_t *);
-extern ire_t *ire_create(uchar_t *, uchar_t *, uchar_t *, uchar_t *,
- uint_t *, struct nce_s *, queue_t *, queue_t *, ushort_t, ipif_t *,
- ipaddr_t, uint32_t, uint32_t, uint32_t, const iulp_t *, tsol_gc_t *,
- tsol_gcgrp_t *, ip_stack_t *);
-
-extern ire_t **ire_check_and_create_bcast(ipif_t *, ipaddr_t,
- ire_t **, int);
-extern ire_t **ire_create_bcast(ipif_t *, ipaddr_t, ire_t **);
-extern ire_t *ire_init(ire_t *, uchar_t *, uchar_t *, uchar_t *, uchar_t *,
- uint_t *, struct nce_s *, queue_t *, queue_t *, ushort_t, ipif_t *,
- ipaddr_t, uint32_t, uint32_t, uint32_t, const iulp_t *, tsol_gc_t *,
- tsol_gcgrp_t *, ip_stack_t *);
-
-extern boolean_t ire_init_common(ire_t *, uint_t *, struct nce_s *, queue_t *,
- queue_t *, ushort_t, ipif_t *, uint32_t, uint32_t, uint32_t, uchar_t,
- const iulp_t *, tsol_gc_t *, tsol_gcgrp_t *, ip_stack_t *);
-
-extern ire_t *ire_create_v6(const in6_addr_t *, const in6_addr_t *,
- const in6_addr_t *, const in6_addr_t *, uint_t *, struct nce_s *, queue_t *,
- queue_t *, ushort_t, ipif_t *,
- const in6_addr_t *, uint32_t, uint32_t, uint_t, const iulp_t *,
- tsol_gc_t *, tsol_gcgrp_t *, ip_stack_t *);
-
-extern ire_t *ire_create_mp_v6(const in6_addr_t *, const in6_addr_t *,
- const in6_addr_t *, const in6_addr_t *, struct nce_s *, queue_t *,
- queue_t *, ushort_t, ipif_t *,
- const in6_addr_t *, uint32_t, uint32_t, uint_t, const iulp_t *,
- tsol_gc_t *, tsol_gcgrp_t *, ip_stack_t *);
-
+extern ire_t *ire_create(uchar_t *, uchar_t *, uchar_t *,
+ ushort_t, ill_t *, zoneid_t, uint_t, tsol_gc_t *, ip_stack_t *);
-extern void ire_clookup_delete_cache_gw(ipaddr_t, zoneid_t,
- ip_stack_t *);
-extern void ire_clookup_delete_cache_gw_v6(const in6_addr_t *, zoneid_t,
+extern ire_t **ire_create_bcast(ill_t *, ipaddr_t, zoneid_t, ire_t **);
+extern ire_t *ire_create_if_clone(ire_t *, const in6_addr_t *, uint_t *);
+extern ire_t *ire_lookup_bcast(ill_t *, ipaddr_t, zoneid_t);
+extern int ire_init_v4(ire_t *, uchar_t *, uchar_t *, uchar_t *,
+ ushort_t, ill_t *, zoneid_t, uint_t, tsol_gc_t *, ip_stack_t *);
+extern int ire_init_v6(ire_t *, const in6_addr_t *, const in6_addr_t *,
+ const in6_addr_t *, ushort_t, ill_t *, zoneid_t, uint_t, tsol_gc_t *,
ip_stack_t *);
-extern ire_t *ire_ctable_lookup(ipaddr_t, ipaddr_t, int, const ipif_t *,
- zoneid_t, const struct ts_label_s *, int, ip_stack_t *);
+extern int ire_init_common(ire_t *, ushort_t, ill_t *, zoneid_t, uint_t,
+ uchar_t, tsol_gc_t *, ip_stack_t *);
-extern ire_t *ire_ctable_lookup_v6(const in6_addr_t *, const in6_addr_t *,
- int, const ipif_t *, zoneid_t, const struct ts_label_s *, int,
- ip_stack_t *);
+extern ire_t *ire_create_v6(const in6_addr_t *, const in6_addr_t *,
+ const in6_addr_t *, ushort_t, ill_t *, zoneid_t, uint_t,
+ tsol_gc_t *, ip_stack_t *);
extern void ire_delete(ire_t *);
-extern void ire_delete_cache_gw(ire_t *, char *);
-extern void ire_delete_cache_gw_v6(ire_t *, char *);
-extern void ire_delete_cache_v6(ire_t *, char *);
extern void ire_delete_v6(ire_t *);
-extern void ire_expire(ire_t *, char *);
+/*
+ * ire_pref used to make sure we don't set up routing loops in the ire_dep
+ * chain.
+ */
+extern int ire_pref(ire_t *);
+extern boolean_t ire_dep_build(ire_t *[], uint_t [], uint_t);
+extern void ire_dep_delete_if_clone(ire_t *);
+extern void ire_dep_incr_generation(ire_t *);
+extern void ire_dep_remove(ire_t *);
+extern void ire_dep_unbuild(ire_t *[], uint_t);
+extern uint_t ire_dep_validate_generations(ire_t *);
+extern void ire_dep_invalidate_generations(ire_t *);
+extern boolean_t ire_determine_nce_capable(ire_t *);
extern void ire_flush_cache_v4(ire_t *, int);
extern void ire_flush_cache_v6(ire_t *, int);
+extern ire_t *ire_ftable_lookup_v4(ipaddr_t, ipaddr_t, ipaddr_t, int,
+ const ill_t *, zoneid_t, const struct ts_label_s *, int, uint32_t,
+ ip_stack_t *, uint_t *);
extern ire_t *ire_ftable_lookup_v6(const in6_addr_t *, const in6_addr_t *,
- const in6_addr_t *, int, const ipif_t *, ire_t **, zoneid_t,
- uint32_t, const struct ts_label_s *, int, ip_stack_t *);
-
-extern ire_t *ire_ihandle_lookup_onlink(ire_t *);
-extern ire_t *ire_ihandle_lookup_offlink(ire_t *, ire_t *);
-extern ire_t *ire_ihandle_lookup_offlink_v6(ire_t *, ire_t *);
-
-extern boolean_t ire_local_same_lan(ire_t *, ire_t *);
-extern boolean_t ire_local_ok_across_zones(ire_t *, zoneid_t, void *,
- const struct ts_label_s *, ip_stack_t *);
-
-extern ire_t *ire_lookup_local(zoneid_t, ip_stack_t *);
-extern ire_t *ire_lookup_local_v6(zoneid_t, ip_stack_t *);
-
-extern ire_t *ire_lookup_multi(ipaddr_t, zoneid_t, ip_stack_t *);
-extern ire_t *ire_lookup_multi_v6(const in6_addr_t *, zoneid_t,
- ip_stack_t *);
-
+ const in6_addr_t *, int, const ill_t *, zoneid_t,
+ const struct ts_label_s *, int, uint32_t, ip_stack_t *, uint_t *);
+
+extern ire_t *ire_ftable_lookup_simple_v4(ipaddr_t, uint32_t, ip_stack_t *,
+ uint_t *);
+extern ire_t *ire_ftable_lookup_simple_v6(const in6_addr_t *, uint32_t,
+ ip_stack_t *, uint_t *);
+
+extern boolean_t ire_gateway_ok_zone_v4(ipaddr_t, zoneid_t, ill_t *,
+ const ts_label_t *, ip_stack_t *, boolean_t);
+extern boolean_t ire_gateway_ok_zone_v6(const in6_addr_t *, zoneid_t, ill_t *,
+ const ts_label_t *, ip_stack_t *, boolean_t);
+
+extern ire_t *ire_alt_local(ire_t *, zoneid_t, const ts_label_t *,
+ const ill_t *, uint_t *);
+
+extern ill_t *ire_lookup_multi_ill_v4(ipaddr_t, zoneid_t, ip_stack_t *,
+ boolean_t *, ipaddr_t *);
+extern ill_t *ire_lookup_multi_ill_v6(const in6_addr_t *, zoneid_t,
+ ip_stack_t *, boolean_t *, in6_addr_t *);
+
+extern ire_t *ire_nexthop(ire_t *);
+extern ill_t *ire_nexthop_ill(ire_t *);
+extern ill_t *ire_nce_ill(ire_t *);
+
+extern ire_t *ire_reject(ip_stack_t *, boolean_t);
+extern ire_t *ire_blackhole(ip_stack_t *, boolean_t);
+extern ire_t *ire_multicast(ill_t *);
+
+/* The different ire_recvfn functions */
+extern void ire_recv_forward_v4(ire_t *, mblk_t *, void *,
+ ip_recv_attr_t *);
+extern void ire_recv_noroute_v4(ire_t *, mblk_t *, void *,
+ ip_recv_attr_t *);
+extern void ire_recv_broadcast_v4(ire_t *, mblk_t *, void *,
+ ip_recv_attr_t *);
+extern void ire_recv_multicast_v4(ire_t *, mblk_t *, void *,
+ ip_recv_attr_t *);
+extern void ire_recv_multirt_v4(ire_t *, mblk_t *, void *,
+ ip_recv_attr_t *);
+extern void ire_recv_loopback_v4(ire_t *, mblk_t *, void *,
+ ip_recv_attr_t *);
+extern void ire_recv_local_v4(ire_t *, mblk_t *, void *,
+ ip_recv_attr_t *);
+extern void ire_recv_noaccept_v4(ire_t *, mblk_t *, void *,
+ ip_recv_attr_t *);
+
+extern void ire_recv_forward_v6(ire_t *, mblk_t *, void *,
+ ip_recv_attr_t *);
+extern void ire_recv_noroute_v6(ire_t *, mblk_t *, void *,
+ ip_recv_attr_t *);
+extern void ire_recv_multicast_v6(ire_t *, mblk_t *, void *,
+ ip_recv_attr_t *);
+extern void ire_recv_multirt_v6(ire_t *, mblk_t *, void *,
+ ip_recv_attr_t *);
+extern void ire_recv_loopback_v6(ire_t *, mblk_t *, void *,
+ ip_recv_attr_t *);
+extern void ire_recv_local_v6(ire_t *, mblk_t *, void *, ip_recv_attr_t *);
+extern void ire_recv_noaccept_v6(ire_t *, mblk_t *, void *,
+ ip_recv_attr_t *);
+
+extern void irb_refhold(irb_t *);
+extern void irb_refhold_locked(irb_t *);
+extern void irb_refrele(irb_t *);
+extern void irb_increment_generation(irb_t *);
+
+extern void ire_refhold(ire_t *);
+extern void ire_refhold_notr(ire_t *);
+extern void ire_refhold_locked(ire_t *);
extern void ire_refrele(ire_t *);
extern void ire_refrele_notr(ire_t *);
-extern ire_t *ire_route_lookup(ipaddr_t, ipaddr_t, ipaddr_t, int,
- const ipif_t *, ire_t **, zoneid_t, const struct ts_label_s *, int,
- ip_stack_t *);
-
-extern ire_t *ire_route_lookup_v6(const in6_addr_t *, const in6_addr_t *,
- const in6_addr_t *, int, const ipif_t *, ire_t **, zoneid_t,
- const struct ts_label_s *, int, ip_stack_t *);
-
-extern ill_t *ire_to_ill(const ire_t *);
+extern void ire_make_condemned(ire_t *);
+extern boolean_t ire_no_good(ire_t *);
+extern nce_t *ire_handle_condemned_nce(nce_t *, ire_t *, ipha_t *, ip6_t *,
+ boolean_t);
+
+extern ire_t *ire_round_robin(irb_t *, ire_ftable_args_t *, uint_t,
+ ire_t *, ip_stack_t *);
+
+extern ire_t *ire_route_recursive_v4(ipaddr_t, uint_t, const ill_t *,
+ zoneid_t, const ts_label_t *, uint_t, boolean_t, uint32_t, ip_stack_t *,
+ ipaddr_t *, tsol_ire_gw_secattr_t **, uint_t *);
+extern ire_t *ire_route_recursive_v6(const in6_addr_t *, uint_t,
+ const ill_t *, zoneid_t, const ts_label_t *, uint_t, boolean_t, uint32_t,
+ ip_stack_t *, in6_addr_t *, tsol_ire_gw_secattr_t **, uint_t *);
+extern ire_t *ire_route_recursive_dstonly_v4(ipaddr_t, boolean_t,
+ uint32_t, ip_stack_t *);
+extern ire_t *ire_route_recursive_dstonly_v6(const in6_addr_t *, boolean_t,
+ uint32_t, ip_stack_t *);
+extern ire_t *ire_route_recursive_impl_v4(ire_t *ire, ipaddr_t, uint_t,
+ const ill_t *, zoneid_t, const ts_label_t *, uint_t, boolean_t, uint32_t,
+ ip_stack_t *, ipaddr_t *, tsol_ire_gw_secattr_t **, uint_t *);
+extern ire_t *ire_route_recursive_impl_v6(ire_t *ire, const in6_addr_t *,
+ uint_t, const ill_t *, zoneid_t, const ts_label_t *, uint_t, boolean_t,
+ uint32_t, ip_stack_t *, in6_addr_t *, tsol_ire_gw_secattr_t **, uint_t *);
+
+/* The different ire_sendfn functions */
+extern int ire_send_local_v4(ire_t *, mblk_t *, void *,
+ ip_xmit_attr_t *, uint32_t *);
+extern int ire_send_multirt_v4(ire_t *, mblk_t *, void *,
+ ip_xmit_attr_t *, uint32_t *);
+extern int ire_send_noroute_v4(ire_t *, mblk_t *, void *,
+ ip_xmit_attr_t *, uint32_t *);
+extern int ire_send_multicast_v4(ire_t *, mblk_t *, void *,
+ ip_xmit_attr_t *, uint32_t *);
+extern int ire_send_broadcast_v4(ire_t *, mblk_t *, void *,
+ ip_xmit_attr_t *, uint32_t *);
+extern int ire_send_wire_v4(ire_t *, mblk_t *, void *,
+ ip_xmit_attr_t *, uint32_t *);
+extern int ire_send_local_v6(ire_t *, mblk_t *, void *,
+ ip_xmit_attr_t *, uint32_t *);
+extern int ire_send_multirt_v6(ire_t *, mblk_t *, void *,
+ ip_xmit_attr_t *, uint32_t *);
+extern int ire_send_noroute_v6(ire_t *, mblk_t *, void *,
+ ip_xmit_attr_t *, uint32_t *);
+extern int ire_send_multicast_v6(ire_t *, mblk_t *, void *,
+ ip_xmit_attr_t *, uint32_t *);
+extern int ire_send_wire_v6(ire_t *, mblk_t *, void *,
+ ip_xmit_attr_t *, uint32_t *);
+
+extern nce_t *ire_to_nce_pkt(ire_t *, mblk_t *);
+extern nce_t *ire_to_nce(ire_t *, ipaddr_t, const in6_addr_t *);
+
+/* Different ire_postfragfn functions */
+extern int ip_xmit(mblk_t *, struct nce_s *,
+ iaflags_t, uint_t, uint32_t, zoneid_t, zoneid_t, uintptr_t *);
+extern int ip_postfrag_loopcheck(mblk_t *, struct nce_s *,
+ iaflags_t, uint_t, uint32_t, zoneid_t, zoneid_t, uintptr_t *);
+extern int ip_postfrag_multirt_v4(mblk_t *, struct nce_s *,
+ iaflags_t, uint_t, uint32_t, zoneid_t, zoneid_t, uintptr_t *);
+extern int ip_postfrag_multirt_v6(mblk_t *, struct nce_s *,
+ iaflags_t, uint_t, uint32_t, zoneid_t, zoneid_t, uintptr_t *);
+
+extern void ip_postfrag_loopback(mblk_t *, struct nce_s *,
+ iaflags_t, uint_t, zoneid_t);
+extern int ire_revalidate_nce(ire_t *);
+
+extern ire_t *ip_select_route_pkt(mblk_t *, ip_xmit_attr_t *,
+ uint_t *, int *, boolean_t *);
+extern ire_t *ip_select_route(const in6_addr_t *, ip_xmit_attr_t *,
+ uint_t *, in6_addr_t *, int *, boolean_t *);
+extern ire_t *ip_select_route_v4(ipaddr_t, ip_xmit_attr_t *,
+ uint_t *, ipaddr_t *, int *, boolean_t *);
+extern ire_t *ip_select_route_v6(const in6_addr_t *, ip_xmit_attr_t *,
+ uint_t *, in6_addr_t *, int *, boolean_t *);
extern void ire_walk(pfv_t, void *, ip_stack_t *);
extern void ire_walk_ill(uint_t, uint_t, pfv_t, void *, ill_t *);
-extern void ire_walk_ill_v4(uint_t, uint_t, pfv_t, void *, ill_t *);
-extern void ire_walk_ill_v6(uint_t, uint_t, pfv_t, void *, ill_t *);
extern void ire_walk_v4(pfv_t, void *, zoneid_t, ip_stack_t *);
extern void ire_walk_ill_tables(uint_t match_flags, uint_t ire_type,
pfv_t func, void *arg, size_t ftbl_sz, size_t htbl_sz,
- irb_t **ipftbl, size_t ctbl_sz, irb_t *ipctbl, ill_t *ill,
+ irb_t **ipftbl, ill_t *ill,
zoneid_t zoneid, ip_stack_t *);
extern void ire_walk_v6(pfv_t, void *, zoneid_t, ip_stack_t *);
-extern boolean_t ire_multirt_lookup(ire_t **, ire_t **, uint32_t, int *,
- const struct ts_label_s *, ip_stack_t *);
-extern boolean_t ire_multirt_need_resolve(ipaddr_t,
- const struct ts_label_s *, ip_stack_t *);
-extern boolean_t ire_multirt_lookup_v6(ire_t **, ire_t **, uint32_t,
- const struct ts_label_s *, ip_stack_t *);
-extern boolean_t ire_multirt_need_resolve_v6(const in6_addr_t *,
- const struct ts_label_s *, ip_stack_t *);
-
-extern ire_t *ipif_lookup_multi_ire(ipif_t *, ipaddr_t);
-extern ire_t *ipif_lookup_multi_ire_v6(ipif_t *, const in6_addr_t *);
-
-extern ire_t *ire_get_next_bcast_ire(ire_t *, ire_t *);
-extern ire_t *ire_get_next_default_ire(ire_t *, ire_t *);
-
-extern void ire_arpresolve(ire_t *);
-extern void ire_freemblk(ire_t *);
extern boolean_t ire_match_args(ire_t *, ipaddr_t, ipaddr_t, ipaddr_t,
- int, const ipif_t *, zoneid_t, uint32_t, const struct ts_label_s *, int,
- queue_t *);
-extern int ire_nce_init(ire_t *, struct nce_s *);
+ int, const ill_t *, zoneid_t, const struct ts_label_s *, int);
+extern boolean_t ire_match_args_v6(ire_t *, const in6_addr_t *,
+ const in6_addr_t *, const in6_addr_t *, int, const ill_t *, zoneid_t,
+ const ts_label_t *, int);
+
+extern struct nce_s *arp_nce_init(ill_t *, in_addr_t, int);
extern boolean_t ire_walk_ill_match(uint_t, uint_t, ire_t *, ill_t *,
zoneid_t, ip_stack_t *);
-extern ire_t *ire_arpresolve_lookup(ipaddr_t, ipaddr_t, ipif_t *, zoneid_t,
- ip_stack_t *, queue_t *);
+extern void ire_increment_generation(ire_t *);
+extern void ire_increment_multicast_generation(ip_stack_t *, boolean_t);
#endif /* _KERNEL */
diff --git a/usr/src/uts/common/inet/ip_multi.h b/usr/src/uts/common/inet/ip_multi.h
index 7dee133967..c41ef99e3e 100644
--- a/usr/src/uts/common/inet/ip_multi.h
+++ b/usr/src/uts/common/inet/ip_multi.h
@@ -49,18 +49,9 @@ typedef enum {
} ilg_stat_t;
/*
- * Flags shared via ips_mrt_flags, used by mcast_restart_timers_thread().
- */
-typedef enum {
- IP_MRT_STOP = 0x1, /* request to stop thread */
- IP_MRT_DONE = 0x2, /* indication that thread is stopped */
- IP_MRT_RUN = 0x4 /* request to restart timers */
-} ip_mrt_flags_t;
-
-/*
* Extern functions
*/
-extern mblk_t *igmp_input(queue_t *, mblk_t *, ill_t *);
+extern mblk_t *igmp_input(mblk_t *, ip_recv_attr_t *);
extern void igmp_joingroup(ilm_t *);
extern void igmp_leavegroup(ilm_t *);
extern void igmp_slowtimo(void *);
@@ -73,85 +64,64 @@ extern void mld_statechange(ilm_t *, mcast_record_t, slist_t *);
extern void mld_slowtimo(void *);
extern void ilg_delete_all(conn_t *connp);
-extern ilg_t *ilg_lookup_ill_v6(conn_t *, const in6_addr_t *,
- ill_t *);
-extern ilg_t *ilg_lookup_ill_withsrc(conn_t *, ipaddr_t, ipaddr_t,
- ill_t *);
-extern ilg_t *ilg_lookup_ill_withsrc_v6(conn_t *, const in6_addr_t *,
- const in6_addr_t *, ill_t *);
+extern boolean_t conn_hasmembers_ill_withsrc_v4(conn_t *, ipaddr_t,
+ ipaddr_t, ill_t *);
+extern boolean_t conn_hasmembers_ill_withsrc_v6(conn_t *,
+ const in6_addr_t *, const in6_addr_t *, ill_t *);
extern void ill_leave_multicast(ill_t *);
extern void ill_recover_multicast(ill_t *);
-extern int ip_get_dlpi_mbcast(ill_t *, mblk_t *);
-
-extern void ilm_free(ipif_t *);
-extern ilm_t *ilm_lookup_ill(ill_t *, ipaddr_t, zoneid_t);
-extern ilm_t *ilm_lookup_ill_v6(ill_t *, const in6_addr_t *,
- boolean_t, zoneid_t);
-extern ilm_t *ilm_lookup_ipif(ipif_t *, ipaddr_t);
-
-extern int ilm_numentries_v6(ill_t *, const in6_addr_t *);
-extern int ilm_walk_ipif(ipif_t *);
-extern int ilm_walk_ill(ill_t *);
-extern void ilm_walker_cleanup(ill_t *);
-extern int ip_ll_send_disabmulti_req(ill_t *, const in6_addr_t *);
-extern int ip_ll_send_enabmulti_req(ill_t *, const in6_addr_t *);
-
-extern int ip_addmulti(ipaddr_t, ipif_t *, ilg_stat_t,
- mcast_record_t, slist_t *);
-extern int ip_addmulti_v6(const in6_addr_t *, ill_t *,
- zoneid_t, ilg_stat_t, mcast_record_t, slist_t *);
-extern int ip_delmulti(ipaddr_t, ipif_t *, boolean_t, boolean_t);
-extern int ip_delmulti_v6(const in6_addr_t *, ill_t *,
- zoneid_t, boolean_t, boolean_t);
+extern void ip_dlur_to_mhi(ill_t *, mblk_t *,
+ struct mac_header_info_s *);
+
+/* These make up the data path interface used by ip_output and ip_input */
+extern boolean_t ill_hasmembers_v4(ill_t *, ipaddr_t);
+extern boolean_t ill_hasmembers_v6(ill_t *, const in6_addr_t *);
+extern boolean_t ill_hasmembers_otherzones_v4(ill_t *, ipaddr_t,
+ zoneid_t);
+extern boolean_t ill_hasmembers_otherzones_v6(ill_t *,
+ const in6_addr_t *, zoneid_t);
+extern zoneid_t ill_hasmembers_nextzone_v4(ill_t *, ipaddr_t, zoneid_t);
+extern zoneid_t ill_hasmembers_nextzone_v6(ill_t *, const in6_addr_t *,
+ zoneid_t);
+
+extern ilm_t *ip_addmulti(const in6_addr_t *, ill_t *, zoneid_t,
+ int *);
+extern int ip_delmulti(ilm_t *);
+extern int ip_mforward(mblk_t *, ip_recv_attr_t *);
+extern void ip_mroute_decap(mblk_t *, ip_recv_attr_t *);
extern int ill_join_allmulti(ill_t *);
extern void ill_leave_allmulti(ill_t *);
extern int ip_join_allmulti(uint_t, boolean_t, ip_stack_t *);
extern int ip_leave_allmulti(uint_t, boolean_t, ip_stack_t *);
extern void ip_purge_allmulti(ill_t *);
-extern void ip_multicast_loopback(queue_t *, ill_t *, mblk_t *,
- int, zoneid_t);
-extern int ip_mforward(ill_t *, ipha_t *, mblk_t *);
-extern void ip_mroute_decap(queue_t *, mblk_t *, ill_t *);
extern int ip_mroute_mrt(mblk_t *, ip_stack_t *);
extern int ip_mroute_stats(mblk_t *, ip_stack_t *);
extern int ip_mroute_vif(mblk_t *, ip_stack_t *);
-extern int ip_mrouter_done(mblk_t *, ip_stack_t *);
-extern int ip_mrouter_get(int, queue_t *, uchar_t *);
-extern int ip_mrouter_set(int, queue_t *, int, uchar_t *, int,
- mblk_t *);
+extern int ip_mrouter_done(ip_stack_t *);
+extern int ip_mrouter_get(int, conn_t *, uchar_t *);
+extern int ip_mrouter_set(int, conn_t *, int, uchar_t *, int);
extern void ip_mrouter_stack_init(ip_stack_t *);
extern void ip_mrouter_stack_destroy(ip_stack_t *);
-extern int ip_opt_add_group(conn_t *, boolean_t, ipaddr_t,
- ipaddr_t, uint_t *, mcast_record_t, ipaddr_t, mblk_t *);
-extern int ip_opt_delete_group(conn_t *, boolean_t, ipaddr_t,
- ipaddr_t, uint_t *, mcast_record_t, ipaddr_t, mblk_t *);
-extern int ip_opt_add_group_v6(conn_t *, boolean_t,
- const in6_addr_t *, int, mcast_record_t, const in6_addr_t *, mblk_t *);
-extern int ip_opt_delete_group_v6(conn_t *, boolean_t,
- const in6_addr_t *, int, mcast_record_t, const in6_addr_t *, mblk_t *);
+extern int ip_opt_add_group(conn_t *, boolean_t,
+ const in6_addr_t *, ipaddr_t, uint_t, mcast_record_t, const in6_addr_t *);
+extern int ip_opt_delete_group(conn_t *, boolean_t,
+ const in6_addr_t *, ipaddr_t, uint_t, mcast_record_t, const in6_addr_t *);
extern int mrt_ioctl(ipif_t *, sin_t *, queue_t *, mblk_t *,
ip_ioctl_cmd_t *, void *);
extern int ip_sioctl_msfilter(ipif_t *, sin_t *, queue_t *,
mblk_t *, ip_ioctl_cmd_t *, void *);
-extern int ip_extract_msfilter(queue_t *, mblk_t *,
- const ip_ioctl_cmd_t *, cmd_info_t *, ipsq_func_t);
extern int ip_copyin_msfilter(queue_t *, mblk_t *);
-extern void ip_wput_ctl(queue_t *, mblk_t *);
-
-extern int pim_input(queue_t *, mblk_t *, ill_t *);
-extern void reset_conn_ipif(ipif_t *);
-extern void reset_conn_ill(ill_t *);
+extern mblk_t *pim_input(mblk_t *, ip_recv_attr_t *);
+extern void update_conn_ill(ill_t *, ip_stack_t *);
extern void reset_mrt_ill(ill_t *);
extern void reset_mrt_vif_ipif(ipif_t *);
-extern void mcast_restart_timers_thread(ip_stack_t *);
+extern void igmp_start_timers(unsigned, ip_stack_t *);
+extern void mld_start_timers(unsigned, ip_stack_t *);
extern void ilm_inactive(ilm_t *);
-extern ilm_t *ilm_walker_start(ilm_walker_t *, ill_t *);
-extern ilm_t *ilm_walker_step(ilm_walker_t *, ilm_t *);
-extern void ilm_walker_finish(ilm_walker_t *);
#endif /* _KERNEL */
diff --git a/usr/src/uts/common/inet/ip_ndp.h b/usr/src/uts/common/inet/ip_ndp.h
index c1a48b1f1a..21c907f3f3 100644
--- a/usr/src/uts/common/inet/ip_ndp.h
+++ b/usr/src/uts/common/inet/ip_ndp.h
@@ -35,7 +35,7 @@
/*
* Internal definitions for the kernel implementation of the IPv6
- * Neighbor Discovery Protocol (NDP).
+ * Neighbor Discovery Protocol (NDP) and Address Resolution Protocol (ARP).
*/
#ifdef __cplusplus
@@ -48,131 +48,149 @@ extern "C" {
* callbacks set up with ip2mac interface, waiting for result
* of neighbor resolution.
*/
-typedef struct nce_cb_s {
- list_node_t nce_cb_node;
- void *nce_cb_id;
- uint32_t nce_cb_flags;
- ip2mac_callback_t *nce_cb_func;
- void *nce_cb_arg;
-} nce_cb_t;
+typedef struct ncec_cb_s {
+ list_node_t ncec_cb_node; /* next entry in list */
+ void *ncec_cb_id;
+ uint32_t ncec_cb_flags;
+ ip2mac_callback_t *ncec_cb_func;
+ void *ncec_cb_arg;
+} ncec_cb_t;
#define NCE_CB_DISPATCHED 0x00000001
/*
- * NDP Cache Entry
+ * Core information for tracking Neighbor Reachability is kept in the
+ * ncec_s/ncec_t. The information contained in the ncec_t does not contain
+ * any link-specific details other than the pointer to the ill_t itself.
+ * The link-specific information is tracked in the nce_t structure.
*/
-typedef struct nce_s {
- struct nce_s *nce_next; /* Hash chain next pointer */
- struct nce_s **nce_ptpn; /* Pointer to previous next */
- struct ill_s *nce_ill; /* Associated ill */
- uint16_t nce_flags; /* See below */
- uint16_t nce_state; /* See reachability states in if.h */
- int16_t nce_pcnt; /* Probe counter */
- uint16_t nce_rcnt; /* Retransmit counter */
- in6_addr_t nce_addr; /* address of the nighbor */
- in6_addr_t nce_mask; /* If not all ones, mask allows an */
- /* entry to respond to requests for a group of addresses, for */
- /* instantance multicast addresses */
- in6_addr_t nce_extract_mask; /* For mappings */
- uint32_t nce_ll_extract_start; /* For mappings */
-#define nce_first_mp_to_free nce_fp_mp
- mblk_t *nce_fp_mp; /* link layer fast path mp */
- mblk_t *nce_res_mp; /* DL_UNITDATA_REQ */
- mblk_t *nce_qd_mp; /* Head outgoing queued packets */
-#define nce_last_mp_to_free nce_qd_mp
- mblk_t *nce_timer_mp; /* NDP timer mblk */
- mblk_t *nce_mp; /* mblk we are in, last to be freed */
- uint64_t nce_last; /* Time last reachable in msec */
- uint32_t nce_refcnt; /* nce active usage count */
- kmutex_t nce_lock; /* See comments on top for what */
+struct ncec_s {
+ struct ncec_s *ncec_next; /* Hash chain next pointer */
+ struct ncec_s **ncec_ptpn; /* Pointer to previous next */
+ struct ill_s *ncec_ill; /* Associated ill */
+ uint16_t ncec_flags; /* See below */
+ uint16_t ncec_state; /* See reachability states in if.h */
+ int16_t ncec_pcnt; /* Probe counter */
+ uint16_t ncec_rcnt; /* Retransmit counter */
+ in6_addr_t ncec_addr; /* address of the neighbor */
+ uchar_t *ncec_lladdr;
+ mblk_t *ncec_qd_mp; /* Head outgoing queued packets */
+ uint64_t ncec_last; /* Time last reachable in msec */
+ uint32_t ncec_refcnt; /* ncec active usage count */
+ kmutex_t ncec_lock; /* See comments on top for what */
/* this field protects */
- int nce_unsolicit_count; /* Unsolicited Adv count */
- struct nce_s *nce_fastpath; /* for fastpath list */
- timeout_id_t nce_timeout_id;
- uchar_t nce_ipversion; /* IPv4(ARP)/IPv6(NDP) version */
- uint_t nce_defense_count; /* number of NDP conflicts */
- uint_t nce_defense_time; /* last time defended (secs) */
- uint64_t nce_init_time; /* time when it was set to ND_INITIAL */
- boolean_t nce_trace_disable; /* True when alloc fails */
- list_t nce_cb;
- uint_t nce_cb_walker_cnt;
+ int ncec_unsolicit_count; /* Unsolicited Adv count */
+ timeout_id_t ncec_timeout_id;
+ uchar_t ncec_ipversion; /* IPv4(ARP)/IPv6(NDP) version */
+ uint_t ncec_defense_count; /* number of NDP conflicts */
+ uint_t ncec_last_time_defended; /* last time defended (secs) */
+ uint64_t ncec_init_time; /* time when it was set to ND_INITIAL */
+ boolean_t ncec_trace_disable; /* True when alloc fails */
+ /*
+ * interval to keep track of DAD probes.
+ */
+ clock_t ncec_xmit_interval;
+ ip_stack_t *ncec_ipst; /* Does not have a netstack_hold */
+ list_t ncec_cb; /* callbacks waiting for resolution */
+ uint_t ncec_cb_walker_cnt;
+ uint_t ncec_nprobes;
+ uint_t ncec_lladdr_length;
+};
+
+/*
+ * The nce_t list hangs off the ill_s and tracks information that depends
+ * on the underlying physical link. Thus when the ill goes down,
+ * the nce_t list has to be flushed. This is done as part of ill_delete()
+ *
+ * When the fastpath ack comes back in ill_fastpath_ack we call
+ * nce_fastpath_update to update the nce_t. We never actually
+ * flush the fastpath list, which is kept as an index into the
+ * ncec_t structures.
+ *
+ * When we ndp_delete, we remove the nce entries pointing
+ * at the dying ncec from the ill_fastpath_list chain.
+ *
+ */
+struct nce_s {
+ list_node_t nce_node;
+ ill_t *nce_ill;
+ boolean_t nce_is_condemned;
+ in6_addr_t nce_addr;
+ /*
+ * link-layer specific fields below
+ */
+ mblk_t *nce_dlur_mp; /* DL_UNITDATA_REQ mp */
+ mblk_t *nce_fp_mp; /* fast path mp */
+ struct ncec_s *nce_common;
+ kmutex_t nce_lock;
+ uint32_t nce_refcnt;
uint_t nce_ipif_cnt; /* number of ipifs with the nce_addr */
/* as their local address */
-} nce_t;
+};
/*
* The ndp_g_t structure contains protocol specific information needed
* to synchronize and manage neighbor cache entries for IPv4 and IPv6.
* There are 2 such structures, ips_ndp4 and ips_ndp6.
* ips_ndp6 contains the data structures needed for IPv6 Neighbor Discovery.
- * ips_ndp4 has IPv4 link layer info in its nce_t structures
- * Note that the nce_t is not currently used as the arp cache itself;
- * it is used for the following purposes:
- * - queue packets in nce_qd_mp while waiting for arp resolution to complete
- * - nce_{res, fp}_mp are used to track DL_UNITDATA request/responses.
- * - track state of ARP resolution in the nce_state;
+ * ips_ndp4 contains the data structures for IPv4 ARP.
*
* Locking notes:
* ndp_g_lock protects neighbor cache tables access and
- * insertion/removal of cache entries into/from these tables.
- * nce_lock protects nce_pcnt, nce_rcnt, nce_qd_mp nce_state, nce_res_mp,
- * nce_refcnt, nce_last, and nce_cb_walker_cnt.
- * nce_refcnt is incremented for every ire pointing to this nce and
- * every time ndp_lookup() finds an nce.
- * Should there be a need to obtain nce_lock and ndp_g_lock, ndp_g_lock is
+ * insertion/removal of cache entries into/from these tables. The ncec_lock
+ * and nce_lock protect fields in the ncec_t and nce_t structures.
+ * Should there be a need to obtain nce[c]_lock and ndp_g_lock, ndp_g_lock is
* acquired first.
- * To avoid becoming exclusive when deleting NCEs, ndp_walk() routine holds
- * the ndp_g_lock (i.e global lock) and marks NCEs to be deleted with
- * NCE_F_CONDEMNED. When all active users of such NCEs are gone the walk
- * routine passes a list for deletion to nce_ire_delete_list().
- *
- * When the link-layer address of some onlink host changes, ARP will send
- * an AR_CN_ANNOUNCE message to ip so that stale neighbor-cache
- * information will not get used. This message is processed in ip_arp_news()
- * by walking the nce list, and updating as appropriate. The ndp_g_hw_change
- * flag is set by ip_arp_news() to notify nce_t users that ip_arp_news() is
- * in progress.
*/
typedef struct ndp_g_s {
kmutex_t ndp_g_lock; /* Lock protecting cache hash table */
- nce_t *nce_mask_entries; /* mask not all ones */
- nce_t *nce_hash_tbl[NCE_TABLE_SIZE];
+ ncec_t *nce_hash_tbl[NCE_TABLE_SIZE];
int ndp_g_walker; /* # of active thread walking hash list */
boolean_t ndp_g_walker_cleanup; /* true implies defer deletion. */
- int ndp_g_hw_change; /* non-zero if nce flush in progress */
} ndp_g_t;
-#define NDP_HW_CHANGE_INCR(ndp) { \
- mutex_enter(&(ndp)->ndp_g_lock); \
- (ndp)->ndp_g_hw_change++; \
- mutex_exit(&(ndp)->ndp_g_lock); \
-}
-
-#define NDP_HW_CHANGE_DECR(ndp) { \
- mutex_enter(&(ndp)->ndp_g_lock); \
- (ndp)->ndp_g_hw_change--; \
- mutex_exit(&(ndp)->ndp_g_lock); \
-}
-
-/* nce_flags */
-#define NCE_F_PERMANENT 0x1
-#define NCE_F_MAPPING 0x2
+/* ncec_flags */
+#define NCE_F_MYADDR 0x1 /* ipif exists for the ncec_addr */
+#define NCE_F_UNVERIFIED 0x2 /* DAD in progress. */
#define NCE_F_ISROUTER 0x4
-/* unused 0x8 */
+#define NCE_F_FAST 0x8
+
+/*
+ * NCE_F_NONUD is used to disable IPv6 Neighbor Unreachability Detection or
+ * IPv4 aging and maps to the ATF_PERM flag for arp(1m)
+ */
#define NCE_F_NONUD 0x10
+
#define NCE_F_ANYCAST 0x20
#define NCE_F_CONDEMNED 0x40
#define NCE_F_UNSOL_ADV 0x80
#define NCE_F_BCAST 0x100
+#define NCE_F_MCAST 0x200
+
+/*
+ * NCE_F_PUBLISH is set for all ARP/ND entries that we announce. This
+ * includes locally configured addresses as well as those that we proxy for.
+ */
+#define NCE_F_PUBLISH 0x400
+
+/*
+ * NCE_F_AUTHORITY is set for any address that we have authoritative
+ * information for. This includes locally configured addresses as well
+ * as statically configured arp entries that are set up using the "permanent"
+ * option described in arp(1m). The NCE_F_AUTHORITY asserts that we would
+ * reject any updates for that nce's (host, link-layer-address) information
+ */
+#define NCE_F_AUTHORITY 0x800
-#define NCE_EXTERNAL_FLAGS_MASK \
- (NCE_F_PERMANENT | NCE_F_MAPPING | NCE_F_ISROUTER | NCE_F_NONUD | \
- NCE_F_ANYCAST | NCE_F_UNSOL_ADV)
+#define NCE_F_DELAYED 0x1000 /* rescheduled on dad_defend_rate */
+#define NCE_F_STATIC 0x2000
/* State REACHABLE, STALE, DELAY or PROBE */
-#define NCE_ISREACHABLE(nce) \
- (((((nce)->nce_state) >= ND_REACHABLE) && \
- ((nce)->nce_state) <= ND_PROBE))
+#define NCE_ISREACHABLE(ncec) \
+ (((((ncec)->ncec_state) >= ND_REACHABLE) && \
+ ((ncec)->ncec_state) <= ND_PROBE))
+
+#define NCE_ISCONDEMNED(ncec) ((ncec)->ncec_flags & NCE_F_CONDEMNED)
/* NDP flags set in SOL/ADV requests */
#define NDP_UNICAST 0x1
@@ -184,95 +202,14 @@ typedef struct ndp_g_s {
/* Number of packets queued in NDP for a neighbor */
#define ND_MAX_Q 4
-
-#ifdef DEBUG
-#define NCE_TRACE_REF(nce) nce_trace_ref(nce)
-#define NCE_UNTRACE_REF(nce) nce_untrace_ref(nce)
-#else
-#define NCE_TRACE_REF(nce)
-#define NCE_UNTRACE_REF(nce)
-#endif
-
-#define NCE_REFHOLD(nce) { \
- mutex_enter(&(nce)->nce_lock); \
- (nce)->nce_refcnt++; \
- ASSERT((nce)->nce_refcnt != 0); \
- NCE_TRACE_REF(nce); \
- mutex_exit(&(nce)->nce_lock); \
-}
-
-#define NCE_REFHOLD_NOTR(nce) { \
- mutex_enter(&(nce)->nce_lock); \
- (nce)->nce_refcnt++; \
- ASSERT((nce)->nce_refcnt != 0); \
- mutex_exit(&(nce)->nce_lock); \
-}
-
-#define NCE_REFHOLD_LOCKED(nce) { \
- ASSERT(MUTEX_HELD(&(nce)->nce_lock)); \
- (nce)->nce_refcnt++; \
- NCE_TRACE_REF(nce); \
-}
-
-/* nce_inactive destroys the mutex thus no mutex_exit is needed */
-#define NCE_REFRELE(nce) { \
- mutex_enter(&(nce)->nce_lock); \
- NCE_UNTRACE_REF(nce); \
- ASSERT((nce)->nce_refcnt != 0); \
- if (--(nce)->nce_refcnt == 0) \
- ndp_inactive(nce); \
- else { \
- mutex_exit(&(nce)->nce_lock);\
- } \
-}
-
-#define NCE_REFRELE_NOTR(nce) { \
- mutex_enter(&(nce)->nce_lock); \
- ASSERT((nce)->nce_refcnt != 0); \
- if (--(nce)->nce_refcnt == 0) \
- ndp_inactive(nce); \
- else { \
- mutex_exit(&(nce)->nce_lock);\
- } \
-}
-
-#define NDP_RESTART_TIMER(nce, ms) { \
- ASSERT(!MUTEX_HELD(&(nce)->nce_lock)); \
- if ((nce)->nce_timeout_id != 0) { \
- /* Ok to untimeout bad id. we don't hold a lock. */ \
- (void) untimeout((nce)->nce_timeout_id); \
- } \
- mutex_enter(&(nce)->nce_lock); \
- /* Don't start the timer if the nce has been deleted */ \
- if (!((nce)->nce_flags & NCE_F_CONDEMNED)) \
- nce->nce_timeout_id = timeout(ndp_timer, nce, \
- MSEC_TO_TICK(ms) == 0 ? 1 : MSEC_TO_TICK(ms)); \
- mutex_exit(&(nce)->nce_lock); \
-}
-
-/* Structure for ndp_cache_count() */
-typedef struct {
- int ncc_total; /* Total number of NCEs */
- int ncc_host; /* NCE entries without R bit set */
-} ncc_cache_count_t;
-
-/*
- * Structure of ndp_cache_reclaim(). Each field is a fraction i.e. 1 means
- * reclaim all, N means reclaim 1/Nth of all entries, 0 means reclaim none.
- */
-typedef struct {
- int ncr_host; /* Fraction for host entries */
-} nce_cache_reclaim_t;
-
/*
- * Structure for nce_delete_hw_changed; specifies an IPv4 address to link-layer
- * address mapping. Any route that has a cached copy of a mapping for that
- * IPv4 address that doesn't match the given mapping must be purged.
+ * Structure for nce_update_hw_changed;
*/
typedef struct {
ipaddr_t hwm_addr; /* IPv4 address */
- uint_t hwm_hwlen; /* Length of hardware address (may be 0) */
+ uint_t hwm_hwlen; /* Length of hardware address (may be 0) */
uchar_t *hwm_hwaddr; /* Pointer to new hardware address, if any */
+ int hwm_flags;
} nce_hw_map_t;
/* When SAP is greater than zero address appears before SAP */
@@ -284,6 +221,15 @@ typedef struct {
((sizeof (dl_unitdata_req_t)) + ((ill)->ill_phys_addr_length)) : \
(sizeof (dl_unitdata_req_t)))
+#define NCE_MYADDR(ncec) (((ncec)->ncec_flags & NCE_F_MYADDR) != 0)
+
+/*
+ * NCE_PUBLISH() identifies the addresses that we are publishing. This
+ * includes locally configured address (NCE_MYADDR()) as well as those that
+ * we are proxying.
+ */
+#define NCE_PUBLISH(ncec) ((ncec->ncec_flags & NCE_F_PUBLISH) != 0)
+
#ifdef _BIG_ENDIAN
#define NCE_LL_SAP_COPY(ill, mp) \
{ \
@@ -327,55 +273,65 @@ typedef struct {
/* NDP Cache Entry Hash Table */
#define NCE_TABLE_SIZE 256
-extern void ndp_cache_count(nce_t *, char *);
-extern void ndp_cache_reclaim(nce_t *, char *);
-extern void ndp_delete(nce_t *);
-extern void ndp_delete_per_ill(nce_t *, uchar_t *);
-extern void ndp_fastpath_flush(nce_t *, char *);
-extern boolean_t ndp_fastpath_update(nce_t *, void *);
+extern void ip_nce_reclaim(void *);
+extern void ncec_delete(ncec_t *);
+extern void ncec_delete_per_ill(ncec_t *, uchar_t *);
+extern void nce_fastpath_update(ill_t *, mblk_t *);
extern nd_opt_hdr_t *ndp_get_option(nd_opt_hdr_t *, int, int);
-extern void ndp_inactive(nce_t *);
-extern void ndp_input(ill_t *, mblk_t *, mblk_t *);
-extern boolean_t ndp_lookup_ipaddr(in_addr_t, netstack_t *);
-extern nce_t *ndp_lookup_v6(ill_t *, boolean_t, const in6_addr_t *,
- boolean_t);
-extern nce_t *ndp_lookup_v4(ill_t *, const in_addr_t *, boolean_t);
-extern int ndp_mcastreq(ill_t *, const in6_addr_t *, uint32_t, uint32_t,
+extern void ncec_inactive(ncec_t *);
+extern void ndp_input(mblk_t *, ip_recv_attr_t *);
+extern ncec_t *ncec_lookup_illgrp_v6(ill_t *, const in6_addr_t *);
+extern ncec_t *ncec_lookup_illgrp_v4(ill_t *, const in_addr_t *);
+extern nce_t *nce_lookup_v4(ill_t *, const in_addr_t *);
+extern nce_t *nce_lookup_v6(ill_t *, const in6_addr_t *);
+extern void nce_make_unreachable(ncec_t *);
+extern mblk_t *ndp_mcastreq(ill_t *, const in6_addr_t *, uint32_t, uint32_t,
mblk_t *);
-extern int ndp_noresolver(ill_t *, const in6_addr_t *);
-extern void ndp_process(nce_t *, uchar_t *, uint32_t, boolean_t);
+extern nce_t *ndp_nce_init(ill_t *, const in6_addr_t *, int);
+extern void nce_process(ncec_t *, uchar_t *, uint32_t, boolean_t);
extern int ndp_query(ill_t *, lif_nd_req_t *);
-extern int ndp_resolver(ill_t *, const in6_addr_t *, mblk_t *, zoneid_t);
extern int ndp_sioc_update(ill_t *, lif_nd_req_t *);
extern boolean_t ndp_verify_optlen(nd_opt_hdr_t *, int);
-extern void ndp_timer(void *);
-extern void ndp_walk(ill_t *, pfi_t, void *, ip_stack_t *);
-extern void ndp_walk_common(ndp_g_t *, ill_t *, pfi_t,
+extern void nce_timer(void *);
+extern void ncec_walk(ill_t *, pfi_t, void *, ip_stack_t *);
+extern void ncec_walk_common(ndp_g_t *, ill_t *, pfi_t,
void *, boolean_t);
-extern boolean_t ndp_restart_dad(nce_t *);
-extern void ndp_do_recovery(ipif_t *);
-extern void nce_resolv_failed(nce_t *);
-extern void arp_resolv_failed(nce_t *);
-extern void nce_fastpath_list_add(nce_t *);
-extern void nce_fastpath_list_delete(nce_t *);
-extern void nce_fastpath_list_dispatch(ill_t *,
- boolean_t (*)(nce_t *, void *), void *);
-extern void nce_queue_mp_common(nce_t *, mblk_t *, boolean_t);
-extern void nce_delete_hw_changed(nce_t *, void *);
-extern void nce_fastpath(nce_t *);
-extern int ndp_add_v6(ill_t *, uchar_t *, const in6_addr_t *,
- const in6_addr_t *, const in6_addr_t *, uint32_t, uint16_t, uint16_t,
- nce_t **);
-extern int ndp_lookup_then_add_v6(ill_t *, boolean_t, uchar_t *,
- const in6_addr_t *, const in6_addr_t *, const in6_addr_t *, uint32_t,
- uint16_t, uint16_t, nce_t **);
-extern int ndp_lookup_then_add_v4(ill_t *,
- const in_addr_t *, uint16_t, nce_t **, nce_t *);
-extern void ip_ndp_resolve(nce_t *);
+extern boolean_t nce_restart_dad(ncec_t *);
+extern void ndp_resolv_failed(ncec_t *);
+extern void arp_resolv_failed(ncec_t *);
+extern void nce_fastpath_list_delete(ill_t *, ncec_t *, list_t *);
+extern void nce_queue_mp(ncec_t *, mblk_t *, boolean_t);
+extern void nce_update_hw_changed(ncec_t *, void *);
+extern int nce_lookup_then_add_v6(ill_t *, uchar_t *, uint_t,
+ const in6_addr_t *, uint16_t, uint16_t, nce_t **);
+extern int nce_lookup_then_add_v4(ill_t *, uchar_t *, uint_t,
+ const in_addr_t *, uint16_t, uint16_t, nce_t **);
+extern boolean_t nce_cmp_ll_addr(const ncec_t *, const uchar_t *, uint32_t);
+extern void nce_update(ncec_t *, uint16_t, uchar_t *);
+extern nce_t *nce_lookup_mapping(ill_t *, const in6_addr_t *);
+
+extern void nce_restart_timer(ncec_t *, uint_t);
+extern void ncec_refrele(ncec_t *);
+extern void ncec_refhold(ncec_t *);
+extern void ncec_refrele_notr(ncec_t *);
+extern void ncec_refhold_notr(ncec_t *);
+extern void nce_resolv_ok(ncec_t *);
+extern uint32_t ndp_solicit(ncec_t *, in6_addr_t, ill_t *);
+extern boolean_t ip_nce_conflict(mblk_t *, ip_recv_attr_t *, ncec_t *);
+extern boolean_t ndp_announce(ncec_t *);
+extern void ip_nce_lookup_and_update(ipaddr_t *, ipif_t *, ip_stack_t *,
+ uchar_t *, int, int);
+extern void nce_refrele(nce_t *);
+extern void nce_refhold(nce_t *);
+extern void nce_delete(nce_t *);
+extern void nce_flush(ill_t *, boolean_t);
+extern void nce_walk(ill_t *, pfi_t, void *);
+extern void ip_ndp_resolve(struct ncec_s *);
+extern void ip_addr_recover(ipsq_t *, queue_t *, mblk_t *, void *);
#ifdef DEBUG
-extern void nce_trace_ref(nce_t *);
-extern void nce_untrace_ref(nce_t *);
+extern void nce_trace_ref(ncec_t *);
+extern void nce_untrace_ref(ncec_t *);
#endif
#endif /* _KERNEL */
diff --git a/usr/src/uts/common/inet/ip_netinfo.h b/usr/src/uts/common/inet/ip_netinfo.h
index b34cf0751e..a496248e23 100644
--- a/usr/src/uts/common/inet/ip_netinfo.h
+++ b/usr/src/uts/common/inet/ip_netinfo.h
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -41,10 +41,13 @@ extern void ip_net_init(ip_stack_t *, netstack_t *);
extern void ip_net_destroy(ip_stack_t *);
extern void ipv4_hook_init(ip_stack_t *);
extern void ipv6_hook_init(ip_stack_t *);
+extern void arp_hook_init(ip_stack_t *);
extern void ipv4_hook_destroy(ip_stack_t *);
extern void ipv6_hook_destroy(ip_stack_t *);
+extern void arp_hook_destroy(ip_stack_t *);
extern void ipv4_hook_shutdown(ip_stack_t *);
extern void ipv6_hook_shutdown(ip_stack_t *);
+extern void arp_hook_shutdown(ip_stack_t *);
extern void ip_ne_queue_func(void *);
#endif /* _KERNEL */
diff --git a/usr/src/uts/common/inet/ip_rts.h b/usr/src/uts/common/inet/ip_rts.h
index 61bc451995..f5cbedd370 100644
--- a/usr/src/uts/common/inet/ip_rts.h
+++ b/usr/src/uts/common/inet/ip_rts.h
@@ -48,7 +48,8 @@ extern "C" {
#ifdef _KERNEL
extern void ip_rts_change(int, ipaddr_t, ipaddr_t,
- ipaddr_t, ipaddr_t, ipaddr_t, int, int, int, ip_stack_t *);
+ ipaddr_t, ipaddr_t, ipaddr_t, int, int,
+ int, ip_stack_t *);
extern void ip_rts_change_v6(int, const in6_addr_t *, const in6_addr_t *,
const in6_addr_t *, const in6_addr_t *, const in6_addr_t *, int, int, int,
@@ -74,15 +75,17 @@ extern size_t rts_data_msg_size(int, sa_family_t, uint_t);
extern void rts_fill_msg_v6(int, int, const in6_addr_t *,
const in6_addr_t *, const in6_addr_t *, const in6_addr_t *,
- const in6_addr_t *, const in6_addr_t *, const ipif_t *, mblk_t *,
- uint_t, const tsol_gc_t *);
+ const in6_addr_t *, const in6_addr_t *, const in6_addr_t *,
+ const ill_t *, mblk_t *, const tsol_gc_t *);
extern size_t rts_header_msg_size(int);
+extern void rts_merge_metrics(iulp_t *, const iulp_t *);
+
extern void rts_queue_input(mblk_t *, conn_t *, sa_family_t, uint_t,
ip_stack_t *);
-extern int ip_rts_request_common(queue_t *q, mblk_t *mp, conn_t *, cred_t *);
+extern int ip_rts_request_common(mblk_t *mp, conn_t *, cred_t *);
#endif /* _KERNEL */
diff --git a/usr/src/uts/common/inet/ip_stack.h b/usr/src/uts/common/inet/ip_stack.h
index b5d9715c65..d2f6c07234 100644
--- a/usr/src/uts/common/inet/ip_stack.h
+++ b/usr/src/uts/common/inet/ip_stack.h
@@ -38,6 +38,7 @@ extern "C" {
#ifdef _KERNEL
#include <sys/list.h>
+
/*
* IP statistics.
*/
@@ -46,52 +47,45 @@ extern "C" {
((ipst)->ips_ip_statistics.x.value.ui64 += (n))
typedef struct ip_stat {
- kstat_named_t ipsec_fanout_proto;
kstat_named_t ip_udp_fannorm;
kstat_named_t ip_udp_fanmb;
- kstat_named_t ip_udp_fanothers;
- kstat_named_t ip_udp_fast_path;
- kstat_named_t ip_udp_slow_path;
- kstat_named_t ip_udp_input_err;
- kstat_named_t ip_tcppullup;
- kstat_named_t ip_tcpoptions;
- kstat_named_t ip_multipkttcp;
- kstat_named_t ip_tcp_fast_path;
- kstat_named_t ip_tcp_slow_path;
- kstat_named_t ip_tcp_input_error;
+ kstat_named_t ip_recv_pullup;
kstat_named_t ip_db_ref;
- kstat_named_t ip_notaligned1;
- kstat_named_t ip_notaligned2;
- kstat_named_t ip_multimblk3;
- kstat_named_t ip_multimblk4;
- kstat_named_t ip_ipoptions;
- kstat_named_t ip_classify_fail;
+ kstat_named_t ip_notaligned;
+ kstat_named_t ip_multimblk;
kstat_named_t ip_opt;
- kstat_named_t ip_udp_rput_local;
kstat_named_t ipsec_proto_ahesp;
kstat_named_t ip_conn_flputbq;
kstat_named_t ip_conn_walk_drain;
kstat_named_t ip_out_sw_cksum;
+ kstat_named_t ip_out_sw_cksum_bytes;
kstat_named_t ip_in_sw_cksum;
- kstat_named_t ip_trash_ire_reclaim_calls;
- kstat_named_t ip_trash_ire_reclaim_success;
- kstat_named_t ip_ire_arp_timer_expired;
- kstat_named_t ip_ire_redirect_timer_expired;
- kstat_named_t ip_ire_pmtu_timer_expired;
- kstat_named_t ip_input_multi_squeue;
+ kstat_named_t ip_ire_reclaim_calls;
+ kstat_named_t ip_ire_reclaim_deleted;
+ kstat_named_t ip_nce_reclaim_calls;
+ kstat_named_t ip_nce_reclaim_deleted;
+ kstat_named_t ip_dce_reclaim_calls;
+ kstat_named_t ip_dce_reclaim_deleted;
kstat_named_t ip_tcp_in_full_hw_cksum_err;
kstat_named_t ip_tcp_in_part_hw_cksum_err;
kstat_named_t ip_tcp_in_sw_cksum_err;
- kstat_named_t ip_tcp_out_sw_cksum_bytes;
kstat_named_t ip_udp_in_full_hw_cksum_err;
kstat_named_t ip_udp_in_part_hw_cksum_err;
kstat_named_t ip_udp_in_sw_cksum_err;
- kstat_named_t ip_udp_out_sw_cksum_bytes;
- kstat_named_t ip_frag_mdt_pkt_out;
- kstat_named_t ip_frag_mdt_discarded;
- kstat_named_t ip_frag_mdt_allocfail;
- kstat_named_t ip_frag_mdt_addpdescfail;
- kstat_named_t ip_frag_mdt_allocd;
+ kstat_named_t conn_in_recvdstaddr;
+ kstat_named_t conn_in_recvopts;
+ kstat_named_t conn_in_recvif;
+ kstat_named_t conn_in_recvslla;
+ kstat_named_t conn_in_recvucred;
+ kstat_named_t conn_in_recvttl;
+ kstat_named_t conn_in_recvhopopts;
+ kstat_named_t conn_in_recvhoplimit;
+ kstat_named_t conn_in_recvdstopts;
+ kstat_named_t conn_in_recvrthdrdstopts;
+ kstat_named_t conn_in_recvrthdr;
+ kstat_named_t conn_in_recvpktinfo;
+ kstat_named_t conn_in_recvtclass;
+ kstat_named_t conn_in_timestamp;
} ip_stat_t;
@@ -103,20 +97,22 @@ typedef struct ip_stat {
((ipst)->ips_ip6_statistics.x.value.ui64 += (n))
typedef struct ip6_stat {
- kstat_named_t ip6_udp_fast_path;
- kstat_named_t ip6_udp_slow_path;
kstat_named_t ip6_udp_fannorm;
kstat_named_t ip6_udp_fanmb;
+ kstat_named_t ip6_recv_pullup;
+ kstat_named_t ip6_db_ref;
+ kstat_named_t ip6_notaligned;
+ kstat_named_t ip6_multimblk;
+ kstat_named_t ipsec_proto_ahesp;
kstat_named_t ip6_out_sw_cksum;
+ kstat_named_t ip6_out_sw_cksum_bytes;
kstat_named_t ip6_in_sw_cksum;
kstat_named_t ip6_tcp_in_full_hw_cksum_err;
kstat_named_t ip6_tcp_in_part_hw_cksum_err;
kstat_named_t ip6_tcp_in_sw_cksum_err;
- kstat_named_t ip6_tcp_out_sw_cksum_bytes;
kstat_named_t ip6_udp_in_full_hw_cksum_err;
kstat_named_t ip6_udp_in_part_hw_cksum_err;
kstat_named_t ip6_udp_in_sw_cksum_err;
- kstat_named_t ip6_udp_out_sw_cksum_bytes;
kstat_named_t ip6_frag_mdt_pkt_out;
kstat_named_t ip6_frag_mdt_discarded;
kstat_named_t ip6_frag_mdt_allocfail;
@@ -150,6 +146,8 @@ typedef struct srcid_map {
struct ip_stack {
netstack_t *ips_netstack; /* Common netstack */
+ uint_t ips_src_generation; /* Both IPv4 and IPv6 */
+
struct ipparam_s *ips_param_arr; /* ndd variable table */
struct ipndp_s *ips_ndp_arr;
@@ -178,10 +176,6 @@ struct ip_stack {
kmutex_t ips_ip_mi_lock;
kmutex_t ips_ip_addr_avail_lock;
krwlock_t ips_ill_g_lock;
- krwlock_t ips_ipsec_capab_ills_lock;
- /* protects the list of IPsec capable ills */
- struct ipsec_capab_ill_s *ips_ipsec_capab_ills_ah;
- struct ipsec_capab_ill_s *ips_ipsec_capab_ills_esp;
krwlock_t ips_ill_g_usesrc_lock;
@@ -198,10 +192,10 @@ struct ip_stack {
struct connf_s *ips_rts_clients;
struct connf_s *ips_ipcl_conn_fanout;
struct connf_s *ips_ipcl_bind_fanout;
- struct connf_s *ips_ipcl_proto_fanout;
+ struct connf_s *ips_ipcl_proto_fanout_v4;
struct connf_s *ips_ipcl_proto_fanout_v6;
struct connf_s *ips_ipcl_udp_fanout;
- struct connf_s *ips_ipcl_raw_fanout;
+ struct connf_s *ips_ipcl_raw_fanout; /* RAW SCTP sockets */
struct connf_s *ips_ipcl_iptun_fanout;
uint_t ips_ipcl_conn_fanout_size;
uint_t ips_ipcl_bind_fanout_size;
@@ -237,31 +231,47 @@ struct ip_stack {
/* IPv4 forwarding table */
struct radix_node_head *ips_ip_ftable;
- /* This is dynamically allocated in ip_ire_init */
- struct irb *ips_ip_cache_table;
-
#define IPV6_ABITS 128
#define IP6_MASK_TABLE_SIZE (IPV6_ABITS + 1) /* 129 ptrs */
-
struct irb *ips_ip_forwarding_table_v6[IP6_MASK_TABLE_SIZE];
- /* This is dynamically allocated in ip_ire_init */
- struct irb *ips_ip_cache_table_v6;
- uint32_t ips_ire_handle;
/*
* ire_ft_init_lock is used while initializing ip_forwarding_table
* dynamically in ire_add.
*/
kmutex_t ips_ire_ft_init_lock;
- kmutex_t ips_ire_handle_lock; /* Protects ire_handle */
- uint32_t ips_ip_cache_table_size;
- uint32_t ips_ip6_cache_table_size;
+ /*
+ * This is the IPv6 counterpart of RADIX_NODE_HEAD_LOCK. It is used
+ * to prevent adds and deletes while we are doing a ftable_lookup
+ * and extracting the ire_generation.
+ */
+ krwlock_t ips_ip6_ire_head_lock;
+
uint32_t ips_ip6_ftable_hash_size;
ire_stats_t ips_ire_stats_v4; /* IPv4 ire statistics */
ire_stats_t ips_ire_stats_v6; /* IPv6 ire statistics */
+ /* Count how many condemned objects for kmem_cache callbacks */
+ uint32_t ips_num_ire_condemned;
+ uint32_t ips_num_nce_condemned;
+ uint32_t ips_num_dce_condemned;
+
+ struct ire_s *ips_ire_reject_v4; /* For unreachable dests */
+ struct ire_s *ips_ire_reject_v6; /* For unreachable dests */
+ struct ire_s *ips_ire_blackhole_v4; /* For temporary failures */
+ struct ire_s *ips_ire_blackhole_v6; /* For temporary failures */
+
+ /* ips_ire_dep_lock protects ire_dep_* relationship between IREs */
+ krwlock_t ips_ire_dep_lock;
+
+ /* Destination Cache Entries */
+ struct dce_s *ips_dce_default;
+ uint_t ips_dce_hashsize;
+ struct dcb_s *ips_dce_hash_v4;
+ struct dcb_s *ips_dce_hash_v6;
+
/* pending binds */
mblk_t *ips_ip6_asp_pending_ops;
mblk_t *ips_ip6_asp_pending_ops_tail;
@@ -293,9 +303,10 @@ struct ip_stack {
uint_t ips_icmp_pkt_err_sent;
/* Protected by ip_mi_lock */
- void *ips_ip_g_head; /* Instance Data List Head */
+ void *ips_ip_g_head; /* IP Instance Data List Head */
+ void *ips_arp_g_head; /* ARP Instance Data List Head */
- caddr_t ips_ip_g_nd; /* Named Dispatch List Head */
+ caddr_t ips_ip_g_nd; /* Named Dispatch List Head */
/* Multirouting stuff */
/* Interval (in ms) between consecutive 'bad MTU' warnings */
@@ -306,27 +317,11 @@ struct ip_stack {
struct cgtp_filter_ops *ips_ip_cgtp_filter_ops; /* CGTP hooks */
boolean_t ips_ip_cgtp_filter; /* Enable/disable CGTP hooks */
- kmutex_t ips_ip_trash_timer_lock;
- timeout_id_t ips_ip_ire_expire_id; /* IRE expiration timer. */
struct ipsq_s *ips_ipsq_g_head;
uint_t ips_ill_index; /* Used to assign interface indicies */
/* When set search for unused index */
boolean_t ips_ill_index_wrap;
- clock_t ips_ip_ire_arp_time_elapsed;
- /* Time since IRE cache last flushed */
- clock_t ips_ip_ire_rd_time_elapsed;
- /* ... redirect IREs last flushed */
- clock_t ips_ip_ire_pmtu_time_elapsed;
- /* Time since path mtu increase */
-
- uint_t ips_ip_redirect_cnt;
- /* Num of redirect routes in ftable */
- uint_t ips_ipv6_ire_default_count;
- /* Number of IPv6 IRE_DEFAULT entries */
- uint_t ips_ipv6_ire_default_index;
- /* Walking IPv6 index used to mod in */
-
uint_t ips_loopback_packets;
/* NDP/NCE structures for IPv4 and IPv6 */
@@ -379,15 +374,17 @@ struct ip_stack {
struct srcid_map *ips_srcid_head;
krwlock_t ips_srcid_lock;
- uint64_t ips_ipif_g_seqid;
+ uint64_t ips_ipif_g_seqid; /* Used only for sctp_addr.c */
union phyint_list_u *ips_phyint_g_list; /* start of phyint list */
-/* ip_neti.c */
+/* ip_netinfo.c */
hook_family_t ips_ipv4root;
hook_family_t ips_ipv6root;
+ hook_family_t ips_arproot;
net_handle_t ips_ipv4_net_data;
net_handle_t ips_ipv6_net_data;
+ net_handle_t ips_arp_net_data;
/*
* Hooks for firewalling
@@ -397,17 +394,23 @@ struct ip_stack {
hook_event_t ips_ip4_forwarding_event;
hook_event_t ips_ip4_loopback_in_event;
hook_event_t ips_ip4_loopback_out_event;
+
hook_event_t ips_ip6_physical_in_event;
hook_event_t ips_ip6_physical_out_event;
hook_event_t ips_ip6_forwarding_event;
hook_event_t ips_ip6_loopback_in_event;
hook_event_t ips_ip6_loopback_out_event;
+ hook_event_t ips_arp_physical_in_event;
+ hook_event_t ips_arp_physical_out_event;
+ hook_event_t ips_arp_nic_events;
+
hook_event_token_t ips_ipv4firewall_physical_in;
hook_event_token_t ips_ipv4firewall_physical_out;
hook_event_token_t ips_ipv4firewall_forwarding;
hook_event_token_t ips_ipv4firewall_loopback_in;
hook_event_token_t ips_ipv4firewall_loopback_out;
+
hook_event_token_t ips_ipv6firewall_physical_in;
hook_event_token_t ips_ipv6firewall_physical_out;
hook_event_token_t ips_ipv6firewall_forwarding;
@@ -419,6 +422,10 @@ struct ip_stack {
hook_event_token_t ips_ipv4nicevents;
hook_event_token_t ips_ipv6nicevents;
+ hook_event_token_t ips_arp_physical_in;
+ hook_event_token_t ips_arp_physical_out;
+ hook_event_token_t ips_arpnicevents;
+
net_handle_t ips_ip4_observe_pr;
net_handle_t ips_ip6_observe_pr;
hook_event_t ips_ip4_observe;
@@ -432,13 +439,6 @@ struct ip_stack {
krwlock_t ips_ipmp_lock;
mod_hash_t *ips_ipmp_grp_hash;
-/* igmp.c */
- /* multicast restart timers thread logic */
- kmutex_t ips_mrt_lock;
- uint_t ips_mrt_flags;
- kcondvar_t ips_mrt_cv;
- kcondvar_t ips_mrt_done_cv;
- kthread_t *ips_mrt_thread;
};
typedef struct ip_stack ip_stack_t;
diff --git a/usr/src/uts/common/inet/ipclassifier.h b/usr/src/uts/common/inet/ipclassifier.h
index e24bcd9a73..15a7c32376 100644
--- a/usr/src/uts/common/inet/ipclassifier.h
+++ b/usr/src/uts/common/inet/ipclassifier.h
@@ -41,8 +41,11 @@ extern "C" {
#include <sys/sunddi.h>
#include <sys/sunldi.h>
-typedef void (*edesc_spf)(void *, mblk_t *, void *, int);
-typedef void (*edesc_rpf)(void *, mblk_t *, void *);
+typedef void (*edesc_rpf)(void *, mblk_t *, void *, ip_recv_attr_t *);
+struct icmph_s;
+struct icmp6_hdr;
+typedef boolean_t (*edesc_vpf)(conn_t *, void *, struct icmph_s *,
+ struct icmp6_hdr *, ip_recv_attr_t *);
/*
* ==============================
@@ -53,7 +56,7 @@ typedef void (*edesc_rpf)(void *, mblk_t *, void *);
/*
* The connection structure contains the common information/flags/ref needed.
* Implementation will keep the connection struct, the layers (with their
- * respective data for event i.e. tcp_t if event was tcp_input) all in one
+ * respective data for event i.e. tcp_t if event was tcp_input_data) all in one
* contiguous memory location.
*/
@@ -61,14 +64,14 @@ typedef void (*edesc_rpf)(void *, mblk_t *, void *);
/* Unused 0x00020000 */
/* Unused 0x00040000 */
#define IPCL_FULLY_BOUND 0x00080000 /* Bound to correct squeue */
-#define IPCL_CHECK_POLICY 0x00100000 /* Needs policy checking */
-#define IPCL_SOCKET 0x00200000 /* Sockfs connection */
-#define IPCL_ACCEPTOR 0x00400000 /* Sockfs priv acceptor */
+/* Unused 0x00100000 */
+/* Unused 0x00200000 */
+/* Unused 0x00400000 */
#define IPCL_CL_LISTENER 0x00800000 /* Cluster listener */
-#define IPCL_EAGER 0x01000000 /* Incoming connection */
+/* Unused 0x01000000 */
/* Unused 0x02000000 */
-#define IPCL_TCP6 0x04000000 /* AF_INET6 TCP */
-#define IPCL_TCP4 0x08000000 /* IPv4 packet format TCP */
+/* Unused 0x04000000 */
+/* Unused 0x08000000 */
/* Unused 0x10000000 */
/* Unused 0x20000000 */
#define IPCL_CONNECTED 0x40000000 /* Conn in connected table */
@@ -83,41 +86,21 @@ typedef void (*edesc_rpf)(void *, mblk_t *, void *);
#define IPCL_RTSCONN 0x00000020 /* From rts_conn_cache */
/* Unused 0x00000040 */
#define IPCL_IPTUN 0x00000080 /* iptun module above us */
+
#define IPCL_NONSTR 0x00001000 /* A non-STREAMS socket */
-#define IPCL_IN_SQUEUE 0x10000000 /* Waiting squeue to finish */
+/* Unused 0x10000000 */
-/* Conn Masks */
-#define IPCL_TCP (IPCL_TCP4|IPCL_TCP6)
#define IPCL_REMOVED 0x00000100
#define IPCL_REUSED 0x00000200
-/* The packet format is IPv4; could be an AF_INET or AF_INET6 socket */
-#define IPCL_IS_TCP4(connp) \
- (((connp)->conn_flags & IPCL_TCP4))
-
-/* Connected AF_INET with no IPsec policy */
-#define IPCL_IS_TCP4_CONNECTED_NO_POLICY(connp) \
- (((connp)->conn_flags & \
- (IPCL_TCP4|IPCL_CONNECTED|IPCL_CHECK_POLICY|IPCL_TCP6)) \
- == (IPCL_TCP4|IPCL_CONNECTED))
-
#define IPCL_IS_CONNECTED(connp) \
((connp)->conn_flags & IPCL_CONNECTED)
#define IPCL_IS_BOUND(connp) \
((connp)->conn_flags & IPCL_BOUND)
-/* AF_INET TCP that is bound */
-#define IPCL_IS_TCP4_BOUND(connp) \
- (((connp)->conn_flags & \
- (IPCL_TCP4|IPCL_BOUND|IPCL_TCP6)) == \
- (IPCL_TCP4|IPCL_BOUND))
-
-#define IPCL_IS_FULLY_BOUND(connp) \
- ((connp)->conn_flags & IPCL_FULLY_BOUND)
-
/*
- * Can't use conn_protocol since we need to tell difference
+ * Can't use conn_proto since we need to tell difference
* between a real TCP socket and a SOCK_RAW, IPPROTO_TCP.
*/
#define IPCL_IS_TCP(connp) \
@@ -180,22 +163,80 @@ typedef struct ip_helper_stream_info_s {
#define CONN_MAC_IMPLICIT 2
/*
+ * conn receive ancillary definition.
+ *
+ * These are the set of socket options that make the receive side
+ * potentially pass up ancillary data items.
+ * We have a union with an integer so that we can quickly check whether
+ * any ancillary data items need to be added.
+ */
+typedef struct crb_s {
+ union {
+ uint32_t crbu_all;
+ struct {
+ uint32_t
+ crbb_recvdstaddr : 1, /* IP_RECVDSTADDR option */
+ crbb_recvopts : 1, /* IP_RECVOPTS option */
+ crbb_recvif : 1, /* IP_RECVIF option */
+ crbb_recvslla : 1, /* IP_RECVSLLA option */
+
+ crbb_recvttl : 1, /* IP_RECVTTL option */
+ crbb_ip_recvpktinfo : 1, /* IP*_RECVPKTINFO option */
+ crbb_ipv6_recvhoplimit : 1, /* IPV6_RECVHOPLIMIT option */
+ crbb_ipv6_recvhopopts : 1, /* IPV6_RECVHOPOPTS option */
+
+ crbb_ipv6_recvdstopts : 1, /* IPV6_RECVDSTOPTS option */
+ crbb_ipv6_recvrthdr : 1, /* IPV6_RECVRTHDR option */
+ crbb_old_ipv6_recvdstopts : 1, /* old form of IPV6_DSTOPTS */
+ crbb_ipv6_recvrthdrdstopts : 1, /* IPV6_RECVRTHDRDSTOPTS */
+
+ crbb_ipv6_recvtclass : 1, /* IPV6_RECVTCLASS */
+ crbb_recvucred : 1, /* IP_RECVUCRED option */
+ crbb_timestamp : 1; /* SO_TIMESTAMP "socket" option */
+
+ } crbb;
+ } crbu;
+} crb_t;
+
+#define crb_all crbu.crbu_all
+#define crb_recvdstaddr crbu.crbb.crbb_recvdstaddr
+#define crb_recvopts crbu.crbb.crbb_recvopts
+#define crb_recvif crbu.crbb.crbb_recvif
+#define crb_recvslla crbu.crbb.crbb_recvslla
+#define crb_recvttl crbu.crbb.crbb_recvttl
+#define crb_ip_recvpktinfo crbu.crbb.crbb_ip_recvpktinfo
+#define crb_ipv6_recvhoplimit crbu.crbb.crbb_ipv6_recvhoplimit
+#define crb_ipv6_recvhopopts crbu.crbb.crbb_ipv6_recvhopopts
+#define crb_ipv6_recvdstopts crbu.crbb.crbb_ipv6_recvdstopts
+#define crb_ipv6_recvrthdr crbu.crbb.crbb_ipv6_recvrthdr
+#define crb_old_ipv6_recvdstopts crbu.crbb.crbb_old_ipv6_recvdstopts
+#define crb_ipv6_recvrthdrdstopts crbu.crbb.crbb_ipv6_recvrthdrdstopts
+#define crb_ipv6_recvtclass crbu.crbb.crbb_ipv6_recvtclass
+#define crb_recvucred crbu.crbb.crbb_recvucred
+#define crb_timestamp crbu.crbb.crbb_timestamp
+
+/*
* The initial fields in the conn_t are setup by the kmem_cache constructor,
* and are preserved when it is freed. Fields after that are bzero'ed when
* the conn_t is freed.
+ *
+ * Much of the conn_t is protected by conn_lock.
+ *
+ * conn_lock is also used by some ULPs (like UDP and RAWIP) to protect
+ * their state.
*/
struct conn_s {
kmutex_t conn_lock;
uint32_t conn_ref; /* Reference counter */
uint32_t conn_flags; /* Conn Flags */
-
union {
tcp_t *cp_tcp; /* Pointer to the tcp struct */
struct udp_s *cp_udp; /* Pointer to the udp struct */
struct icmp_s *cp_icmp; /* Pointer to rawip struct */
struct rts_s *cp_rts; /* Pointer to rts struct */
struct iptun_s *cp_iptun; /* Pointer to iptun_t */
+ struct sctp_s *cp_sctp; /* For IPCL_SCTPCONN */
void *cp_priv;
} conn_proto_priv;
#define conn_tcp conn_proto_priv.cp_tcp
@@ -203,71 +244,68 @@ struct conn_s {
#define conn_icmp conn_proto_priv.cp_icmp
#define conn_rts conn_proto_priv.cp_rts
#define conn_iptun conn_proto_priv.cp_iptun
+#define conn_sctp conn_proto_priv.cp_sctp
#define conn_priv conn_proto_priv.cp_priv
kcondvar_t conn_cv;
- uint8_t conn_ulp; /* protocol type */
+ uint8_t conn_proto; /* protocol type */
edesc_rpf conn_recv; /* Pointer to recv routine */
+ edesc_rpf conn_recvicmp; /* For ICMP error */
+ edesc_vpf conn_verifyicmp; /* Verify ICMP error */
+
+ ip_xmit_attr_t *conn_ixa; /* Options if no ancil data */
/* Fields after this are bzero'ed when the conn_t is freed. */
+#define conn_start_clr conn_recv_ancillary
+
+ /* Options for receive-side ancillary data */
+ crb_t conn_recv_ancillary;
squeue_t *conn_sqp; /* Squeue for processing */
uint_t conn_state_flags; /* IP state flags */
-#define conn_start_clr conn_state_flags
- ire_t *conn_ire_cache; /* outbound ire cache */
+ int conn_lingertime; /* linger time (in seconds) */
+
unsigned int
conn_on_sqp : 1, /* Conn is being processed */
- conn_dontroute : 1, /* SO_DONTROUTE state */
- conn_loopback : 1, /* SO_LOOPBACK state */
+ conn_linger : 1, /* SO_LINGER state */
+ conn_useloopback : 1, /* SO_USELOOPBACK state */
conn_broadcast : 1, /* SO_BROADCAST state */
conn_reuseaddr : 1, /* SO_REUSEADDR state */
- conn_multicast_loop : 1, /* IP_MULTICAST_LOOP */
+ conn_keepalive : 1, /* SO_KEEPALIVE state */
conn_multi_router : 1, /* Wants all multicast pkts */
- conn_draining : 1, /* ip_wsrv running */
-
conn_did_putbq : 1, /* ip_wput did a putbq */
+
conn_unspec_src : 1, /* IP_UNSPEC_SRC */
conn_policy_cached : 1, /* Is policy cached/latched ? */
conn_in_enforce_policy : 1, /* Enforce Policy on inbound */
-
conn_out_enforce_policy : 1, /* Enforce Policy on outbound */
- conn_af_isv6 : 1, /* ip address family ver 6 */
- conn_pkt_isv6 : 1, /* ip packet format ver 6 */
- conn_ip_recvpktinfo : 1, /* IPV*_RECVPKTINFO option */
-
- conn_ipv6_recvhoplimit : 1, /* IPV6_RECVHOPLIMIT option */
- conn_ipv6_recvhopopts : 1, /* IPV6_RECVHOPOPTS option */
- conn_ipv6_recvdstopts : 1, /* IPV6_RECVDSTOPTS option */
- conn_ipv6_recvrthdr : 1, /* IPV6_RECVRTHDR option */
- conn_ipv6_recvrtdstopts : 1, /* IPV6_RECVRTHDRDSTOPTS */
+ conn_debug : 1, /* SO_DEBUG */
conn_ipv6_v6only : 1, /* IPV6_V6ONLY */
- conn_ipv6_recvtclass : 1, /* IPV6_RECVTCLASS */
+ conn_oobinline : 1, /* SO_OOBINLINE state */
+ conn_dgram_errind : 1, /* SO_DGRAM_ERRIND state */
+
+ conn_exclbind : 1, /* SO_EXCLBIND state */
+ conn_mdt_ok : 1, /* MDT is permitted */
+ conn_allzones : 1, /* SO_ALLZONES */
conn_ipv6_recvpathmtu : 1, /* IPV6_RECVPATHMTU */
- conn_pathmtu_valid : 1, /* The cached mtu is valid. */
- conn_ipv6_dontfrag : 1, /* IPV6_DONTFRAG */
- conn_fully_bound : 1, /* Fully bound connection */
- conn_recvif : 1, /* IP_RECVIF option */
+ conn_mcbc_bind : 1, /* Bound to multi/broadcast */
- conn_recvslla : 1, /* IP_RECVSLLA option */
- conn_mdt_ok : 1, /* MDT is permitted */
- conn_nexthop_set : 1,
- conn_allzones : 1; /* SO_ALLZONES */
+ conn_pad_to_bit_31 : 11;
- unsigned int
- conn_lso_ok : 1; /* LSO is usable */
boolean_t conn_direct_blocked; /* conn is flow-controlled */
squeue_t *conn_initial_sqp; /* Squeue at open time */
squeue_t *conn_final_sqp; /* Squeue after connect */
ill_t *conn_dhcpinit_ill; /* IP_DHCPINIT_IF */
- ipsec_latch_t *conn_latch; /* latched state */
- ill_t *conn_outgoing_ill; /* IP{,V6}_BOUND_IF */
- edesc_spf conn_send; /* Pointer to send routine */
+ ipsec_latch_t *conn_latch; /* latched IDS */
+ struct ipsec_policy_s *conn_latch_in_policy; /* latched policy (in) */
+ struct ipsec_action_s *conn_latch_in_action; /* latched action (in) */
+ uint_t conn_bound_if; /* IP*_BOUND_IF */
queue_t *conn_rq; /* Read queue */
queue_t *conn_wq; /* Write queue */
dev_t conn_dev; /* Minor number */
@@ -275,80 +313,137 @@ struct conn_s {
ip_helper_stream_info_t *conn_helper_info;
cred_t *conn_cred; /* Credentials */
+ pid_t conn_cpid; /* pid from open/connect */
+ uint64_t conn_open_time; /* time when this was opened */
+
connf_t *conn_g_fanout; /* Global Hash bucket head */
struct conn_s *conn_g_next; /* Global Hash chain next */
struct conn_s *conn_g_prev; /* Global Hash chain prev */
struct ipsec_policy_head_s *conn_policy; /* Configured policy */
- in6_addr_t conn_bound_source_v6;
-#define conn_bound_source V4_PART_OF_V6(conn_bound_source_v6)
-
+ in6_addr_t conn_bound_addr_v6; /* Address in bind() */
+#define conn_bound_addr_v4 V4_PART_OF_V6(conn_bound_addr_v6)
connf_t *conn_fanout; /* Hash bucket we're part of */
struct conn_s *conn_next; /* Hash chain next */
struct conn_s *conn_prev; /* Hash chain prev */
+
struct {
- in6_addr_t connua_laddr; /* Local address */
+ in6_addr_t connua_laddr; /* Local address - match */
in6_addr_t connua_faddr; /* Remote address */
} connua_v6addr;
-#define conn_src V4_PART_OF_V6(connua_v6addr.connua_laddr)
-#define conn_rem V4_PART_OF_V6(connua_v6addr.connua_faddr)
-#define conn_srcv6 connua_v6addr.connua_laddr
-#define conn_remv6 connua_v6addr.connua_faddr
+#define conn_laddr_v4 V4_PART_OF_V6(connua_v6addr.connua_laddr)
+#define conn_faddr_v4 V4_PART_OF_V6(connua_v6addr.connua_faddr)
+#define conn_laddr_v6 connua_v6addr.connua_laddr
+#define conn_faddr_v6 connua_v6addr.connua_faddr
+ in6_addr_t conn_saddr_v6; /* Local address - source */
+#define conn_saddr_v4 V4_PART_OF_V6(conn_saddr_v6)
+
union {
/* Used for classifier match performance */
- uint32_t conn_ports2;
+ uint32_t connu_ports2;
struct {
- in_port_t tcpu_fport; /* Remote port */
- in_port_t tcpu_lport; /* Local port */
- } tcpu_ports;
+ in_port_t connu_fport; /* Remote port */
+ in_port_t connu_lport; /* Local port */
+ } connu_ports;
} u_port;
-#define conn_fport u_port.tcpu_ports.tcpu_fport
-#define conn_lport u_port.tcpu_ports.tcpu_lport
-#define conn_ports u_port.conn_ports2
-#define conn_upq conn_rq
- uint8_t conn_unused_byte;
-
- uint_t conn_proto; /* SO_PROTOTYPE state */
- ill_t *conn_incoming_ill; /* IP{,V6}_BOUND_IF */
+#define conn_fport u_port.connu_ports.connu_fport
+#define conn_lport u_port.connu_ports.connu_lport
+#define conn_ports u_port.connu_ports2
+
+ uint_t conn_incoming_ifindex; /* IP{,V6}_BOUND_IF, scopeid */
ill_t *conn_oper_pending_ill; /* pending shared ioctl */
- ilg_t *conn_ilg; /* Group memberships */
- int conn_ilg_allocated; /* Number allocated */
- int conn_ilg_inuse; /* Number currently used */
- int conn_ilg_walker_cnt; /* No of ilg walkers */
- /* XXXX get rid of this, once ilg_delete_all is fixed */
- kcondvar_t conn_refcv;
-
- struct ipif_s *conn_multicast_ipif; /* IP_MULTICAST_IF */
- ill_t *conn_multicast_ill; /* IPV6_MULTICAST_IF */
- struct conn_s *conn_drain_next; /* Next conn in drain list */
- struct conn_s *conn_drain_prev; /* Prev conn in drain list */
+ krwlock_t conn_ilg_lock; /* Protects conn_ilg_* */
+ ilg_t *conn_ilg; /* Group memberships */
+
+ kcondvar_t conn_refcv; /* For conn_oper_pending_ill */
+
+ struct conn_s *conn_drain_next; /* Next conn in drain list */
+ struct conn_s *conn_drain_prev; /* Prev conn in drain list */
idl_t *conn_idl; /* Ptr to the drain list head */
mblk_t *conn_ipsec_opt_mp; /* ipsec option mblk */
- uint32_t conn_src_preferences; /* prefs for src addr select */
- /* mtuinfo from IPV6_PACKET_TOO_BIG conditional on conn_pathmtu_valid */
- struct ip6_mtuinfo mtuinfo;
zoneid_t conn_zoneid; /* zone connection is in */
- in6_addr_t conn_nexthop_v6; /* nexthop IP address */
- uchar_t conn_broadcast_ttl; /* IP_BROADCAST_TTL */
-#define conn_nexthop_v4 V4_PART_OF_V6(conn_nexthop_v6)
- cred_t *conn_effective_cred; /* Effective TX credentials */
int conn_rtaware; /* RT_AWARE sockopt value */
kcondvar_t conn_sq_cv; /* For non-STREAMS socket IO */
- kthread_t *conn_sq_caller; /* Caller of squeue sync ops */
sock_upcalls_t *conn_upcalls; /* Upcalls to sockfs */
sock_upper_handle_t conn_upper_handle; /* Upper handle: sonode * */
unsigned int
- conn_ulp_labeled : 1, /* ULP label is synced */
conn_mlp_type : 2, /* mlp_type_t; tsol/tndb.h */
conn_anon_mlp : 1, /* user wants anon MLP */
-
conn_anon_port : 1, /* user bound anonymously */
+
conn_mac_mode : 2, /* normal/loose/implicit MAC */
- conn_spare : 26;
+ conn_anon_priv_bind : 1, /* *_ANON_PRIV_BIND state */
+ conn_zone_is_global : 1, /* GLOBAL_ZONEID */
+ conn_spare : 24;
boolean_t conn_flow_cntrld;
netstack_t *conn_netstack; /* Corresponds to a netstack_hold */
+
+ /*
+ * IP format that packets received for this struct should use.
+ * Value can be IP4_VERSION or IPV6_VERSION.
+ * The sending version is encoded using IXAF_IS_IPV4.
+ */
+ ushort_t conn_ipversion;
+
+ /* Written to only once at the time of opening the endpoint */
+ sa_family_t conn_family; /* Family from socket() call */
+ uint_t conn_so_type; /* Type from socket() call */
+
+ uint_t conn_sndbuf; /* SO_SNDBUF state */
+ uint_t conn_rcvbuf; /* SO_RCVBUF state */
+ uint_t conn_wroff; /* Current write offset */
+
+ uint_t conn_sndlowat; /* Send buffer low water mark */
+ uint_t conn_rcvlowat; /* Recv buffer low water mark */
+
+ uint8_t conn_default_ttl; /* Default TTL/hoplimit */
+
+ uint32_t conn_flowinfo; /* Connected flow id and tclass */
+
+ /*
+ * The most recent address for sendto. Initially set to zero
+ * which is always different than then the destination address
+ * since the send interprets zero as the loopback address.
+ */
+ in6_addr_t conn_v6lastdst;
+#define conn_v4lastdst V4_PART_OF_V6(conn_v6lastdst)
+ ushort_t conn_lastipversion;
+ in_port_t conn_lastdstport;
+ uint32_t conn_lastflowinfo; /* IPv6-only */
+ uint_t conn_lastscopeid; /* IPv6-only */
+ uint_t conn_lastsrcid; /* Only for AF_INET6 */
+ /*
+ * When we are not connected conn_saddr might be unspecified.
+ * We track the source that was used with conn_v6lastdst here.
+ */
+ in6_addr_t conn_v6lastsrc;
+#define conn_v4lastsrc V4_PART_OF_V6(conn_v6lastsrc)
+
+ /* Templates for transmitting packets */
+ ip_pkt_t conn_xmit_ipp; /* Options if no ancil data */
+
+ /*
+ * Header template - conn_ht_ulp is a pointer into conn_ht_iphc.
+ * Note that ixa_ip_hdr_length indicates the offset of ht_ulp in
+ * ht_iphc
+ *
+ * The header template is maintained for connected endpoints (and
+ * updated when sticky options are changed) and also for the lastdst.
+ * There is no conflict between those usages since SOCK_DGRAM and
+ * SOCK_RAW can not be used to specify a destination address (with
+ * sendto/sendmsg) if the socket has been connected.
+ */
+ uint8_t *conn_ht_iphc; /* Start of IP header */
+ uint_t conn_ht_iphc_allocated; /* Allocated buffer size */
+ uint_t conn_ht_iphc_len; /* IP+ULP size */
+ uint8_t *conn_ht_ulp; /* Upper-layer header */
+ uint_t conn_ht_ulp_len; /* ULP header len */
+
+ /* Checksum to compensate for source routed packets. Host byte order */
+ uint32_t conn_sum;
+
#ifdef CONN_DEBUG
#define CONN_TRACE_MAX 10
int conn_trace_last; /* ndx of last used tracebuf */
@@ -357,18 +452,6 @@ struct conn_s {
};
/*
- * These two macros are used by TX. First priority is SCM_UCRED having
- * set the label in the mblk. Second priority is the open credentials with
- * peer's label (aka conn_effective_cred). Last priority is the open
- * credentials. BEST_CRED takes all three into account in the above order.
- * CONN_CRED is for connection-oriented cases when we don't need to look
- * at the mblk.
- */
-#define CONN_CRED(connp) ((connp)->conn_effective_cred == NULL ? \
- (connp)->conn_cred : (connp)->conn_effective_cred)
-#define BEST_CRED(mp, connp, pidp) ip_best_cred(mp, connp, pidp)
-
-/*
* connf_t - connection fanout data.
*
* The hash tables and their linkage (conn_t.{hashnextp, hashprevp} are
@@ -461,29 +544,22 @@ struct connf_s {
/*
- * IPCL_PROTO_MATCH() only matches conns with the specified zoneid, while
- * IPCL_PROTO_MATCH_V6() can match other conns in the multicast case, see
- * ip_fanout_proto().
+ * IPCL_PROTO_MATCH() and IPCL_PROTO_MATCH_V6() only matches conns with
+ * the specified ira_zoneid or conn_allzones by calling conn_wantpacket.
*/
-#define IPCL_PROTO_MATCH(connp, protocol, ipha, ill, \
- fanout_flags, zoneid) \
- ((((connp)->conn_src == INADDR_ANY) || \
- (((connp)->conn_src == ((ipha)->ipha_dst)) && \
- (((connp)->conn_rem == INADDR_ANY) || \
- ((connp)->conn_rem == ((ipha)->ipha_src))))) && \
- IPCL_ZONE_MATCH(connp, zoneid) && \
- (conn_wantpacket((connp), (ill), (ipha), (fanout_flags), \
- (zoneid)) || ((protocol) == IPPROTO_PIM) || \
- ((protocol) == IPPROTO_RSVP)))
-
-#define IPCL_PROTO_MATCH_V6(connp, protocol, ip6h, ill, \
- fanout_flags, zoneid) \
- ((IN6_IS_ADDR_UNSPECIFIED(&(connp)->conn_srcv6) || \
- (IN6_ARE_ADDR_EQUAL(&(connp)->conn_srcv6, &((ip6h)->ip6_dst)) && \
- (IN6_IS_ADDR_UNSPECIFIED(&(connp)->conn_remv6) || \
- IN6_ARE_ADDR_EQUAL(&(connp)->conn_remv6, &((ip6h)->ip6_src))))) && \
- (conn_wantpacket_v6((connp), (ill), (ip6h), \
- (fanout_flags), (zoneid)) || ((protocol) == IPPROTO_RSVP)))
+#define IPCL_PROTO_MATCH(connp, ira, ipha) \
+ ((((connp)->conn_laddr_v4 == INADDR_ANY) || \
+ (((connp)->conn_laddr_v4 == ((ipha)->ipha_dst)) && \
+ (((connp)->conn_faddr_v4 == INADDR_ANY) || \
+ ((connp)->conn_faddr_v4 == ((ipha)->ipha_src))))) && \
+ conn_wantpacket((connp), (ira), (ipha)))
+
+#define IPCL_PROTO_MATCH_V6(connp, ira, ip6h) \
+ ((IN6_IS_ADDR_UNSPECIFIED(&(connp)->conn_laddr_v6) || \
+ (IN6_ARE_ADDR_EQUAL(&(connp)->conn_laddr_v6, &((ip6h)->ip6_dst)) && \
+ (IN6_IS_ADDR_UNSPECIFIED(&(connp)->conn_faddr_v6) || \
+ IN6_ARE_ADDR_EQUAL(&(connp)->conn_faddr_v6, &((ip6h)->ip6_src))))) && \
+ (conn_wantpacket_v6((connp), (ira), (ip6h))))
#define IPCL_CONN_HASH(src, ports, ipst) \
((unsigned)(ntohl((src)) ^ ((ports) >> 24) ^ ((ports) >> 16) ^ \
@@ -493,31 +569,17 @@ struct connf_s {
IPCL_CONN_HASH(V4_PART_OF_V6((src)), (ports), (ipst))
#define IPCL_CONN_MATCH(connp, proto, src, dst, ports) \
- ((connp)->conn_ulp == (proto) && \
+ ((connp)->conn_proto == (proto) && \
(connp)->conn_ports == (ports) && \
- _IPCL_V4_MATCH((connp)->conn_remv6, (src)) && \
- _IPCL_V4_MATCH((connp)->conn_srcv6, (dst)) && \
+ _IPCL_V4_MATCH((connp)->conn_faddr_v6, (src)) && \
+ _IPCL_V4_MATCH((connp)->conn_laddr_v6, (dst)) && \
!(connp)->conn_ipv6_v6only)
#define IPCL_CONN_MATCH_V6(connp, proto, src, dst, ports) \
- ((connp)->conn_ulp == (proto) && \
+ ((connp)->conn_proto == (proto) && \
(connp)->conn_ports == (ports) && \
- IN6_ARE_ADDR_EQUAL(&(connp)->conn_remv6, &(src)) && \
- IN6_ARE_ADDR_EQUAL(&(connp)->conn_srcv6, &(dst)))
-
-#define IPCL_CONN_INIT(connp, protocol, src, rem, ports) { \
- (connp)->conn_ulp = protocol; \
- IN6_IPADDR_TO_V4MAPPED(src, &(connp)->conn_srcv6); \
- IN6_IPADDR_TO_V4MAPPED(rem, &(connp)->conn_remv6); \
- (connp)->conn_ports = ports; \
-}
-
-#define IPCL_CONN_INIT_V6(connp, protocol, src, rem, ports) { \
- (connp)->conn_ulp = protocol; \
- (connp)->conn_srcv6 = src; \
- (connp)->conn_remv6 = rem; \
- (connp)->conn_ports = ports; \
-}
+ IN6_ARE_ADDR_EQUAL(&(connp)->conn_faddr_v6, &(src)) && \
+ IN6_ARE_ADDR_EQUAL(&(connp)->conn_laddr_v6, &(dst)))
#define IPCL_PORT_HASH(port, size) \
((((port) >> 8) ^ (port)) & ((size) - 1))
@@ -527,33 +589,45 @@ struct connf_s {
(ipst)->ips_ipcl_bind_fanout_size)
#define IPCL_BIND_MATCH(connp, proto, laddr, lport) \
- ((connp)->conn_ulp == (proto) && \
+ ((connp)->conn_proto == (proto) && \
(connp)->conn_lport == (lport) && \
- (_IPCL_V4_MATCH_ANY((connp)->conn_srcv6) || \
- _IPCL_V4_MATCH((connp)->conn_srcv6, (laddr))) && \
+ (_IPCL_V4_MATCH_ANY((connp)->conn_laddr_v6) || \
+ _IPCL_V4_MATCH((connp)->conn_laddr_v6, (laddr))) && \
!(connp)->conn_ipv6_v6only)
#define IPCL_BIND_MATCH_V6(connp, proto, laddr, lport) \
- ((connp)->conn_ulp == (proto) && \
+ ((connp)->conn_proto == (proto) && \
(connp)->conn_lport == (lport) && \
- (IN6_ARE_ADDR_EQUAL(&(connp)->conn_srcv6, &(laddr)) || \
- IN6_IS_ADDR_UNSPECIFIED(&(connp)->conn_srcv6)))
+ (IN6_ARE_ADDR_EQUAL(&(connp)->conn_laddr_v6, &(laddr)) || \
+ IN6_IS_ADDR_UNSPECIFIED(&(connp)->conn_laddr_v6)))
+/*
+ * We compare conn_laddr since it captures both connected and a bind to
+ * a multicast or broadcast address.
+ * The caller needs to match the zoneid and also call conn_wantpacket
+ * for multicast, broadcast, or when conn_incoming_ifindex is set.
+ */
#define IPCL_UDP_MATCH(connp, lport, laddr, fport, faddr) \
(((connp)->conn_lport == (lport)) && \
- ((_IPCL_V4_MATCH_ANY((connp)->conn_srcv6) || \
- (_IPCL_V4_MATCH((connp)->conn_srcv6, (laddr)) && \
- (_IPCL_V4_MATCH_ANY((connp)->conn_remv6) || \
- (_IPCL_V4_MATCH((connp)->conn_remv6, (faddr)) && \
+ ((_IPCL_V4_MATCH_ANY((connp)->conn_laddr_v6) || \
+ (_IPCL_V4_MATCH((connp)->conn_laddr_v6, (laddr)) && \
+ (_IPCL_V4_MATCH_ANY((connp)->conn_faddr_v6) || \
+ (_IPCL_V4_MATCH((connp)->conn_faddr_v6, (faddr)) && \
(connp)->conn_fport == (fport)))))) && \
!(connp)->conn_ipv6_v6only)
+/*
+ * We compare conn_laddr since it captures both connected and a bind to
+ * a multicast or broadcast address.
+ * The caller needs to match the zoneid and also call conn_wantpacket_v6
+ * for multicast or when conn_incoming_ifindex is set.
+ */
#define IPCL_UDP_MATCH_V6(connp, lport, laddr, fport, faddr) \
(((connp)->conn_lport == (lport)) && \
- (IN6_IS_ADDR_UNSPECIFIED(&(connp)->conn_srcv6) || \
- (IN6_ARE_ADDR_EQUAL(&(connp)->conn_srcv6, &(laddr)) && \
- (IN6_IS_ADDR_UNSPECIFIED(&(connp)->conn_remv6) || \
- (IN6_ARE_ADDR_EQUAL(&(connp)->conn_remv6, &(faddr)) && \
+ (IN6_IS_ADDR_UNSPECIFIED(&(connp)->conn_laddr_v6) || \
+ (IN6_ARE_ADDR_EQUAL(&(connp)->conn_laddr_v6, &(laddr)) && \
+ (IN6_IS_ADDR_UNSPECIFIED(&(connp)->conn_faddr_v6) || \
+ (IN6_ARE_ADDR_EQUAL(&(connp)->conn_faddr_v6, &(faddr)) && \
(connp)->conn_fport == (fport))))))
#define IPCL_IPTUN_HASH(laddr, faddr) \
@@ -567,32 +641,12 @@ struct connf_s {
(laddr)->s6_addr32[2] ^ (laddr)->s6_addr32[3])
#define IPCL_IPTUN_MATCH(connp, laddr, faddr) \
- (_IPCL_V4_MATCH((connp)->conn_srcv6, (laddr)) && \
- _IPCL_V4_MATCH((connp)->conn_remv6, (faddr)))
+ (_IPCL_V4_MATCH((connp)->conn_laddr_v6, (laddr)) && \
+ _IPCL_V4_MATCH((connp)->conn_faddr_v6, (faddr)))
#define IPCL_IPTUN_MATCH_V6(connp, laddr, faddr) \
- (IN6_ARE_ADDR_EQUAL(&(connp)->conn_srcv6, (laddr)) && \
- IN6_ARE_ADDR_EQUAL(&(connp)->conn_remv6, (faddr)))
-
-#define IPCL_TCP_EAGER_INIT(connp, protocol, src, rem, ports) { \
- (connp)->conn_flags |= (IPCL_TCP4|IPCL_EAGER); \
- IN6_IPADDR_TO_V4MAPPED(src, &(connp)->conn_srcv6); \
- IN6_IPADDR_TO_V4MAPPED(rem, &(connp)->conn_remv6); \
- (connp)->conn_ports = ports; \
- (connp)->conn_send = ip_output; \
- (connp)->conn_sqp = IP_SQUEUE_GET(lbolt); \
- (connp)->conn_initial_sqp = (connp)->conn_sqp; \
-}
-
-#define IPCL_TCP_EAGER_INIT_V6(connp, protocol, src, rem, ports) { \
- (connp)->conn_flags |= (IPCL_TCP6|IPCL_EAGER); \
- (connp)->conn_srcv6 = src; \
- (connp)->conn_remv6 = rem; \
- (connp)->conn_ports = ports; \
- (connp)->conn_send = ip_output_v6; \
- (connp)->conn_sqp = IP_SQUEUE_GET(lbolt); \
- (connp)->conn_initial_sqp = (connp)->conn_sqp; \
-}
+ (IN6_ARE_ADDR_EQUAL(&(connp)->conn_laddr_v6, (laddr)) && \
+ IN6_ARE_ADDR_EQUAL(&(connp)->conn_faddr_v6, (faddr)))
#define IPCL_UDP_HASH(lport, ipst) \
IPCL_PORT_HASH(lport, (ipst)->ips_ipcl_udp_fanout_size)
@@ -606,18 +660,20 @@ struct connf_s {
/*
* This is similar to IPCL_BIND_MATCH except that the local port check
* is changed to a wildcard port check.
+ * We compare conn_laddr since it captures both connected and a bind to
+ * a multicast or broadcast address.
*/
#define IPCL_RAW_MATCH(connp, proto, laddr) \
- ((connp)->conn_ulp == (proto) && \
+ ((connp)->conn_proto == (proto) && \
(connp)->conn_lport == 0 && \
- (_IPCL_V4_MATCH_ANY((connp)->conn_srcv6) || \
- _IPCL_V4_MATCH((connp)->conn_srcv6, (laddr))))
+ (_IPCL_V4_MATCH_ANY((connp)->conn_laddr_v6) || \
+ _IPCL_V4_MATCH((connp)->conn_laddr_v6, (laddr))))
#define IPCL_RAW_MATCH_V6(connp, proto, laddr) \
- ((connp)->conn_ulp == (proto) && \
+ ((connp)->conn_proto == (proto) && \
(connp)->conn_lport == 0 && \
- (IN6_IS_ADDR_UNSPECIFIED(&(connp)->conn_srcv6) || \
- IN6_ARE_ADDR_EQUAL(&(connp)->conn_srcv6, &(laddr))))
+ (IN6_IS_ADDR_UNSPECIFIED(&(connp)->conn_laddr_v6) || \
+ IN6_ARE_ADDR_EQUAL(&(connp)->conn_laddr_v6, &(laddr))))
/* Function prototypes */
extern void ipcl_g_init(void);
@@ -631,28 +687,27 @@ void ipcl_hash_insert_wildcard(connf_t *, conn_t *);
void ipcl_hash_remove(conn_t *);
void ipcl_hash_remove_locked(conn_t *connp, connf_t *connfp);
-extern int ipcl_bind_insert(conn_t *, uint8_t, ipaddr_t, uint16_t);
-extern int ipcl_bind_insert_v6(conn_t *, uint8_t, const in6_addr_t *,
- uint16_t);
-extern int ipcl_conn_insert(conn_t *, uint8_t, ipaddr_t, ipaddr_t,
- uint32_t);
-extern int ipcl_conn_insert_v6(conn_t *, uint8_t, const in6_addr_t *,
- const in6_addr_t *, uint32_t, uint_t);
+extern int ipcl_bind_insert(conn_t *);
+extern int ipcl_bind_insert_v4(conn_t *);
+extern int ipcl_bind_insert_v6(conn_t *);
+extern int ipcl_conn_insert(conn_t *);
+extern int ipcl_conn_insert_v4(conn_t *);
+extern int ipcl_conn_insert_v6(conn_t *);
extern conn_t *ipcl_get_next_conn(connf_t *, conn_t *, uint32_t);
-void ipcl_proto_insert(conn_t *, uint8_t);
-void ipcl_proto_insert_v6(conn_t *, uint8_t);
-conn_t *ipcl_classify_v4(mblk_t *, uint8_t, uint_t, zoneid_t, ip_stack_t *);
-conn_t *ipcl_classify_v6(mblk_t *, uint8_t, uint_t, zoneid_t, ip_stack_t *);
-conn_t *ipcl_classify(mblk_t *, zoneid_t, ip_stack_t *);
-conn_t *ipcl_classify_raw(mblk_t *, uint8_t, zoneid_t, uint32_t, ipha_t *,
+conn_t *ipcl_classify_v4(mblk_t *, uint8_t, uint_t, ip_recv_attr_t *,
+ ip_stack_t *);
+conn_t *ipcl_classify_v6(mblk_t *, uint8_t, uint_t, ip_recv_attr_t *,
ip_stack_t *);
+conn_t *ipcl_classify(mblk_t *, ip_recv_attr_t *, ip_stack_t *);
+conn_t *ipcl_classify_raw(mblk_t *, uint8_t, uint32_t, ipha_t *,
+ ip6_t *, ip_recv_attr_t *, ip_stack_t *);
conn_t *ipcl_iptun_classify_v4(ipaddr_t *, ipaddr_t *, ip_stack_t *);
conn_t *ipcl_iptun_classify_v6(in6_addr_t *, in6_addr_t *, ip_stack_t *);
void ipcl_globalhash_insert(conn_t *);
void ipcl_globalhash_remove(conn_t *);
void ipcl_walk(pfv_t, void *, ip_stack_t *);
-conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcph_t *, int, ip_stack_t *);
+conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcpha_t *, int, ip_stack_t *);
conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t,
ip_stack_t *);
conn_t *ipcl_lookup_listener_v4(uint16_t, ipaddr_t, zoneid_t, ip_stack_t *);
@@ -661,17 +716,19 @@ conn_t *ipcl_lookup_listener_v6(uint16_t, in6_addr_t *, uint_t, zoneid_t,
int conn_trace_ref(conn_t *);
int conn_untrace_ref(conn_t *);
void ipcl_conn_cleanup(conn_t *);
-conn_t *ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *, ipha_t *, tcph_t *,
+extern uint_t conn_recvancillary_size(conn_t *, crb_t, ip_recv_attr_t *,
+ mblk_t *, ip_pkt_t *);
+extern void conn_recvancillary_add(conn_t *, crb_t, ip_recv_attr_t *,
+ ip_pkt_t *, uchar_t *, uint_t);
+conn_t *ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *, ipha_t *, tcpha_t *,
ip_stack_t *);
-conn_t *ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *, ip6_t *, tcph_t *,
+conn_t *ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *, ip6_t *, tcpha_t *,
ip_stack_t *);
-extern int ip_create_helper_stream(conn_t *connp, ldi_ident_t li);
-extern void ip_free_helper_stream(conn_t *connp);
-
-extern int ip_get_options(conn_t *, int, int, void *, t_uscalar_t *, cred_t *);
-extern int ip_set_options(conn_t *, int, int, const void *, t_uscalar_t,
- cred_t *);
+extern int ip_create_helper_stream(conn_t *, ldi_ident_t);
+extern void ip_free_helper_stream(conn_t *);
+extern int ip_helper_stream_setup(queue_t *, dev_t *, int, int,
+ cred_t *, boolean_t);
#ifdef __cplusplus
}
diff --git a/usr/src/uts/common/inet/ipdrop.h b/usr/src/uts/common/inet/ipdrop.h
index 153c9c1925..74fe8cfd94 100644
--- a/usr/src/uts/common/inet/ipdrop.h
+++ b/usr/src/uts/common/inet/ipdrop.h
@@ -41,8 +41,10 @@ typedef struct ipdropper_s {
void ip_drop_register(ipdropper_t *, char *);
void ip_drop_unregister(ipdropper_t *);
-void ip_drop_packet(mblk_t *, boolean_t, ill_t *, ire_t *, struct kstat_named *,
+void ip_drop_packet(mblk_t *, boolean_t, ill_t *, struct kstat_named *,
ipdropper_t *);
+void ip_drop_input(char *, mblk_t *, ill_t *);
+void ip_drop_output(char *, mblk_t *, ill_t *);
/*
* ip_dropstats - When a protocol developer comes up with a new reason to
diff --git a/usr/src/uts/common/inet/ipp_common.h b/usr/src/uts/common/inet/ipp_common.h
index 9ac9837f66..d7380896b6 100644
--- a/usr/src/uts/common/inet/ipp_common.h
+++ b/usr/src/uts/common/inet/ipp_common.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _INET_IPP_COMMON_H
#define _INET_IPP_COMMON_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -49,14 +47,6 @@ extern uint32_t ipp_action_count;
#define IPP_ENABLED(proc, ipst) ((ipp_action_count != 0) && \
(~((ipst)->ips_ip_policy_mask) & (proc)))
-/* Apply IPQoS policies for inbound traffic? */
-#define IP6_IN_IPP(flags, ipst) (IPP_ENABLED(IPP_LOCAL_IN, ipst) && \
- (!((flags) & IP6_NO_IPPOLICY)))
-
-/* Apply IPQoS policies for oubound traffic? */
-#define IP6_OUT_IPP(flags, ipst) \
- (IPP_ENABLED(IPP_LOCAL_OUT, ipst) && (!((flags) & IP6_NO_IPPOLICY)))
-
/* Extracts 8 bit traffic class from IPV6 flow label field */
#ifdef _BIG_ENDIAN
#define __IPV6_TCLASS_FROM_FLOW(n) (((n)>>20) & 0xff)
@@ -78,7 +68,9 @@ typedef struct ip_priv {
} ip_priv_t;
/* The entry point for ip policy processing */
-extern void ip_process(ip_proc_t, mblk_t **, uint32_t);
+#ifdef ILL_CONDEMNED
+extern mblk_t *ip_process(ip_proc_t, mblk_t *, ill_t *, ill_t *);
+#endif
extern void ip_priv_free(void *);
#endif /* _KERNEL */
diff --git a/usr/src/uts/common/inet/ipsec_impl.h b/usr/src/uts/common/inet/ipsec_impl.h
index c5fa9367fe..228e01008d 100644
--- a/usr/src/uts/common/inet/ipsec_impl.h
+++ b/usr/src/uts/common/inet/ipsec_impl.h
@@ -410,24 +410,25 @@ struct ipsec_policy_s
uint32_t ipsp_refs;
ipsec_sel_t *ipsp_sel; /* selector set (shared) */
ipsec_action_t *ipsp_act; /* action (may be shared) */
+ netstack_t *ipsp_netstack; /* No netstack_hold */
};
#define IPPOL_REFHOLD(ipp) { \
atomic_add_32(&(ipp)->ipsp_refs, 1); \
ASSERT((ipp)->ipsp_refs != 0); \
}
-#define IPPOL_REFRELE(ipp, ns) { \
+#define IPPOL_REFRELE(ipp) { \
ASSERT((ipp)->ipsp_refs != 0); \
membar_exit(); \
if (atomic_add_32_nv(&(ipp)->ipsp_refs, -1) == 0) \
- ipsec_policy_free(ipp, ns); \
+ ipsec_policy_free(ipp); \
(ipp) = 0; \
}
-#define IPPOL_UNCHAIN(php, ip, ns) \
- HASHLIST_UNCHAIN((ip), ipsp_hash); \
- avl_remove(&(php)->iph_rulebyid, (ip)); \
- IPPOL_REFRELE(ip, ns);
+#define IPPOL_UNCHAIN(php, ip) \
+ HASHLIST_UNCHAIN((ip), ipsp_hash); \
+ avl_remove(&(php)->iph_rulebyid, (ip)); \
+ IPPOL_REFRELE(ip);
/*
* Policy ruleset. One per (protocol * direction) for system policy.
@@ -590,8 +591,6 @@ typedef struct ipsid_s
atomic_add_32(&(ipsid)->ipsid_refcnt, -1); \
}
-struct ipsec_out_s;
-
/*
* Following are the estimates of what the maximum AH and ESP header size
* would be. This is used to tell the upper layer the right value of MSS
@@ -708,6 +707,17 @@ typedef struct ipsif_s
kmutex_t ipsif_lock;
} ipsif_t;
+/*
+ * For call to the kernel crypto framework. State needed during
+ * the execution of a crypto request.
+ */
+typedef struct ipsec_crypto_s {
+ size_t ic_skip_len; /* len to skip for AH auth */
+ crypto_data_t ic_crypto_data; /* single op crypto data */
+ crypto_dual_data_t ic_crypto_dual_data; /* for dual ops */
+ crypto_data_t ic_crypto_mac; /* to store the MAC */
+ ipsa_cm_mech_t ic_cmm;
+} ipsec_crypto_t;
/*
* IPsec stack instances
@@ -826,45 +836,40 @@ extern boolean_t ipsec_loaded(ipsec_stack_t *);
extern boolean_t ipsec_failed(ipsec_stack_t *);
/*
- * callback from ipsec_loader to ip
- */
-extern void ip_ipsec_load_complete(ipsec_stack_t *);
-
-/*
* ipsec policy entrypoints (spd.c)
*/
extern void ipsec_policy_g_destroy(void);
extern void ipsec_policy_g_init(void);
+extern mblk_t *ipsec_add_crypto_data(mblk_t *, ipsec_crypto_t **);
+extern mblk_t *ipsec_remove_crypto_data(mblk_t *, ipsec_crypto_t **);
+extern mblk_t *ipsec_free_crypto_data(mblk_t *);
extern int ipsec_alloc_table(ipsec_policy_head_t *, int, int, boolean_t,
netstack_t *);
extern void ipsec_polhead_init(ipsec_policy_head_t *, int);
extern void ipsec_polhead_destroy(ipsec_policy_head_t *);
extern void ipsec_polhead_free_table(ipsec_policy_head_t *);
extern mblk_t *ipsec_check_global_policy(mblk_t *, conn_t *, ipha_t *,
- ip6_t *, boolean_t, netstack_t *);
+ ip6_t *, ip_recv_attr_t *, netstack_t *ns);
extern mblk_t *ipsec_check_inbound_policy(mblk_t *, conn_t *, ipha_t *, ip6_t *,
- boolean_t);
+ ip_recv_attr_t *);
-extern boolean_t ipsec_in_to_out(mblk_t *, ipha_t *, ip6_t *, zoneid_t);
+extern boolean_t ipsec_in_to_out(ip_recv_attr_t *, ip_xmit_attr_t *,
+ mblk_t *, ipha_t *, ip6_t *);
+extern void ipsec_in_release_refs(ip_recv_attr_t *);
+extern void ipsec_out_release_refs(ip_xmit_attr_t *);
extern void ipsec_log_policy_failure(int, char *, ipha_t *, ip6_t *, boolean_t,
- netstack_t *);
+ netstack_t *);
extern boolean_t ipsec_inbound_accept_clear(mblk_t *, ipha_t *, ip6_t *);
extern int ipsec_conn_cache_policy(conn_t *, boolean_t);
-extern mblk_t *ipsec_alloc_ipsec_out(netstack_t *);
-extern mblk_t *ipsec_attach_ipsec_out(mblk_t **, conn_t *, ipsec_policy_t *,
- uint8_t, netstack_t *);
-extern mblk_t *ipsec_init_ipsec_out(mblk_t *, mblk_t **, conn_t *,
- ipsec_policy_t *, uint8_t, netstack_t *);
-struct ipsec_in_s;
-extern ipsec_action_t *ipsec_in_to_out_action(struct ipsec_in_s *);
-extern boolean_t ipsec_check_ipsecin_latch(struct ipsec_in_s *, mblk_t *,
- struct ipsec_latch_s *, ipha_t *, ip6_t *, const char **, kstat_named_t **,
- conn_t *);
-extern void ipsec_latch_inbound(ipsec_latch_t *ipl, struct ipsec_in_s *ii);
-
-extern void ipsec_policy_free(ipsec_policy_t *, netstack_t *);
+extern void ipsec_cache_outbound_policy(const conn_t *, const in6_addr_t *,
+ const in6_addr_t *, in_port_t, ip_xmit_attr_t *);
+extern boolean_t ipsec_outbound_policy_current(ip_xmit_attr_t *);
+extern ipsec_action_t *ipsec_in_to_out_action(ip_recv_attr_t *);
+extern void ipsec_latch_inbound(conn_t *connp, ip_recv_attr_t *ira);
+
+extern void ipsec_policy_free(ipsec_policy_t *);
extern void ipsec_action_free(ipsec_action_t *);
extern void ipsec_polhead_free(ipsec_policy_head_t *, netstack_t *);
extern ipsec_policy_head_t *ipsec_polhead_split(ipsec_policy_head_t *,
@@ -894,12 +899,8 @@ extern void ipsec_actvec_free(ipsec_act_t *, uint_t);
extern int ipsec_req_from_head(ipsec_policy_head_t *, ipsec_req_t *, int);
extern mblk_t *ipsec_construct_inverse_acquire(sadb_msg_t *, sadb_ext_t **,
netstack_t *);
-extern mblk_t *ip_wput_attach_policy(mblk_t *, ipha_t *, ip6_t *, ire_t *,
- conn_t *, boolean_t, zoneid_t);
-extern mblk_t *ip_wput_ire_parse_ipsec_out(mblk_t *, ipha_t *, ip6_t *,
- ire_t *, conn_t *, boolean_t, zoneid_t);
-extern ipsec_policy_t *ipsec_find_policy(int, conn_t *,
- struct ipsec_out_s *, ipsec_selector_t *, netstack_t *);
+extern ipsec_policy_t *ipsec_find_policy(int, const conn_t *,
+ ipsec_selector_t *, netstack_t *);
extern ipsid_t *ipsid_lookup(int, char *, netstack_t *);
extern boolean_t ipsid_equal(ipsid_t *, ipsid_t *);
extern void ipsid_gc(netstack_t *);
@@ -912,29 +913,29 @@ extern void ipsec_enter_policy(ipsec_policy_head_t *, ipsec_policy_t *, int,
netstack_t *);
extern boolean_t ipsec_check_action(ipsec_act_t *, int *, netstack_t *);
-extern mblk_t *ipsec_out_tag(mblk_t *, mblk_t *, netstack_t *);
-extern mblk_t *ipsec_in_tag(mblk_t *, mblk_t *, netstack_t *);
-extern mblk_t *ip_copymsg(mblk_t *mp);
-
-extern void iplatch_free(ipsec_latch_t *, netstack_t *);
+extern void iplatch_free(ipsec_latch_t *);
extern ipsec_latch_t *iplatch_create(void);
extern int ipsec_set_req(cred_t *, conn_t *, ipsec_req_t *);
extern void ipsec_insert_always(avl_tree_t *tree, void *new_node);
extern int32_t ipsec_act_ovhd(const ipsec_act_t *act);
-extern int sadb_whack_label(mblk_t **, ipsa_t *);
-extern int sadb_whack_label_v6(mblk_t **, ipsa_t *);
+extern mblk_t *sadb_whack_label(mblk_t *, ipsa_t *, ip_xmit_attr_t *,
+ kstat_named_t *, ipdropper_t *);
+extern mblk_t *sadb_whack_label_v4(mblk_t *, ipsa_t *, kstat_named_t *,
+ ipdropper_t *);
+extern mblk_t *sadb_whack_label_v6(mblk_t *, ipsa_t *, kstat_named_t *,
+ ipdropper_t *);
extern boolean_t update_iv(uint8_t *, queue_t *, ipsa_t *, ipsecesp_stack_t *);
/*
* Tunnel-support SPD functions and variables.
*/
struct iptun_s; /* Defined in inet/iptun/iptun_impl.h. */
-extern boolean_t ipsec_tun_inbound(mblk_t *, mblk_t **, ipsec_tun_pol_t *,
+extern mblk_t *ipsec_tun_inbound(ip_recv_attr_t *, mblk_t *, ipsec_tun_pol_t *,
ipha_t *, ip6_t *, ipha_t *, ip6_t *, int, netstack_t *);
extern mblk_t *ipsec_tun_outbound(mblk_t *, struct iptun_s *, ipha_t *,
- ip6_t *, ipha_t *, ip6_t *, int);
+ ip6_t *, ipha_t *, ip6_t *, int, ip_xmit_attr_t *);
extern void itp_free(ipsec_tun_pol_t *, netstack_t *);
extern ipsec_tun_pol_t *create_tunnel_policy(char *, int *, uint64_t *,
netstack_t *);
@@ -951,9 +952,9 @@ extern ipsec_tun_pol_t *itp_get_byaddr(uint32_t *, uint32_t *, int,
*/
extern void ipsecah_in_assocfailure(mblk_t *, char, ushort_t, char *,
- uint32_t, void *, int, ipsecah_stack_t *);
+ uint32_t, void *, int, ip_recv_attr_t *ira);
extern void ipsecesp_in_assocfailure(mblk_t *, char, ushort_t, char *,
- uint32_t, void *, int, ipsecesp_stack_t *);
+ uint32_t, void *, int, ip_recv_attr_t *ira);
extern void ipsecesp_send_keepalive(ipsa_t *);
/*
@@ -987,13 +988,8 @@ extern void ipsecah_algs_changed(netstack_t *);
extern void ipsecesp_algs_changed(netstack_t *);
extern void ipsecesp_init_funcs(ipsa_t *);
extern void ipsecah_init_funcs(ipsa_t *);
-extern ipsec_status_t ipsecah_icmp_error(mblk_t *);
-extern ipsec_status_t ipsecesp_icmp_error(mblk_t *);
-
-/*
- * Wrapper for putnext() to ipsec accelerated interface.
- */
-extern void ipsec_hw_putnext(queue_t *, mblk_t *);
+extern mblk_t *ipsecah_icmp_error(mblk_t *, ip_recv_attr_t *);
+extern mblk_t *ipsecesp_icmp_error(mblk_t *, ip_recv_attr_t *);
/*
* spdsock functions that are called directly by IP.
@@ -1003,11 +999,11 @@ extern void spdsock_update_pending_algs(netstack_t *);
/*
* IP functions that are called from AH and ESP.
*/
-extern boolean_t ipsec_outbound_sa(mblk_t *, uint_t);
-extern esph_t *ipsec_inbound_esp_sa(mblk_t *, netstack_t *);
-extern ah_t *ipsec_inbound_ah_sa(mblk_t *, netstack_t *);
+extern boolean_t ipsec_outbound_sa(mblk_t *, ip_xmit_attr_t *, uint_t);
+extern mblk_t *ipsec_inbound_esp_sa(mblk_t *, ip_recv_attr_t *, esph_t **);
+extern mblk_t *ipsec_inbound_ah_sa(mblk_t *, ip_recv_attr_t *, ah_t **);
extern ipsec_policy_t *ipsec_find_policy_head(ipsec_policy_t *,
- ipsec_policy_head_t *, int, ipsec_selector_t *, netstack_t *);
+ ipsec_policy_head_t *, int, ipsec_selector_t *);
/*
* IP dropper init/destroy.
@@ -1019,7 +1015,7 @@ void ip_drop_destroy(ipsec_stack_t *);
* Common functions
*/
extern boolean_t ip_addr_match(uint8_t *, int, in6_addr_t *);
-extern boolean_t ipsec_label_match(cred_t *, cred_t *);
+extern boolean_t ipsec_label_match(ts_label_t *, ts_label_t *);
/*
* AH and ESP counters types.
diff --git a/usr/src/uts/common/inet/ipsec_info.h b/usr/src/uts/common/inet/ipsec_info.h
index 3c7ede8405..c1bde9fcb7 100644
--- a/usr/src/uts/common/inet/ipsec_info.h
+++ b/usr/src/uts/common/inet/ipsec_info.h
@@ -34,22 +34,12 @@ extern "C" {
/*
* IPsec informational messages. These are M_CTL STREAMS messages, which
- * convey IPsec information between various IP and related modules. The
- * messages come in a few flavors:
- *
- * * IPSEC_{IN,OUT} - These show what IPsec action have been taken (for
- * inbound datagrams), or need to be taken (for outbound datagrams).
- * They flow between AH/ESP and IP.
+ * convey IPsec information between various IP and related modules. Most
+ * have been deprecated by the de-STREAMS-ing of TCP/IP. What remains is:
*
* * Keysock consumer interface - These messages are wrappers for
* PF_KEY messages. They flow between AH/ESP and keysock.
*
- * Some of these messages include pointers such as a netstack_t pointer.
- * We do not explicitly reference count those with netstack_hold/rele,
- * since we depend on IP's ability to discard all of the IPSEC_{IN,OUT}
- * messages in order to handle the ipsa pointers.
- * We have special logic when doing asynch callouts to kEF for which we
- * verify netstack_t pointer using the netstackid_t.
*/
/*
@@ -69,223 +59,11 @@ extern "C" {
* M_CTL types for IPsec messages. Remember, the values 0x40 - 0x4f and 0x60
* - 0x6f are not to be used because of potential little-endian confusion.
*
- * Offsets 1-25 (decimal) are in use, spread through this file.
+ * Offsets 3-7 (decimal) are in use, spread through this file.
* Check for duplicates through the whole file before adding.
*/
/*
- * IPSEC_{IN,OUT} policy expressors.
- */
-#define IPSEC_IN (IPSEC_M_CTL + 1)
-#define IPSEC_OUT (IPSEC_M_CTL + 2)
-#define MAXSALTSIZE 8
-
-/*
- * For combined mode ciphers, store the crypto_mechanism_t in the
- * per-packet ipsec_in_t/ipsec_out_t structures. This is because the PARAMS
- * and nonce values change for each packet. For non-combined mode
- * ciphers, these values are constant for the life of the SA.
- */
-typedef struct ipsa_cm_mech_s {
- crypto_mechanism_t combined_mech;
- union {
- CK_AES_CCM_PARAMS paramu_ccm;
- CK_AES_GCM_PARAMS paramu_gcm;
- } paramu;
- uint8_t nonce[MAXSALTSIZE + sizeof (uint64_t)];
-#define param_ulMACSize paramu.paramu_ccm.ulMACSize
-#define param_ulNonceSize paramu.paramu_ccm.ipsa_ulNonceSize
-#define param_ulAuthDataSize paramu.paramu_ccm.ipsa_ulAuthDataSize
-#define param_ulDataSize paramu.paramu_ccm.ipsa_ulDataSize
-#define param_nonce paramu.paramu_ccm.nonce
-#define param_authData paramu.paramu_ccm.authData
-#define param_pIv paramu.paramu_gcm.ipsa_pIv
-#define param_ulIvLen paramu.paramu_gcm.ulIvLen
-#define param_ulIvBits paramu.paramu_gcm.ulIvBits
-#define param_pAAD paramu.paramu_gcm.pAAD
-#define param_ulAADLen paramu.paramu_gcm.ulAADLen
-#define param_ulTagBits paramu.paramu_gcm.ulTagBits
-} ipsa_cm_mech_t;
-
-/*
- * This is used for communication between IP and IPSEC (AH/ESP)
- * for Inbound datagrams. IPSEC_IN is allocated by IP before IPSEC
- * processing begins. On return spi fields are initialized so that
- * IP can locate the security associations later on for doing policy
- * checks. For loopback case, IPSEC processing is not done. But the
- * attributes of the security are reflected in <foo>_done fields below.
- * The code in policy check infers that it is a loopback case and
- * would not try to get the associations.
- *
- * The comment below (and for other netstack_t references) refers
- * to the fact that we only do netstack_hold in particular cases,
- * such as the references from open streams (ill_t and conn_t's
- * pointers). Internally within IP we rely on IP's ability to cleanup e.g.
- * ire_t's when an ill goes away.
- */
-typedef struct ipsec_in_s {
- uint32_t ipsec_in_type;
- uint32_t ipsec_in_len;
- frtn_t ipsec_in_frtn; /* for esballoc() callback */
- struct ipsa_s *ipsec_in_ah_sa; /* SA for AH */
- struct ipsa_s *ipsec_in_esp_sa; /* SA for ESP */
-
- struct ipsec_policy_head_s *ipsec_in_policy;
- struct ipsec_action_s *ipsec_in_action; /* how we made it in.. */
- unsigned int
- ipsec_in_secure : 1, /* Is the message attached secure ? */
- ipsec_in_v4 : 1, /* Is this an ipv4 packet ? */
- ipsec_in_loopback : 1, /* Is this a loopback request ? */
- ipsec_in_dont_check : 1, /* Used by TCP to avoid policy check */
-
- ipsec_in_decaps : 1, /* Was this packet decapsulated from */
- /* a matching inner packet? */
- ipsec_in_accelerated : 1, /* hardware accelerated packet */
-
- ipsec_in_icmp_loopback : 1, /* Looped-back ICMP packet, */
- /* all should trust this. */
- ipsec_in_pad_bits : 25;
-
- int ipsec_in_ill_index; /* interface on which ipha_dst was */
- /* configured when pkt was recv'd */
- int ipsec_in_rill_index; /* interface on which pkt was recv'd */
- uint32_t ipsec_in_esp_udp_ports; /* For an ESP-in-UDP packet. */
- mblk_t *ipsec_in_da; /* data attr. for accelerated pkts */
-
- /*
- * For call to the kernel crypto framework. State needed during
- * the execution of a crypto request. Storing these here
- * allow us to avoid a separate allocation before calling the
- * crypto framework.
- */
- size_t ipsec_in_skip_len; /* len to skip for AH auth */
- crypto_data_t ipsec_in_crypto_data; /* single op crypto data */
- crypto_dual_data_t ipsec_in_crypto_dual_data; /* for dual ops */
- crypto_data_t ipsec_in_crypto_mac; /* to store the MAC */
-
- zoneid_t ipsec_in_zoneid; /* target zone for the datagram */
- netstack_t *ipsec_in_ns; /* Does not have a netstack_hold */
- ipsa_cm_mech_t ipsec_in_cmm; /* PARAMS for Combined mode mechs */
- netstackid_t ipsec_in_stackid; /* Used while waing for kEF callback */
-} ipsec_in_t;
-
-#define IPSECOUT_MAX_ADDRLEN 4 /* Max addr len. (in 32-bit words) */
-/*
- * This is used for communication between IP and IPSEC (AH/ESP)
- * for Outbound datagrams. IPSEC_OUT is allocated by IP before IPSEC
- * processing begins. On return SA fields are initialized so that
- * IP can locate the security associations later on for doing policy
- * checks. The policy and the actions associated with this packet are
- * stored in the ipsec_out_policy and ipsec_out_act fields respectively.
- * IPSEC_OUT is also used to carry non-ipsec information when conn is
- * absent or the conn information is lost across the calls to ARP.
- * example: message from ARP or from ICMP error routines.
- */
-typedef struct ipsec_out_s {
- uint32_t ipsec_out_type;
- uint32_t ipsec_out_len;
- frtn_t ipsec_out_frtn; /* for esballoc() callback */
- struct ipsec_policy_head_s *ipsec_out_polhead;
- ipsec_latch_t *ipsec_out_latch;
- struct ipsec_policy_s *ipsec_out_policy; /* why are we here? */
- struct ipsec_action_s *ipsec_out_act; /* what do we want? */
- struct ipsa_s *ipsec_out_ah_sa; /* AH SA used for the packet */
- struct ipsa_s *ipsec_out_esp_sa; /* ESP SA used for the packet */
- /*
- * NOTE: "Source" and "Dest" are w.r.t. outbound datagrams. Ports can
- * be zero, and the protocol number is needed to make the ports
- * significant.
- */
- uint16_t ipsec_out_src_port; /* Source port number of d-gram. */
- uint16_t ipsec_out_dst_port; /* Destination port number of d-gram. */
- uint8_t ipsec_out_icmp_type; /* ICMP type of d-gram */
- uint8_t ipsec_out_icmp_code; /* ICMP code of d-gram */
-
- sa_family_t ipsec_out_inaf; /* Inner address family */
- uint32_t ipsec_out_insrc[IPSECOUT_MAX_ADDRLEN]; /* Inner src address */
- uint32_t ipsec_out_indst[IPSECOUT_MAX_ADDRLEN]; /* Inner dest address */
- uint8_t ipsec_out_insrcpfx; /* Inner source prefix */
- uint8_t ipsec_out_indstpfx; /* Inner destination prefix */
-
- uint_t ipsec_out_ill_index; /* ill index used for multicast etc. */
- uint8_t ipsec_out_proto; /* IP protocol number for d-gram. */
- unsigned int
- ipsec_out_tunnel : 1, /* Tunnel mode? */
- ipsec_out_use_global_policy : 1, /* Inherit global policy ? */
- ipsec_out_secure : 1, /* Is this secure ? */
- ipsec_out_proc_begin : 1, /* IPSEC processing begun */
- /*
- * Following five values reflects the values stored
- * in conn.
- */
- ipsec_out_multicast_loop : 1,
- ipsec_out_dontroute : 1,
- ipsec_out_reserved : 1,
- ipsec_out_v4 : 1,
-
- ipsec_out_unspec_src : 1, /* IPv6 ip6i_t info */
- ipsec_out_reachable : 1, /* NDP reachability info */
- ipsec_out_failed: 1,
- ipsec_out_se_done: 1,
-
- ipsec_out_esp_done: 1,
- ipsec_out_ah_done: 1,
- ipsec_out_need_policy: 1,
-
- /*
- * To indicate that packet must be accelerated, i.e.
- * ICV or encryption performed, by Provider.
- */
- ipsec_out_accelerated : 1,
- /*
- * Used by IP to tell IPsec that the outbound ill for this
- * packet supports acceleration of the AH or ESP prototocol.
- * If set, ipsec_out_capab_ill_index contains the
- * index of the ill.
- */
- ipsec_out_is_capab_ill : 1,
- /*
- * Indicates ICMP message destined for self. These
- * messages are to be trusted by all receivers.
- */
- ipsec_out_icmp_loopback: 1,
- ipsec_out_ip_nexthop : 1, /* IP_NEXTHOP option is set */
- ipsec_out_pad_bits : 13;
- cred_t *ipsec_out_cred;
- uint32_t ipsec_out_capab_ill_index;
-
- /*
- * For call to the kernel crypto framework. State needed during
- * the execution of a crypto request. Storing these here
- * allow us to avoid a separate allocation before calling the
- * crypto framework.
- */
- size_t ipsec_out_skip_len; /* len to skip for AH auth */
- crypto_data_t ipsec_out_crypto_data; /* single op crypto data */
- crypto_dual_data_t ipsec_out_crypto_dual_data; /* for dual ops */
- crypto_data_t ipsec_out_crypto_mac; /* to store the MAC */
-
- zoneid_t ipsec_out_zoneid; /* source zone for the datagram */
- in6_addr_t ipsec_out_nexthop_v6; /* nexthop IP address */
-#define ipsec_out_nexthop_addr V4_PART_OF_V6(ipsec_out_nexthop_v6)
- netstack_t *ipsec_out_ns; /* Does not have a netstack_hold */
- netstackid_t ipsec_out_stackid; /* Used while waing for kEF callback */
- ipsa_cm_mech_t ipsec_out_cmm; /* PARAMS for Combined mode mechs */
-} ipsec_out_t;
-
-/*
- * This is used to mark the ipsec_out_t *req* fields
- * when the operation is done without affecting the
- * requests.
- */
-#define IPSEC_REQ_DONE 0x80000000
-/*
- * Operation could not be performed by the AH/ESP
- * module.
- */
-#define IPSEC_REQ_FAILED 0x40000000
-
-/*
* Keysock consumer interface.
*
* The driver/module keysock (which is a driver to PF_KEY sockets, but is
@@ -368,32 +146,6 @@ typedef struct keysock_out_err_s {
} keysock_out_err_t;
/*
- * M_CTL message type for sending inbound pkt information between IP & ULP.
- * These are _not_ related to IPsec in any way, but are here so that there is
- * one place where all these values are defined which makes it easier to track.
- * The choice of this value has the same rationale as explained above.
- */
-#define IN_PKTINFO (IPSEC_M_CTL + 24)
-
-
-/*
- * IPSEC_CTL messages are used by IPsec to send control type requests
- * to IP. Such a control message is currently used by IPsec to request
- * that IP send the contents of an IPsec SA or the entire SADB to
- * every IPsec hardware acceleration capable provider.
- */
-
-#define IPSEC_CTL (IPSEC_M_CTL + 25)
-
-typedef struct ipsec_ctl_s {
- uint32_t ipsec_ctl_type;
- uint32_t ipsec_ctl_len;
- uint_t ipsec_ctl_sa_type;
- void *ipsec_ctl_sa;
-} ipsec_ctl_t;
-
-
-/*
* All IPsec informational messages are placed into the ipsec_info_t
* union, so that allocation can be done once, and IPsec informational
* messages can be recycled.
@@ -403,13 +155,10 @@ typedef union ipsec_info_u {
uint32_t ipsec_allu_type;
uint32_t ipsec_allu_len; /* In bytes */
} ipsec_allu;
- ipsec_in_t ipsec_in;
- ipsec_out_t ipsec_out;
keysock_hello_ack_t keysock_hello_ack;
keysock_in_t keysock_in;
keysock_out_t keysock_out;
keysock_out_err_t keysock_out_err;
- ipsec_ctl_t ipsec_ctl;
} ipsec_info_t;
#define ipsec_info_type ipsec_allu.ipsec_allu_type
#define ipsec_info_len ipsec_allu.ipsec_allu_len
diff --git a/usr/src/uts/common/inet/ipsecah.h b/usr/src/uts/common/inet/ipsecah.h
index c389664164..cde745da88 100644
--- a/usr/src/uts/common/inet/ipsecah.h
+++ b/usr/src/uts/common/inet/ipsecah.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _INET_IPSECAH_H
#define _INET_IPSECAH_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <inet/ip.h>
#include <inet/ipdrop.h>
@@ -62,9 +60,6 @@ typedef struct ah_kstats_s
kstat_named_t ah_stat_acquire_requests;
kstat_named_t ah_stat_bytes_expired;
kstat_named_t ah_stat_out_discards;
- kstat_named_t ah_stat_in_accelerated;
- kstat_named_t ah_stat_out_accelerated;
- kstat_named_t ah_stat_noaccel;
kstat_named_t ah_stat_crypto_sync;
kstat_named_t ah_stat_crypto_async;
kstat_named_t ah_stat_crypto_failures;
@@ -116,8 +111,6 @@ struct ipsecah_stack {
*/
queue_t *ah_pfkey_q;
timeout_id_t ah_event;
-
- mblk_t *ah_ip_unbind;
};
typedef struct ipsecah_stack ipsecah_stack_t;
diff --git a/usr/src/uts/common/inet/ipsecesp.h b/usr/src/uts/common/inet/ipsecesp.h
index 2dfb73c667..7be35276aa 100644
--- a/usr/src/uts/common/inet/ipsecesp.h
+++ b/usr/src/uts/common/inet/ipsecesp.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _INET_IPSECESP_H
#define _INET_IPSECESP_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <inet/ip.h>
#include <inet/ipdrop.h>
@@ -70,10 +68,7 @@ struct ipsecesp_stack {
queue_t *esp_pfkey_q;
timeout_id_t esp_event;
- mblk_t *esp_ip_unbind;
-
sadbp_t esp_sadb;
-
};
typedef struct ipsecesp_stack ipsecesp_stack_t;
diff --git a/usr/src/uts/common/inet/iptun/iptun.c b/usr/src/uts/common/inet/iptun/iptun.c
index bc2f1d64d5..505aaccb31 100644
--- a/usr/src/uts/common/inet/iptun/iptun.c
+++ b/usr/src/uts/common/inet/iptun/iptun.c
@@ -76,6 +76,8 @@
#include <inet/ip.h>
#include <inet/ip_ire.h>
#include <inet/ipsec_impl.h>
+#include <sys/tsol/label.h>
+#include <sys/tsol/tnet.h>
#include <inet/iptun.h>
#include "iptun_impl.h"
@@ -87,8 +89,6 @@
#define IPTUN_HASH_KEY(key) ((mod_hash_key_t)(uintptr_t)(key))
-#define IPTUNQ_DEV "/dev/iptunq"
-
#define IPTUN_MIN_IPV4_MTU 576 /* ip.h still uses 68 (!) */
#define IPTUN_MIN_IPV6_MTU IPV6_MIN_MTU
#define IPTUN_MAX_IPV4_MTU (IP_MAXPACKET - sizeof (ipha_t))
@@ -113,15 +113,18 @@ static iptun_encaplim_t iptun_encaplim_init = {
0
};
-/* Table containing per-iptun-type information. */
+/*
+ * Table containing per-iptun-type information.
+ * Since IPv6 can run over all of these we have the IPv6 min as the min MTU.
+ */
static iptun_typeinfo_t iptun_type_table[] = {
- { IPTUN_TYPE_IPV4, MAC_PLUGIN_IDENT_IPV4, IPV4_VERSION, ip_output,
- IPTUN_MIN_IPV4_MTU, IPTUN_MAX_IPV4_MTU, B_TRUE },
- { IPTUN_TYPE_IPV6, MAC_PLUGIN_IDENT_IPV6, IPV6_VERSION, ip_output_v6,
+ { IPTUN_TYPE_IPV4, MAC_PLUGIN_IDENT_IPV4, IPV4_VERSION,
+ IPTUN_MIN_IPV6_MTU, IPTUN_MAX_IPV4_MTU, B_TRUE },
+ { IPTUN_TYPE_IPV6, MAC_PLUGIN_IDENT_IPV6, IPV6_VERSION,
IPTUN_MIN_IPV6_MTU, IPTUN_MAX_IPV6_MTU, B_TRUE },
- { IPTUN_TYPE_6TO4, MAC_PLUGIN_IDENT_6TO4, IPV4_VERSION, ip_output,
- IPTUN_MIN_IPV4_MTU, IPTUN_MAX_IPV4_MTU, B_FALSE },
- { IPTUN_TYPE_UNKNOWN, NULL, 0, NULL, 0, 0, B_FALSE }
+ { IPTUN_TYPE_6TO4, MAC_PLUGIN_IDENT_6TO4, IPV4_VERSION,
+ IPTUN_MIN_IPV6_MTU, IPTUN_MAX_IPV4_MTU, B_FALSE },
+ { IPTUN_TYPE_UNKNOWN, NULL, 0, 0, 0, B_FALSE }
};
/*
@@ -140,7 +143,6 @@ kmem_cache_t *iptun_cache;
ddi_taskq_t *iptun_taskq;
typedef enum {
- IPTUN_TASK_PMTU_UPDATE, /* obtain new destination path-MTU */
IPTUN_TASK_MTU_UPDATE, /* tell mac about new tunnel link MTU */
IPTUN_TASK_LADDR_UPDATE, /* tell mac about new local address */
IPTUN_TASK_RADDR_UPDATE, /* tell mac about new remote address */
@@ -158,13 +160,23 @@ static int iptun_enter(iptun_t *);
static void iptun_exit(iptun_t *);
static void iptun_headergen(iptun_t *, boolean_t);
static void iptun_drop_pkt(mblk_t *, uint64_t *);
-static void iptun_input(void *, mblk_t *, void *);
+static void iptun_input(void *, mblk_t *, void *, ip_recv_attr_t *);
+static void iptun_input_icmp(void *, mblk_t *, void *, ip_recv_attr_t *);
static void iptun_output(iptun_t *, mblk_t *);
-static uint32_t iptun_get_maxmtu(iptun_t *, uint32_t);
-static uint32_t iptun_update_mtu(iptun_t *, uint32_t);
-static uint32_t iptun_get_dst_pmtu(iptun_t *);
+static uint32_t iptun_get_maxmtu(iptun_t *, ip_xmit_attr_t *, uint32_t);
+static uint32_t iptun_update_mtu(iptun_t *, ip_xmit_attr_t *, uint32_t);
+static uint32_t iptun_get_dst_pmtu(iptun_t *, ip_xmit_attr_t *);
+static void iptun_update_dst_pmtu(iptun_t *, ip_xmit_attr_t *);
static int iptun_setladdr(iptun_t *, const struct sockaddr_storage *);
+static void iptun_output_6to4(iptun_t *, mblk_t *);
+static void iptun_output_common(iptun_t *, ip_xmit_attr_t *, mblk_t *);
+static boolean_t iptun_verifyicmp(conn_t *, void *, icmph_t *, icmp6_t *,
+ ip_recv_attr_t *);
+
+static void iptun_notify(void *, ip_xmit_attr_t *, ixa_notify_type_t,
+ ixa_notify_arg_t);
+
static mac_callbacks_t iptun_m_callbacks;
static int
@@ -295,13 +307,6 @@ iptun_m_tx(void *arg, mblk_t *mpchain)
return (NULL);
}
- /*
- * Request the destination's path MTU information regularly in case
- * path MTU has increased.
- */
- if (IPTUN_PMTU_TOO_OLD(iptun))
- iptun_task_dispatch(iptun, IPTUN_TASK_PMTU_UPDATE);
-
for (mp = mpchain; mp != NULL; mp = nmp) {
nmp = mp->b_next;
mp->b_next = NULL;
@@ -350,7 +355,7 @@ iptun_m_setprop(void *barg, const char *pr_name, mac_prop_id_t pr_num,
}
break;
case MAC_PROP_MTU: {
- uint32_t maxmtu = iptun_get_maxmtu(iptun, 0);
+ uint32_t maxmtu = iptun_get_maxmtu(iptun, NULL, 0);
if (value < iptun->iptun_typeinfo->iti_minmtu ||
value > maxmtu) {
@@ -434,7 +439,7 @@ iptun_m_getprop(void *barg, const char *pr_name, mac_prop_id_t pr_num,
}
break;
case MAC_PROP_MTU: {
- uint32_t maxmtu = iptun_get_maxmtu(iptun, 0);
+ uint32_t maxmtu = iptun_get_maxmtu(iptun, NULL, 0);
if (is_possible) {
range.range_uint32[0].mpur_min =
@@ -516,20 +521,11 @@ iptun_enter_by_linkid(datalink_id_t linkid, iptun_t **iptun)
}
/*
- * Handle tasks that were deferred through the iptun_taskq. These fall into
- * two categories:
- *
- * 1. Tasks that were defered because we didn't want to spend time doing them
- * while in the data path. Only IPTUN_TASK_PMTU_UPDATE falls into this
- * category.
- *
- * 2. Tasks that were defered because they require calling up to the mac
- * module, and we can't call up to the mac module while holding locks.
+ * Handle tasks that were deferred through the iptun_taskq because they require
+ * calling up to the mac module, and we can't call up to the mac module while
+ * holding locks.
*
- * Handling 1 is easy; we just lookup the iptun_t, perform the task, exit the
- * tunnel, and we're done.
- *
- * Handling 2 is tricky to get right without introducing race conditions and
+ * This is tricky to get right without introducing race conditions and
* deadlocks with the mac module, as we cannot issue an upcall while in the
* iptun_t. The reason is that upcalls may try and enter the mac perimeter,
* while iptun callbacks (such as iptun_m_setprop()) called from the mac
@@ -573,12 +569,6 @@ iptun_task_cb(void *arg)
if (iptun_enter_by_linkid(linkid, &iptun) != 0)
return;
- if (task == IPTUN_TASK_PMTU_UPDATE) {
- (void) iptun_update_mtu(iptun, 0);
- iptun_exit(iptun);
- return;
- }
-
iptun->iptun_flags |= IPTUN_UPCALL_PENDING;
switch (task) {
@@ -742,53 +732,143 @@ iptun_canbind(iptun_t *iptun)
!(iptun->iptun_typeinfo->iti_hasraddr)));
}
+/*
+ * Verify that the local address is valid, and insert in the fanout
+ */
static int
iptun_bind(iptun_t *iptun)
{
- conn_t *connp = iptun->iptun_connp;
- int err;
+ conn_t *connp = iptun->iptun_connp;
+ int error = 0;
+ ip_xmit_attr_t *ixa;
+ iulp_t uinfo;
+ ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
+
+ /* Get an exclusive ixa for this thread, and replace conn_ixa */
+ ixa = conn_get_ixa(connp, B_TRUE);
+ if (ixa == NULL)
+ return (ENOMEM);
+ ASSERT(ixa->ixa_refcnt >= 2);
+ ASSERT(ixa == connp->conn_ixa);
+
+ /* We create PMTU state including for 6to4 */
+ ixa->ixa_flags |= IXAF_PMTU_DISCOVERY;
ASSERT(iptun_canbind(iptun));
+ mutex_enter(&connp->conn_lock);
+ /*
+ * Note that conn_proto can't be set since the upper protocol
+ * can be both 41 and 4 when IPv6 and IPv4 are over the same tunnel.
+ * ipcl_iptun_classify doesn't use conn_proto.
+ */
+ connp->conn_ipversion = iptun->iptun_typeinfo->iti_ipvers;
+
switch (iptun->iptun_typeinfo->iti_type) {
case IPTUN_TYPE_IPV4:
- /*
- * When we set a tunnel's destination address, we do not care
- * if the destination is reachable. Transient routing issues
- * should not inhibit the creation of a tunnel interface, for
- * example. For that reason, we pass in B_FALSE for the
- * verify_dst argument of ip_proto_bind_connected_v4() (and
- * similarly for IPv6 tunnels below).
- */
- err = ip_proto_bind_connected_v4(connp, NULL, IPPROTO_ENCAP,
- &iptun->iptun_laddr4, 0, iptun->iptun_raddr4, 0, B_TRUE,
- B_FALSE, iptun->iptun_cred);
+ IN6_IPADDR_TO_V4MAPPED(iptun->iptun_laddr4,
+ &connp->conn_laddr_v6);
+ IN6_IPADDR_TO_V4MAPPED(iptun->iptun_raddr4,
+ &connp->conn_faddr_v6);
+ ixa->ixa_flags |= IXAF_IS_IPV4;
+ if (ip_laddr_verify_v4(iptun->iptun_laddr4, IPCL_ZONEID(connp),
+ ipst, B_FALSE) != IPVL_UNICAST_UP) {
+ mutex_exit(&connp->conn_lock);
+ error = EADDRNOTAVAIL;
+ goto done;
+ }
break;
case IPTUN_TYPE_IPV6:
- err = ip_proto_bind_connected_v6(connp, NULL, IPPROTO_IPV6,
- &iptun->iptun_laddr6, 0, &iptun->iptun_raddr6, NULL, 0,
- B_TRUE, B_FALSE, iptun->iptun_cred);
+ connp->conn_laddr_v6 = iptun->iptun_laddr6;
+ connp->conn_faddr_v6 = iptun->iptun_raddr6;
+ ixa->ixa_flags &= ~IXAF_IS_IPV4;
+ /* We use a zero scopeid for now */
+ if (ip_laddr_verify_v6(&iptun->iptun_laddr6, IPCL_ZONEID(connp),
+ ipst, B_FALSE, 0) != IPVL_UNICAST_UP) {
+ mutex_exit(&connp->conn_lock);
+ error = EADDRNOTAVAIL;
+ goto done;
+ }
break;
case IPTUN_TYPE_6TO4:
- err = ip_proto_bind_laddr_v4(connp, NULL, IPPROTO_IPV6,
- iptun->iptun_laddr4, 0, B_TRUE);
- break;
+ IN6_IPADDR_TO_V4MAPPED(iptun->iptun_laddr4,
+ &connp->conn_laddr_v6);
+ IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &connp->conn_faddr_v6);
+ ixa->ixa_flags |= IXAF_IS_IPV4;
+ mutex_exit(&connp->conn_lock);
+
+ switch (ip_laddr_verify_v4(iptun->iptun_laddr4,
+ IPCL_ZONEID(connp), ipst, B_FALSE)) {
+ case IPVL_UNICAST_UP:
+ case IPVL_UNICAST_DOWN:
+ break;
+ default:
+ error = EADDRNOTAVAIL;
+ goto done;
+ }
+ goto insert;
}
- if (err == 0) {
- iptun->iptun_flags |= IPTUN_BOUND;
+ /* In case previous destination was multirt */
+ ip_attr_newdst(ixa);
- /*
- * Now that we're bound with ip below us, this is a good time
- * to initialize the destination path MTU and to re-calculate
- * the tunnel's link MTU.
- */
- (void) iptun_update_mtu(iptun, 0);
+ /*
+ * When we set a tunnel's destination address, we do not
+ * care if the destination is reachable. Transient routing
+ * issues should not inhibit the creation of a tunnel
+ * interface, for example. Thus we pass B_FALSE here.
+ */
+ connp->conn_saddr_v6 = connp->conn_laddr_v6;
+ mutex_exit(&connp->conn_lock);
- if (IS_IPTUN_RUNNING(iptun))
- iptun_task_dispatch(iptun, IPTUN_TASK_LINK_UPDATE);
- }
- return (err);
+ /* As long as the MTU is large we avoid fragmentation */
+ ixa->ixa_flags |= IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF;
+
+ /* We handle IPsec in iptun_output_common */
+ error = ip_attr_connect(connp, ixa, &connp->conn_saddr_v6,
+ &connp->conn_faddr_v6, &connp->conn_faddr_v6, 0,
+ &connp->conn_saddr_v6, &uinfo, 0);
+
+ if (error != 0)
+ goto done;
+
+ /* saddr shouldn't change since it was already set */
+ ASSERT(IN6_ARE_ADDR_EQUAL(&connp->conn_laddr_v6,
+ &connp->conn_saddr_v6));
+
+ /* We set IXAF_VERIFY_PMTU to catch PMTU increases */
+ ixa->ixa_flags |= IXAF_VERIFY_PMTU;
+ ASSERT(uinfo.iulp_mtu != 0);
+
+ /*
+ * Allow setting new policies.
+ * The addresses/ports are already set, thus the IPsec policy calls
+ * can handle their passed-in conn's.
+ */
+ connp->conn_policy_cached = B_FALSE;
+
+insert:
+ error = ipcl_conn_insert(connp);
+ if (error != 0)
+ goto done;
+
+ /* Record this as the "last" send even though we haven't sent any */
+ connp->conn_v6lastdst = connp->conn_faddr_v6;
+
+ iptun->iptun_flags |= IPTUN_BOUND;
+ /*
+ * Now that we're bound with ip below us, this is a good
+ * time to initialize the destination path MTU and to
+ * re-calculate the tunnel's link MTU.
+ */
+ (void) iptun_update_mtu(iptun, ixa, 0);
+
+ if (IS_IPTUN_RUNNING(iptun))
+ iptun_task_dispatch(iptun, IPTUN_TASK_LINK_UPDATE);
+
+done:
+ ixa_refrele(ixa);
+ return (error);
}
static void
@@ -986,7 +1066,7 @@ iptun_set_sec_simple(iptun_t *iptun, const ipsec_req_t *ipsr)
* Adjust MTU and make sure the DL side knows what's up.
*/
itp->itp_flags = ITPF_P_ACTIVE;
- (void) iptun_update_mtu(iptun, 0);
+ (void) iptun_update_mtu(iptun, NULL, 0);
old_policy = B_FALSE; /* Blank out inactive - we succeeded */
} else {
rw_exit(&itp->itp_policy->iph_lock);
@@ -1170,8 +1250,16 @@ iptun_conn_create(iptun_t *iptun, netstack_t *ns, cred_t *credp)
connp->conn_flags |= IPCL_IPTUN;
connp->conn_iptun = iptun;
connp->conn_recv = iptun_input;
- connp->conn_rq = ns->netstack_iptun->iptuns_g_q;
- connp->conn_wq = WR(connp->conn_rq);
+ connp->conn_recvicmp = iptun_input_icmp;
+ connp->conn_verifyicmp = iptun_verifyicmp;
+
+ /*
+ * Register iptun_notify to listen to capability changes detected by IP.
+ * This upcall is made in the context of the call to conn_ip_output.
+ */
+ connp->conn_ixa->ixa_notify = iptun_notify;
+ connp->conn_ixa->ixa_notify_cookie = iptun;
+
/*
* For exclusive stacks we set conn_zoneid to GLOBAL_ZONEID as is done
* for all other conn_t's.
@@ -1187,11 +1275,32 @@ iptun_conn_create(iptun_t *iptun, netstack_t *ns, cred_t *credp)
connp->conn_cred = credp;
/* crfree() is done in ipcl_conn_destroy(), called by CONN_DEC_REF() */
crhold(connp->conn_cred);
+ connp->conn_cpid = NOPID;
- connp->conn_send = iptun->iptun_typeinfo->iti_txfunc;
- connp->conn_af_isv6 = iptun->iptun_typeinfo->iti_ipvers == IPV6_VERSION;
+ /* conn_allzones can not be set this early, hence no IPCL_ZONEID */
+ connp->conn_ixa->ixa_zoneid = connp->conn_zoneid;
ASSERT(connp->conn_ref == 1);
+ /* Cache things in ixa without an extra refhold */
+ connp->conn_ixa->ixa_cred = connp->conn_cred;
+ connp->conn_ixa->ixa_cpid = connp->conn_cpid;
+ if (is_system_labeled())
+ connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred);
+
+ /*
+ * Have conn_ip_output drop packets should our outer source
+ * go invalid
+ */
+ connp->conn_ixa->ixa_flags |= IXAF_VERIFY_SOURCE;
+
+ switch (iptun->iptun_typeinfo->iti_ipvers) {
+ case IPV4_VERSION:
+ connp->conn_family = AF_INET6;
+ break;
+ case IPV6_VERSION:
+ connp->conn_family = AF_INET;
+ break;
+ }
mutex_enter(&connp->conn_lock);
connp->conn_state_flags &= ~CONN_INCIPIENT;
mutex_exit(&connp->conn_lock);
@@ -1207,26 +1316,6 @@ iptun_conn_destroy(conn_t *connp)
CONN_DEC_REF(connp);
}
-static int
-iptun_create_g_q(iptun_stack_t *iptuns, cred_t *credp)
-{
- int err;
- conn_t *connp;
-
- ASSERT(iptuns->iptuns_g_q == NULL);
- /*
- * The global queue for this stack is set when iptunq_open() calls
- * iptun_set_g_q().
- */
- err = ldi_open_by_name(IPTUNQ_DEV, FWRITE|FREAD, credp,
- &iptuns->iptuns_g_q_lh, iptun_ldi_ident);
- if (err == 0) {
- connp = iptuns->iptuns_g_q->q_ptr;
- connp->conn_recv = iptun_input;
- }
- return (err);
-}
-
static iptun_t *
iptun_alloc(void)
{
@@ -1289,11 +1378,6 @@ iptun_free(iptun_t *iptun)
iptun->iptun_connp = NULL;
}
- netstack_rele(iptun->iptun_ns);
- iptun->iptun_ns = NULL;
- crfree(iptun->iptun_cred);
- iptun->iptun_cred = NULL;
-
kmem_cache_free(iptun_cache, iptun);
atomic_dec_32(&iptun_tunnelcount);
}
@@ -1340,19 +1424,6 @@ iptun_create(iptun_kparams_t *ik, cred_t *credp)
ns = netstack_find_by_cred(credp);
iptuns = ns->netstack_iptun;
- /*
- * Before we create any tunnel, we need to ensure that the default
- * STREAMS queue (used to satisfy the ip module's requirement for one)
- * is created. We only do this once per stack. The stream is closed
- * when the stack is destroyed in iptun_stack_fni().
- */
- mutex_enter(&iptuns->iptuns_lock);
- if (iptuns->iptuns_g_q == NULL)
- err = iptun_create_g_q(iptuns, zone_kcred());
- mutex_exit(&iptuns->iptuns_lock);
- if (err != 0)
- goto done;
-
if ((iptun = iptun_alloc()) == NULL) {
err = ENOMEM;
goto done;
@@ -1360,8 +1431,6 @@ iptun_create(iptun_kparams_t *ik, cred_t *credp)
iptun->iptun_linkid = ik->iptun_kparam_linkid;
iptun->iptun_zoneid = zoneid;
- crhold(credp);
- iptun->iptun_cred = credp;
iptun->iptun_ns = ns;
iptun->iptun_typeinfo = iptun_gettypeinfo(ik->iptun_kparam_type);
@@ -1668,49 +1737,142 @@ iptun_set_policy(datalink_id_t linkid, ipsec_tun_pol_t *itp)
ITP_REFHOLD(itp);
iptun->iptun_itp = itp;
/* IPsec policy means IPsec overhead, which means lower MTU. */
- (void) iptun_update_mtu(iptun, 0);
+ (void) iptun_update_mtu(iptun, NULL, 0);
}
iptun_exit(iptun);
}
/*
* Obtain the path MTU to the tunnel destination.
+ * Can return zero in some cases.
*/
static uint32_t
-iptun_get_dst_pmtu(iptun_t *iptun)
+iptun_get_dst_pmtu(iptun_t *iptun, ip_xmit_attr_t *ixa)
{
- ire_t *ire = NULL;
- ip_stack_t *ipst = iptun->iptun_ns->netstack_ip;
uint32_t pmtu = 0;
+ conn_t *connp = iptun->iptun_connp;
+ boolean_t need_rele = B_FALSE;
/*
- * We only obtain the destination IRE for tunnels that have a remote
- * tunnel address.
+ * We only obtain the pmtu for tunnels that have a remote tunnel
+ * address.
*/
if (!(iptun->iptun_flags & IPTUN_RADDR))
return (0);
- switch (iptun->iptun_typeinfo->iti_ipvers) {
- case IPV4_VERSION:
- ire = ire_route_lookup(iptun->iptun_raddr4, INADDR_ANY,
- INADDR_ANY, 0, NULL, NULL, iptun->iptun_connp->conn_zoneid,
- NULL, (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT), ipst);
- break;
- case IPV6_VERSION:
- ire = ire_route_lookup_v6(&iptun->iptun_raddr6, NULL, NULL, 0,
- NULL, NULL, iptun->iptun_connp->conn_zoneid, NULL,
- (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT), ipst);
- break;
+ if (ixa == NULL) {
+ ixa = conn_get_ixa(connp, B_FALSE);
+ if (ixa == NULL)
+ return (0);
+ need_rele = B_TRUE;
}
+ /*
+ * Guard against ICMP errors before we have sent, as well as against
+ * and a thread which held conn_ixa.
+ */
+ if (ixa->ixa_ire != NULL) {
+ pmtu = ip_get_pmtu(ixa);
- if (ire != NULL) {
- pmtu = ire->ire_max_frag;
- ire_refrele(ire);
+ /*
+ * For both IPv4 and IPv6 we can have indication that the outer
+ * header needs fragmentation.
+ */
+ if (ixa->ixa_flags & IXAF_PMTU_TOO_SMALL) {
+ /* Must allow fragmentation in ip_output */
+ ixa->ixa_flags &= ~IXAF_DONTFRAG;
+ } else if (iptun->iptun_typeinfo->iti_type != IPTUN_TYPE_6TO4) {
+ ixa->ixa_flags |= IXAF_DONTFRAG;
+ } else {
+ /* ip_get_pmtu might have set this - we don't want it */
+ ixa->ixa_flags &= ~IXAF_PMTU_IPV4_DF;
+ }
}
+
+ if (need_rele)
+ ixa_refrele(ixa);
return (pmtu);
}
/*
+ * Update the ip_xmit_attr_t to capture the current lower path mtu as known
+ * by ip.
+ */
+static void
+iptun_update_dst_pmtu(iptun_t *iptun, ip_xmit_attr_t *ixa)
+{
+ uint32_t pmtu;
+ conn_t *connp = iptun->iptun_connp;
+ boolean_t need_rele = B_FALSE;
+
+ /* IXAF_VERIFY_PMTU is not set if we don't have a fixed destination */
+ if (!(iptun->iptun_flags & IPTUN_RADDR))
+ return;
+
+ if (ixa == NULL) {
+ ixa = conn_get_ixa(connp, B_FALSE);
+ if (ixa == NULL)
+ return;
+ need_rele = B_TRUE;
+ }
+ /*
+ * Guard against ICMP errors before we have sent, as well as against
+ * and a thread which held conn_ixa.
+ */
+ if (ixa->ixa_ire != NULL) {
+ pmtu = ip_get_pmtu(ixa);
+ /*
+ * Update ixa_fragsize and ixa_pmtu.
+ */
+ ixa->ixa_fragsize = ixa->ixa_pmtu = pmtu;
+
+ /*
+ * For both IPv4 and IPv6 we can have indication that the outer
+ * header needs fragmentation.
+ */
+ if (ixa->ixa_flags & IXAF_PMTU_TOO_SMALL) {
+ /* Must allow fragmentation in ip_output */
+ ixa->ixa_flags &= ~IXAF_DONTFRAG;
+ } else if (iptun->iptun_typeinfo->iti_type != IPTUN_TYPE_6TO4) {
+ ixa->ixa_flags |= IXAF_DONTFRAG;
+ } else {
+ /* ip_get_pmtu might have set this - we don't want it */
+ ixa->ixa_flags &= ~IXAF_PMTU_IPV4_DF;
+ }
+ }
+
+ if (need_rele)
+ ixa_refrele(ixa);
+}
+
+/*
+ * There is nothing that iptun can verify in addition to IP having
+ * verified the IP addresses in the fanout.
+ */
+/* ARGSUSED */
+static boolean_t
+iptun_verifyicmp(conn_t *connp, void *arg2, icmph_t *icmph, icmp6_t *icmp6,
+ ip_recv_attr_t *ira)
+{
+ return (B_TRUE);
+}
+
+/*
+ * Notify function registered with ip_xmit_attr_t.
+ */
+static void
+iptun_notify(void *arg, ip_xmit_attr_t *ixa, ixa_notify_type_t ntype,
+ ixa_notify_arg_t narg)
+{
+ iptun_t *iptun = (iptun_t *)arg;
+
+ switch (ntype) {
+ case IXAN_PMTU:
+ (void) iptun_update_mtu(iptun, ixa, narg);
+ break;
+ }
+}
+
+/*
* Returns the max of old_ovhd and the overhead associated with pol.
*/
static uint32_t
@@ -1765,18 +1927,18 @@ iptun_get_ipsec_overhead(iptun_t *iptun)
/* Check for both IPv4 and IPv6. */
sel.ips_protocol = IPPROTO_ENCAP;
pol = ipsec_find_policy_head(NULL, iph, IPSEC_TYPE_OUTBOUND,
- &sel, ns);
+ &sel);
if (pol != NULL) {
ipsec_ovhd = ipsec_act_ovhd(&pol->ipsp_act->ipa_act);
- IPPOL_REFRELE(pol, ns);
+ IPPOL_REFRELE(pol);
}
sel.ips_protocol = IPPROTO_IPV6;
pol = ipsec_find_policy_head(NULL, iph, IPSEC_TYPE_OUTBOUND,
- &sel, ns);
+ &sel);
if (pol != NULL) {
ipsec_ovhd = max(ipsec_ovhd,
ipsec_act_ovhd(&pol->ipsp_act->ipa_act));
- IPPOL_REFRELE(pol, ns);
+ IPPOL_REFRELE(pol);
}
IPPH_REFRELE(iph, ns);
} else {
@@ -1802,10 +1964,14 @@ iptun_get_ipsec_overhead(iptun_t *iptun)
}
/*
- * Calculate and return the maximum possible MTU for the given tunnel.
+ * Calculate and return the maximum possible upper MTU for the given tunnel.
+ *
+ * If new_pmtu is set then we also need to update the lower path MTU information
+ * in the ip_xmit_attr_t. That is needed since we set IXAF_VERIFY_PMTU so that
+ * we are notified by conn_ip_output() when the path MTU increases.
*/
static uint32_t
-iptun_get_maxmtu(iptun_t *iptun, uint32_t new_pmtu)
+iptun_get_maxmtu(iptun_t *iptun, ip_xmit_attr_t *ixa, uint32_t new_pmtu)
{
size_t header_size, ipsec_overhead;
uint32_t maxmtu, pmtu;
@@ -1816,13 +1982,11 @@ iptun_get_maxmtu(iptun_t *iptun, uint32_t new_pmtu)
* iptun_get_dst_pmtu().
*/
if (new_pmtu != 0) {
- if (iptun->iptun_flags & IPTUN_RADDR) {
+ if (iptun->iptun_flags & IPTUN_RADDR)
iptun->iptun_dpmtu = new_pmtu;
- iptun->iptun_dpmtu_lastupdate = ddi_get_lbolt();
- }
pmtu = new_pmtu;
} else if (iptun->iptun_flags & IPTUN_RADDR) {
- if ((pmtu = iptun_get_dst_pmtu(iptun)) == 0) {
+ if ((pmtu = iptun_get_dst_pmtu(iptun, ixa)) == 0) {
/*
* We weren't able to obtain the path-MTU of the
* destination. Use the previous value.
@@ -1830,7 +1994,6 @@ iptun_get_maxmtu(iptun_t *iptun, uint32_t new_pmtu)
pmtu = iptun->iptun_dpmtu;
} else {
iptun->iptun_dpmtu = pmtu;
- iptun->iptun_dpmtu_lastupdate = ddi_get_lbolt();
}
} else {
/*
@@ -1866,19 +2029,23 @@ iptun_get_maxmtu(iptun_t *iptun, uint32_t new_pmtu)
}
/*
- * Re-calculate the tunnel's MTU and notify the MAC layer of any change in
- * MTU. The new_pmtu argument is the new path MTU to the tunnel destination
- * to be used in the tunnel MTU calculation. Passing in 0 for new_pmtu causes
- * the path MTU to be dynamically updated using iptun_update_pmtu().
+ * Re-calculate the tunnel's MTU as seen from above and notify the MAC layer
+ * of any change in MTU. The new_pmtu argument is the new lower path MTU to
+ * the tunnel destination to be used in the tunnel MTU calculation. Passing
+ * in 0 for new_pmtu causes the lower path MTU to be dynamically updated using
+ * ip_get_pmtu().
*
* If the calculated tunnel MTU is different than its previous value, then we
* notify the MAC layer above us of this change using mac_maxsdu_update().
*/
static uint32_t
-iptun_update_mtu(iptun_t *iptun, uint32_t new_pmtu)
+iptun_update_mtu(iptun_t *iptun, ip_xmit_attr_t *ixa, uint32_t new_pmtu)
{
uint32_t newmtu;
+ /* We always update the ixa since we might have set IXAF_VERIFY_PMTU */
+ iptun_update_dst_pmtu(iptun, ixa);
+
/*
* We return the current MTU without updating it if it was pegged to a
* static value using the MAC_PROP_MTU link property.
@@ -1887,8 +2054,7 @@ iptun_update_mtu(iptun_t *iptun, uint32_t new_pmtu)
return (iptun->iptun_mtu);
/* If the MTU isn't fixed, then use the maximum possible value. */
- newmtu = iptun_get_maxmtu(iptun, new_pmtu);
-
+ newmtu = iptun_get_maxmtu(iptun, ixa, new_pmtu);
/*
* We only dynamically adjust the tunnel MTU for tunnels with
* destinations because dynamic MTU calculations are based on the
@@ -1929,7 +2095,7 @@ iptun_build_icmperr(size_t hdrs_size, mblk_t *orig_pkt)
{
mblk_t *icmperr_mp;
- if ((icmperr_mp = allocb_tmpl(hdrs_size, orig_pkt)) != NULL) {
+ if ((icmperr_mp = allocb(hdrs_size, BPRI_MED)) != NULL) {
icmperr_mp->b_wptr += hdrs_size;
/* tack on the offending packet */
icmperr_mp->b_cont = orig_pkt;
@@ -1942,12 +2108,15 @@ iptun_build_icmperr(size_t hdrs_size, mblk_t *orig_pkt)
* the ICMP error.
*/
static void
-iptun_sendicmp_v4(iptun_t *iptun, icmph_t *icmp, ipha_t *orig_ipha, mblk_t *mp)
+iptun_sendicmp_v4(iptun_t *iptun, icmph_t *icmp, ipha_t *orig_ipha, mblk_t *mp,
+ ts_label_t *tsl)
{
size_t orig_pktsize, hdrs_size;
mblk_t *icmperr_mp;
ipha_t *new_ipha;
icmph_t *new_icmp;
+ ip_xmit_attr_t ixas;
+ conn_t *connp = iptun->iptun_connp;
orig_pktsize = msgdsize(mp);
hdrs_size = sizeof (ipha_t) + sizeof (icmph_t);
@@ -1974,17 +2143,35 @@ iptun_sendicmp_v4(iptun_t *iptun, icmph_t *icmp, ipha_t *orig_ipha, mblk_t *mp)
new_icmp->icmph_checksum = 0;
new_icmp->icmph_checksum = IP_CSUM(icmperr_mp, sizeof (ipha_t), 0);
- ip_output(iptun->iptun_connp, icmperr_mp, iptun->iptun_connp->conn_wq,
- IP_WPUT);
+ bzero(&ixas, sizeof (ixas));
+ ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
+ if (new_ipha->ipha_src == INADDR_ANY)
+ ixas.ixa_flags |= IXAF_SET_SOURCE;
+
+ ixas.ixa_zoneid = IPCL_ZONEID(connp);
+ ixas.ixa_ipst = connp->conn_netstack->netstack_ip;
+ ixas.ixa_cred = connp->conn_cred;
+ ixas.ixa_cpid = NOPID;
+ if (is_system_labeled())
+ ixas.ixa_tsl = tsl;
+
+ ixas.ixa_ifindex = 0;
+ ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
+
+ (void) ip_output_simple(icmperr_mp, &ixas);
+ ixa_cleanup(&ixas);
}
static void
-iptun_sendicmp_v6(iptun_t *iptun, icmp6_t *icmp6, ip6_t *orig_ip6h, mblk_t *mp)
+iptun_sendicmp_v6(iptun_t *iptun, icmp6_t *icmp6, ip6_t *orig_ip6h, mblk_t *mp,
+ ts_label_t *tsl)
{
size_t orig_pktsize, hdrs_size;
mblk_t *icmp6err_mp;
ip6_t *new_ip6h;
icmp6_t *new_icmp6;
+ ip_xmit_attr_t ixas;
+ conn_t *connp = iptun->iptun_connp;
orig_pktsize = msgdsize(mp);
hdrs_size = sizeof (ip6_t) + sizeof (icmp6_t);
@@ -2004,16 +2191,31 @@ iptun_sendicmp_v6(iptun_t *iptun, icmp6_t *icmp6, ip6_t *orig_ip6h, mblk_t *mp)
new_ip6h->ip6_dst = orig_ip6h->ip6_src;
*new_icmp6 = *icmp6;
- /* The checksum is calculated in ip_wput_ire_v6(). */
+ /* The checksum is calculated in ip_output_simple and friends. */
new_icmp6->icmp6_cksum = new_ip6h->ip6_plen;
- ip_output_v6(iptun->iptun_connp, icmp6err_mp,
- iptun->iptun_connp->conn_wq, IP_WPUT);
+ bzero(&ixas, sizeof (ixas));
+ ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6;
+ if (IN6_IS_ADDR_UNSPECIFIED(&new_ip6h->ip6_src))
+ ixas.ixa_flags |= IXAF_SET_SOURCE;
+
+ ixas.ixa_zoneid = IPCL_ZONEID(connp);
+ ixas.ixa_ipst = connp->conn_netstack->netstack_ip;
+ ixas.ixa_cred = connp->conn_cred;
+ ixas.ixa_cpid = NOPID;
+ if (is_system_labeled())
+ ixas.ixa_tsl = tsl;
+
+ ixas.ixa_ifindex = 0;
+ ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
+
+ (void) ip_output_simple(icmp6err_mp, &ixas);
+ ixa_cleanup(&ixas);
}
static void
iptun_icmp_error_v4(iptun_t *iptun, ipha_t *orig_ipha, mblk_t *mp,
- uint8_t type, uint8_t code)
+ uint8_t type, uint8_t code, ts_label_t *tsl)
{
icmph_t icmp;
@@ -2021,12 +2223,12 @@ iptun_icmp_error_v4(iptun_t *iptun, ipha_t *orig_ipha, mblk_t *mp,
icmp.icmph_type = type;
icmp.icmph_code = code;
- iptun_sendicmp_v4(iptun, &icmp, orig_ipha, mp);
+ iptun_sendicmp_v4(iptun, &icmp, orig_ipha, mp, tsl);
}
static void
iptun_icmp_fragneeded_v4(iptun_t *iptun, uint32_t newmtu, ipha_t *orig_ipha,
- mblk_t *mp)
+ mblk_t *mp, ts_label_t *tsl)
{
icmph_t icmp;
@@ -2035,12 +2237,12 @@ iptun_icmp_fragneeded_v4(iptun_t *iptun, uint32_t newmtu, ipha_t *orig_ipha,
icmp.icmph_du_zero = 0;
icmp.icmph_du_mtu = htons(newmtu);
- iptun_sendicmp_v4(iptun, &icmp, orig_ipha, mp);
+ iptun_sendicmp_v4(iptun, &icmp, orig_ipha, mp, tsl);
}
static void
iptun_icmp_error_v6(iptun_t *iptun, ip6_t *orig_ip6h, mblk_t *mp,
- uint8_t type, uint8_t code, uint32_t offset)
+ uint8_t type, uint8_t code, uint32_t offset, ts_label_t *tsl)
{
icmp6_t icmp6;
@@ -2050,12 +2252,12 @@ iptun_icmp_error_v6(iptun_t *iptun, ip6_t *orig_ip6h, mblk_t *mp,
if (type == ICMP6_PARAM_PROB)
icmp6.icmp6_pptr = htonl(offset);
- iptun_sendicmp_v6(iptun, &icmp6, orig_ip6h, mp);
+ iptun_sendicmp_v6(iptun, &icmp6, orig_ip6h, mp, tsl);
}
static void
iptun_icmp_toobig_v6(iptun_t *iptun, uint32_t newmtu, ip6_t *orig_ip6h,
- mblk_t *mp)
+ mblk_t *mp, ts_label_t *tsl)
{
icmp6_t icmp6;
@@ -2063,7 +2265,7 @@ iptun_icmp_toobig_v6(iptun_t *iptun, uint32_t newmtu, ip6_t *orig_ip6h,
icmp6.icmp6_code = 0;
icmp6.icmp6_mtu = htonl(newmtu);
- iptun_sendicmp_v6(iptun, &icmp6, orig_ip6h, mp);
+ iptun_sendicmp_v6(iptun, &icmp6, orig_ip6h, mp, tsl);
}
/*
@@ -2105,13 +2307,15 @@ is_icmp_error(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h)
/*
* Find inner and outer IP headers from a tunneled packet as setup for calls
* into ipsec_tun_{in,out}bound().
+ * Note that we need to allow the outer header to be in a separate mblk from
+ * the inner header.
+ * If the caller knows the outer_hlen, the caller passes it in. Otherwise zero.
*/
static size_t
-iptun_find_headers(mblk_t *mp, ipha_t **outer4, ipha_t **inner4, ip6_t **outer6,
- ip6_t **inner6)
+iptun_find_headers(mblk_t *mp, size_t outer_hlen, ipha_t **outer4,
+ ipha_t **inner4, ip6_t **outer6, ip6_t **inner6)
{
ipha_t *ipha;
- size_t outer_hlen;
size_t first_mblkl = MBLKL(mp);
mblk_t *inner_mp;
@@ -2128,12 +2332,14 @@ iptun_find_headers(mblk_t *mp, ipha_t **outer4, ipha_t **inner4, ip6_t **outer6,
case IPV4_VERSION:
*outer4 = ipha;
*outer6 = NULL;
- outer_hlen = IPH_HDR_LENGTH(ipha);
+ if (outer_hlen == 0)
+ outer_hlen = IPH_HDR_LENGTH(ipha);
break;
case IPV6_VERSION:
*outer4 = NULL;
*outer6 = (ip6_t *)ipha;
- outer_hlen = ip_hdr_length_v6(mp, (ip6_t *)ipha);
+ if (outer_hlen == 0)
+ outer_hlen = ip_hdr_length_v6(mp, (ip6_t *)ipha);
break;
default:
return (0);
@@ -2192,8 +2398,8 @@ iptun_find_headers(mblk_t *mp, ipha_t **outer4, ipha_t **inner4, ip6_t **outer6,
* whatever the very-inner packet is (IPv4(2) or IPv6).
*/
static void
-iptun_input_icmp_v4(iptun_t *iptun, mblk_t *ipsec_mp, mblk_t *data_mp,
- icmph_t *icmph)
+iptun_input_icmp_v4(iptun_t *iptun, mblk_t *data_mp, icmph_t *icmph,
+ ip_recv_attr_t *ira)
{
uint8_t *orig;
ipha_t *outer4, *inner4;
@@ -2201,12 +2407,6 @@ iptun_input_icmp_v4(iptun_t *iptun, mblk_t *ipsec_mp, mblk_t *data_mp,
int outer_hlen;
uint8_t type, code;
- /*
- * Change the db_type to M_DATA because subsequent operations assume
- * the ICMP packet is M_DATA again (i.e. calls to msgdsize()).
- */
- data_mp->b_datap->db_type = M_DATA;
-
ASSERT(data_mp->b_cont == NULL);
/*
* Temporarily move b_rptr forward so that iptun_find_headers() can
@@ -2220,13 +2420,12 @@ iptun_input_icmp_v4(iptun_t *iptun, mblk_t *ipsec_mp, mblk_t *data_mp,
* here).
*/
ASSERT(MBLKL(data_mp) >= 0);
- outer_hlen = iptun_find_headers(data_mp, &outer4, &inner4, &outer6,
+ outer_hlen = iptun_find_headers(data_mp, 0, &outer4, &inner4, &outer6,
&inner6);
ASSERT(outer6 == NULL);
data_mp->b_rptr = orig;
if (outer_hlen == 0) {
- iptun_drop_pkt((ipsec_mp != NULL ? ipsec_mp : data_mp),
- &iptun->iptun_ierrors);
+ iptun_drop_pkt(data_mp, &iptun->iptun_ierrors);
return;
}
@@ -2234,10 +2433,9 @@ iptun_input_icmp_v4(iptun_t *iptun, mblk_t *ipsec_mp, mblk_t *data_mp,
ASSERT(outer4->ipha_protocol == IPPROTO_ENCAP ||
outer4->ipha_protocol == IPPROTO_IPV6);
- /* ipsec_tun_inbound() always frees ipsec_mp. */
- if (!ipsec_tun_inbound(ipsec_mp, &data_mp, iptun->iptun_itp,
- inner4, inner6, outer4, outer6, -outer_hlen,
- iptun->iptun_ns)) {
+ data_mp = ipsec_tun_inbound(ira, data_mp, iptun->iptun_itp,
+ inner4, inner6, outer4, outer6, -outer_hlen, iptun->iptun_ns);
+ if (data_mp == NULL) {
/* Callee did all of the freeing. */
atomic_inc_64(&iptun->iptun_ierrors);
return;
@@ -2269,15 +2467,15 @@ iptun_input_icmp_v4(iptun_t *iptun, mblk_t *ipsec_mp, mblk_t *data_mp,
* also have IPsec policy by letting iptun_update_mtu
* take care of it.
*/
- newmtu =
- iptun_update_mtu(iptun, ntohs(icmph->icmph_du_mtu));
+ newmtu = iptun_update_mtu(iptun, NULL,
+ ntohs(icmph->icmph_du_mtu));
if (inner4 != NULL) {
iptun_icmp_fragneeded_v4(iptun, newmtu, inner4,
- data_mp);
+ data_mp, ira->ira_tsl);
} else {
iptun_icmp_toobig_v6(iptun, newmtu, inner6,
- data_mp);
+ data_mp, ira->ira_tsl);
}
return;
}
@@ -2310,10 +2508,13 @@ iptun_input_icmp_v4(iptun_t *iptun, mblk_t *ipsec_mp, mblk_t *data_mp,
return;
}
- if (inner4 != NULL)
- iptun_icmp_error_v4(iptun, inner4, data_mp, type, code);
- else
- iptun_icmp_error_v6(iptun, inner6, data_mp, type, code, 0);
+ if (inner4 != NULL) {
+ iptun_icmp_error_v4(iptun, inner4, data_mp, type, code,
+ ira->ira_tsl);
+ } else {
+ iptun_icmp_error_v6(iptun, inner6, data_mp, type, code, 0,
+ ira->ira_tsl);
+ }
}
/*
@@ -2324,17 +2525,17 @@ iptun_input_icmp_v4(iptun_t *iptun, mblk_t *ipsec_mp, mblk_t *data_mp,
static boolean_t
iptun_find_encaplimit(mblk_t *mp, ip6_t *ip6h, uint8_t **encaplim_ptr)
{
- ip6_pkt_t pkt;
+ ip_pkt_t pkt;
uint8_t *endptr;
ip6_dest_t *destp;
struct ip6_opt *optp;
pkt.ipp_fields = 0; /* must be initialized */
- (void) ip_find_hdr_v6(mp, ip6h, &pkt, NULL);
+ (void) ip_find_hdr_v6(mp, ip6h, B_FALSE, &pkt, NULL);
if ((pkt.ipp_fields & IPPF_DSTOPTS) != 0) {
destp = pkt.ipp_dstopts;
- } else if ((pkt.ipp_fields & IPPF_RTDSTOPTS) != 0) {
- destp = pkt.ipp_rtdstopts;
+ } else if ((pkt.ipp_fields & IPPF_RTHDRDSTOPTS) != 0) {
+ destp = pkt.ipp_rthdrdstopts;
} else {
return (B_FALSE);
}
@@ -2370,8 +2571,8 @@ iptun_find_encaplimit(mblk_t *mp, ip6_t *ip6h, uint8_t **encaplim_ptr)
* whatever the very-inner packet is (IPv4 or IPv6(2)).
*/
static void
-iptun_input_icmp_v6(iptun_t *iptun, mblk_t *ipsec_mp, mblk_t *data_mp,
- icmp6_t *icmp6h)
+iptun_input_icmp_v6(iptun_t *iptun, mblk_t *data_mp, icmp6_t *icmp6h,
+ ip_recv_attr_t *ira)
{
uint8_t *orig;
ipha_t *outer4, *inner4;
@@ -2379,12 +2580,6 @@ iptun_input_icmp_v6(iptun_t *iptun, mblk_t *ipsec_mp, mblk_t *data_mp,
int outer_hlen;
uint8_t type, code;
- /*
- * Change the db_type to M_DATA because subsequent operations assume
- * the ICMP packet is M_DATA again (i.e. calls to msgdsize().)
- */
- data_mp->b_datap->db_type = M_DATA;
-
ASSERT(data_mp->b_cont == NULL);
/*
@@ -2399,19 +2594,18 @@ iptun_input_icmp_v6(iptun_t *iptun, mblk_t *ipsec_mp, mblk_t *data_mp,
* here).
*/
ASSERT(MBLKL(data_mp) >= 0);
- outer_hlen = iptun_find_headers(data_mp, &outer4, &inner4, &outer6,
+ outer_hlen = iptun_find_headers(data_mp, 0, &outer4, &inner4, &outer6,
&inner6);
ASSERT(outer4 == NULL);
data_mp->b_rptr = orig; /* Restore r_ptr */
if (outer_hlen == 0) {
- iptun_drop_pkt((ipsec_mp != NULL ? ipsec_mp : data_mp),
- &iptun->iptun_ierrors);
+ iptun_drop_pkt(data_mp, &iptun->iptun_ierrors);
return;
}
- if (!ipsec_tun_inbound(ipsec_mp, &data_mp, iptun->iptun_itp,
- inner4, inner6, outer4, outer6, -outer_hlen,
- iptun->iptun_ns)) {
+ data_mp = ipsec_tun_inbound(ira, data_mp, iptun->iptun_itp,
+ inner4, inner6, outer4, outer6, -outer_hlen, iptun->iptun_ns);
+ if (data_mp == NULL) {
/* Callee did all of the freeing. */
atomic_inc_64(&iptun->iptun_ierrors);
return;
@@ -2466,13 +2660,15 @@ iptun_input_icmp_v6(iptun_t *iptun, mblk_t *ipsec_mp, mblk_t *data_mp,
* have IPsec policy by letting iptun_update_mtu take care of
* it.
*/
- newmtu = iptun_update_mtu(iptun, ntohl(icmp6h->icmp6_mtu));
+ newmtu = iptun_update_mtu(iptun, NULL,
+ ntohl(icmp6h->icmp6_mtu));
if (inner4 != NULL) {
iptun_icmp_fragneeded_v4(iptun, newmtu, inner4,
- data_mp);
+ data_mp, ira->ira_tsl);
} else {
- iptun_icmp_toobig_v6(iptun, newmtu, inner6, data_mp);
+ iptun_icmp_toobig_v6(iptun, newmtu, inner6, data_mp,
+ ira->ira_tsl);
}
return;
}
@@ -2481,51 +2677,57 @@ iptun_input_icmp_v6(iptun_t *iptun, mblk_t *ipsec_mp, mblk_t *data_mp,
return;
}
- if (inner4 != NULL)
- iptun_icmp_error_v4(iptun, inner4, data_mp, type, code);
- else
- iptun_icmp_error_v6(iptun, inner6, data_mp, type, code, 0);
+ if (inner4 != NULL) {
+ iptun_icmp_error_v4(iptun, inner4, data_mp, type, code,
+ ira->ira_tsl);
+ } else {
+ iptun_icmp_error_v6(iptun, inner6, data_mp, type, code, 0,
+ ira->ira_tsl);
+ }
}
+/*
+ * Called as conn_recvicmp from IP for ICMP errors.
+ */
+/* ARGSUSED2 */
static void
-iptun_input_icmp(iptun_t *iptun, mblk_t *ipsec_mp, mblk_t *data_mp)
+iptun_input_icmp(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
{
- mblk_t *tmpmp;
- size_t hlen;
+ conn_t *connp = arg;
+ iptun_t *iptun = connp->conn_iptun;
+ mblk_t *tmpmp;
+ size_t hlen;
- if (data_mp->b_cont != NULL) {
+ ASSERT(IPCL_IS_IPTUN(connp));
+
+ if (mp->b_cont != NULL) {
/*
* Since ICMP error processing necessitates access to bits
* that are within the ICMP error payload (the original packet
* that caused the error), pull everything up into a single
* block for convenience.
*/
- data_mp->b_datap->db_type = M_DATA;
- if ((tmpmp = msgpullup(data_mp, -1)) == NULL) {
- iptun_drop_pkt((ipsec_mp != NULL ? ipsec_mp : data_mp),
- &iptun->iptun_norcvbuf);
+ if ((tmpmp = msgpullup(mp, -1)) == NULL) {
+ iptun_drop_pkt(mp, &iptun->iptun_norcvbuf);
return;
}
- freemsg(data_mp);
- data_mp = tmpmp;
- if (ipsec_mp != NULL)
- ipsec_mp->b_cont = data_mp;
+ freemsg(mp);
+ mp = tmpmp;
}
+ hlen = ira->ira_ip_hdr_length;
switch (iptun->iptun_typeinfo->iti_ipvers) {
case IPV4_VERSION:
/*
* The outer IP header coming up from IP is always ipha_t
* alligned (otherwise, we would have crashed in ip).
*/
- hlen = IPH_HDR_LENGTH((ipha_t *)data_mp->b_rptr);
- iptun_input_icmp_v4(iptun, ipsec_mp, data_mp,
- (icmph_t *)(data_mp->b_rptr + hlen));
+ iptun_input_icmp_v4(iptun, mp, (icmph_t *)(mp->b_rptr + hlen),
+ ira);
break;
case IPV6_VERSION:
- hlen = ip_hdr_length_v6(data_mp, (ip6_t *)data_mp->b_rptr);
- iptun_input_icmp_v6(iptun, ipsec_mp, data_mp,
- (icmp6_t *)(data_mp->b_rptr + hlen));
+ iptun_input_icmp_v6(iptun, mp, (icmp6_t *)(mp->b_rptr + hlen),
+ ira);
break;
}
}
@@ -2578,63 +2780,24 @@ iptun_in_6to4_ok(iptun_t *iptun, ipha_t *outer4, ip6_t *inner6)
* Input function for everything that comes up from the ip module below us.
* This is called directly from the ip module via connp->conn_recv().
*
- * There are two kinds of packets that can arrive here: (1) IP-in-IP tunneled
- * packets and (2) ICMP errors containing IP-in-IP packets transmitted by us.
- * They have the following structure:
- *
- * 1) M_DATA
- * 2) M_CTL[->M_DATA]
- *
- * (2) Is an M_CTL optionally followed by M_DATA, where the M_CTL block is the
- * start of the actual ICMP packet (it doesn't contain any special control
- * information).
- *
- * Either (1) or (2) can be IPsec-protected, in which case an M_CTL block
- * containing an ipsec_in_t will have been prepended to either (1) or (2),
- * making a total of four combinations of possible mblk chains:
- *
- * A) (1)
- * B) (2)
- * C) M_CTL(ipsec_in_t)->(1)
- * D) M_CTL(ipsec_in_t)->(2)
+ * We receive M_DATA messages with IP-in-IP tunneled packets.
*/
-/* ARGSUSED */
+/* ARGSUSED2 */
static void
-iptun_input(void *arg, mblk_t *mp, void *arg2)
+iptun_input(void *arg, mblk_t *data_mp, void *arg2, ip_recv_attr_t *ira)
{
conn_t *connp = arg;
iptun_t *iptun = connp->conn_iptun;
int outer_hlen;
ipha_t *outer4, *inner4;
ip6_t *outer6, *inner6;
- mblk_t *data_mp = mp;
ASSERT(IPCL_IS_IPTUN(connp));
- ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_CTL);
-
- if (DB_TYPE(mp) == M_CTL) {
- if (((ipsec_in_t *)(mp->b_rptr))->ipsec_in_type != IPSEC_IN) {
- iptun_input_icmp(iptun, NULL, mp);
- return;
- }
-
- data_mp = mp->b_cont;
- if (DB_TYPE(data_mp) == M_CTL) {
- /* Protected ICMP packet. */
- iptun_input_icmp(iptun, mp, data_mp);
- return;
- }
- }
-
- /*
- * Request the destination's path MTU information regularly in case
- * path MTU has increased.
- */
- if (IPTUN_PMTU_TOO_OLD(iptun))
- iptun_task_dispatch(iptun, IPTUN_TASK_PMTU_UPDATE);
+ ASSERT(DB_TYPE(data_mp) == M_DATA);
- if ((outer_hlen = iptun_find_headers(data_mp, &outer4, &inner4, &outer6,
- &inner6)) == 0)
+ outer_hlen = iptun_find_headers(data_mp, ira->ira_ip_hdr_length,
+ &outer4, &inner4, &outer6, &inner6);
+ if (outer_hlen == 0)
goto drop;
/*
@@ -2644,25 +2807,22 @@ iptun_input(void *arg, mblk_t *mp, void *arg2)
* the more involved tsol_receive_local() since the tunnel link itself
* cannot be assigned to shared-stack non-global zones.
*/
- if (is_system_labeled()) {
- cred_t *msg_cred;
-
- if ((msg_cred = msg_getcred(data_mp, NULL)) == NULL)
+ if (ira->ira_flags & IRAF_SYSTEM_LABELED) {
+ if (ira->ira_tsl == NULL)
goto drop;
- if (tsol_check_dest(msg_cred, (outer4 != NULL ?
+ if (tsol_check_dest(ira->ira_tsl, (outer4 != NULL ?
(void *)&outer4->ipha_dst : (void *)&outer6->ip6_dst),
(outer4 != NULL ? IPV4_VERSION : IPV6_VERSION),
- CONN_MAC_DEFAULT, NULL) != 0)
+ CONN_MAC_DEFAULT, B_FALSE, NULL) != 0)
goto drop;
}
- if (!ipsec_tun_inbound((mp == data_mp ? NULL : mp), &data_mp,
- iptun->iptun_itp, inner4, inner6, outer4, outer6, outer_hlen,
- iptun->iptun_ns)) {
+ data_mp = ipsec_tun_inbound(ira, data_mp, iptun->iptun_itp,
+ inner4, inner6, outer4, outer6, outer_hlen, iptun->iptun_ns);
+ if (data_mp == NULL) {
/* Callee did all of the freeing. */
return;
}
- mp = data_mp;
if (iptun->iptun_typeinfo->iti_type == IPTUN_TYPE_6TO4 &&
!iptun_in_6to4_ok(iptun, outer4, inner6))
@@ -2673,6 +2833,8 @@ iptun_input(void *arg, mblk_t *mp, void *arg2)
* we might as well split up any b_next chains here.
*/
do {
+ mblk_t *mp;
+
mp = data_mp->b_next;
data_mp->b_next = NULL;
@@ -2684,7 +2846,7 @@ iptun_input(void *arg, mblk_t *mp, void *arg2)
} while (data_mp != NULL);
return;
drop:
- iptun_drop_pkt(mp, &iptun->iptun_ierrors);
+ iptun_drop_pkt(data_mp, &iptun->iptun_ierrors);
}
/*
@@ -2744,6 +2906,10 @@ iptun_out_process_6to4(iptun_t *iptun, ipha_t *outer4, ip6_t *inner6)
/* destination is a 6to4 router */
IN6_6TO4_TO_V4ADDR(&inner6->ip6_dst,
(struct in_addr *)&outer4->ipha_dst);
+
+ /* Reject attempts to send to INADDR_ANY */
+ if (outer4->ipha_dst == INADDR_ANY)
+ return (B_FALSE);
} else {
/*
* The destination is a native IPv6 address. If output to a
@@ -2770,12 +2936,11 @@ iptun_out_process_6to4(iptun_t *iptun, ipha_t *outer4, ip6_t *inner6)
*/
static mblk_t *
iptun_out_process_ipv4(iptun_t *iptun, mblk_t *mp, ipha_t *outer4,
- ipha_t *inner4, ip6_t *inner6)
+ ipha_t *inner4, ip6_t *inner6, ip_xmit_attr_t *ixa)
{
uint8_t *innerptr = (inner4 != NULL ?
(uint8_t *)inner4 : (uint8_t *)inner6);
- size_t minmtu = (inner4 != NULL ?
- IPTUN_MIN_IPV4_MTU : IPTUN_MIN_IPV6_MTU);
+ size_t minmtu = iptun->iptun_typeinfo->iti_minmtu;
if (inner4 != NULL) {
ASSERT(outer4->ipha_protocol == IPPROTO_ENCAP);
@@ -2791,13 +2956,11 @@ iptun_out_process_ipv4(iptun_t *iptun, mblk_t *mp, ipha_t *outer4,
} else {
ASSERT(outer4->ipha_protocol == IPPROTO_IPV6 &&
inner6 != NULL);
-
- if (iptun->iptun_typeinfo->iti_type == IPTUN_TYPE_6TO4 &&
- !iptun_out_process_6to4(iptun, outer4, inner6)) {
- iptun_drop_pkt(mp, &iptun->iptun_oerrors);
- return (NULL);
- }
}
+ if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF)
+ outer4->ipha_fragment_offset_and_flags |= IPH_DF_HTONS;
+ else
+ outer4->ipha_fragment_offset_and_flags &= ~IPH_DF_HTONS;
/*
* As described in section 3.2.2 of RFC4213, if the packet payload is
@@ -2807,11 +2970,19 @@ iptun_out_process_ipv4(iptun_t *iptun, mblk_t *mp, ipha_t *outer4,
* won't be allowed to drop its MTU as a result, since the packet was
* already smaller than the smallest allowable MTU for that interface.
*/
- if (mp->b_wptr - innerptr <= minmtu)
+ if (mp->b_wptr - innerptr <= minmtu) {
outer4->ipha_fragment_offset_and_flags = 0;
+ ixa->ixa_flags &= ~IXAF_DONTFRAG;
+ } else if (!(ixa->ixa_flags & IXAF_PMTU_TOO_SMALL) &&
+ (iptun->iptun_typeinfo->iti_type != IPTUN_TYPE_6TO4)) {
+ ixa->ixa_flags |= IXAF_DONTFRAG;
+ }
- outer4->ipha_length = htons(msgdsize(mp));
+ ixa->ixa_ip_hdr_length = IPH_HDR_LENGTH(outer4);
+ ixa->ixa_pktlen = msgdsize(mp);
+ ixa->ixa_protocol = outer4->ipha_protocol;
+ outer4->ipha_length = htons(ixa->ixa_pktlen);
return (mp);
}
@@ -2830,7 +3001,7 @@ iptun_insert_encaplimit(iptun_t *iptun, mblk_t *mp, ip6_t *outer6,
ASSERT(mp->b_cont == NULL);
mp->b_rptr += sizeof (ip6_t);
- newmp = allocb_tmpl(sizeof (iptun_ipv6hdrs_t) + MBLKL(mp), mp);
+ newmp = allocb(sizeof (iptun_ipv6hdrs_t) + MBLKL(mp), BPRI_MED);
if (newmp == NULL) {
iptun_drop_pkt(mp, &iptun->iptun_noxmtbuf);
return (NULL);
@@ -2861,8 +3032,12 @@ iptun_insert_encaplimit(iptun_t *iptun, mblk_t *mp, ip6_t *outer6,
* on error.
*/
static mblk_t *
-iptun_out_process_ipv6(iptun_t *iptun, mblk_t *mp, ip6_t *outer6, ip6_t *inner6)
+iptun_out_process_ipv6(iptun_t *iptun, mblk_t *mp, ip6_t *outer6,
+ ipha_t *inner4, ip6_t *inner6, ip_xmit_attr_t *ixa)
{
+ uint8_t *innerptr = (inner4 != NULL ?
+ (uint8_t *)inner4 : (uint8_t *)inner6);
+ size_t minmtu = iptun->iptun_typeinfo->iti_minmtu;
uint8_t *limit, *configlimit;
uint32_t offset;
iptun_ipv6hdrs_t *v6hdrs;
@@ -2887,7 +3062,7 @@ iptun_out_process_ipv6(iptun_t *iptun, mblk_t *mp, ip6_t *outer6, ip6_t *inner6)
mp->b_rptr = (uint8_t *)inner6;
offset = limit - mp->b_rptr;
iptun_icmp_error_v6(iptun, inner6, mp, ICMP6_PARAM_PROB,
- 0, offset);
+ 0, offset, ixa->ixa_tsl);
atomic_inc_64(&iptun->iptun_noxmtbuf);
return (NULL);
}
@@ -2900,6 +3075,7 @@ iptun_out_process_ipv6(iptun_t *iptun, mblk_t *mp, ip6_t *outer6, ip6_t *inner6)
if ((mp = iptun_insert_encaplimit(iptun, mp, outer6,
(*limit - 1))) == NULL)
return (NULL);
+ v6hdrs = (iptun_ipv6hdrs_t *)mp->b_rptr;
} else {
/*
* There is an existing encapsulation limit option in
@@ -2914,9 +3090,23 @@ iptun_out_process_ipv6(iptun_t *iptun, mblk_t *mp, ip6_t *outer6, ip6_t *inner6)
if ((*limit - 1) < *configlimit)
*configlimit = (*limit - 1);
}
+ ixa->ixa_ip_hdr_length = sizeof (iptun_ipv6hdrs_t);
+ ixa->ixa_protocol = v6hdrs->it6h_encaplim.iel_destopt.ip6d_nxt;
+ } else {
+ ixa->ixa_ip_hdr_length = sizeof (ip6_t);
+ ixa->ixa_protocol = outer6->ip6_nxt;
}
+ /*
+ * See iptun_output_process_ipv4() why we allow fragmentation for
+ * small packets
+ */
+ if (mp->b_wptr - innerptr <= minmtu)
+ ixa->ixa_flags &= ~IXAF_DONTFRAG;
+ else if (!(ixa->ixa_flags & IXAF_PMTU_TOO_SMALL))
+ ixa->ixa_flags |= IXAF_DONTFRAG;
- outer6->ip6_plen = htons(msgdsize(mp) - sizeof (ip6_t));
+ ixa->ixa_pktlen = msgdsize(mp);
+ outer6->ip6_plen = htons(ixa->ixa_pktlen - sizeof (ip6_t));
return (mp);
}
@@ -2929,11 +3119,9 @@ static void
iptun_output(iptun_t *iptun, mblk_t *mp)
{
conn_t *connp = iptun->iptun_connp;
- int outer_hlen;
mblk_t *newmp;
- ipha_t *outer4, *inner4;
- ip6_t *outer6, *inner6;
- ipsec_tun_pol_t *itp = iptun->iptun_itp;
+ int error;
+ ip_xmit_attr_t *ixa;
ASSERT(mp->b_datap->db_type == M_DATA);
@@ -2946,17 +3134,262 @@ iptun_output(iptun_t *iptun, mblk_t *mp)
mp = newmp;
}
- outer_hlen = iptun_find_headers(mp, &outer4, &inner4, &outer6, &inner6);
+ if (iptun->iptun_typeinfo->iti_type == IPTUN_TYPE_6TO4) {
+ iptun_output_6to4(iptun, mp);
+ return;
+ }
+
+ if (is_system_labeled()) {
+ /*
+ * Since the label can be different meaning a potentially
+ * different IRE,we always use a unique ip_xmit_attr_t.
+ */
+ ixa = conn_get_ixa_exclusive(connp);
+ } else {
+ /*
+ * If no other thread is using conn_ixa this just gets a
+ * reference to conn_ixa. Otherwise we get a safe copy of
+ * conn_ixa.
+ */
+ ixa = conn_get_ixa(connp, B_FALSE);
+ }
+ if (ixa == NULL) {
+ iptun_drop_pkt(mp, &iptun->iptun_oerrors);
+ return;
+ }
+
+ /*
+ * In case we got a safe copy of conn_ixa, then we need
+ * to fill in any pointers in it.
+ */
+ if (ixa->ixa_ire == NULL) {
+ error = ip_attr_connect(connp, ixa, &connp->conn_saddr_v6,
+ &connp->conn_faddr_v6, &connp->conn_faddr_v6, 0,
+ NULL, NULL, 0);
+ if (error != 0) {
+ if (ixa->ixa_ire != NULL &&
+ (error == EHOSTUNREACH || error == ENETUNREACH)) {
+ /*
+ * Let conn_ip_output/ire_send_noroute return
+ * the error and send any local ICMP error.
+ */
+ error = 0;
+ } else {
+ ixa_refrele(ixa);
+ iptun_drop_pkt(mp, &iptun->iptun_oerrors);
+ return;
+ }
+ }
+ }
+
+ iptun_output_common(iptun, ixa, mp);
+ ixa_refrele(ixa);
+}
+
+/*
+ * We use an ixa based on the last destination.
+ */
+static void
+iptun_output_6to4(iptun_t *iptun, mblk_t *mp)
+{
+ conn_t *connp = iptun->iptun_connp;
+ ipha_t *outer4, *inner4;
+ ip6_t *outer6, *inner6;
+ ip_xmit_attr_t *ixa;
+ ip_xmit_attr_t *oldixa;
+ int error;
+ boolean_t need_connect;
+ in6_addr_t v6dst;
+
+ ASSERT(mp->b_cont == NULL); /* Verified by iptun_output */
+
+ /* Make sure we set ipha_dst before we look at ipha_dst */
+
+ (void) iptun_find_headers(mp, 0, &outer4, &inner4, &outer6, &inner6);
+ ASSERT(outer4 != NULL);
+ if (!iptun_out_process_6to4(iptun, outer4, inner6)) {
+ iptun_drop_pkt(mp, &iptun->iptun_oerrors);
+ return;
+ }
+
+ if (is_system_labeled()) {
+ /*
+ * Since the label can be different meaning a potentially
+ * different IRE,we always use a unique ip_xmit_attr_t.
+ */
+ ixa = conn_get_ixa_exclusive(connp);
+ } else {
+ /*
+ * If no other thread is using conn_ixa this just gets a
+ * reference to conn_ixa. Otherwise we get a safe copy of
+ * conn_ixa.
+ */
+ ixa = conn_get_ixa(connp, B_FALSE);
+ }
+ if (ixa == NULL) {
+ iptun_drop_pkt(mp, &iptun->iptun_oerrors);
+ return;
+ }
+
+ mutex_enter(&connp->conn_lock);
+ if (connp->conn_v4lastdst == outer4->ipha_dst) {
+ need_connect = (ixa->ixa_ire == NULL);
+ } else {
+ /* In case previous destination was multirt */
+ ip_attr_newdst(ixa);
+
+ /*
+ * We later update conn_ixa when we update conn_v4lastdst
+ * which enables subsequent packets to avoid redoing
+ * ip_attr_connect
+ */
+ need_connect = B_TRUE;
+ }
+ mutex_exit(&connp->conn_lock);
+
+ /*
+ * In case we got a safe copy of conn_ixa, or otherwise we don't
+ * have a current ixa_ire, then we need to fill in any pointers in
+ * the ixa.
+ */
+ if (need_connect) {
+ IN6_IPADDR_TO_V4MAPPED(outer4->ipha_dst, &v6dst);
+
+ /* We handle IPsec in iptun_output_common */
+ error = ip_attr_connect(connp, ixa, &connp->conn_saddr_v6,
+ &v6dst, &v6dst, 0, NULL, NULL, 0);
+ if (error != 0) {
+ if (ixa->ixa_ire != NULL &&
+ (error == EHOSTUNREACH || error == ENETUNREACH)) {
+ /*
+ * Let conn_ip_output/ire_send_noroute return
+ * the error and send any local ICMP error.
+ */
+ error = 0;
+ } else {
+ ixa_refrele(ixa);
+ iptun_drop_pkt(mp, &iptun->iptun_oerrors);
+ return;
+ }
+ }
+ }
+
+ iptun_output_common(iptun, ixa, mp);
+
+ /* Atomically replace conn_ixa and conn_v4lastdst */
+ mutex_enter(&connp->conn_lock);
+ if (connp->conn_v4lastdst != outer4->ipha_dst) {
+ /* Remember the dst which corresponds to conn_ixa */
+ connp->conn_v6lastdst = v6dst;
+ oldixa = conn_replace_ixa(connp, ixa);
+ } else {
+ oldixa = NULL;
+ }
+ mutex_exit(&connp->conn_lock);
+ ixa_refrele(ixa);
+ if (oldixa != NULL)
+ ixa_refrele(oldixa);
+}
+
+/*
+ * Check the destination/label. Modifies *mpp by adding/removing CIPSO.
+ *
+ * We get the label from the message in order to honor the
+ * ULPs/IPs choice of label. This will be NULL for forwarded
+ * packets, neighbor discovery packets and some others.
+ */
+static int
+iptun_output_check_label(mblk_t **mpp, ip_xmit_attr_t *ixa)
+{
+ cred_t *cr;
+ int adjust;
+ int iplen;
+ int err;
+ ts_label_t *effective_tsl = NULL;
+
+
+ ASSERT(is_system_labeled());
+
+ cr = msg_getcred(*mpp, NULL);
+ if (cr == NULL)
+ return (0);
+
+ /*
+ * We need to start with a label based on the IP/ULP above us
+ */
+ ip_xmit_attr_restore_tsl(ixa, cr);
+
+ /*
+ * Need to update packet with any CIPSO option since
+ * conn_ip_output doesn't do that.
+ */
+ if (ixa->ixa_flags & IXAF_IS_IPV4) {
+ ipha_t *ipha;
+
+ ipha = (ipha_t *)(*mpp)->b_rptr;
+ iplen = ntohs(ipha->ipha_length);
+ err = tsol_check_label_v4(ixa->ixa_tsl,
+ ixa->ixa_zoneid, mpp, CONN_MAC_DEFAULT, B_FALSE,
+ ixa->ixa_ipst, &effective_tsl);
+ if (err != 0)
+ return (err);
+
+ ipha = (ipha_t *)(*mpp)->b_rptr;
+ adjust = (int)ntohs(ipha->ipha_length) - iplen;
+ } else {
+ ip6_t *ip6h;
+
+ ip6h = (ip6_t *)(*mpp)->b_rptr;
+ iplen = ntohs(ip6h->ip6_plen);
+
+ err = tsol_check_label_v6(ixa->ixa_tsl,
+ ixa->ixa_zoneid, mpp, CONN_MAC_DEFAULT, B_FALSE,
+ ixa->ixa_ipst, &effective_tsl);
+ if (err != 0)
+ return (err);
+
+ ip6h = (ip6_t *)(*mpp)->b_rptr;
+ adjust = (int)ntohs(ip6h->ip6_plen) - iplen;
+ }
+
+ if (effective_tsl != NULL) {
+ /* Update the label */
+ ip_xmit_attr_replace_tsl(ixa, effective_tsl);
+ }
+ ixa->ixa_pktlen += adjust;
+ ixa->ixa_ip_hdr_length += adjust;
+ return (0);
+}
+
+
+static void
+iptun_output_common(iptun_t *iptun, ip_xmit_attr_t *ixa, mblk_t *mp)
+{
+ ipsec_tun_pol_t *itp = iptun->iptun_itp;
+ int outer_hlen;
+ mblk_t *newmp;
+ ipha_t *outer4, *inner4;
+ ip6_t *outer6, *inner6;
+ int error;
+ boolean_t update_pktlen;
+
+ ASSERT(ixa->ixa_ire != NULL);
+
+ outer_hlen = iptun_find_headers(mp, 0, &outer4, &inner4, &outer6,
+ &inner6);
if (outer_hlen == 0) {
iptun_drop_pkt(mp, &iptun->iptun_oerrors);
return;
}
/* Perform header processing. */
- if (outer4 != NULL)
- mp = iptun_out_process_ipv4(iptun, mp, outer4, inner4, inner6);
- else
- mp = iptun_out_process_ipv6(iptun, mp, outer6, inner6);
+ if (outer4 != NULL) {
+ mp = iptun_out_process_ipv4(iptun, mp, outer4, inner4, inner6,
+ ixa);
+ } else {
+ mp = iptun_out_process_ipv6(iptun, mp, outer6, inner4, inner6,
+ ixa);
+ }
if (mp == NULL)
return;
@@ -2964,27 +3397,57 @@ iptun_output(iptun_t *iptun, mblk_t *mp)
* Let's hope the compiler optimizes this with "branch taken".
*/
if (itp != NULL && (itp->itp_flags & ITPF_P_ACTIVE)) {
- if ((mp = ipsec_tun_outbound(mp, iptun, inner4, inner6, outer4,
- outer6, outer_hlen)) == NULL) {
- /* ipsec_tun_outbound() frees mp on error. */
+ /* This updates the ip_xmit_attr_t */
+ mp = ipsec_tun_outbound(mp, iptun, inner4, inner6, outer4,
+ outer6, outer_hlen, ixa);
+ if (mp == NULL) {
atomic_inc_64(&iptun->iptun_oerrors);
return;
}
+ if (is_system_labeled()) {
+ /*
+ * Might change the packet by adding/removing CIPSO.
+ * After this caller inner* and outer* and outer_hlen
+ * might be invalid.
+ */
+ error = iptun_output_check_label(&mp, ixa);
+ if (error != 0) {
+ ip2dbg(("label check failed (%d)\n", error));
+ iptun_drop_pkt(mp, &iptun->iptun_oerrors);
+ return;
+ }
+ }
+
/*
* ipsec_tun_outbound() returns a chain of tunneled IP
* fragments linked with b_next (or a single message if the
- * tunneled packet wasn't a fragment). Each message in the
- * chain is prepended by an IPSEC_OUT M_CTL block with
+ * tunneled packet wasn't a fragment).
+ * If fragcache returned a list then we need to update
+ * ixa_pktlen for all packets in the list.
+ */
+ update_pktlen = (mp->b_next != NULL);
+
+ /*
+ * Otherwise, we're good to go. The ixa has been updated with
* instructions for outbound IPsec processing.
*/
for (newmp = mp; newmp != NULL; newmp = mp) {
- ASSERT(newmp->b_datap->db_type == M_CTL);
atomic_inc_64(&iptun->iptun_opackets);
- atomic_add_64(&iptun->iptun_obytes,
- msgdsize(newmp->b_cont));
+ atomic_add_64(&iptun->iptun_obytes, ixa->ixa_pktlen);
mp = mp->b_next;
newmp->b_next = NULL;
- connp->conn_send(connp, newmp, connp->conn_wq, IP_WPUT);
+
+ if (update_pktlen)
+ ixa->ixa_pktlen = msgdsize(mp);
+
+ atomic_inc_64(&iptun->iptun_opackets);
+ atomic_add_64(&iptun->iptun_obytes, ixa->ixa_pktlen);
+
+ error = conn_ip_output(newmp, ixa);
+ if (error == EMSGSIZE) {
+ /* IPsec policy might have changed */
+ (void) iptun_update_mtu(iptun, ixa, 0);
+ }
}
} else {
/*
@@ -2992,30 +3455,37 @@ iptun_output(iptun_t *iptun, mblk_t *mp)
* packet in its output path if there's no active tunnel
* policy.
*/
- atomic_inc_64(&iptun->iptun_opackets);
- atomic_add_64(&iptun->iptun_obytes, msgdsize(mp));
- connp->conn_send(connp, mp, connp->conn_wq, IP_WPUT);
- }
-}
+ ASSERT(ixa->ixa_ipsec_policy == NULL);
+ mp = ip_output_attach_policy(mp, outer4, outer6, NULL, ixa);
+ if (mp == NULL) {
+ atomic_inc_64(&iptun->iptun_oerrors);
+ return;
+ }
+ if (is_system_labeled()) {
+ /*
+ * Might change the packet by adding/removing CIPSO.
+ * After this caller inner* and outer* and outer_hlen
+ * might be invalid.
+ */
+ error = iptun_output_check_label(&mp, ixa);
+ if (error != 0) {
+ ip2dbg(("label check failed (%d)\n", error));
+ iptun_drop_pkt(mp, &iptun->iptun_oerrors);
+ return;
+ }
+ }
-/*
- * Note that the setting or clearing iptun_{set,get}_g_q() is serialized via
- * iptuns_lock and iptunq_open(), so we must never be in a situation where
- * iptun_set_g_q() is called if the queue has already been set or vice versa
- * (hence the ASSERT()s.)
- */
-void
-iptun_set_g_q(netstack_t *ns, queue_t *q)
-{
- ASSERT(ns->netstack_iptun->iptuns_g_q == NULL);
- ns->netstack_iptun->iptuns_g_q = q;
-}
+ atomic_inc_64(&iptun->iptun_opackets);
+ atomic_add_64(&iptun->iptun_obytes, ixa->ixa_pktlen);
-void
-iptun_clear_g_q(netstack_t *ns)
-{
- ASSERT(ns->netstack_iptun->iptuns_g_q != NULL);
- ns->netstack_iptun->iptuns_g_q = NULL;
+ error = conn_ip_output(mp, ixa);
+ if (error == EMSGSIZE) {
+ /* IPsec policy might have changed */
+ (void) iptun_update_mtu(iptun, ixa, 0);
+ }
+ }
+ if (ixa->ixa_flags & IXAF_IPSEC_SECURE)
+ ipsec_out_release_refs(ixa);
}
static mac_callbacks_t iptun_m_callbacks = {
diff --git a/usr/src/uts/common/inet/iptun/iptun_dev.c b/usr/src/uts/common/inet/iptun/iptun_dev.c
index 52218bdc18..5043063690 100644
--- a/usr/src/uts/common/inet/iptun/iptun_dev.c
+++ b/usr/src/uts/common/inet/iptun/iptun_dev.c
@@ -91,11 +91,9 @@ iptun_stack_shutdown(netstackid_t stackid, void *arg)
/* note that iptun_delete() removes iptun from the list */
while ((iptun = list_head(&iptuns->iptuns_iptunlist)) != NULL) {
linkid = iptun->iptun_linkid;
- (void) iptun_delete(linkid, iptun->iptun_cred);
+ (void) iptun_delete(linkid, iptun->iptun_connp->conn_cred);
(void) dls_mgmt_destroy(linkid, B_FALSE);
}
- if (iptuns->iptuns_g_q != NULL)
- (void) ldi_close(iptuns->iptuns_g_q_lh, FWRITE|FREAD, CRED());
}
/*
diff --git a/usr/src/uts/common/inet/iptun/iptun_impl.h b/usr/src/uts/common/inet/iptun/iptun_impl.h
index 593adb7d9c..07e168a423 100644
--- a/usr/src/uts/common/inet/iptun/iptun_impl.h
+++ b/usr/src/uts/common/inet/iptun/iptun_impl.h
@@ -80,7 +80,6 @@ typedef struct iptun_typeinfo {
iptun_type_t iti_type;
const char *iti_ident; /* MAC-Type plugin identifier */
uint_t iti_ipvers; /* outer header IP version */
- edesc_spf iti_txfunc; /* function used to transmit to ip */
uint32_t iti_minmtu; /* minimum possible tunnel MTU */
uint32_t iti_maxmtu; /* maximum possible tunnel MTU */
boolean_t iti_hasraddr; /* has a remote adress */
@@ -95,13 +94,6 @@ typedef struct iptun_typeinfo {
*
* The datapath reads certain fields without locks for performance reasons.
*
- * - IPTUN_PMTU_TOO_OLD() is used without a lock to determine if the
- * destination path-MTU should be queried. This reads iptun_flags
- * IPTUN_RADDR, IPTUN_FIXED_MTU, and iptun_dpmtu_lastupdate. All of these
- * can change without adversely affecting the tunnel, as the worst case
- * scenario is that we launch a task that will ultimately either do nothing
- * or needlessly query the destination path-MTU.
- *
* - IPTUN_IS_RUNNING() is used (read access to iptun_flags IPTUN_BOUND and
* IPTUN_MAC_STARTED) to drop packets if they're sent while the tunnel is
* not running. This is harmless as the worst case scenario is that a
@@ -119,12 +111,10 @@ typedef struct iptun_s {
conn_t *iptun_connp;
zoneid_t iptun_zoneid;
netstack_t *iptun_ns;
- cred_t *iptun_cred;
struct ipsec_tun_pol_s *iptun_itp;
iptun_typeinfo_t *iptun_typeinfo;
uint32_t iptun_mtu;
uint32_t iptun_dpmtu; /* destination path MTU */
- clock_t iptun_dpmtu_lastupdate;
uint8_t iptun_hoplimit;
uint8_t iptun_encaplimit;
iptun_addr_t iptun_laddr; /* local address */
@@ -172,37 +162,12 @@ typedef struct iptun_s {
(IPTUN_BOUND | IPTUN_MAC_STARTED))
/*
- * We request ire information for the tunnel destination in order to obtain
- * its path MTU information. We use that to calculate the initial link MTU of
- * a tunnel.
- *
- * After that, if the path MTU of the tunnel destination becomes smaller
- * than the link MTU of the tunnel, then we will receive a packet too big
- * (aka fragmentation needed) ICMP error when we transmit a packet larger
- * than the path MTU, and we will adjust the tunne's MTU based on the ICMP
- * error's MTU information.
- *
- * In addition to that, we also need to request the ire information
- * periodically to make sure the link MTU of a tunnel doesn't become stale
- * if the path MTU of the tunnel destination becomes larger than the link
- * MTU of the tunnel. The period for the requests is ten minutes in
- * accordance with rfc1191.
- */
-#define IPTUN_PMTU_AGE SEC_TO_TICK(600)
-#define IPTUN_PMTU_TOO_OLD(ipt) \
- (((ipt)->iptun_flags & IPTUN_RADDR) && \
- !((ipt)->iptun_flags & IPTUN_FIXED_MTU) && \
- (ddi_get_lbolt() - (ipt)->iptun_dpmtu_lastupdate) > IPTUN_PMTU_AGE)
-
-/*
- * iptuns_lock protects iptuns_iptunlist and iptuns_g_q.
+ * iptuns_lock protects iptuns_iptunlist.
*/
typedef struct iptun_stack {
netstack_t *iptuns_netstack; /* Common netstack */
kmutex_t iptuns_lock;
list_t iptuns_iptunlist; /* list of tunnels in this stack. */
- queue_t *iptuns_g_q; /* read-side IP queue */
- ldi_handle_t iptuns_g_q_lh;
ipaddr_t iptuns_relay_rtr_addr;
} iptun_stack_t;
@@ -222,8 +187,6 @@ extern int iptun_info(iptun_kparams_t *, cred_t *);
extern int iptun_set_6to4relay(netstack_t *, ipaddr_t);
extern void iptun_get_6to4relay(netstack_t *, ipaddr_t *);
extern void iptun_set_policy(datalink_id_t, ipsec_tun_pol_t *);
-extern void iptun_set_g_q(netstack_t *, queue_t *);
-extern void iptun_clear_g_q(netstack_t *);
#endif /* _KERNEL */
diff --git a/usr/src/uts/common/inet/keysock.h b/usr/src/uts/common/inet/keysock.h
index 50189666c7..cb618cedaf 100644
--- a/usr/src/uts/common/inet/keysock.h
+++ b/usr/src/uts/common/inet/keysock.h
@@ -19,22 +19,20 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _INET_KEYSOCK_H
#define _INET_KEYSOCK_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
extern int keysock_opt_get(queue_t *, int, int, uchar_t *);
extern int keysock_opt_set(queue_t *, uint_t, int, int, uint_t,
- uchar_t *, uint_t *, uchar_t *, void *, cred_t *cr, mblk_t *mblk);
+ uchar_t *, uint_t *, uchar_t *, void *, cred_t *cr);
/*
* Object to represent database of options to search passed to
diff --git a/usr/src/uts/common/inet/kssl/ksslrec.c b/usr/src/uts/common/inet/kssl/ksslrec.c
index 14a285b4ab..6b7ce0ad42 100644
--- a/usr/src/uts/common/inet/kssl/ksslrec.c
+++ b/usr/src/uts/common/inet/kssl/ksslrec.c
@@ -239,7 +239,7 @@ kssl_compute_record_mac(
* context when called from strsock_kssl_input(). During the
* SSL handshake, we are called for client_finished message
* handling from a squeue worker thread that gets scheduled
- * by an squeue_fill() call. This thread is not in interrupt
+ * by an SQ_FILL call. This thread is not in interrupt
* context and so can block.
*/
rv = crypto_mac(&spec->hmac_mech, &dd, &spec->hmac_key,
diff --git a/usr/src/uts/common/inet/mi.c b/usr/src/uts/common/inet/mi.c
index f88fe3709b..9fe77e88c4 100644
--- a/usr/src/uts/common/inet/mi.c
+++ b/usr/src/uts/common/inet/mi.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -1359,7 +1359,7 @@ mi_tpi_addr_and_opt(MBLKP mp, char *addr, t_scalar_t addr_length,
* This code is used more than just for unitdata ind
* (also for T_CONN_IND and T_CONN_CON) and
* relies on correct functioning on the happy
- * coincidence that the the address and option buffers
+ * coincidence that the address and option buffers
* represented by length/offset in all these primitives
* are isomorphic in terms of offset from start of data
* structure
diff --git a/usr/src/uts/common/inet/mib2.h b/usr/src/uts/common/inet/mib2.h
index 16bed4ec2c..06db81ea74 100644
--- a/usr/src/uts/common/inet/mib2.h
+++ b/usr/src/uts/common/inet/mib2.h
@@ -66,8 +66,8 @@ extern "C" {
* "get all" is supported, so all modules get a copy of the request to
* return everything it knows. In general, we use MIB2_IP. There is
* one exception: in general, IP will not report information related to
- * IRE_MARK_TESTHIDDEN routes (e.g., in the MIB2_IP_ROUTE table).
- * However, using the special value EXPER_IP_AND_TESTHIDDEN will cause
+ * ire_testhidden and IRE_IF_CLONE routes (e.g., in the MIB2_IP_ROUTE
+ * table). However, using the special value EXPER_IP_AND_ALL_IRES will cause
* all information to be reported. This special value should only be
* used by IPMP-aware low-level utilities (e.g. in.mpathd).
*
@@ -109,7 +109,7 @@ extern "C" {
#define EXPER_IGMP (EXPER+1)
#define EXPER_DVMRP (EXPER+2)
#define EXPER_RAWIP (EXPER+3)
-#define EXPER_IP_AND_TESTHIDDEN (EXPER+4)
+#define EXPER_IP_AND_ALL_IRES (EXPER+4)
/*
* Define range of levels for experimental use
@@ -170,6 +170,7 @@ typedef uint32_t DeviceIndex; /* Interface index */
#define EXPER_IP_GROUP_SOURCES 102
#define EXPER_IP6_GROUP_SOURCES 103
#define EXPER_IP_RTATTR 104
+#define EXPER_IP_DCE 105
/*
* There can be one of each of these tables per transport (MIB2_* above).
@@ -267,15 +268,13 @@ typedef struct mib2_ip {
int ipMemberEntrySize; /* Size of ip_member_t */
int ipGroupSourceEntrySize; /* Size of ip_grpsrc_t */
- /* # of IPv6 packets received by IPv4 and dropped */
- Counter ipInIPv6;
- /* # of IPv6 packets transmitted by ip_wput */
- Counter ipOutIPv6;
- /* # of times ip_wput has switched to become ip_wput_v6 */
- Counter ipOutSwitchIPv6;
+ Counter ipInIPv6; /* # of IPv6 packets received by IPv4 and dropped */
+ Counter ipOutIPv6; /* No longer used */
+ Counter ipOutSwitchIPv6; /* No longer used */
int ipRouteAttributeSize; /* Size of mib2_ipAttributeEntry_t */
int transportMLPSize; /* Size of mib2_transportMLPEntry_t */
+ int ipDestEntrySize; /* Size of dest_cache_entry_t */
} mib2_ip_t;
/*
@@ -503,14 +502,11 @@ typedef struct mib2_ipIfStatsEntry {
*/
Counter ipIfStatsInWrongIPVersion;
/*
- * Depending on the value of ipIfStatsIPVersion, this counter tracks
- * v4: # of IPv6 packets transmitted by ip_wput or,
- * v6: # of IPv4 packets transmitted by ip_wput_v6.
+ * This counter is no longer used
*/
Counter ipIfStatsOutWrongIPVersion;
/*
- * Depending on the value of ipIfStatsIPVersion, this counter tracks
- * # of times ip_wput has switched to become ip_wput_v6, or vice versa.
+ * This counter is no longer used
*/
Counter ipIfStatsOutSwitchIPVersion;
@@ -981,6 +977,21 @@ typedef struct ipv6_grpsrc {
/*
+ * List of destination cache entries
+ */
+typedef struct dest_cache_entry {
+	/* Destination IP address */
+ IpAddress DestIpv4Address;
+ Ip6Address DestIpv6Address;
+ uint_t DestFlags; /* DCEF_* */
+ uint32_t DestPmtu; /* Path MTU if DCEF_PMTU */
+ uint32_t DestIdent; /* Per destination IP ident. */
+ DeviceIndex DestIfindex; /* For IPv6 link-locals */
+ uint32_t DestAge; /* Age of MTU info in seconds */
+} dest_cache_entry_t;
+
+
+/*
* ICMP Group
*/
typedef struct mib2_icmp {
diff --git a/usr/src/uts/common/inet/optcom.c b/usr/src/uts/common/inet/optcom.c
index e35b7f6af5..e4d1abff4c 100644
--- a/usr/src/uts/common/inet/optcom.c
+++ b/usr/src/uts/common/inet/optcom.c
@@ -58,21 +58,21 @@
* Function prototypes
*/
static t_scalar_t process_topthdrs_first_pass(mblk_t *, cred_t *, optdb_obj_t *,
- boolean_t *, size_t *);
+ size_t *);
static t_scalar_t do_options_second_pass(queue_t *q, mblk_t *reqmp,
mblk_t *ack_mp, cred_t *, optdb_obj_t *dbobjp,
- mblk_t *first_mp, boolean_t is_restart, boolean_t *queued_statusp);
+ t_uscalar_t *worst_statusp);
static t_uscalar_t get_worst_status(t_uscalar_t, t_uscalar_t);
static int do_opt_default(queue_t *, struct T_opthdr *, uchar_t **,
t_uscalar_t *, cred_t *, optdb_obj_t *);
static void do_opt_current(queue_t *, struct T_opthdr *, uchar_t **,
t_uscalar_t *, cred_t *cr, optdb_obj_t *);
-static int do_opt_check_or_negotiate(queue_t *q, struct T_opthdr *reqopt,
+static void do_opt_check_or_negotiate(queue_t *q, struct T_opthdr *reqopt,
uint_t optset_context, uchar_t **resptrp, t_uscalar_t *worst_statusp,
- cred_t *, optdb_obj_t *dbobjp, mblk_t *first_mp);
+ cred_t *, optdb_obj_t *dbobjp);
static boolean_t opt_level_valid(t_uscalar_t, optlevel_t *, uint_t);
static size_t opt_level_allopts_lengths(t_uscalar_t, opdes_t *, uint_t);
-static boolean_t opt_length_ok(opdes_t *, struct T_opthdr *);
+static boolean_t opt_length_ok(opdes_t *, t_uscalar_t optlen);
static t_uscalar_t optcom_max_optbuf_len(opdes_t *, uint_t);
static boolean_t opt_bloated_maxsize(opdes_t *);
@@ -176,35 +176,15 @@ optcom_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error)
* job requested.
* XXX Code below needs some restructuring after we have some more
* macros to support 'struct opthdr' in the headers.
- *
- * IP-MT notes: The option management framework functions svr4_optcom_req() and
- * tpi_optcom_req() allocate and prepend an M_CTL mblk to the actual
- * T_optmgmt_req mblk and pass the chain as an additional parameter to the
- * protocol set functions. If a protocol set function (such as ip_opt_set)
- * cannot process the option immediately it can return EINPROGRESS. ip_opt_set
- * enqueues the message in the appropriate sq and returns EINPROGRESS. Later
- * the sq framework arranges to restart this operation and passes control to
- * the restart function ip_restart_optmgmt() which in turn calls
- * svr4_optcom_req() or tpi_optcom_req() to restart the option processing.
- *
- * XXX Remove the asynchronous behavior of svr_optcom_req() and
- * tpi_optcom_req().
*/
-int
-svr4_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp,
- boolean_t pass_to_ip)
+void
+svr4_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp)
{
pfi_t deffn = dbobjp->odb_deffn;
pfi_t getfn = dbobjp->odb_getfn;
opt_set_fn setfn = dbobjp->odb_setfn;
opdes_t *opt_arr = dbobjp->odb_opt_des_arr;
uint_t opt_arr_cnt = dbobjp->odb_opt_arr_cnt;
- boolean_t topmost_tpiprovider = dbobjp->odb_topmost_tpiprovider;
- opt_restart_t *or;
- struct opthdr *restart_opt;
- boolean_t is_restart = B_FALSE;
- mblk_t *first_mp;
-
t_uscalar_t max_optbuf_len;
int len;
mblk_t *mp1 = NULL;
@@ -214,33 +194,10 @@ svr4_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp,
struct opthdr *opt_end;
struct opthdr *opt_start;
opdes_t *optd;
- boolean_t pass_to_next = B_FALSE;
struct T_optmgmt_ack *toa;
struct T_optmgmt_req *tor;
int error;
- /*
- * Allocate M_CTL and prepend to the packet for restarting this
- * option if needed. IP may need to queue and restart the option
- * if it cannot obtain exclusive conditions immediately. Please see
- * IP-MT notes before the start of svr4_optcom_req
- */
- if (mp->b_datap->db_type == M_CTL) {
- is_restart = B_TRUE;
- first_mp = mp;
- mp = mp->b_cont;
- ASSERT(mp->b_wptr - mp->b_rptr >=
- sizeof (struct T_optmgmt_req));
- tor = (struct T_optmgmt_req *)mp->b_rptr;
- ASSERT(tor->MGMT_flags == T_NEGOTIATE);
-
- or = (opt_restart_t *)first_mp->b_rptr;
- opt_start = or->or_start;
- opt_end = or->or_end;
- restart_opt = or->or_ropt;
- goto restart;
- }
-
tor = (struct T_optmgmt_req *)mp->b_rptr;
/* Verify message integrity. */
if (mp->b_wptr - mp->b_rptr < sizeof (struct T_optmgmt_req))
@@ -255,7 +212,7 @@ svr4_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp,
break;
default:
optcom_err_ack(q, mp, TBADFLAG, 0);
- return (0);
+ return;
}
if (tor->MGMT_flags == T_DEFAULT) {
/* Is it a request for default option settings? */
@@ -278,7 +235,6 @@ svr4_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp,
* ----historical comment end -------
*/
/* T_DEFAULT not passed down */
- ASSERT(topmost_tpiprovider == B_TRUE);
freemsg(mp);
max_optbuf_len = optcom_max_optbuf_len(opt_arr,
opt_arr_cnt);
@@ -286,7 +242,7 @@ svr4_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp,
if (!mp) {
no_mem:;
optcom_err_ack(q, mp, TSYSERR, ENOMEM);
- return (0);
+ return;
}
/* Initialize the T_optmgmt_ack header. */
@@ -362,7 +318,7 @@ no_mem:;
mp->b_datap->db_type = M_PCPROTO;
/* Ship it back. */
qreply(q, mp);
- return (0);
+ return;
}
/* T_DEFAULT processing complete - no more T_DEFAULT */
@@ -414,15 +370,15 @@ no_mem:;
goto bad_opt;
error = proto_opt_check(opt->level, opt->name, opt->len, NULL,
- opt_arr, opt_arr_cnt, topmost_tpiprovider,
+ opt_arr, opt_arr_cnt,
tor->MGMT_flags == T_NEGOTIATE, tor->MGMT_flags == T_CHECK,
cr);
if (error < 0) {
optcom_err_ack(q, mp, -error, 0);
- return (0);
+ return;
} else if (error > 0) {
optcom_err_ack(q, mp, TSYSERR, error);
- return (0);
+ return;
}
} /* end for loop scanning option buffer */
@@ -491,24 +447,9 @@ no_mem:;
/* Ditch the input buffer. */
freemsg(mp);
mp = mp1;
- /* Always let the next module look at the option. */
- pass_to_next = B_TRUE;
break;
case T_NEGOTIATE:
- first_mp = allocb(sizeof (opt_restart_t), BPRI_LO);
- if (first_mp == NULL) {
- optcom_err_ack(q, mp, TSYSERR, ENOMEM);
- return (0);
- }
- first_mp->b_datap->db_type = M_CTL;
- or = (opt_restart_t *)first_mp->b_rptr;
- or->or_start = opt_start;
- or->or_end = opt_end;
- or->or_type = T_SVR4_OPTMGMT_REQ;
- or->or_private = 0;
- first_mp->b_cont = mp;
-restart:
/*
* Here we are expecting that the response buffer is exactly
* the same size as the input buffer. We pass each opthdr
@@ -523,22 +464,16 @@ restart:
*/
toa = (struct T_optmgmt_ack *)tor;
- for (opt = is_restart ? restart_opt: opt_start; opt < opt_end;
- opt = next_opt) {
+ for (opt = opt_start; opt < opt_end; opt = next_opt) {
int error;
- /*
- * Point to the current option in or, in case this
- * option has to be restarted later on
- */
- or->or_ropt = opt;
next_opt = (struct opthdr *)((uchar_t *)&opt[1] +
_TPI_ALIGN_OPT(opt->len));
error = (*setfn)(q, SETFN_OPTCOM_NEGOTIATE,
opt->level, opt->name,
opt->len, (uchar_t *)&opt[1],
- &opt->len, (uchar_t *)&opt[1], NULL, cr, first_mp);
+ &opt->len, (uchar_t *)&opt[1], NULL, cr);
/*
* Treat positive "errors" as real.
* Note: negative errors are to be treated as
@@ -549,99 +484,48 @@ restart:
* it is valid but was either handled upstream
* or will be handled downstream.
*/
- if (error == EINPROGRESS) {
- /*
- * The message is queued and will be
- * reprocessed later. Typically ip queued
- * the message to get some exclusive conditions
- * and later on calls this func again.
- */
- return (EINPROGRESS);
- } else if (error > 0) {
+ if (error > 0) {
optcom_err_ack(q, mp, TSYSERR, error);
- freeb(first_mp);
- return (0);
+ return;
}
/*
* error < 0 means option is not recognized.
- * But with OP_PASSNEXT the next module
- * might recognize it.
*/
}
- /* Done with the restart control mp. */
- freeb(first_mp);
- pass_to_next = B_TRUE;
break;
default:
optcom_err_ack(q, mp, TBADFLAG, 0);
- return (0);
+ return;
}
- if (pass_to_next && (q->q_next != NULL || pass_to_ip)) {
- /* Send it down to the next module and let it reply */
- toa->PRIM_type = T_SVR4_OPTMGMT_REQ; /* Changed by IP to ACK */
- if (q->q_next != NULL)
- putnext(q, mp);
- else
- ip_output(Q_TO_CONN(q), mp, q, IP_WPUT);
- } else {
- /* Set common fields in the header. */
- toa->MGMT_flags = T_SUCCESS;
- mp->b_datap->db_type = M_PCPROTO;
- toa->PRIM_type = T_OPTMGMT_ACK;
- qreply(q, mp);
- }
- return (0);
+ /* Set common fields in the header. */
+ toa->MGMT_flags = T_SUCCESS;
+ mp->b_datap->db_type = M_PCPROTO;
+ toa->PRIM_type = T_OPTMGMT_ACK;
+ qreply(q, mp);
+ return;
bad_opt:;
optcom_err_ack(q, mp, TBADOPT, 0);
- return (0);
}
/*
* New optcom_req inspired by TPI/XTI semantics
*/
-int
-tpi_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp,
- boolean_t pass_to_ip)
+void
+tpi_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp)
{
t_scalar_t t_error;
mblk_t *toa_mp;
- boolean_t pass_to_next;
size_t toa_len;
struct T_optmgmt_ack *toa;
struct T_optmgmt_req *tor =
(struct T_optmgmt_req *)mp->b_rptr;
-
- opt_restart_t *or;
- boolean_t is_restart = B_FALSE;
- mblk_t *first_mp = NULL;
t_uscalar_t worst_status;
- boolean_t queued_status;
-
- /*
- * Allocate M_CTL and prepend to the packet for restarting this
- * option if needed. IP may need to queue and restart the option
- * if it cannot obtain exclusive conditions immediately. Please see
- * IP-MT notes before the start of svr4_optcom_req
- */
- if (mp->b_datap->db_type == M_CTL) {
- is_restart = B_TRUE;
- first_mp = mp;
- toa_mp = mp->b_cont;
- mp = toa_mp->b_cont;
- ASSERT(mp->b_wptr - mp->b_rptr >=
- sizeof (struct T_optmgmt_req));
- tor = (struct T_optmgmt_req *)mp->b_rptr;
- ASSERT(tor->MGMT_flags == T_NEGOTIATE);
-
- or = (opt_restart_t *)first_mp->b_rptr;
- goto restart;
- }
/* Verify message integrity. */
if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_optmgmt_req)) {
optcom_err_ack(q, mp, TBADOPT, 0);
- return (0);
+ return;
}
/* Verify MGMT_flags legal */
@@ -654,7 +538,7 @@ tpi_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp,
break;
default:
optcom_err_ack(q, mp, TBADFLAG, 0);
- return (0);
+ return;
}
/*
@@ -669,7 +553,6 @@ tpi_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp,
* T_ALLOPT mean that length can be different for output buffer).
*/
- pass_to_next = B_FALSE; /* initial value */
toa_len = 0; /* initial value */
/*
@@ -677,13 +560,11 @@ tpi_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp,
* - estimate cumulative length needed for results
* - set "status" field based on permissions, option header check
* etc.
- * - determine "pass_to_next" whether we need to send request to
- * downstream module/driver.
*/
if ((t_error = process_topthdrs_first_pass(mp, cr, dbobjp,
- &pass_to_next, &toa_len)) != 0) {
+ &toa_len)) != 0) {
optcom_err_ack(q, mp, t_error, 0);
- return (0);
+ return;
}
/*
@@ -697,26 +578,14 @@ tpi_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp,
toa_mp = allocb_tmpl(toa_len, mp);
if (!toa_mp) {
optcom_err_ack(q, mp, TSYSERR, ENOMEM);
- return (0);
+ return;
}
- first_mp = allocb(sizeof (opt_restart_t), BPRI_LO);
- if (first_mp == NULL) {
- freeb(toa_mp);
- optcom_err_ack(q, mp, TSYSERR, ENOMEM);
- return (0);
- }
- first_mp->b_datap->db_type = M_CTL;
- or = (opt_restart_t *)first_mp->b_rptr;
/*
* Set initial values for generating output.
*/
- or->or_worst_status = T_SUCCESS;
- or->or_type = T_OPTMGMT_REQ;
- or->or_private = 0;
- /* remaining fields fileed in do_options_second_pass */
+ worst_status = T_SUCCESS; /* initial value */
-restart:
/*
* This routine makes another pass through the option buffer this
* time acting on the request based on "status" result in the
@@ -724,19 +593,11 @@ restart:
* all options of a certain level and acts on each for this request.
*/
if ((t_error = do_options_second_pass(q, mp, toa_mp, cr, dbobjp,
- first_mp, is_restart, &queued_status)) != 0) {
+ &worst_status)) != 0) {
freemsg(toa_mp);
optcom_err_ack(q, mp, t_error, 0);
- return (0);
- }
- if (queued_status) {
- /* Option will be restarted */
- return (EINPROGRESS);
+ return;
}
- worst_status = or->or_worst_status;
- /* Done with the first mp */
- freeb(first_mp);
- toa_mp->b_cont = NULL;
/*
* Following code relies on the coincidence that T_optmgmt_req
@@ -749,34 +610,12 @@ restart:
toa->MGMT_flags = tor->MGMT_flags;
-
freemsg(mp); /* free input mblk */
- /*
- * If there is atleast one option that requires a downstream
- * forwarding and if it is possible, we forward the message
- * downstream. Else we ack it.
- */
- if (pass_to_next && (q->q_next != NULL || pass_to_ip)) {
- /*
- * We pass it down as T_OPTMGMT_REQ. This code relies
- * on the happy coincidence that T_optmgmt_req and
- * T_optmgmt_ack are identical data structures
- * at the binary representation level.
- */
- toa_mp->b_datap->db_type = M_PROTO;
- toa->PRIM_type = T_OPTMGMT_REQ;
- if (q->q_next != NULL)
- putnext(q, toa_mp);
- else
- ip_output(Q_TO_CONN(q), toa_mp, q, IP_WPUT);
- } else {
- toa->PRIM_type = T_OPTMGMT_ACK;
- toa_mp->b_datap->db_type = M_PCPROTO;
- toa->MGMT_flags |= worst_status; /* XXX "worst" or "OR" TPI ? */
- qreply(q, toa_mp);
- }
- return (0);
+ toa->PRIM_type = T_OPTMGMT_ACK;
+ toa_mp->b_datap->db_type = M_PCPROTO;
+ toa->MGMT_flags |= worst_status; /* XXX "worst" or "OR" TPI ? */
+ qreply(q, toa_mp);
}
@@ -786,17 +625,14 @@ restart:
* - estimate cumulative length needed for results
* - set "status" field based on permissions, option header check
* etc.
- * - determine "pass_to_next" whether we need to send request to
- * downstream module/driver.
*/
static t_scalar_t
process_topthdrs_first_pass(mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp,
- boolean_t *pass_to_nextp, size_t *toa_lenp)
+ size_t *toa_lenp)
{
opdes_t *opt_arr = dbobjp->odb_opt_des_arr;
uint_t opt_arr_cnt = dbobjp->odb_opt_arr_cnt;
- boolean_t topmost_tpiprovider = dbobjp->odb_topmost_tpiprovider;
optlevel_t *valid_level_arr = dbobjp->odb_valid_levels_arr;
uint_t valid_level_arr_cnt = dbobjp->odb_valid_levels_arr_cnt;
struct T_opthdr *opt;
@@ -843,18 +679,14 @@ process_topthdrs_first_pass(mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp,
* unchanged if they do not understand an
* option.
*/
- if (topmost_tpiprovider) {
- if (!opt_level_valid(opt->level,
- valid_level_arr,
- valid_level_arr_cnt))
- return (TBADOPT);
- /*
- * level is valid - initialize
- * option as not supported
- */
- opt->status = T_NOTSUPPORT;
- }
-
+ if (!opt_level_valid(opt->level,
+ valid_level_arr, valid_level_arr_cnt))
+ return (TBADOPT);
+ /*
+ * level is valid - initialize
+ * option as not supported
+ */
+ opt->status = T_NOTSUPPORT;
*toa_lenp += _TPI_ALIGN_TOPT(opt->len);
continue;
}
@@ -866,7 +698,6 @@ process_topthdrs_first_pass(mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp,
*/
allopt_len = 0;
if (tor->MGMT_flags == T_CHECK ||
- !topmost_tpiprovider ||
((allopt_len = opt_level_allopts_lengths(opt->level,
opt_arr, opt_arr_cnt)) == 0)) {
/*
@@ -874,11 +705,6 @@ process_topthdrs_first_pass(mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp,
* It is not valid to to use T_ALLOPT with
* T_CHECK flag.
*
- * T_ALLOPT is assumed "expanded" at the
- * topmost_tpiprovider level so it should not
- * be there as an "option name" if this is not
- * a topmost_tpiprovider call and we fail it.
- *
* opt_level_allopts_lengths() is used to verify
* that "level" associated with the T_ALLOPT is
* supported.
@@ -892,15 +718,8 @@ process_topthdrs_first_pass(mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp,
*toa_lenp += allopt_len;
opt->status = T_SUCCESS;
- /* XXX - always set T_ALLOPT 'pass_to_next' for now */
- *pass_to_nextp = B_TRUE;
continue;
}
- /*
- * Check if option wants to flow downstream
- */
- if (optd->opdes_props & OP_PASSNEXT)
- *pass_to_nextp = B_TRUE;
/* Additional checks dependent on operation. */
switch (tor->MGMT_flags) {
@@ -972,7 +791,9 @@ process_topthdrs_first_pass(mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp,
* Note: This can override anything about this
* option request done at a higher level.
*/
- if (!opt_length_ok(optd, opt)) {
+ if (opt->len < sizeof (struct T_opthdr) ||
+ !opt_length_ok(optd,
+ opt->len - sizeof (struct T_opthdr))) {
/* bad size */
*toa_lenp += _TPI_ALIGN_TOPT(opt->len);
opt->status = T_FAILURE;
@@ -1034,23 +855,14 @@ process_topthdrs_first_pass(mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp,
*/
static t_scalar_t
do_options_second_pass(queue_t *q, mblk_t *reqmp, mblk_t *ack_mp, cred_t *cr,
- optdb_obj_t *dbobjp, mblk_t *first_mp, boolean_t is_restart,
- boolean_t *queued_statusp)
+ optdb_obj_t *dbobjp, t_uscalar_t *worst_statusp)
{
- boolean_t topmost_tpiprovider = dbobjp->odb_topmost_tpiprovider;
int failed_option;
struct T_opthdr *opt;
- struct T_opthdr *opt_start, *opt_end, *restart_opt;
+ struct T_opthdr *opt_start, *opt_end;
uchar_t *optr;
uint_t optset_context;
struct T_optmgmt_req *tor = (struct T_optmgmt_req *)reqmp->b_rptr;
- opt_restart_t *or;
- t_uscalar_t *worst_statusp;
- int err;
-
- *queued_statusp = B_FALSE;
- or = (opt_restart_t *)first_mp->b_rptr;
- worst_statusp = &or->or_worst_status;
optr = (uchar_t *)ack_mp->b_rptr +
sizeof (struct T_optmgmt_ack); /* assumed int32_t aligned */
@@ -1058,32 +870,16 @@ do_options_second_pass(queue_t *q, mblk_t *reqmp, mblk_t *ack_mp, cred_t *cr,
/*
* Set initial values for scanning input
*/
- if (is_restart) {
- opt_start = (struct T_opthdr *)or->or_start;
- opt_end = (struct T_opthdr *)or->or_end;
- restart_opt = (struct T_opthdr *)or->or_ropt;
- } else {
- opt_start = (struct T_opthdr *)mi_offset_param(reqmp,
- tor->OPT_offset, tor->OPT_length);
- if (opt_start == NULL)
- return (TBADOPT);
- opt_end = (struct T_opthdr *)((uchar_t *)opt_start +
- tor->OPT_length);
- or->or_start = (struct opthdr *)opt_start;
- or->or_end = (struct opthdr *)opt_end;
- /*
- * construct the mp chain, in case the setfn needs to
- * queue this and restart option processing later on.
- */
- first_mp->b_cont = ack_mp;
- ack_mp->b_cont = reqmp;
- }
+ opt_start = (struct T_opthdr *)mi_offset_param(reqmp,
+ tor->OPT_offset, tor->OPT_length);
+ if (opt_start == NULL)
+ return (TBADOPT);
+ opt_end = (struct T_opthdr *)((uchar_t *)opt_start + tor->OPT_length);
ASSERT(__TPI_TOPT_ISALIGNED(opt_start)); /* verified in first pass */
- for (opt = is_restart ? restart_opt : opt_start;
- opt && (opt < opt_end);
+ for (opt = opt_start; opt && (opt < opt_end);
opt = _TPI_TOPT_NEXTHDR(opt_start, tor->OPT_length, opt)) {
- or->or_ropt = (struct opthdr *)opt;
+
/* verified in first pass */
ASSERT(_TPI_TOPT_VALID(opt, opt_start, opt_end));
@@ -1144,9 +940,7 @@ do_options_second_pass(queue_t *q, mblk_t *reqmp, mblk_t *ack_mp, cred_t *cr,
*/
if (do_opt_default(q, opt, &optr, worst_statusp,
cr, dbobjp) < 0) {
- /* fail or pass transparently */
- if (topmost_tpiprovider)
- opt->status = T_FAILURE;
+ opt->status = T_FAILURE;
bcopy(opt, optr, opt->len);
optr += _TPI_ALIGN_TOPT(opt->len);
*worst_statusp = get_worst_status(opt->status,
@@ -1166,12 +960,8 @@ do_options_second_pass(queue_t *q, mblk_t *reqmp, mblk_t *ack_mp, cred_t *cr,
optset_context = SETFN_OPTCOM_CHECKONLY;
else /* T_NEGOTIATE */
optset_context = SETFN_OPTCOM_NEGOTIATE;
- err = do_opt_check_or_negotiate(q, opt, optset_context,
- &optr, worst_statusp, cr, dbobjp, first_mp);
- if (err == EINPROGRESS) {
- *queued_statusp = B_TRUE;
- return (0);
- }
+ do_opt_check_or_negotiate(q, opt, optset_context,
+ &optr, worst_statusp, cr, dbobjp);
break;
default:
return (TBADFLAG);
@@ -1236,7 +1026,6 @@ do_opt_default(queue_t *q, struct T_opthdr *reqopt, uchar_t **resptrp,
pfi_t deffn = dbobjp->odb_deffn;
opdes_t *opt_arr = dbobjp->odb_opt_des_arr;
uint_t opt_arr_cnt = dbobjp->odb_opt_arr_cnt;
- boolean_t topmost_tpiprovider = dbobjp->odb_topmost_tpiprovider;
struct T_opthdr *topth;
opdes_t *optd;
@@ -1248,15 +1037,8 @@ do_opt_default(queue_t *q, struct T_opthdr *reqopt, uchar_t **resptrp,
optd = proto_opt_lookup(reqopt->level, reqopt->name,
opt_arr, opt_arr_cnt);
- if (optd == NULL) {
- /*
- * not found - fail this one. Should not happen
- * for topmost_tpiprovider as calling routine
- * should have verified it.
- */
- ASSERT(!topmost_tpiprovider);
- return (-1);
- }
+	/* Calling routine should have verified that it exists */
+ ASSERT(optd != NULL);
topth = (struct T_opthdr *)(*resptrp);
topth->level = reqopt->level;
@@ -1333,10 +1115,7 @@ do_opt_default(queue_t *q, struct T_opthdr *reqopt, uchar_t **resptrp,
*
* lookup and stuff default values of all the options of the
* level specified
- * Note: This expansion of T_ALLOPT should happen in
- * a topmost_tpiprovider.
*/
- ASSERT(topmost_tpiprovider);
for (optd = opt_arr; optd < &opt_arr[opt_arr_cnt]; optd++) {
if (reqopt->level != optd->opdes_level)
continue;
@@ -1453,8 +1232,6 @@ do_opt_current(queue_t *q, struct T_opthdr *reqopt, uchar_t **resptrp,
pfi_t getfn = dbobjp->odb_getfn;
opdes_t *opt_arr = dbobjp->odb_opt_des_arr;
uint_t opt_arr_cnt = dbobjp->odb_opt_arr_cnt;
- boolean_t topmost_tpiprovider = dbobjp->odb_topmost_tpiprovider;
-
struct T_opthdr *topth;
opdes_t *optd;
int optlen;
@@ -1484,7 +1261,6 @@ do_opt_current(queue_t *q, struct T_opthdr *reqopt, uchar_t **resptrp,
*resptrp -= sizeof (struct T_opthdr);
}
} else { /* T_ALLOPT processing */
- ASSERT(topmost_tpiprovider == B_TRUE);
/* scan and get all options */
for (optd = opt_arr; optd < &opt_arr[opt_arr_cnt]; optd++) {
/* skip other levels */
@@ -1530,14 +1306,9 @@ do_opt_current(queue_t *q, struct T_opthdr *reqopt, uchar_t **resptrp,
}
if (*resptrp == initptr) {
/*
- * getfn failed and does not want to handle this option. Maybe
- * something downstream will or something upstream did. (If
- * topmost_tpiprovider, initialize "status" to failure which
- * can possibly change downstream). Copy the input "as is" from
- * input option buffer if any to maintain transparency.
+ * getfn failed and does not want to handle this option.
*/
- if (topmost_tpiprovider)
- reqopt->status = T_FAILURE;
+ reqopt->status = T_FAILURE;
bcopy(reqopt, *resptrp, reqopt->len);
*resptrp += _TPI_ALIGN_TOPT(reqopt->len);
*worst_statusp = get_worst_status(reqopt->status,
@@ -1545,18 +1316,15 @@ do_opt_current(queue_t *q, struct T_opthdr *reqopt, uchar_t **resptrp,
}
}
-/* ARGSUSED */
-static int
+static void
do_opt_check_or_negotiate(queue_t *q, struct T_opthdr *reqopt,
uint_t optset_context, uchar_t **resptrp, t_uscalar_t *worst_statusp,
- cred_t *cr, optdb_obj_t *dbobjp, mblk_t *first_mp)
+ cred_t *cr, optdb_obj_t *dbobjp)
{
pfi_t deffn = dbobjp->odb_deffn;
opt_set_fn setfn = dbobjp->odb_setfn;
opdes_t *opt_arr = dbobjp->odb_opt_des_arr;
uint_t opt_arr_cnt = dbobjp->odb_opt_arr_cnt;
- boolean_t topmost_tpiprovider = dbobjp->odb_topmost_tpiprovider;
-
struct T_opthdr *topth;
opdes_t *optd;
int error;
@@ -1572,12 +1340,10 @@ do_opt_check_or_negotiate(queue_t *q, struct T_opthdr *reqopt,
error = (*setfn)(q, optset_context, reqopt->level, reqopt->name,
reqopt->len - sizeof (struct T_opthdr),
_TPI_TOPT_DATA(reqopt), &optlen, _TPI_TOPT_DATA(topth),
- NULL, cr, first_mp);
+ NULL, cr);
if (error) {
/* failed - reset "*resptrp" */
*resptrp -= sizeof (struct T_opthdr);
- if (error == EINPROGRESS)
- return (error);
} else {
/*
* success - "value" already filled in setfn()
@@ -1594,7 +1360,6 @@ do_opt_check_or_negotiate(queue_t *q, struct T_opthdr *reqopt,
} else { /* T_ALLOPT processing */
/* only for T_NEGOTIATE case */
ASSERT(optset_context == SETFN_OPTCOM_NEGOTIATE);
- ASSERT(topmost_tpiprovider == B_TRUE);
/* scan and set all options to default value */
for (optd = opt_arr; optd < &opt_arr[opt_arr_cnt]; optd++) {
@@ -1670,7 +1435,7 @@ do_opt_check_or_negotiate(queue_t *q, struct T_opthdr *reqopt,
error = (*setfn)(q, SETFN_OPTCOM_NEGOTIATE,
reqopt->level, optd->opdes_name, optsize,
(uchar_t *)optd->opdes_defbuf, &optlen,
- _TPI_TOPT_DATA(topth), NULL, cr, NULL);
+ _TPI_TOPT_DATA(topth), NULL, cr);
if (error) {
/*
* failed, return as T_FAILURE and null value
@@ -1693,20 +1458,14 @@ do_opt_check_or_negotiate(queue_t *q, struct T_opthdr *reqopt,
if (*resptrp == initptr) {
/*
- * setfn failed and does not want to handle this option. Maybe
- * something downstream will or something upstream
- * did. Copy the input as is from input option buffer if any to
- * maintain transparency (maybe something at a level above
- * did something.
+ * setfn failed and does not want to handle this option.
*/
- if (topmost_tpiprovider)
- reqopt->status = T_FAILURE;
+ reqopt->status = T_FAILURE;
bcopy(reqopt, *resptrp, reqopt->len);
*resptrp += _TPI_ALIGN_TOPT(reqopt->len);
*worst_statusp = get_worst_status(reqopt->status,
*worst_statusp);
}
- return (0);
}
/*
@@ -1886,7 +1645,8 @@ tpi_optcom_buf(queue_t *q, mblk_t *mp, t_scalar_t *opt_lenp,
*/
/* verify length */
- if (!opt_length_ok(optd, opt)) {
+ if (opt->len < (t_uscalar_t)sizeof (struct T_opthdr) ||
+ !opt_length_ok(optd, opt->len - sizeof (struct T_opthdr))) {
/* bad size */
if ((optd->opdes_props & OP_NOT_ABSREQ) == 0) {
/* option is absolute requirement */
@@ -1914,7 +1674,7 @@ tpi_optcom_buf(queue_t *q, mblk_t *mp, t_scalar_t *opt_lenp,
error = (*setfn)(q, optset_context, opt->level, opt->name,
opt->len - (t_uscalar_t)sizeof (struct T_opthdr),
_TPI_TOPT_DATA(opt), &olen, _TPI_TOPT_DATA(opt),
- thisdg_attrs, cr, NULL);
+ thisdg_attrs, cr);
if (olen > (int)(opt->len - sizeof (struct T_opthdr))) {
/*
@@ -2113,8 +1873,12 @@ opt_bloated_maxsize(opdes_t *optd)
return (B_FALSE);
}
+/*
+ * optlen is the length of the option content (opt->len minus the header);
+ * the caller must first verify opt->len >= sizeof (struct T_opthdr).
+ */
static boolean_t
-opt_length_ok(opdes_t *optd, struct T_opthdr *opt)
+opt_length_ok(opdes_t *optd, t_uscalar_t optlen)
{
/*
* Verify length.
@@ -2122,95 +1886,60 @@ opt_length_ok(opdes_t *optd, struct T_opthdr *opt)
* less than maxlen of variable length option.
*/
if (optd->opdes_props & OP_VARLEN) {
- if (opt->len <= optd->opdes_size +
- (t_uscalar_t)sizeof (struct T_opthdr))
+ if (optlen <= optd->opdes_size)
return (B_TRUE);
} else {
/* fixed length option */
- if (opt->len == optd->opdes_size +
- (t_uscalar_t)sizeof (struct T_opthdr))
+ if (optlen == optd->opdes_size)
return (B_TRUE);
}
return (B_FALSE);
}
/*
- * This routine appends a pssed in hop-by-hop option to the existing
- * option (in this case a cipso label encoded in HOPOPT option). The
- * passed in option is always padded. The 'reservelen' is the
- * length of reserved data (label). New memory will be allocated if
- * the current buffer is not large enough. Return failure if memory
+ * This routine manages the allocation and free of the space for
+ * an extension header or option. Returns failure if memory
* can not be allocated.
*/
int
-optcom_pkt_set(uchar_t *invalp, uint_t inlen, boolean_t sticky,
- uchar_t **optbufp, uint_t *optlenp, uint_t reservelen)
+optcom_pkt_set(uchar_t *invalp, uint_t inlen,
+ uchar_t **optbufp, uint_t *optlenp)
{
uchar_t *optbuf;
uchar_t *optp;
- if (!sticky) {
- *optbufp = invalp;
- *optlenp = inlen;
- return (0);
- }
-
- if (inlen == *optlenp - reservelen) {
+ if (inlen == *optlenp) {
/* Unchanged length - no need to reallocate */
- optp = *optbufp + reservelen;
+ optp = *optbufp;
bcopy(invalp, optp, inlen);
- if (reservelen != 0) {
- /*
- * Convert the NextHeader and Length of the
- * passed in hop-by-hop header to pads
- */
- optp[0] = IP6OPT_PADN;
- optp[1] = 0;
- }
return (0);
}
- if (inlen + reservelen > 0) {
+ if (inlen > 0) {
/* Allocate new buffer before free */
- optbuf = kmem_alloc(inlen + reservelen, KM_NOSLEEP);
+ optbuf = kmem_alloc(inlen, KM_NOSLEEP);
if (optbuf == NULL)
return (ENOMEM);
} else {
optbuf = NULL;
}
- /* Copy out old reserved data (label) */
- if (reservelen > 0)
- bcopy(*optbufp, optbuf, reservelen);
-
/* Free old buffer */
if (*optlenp != 0)
kmem_free(*optbufp, *optlenp);
if (inlen > 0)
- bcopy(invalp, optbuf + reservelen, inlen);
+ bcopy(invalp, optbuf, inlen);
- if (reservelen != 0) {
- /*
- * Convert the NextHeader and Length of the
- * passed in hop-by-hop header to pads
- */
- optbuf[reservelen] = IP6OPT_PADN;
- optbuf[reservelen + 1] = 0;
- /*
- * Set the Length of the hop-by-hop header, number of 8
- * byte-words following the 1st 8 bytes
- */
- optbuf[1] = (reservelen + inlen - 1) >> 3;
- }
*optbufp = optbuf;
- *optlenp = inlen + reservelen;
+ *optlenp = inlen;
return (0);
}
int
process_auxiliary_options(conn_t *connp, void *control, t_uscalar_t controllen,
- void *optbuf, optdb_obj_t *dbobjp, int (*opt_set_fn)(conn_t *, uint_t, int,
- int, uint_t, uchar_t *, uint_t *, uchar_t *, void *, cred_t *), cred_t *cr)
+ void *optbuf, optdb_obj_t *dbobjp, int (*opt_set_fn)(conn_t *,
+ uint_t, int, int, uint_t, uchar_t *, uint_t *, uchar_t *, void *, cred_t *),
+ cred_t *cr)
{
struct cmsghdr *cmsg;
opdes_t *optd;
@@ -2254,7 +1983,7 @@ process_auxiliary_options(conn_t *connp, void *control, t_uscalar_t controllen,
}
error = opt_set_fn(connp, SETFN_UD_NEGOTIATE, optd->opdes_level,
optd->opdes_name, len, (uchar_t *)CMSG_CONTENT(cmsg),
- &outlen, (uchar_t *)CMSG_CONTENT(cmsg), (void *)optbuf, cr);
+ &outlen, (uchar_t *)CMSG_CONTENT(cmsg), optbuf, cr);
if (error > 0) {
return (error);
} else if (outlen > len) {
diff --git a/usr/src/uts/common/inet/optcom.h b/usr/src/uts/common/inet/optcom.h
index df4f227e95..01ca52a759 100644
--- a/usr/src/uts/common/inet/optcom.h
+++ b/usr/src/uts/common/inet/optcom.h
@@ -34,6 +34,7 @@ extern "C" {
#if defined(_KERNEL) && defined(__STDC__)
#include <inet/ipclassifier.h>
+
/* Options Description Structure */
typedef struct opdes_s {
t_uscalar_t opdes_name; /* option name */
@@ -138,20 +139,15 @@ typedef struct opdes_s {
#define OA_NO_PERMISSION(x, c) (OA_MATCHED_PRIV((x), (c)) ? \
((x)->opdes_access_priv == 0) : ((x)->opdes_access_nopriv == 0))
-#define PASS_OPT_TO_IP(connp) \
- if (IPCL_IS_NONSTR(connp)) \
- return (-EINVAL)
-
/*
* Other properties set in opdes_props field.
*/
-#define OP_PASSNEXT 0x1 /* to pass option to next module or not */
-#define OP_VARLEN 0x2 /* option is varible length */
-#define OP_NOT_ABSREQ 0x4 /* option is not a "absolute requirement" */
+#define OP_VARLEN 0x1 /* option is varible length */
+#define OP_NOT_ABSREQ 0x2 /* option is not a "absolute requirement" */
/* i.e. failure to negotiate does not */
/* abort primitive ("ignore" semantics ok) */
-#define OP_NODEFAULT 0x8 /* no concept of "default value" */
-#define OP_DEF_FN 0x10 /* call a "default function" to get default */
+#define OP_NODEFAULT 0x4 /* no concept of "default value" */
+#define OP_DEF_FN 0x8 /* call a "default function" to get default */
/* value, not from static table */
@@ -165,13 +161,12 @@ typedef t_uscalar_t optlevel_t;
typedef int (*opt_def_fn)(queue_t *, int, int, uchar_t *);
typedef int (*opt_get_fn)(queue_t *, int, int, uchar_t *);
typedef int (*opt_set_fn)(queue_t *, uint_t, int, int, uint_t, uchar_t *,
- uint_t *, uchar_t *, void *, cred_t *, mblk_t *);
+ uint_t *, uchar_t *, void *, cred_t *);
typedef struct optdb_obj {
opt_def_fn odb_deffn; /* default value function */
opt_get_fn odb_getfn; /* get function */
opt_set_fn odb_setfn; /* set function */
- boolean_t odb_topmost_tpiprovider; /* whether topmost tpi */
/* provider or downstream */
uint_t odb_opt_arr_cnt; /* count of number of options in db */
opdes_t *odb_opt_des_arr; /* option descriptors in db */
@@ -182,22 +177,6 @@ typedef struct optdb_obj {
} optdb_obj_t;
/*
- * This is used to restart option processing. This goes inside an M_CTL
- * which is prepended to the packet. IP may need to become exclusive on
- * an ill for setting some options. For dg. IP_ADD_MEMBERSHIP. Since
- * there can be more than 1 option packed in an option buffer, we need to
- * remember where to restart option processing after resuming from a wait
- * for exclusive condition in IP.
- */
-typedef struct opt_restart_s {
- struct opthdr *or_start; /* start of option buffer */
- struct opthdr *or_end; /* end of option buffer */
- struct opthdr *or_ropt; /* restart option here */
- t_uscalar_t or_worst_status; /* Used by tpi_optcom_req */
- t_uscalar_t or_type; /* svr4 or tpi optcom variant */
- int or_private; /* currently used by CGTP */
-} opt_restart_t;
-/*
* Values for "optset_context" parameter passed to
* transport specific "setfn()" routines
*/
@@ -210,16 +189,12 @@ typedef struct opt_restart_s {
* Function prototypes
*/
extern void optcom_err_ack(queue_t *, mblk_t *, t_scalar_t, int);
-extern int svr4_optcom_req(queue_t *, mblk_t *, cred_t *, optdb_obj_t *,
- boolean_t);
-extern int tpi_optcom_req(queue_t *, mblk_t *, cred_t *, optdb_obj_t *,
- boolean_t);
+extern void svr4_optcom_req(queue_t *, mblk_t *, cred_t *, optdb_obj_t *);
+extern void tpi_optcom_req(queue_t *, mblk_t *, cred_t *, optdb_obj_t *);
extern int tpi_optcom_buf(queue_t *, mblk_t *, t_scalar_t *, t_scalar_t,
cred_t *, optdb_obj_t *, void *, int *);
extern t_uscalar_t optcom_max_optsize(opdes_t *, uint_t);
-extern int optcom_pkt_set(uchar_t *, uint_t, boolean_t, uchar_t **, uint_t *,
- uint_t);
-
+extern int optcom_pkt_set(uchar_t *, uint_t, uchar_t **, uint_t *);
extern int process_auxiliary_options(conn_t *, void *, t_uscalar_t,
void *, optdb_obj_t *, int (*)(conn_t *, uint_t, int, int, uint_t,
uchar_t *, uint_t *, uchar_t *, void *, cred_t *), cred_t *);
diff --git a/usr/src/uts/common/inet/proto_set.c b/usr/src/uts/common/inet/proto_set.c
index 45f07d2ed3..499f046f6d 100644
--- a/usr/src/uts/common/inet/proto_set.c
+++ b/usr/src/uts/common/inet/proto_set.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -348,27 +348,21 @@ proto_opt_lookup(t_uscalar_t level, t_uscalar_t name, opdes_t *opt_arr,
/*
* Do a lookup of the options in the array and do permission and length checking
* Returns zero if there is no error (note: for non-tpi-providers not being able
- * to find the option is not an error). TPI errors are returned as -ve.
+ * to find the option is not an error). TPI errors are returned as negative
+ * numbers and errnos as positive numbers.
+ * If max_len is set we update it based on the max length of the option.
*/
int
proto_opt_check(int level, int name, int len, t_uscalar_t *max_len,
- opdes_t *opt_arr, uint_t opt_arr_cnt, boolean_t topmost_tpiprovider,
- boolean_t negotiate, boolean_t check, cred_t *cr)
+ opdes_t *opt_arr, uint_t opt_arr_cnt, boolean_t negotiate, boolean_t check,
+ cred_t *cr)
{
opdes_t *optd;
/* Find the option in the opt_arr. */
- if ((optd = proto_opt_lookup(level, name, opt_arr, opt_arr_cnt)) ==
- NULL) {
- /*
- * Not found, that is a bad thing if
- * the caller is a tpi provider
- */
- if (topmost_tpiprovider)
- return (-TBADOPT);
- else
- return (0); /* skip unmodified */
- }
+ optd = proto_opt_lookup(level, name, opt_arr, opt_arr_cnt);
+ if (optd == NULL)
+ return (-TBADOPT);
/* Additional checks dependent on operation. */
if (negotiate) {
@@ -409,15 +403,12 @@ proto_opt_check(int level, int name, int len, t_uscalar_t *max_len,
return (-TBADOPT);
}
/*
- * XXX Change the comments.
- *
* XXX Since T_CURRENT was not there in TLI and the
* official TLI inspired TPI standard, getsockopt()
* API uses T_CHECK (for T_CURRENT semantics)
- * The following fallthru makes sense because of its
- * historical use as semantic equivalent to T_CURRENT.
+ * Thus T_CHECK includes the T_CURRENT semantics due to that
+ * historical use.
*/
- /* FALLTHRU */
if (!OA_READ_PERMISSION(optd, cr)) {
/* can't read option value */
if (!(OA_MATCHED_PRIV(optd, cr)) &&
diff --git a/usr/src/uts/common/inet/proto_set.h b/usr/src/uts/common/inet/proto_set.h
index 8e714c7c05..488cf4d478 100644
--- a/usr/src/uts/common/inet/proto_set.h
+++ b/usr/src/uts/common/inet/proto_set.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -48,7 +48,7 @@ extern int proto_tlitosyserr(int);
extern int proto_verify_ip_addr(int, const struct sockaddr *, socklen_t);
extern int proto_opt_check(int, int, int, t_uscalar_t *, opdes_t *,
- uint_t, boolean_t, boolean_t, boolean_t, cred_t *);
+ uint_t, boolean_t, boolean_t, cred_t *);
extern opdes_t *proto_opt_lookup(t_uscalar_t, t_uscalar_t, opdes_t *, uint_t);
#ifdef __cplusplus
diff --git a/usr/src/uts/common/inet/rawip_impl.h b/usr/src/uts/common/inet/rawip_impl.h
index 5635bb0f01..348c4f5239 100644
--- a/usr/src/uts/common/inet/rawip_impl.h
+++ b/usr/src/uts/common/inet/rawip_impl.h
@@ -69,87 +69,25 @@ typedef struct icmp_stack icmp_stack_t;
/* Internal icmp control structure, one per open stream */
typedef struct icmp_s {
- krwlock_t icmp_rwlock; /* Protects most of icmp_t */
- t_scalar_t icmp_pending_op; /* The current TPI operation */
/*
- * Following fields up to icmp_ipversion protected by conn_lock.
+ * The addresses and ports in the conn_t and icmp_state are protected by
+ * conn_lock. conn_lock also protects the content of icmp_t.
*/
uint_t icmp_state; /* TPI state */
- in6_addr_t icmp_v6src; /* Source address of this stream */
- in6_addr_t icmp_bound_v6src; /* Explicitely bound to address */
- sin6_t icmp_v6dst; /* Connected destination */
- /*
- * IP format that packets transmitted from this struct should use.
- * Value can be IP4_VERSION or IPV6_VERSION.
- */
- uchar_t icmp_ipversion;
-
- /* Written to only once at the time of opening the endpoint */
- sa_family_t icmp_family; /* Family from socket() call */
-
- /* Following protected by icmp_rwlock */
- uint32_t icmp_max_hdr_len; /* For write offset in stream head */
- uint_t icmp_proto;
- uint_t icmp_ip_snd_options_len; /* Len of IPv4 options */
- uint8_t *icmp_ip_snd_options; /* Ptr to IPv4 options */
- uint8_t icmp_multicast_ttl; /* IP*_MULTICAST_TTL/HOPS */
- ipaddr_t icmp_multicast_if_addr; /* IP_MULTICAST_IF option */
- uint_t icmp_multicast_if_index; /* IPV6_MULTICAST_IF option */
- int icmp_bound_if; /* IP*_BOUND_IF option */
/* Written to only once at the time of opening the endpoint */
conn_t *icmp_connp;
- /* Following protected by icmp_rwlock */
uint_t
- icmp_debug : 1, /* SO_DEBUG "socket" option. */
- icmp_dontroute : 1, /* SO_DONTROUTE "socket" option. */
- icmp_broadcast : 1, /* SO_BROADCAST "socket" option. */
- icmp_reuseaddr : 1, /* SO_REUSEADDR "socket" option. */
-
- icmp_useloopback : 1, /* SO_USELOOPBACK "socket" option. */
icmp_hdrincl : 1, /* IP_HDRINCL option + RAW and IGMP */
- icmp_dgram_errind : 1, /* SO_DGRAM_ERRIND option */
- icmp_unspec_source : 1, /* IP*_UNSPEC_SRC option */
- icmp_raw_checksum : 1, /* raw checksum per IPV6_CHECKSUM */
- icmp_no_tp_cksum : 1, /* icmp_proto is UDP or TCP */
- icmp_ip_recvpktinfo : 1, /* IPV[4,6]_RECVPKTINFO option */
- icmp_ipv6_recvhoplimit : 1, /* IPV6_RECVHOPLIMIT option */
+ icmp_pad_to_bit_31: 31;
- icmp_ipv6_recvhopopts : 1, /* IPV6_RECVHOPOPTS option */
- icmp_ipv6_recvdstopts : 1, /* IPV6_RECVDSTOPTS option */
- icmp_ipv6_recvrthdr : 1, /* IPV6_RECVRTHDR option */
- icmp_ipv6_recvpathmtu : 1, /* IPV6_RECVPATHMTU option */
-
- icmp_recvif:1, /* IP_RECVIF for raw sockets option */
- icmp_ipv6_recvtclass : 1, /* IPV6_RECVTCLASS option */
- icmp_ipv6_recvrtdstopts : 1, /* Obsolete IPV6_RECVRTHDRDSTOPTS */
- icmp_old_ipv6_recvdstopts : 1, /* Old ver of IPV6_RECVDSTOPTS */
-
- icmp_timestamp : 1, /* SO_TIMESTAMP "socket" option */
-
- icmp_pad_to_bit_31: 11;
-
- uint8_t icmp_type_of_service;
- uint8_t icmp_ttl; /* TTL or hoplimit */
- uint32_t icmp_checksum_off; /* user supplied checksum offset */
icmp6_filter_t *icmp_filter; /* ICMP6_FILTER option */
- ip6_pkt_t icmp_sticky_ipp; /* Sticky options */
- uint8_t *icmp_sticky_hdrs; /* Prebuilt IPv6 hdrs */
- uint_t icmp_sticky_hdrs_len; /* Incl. ip6h and any ip6i */
- zoneid_t icmp_zoneid; /* ID of owning zone */
- uint_t icmp_label_len; /* length of security label */
- uint_t icmp_label_len_v6; /* sec. part of sticky opt */
- in6_addr_t icmp_v6lastdst; /* most recent destination */
- cred_t *icmp_last_cred; /* most recent credentials */
- cred_t *icmp_effective_cred; /* cred with effective label */
+ /* Set at open time and never changed */
icmp_stack_t *icmp_is; /* Stack instance */
- size_t icmp_xmit_hiwat;
- size_t icmp_xmit_lowat;
- size_t icmp_recv_hiwat;
- size_t icmp_recv_lowat;
+
int icmp_delayed_error;
kmutex_t icmp_recv_lock;
mblk_t *icmp_fallback_queue_head;
@@ -165,6 +103,10 @@ typedef struct icmp_s {
extern optdb_obj_t icmp_opt_obj;
extern uint_t icmp_max_optsize;
+extern int icmp_opt_default(queue_t *, t_scalar_t, t_scalar_t, uchar_t *);
+extern int icmp_tpi_opt_get(queue_t *, t_scalar_t, t_scalar_t, uchar_t *);
+extern int icmp_tpi_opt_set(queue_t *, uint_t, int, int, uint_t, uchar_t *,
+ uint_t *, uchar_t *, void *, cred_t *);
extern mblk_t *icmp_snmp_get(queue_t *q, mblk_t *mpctl);
extern void icmp_ddi_g_init(void);
diff --git a/usr/src/uts/common/inet/rts_impl.h b/usr/src/uts/common/inet/rts_impl.h
index de7cd8970b..b2b9080e9e 100644
--- a/usr/src/uts/common/inet/rts_impl.h
+++ b/usr/src/uts/common/inet/rts_impl.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -71,13 +71,7 @@ typedef struct rts_s {
uint_t rts_state; /* Provider interface state */
uint_t rts_error; /* Routing socket error code */
uint_t rts_flag; /* Pending I/O state */
- uint_t rts_proto; /* SO_PROTOTYPE "socket" option. */
- uint_t rts_debug : 1, /* SO_DEBUG "socket" option. */
- rts_dontroute : 1, /* SO_DONTROUTE "socket" option. */
- rts_broadcast : 1, /* SO_BROADCAST "socket" option. */
- rts_reuseaddr : 1, /* SO_REUSEADDR "socket" option. */
- rts_useloopback : 1, /* SO_USELOOPBACK "socket" option. */
- rts_multicast_loop : 1, /* IP_MULTICAST_LOOP option */
+ uint_t
rts_hdrincl : 1, /* IP_HDRINCL option + RAW and IGMP */
: 0;
@@ -86,30 +80,16 @@ typedef struct rts_s {
/* Written to only once at the time of opening the endpoint */
conn_t *rts_connp;
- /* Outbound flow control */
- size_t rts_xmit_hiwat;
- size_t rts_xmit_lowat;
-
- /* Inbound flow control */
- size_t rts_recv_hiwat;
- size_t rts_recv_lowat;
-
- kmutex_t rts_send_mutex;
- kmutex_t rts_recv_mutex;
- kcondvar_t rts_send_cv;
- kcondvar_t rts_io_cv;
+ kmutex_t rts_recv_mutex; /* For recv flow control */
} rts_t;
#define RTS_WPUT_PENDING 0x1 /* Waiting for write-side to complete */
-#define RTS_REQ_PENDING 0x1 /* For direct sockets */
#define RTS_WRW_PENDING 0x2 /* Routing socket write in progress */
-#define RTS_REQ_INPROG 0x2 /* For direct sockets */
/*
* Object to represent database of options to search passed to
* {sock,tpi}optcom_req() interface routine to take care of option
* management and associated methods.
- * XXX. These and other externs should really move to a rts header.
*/
extern optdb_obj_t rts_opt_obj;
extern uint_t rts_max_optsize;
@@ -119,7 +99,7 @@ extern void rts_ddi_g_destroy(void);
extern int rts_tpi_opt_get(queue_t *, t_scalar_t, t_scalar_t, uchar_t *);
extern int rts_tpi_opt_set(queue_t *, uint_t, int, int, uint_t, uchar_t *,
- uint_t *, uchar_t *, void *, cred_t *, mblk_t *);
+ uint_t *, uchar_t *, void *, cred_t *);
extern int rts_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name,
uchar_t *ptr);
diff --git a/usr/src/uts/common/inet/sadb.h b/usr/src/uts/common/inet/sadb.h
index 6d3b9b5b27..7a45a41b85 100644
--- a/usr/src/uts/common/inet/sadb.h
+++ b/usr/src/uts/common/inet/sadb.h
@@ -37,14 +37,34 @@ extern "C" {
#define IPSA_MAX_ADDRLEN 4 /* Max address len. (in 32-bits) for an SA. */
-/*
- * Return codes of IPsec processing functions.
- */
-typedef enum {
- IPSEC_STATUS_SUCCESS = 1,
- IPSEC_STATUS_FAILED = 2,
- IPSEC_STATUS_PENDING = 3
-} ipsec_status_t;
+#define MAXSALTSIZE 8
+
+/*
+ * For combined mode ciphers, store the crypto_mechanism_t in the
+ * per-packet ipsec_in_t/ipsec_out_t structures. This is because the PARAMS
+ * and nonce values change for each packet. For non-combined mode
+ * ciphers, these values are constant for the life of the SA.
+ */
+typedef struct ipsa_cm_mech_s {
+ crypto_mechanism_t combined_mech;
+ union {
+ CK_AES_CCM_PARAMS paramu_ccm;
+ CK_AES_GCM_PARAMS paramu_gcm;
+ } paramu;
+ uint8_t nonce[MAXSALTSIZE + sizeof (uint64_t)];
+#define param_ulMACSize paramu.paramu_ccm.ulMACSize
+#define param_ulNonceSize paramu.paramu_ccm.ipsa_ulNonceSize
+#define param_ulAuthDataSize paramu.paramu_ccm.ipsa_ulAuthDataSize
+#define param_ulDataSize paramu.paramu_ccm.ipsa_ulDataSize
+#define param_nonce paramu.paramu_ccm.nonce
+#define param_authData paramu.paramu_ccm.authData
+#define param_pIv paramu.paramu_gcm.ipsa_pIv
+#define param_ulIvLen paramu.paramu_gcm.ulIvLen
+#define param_ulIvBits paramu.paramu_gcm.ulIvBits
+#define param_pAAD paramu.paramu_gcm.pAAD
+#define param_ulAADLen paramu.paramu_gcm.ulAADLen
+#define param_ulTagBits paramu.paramu_gcm.ulTagBits
+} ipsa_cm_mech_t;
/*
* The Initialization Vector (also known as IV or Nonce) used to
@@ -280,9 +300,13 @@ typedef struct ipsa_s {
/*
* Input and output processing functions called from IP.
+ * The mblk_t is the data; the IPsec information is in the attributes
+ * Returns NULL if the mblk is consumed which it is if there was
+ * a failure or if pending. If failure then
+ * the ipIfInDiscards/OutDiscards counters are increased.
*/
- ipsec_status_t (*ipsa_output_func)(mblk_t *);
- ipsec_status_t (*ipsa_input_func)(mblk_t *, void *);
+ mblk_t *(*ipsa_output_func)(mblk_t *, ip_xmit_attr_t *);
+ mblk_t *(*ipsa_input_func)(mblk_t *, void *, ip_recv_attr_t *);
/*
* Soft reference to paired SA
@@ -290,8 +314,8 @@ typedef struct ipsa_s {
uint32_t ipsa_otherspi;
netstack_t *ipsa_netstack; /* Does not have a netstack_hold */
- cred_t *ipsa_cred; /* MLS: cred_t attributes */
- cred_t *ipsa_ocred; /* MLS: outer label */
+ ts_label_t *ipsa_tsl; /* MLS: label attributes */
+ ts_label_t *ipsa_otsl; /* MLS: outer label */
uint8_t ipsa_mac_exempt; /* MLS: mac exempt flag */
uchar_t ipsa_opt_storage[IP_MAX_OPT_LENGTH];
} ipsa_t;
@@ -382,7 +406,7 @@ typedef struct ipsa_s {
#define IPSA_F_EALG1 SADB_X_SAFLAGS_EALG1 /* Encrypt alg flag 1 */
#define IPSA_F_EALG2 SADB_X_SAFLAGS_EALG2 /* Encrypt alg flag 2 */
-#define IPSA_F_HW 0x200000 /* hwaccel capable SA */
+#define IPSA_F_ASYNC 0x200000 /* Call KCF asynchronously? */
#define IPSA_F_NATT_LOC SADB_X_SAFLAGS_NATT_LOC
#define IPSA_F_NATT_REM SADB_X_SAFLAGS_NATT_REM
#define IPSA_F_BEHIND_NAT SADB_X_SAFLAGS_NATTED
@@ -503,8 +527,8 @@ typedef struct ipsacq_s {
uint8_t ipsacq_icmp_type;
uint8_t ipsacq_icmp_code;
- /* credentials associated with triggering packet */
- cred_t *ipsacq_cred;
+ /* label associated with triggering packet */
+ ts_label_t *ipsacq_tsl;
} ipsacq_t;
/*
@@ -529,7 +553,7 @@ typedef struct iacqf_s {
* A (network protocol, ipsec protocol) specific SADB.
* (i.e., one each for {ah, esp} and {v4, v6}.
*
- * Keep outbound assocs about the same as ire_cache entries for now.
+ * Keep outbound assocs in a simple hash table for now.
* One danger point, multiple SAs for a single dest will clog a bucket.
* For the future, consider two-level hashing (2nd hash on IPC?), then probe.
*/
@@ -550,7 +574,6 @@ typedef struct sadb_s
typedef struct sadbp_s
{
uint32_t s_satype;
- queue_t *s_ip_q;
uint32_t *s_acquire_timeout;
void (*s_acqfn)(ipsacq_t *, mblk_t *, netstack_t *);
sadb_t s_v4;
@@ -583,14 +606,16 @@ typedef struct templist_s
#define ALL_ZEROES_PTR ((uint32_t *)&ipv6_all_zeros)
/*
- * Form unique id from ipsec_out_t
+ * Form unique id from ip_xmit_attr_t.
*/
-
-#define SA_FORM_UNIQUE_ID(io) \
- SA_UNIQUE_ID((io)->ipsec_out_src_port, (io)->ipsec_out_dst_port, \
- ((io)->ipsec_out_tunnel ? ((io)->ipsec_out_inaf == AF_INET6 ? \
- IPPROTO_IPV6 : IPPROTO_ENCAP) : (io)->ipsec_out_proto), \
- ((io)->ipsec_out_tunnel ? (io)->ipsec_out_proto : 0))
+#define SA_FORM_UNIQUE_ID(ixa) \
+ SA_UNIQUE_ID((ixa)->ixa_ipsec_src_port, (ixa)->ixa_ipsec_dst_port, \
+ (((ixa)->ixa_flags & IXAF_IPSEC_TUNNEL) ? \
+ ((ixa)->ixa_ipsec_inaf == AF_INET6 ? \
+ IPPROTO_IPV6 : IPPROTO_ENCAP) : \
+ (ixa)->ixa_ipsec_proto), \
+ (((ixa)->ixa_flags & IXAF_IPSEC_TUNNEL) ? \
+ (ixa)->ixa_ipsec_proto : 0))
/*
* This macro is used to generate unique ids (along with the addresses, both
@@ -698,8 +723,8 @@ boolean_t sadb_match_query(ipsa_query_t *q, ipsa_t *sa);
/* SA retrieval (inbound and outbound) */
ipsa_t *ipsec_getassocbyspi(isaf_t *, uint32_t, uint32_t *, uint32_t *,
sa_family_t);
-ipsa_t *ipsec_getassocbyconn(isaf_t *, ipsec_out_t *, uint32_t *, uint32_t *,
- sa_family_t, uint8_t, cred_t *);
+ipsa_t *ipsec_getassocbyconn(isaf_t *, ip_xmit_attr_t *, uint32_t *, uint32_t *,
+ sa_family_t, uint8_t, ts_label_t *);
/* SA insertion. */
int sadb_insertassoc(ipsa_t *, isaf_t *);
@@ -727,9 +752,9 @@ boolean_t sadb_addrfix(keysock_in_t *, queue_t *, mblk_t *, netstack_t *);
int sadb_addrset(ire_t *);
int sadb_delget_sa(mblk_t *, keysock_in_t *, sadbp_t *, int *, queue_t *,
uint8_t);
-int sadb_purge_sa(mblk_t *, keysock_in_t *, sadb_t *, int *, queue_t *,
- queue_t *);
-int sadb_common_add(queue_t *, queue_t *, mblk_t *, sadb_msg_t *,
+
+int sadb_purge_sa(mblk_t *, keysock_in_t *, sadb_t *, int *, queue_t *);
+int sadb_common_add(queue_t *, mblk_t *, sadb_msg_t *,
keysock_in_t *, isaf_t *, isaf_t *, ipsa_t *, boolean_t, boolean_t, int *,
netstack_t *, sadbp_t *);
void sadb_set_usetime(ipsa_t *);
@@ -737,7 +762,7 @@ boolean_t sadb_age_bytes(queue_t *, ipsa_t *, uint64_t, boolean_t);
int sadb_update_sa(mblk_t *, keysock_in_t *, mblk_t **, sadbp_t *,
int *, queue_t *, int (*)(mblk_t *, keysock_in_t *, int *, netstack_t *),
netstack_t *, uint8_t);
-void sadb_acquire(mblk_t *, ipsec_out_t *, boolean_t, boolean_t);
+void sadb_acquire(mblk_t *, ip_xmit_attr_t *, boolean_t, boolean_t);
void gcm_params_init(ipsa_t *, uchar_t *, uint_t, uchar_t *, ipsa_cm_mech_t *,
crypto_data_t *);
void ccm_params_init(ipsa_t *, uchar_t *, uint_t, uchar_t *, ipsa_cm_mech_t *,
@@ -754,16 +779,17 @@ boolean_t sadb_replay_check(ipsa_t *, uint32_t);
boolean_t sadb_replay_peek(ipsa_t *, uint32_t);
int sadb_dump(queue_t *, mblk_t *, keysock_in_t *, sadb_t *);
void sadb_replay_delete(ipsa_t *);
-void sadb_ager(sadb_t *, queue_t *, queue_t *, int, netstack_t *);
+void sadb_ager(sadb_t *, queue_t *, int, netstack_t *);
timeout_id_t sadb_retimeout(hrtime_t, queue_t *, void (*)(void *), void *,
uint_t *, uint_t, short);
void sadb_sa_refrele(void *target);
-boolean_t sadb_set_lpkt(ipsa_t *, mblk_t *, netstack_t *);
+boolean_t sadb_set_lpkt(ipsa_t *, mblk_t *, ip_recv_attr_t *);
mblk_t *sadb_clear_lpkt(ipsa_t *);
-void sadb_buf_pkt(ipsa_t *, mblk_t *, netstack_t *);
+void sadb_buf_pkt(ipsa_t *, mblk_t *, ip_recv_attr_t *);
void sadb_clear_buf_pkt(void *ipkt);
+/* Note that buf_pkt is the product of ip_recv_attr_to_mblk() */
#define HANDLE_BUF_PKT(taskq, stack, dropper, buf_pkt) \
{ \
if (buf_pkt != NULL) { \
@@ -774,8 +800,9 @@ void sadb_clear_buf_pkt(void *ipkt);
while (buf_pkt != NULL) { \
tmp = buf_pkt->b_next; \
buf_pkt->b_next = NULL; \
+ buf_pkt = ip_recv_attr_free_mblk(buf_pkt); \
ip_drop_packet(buf_pkt, B_TRUE, NULL, \
- NULL, DROPPER(stack, \
+ DROPPER(stack, \
ipds_sadb_inidle_timeout), \
&dropper); \
buf_pkt = tmp; \
@@ -785,24 +812,8 @@ void sadb_clear_buf_pkt(void *ipkt);
} \
/*
- * Hw accel-related calls (downloading sadb to driver)
- */
-void sadb_ill_download(ill_t *, uint_t);
-mblk_t *sadb_fmt_sa_req(uint_t, uint_t, ipsa_t *, boolean_t);
-/*
- * Sub-set of the IPsec hardware acceleration capabilities functions
- * implemented by ip_if.c
- */
-extern boolean_t ipsec_capab_match(ill_t *, uint_t, boolean_t, ipsa_t *,
- netstack_t *);
-extern void ill_ipsec_capab_send_all(uint_t, mblk_t *, ipsa_t *,
- netstack_t *);
-
-
-/*
- * One IPsec -> IP linking routine, and two IPsec rate-limiting routines.
+ * Two IPsec rate-limiting routines.
*/
-extern boolean_t sadb_t_bind_req(queue_t *, int);
/*PRINTFLIKE6*/
extern void ipsec_rl_strlog(netstack_t *, short, short, char,
ushort_t, char *, ...)
@@ -818,7 +829,8 @@ extern void ipsec_assocfailure(short, short, char, ushort_t, char *, uint32_t,
typedef enum ipsec_algtype {
IPSEC_ALG_AUTH = 0,
- IPSEC_ALG_ENCR = 1
+ IPSEC_ALG_ENCR = 1,
+ IPSEC_ALG_ALL = 2
} ipsec_algtype_t;
/*
@@ -886,11 +898,10 @@ extern void ipsec_alg_fix_min_max(ipsec_alginfo_t *, ipsec_algtype_t,
extern void alg_flag_check(ipsec_alginfo_t *);
extern void ipsec_alg_free(ipsec_alginfo_t *);
extern void ipsec_register_prov_update(void);
-extern void sadb_alg_update(ipsec_algtype_t, uint8_t, boolean_t,
- netstack_t *);
+extern void sadb_alg_update(ipsec_algtype_t, uint8_t, boolean_t, netstack_t *);
-extern int sadb_sens_len_from_cred(cred_t *);
-extern void sadb_sens_from_cred(sadb_sens_t *, int, cred_t *, int);
+extern int sadb_sens_len_from_label(ts_label_t *);
+extern void sadb_sens_from_label(sadb_sens_t *, int, ts_label_t *, int);
/*
* Context templates management.
diff --git a/usr/src/uts/common/inet/sctp/sctp.c b/usr/src/uts/common/inet/sctp/sctp.c
index 00fc6cda42..d444e1f10e 100644
--- a/usr/src/uts/common/inet/sctp/sctp.c
+++ b/usr/src/uts/common/inet/sctp/sctp.c
@@ -56,6 +56,8 @@
#include <inet/common.h>
#include <inet/ip.h>
+#include <inet/ip_if.h>
+#include <inet/ip_ire.h>
#include <inet/ip6.h>
#include <inet/mi.h>
#include <inet/mib2.h>
@@ -74,12 +76,6 @@
int sctpdebug;
sin6_t sctp_sin6_null; /* Zero address for quick clears */
-/*
- * Have to ensure that sctp_g_q_close is not done by an
- * interrupt thread.
- */
-static taskq_t *sctp_taskq;
-
static void sctp_closei_local(sctp_t *sctp);
static int sctp_init_values(sctp_t *, sctp_t *, int);
static void sctp_icmp_error_ipv6(sctp_t *sctp, mblk_t *mp);
@@ -91,12 +87,10 @@ static void sctp_conn_cache_fini();
static int sctp_conn_cache_constructor();
static void sctp_conn_cache_destructor();
static void sctp_conn_clear(conn_t *);
-void sctp_g_q_setup(sctp_stack_t *);
-void sctp_g_q_create(sctp_stack_t *);
-void sctp_g_q_destroy(sctp_stack_t *);
+static void sctp_notify(void *, ip_xmit_attr_t *, ixa_notify_type_t,
+ ixa_notify_arg_t);
static void *sctp_stack_init(netstackid_t stackid, netstack_t *ns);
-static void sctp_stack_shutdown(netstackid_t stackid, void *arg);
static void sctp_stack_fini(netstackid_t stackid, void *arg);
/*
@@ -178,8 +172,8 @@ sctp_create_eager(sctp_t *psctp)
{
sctp_t *sctp;
mblk_t *ack_mp, *hb_mp;
- conn_t *connp, *pconnp;
- cred_t *credp;
+ conn_t *connp;
+ cred_t *credp;
sctp_stack_t *sctps = psctp->sctp_sctps;
if ((connp = ipcl_conn_create(IPCL_SCTPCONN, KM_NOSLEEP,
@@ -187,8 +181,6 @@ sctp_create_eager(sctp_t *psctp)
return (NULL);
}
- connp->conn_ulp_labeled = is_system_labeled();
-
sctp = CONN2SCTP(connp);
sctp->sctp_sctps = sctps;
@@ -200,7 +192,6 @@ sctp_create_eager(sctp_t *psctp)
freeb(ack_mp);
sctp_conn_clear(connp);
sctp->sctp_sctps = NULL;
- SCTP_G_Q_REFRELE(sctps);
kmem_cache_free(sctp_conn_cache, connp);
return (NULL);
}
@@ -208,43 +199,20 @@ sctp_create_eager(sctp_t *psctp)
sctp->sctp_ack_mp = ack_mp;
sctp->sctp_heartbeat_mp = hb_mp;
- /* Inherit information from the "parent" */
- sctp->sctp_ipversion = psctp->sctp_ipversion;
- sctp->sctp_family = psctp->sctp_family;
- pconnp = psctp->sctp_connp;
- connp->conn_af_isv6 = pconnp->conn_af_isv6;
- connp->conn_pkt_isv6 = pconnp->conn_pkt_isv6;
- connp->conn_ipv6_v6only = pconnp->conn_ipv6_v6only;
if (sctp_init_values(sctp, psctp, KM_NOSLEEP) != 0) {
freeb(ack_mp);
freeb(hb_mp);
sctp_conn_clear(connp);
sctp->sctp_sctps = NULL;
- SCTP_G_Q_REFRELE(sctps);
kmem_cache_free(sctp_conn_cache, connp);
return (NULL);
}
- /*
- * If the parent is multilevel, then we'll fix up the remote cred
- * when we do sctp_accept_comm.
- */
- if ((credp = pconnp->conn_cred) != NULL) {
+ if ((credp = psctp->sctp_connp->conn_cred) != NULL) {
connp->conn_cred = credp;
crhold(credp);
- /*
- * If the caller has the process-wide flag set, then default to
- * MAC exempt mode. This allows read-down to unlabeled hosts.
- */
- if (getpflags(NET_MAC_AWARE, credp) != 0)
- connp->conn_mac_mode = CONN_MAC_AWARE;
}
- connp->conn_allzones = pconnp->conn_allzones;
- connp->conn_zoneid = pconnp->conn_zoneid;
- sctp->sctp_cpid = psctp->sctp_cpid;
- sctp->sctp_open_time = lbolt64;
-
sctp->sctp_mss = psctp->sctp_mss;
sctp->sctp_detached = B_TRUE;
/*
@@ -263,11 +231,6 @@ void
sctp_clean_death(sctp_t *sctp, int err)
{
ASSERT(sctp != NULL);
- ASSERT((sctp->sctp_family == AF_INET &&
- sctp->sctp_ipversion == IPV4_VERSION) ||
- (sctp->sctp_family == AF_INET6 &&
- (sctp->sctp_ipversion == IPV4_VERSION ||
- sctp->sctp_ipversion == IPV6_VERSION)));
dprint(3, ("sctp_clean_death %p, state %d\n", (void *)sctp,
sctp->sctp_state));
@@ -328,7 +291,8 @@ sctp_clean_death(sctp_t *sctp, int err)
int
sctp_disconnect(sctp_t *sctp)
{
- int error = 0;
+ int error = 0;
+ conn_t *connp = sctp->sctp_connp;
dprint(3, ("sctp_disconnect %p, state %d\n", (void *)sctp,
sctp->sctp_state));
@@ -358,7 +322,7 @@ sctp_disconnect(sctp_t *sctp)
* If SO_LINGER has set a zero linger time, terminate the
* association and send an ABORT.
*/
- if (sctp->sctp_linger && sctp->sctp_lingertime == 0) {
+ if (connp->conn_linger && connp->conn_lingertime == 0) {
sctp_user_abort(sctp, NULL);
WAKE_SCTP(sctp);
return (error);
@@ -382,7 +346,7 @@ sctp_disconnect(sctp_t *sctp)
sctp_send_shutdown(sctp, 0);
/* Pass gathered wisdom to IP for keeping */
- sctp_update_ire(sctp);
+ sctp_update_dce(sctp);
/*
* If lingering on close then wait until the shutdown
@@ -391,21 +355,15 @@ sctp_disconnect(sctp_t *sctp)
* can be called more than once. Make sure that only
* one thread waits.
*/
- if (sctp->sctp_linger && sctp->sctp_lingertime > 0 &&
+ if (connp->conn_linger && connp->conn_lingertime > 0 &&
sctp->sctp_state >= SCTPS_ESTABLISHED &&
!sctp->sctp_lingering) {
clock_t stoptime; /* in ticks */
clock_t ret;
- /*
- * Process the sendq to send the SHUTDOWN out
- * before waiting.
- */
- sctp_process_sendq(sctp);
-
sctp->sctp_lingering = 1;
sctp->sctp_client_errno = 0;
- stoptime = lbolt + sctp->sctp_lingertime;
+ stoptime = lbolt + connp->conn_lingertime * hz;
mutex_enter(&sctp->sctp_lock);
sctp->sctp_running = B_FALSE;
@@ -429,7 +387,6 @@ sctp_disconnect(sctp_t *sctp)
}
WAKE_SCTP(sctp);
- sctp_process_sendq(sctp);
return (error);
}
@@ -493,7 +450,6 @@ static void
sctp_closei_local(sctp_t *sctp)
{
mblk_t *mp;
- ire_t *ire = NULL;
conn_t *connp = sctp->sctp_connp;
/* Sanity check, don't do the same thing twice. */
@@ -516,11 +472,7 @@ sctp_closei_local(sctp_t *sctp)
/* Set the CONN_CLOSING flag so that IP will not cache IRE again. */
mutex_enter(&connp->conn_lock);
connp->conn_state_flags |= CONN_CLOSING;
- ire = connp->conn_ire_cache;
- connp->conn_ire_cache = NULL;
mutex_exit(&connp->conn_lock);
- if (ire != NULL)
- IRE_REFRELE_NOTR(ire);
/* Remove from all hashes. */
sctp_bind_hash_remove(sctp);
@@ -534,14 +486,12 @@ sctp_closei_local(sctp_t *sctp)
*/
mutex_enter(&sctp->sctp_recvq_lock);
while ((mp = sctp->sctp_recvq) != NULL) {
- mblk_t *ipsec_mp;
-
sctp->sctp_recvq = mp->b_next;
mp->b_next = NULL;
- if ((ipsec_mp = mp->b_prev) != NULL) {
- freeb(ipsec_mp);
- mp->b_prev = NULL;
- }
+
+ if (ip_recv_attr_is_mblk(mp))
+ mp = ip_recv_attr_free_mblk(mp);
+
freemsg(mp);
}
mutex_exit(&sctp->sctp_recvq_lock);
@@ -668,7 +618,7 @@ sctp_free(conn_t *connp)
SCTP_UNLINK(sctp, sctps);
ASSERT(connp->conn_ref == 0);
- ASSERT(connp->conn_ulp == IPPROTO_SCTP);
+ ASSERT(connp->conn_proto == IPPROTO_SCTP);
ASSERT(!MUTEX_HELD(&sctp->sctp_reflock));
ASSERT(sctp->sctp_refcnt == 0);
@@ -723,8 +673,6 @@ sctp_free(conn_t *connp)
list_destroy(&sctp->sctp_saddrs[cnt].sctp_ipif_list);
}
- ip6_pkt_free(&sctp->sctp_sticky_ipp);
-
if (sctp->sctp_hopopts != NULL) {
mi_free(sctp->sctp_hopopts);
sctp->sctp_hopopts = NULL;
@@ -737,12 +685,12 @@ sctp_free(conn_t *connp)
sctp->sctp_dstoptslen = 0;
}
ASSERT(sctp->sctp_dstoptslen == 0);
- if (sctp->sctp_rtdstopts != NULL) {
- mi_free(sctp->sctp_rtdstopts);
- sctp->sctp_rtdstopts = NULL;
- sctp->sctp_rtdstoptslen = 0;
+ if (sctp->sctp_rthdrdstopts != NULL) {
+ mi_free(sctp->sctp_rthdrdstopts);
+ sctp->sctp_rthdrdstopts = NULL;
+ sctp->sctp_rthdrdstoptslen = 0;
}
- ASSERT(sctp->sctp_rtdstoptslen == 0);
+ ASSERT(sctp->sctp_rthdrdstoptslen == 0);
if (sctp->sctp_rthdr != NULL) {
mi_free(sctp->sctp_rthdr);
sctp->sctp_rthdr = NULL;
@@ -806,9 +754,7 @@ sctp_free(conn_t *connp)
sctp->sctp_v6label_len = 0;
sctp->sctp_v4label_len = 0;
- /* Every sctp_t holds one reference on the default queue */
sctp->sctp_sctps = NULL;
- SCTP_G_Q_REFRELE(sctps);
sctp_conn_clear(connp);
kmem_cache_free(sctp_conn_cache, connp);
@@ -822,10 +768,12 @@ sctp_display(sctp_t *sctp, char *sup_buf)
char buf1[30];
static char priv_buf[INET6_ADDRSTRLEN * 2 + 80];
char *cp;
+ conn_t *connp;
if (sctp == NULL)
return ("NULL_SCTP");
+ connp = sctp->sctp_connp;
buf = (sup_buf != NULL) ? sup_buf : priv_buf;
switch (sctp->sctp_state) {
@@ -865,7 +813,7 @@ sctp_display(sctp_t *sctp, char *sup_buf)
break;
}
(void) mi_sprintf(buf, "[%u, %u] %s",
- ntohs(sctp->sctp_lport), ntohs(sctp->sctp_fport), cp);
+ ntohs(connp->conn_lport), ntohs(connp->conn_fport), cp);
return (buf);
}
@@ -880,13 +828,9 @@ sctp_init_values(sctp_t *sctp, sctp_t *psctp, int sleep)
int err;
int cnt;
sctp_stack_t *sctps = sctp->sctp_sctps;
- conn_t *connp, *pconnp;
+ conn_t *connp;
- ASSERT((sctp->sctp_family == AF_INET &&
- sctp->sctp_ipversion == IPV4_VERSION) ||
- (sctp->sctp_family == AF_INET6 &&
- (sctp->sctp_ipversion == IPV4_VERSION ||
- sctp->sctp_ipversion == IPV6_VERSION)));
+ connp = sctp->sctp_connp;
sctp->sctp_nsaddrs = 0;
for (cnt = 0; cnt < SCTP_IPIF_HASH; cnt++) {
@@ -895,7 +839,7 @@ sctp_init_values(sctp_t *sctp, sctp_t *psctp, int sleep)
sizeof (sctp_saddr_ipif_t), offsetof(sctp_saddr_ipif_t,
saddr_ipif));
}
- sctp->sctp_ports = 0;
+ connp->conn_ports = 0;
sctp->sctp_running = B_FALSE;
sctp->sctp_state = SCTPS_IDLE;
@@ -925,51 +869,16 @@ sctp_init_values(sctp_t *sctp, sctp_t *psctp, int sleep)
if (psctp != NULL) {
/*
* Inherit from parent
+ *
+ * Start by inheriting from the conn_t, including conn_ixa and
+ * conn_xmit_ipp.
*/
- sctp->sctp_iphc = kmem_zalloc(psctp->sctp_iphc_len, sleep);
- if (sctp->sctp_iphc == NULL) {
- sctp->sctp_iphc_len = 0;
- err = ENOMEM;
- goto failure;
- }
- sctp->sctp_iphc_len = psctp->sctp_iphc_len;
- sctp->sctp_hdr_len = psctp->sctp_hdr_len;
-
- sctp->sctp_iphc6 = kmem_zalloc(psctp->sctp_iphc6_len, sleep);
- if (sctp->sctp_iphc6 == NULL) {
- sctp->sctp_iphc6_len = 0;
- err = ENOMEM;
+ err = conn_inherit_parent(psctp->sctp_connp, connp);
+ if (err != 0)
goto failure;
- }
- sctp->sctp_iphc6_len = psctp->sctp_iphc6_len;
- sctp->sctp_hdr6_len = psctp->sctp_hdr6_len;
-
- sctp->sctp_ip_hdr_len = psctp->sctp_ip_hdr_len;
- sctp->sctp_ip_hdr6_len = psctp->sctp_ip_hdr6_len;
-
- /*
- * Copy the IP+SCTP header templates from listener
- */
- bcopy(psctp->sctp_iphc, sctp->sctp_iphc,
- psctp->sctp_hdr_len);
- sctp->sctp_ipha = (ipha_t *)sctp->sctp_iphc;
- sctp->sctp_sctph = (sctp_hdr_t *)(sctp->sctp_iphc +
- sctp->sctp_ip_hdr_len);
-
- bcopy(psctp->sctp_iphc6, sctp->sctp_iphc6,
- psctp->sctp_hdr6_len);
- if (((ip6i_t *)(sctp->sctp_iphc6))->ip6i_nxt == IPPROTO_RAW) {
- sctp->sctp_ip6h = (ip6_t *)(sctp->sctp_iphc6 +
- sizeof (ip6i_t));
- } else {
- sctp->sctp_ip6h = (ip6_t *)sctp->sctp_iphc6;
- }
- sctp->sctp_sctph6 = (sctp_hdr_t *)(sctp->sctp_iphc6 +
- sctp->sctp_ip_hdr6_len);
sctp->sctp_cookie_lifetime = psctp->sctp_cookie_lifetime;
- sctp->sctp_xmit_lowater = psctp->sctp_xmit_lowater;
- sctp->sctp_xmit_hiwater = psctp->sctp_xmit_hiwater;
+
sctp->sctp_cwnd_max = psctp->sctp_cwnd_max;
sctp->sctp_rwnd = psctp->sctp_rwnd;
sctp->sctp_irwnd = psctp->sctp_rwnd;
@@ -996,43 +905,23 @@ sctp_init_values(sctp_t *sctp, sctp_t *psctp, int sleep)
sctp->sctp_tx_adaptation_code = psctp->sctp_tx_adaptation_code;
/* xxx should be a better way to copy these flags xxx */
- sctp->sctp_debug = psctp->sctp_debug;
sctp->sctp_bound_to_all = psctp->sctp_bound_to_all;
sctp->sctp_cansleep = psctp->sctp_cansleep;
sctp->sctp_send_adaptation = psctp->sctp_send_adaptation;
sctp->sctp_ndelay = psctp->sctp_ndelay;
sctp->sctp_events = psctp->sctp_events;
- sctp->sctp_ipv6_recvancillary = psctp->sctp_ipv6_recvancillary;
-
- /* Copy IP-layer options */
- connp = sctp->sctp_connp;
- pconnp = psctp->sctp_connp;
-
- connp->conn_broadcast = pconnp->conn_broadcast;
- connp->conn_loopback = pconnp->conn_loopback;
- connp->conn_dontroute = pconnp->conn_dontroute;
- connp->conn_reuseaddr = pconnp->conn_reuseaddr;
-
} else {
/*
- * Initialize the header template
- */
- if ((err = sctp_header_init_ipv4(sctp, sleep)) != 0) {
- goto failure;
- }
- if ((err = sctp_header_init_ipv6(sctp, sleep)) != 0) {
- goto failure;
- }
-
- /*
* Set to system defaults
*/
sctp->sctp_cookie_lifetime =
MSEC_TO_TICK(sctps->sctps_cookie_life);
- sctp->sctp_xmit_lowater = sctps->sctps_xmit_lowat;
- sctp->sctp_xmit_hiwater = sctps->sctps_xmit_hiwat;
+ connp->conn_sndlowat = sctps->sctps_xmit_lowat;
+ connp->conn_sndbuf = sctps->sctps_xmit_hiwat;
+ connp->conn_rcvbuf = sctps->sctps_recv_hiwat;
+
sctp->sctp_cwnd_max = sctps->sctps_cwnd_max_;
- sctp->sctp_rwnd = sctps->sctps_recv_hiwat;
+ sctp->sctp_rwnd = connp->conn_rcvbuf;
sctp->sctp_irwnd = sctp->sctp_rwnd;
sctp->sctp_pd_point = sctp->sctp_rwnd;
sctp->sctp_rto_max = MSEC_TO_TICK(sctps->sctps_rto_maxg);
@@ -1049,13 +938,28 @@ sctp_init_values(sctp_t *sctp, sctp_t *psctp, int sleep)
sctp->sctp_hb_interval =
MSEC_TO_TICK(sctps->sctps_heartbeat_interval);
+
+ if (connp->conn_family == AF_INET)
+ connp->conn_default_ttl = sctps->sctps_ipv4_ttl;
+ else
+ connp->conn_default_ttl = sctps->sctps_ipv6_hoplimit;
+
+ connp->conn_xmit_ipp.ipp_unicast_hops =
+ connp->conn_default_ttl;
+
+ /*
+ * Initialize the header template
+ */
+ if ((err = sctp_build_hdrs(sctp, sleep)) != 0) {
+ goto failure;
+ }
}
+
sctp->sctp_understands_asconf = B_TRUE;
sctp->sctp_understands_addip = B_TRUE;
sctp->sctp_prsctp_aware = B_FALSE;
sctp->sctp_connp->conn_ref = 1;
- sctp->sctp_connp->conn_fully_bound = B_FALSE;
sctp->sctp_prsctpdrop = 0;
sctp->sctp_msgcount = 0;
@@ -1063,14 +967,7 @@ sctp_init_values(sctp_t *sctp, sctp_t *psctp, int sleep)
return (0);
failure:
- if (sctp->sctp_iphc != NULL) {
- kmem_free(sctp->sctp_iphc, sctp->sctp_iphc_len);
- sctp->sctp_iphc = NULL;
- }
- if (sctp->sctp_iphc6 != NULL) {
- kmem_free(sctp->sctp_iphc6, sctp->sctp_iphc6_len);
- sctp->sctp_iphc6 = NULL;
- }
+ sctp_headers_free(sctp);
return (err);
}
@@ -1102,8 +999,122 @@ sctp_icmp_verf(sctp_t *sctp, sctp_hdr_t *sh, mblk_t *mp)
}
/*
+ * Update the SCTP state according to change of PMTU.
+ *
+ * Path MTU might have changed by either increase or decrease, so need to
+ * adjust the MSS based on the value of ixa_pmtu.
+ */
+static void
+sctp_update_pmtu(sctp_t *sctp, sctp_faddr_t *fp, boolean_t decrease_only)
+{
+ uint32_t pmtu;
+ int32_t mss;
+ ip_xmit_attr_t *ixa = fp->ixa;
+
+ if (sctp->sctp_state < SCTPS_ESTABLISHED)
+ return;
+
+ /*
+ * Always call ip_get_pmtu() to make sure that IP has updated
+ * ixa_flags properly.
+ */
+ pmtu = ip_get_pmtu(ixa);
+
+ /*
+ * Calculate the MSS by decreasing the PMTU by sctp_hdr_len and
+ * IPsec overhead if applied. Make sure to use the most recent
+ * IPsec information.
+ */
+ mss = pmtu - conn_ipsec_length(sctp->sctp_connp);
+ if (ixa->ixa_flags & IXAF_IS_IPV4)
+ mss -= sctp->sctp_hdr_len;
+ else
+ mss -= sctp->sctp_hdr6_len;
+
+ /*
+ * Nothing to change, so just return.
+ */
+ if (mss == fp->sfa_pmss)
+ return;
+
+ /*
+ * Currently, for ICMP errors, only PMTU decrease is handled.
+ */
+ if (mss > fp->sfa_pmss && decrease_only)
+ return;
+
+#ifdef DEBUG
+ (void) printf("sctp_update_pmtu mss from %d to %d\n",
+ fp->sfa_pmss, mss);
+#endif
+ DTRACE_PROBE2(sctp_update_pmtu, int32_t, fp->sfa_pmss, uint32_t, mss);
+
+ /*
+ * Update ixa_fragsize and ixa_pmtu.
+ */
+ ixa->ixa_fragsize = ixa->ixa_pmtu = pmtu;
+
+ /*
+ * Make sure that sfa_pmss is a multiple of
+ * SCTP_ALIGN.
+ */
+ fp->sfa_pmss = mss & ~(SCTP_ALIGN - 1);
+ fp->pmtu_discovered = 1;
+
+#ifdef notyet
+ if (mss < sctp->sctp_sctps->sctps_mss_min)
+ ixa->ixa_flags |= IXAF_PMTU_TOO_SMALL;
+#endif
+ if (ixa->ixa_flags & IXAF_PMTU_TOO_SMALL)
+ ixa->ixa_flags &= ~(IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF);
+
+ /*
+ * If below the min size then ip_get_pmtu cleared IXAF_PMTU_IPV4_DF.
+ * Make sure to clear IXAF_DONTFRAG, which is used by IP to decide
+ * whether to fragment the packet.
+ */
+ if (ixa->ixa_flags & IXAF_IS_IPV4) {
+ if (!(ixa->ixa_flags & IXAF_PMTU_IPV4_DF)) {
+ fp->df = B_FALSE;
+ if (fp == sctp->sctp_current) {
+ sctp->sctp_ipha->
+ ipha_fragment_offset_and_flags = 0;
+ }
+ }
+ }
+}
+
+/*
+ * Notify function registered with ip_xmit_attr_t. It's called in the context
+ * of conn_ip_output so it's safe to update the SCTP state.
+ * Currently only used for pmtu changes.
+ */
+/* ARGSUSED1 */
+static void
+sctp_notify(void *arg, ip_xmit_attr_t *ixa, ixa_notify_type_t ntype,
+ ixa_notify_arg_t narg)
+{
+ sctp_t *sctp = (sctp_t *)arg;
+ sctp_faddr_t *fp;
+
+ switch (ntype) {
+ case IXAN_PMTU:
+ /* Find the faddr based on the ip_xmit_attr_t pointer */
+ for (fp = sctp->sctp_faddrs; fp != NULL; fp = fp->next) {
+ if (fp->ixa == ixa)
+ break;
+ }
+ if (fp != NULL)
+ sctp_update_pmtu(sctp, fp, B_FALSE);
+ break;
+ default:
+ break;
+ }
+}
+
+/*
* sctp_icmp_error is called by sctp_input() to process ICMP error messages
- * passed up by IP. The queue is the default queue. We need to find a sctp_t
+ * passed up by IP. We need to find a sctp_t
* that corresponds to the returned datagram. Passes the message back in on
* the correct queue once it has located the connection.
* Assumes that IP has pulled up everything up to and including
@@ -1116,8 +1127,6 @@ sctp_icmp_error(sctp_t *sctp, mblk_t *mp)
ipha_t *ipha;
int iph_hdr_length;
sctp_hdr_t *sctph;
- mblk_t *first_mp;
- uint32_t new_mtu;
in6_addr_t dst;
sctp_faddr_t *fp;
sctp_stack_t *sctps = sctp->sctp_sctps;
@@ -1125,12 +1134,10 @@ sctp_icmp_error(sctp_t *sctp, mblk_t *mp)
dprint(1, ("sctp_icmp_error: sctp=%p, mp=%p\n", (void *)sctp,
(void *)mp));
- first_mp = mp;
-
ipha = (ipha_t *)mp->b_rptr;
if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) {
ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION);
- sctp_icmp_error_ipv6(sctp, first_mp);
+ sctp_icmp_error_ipv6(sctp, mp);
return;
}
@@ -1144,7 +1151,7 @@ sctp_icmp_error(sctp_t *sctp, mblk_t *mp)
/* first_mp must expose the full sctp header. */
if ((uchar_t *)(sctph + 1) >= mp->b_wptr) {
/* not enough data for SCTP header */
- freemsg(first_mp);
+ freemsg(mp);
return;
}
@@ -1175,19 +1182,7 @@ sctp_icmp_error(sctp_t *sctp, mblk_t *mp)
if (fp == NULL) {
break;
}
-
- new_mtu = ntohs(icmph->icmph_du_mtu);
-
- if (new_mtu - sctp->sctp_hdr_len >= fp->sfa_pmss)
- break;
-
- /*
- * Make sure that sfa_pmss is a multiple of
- * SCTP_ALIGN.
- */
- fp->sfa_pmss = (new_mtu - sctp->sctp_hdr_len) &
- ~(SCTP_ALIGN - 1);
- fp->pmtu_discovered = 1;
+ sctp_update_pmtu(sctp, fp, B_TRUE);
/*
* It is possible, even likely that a fast retransmit
* attempt has been dropped by ip as a result of this
@@ -1229,7 +1224,7 @@ sctp_icmp_error(sctp_t *sctp, mblk_t *mp)
break;
}
}
- freemsg(first_mp);
+ freemsg(mp);
}
/*
@@ -1246,7 +1241,6 @@ sctp_icmp_error_ipv6(sctp_t *sctp, mblk_t *mp)
uint16_t iph_hdr_length;
sctp_hdr_t *sctpha;
uint8_t *nexthdrp;
- uint32_t new_mtu;
sctp_faddr_t *fp;
sctp_stack_t *sctps = sctp->sctp_sctps;
@@ -1294,16 +1288,16 @@ sctp_icmp_error_ipv6(sctp_t *sctp, mblk_t *mp)
break;
}
- new_mtu = ntohs(icmp6->icmp6_mtu);
-
- if (new_mtu - sctp->sctp_hdr6_len >= fp->sfa_pmss)
- break;
-
- /* Make sure that sfa_pmss is a multiple of SCTP_ALIGN. */
- fp->sfa_pmss = (new_mtu - sctp->sctp_hdr6_len) &
- ~(SCTP_ALIGN - 1);
- fp->pmtu_discovered = 1;
-
+ sctp_update_pmtu(sctp, fp, B_TRUE);
+ /*
+ * It is possible, even likely that a fast retransmit
+ * attempt has been dropped by ip as a result of this
+ * error, retransmission bundles as much as possible.
+ * A retransmit here prevents significant delays waiting
+ * on the timer. Analogous to behaviour of TCP after
+ * ICMP too big.
+ */
+ sctp_rexmit(sctp, fp);
break;
case ICMP6_DST_UNREACH:
@@ -1366,12 +1360,12 @@ sctp_icmp_error_ipv6(sctp_t *sctp, mblk_t *mp)
* If parent pointer is passed in, inherit settings from it.
*/
sctp_t *
-sctp_create(void *ulpd, sctp_t *parent, int family, int flags,
+sctp_create(void *ulpd, sctp_t *parent, int family, int type, int flags,
sock_upcalls_t *upcalls, sctp_sockbuf_limits_t *sbl,
cred_t *credp)
{
sctp_t *sctp, *psctp;
- conn_t *sctp_connp;
+ conn_t *connp;
mblk_t *ack_mp, *hb_mp;
int sleep = flags & SCTP_CAN_BLOCK ? KM_SLEEP : KM_NOSLEEP;
zoneid_t zoneid;
@@ -1403,18 +1397,8 @@ sctp_create(void *ulpd, sctp_t *parent, int family, int flags,
zoneid = GLOBAL_ZONEID;
else
zoneid = crgetzoneid(credp);
-
- /*
- * For stackid zero this is done from strplumb.c, but
- * non-zero stackids are handled here.
- */
- if (sctps->sctps_g_q == NULL &&
- sctps->sctps_netstack->netstack_stackid !=
- GLOBAL_NETSTACKID) {
- sctp_g_q_setup(sctps);
- }
}
- if ((sctp_connp = ipcl_conn_create(IPCL_SCTPCONN, sleep,
+ if ((connp = ipcl_conn_create(IPCL_SCTPCONN, sleep,
sctps->sctps_netstack)) == NULL) {
netstack_rele(sctps->sctps_netstack);
SCTP_KSTAT(sctps, sctp_conn_create);
@@ -1425,49 +1409,38 @@ sctp_create(void *ulpd, sctp_t *parent, int family, int flags,
* done at top of sctp_create.
*/
netstack_rele(sctps->sctps_netstack);
- sctp = CONN2SCTP(sctp_connp);
+ sctp = CONN2SCTP(connp);
sctp->sctp_sctps = sctps;
- sctp_connp->conn_ulp_labeled = is_system_labeled();
if ((ack_mp = sctp_timer_alloc(sctp, sctp_ack_timer, sleep)) == NULL ||
(hb_mp = sctp_timer_alloc(sctp, sctp_heartbeat_timer,
sleep)) == NULL) {
if (ack_mp != NULL)
freeb(ack_mp);
- sctp_conn_clear(sctp_connp);
+ sctp_conn_clear(connp);
sctp->sctp_sctps = NULL;
- SCTP_G_Q_REFRELE(sctps);
- kmem_cache_free(sctp_conn_cache, sctp_connp);
+ kmem_cache_free(sctp_conn_cache, connp);
return (NULL);
}
sctp->sctp_ack_mp = ack_mp;
sctp->sctp_heartbeat_mp = hb_mp;
- switch (family) {
- case AF_INET6:
- sctp_connp->conn_af_isv6 = B_TRUE;
- sctp->sctp_ipversion = IPV6_VERSION;
- sctp->sctp_family = AF_INET6;
- break;
+ /*
+ * Have conn_ip_output drop packets should our outer source
+ * go invalid, and tell us about mtu changes.
+ */
+ connp->conn_ixa->ixa_flags |= IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE |
+ IXAF_VERIFY_PMTU;
+ connp->conn_family = family;
+ connp->conn_so_type = type;
- case AF_INET:
- sctp_connp->conn_af_isv6 = B_FALSE;
- sctp_connp->conn_pkt_isv6 = B_FALSE;
- sctp->sctp_ipversion = IPV4_VERSION;
- sctp->sctp_family = AF_INET;
- break;
- default:
- ASSERT(0);
- break;
- }
if (sctp_init_values(sctp, psctp, sleep) != 0) {
freeb(ack_mp);
freeb(hb_mp);
- sctp_conn_clear(sctp_connp);
+ sctp_conn_clear(connp);
sctp->sctp_sctps = NULL;
- SCTP_G_Q_REFRELE(sctps);
- kmem_cache_free(sctp_conn_cache, sctp_connp);
+ kmem_cache_free(sctp_conn_cache, connp);
return (NULL);
}
sctp->sctp_cansleep = ((flags & SCTP_CAN_BLOCK) == SCTP_CAN_BLOCK);
@@ -1476,6 +1449,8 @@ sctp_create(void *ulpd, sctp_t *parent, int family, int flags,
sctp->sctp_hdr6_len : sctp->sctp_hdr_len);
if (psctp != NULL) {
+ conn_t *pconnp = psctp->sctp_connp;
+
RUN_SCTP(psctp);
/*
* Inherit local address list, local port. Parent is either
@@ -1488,10 +1463,9 @@ sctp_create(void *ulpd, sctp_t *parent, int family, int flags,
freeb(ack_mp);
freeb(hb_mp);
sctp_headers_free(sctp);
- sctp_conn_clear(sctp_connp);
+ sctp_conn_clear(connp);
sctp->sctp_sctps = NULL;
- SCTP_G_Q_REFRELE(sctps);
- kmem_cache_free(sctp_conn_cache, sctp_connp);
+ kmem_cache_free(sctp_conn_cache, connp);
return (NULL);
}
@@ -1500,28 +1474,32 @@ sctp_create(void *ulpd, sctp_t *parent, int family, int flags,
* followed by sctp_connect(). So don't add this guy to
* bind hash.
*/
- sctp->sctp_lport = psctp->sctp_lport;
+ connp->conn_lport = pconnp->conn_lport;
sctp->sctp_state = SCTPS_BOUND;
- sctp->sctp_allzones = psctp->sctp_allzones;
- sctp->sctp_zoneid = psctp->sctp_zoneid;
WAKE_SCTP(psctp);
} else {
- sctp->sctp_zoneid = zoneid;
- }
-
- sctp->sctp_cpid = curproc->p_pid;
- sctp->sctp_open_time = lbolt64;
+ ASSERT(connp->conn_cred == NULL);
+ connp->conn_zoneid = zoneid;
+ /*
+ * conn_allzones can not be set this early, hence
+ * no IPCL_ZONEID
+ */
+ connp->conn_ixa->ixa_zoneid = zoneid;
+ connp->conn_open_time = lbolt64;
+ connp->conn_cred = credp;
+ crhold(credp);
+ connp->conn_cpid = curproc->p_pid;
- ASSERT(sctp_connp->conn_cred == NULL);
- sctp_connp->conn_cred = credp;
- crhold(credp);
+ /*
+ * If the caller has the process-wide flag set, then default to
+ * MAC exempt mode. This allows read-down to unlabeled hosts.
+ */
+ if (getpflags(NET_MAC_AWARE, credp) != 0)
+ connp->conn_mac_mode = CONN_MAC_AWARE;
- /*
- * If the caller has the process-wide flag set, then default to MAC
- * exempt mode. This allows read-down to unlabeled hosts.
- */
- if (getpflags(NET_MAC_AWARE, credp) != 0)
- sctp_connp->conn_mac_mode = CONN_MAC_AWARE;
+ connp->conn_zone_is_global =
+ (crgetzoneid(credp) == GLOBAL_ZONEID);
+ }
/* Initialize SCTP instance values, our verf tag must never be 0 */
(void) random_get_pseudo_bytes((uint8_t *)&sctp->sctp_lvtag,
@@ -1536,20 +1514,17 @@ sctp_create(void *ulpd, sctp_t *parent, int family, int flags,
sctp->sctp_adv_pap = sctp->sctp_lastack_rxd;
/* Information required by upper layer */
- if (ulpd != NULL) {
- sctp->sctp_ulpd = ulpd;
-
- ASSERT(upcalls != NULL);
- sctp->sctp_upcalls = upcalls;
- ASSERT(sbl != NULL);
- /* Fill in the socket buffer limits for sctpsockfs */
- sbl->sbl_txlowat = sctp->sctp_xmit_lowater;
- sbl->sbl_txbuf = sctp->sctp_xmit_hiwater;
- sbl->sbl_rxbuf = sctp->sctp_rwnd;
- sbl->sbl_rxlowat = SCTP_RECV_LOWATER;
- }
- /* If no ulpd, must be creating the default sctp */
- ASSERT(ulpd != NULL || sctps->sctps_gsctp == NULL);
+ ASSERT(ulpd != NULL);
+ sctp->sctp_ulpd = ulpd;
+
+ ASSERT(upcalls != NULL);
+ sctp->sctp_upcalls = upcalls;
+ ASSERT(sbl != NULL);
+ /* Fill in the socket buffer limits for sctpsockfs */
+ sbl->sbl_txlowat = connp->conn_sndlowat;
+ sbl->sbl_txbuf = connp->conn_sndbuf;
+ sbl->sbl_rxbuf = sctp->sctp_rwnd;
+ sbl->sbl_rxlowat = SCTP_RECV_LOWATER;
/* Insert this in the global list. */
SCTP_LINK(sctp, sctps);
@@ -1557,232 +1532,6 @@ sctp_create(void *ulpd, sctp_t *parent, int family, int flags,
return (sctp);
}
-/*
- * Make sure we wait until the default queue is setup, yet allow
- * sctp_g_q_create() to open a SCTP stream.
- * We need to allow sctp_g_q_create() do do an open
- * of sctp, hence we compare curhread.
- * All others have to wait until the sctps_g_q has been
- * setup.
- */
-void
-sctp_g_q_setup(sctp_stack_t *sctps)
-{
- mutex_enter(&sctps->sctps_g_q_lock);
- if (sctps->sctps_g_q != NULL) {
- mutex_exit(&sctps->sctps_g_q_lock);
- return;
- }
- if (sctps->sctps_g_q_creator == NULL) {
- /* This thread will set it up */
- sctps->sctps_g_q_creator = curthread;
- mutex_exit(&sctps->sctps_g_q_lock);
- sctp_g_q_create(sctps);
- mutex_enter(&sctps->sctps_g_q_lock);
- ASSERT(sctps->sctps_g_q_creator == curthread);
- sctps->sctps_g_q_creator = NULL;
- cv_signal(&sctps->sctps_g_q_cv);
- ASSERT(sctps->sctps_g_q != NULL);
- mutex_exit(&sctps->sctps_g_q_lock);
- return;
- }
- /* Everybody but the creator has to wait */
- if (sctps->sctps_g_q_creator != curthread) {
- while (sctps->sctps_g_q == NULL)
- cv_wait(&sctps->sctps_g_q_cv, &sctps->sctps_g_q_lock);
- }
- mutex_exit(&sctps->sctps_g_q_lock);
-}
-
-#define IP "ip"
-
-#define SCTP6DEV "/devices/pseudo/sctp6@0:sctp6"
-
-/*
- * Create a default sctp queue here instead of in strplumb
- */
-void
-sctp_g_q_create(sctp_stack_t *sctps)
-{
- int error;
- ldi_handle_t lh = NULL;
- ldi_ident_t li = NULL;
- int rval;
- cred_t *cr;
- major_t IP_MAJ;
-
-#ifdef NS_DEBUG
- (void) printf("sctp_g_q_create()for stack %d\n",
- sctps->sctps_netstack->netstack_stackid);
-#endif
-
- IP_MAJ = ddi_name_to_major(IP);
-
- ASSERT(sctps->sctps_g_q_creator == curthread);
-
- error = ldi_ident_from_major(IP_MAJ, &li);
- if (error) {
-#ifdef DEBUG
- printf("sctp_g_q_create: lyr ident get failed error %d\n",
- error);
-#endif
- return;
- }
-
- cr = zone_get_kcred(netstackid_to_zoneid(
- sctps->sctps_netstack->netstack_stackid));
- ASSERT(cr != NULL);
- /*
- * We set the sctp default queue to IPv6 because IPv4 falls
- * back to IPv6 when it can't find a client, but
- * IPv6 does not fall back to IPv4.
- */
- error = ldi_open_by_name(SCTP6DEV, FREAD|FWRITE, cr, &lh, li);
- if (error) {
-#ifdef DEBUG
- printf("sctp_g_q_create: open of SCTP6DEV failed error %d\n",
- error);
-#endif
- goto out;
- }
-
- /*
- * This ioctl causes the sctp framework to cache a pointer to
- * this stream, so we don't want to close the stream after
- * this operation.
- * Use the kernel credentials that are for the zone we're in.
- */
- error = ldi_ioctl(lh, SCTP_IOC_DEFAULT_Q,
- (intptr_t)0, FKIOCTL, cr, &rval);
- if (error) {
-#ifdef DEBUG
- printf("sctp_g_q_create: ioctl SCTP_IOC_DEFAULT_Q failed "
- "error %d\n", error);
-#endif
- goto out;
- }
- sctps->sctps_g_q_lh = lh; /* For sctp_g_q_inactive */
- lh = NULL;
-out:
- /* Close layered handles */
- if (li)
- ldi_ident_release(li);
- /* Keep cred around until _inactive needs it */
- sctps->sctps_g_q_cr = cr;
-}
-
-/*
- * Remove the sctp_default queue so that new connections will not find it.
- * SCTP uses sctp_g_q for all transmission, so all sctp'ts implicitly
- * refer to it. Hence have each one have a reference on sctp_g_q_ref!
- *
- * We decrement the refcnt added in sctp_g_q_create. Once all the
- * sctp_t's which use the default go away, sctp_g_q_close will be called
- * and close the sctp_g_q. Once sctp_g_q is closed, sctp_close() will drop the
- * last reference count on the stack by calling netstack_rele().
- */
-void
-sctp_g_q_destroy(sctp_stack_t *sctps)
-{
- if (sctps->sctps_g_q == NULL) {
- return; /* Nothing to cleanup */
- }
- /*
- * Keep sctps_g_q and sctps_gsctp until the last reference has
- * dropped, since the output is always done using those.
- * Need to decrement twice to take sctp_g_q_create and
- * the gsctp reference into account so that sctp_g_q_inactive is called
- * when all but the default queue remains.
- */
-#ifdef NS_DEBUG
- (void) printf("sctp_g_q_destroy: ref %d\n",
- sctps->sctps_g_q_ref);
-#endif
- SCTP_G_Q_REFRELE(sctps);
-}
-
-/*
- * Called when last user (could be sctp_g_q_destroy) drops reference count
- * using SCTP_G_Q_REFRELE.
- * Run by sctp_q_q_inactive using a taskq.
- */
-static void
-sctp_g_q_close(void *arg)
-{
- sctp_stack_t *sctps = arg;
- int error;
- ldi_handle_t lh = NULL;
- ldi_ident_t li = NULL;
- cred_t *cr;
- major_t IP_MAJ;
-
- IP_MAJ = ddi_name_to_major(IP);
-
- lh = sctps->sctps_g_q_lh;
- if (lh == NULL)
- return; /* Nothing to cleanup */
-
- error = ldi_ident_from_major(IP_MAJ, &li);
- if (error) {
-#ifdef NS_DEBUG
- printf("sctp_g_q_inactive: lyr ident get failed error %d\n",
- error);
-#endif
- return;
- }
-
- cr = sctps->sctps_g_q_cr;
- sctps->sctps_g_q_cr = NULL;
- ASSERT(cr != NULL);
-
- /*
- * Make sure we can break the recursion when sctp_close decrements
- * the reference count causing g_q_inactive to be called again.
- */
- sctps->sctps_g_q_lh = NULL;
-
- /* close the default queue */
- (void) ldi_close(lh, FREAD|FWRITE, cr);
-
- /* Close layered handles */
- ldi_ident_release(li);
- crfree(cr);
-
- ASSERT(sctps->sctps_g_q != NULL);
- sctps->sctps_g_q = NULL;
- /*
- * Now free sctps_gsctp.
- */
- ASSERT(sctps->sctps_gsctp != NULL);
- sctp_closei_local(sctps->sctps_gsctp);
- SCTP_CONDEMNED(sctps->sctps_gsctp);
- SCTP_REFRELE(sctps->sctps_gsctp);
- sctps->sctps_gsctp = NULL;
-}
-
-/*
- * Called when last sctp_t drops reference count using SCTP_G_Q_REFRELE.
- *
- * Have to ensure that the ldi routines are not used by an
- * interrupt thread by using a taskq.
- */
-void
-sctp_g_q_inactive(sctp_stack_t *sctps)
-{
- if (sctps->sctps_g_q_lh == NULL)
- return; /* Nothing to cleanup */
-
- ASSERT(sctps->sctps_g_q_ref == 0);
- SCTP_G_Q_REFHOLD(sctps); /* Compensate for what g_q_destroy did */
-
- if (servicing_interrupt()) {
- (void) taskq_dispatch(sctp_taskq, sctp_g_q_close,
- (void *) sctps, TQ_SLEEP);
- } else {
- sctp_g_q_close(sctps);
- }
-}
-
/* Run at module load time */
void
sctp_ddi_g_init(void)
@@ -1802,16 +1551,12 @@ sctp_ddi_g_init(void)
/* Initialize tables used for CRC calculation */
sctp_crc32_init();
- sctp_taskq = taskq_create("sctp_taskq", 1, minclsyspri, 1, 1,
- TASKQ_PREPOPULATE);
-
/*
* We want to be informed each time a stack is created or
* destroyed in the kernel, so we can maintain the
* set of sctp_stack_t's.
*/
- netstack_register(NS_SCTP, sctp_stack_init, sctp_stack_shutdown,
- sctp_stack_fini);
+ netstack_register(NS_SCTP, sctp_stack_init, NULL, sctp_stack_fini);
}
static void *
@@ -1823,8 +1568,6 @@ sctp_stack_init(netstackid_t stackid, netstack_t *ns)
sctps->sctps_netstack = ns;
/* Initialize locks */
- mutex_init(&sctps->sctps_g_q_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&sctps->sctps_g_q_cv, NULL, CV_DEFAULT, NULL);
mutex_init(&sctps->sctps_g_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&sctps->sctps_epriv_port_lock, NULL, MUTEX_DEFAULT, NULL);
sctps->sctps_g_num_epriv_ports = SCTP_NUM_EPRIV_PORTS;
@@ -1875,19 +1618,6 @@ sctp_ddi_g_destroy(void)
sctp_ftsn_sets_fini();
netstack_unregister(NS_SCTP);
- taskq_destroy(sctp_taskq);
-}
-
-/*
- * Shut down the SCTP stack instance.
- */
-/* ARGSUSED */
-static void
-sctp_stack_shutdown(netstackid_t stackid, void *arg)
-{
- sctp_stack_t *sctps = (sctp_stack_t *)arg;
-
- sctp_g_q_destroy(sctps);
}
/*
@@ -1922,8 +1652,6 @@ sctp_stack_fini(netstackid_t stackid, void *arg)
mutex_destroy(&sctps->sctps_g_lock);
mutex_destroy(&sctps->sctps_epriv_port_lock);
- mutex_destroy(&sctps->sctps_g_q_lock);
- cv_destroy(&sctps->sctps_g_q_cv);
kmem_free(sctps, sizeof (*sctps));
}
@@ -1934,7 +1662,8 @@ sctp_display_all(sctp_stack_t *sctps)
sctp_t *sctp_walker;
mutex_enter(&sctps->sctps_g_lock);
- for (sctp_walker = sctps->sctps_gsctp; sctp_walker != NULL;
+ for (sctp_walker = list_head(&sctps->sctps_g_list);
+ sctp_walker != NULL;
sctp_walker = (sctp_t *)list_next(&sctps->sctps_g_list,
sctp_walker)) {
(void) sctp_display(sctp_walker, NULL);
@@ -2009,81 +1738,6 @@ sctp_inc_taskq(sctp_stack_t *sctps)
}
#ifdef DEBUG
-uint32_t sendq_loop_cnt = 0;
-uint32_t sendq_collision = 0;
-uint32_t sendq_empty = 0;
-#endif
-
-void
-sctp_add_sendq(sctp_t *sctp, mblk_t *mp)
-{
- mutex_enter(&sctp->sctp_sendq_lock);
- if (sctp->sctp_sendq == NULL) {
- sctp->sctp_sendq = mp;
- sctp->sctp_sendq_tail = mp;
- } else {
- sctp->sctp_sendq_tail->b_next = mp;
- sctp->sctp_sendq_tail = mp;
- }
- mutex_exit(&sctp->sctp_sendq_lock);
-}
-
-void
-sctp_process_sendq(sctp_t *sctp)
-{
- mblk_t *mp;
-#ifdef DEBUG
- uint32_t loop_cnt = 0;
-#endif
-
- mutex_enter(&sctp->sctp_sendq_lock);
- if (sctp->sctp_sendq == NULL || sctp->sctp_sendq_sending) {
-#ifdef DEBUG
- if (sctp->sctp_sendq == NULL)
- sendq_empty++;
- else
- sendq_collision++;
-#endif
- mutex_exit(&sctp->sctp_sendq_lock);
- return;
- }
- sctp->sctp_sendq_sending = B_TRUE;
-
- /*
- * Note that while we are in this loop, other thread can put
- * new packets in the receive queue. We may be looping for
- * quite a while. This is OK even for an interrupt thread.
- * The reason is that SCTP should only able to send a limited
- * number of packets out in a burst. So the number of times
- * we go through this loop should not be many.
- */
- while ((mp = sctp->sctp_sendq) != NULL) {
- sctp->sctp_sendq = mp->b_next;
- ASSERT(sctp->sctp_connp->conn_ref > 0);
- mutex_exit(&sctp->sctp_sendq_lock);
- mp->b_next = NULL;
- CONN_INC_REF(sctp->sctp_connp);
- mp->b_flag |= MSGHASREF;
- /* If we don't have sctp_current, default to IPv4 */
- IP_PUT(mp, sctp->sctp_connp, sctp->sctp_current == NULL ?
- B_TRUE : sctp->sctp_current->isv4);
- BUMP_LOCAL(sctp->sctp_opkts);
-#ifdef DEBUG
- loop_cnt++;
-#endif
- mutex_enter(&sctp->sctp_sendq_lock);
- }
-
- sctp->sctp_sendq_tail = NULL;
- sctp->sctp_sendq_sending = B_FALSE;
-#ifdef DEBUG
- if (loop_cnt > sendq_loop_cnt)
- sendq_loop_cnt = loop_cnt;
-#endif
- mutex_exit(&sctp->sctp_sendq_lock);
-}
-
-#ifdef DEBUG
uint32_t recvq_loop_cnt = 0;
uint32_t recvq_call = 0;
#endif
@@ -2144,10 +1798,19 @@ sctp_find_next_tq(sctp_t *sctp)
* If the try_harder argument is B_TRUE, this routine sctp_find_next_tq()
* will try very hard to dispatch the task. Refer to the comment
* for that routine on how it does that.
+ *
+ * On failure the message has been freed i.e., this routine always consumes the
+ * message. It bumps ipIfStatsInDiscards and and uses ip_drop_input to drop.
*/
-boolean_t
-sctp_add_recvq(sctp_t *sctp, mblk_t *mp, boolean_t caller_hold_lock)
+void
+sctp_add_recvq(sctp_t *sctp, mblk_t *mp, boolean_t caller_hold_lock,
+ ip_recv_attr_t *ira)
{
+ mblk_t *attrmp;
+ ip_stack_t *ipst = sctp->sctp_sctps->sctps_netstack->netstack_ip;
+
+ ASSERT(ira->ira_ill == NULL);
+
if (!caller_hold_lock)
mutex_enter(&sctp->sctp_recvq_lock);
@@ -2157,12 +1820,28 @@ sctp_add_recvq(sctp_t *sctp, mblk_t *mp, boolean_t caller_hold_lock)
if (!sctp_find_next_tq(sctp)) {
if (!caller_hold_lock)
mutex_exit(&sctp->sctp_recvq_lock);
- return (B_FALSE);
+ BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards", mp, NULL);
+ freemsg(mp);
+ return;
}
/* Make sure the sctp_t will not go away. */
SCTP_REFHOLD(sctp);
}
+ attrmp = ip_recv_attr_to_mblk(ira);
+ if (attrmp == NULL) {
+ if (!caller_hold_lock)
+ mutex_exit(&sctp->sctp_recvq_lock);
+ BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards", mp, NULL);
+ freemsg(mp);
+ return;
+ }
+ ASSERT(attrmp->b_cont == NULL);
+ attrmp->b_cont = mp;
+ mp = attrmp;
+
if (sctp->sctp_recvq == NULL) {
sctp->sctp_recvq = mp;
sctp->sctp_recvq_tail = mp;
@@ -2173,7 +1852,6 @@ sctp_add_recvq(sctp_t *sctp, mblk_t *mp, boolean_t caller_hold_lock)
if (!caller_hold_lock)
mutex_exit(&sctp->sctp_recvq_lock);
- return (B_TRUE);
}
static void
@@ -2181,10 +1859,10 @@ sctp_process_recvq(void *arg)
{
sctp_t *sctp = (sctp_t *)arg;
mblk_t *mp;
- mblk_t *ipsec_mp;
#ifdef DEBUG
uint32_t loop_cnt = 0;
#endif
+ ip_recv_attr_t iras;
#ifdef _BIG_ENDIAN
#define IPVER(ip6h) ((((uint32_t *)ip6h)[0] >> 28) & 0x7)
@@ -2204,16 +1882,31 @@ sctp_process_recvq(void *arg)
* quite a while.
*/
while ((mp = sctp->sctp_recvq) != NULL) {
+ mblk_t *data_mp;
+
sctp->sctp_recvq = mp->b_next;
mutex_exit(&sctp->sctp_recvq_lock);
mp->b_next = NULL;
#ifdef DEBUG
loop_cnt++;
#endif
- ipsec_mp = mp->b_prev;
mp->b_prev = NULL;
- sctp_input_data(sctp, mp, ipsec_mp);
+ data_mp = mp->b_cont;
+ mp->b_cont = NULL;
+ if (!ip_recv_attr_from_mblk(mp, &iras)) {
+ ip_drop_input("ip_recv_attr_from_mblk", mp, NULL);
+ freemsg(mp);
+ ira_cleanup(&iras, B_TRUE);
+ continue;
+ }
+
+ if (iras.ira_flags & IRAF_ICMP_ERROR)
+ sctp_icmp_error(sctp, data_mp);
+ else
+ sctp_input_data(sctp, data_mp, &iras);
+
+ ira_cleanup(&iras, B_TRUE);
mutex_enter(&sctp->sctp_recvq_lock);
}
@@ -2224,8 +1917,6 @@ sctp_process_recvq(void *arg)
WAKE_SCTP(sctp);
- /* We may have sent something when processing the receive queue. */
- sctp_process_sendq(sctp);
#ifdef DEBUG
if (loop_cnt > recvq_loop_cnt)
recvq_loop_cnt = loop_cnt;
@@ -2238,18 +1929,32 @@ sctp_process_recvq(void *arg)
static int
sctp_conn_cache_constructor(void *buf, void *cdrarg, int kmflags)
{
- conn_t *sctp_connp = (conn_t *)buf;
- sctp_t *sctp = (sctp_t *)&sctp_connp[1];
+ conn_t *connp = (conn_t *)buf;
+ sctp_t *sctp = (sctp_t *)&connp[1];
+ bzero(connp, sizeof (conn_t));
bzero(buf, (char *)&sctp[1] - (char *)buf);
- sctp->sctp_connp = sctp_connp;
mutex_init(&sctp->sctp_reflock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&sctp->sctp_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&sctp->sctp_recvq_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&sctp->sctp_cv, NULL, CV_DEFAULT, NULL);
- mutex_init(&sctp->sctp_sendq_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
+ connp->conn_flags = IPCL_SCTPCONN;
+ connp->conn_proto = IPPROTO_SCTP;
+ connp->conn_sctp = sctp;
+ sctp->sctp_connp = connp;
+ rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
+
+ connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
+ if (connp->conn_ixa == NULL) {
+ return (ENOMEM);
+ }
+ connp->conn_ixa->ixa_refcnt = 1;
+ connp->conn_ixa->ixa_protocol = connp->conn_proto;
+ connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
return (0);
}
@@ -2257,14 +1962,13 @@ sctp_conn_cache_constructor(void *buf, void *cdrarg, int kmflags)
static void
sctp_conn_cache_destructor(void *buf, void *cdrarg)
{
- conn_t *sctp_connp = (conn_t *)buf;
- sctp_t *sctp = (sctp_t *)&sctp_connp[1];
+ conn_t *connp = (conn_t *)buf;
+ sctp_t *sctp = (sctp_t *)&connp[1];
+ ASSERT(sctp->sctp_connp == connp);
ASSERT(!MUTEX_HELD(&sctp->sctp_lock));
ASSERT(!MUTEX_HELD(&sctp->sctp_reflock));
ASSERT(!MUTEX_HELD(&sctp->sctp_recvq_lock));
- ASSERT(!MUTEX_HELD(&sctp->sctp_sendq_lock));
- ASSERT(!MUTEX_HELD(&sctp->sctp_connp->conn_lock));
ASSERT(sctp->sctp_conn_hash_next == NULL);
ASSERT(sctp->sctp_conn_hash_prev == NULL);
@@ -2317,16 +2021,6 @@ sctp_conn_cache_destructor(void *buf, void *cdrarg)
ASSERT(sctp->sctp_recvq_tail == NULL);
ASSERT(sctp->sctp_recvq_tq == NULL);
- ASSERT(sctp->sctp_sendq == NULL);
- ASSERT(sctp->sctp_sendq_tail == NULL);
- ASSERT(sctp->sctp_sendq_sending == B_FALSE);
-
- ASSERT(sctp->sctp_ipp_hopopts == NULL);
- ASSERT(sctp->sctp_ipp_rtdstopts == NULL);
- ASSERT(sctp->sctp_ipp_rthdr == NULL);
- ASSERT(sctp->sctp_ipp_dstopts == NULL);
- ASSERT(sctp->sctp_ipp_pathmtu == NULL);
-
/*
* sctp_pad_mp can be NULL if the memory allocation fails
* in sctp_init_values() and the conn_t is freed.
@@ -2340,8 +2034,18 @@ sctp_conn_cache_destructor(void *buf, void *cdrarg)
mutex_destroy(&sctp->sctp_lock);
mutex_destroy(&sctp->sctp_recvq_lock);
cv_destroy(&sctp->sctp_cv);
- mutex_destroy(&sctp->sctp_sendq_lock);
+ mutex_destroy(&connp->conn_lock);
+ cv_destroy(&connp->conn_cv);
+ rw_destroy(&connp->conn_ilg_lock);
+
+ /* Can be NULL if constructor failed */
+ if (connp->conn_ixa != NULL) {
+ ASSERT(connp->conn_ixa->ixa_refcnt == 1);
+ ASSERT(connp->conn_ixa->ixa_ire == NULL);
+ ASSERT(connp->conn_ixa->ixa_nce == NULL);
+ ixa_refrele(connp->conn_ixa);
+ }
}
static void
@@ -2361,31 +2065,53 @@ sctp_conn_cache_fini()
void
sctp_conn_init(conn_t *connp)
{
- connp->conn_flags = IPCL_SCTPCONN;
+ ASSERT(connp->conn_flags == IPCL_SCTPCONN);
connp->conn_rq = connp->conn_wq = NULL;
- connp->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
- connp->conn_ulp = IPPROTO_SCTP;
+ connp->conn_ixa->ixa_flags |= IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE |
+ IXAF_VERIFY_PMTU;
+
+ ASSERT(connp->conn_proto == IPPROTO_SCTP);
+ ASSERT(connp->conn_ixa->ixa_protocol == connp->conn_proto);
connp->conn_state_flags |= CONN_INCIPIENT;
- mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
+
+ ASSERT(connp->conn_sctp != NULL);
+
+ /*
+ * Register sctp_notify to listen to capability changes detected by IP.
+ * This upcall is made in the context of the call to conn_ip_output
+ * thus it holds whatever locks sctp holds across conn_ip_output.
+ */
+ connp->conn_ixa->ixa_notify = sctp_notify;
+ connp->conn_ixa->ixa_notify_cookie = connp->conn_sctp;
}
static void
sctp_conn_clear(conn_t *connp)
{
/* Clean up conn_t stuff */
- if (connp->conn_latch != NULL)
- IPLATCH_REFRELE(connp->conn_latch, connp->conn_netstack);
- if (connp->conn_policy != NULL)
+ if (connp->conn_latch != NULL) {
+ IPLATCH_REFRELE(connp->conn_latch);
+ connp->conn_latch = NULL;
+ }
+ if (connp->conn_latch_in_policy != NULL) {
+ IPPOL_REFRELE(connp->conn_latch_in_policy);
+ connp->conn_latch_in_policy = NULL;
+ }
+ if (connp->conn_latch_in_action != NULL) {
+ IPACT_REFRELE(connp->conn_latch_in_action);
+ connp->conn_latch_in_action = NULL;
+ }
+ if (connp->conn_policy != NULL) {
IPPH_REFRELE(connp->conn_policy, connp->conn_netstack);
- if (connp->conn_ipsec_opt_mp != NULL)
+ connp->conn_policy = NULL;
+ }
+ if (connp->conn_ipsec_opt_mp != NULL) {
freemsg(connp->conn_ipsec_opt_mp);
- if (connp->conn_cred != NULL)
- crfree(connp->conn_cred);
- if (connp->conn_effective_cred != NULL)
- crfree(connp->conn_effective_cred);
- mutex_destroy(&connp->conn_lock);
- cv_destroy(&connp->conn_cv);
+ connp->conn_ipsec_opt_mp = NULL;
+ }
netstack_rele(connp->conn_netstack);
- bzero(connp, sizeof (struct conn_s));
+ connp->conn_netstack = NULL;
+
+ /* Leave conn_ixa and other constructed fields in place */
+ ipcl_conn_cleanup(connp);
}
diff --git a/usr/src/uts/common/inet/sctp/sctp_addr.c b/usr/src/uts/common/inet/sctp/sctp_addr.c
index b347d30dda..306362211d 100644
--- a/usr/src/uts/common/inet/sctp/sctp_addr.c
+++ b/usr/src/uts/common/inet/sctp/sctp_addr.c
@@ -41,6 +41,7 @@
#include <inet/common.h>
#include <inet/ip.h>
#include <inet/ip6.h>
+#include <inet/ip_ire.h>
#include <inet/ip_if.h>
#include <inet/ipclassifier.h>
#include <inet/sctp_ip.h>
@@ -236,6 +237,7 @@ sctp_get_all_ipifs(sctp_t *sctp, int sleep)
int error = 0;
sctp_stack_t *sctps = sctp->sctp_sctps;
boolean_t isv6;
+ conn_t *connp = sctp->sctp_connp;
rw_enter(&sctps->sctps_g_ipifs_lock, RW_READER);
for (i = 0; i < SCTP_IPIF_HASH; i++) {
@@ -250,8 +252,8 @@ sctp_get_all_ipifs(sctp_t *sctp, int sleep)
!SCTP_IPIF_ZONE_MATCH(sctp, sctp_ipif) ||
SCTP_IS_ADDR_UNSPEC(!isv6,
sctp_ipif->sctp_ipif_saddr) ||
- (sctp->sctp_ipversion == IPV4_VERSION && isv6) ||
- (sctp->sctp_connp->conn_ipv6_v6only && !isv6)) {
+ (connp->conn_family == AF_INET && isv6) ||
+ (connp->conn_ipv6_v6only && !isv6)) {
rw_exit(&sctp_ipif->sctp_ipif_lock);
sctp_ipif = list_next(
&sctps->sctps_g_ipifs[i].sctp_ipif_list,
@@ -303,6 +305,7 @@ sctp_valid_addr_list(sctp_t *sctp, const void *addrs, uint32_t addrcnt,
boolean_t check_addrs = B_FALSE;
boolean_t check_lport = B_FALSE;
uchar_t *p = list;
+ conn_t *connp = sctp->sctp_connp;
/*
* Need to check for port and address depending on the state.
@@ -325,11 +328,11 @@ sctp_valid_addr_list(sctp_t *sctp, const void *addrs, uint32_t addrcnt,
boolean_t lookup_saddr = B_TRUE;
uint_t ifindex = 0;
- switch (sctp->sctp_family) {
+ switch (connp->conn_family) {
case AF_INET:
sin4 = (struct sockaddr_in *)addrs + cnt;
if (sin4->sin_family != AF_INET || (check_lport &&
- sin4->sin_port != sctp->sctp_lport)) {
+ sin4->sin_port != connp->conn_lport)) {
err = EINVAL;
goto free_ret;
}
@@ -351,14 +354,14 @@ sctp_valid_addr_list(sctp_t *sctp, const void *addrs, uint32_t addrcnt,
case AF_INET6:
sin6 = (struct sockaddr_in6 *)addrs + cnt;
if (sin6->sin6_family != AF_INET6 || (check_lport &&
- sin6->sin6_port != sctp->sctp_lport)) {
+ sin6->sin6_port != connp->conn_lport)) {
err = EINVAL;
goto free_ret;
}
addr = sin6->sin6_addr;
/* Contains the interface index */
ifindex = sin6->sin6_scope_id;
- if (sctp->sctp_connp->conn_ipv6_v6only &&
+ if (connp->conn_ipv6_v6only &&
IN6_IS_ADDR_V4MAPPED(&addr)) {
err = EAFNOSUPPORT;
goto free_ret;
@@ -382,7 +385,7 @@ sctp_valid_addr_list(sctp_t *sctp, const void *addrs, uint32_t addrcnt,
}
if (lookup_saddr) {
ipif = sctp_lookup_ipif_addr(&addr, B_TRUE,
- sctp->sctp_zoneid, !sctp->sctp_connp->conn_allzones,
+ IPCL_ZONEID(connp), !connp->conn_allzones,
ifindex, 0, B_TRUE, sctp->sctp_sctps);
if (ipif == NULL) {
/* Address not in the list */
@@ -495,6 +498,8 @@ sctp_ipif_hash_insert(sctp_t *sctp, sctp_ipif_t *ipif, int sleep,
/*
* Given a source address, walk through the peer address list to see
* if the source address is being used. If it is, reset that.
+ * A cleared saddr will then make sctp_make_mp look up the destination again
+ * and as part of that look for a new source.
*/
static void
sctp_fix_saddr(sctp_t *sctp, in6_addr_t *saddr)
@@ -504,10 +509,6 @@ sctp_fix_saddr(sctp_t *sctp, in6_addr_t *saddr)
for (fp = sctp->sctp_faddrs; fp != NULL; fp = fp->next) {
if (!IN6_ARE_ADDR_EQUAL(&fp->saddr, saddr))
continue;
- if (fp->ire != NULL) {
- IRE_REFRELE_NOTR(fp->ire);
- fp->ire = NULL;
- }
V6_SET_ZERO(fp->saddr);
}
}
@@ -874,8 +875,8 @@ sctp_update_saddrs(sctp_ipif_t *oipif, sctp_ipif_t *nipif, int idx,
sctp_saddr_ipif_t *sobj;
int count;
- sctp = sctps->sctps_gsctp;
mutex_enter(&sctps->sctps_g_lock);
+ sctp = list_head(&sctps->sctps_g_list);
while (sctp != NULL && oipif->sctp_ipif_refcnt > 0) {
mutex_enter(&sctp->sctp_reflock);
if (sctp->sctp_condemned ||
@@ -1202,7 +1203,6 @@ sctp_update_ipif(ipif_t *ipif, int op)
rw_downgrade(&sctps->sctps_g_ipifs_lock);
rw_enter(&sctp_ipif->sctp_ipif_lock, RW_WRITER);
sctp_ipif->sctp_ipif_state = SCTP_IPIFS_UP;
- sctp_ipif->sctp_ipif_mtu = ipif->ipif_mtu;
sctp_ipif->sctp_ipif_flags = ipif->ipif_flags;
rw_exit(&sctp_ipif->sctp_ipif_lock);
sctp_chk_and_updt_saddr(hindex, sctp_ipif,
@@ -1214,7 +1214,6 @@ sctp_update_ipif(ipif_t *ipif, int op)
rw_downgrade(&sctps->sctps_g_ipifs_lock);
rw_enter(&sctp_ipif->sctp_ipif_lock, RW_WRITER);
- sctp_ipif->sctp_ipif_mtu = ipif->ipif_mtu;
sctp_ipif->sctp_ipif_zoneid = ipif->ipif_zoneid;
sctp_ipif->sctp_ipif_flags = ipif->ipif_flags;
rw_exit(&sctp_ipif->sctp_ipif_lock);
@@ -1226,7 +1225,6 @@ sctp_update_ipif(ipif_t *ipif, int op)
rw_downgrade(&sctps->sctps_g_ipifs_lock);
rw_enter(&sctp_ipif->sctp_ipif_lock, RW_WRITER);
sctp_ipif->sctp_ipif_state = SCTP_IPIFS_DOWN;
- sctp_ipif->sctp_ipif_mtu = ipif->ipif_mtu;
sctp_ipif->sctp_ipif_flags = ipif->ipif_flags;
rw_exit(&sctp_ipif->sctp_ipif_lock);
@@ -1277,6 +1275,7 @@ sctp_del_saddr_list(sctp_t *sctp, const void *addrs, int addcnt,
in6_addr_t addr;
sctp_ipif_t *sctp_ipif;
int ifindex = 0;
+ conn_t *connp = sctp->sctp_connp;
ASSERT(sctp->sctp_nsaddrs >= addcnt);
@@ -1288,7 +1287,7 @@ sctp_del_saddr_list(sctp_t *sctp, const void *addrs, int addcnt,
}
for (cnt = 0; cnt < addcnt; cnt++) {
- switch (sctp->sctp_family) {
+ switch (connp->conn_family) {
case AF_INET:
sin4 = (struct sockaddr_in *)addrs + cnt;
IN6_INADDR_TO_V4MAPPED(&sin4->sin_addr, &addr);
@@ -1301,7 +1300,7 @@ sctp_del_saddr_list(sctp_t *sctp, const void *addrs, int addcnt,
break;
}
sctp_ipif = sctp_lookup_ipif_addr(&addr, B_FALSE,
- sctp->sctp_zoneid, !sctp->sctp_connp->conn_allzones,
+ IPCL_ZONEID(connp), !connp->conn_allzones,
ifindex, 0, B_TRUE, sctp->sctp_sctps);
ASSERT(sctp_ipif != NULL);
sctp_ipif_hash_remove(sctp, sctp_ipif);
@@ -1356,10 +1355,10 @@ int
sctp_saddr_add_addr(sctp_t *sctp, in6_addr_t *addr, uint_t ifindex)
{
sctp_ipif_t *sctp_ipif;
+ conn_t *connp = sctp->sctp_connp;
- sctp_ipif = sctp_lookup_ipif_addr(addr, B_TRUE, sctp->sctp_zoneid,
- !sctp->sctp_connp->conn_allzones, ifindex, 0, B_TRUE,
- sctp->sctp_sctps);
+ sctp_ipif = sctp_lookup_ipif_addr(addr, B_TRUE, IPCL_ZONEID(connp),
+ !connp->conn_allzones, ifindex, 0, B_TRUE, sctp->sctp_sctps);
if (sctp_ipif == NULL)
return (EINVAL);
@@ -1386,6 +1385,7 @@ sctp_check_saddr(sctp_t *sctp, int supp_af, boolean_t delete,
int scanned = 0;
int naddr;
int nsaddr;
+ conn_t *connp = sctp->sctp_connp;
ASSERT(!sctp->sctp_loopback && !sctp->sctp_linklocal && supp_af != 0);
@@ -1393,7 +1393,7 @@ sctp_check_saddr(sctp_t *sctp, int supp_af, boolean_t delete,
* Irregardless of the supported address in the INIT, v4
* must be supported.
*/
- if (sctp->sctp_family == AF_INET)
+ if (connp->conn_family == AF_INET)
supp_af = PARM_SUPP_V4;
nsaddr = sctp->sctp_nsaddrs;
@@ -1501,13 +1501,15 @@ sctp_getmyaddrs(void *conn, void *myaddrs, int *addrcnt)
int l;
sctp_saddr_ipif_t *obj;
sctp_t *sctp = (sctp_t *)conn;
- int family = sctp->sctp_family;
+ conn_t *connp = sctp->sctp_connp;
+ int family = connp->conn_family;
int max = *addrcnt;
size_t added = 0;
struct sockaddr_in6 *sin6;
struct sockaddr_in *sin4;
int scanned = 0;
boolean_t skip_lback = B_FALSE;
+ ip_xmit_attr_t *ixa = connp->conn_ixa;
if (sctp->sctp_nsaddrs == 0)
return (EINVAL);
@@ -1543,15 +1545,27 @@ sctp_getmyaddrs(void *conn, void *myaddrs, int *addrcnt)
case AF_INET:
sin4 = (struct sockaddr_in *)myaddrs + added;
sin4->sin_family = AF_INET;
- sin4->sin_port = sctp->sctp_lport;
+ sin4->sin_port = connp->conn_lport;
IN6_V4MAPPED_TO_INADDR(&addr, &sin4->sin_addr);
break;
case AF_INET6:
sin6 = (struct sockaddr_in6 *)myaddrs + added;
sin6->sin6_family = AF_INET6;
- sin6->sin6_port = sctp->sctp_lport;
+ sin6->sin6_port = connp->conn_lport;
sin6->sin6_addr = addr;
+ /*
+ * Note that flowinfo is only returned for
+ * getpeername just like for TCP and UDP.
+ */
+ sin6->sin6_flowinfo = 0;
+
+ if (IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr) &&
+ (ixa->ixa_flags & IXAF_SCOPEID_SET))
+ sin6->sin6_scope_id = ixa->ixa_scopeid;
+ else
+ sin6->sin6_scope_id = 0;
+ sin6->__sin6_src_id = 0;
break;
}
added++;
@@ -1700,6 +1714,7 @@ sctp_get_addrlist(sctp_t *sctp, const void *addrs, uint32_t *addrcnt,
uchar_t *p;
int err = 0;
sctp_stack_t *sctps = sctp->sctp_sctps;
+ conn_t *connp = sctp->sctp_connp;
*addrlist = NULL;
*size = 0;
@@ -1707,7 +1722,7 @@ sctp_get_addrlist(sctp_t *sctp, const void *addrs, uint32_t *addrcnt,
/*
* Create a list of sockaddr_in[6] structs using the input list.
*/
- if (sctp->sctp_family == AF_INET) {
+ if (connp->conn_family == AF_INET) {
*size = sizeof (struct sockaddr_in) * *addrcnt;
*addrlist = kmem_zalloc(*size, KM_SLEEP);
p = *addrlist;
@@ -1772,7 +1787,7 @@ get_all_addrs:
* We allocate upfront so that the clustering module need to bother
* re-sizing the list.
*/
- if (sctp->sctp_family == AF_INET) {
+ if (connp->conn_family == AF_INET) {
*size = sizeof (struct sockaddr_in) *
sctps->sctps_g_ipifs_count;
} else {
@@ -1805,7 +1820,7 @@ get_all_addrs:
SCTP_IS_IPIF_LOOPBACK(sctp_ipif) ||
SCTP_IS_IPIF_LINKLOCAL(sctp_ipif) ||
!SCTP_IPIF_ZONE_MATCH(sctp, sctp_ipif) ||
- (sctp->sctp_ipversion == IPV4_VERSION &&
+ (connp->conn_family == AF_INET &&
sctp_ipif->sctp_ipif_isv6) ||
(sctp->sctp_connp->conn_ipv6_v6only &&
!sctp_ipif->sctp_ipif_isv6)) {
@@ -1816,7 +1831,7 @@ get_all_addrs:
continue;
}
rw_exit(&sctp_ipif->sctp_ipif_lock);
- if (sctp->sctp_family == AF_INET) {
+ if (connp->conn_family == AF_INET) {
s4 = (struct sockaddr_in *)p;
IN6_V4MAPPED_TO_INADDR(&addr, &s4->sin_addr);
s4->sin_family = AF_INET;
diff --git a/usr/src/uts/common/inet/sctp/sctp_addr.h b/usr/src/uts/common/inet/sctp/sctp_addr.h
index 9408c452d4..35e8300958 100644
--- a/usr/src/uts/common/inet/sctp/sctp_addr.h
+++ b/usr/src/uts/common/inet/sctp/sctp_addr.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SCTP_ADDR_H
#define _SCTP_ADDR_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/list.h>
#include <sys/zone.h>
#include <inet/ip.h>
@@ -54,7 +52,6 @@ extern "C" {
typedef struct sctp_ipif_s {
list_node_t sctp_ipifs; /* Used by the global list */
struct sctp_ill_s *sctp_ipif_ill;
- uint_t sctp_ipif_mtu;
uint_t sctp_ipif_id;
in6_addr_t sctp_ipif_saddr;
int sctp_ipif_state;
diff --git a/usr/src/uts/common/inet/sctp/sctp_asconf.c b/usr/src/uts/common/inet/sctp/sctp_asconf.c
index 859faab0b8..fd7e34f7ba 100644
--- a/usr/src/uts/common/inet/sctp/sctp_asconf.c
+++ b/usr/src/uts/common/inet/sctp/sctp_asconf.c
@@ -571,7 +571,8 @@ sctp_input_asconf(sctp_t *sctp, sctp_chunk_hdr_t *ch, sctp_faddr_t *fp)
* it is the clustering module's responsibility to free the lists.
*/
if (cl_sctp_assoc_change != NULL) {
- (*cl_sctp_assoc_change)(sctp->sctp_family, alist, asize,
+ (*cl_sctp_assoc_change)(sctp->sctp_connp->conn_family,
+ alist, asize,
acount, dlist, dsize, dcount, SCTP_CL_PADDR,
(cl_sctp_handle_t)sctp);
/* alist and dlist will be freed by the clustering module */
@@ -586,9 +587,10 @@ sctp_input_asconf(sctp_t *sctp, sctp_chunk_hdr_t *ch, sctp_faddr_t *fp)
ach->sch_len = htons(msgdsize(hmp) - sctp->sctp_hdr_len);
else
ach->sch_len = htons(msgdsize(hmp) - sctp->sctp_hdr6_len);
- sctp_set_iplen(sctp, hmp);
- sctp_add_sendq(sctp, hmp);
+ sctp_set_iplen(sctp, hmp, fp->ixa);
+ (void) conn_ip_output(hmp, fp->ixa);
+ BUMP_LOCAL(sctp->sctp_opkts);
sctp_validate_peer(sctp);
}
@@ -809,7 +811,7 @@ sctp_input_asconf_ack(sctp_t *sctp, sctp_chunk_hdr_t *ch, sctp_faddr_t *fp)
mp->b_prev = NULL;
ainfo->sctp_cl_alist = NULL;
ainfo->sctp_cl_dlist = NULL;
- (*cl_sctp_assoc_change)(sctp->sctp_family, alist,
+ (*cl_sctp_assoc_change)(sctp->sctp_connp->conn_family, alist,
ainfo->sctp_cl_asize, acount, dlist, ainfo->sctp_cl_dsize,
dcount, SCTP_CL_LADDR, (cl_sctp_handle_t)sctp);
/* alist and dlist will be freed by the clustering module */
@@ -1010,12 +1012,13 @@ sctp_wput_asconf(sctp_t *sctp, sctp_faddr_t *fp)
fp->suna += MBLKL(mp);
/* Attach the header and send the chunk */
ipmp->b_cont = mp;
- sctp_set_iplen(sctp, ipmp);
sctp->sctp_cchunk_pend = 1;
SCTP_SET_SENT_FLAG(sctp->sctp_cxmit_list);
SCTP_SET_CHUNK_DEST(sctp->sctp_cxmit_list, fp);
- sctp_add_sendq(sctp, ipmp);
+ sctp_set_iplen(sctp, ipmp, fp->ixa);
+ (void) conn_ip_output(ipmp, fp->ixa);
+ BUMP_LOCAL(sctp->sctp_opkts);
SCTP_FADDR_RC_TIMER_RESTART(sctp, fp, fp->rto);
#undef SCTP_SET_SENT_FLAG
}
@@ -1418,6 +1421,7 @@ sctp_add_ip(sctp_t *sctp, const void *addrs, uint32_t cnt)
uint16_t type = htons(PARM_ADD_IP);
boolean_t v4mapped = B_FALSE;
sctp_cl_ainfo_t *ainfo = NULL;
+ conn_t *connp = sctp->sctp_connp;
/* Does the peer understand ASCONF and Add-IP? */
if (!sctp->sctp_understands_asconf || !sctp->sctp_understands_addip)
@@ -1453,7 +1457,7 @@ sctp_add_ip(sctp_t *sctp, const void *addrs, uint32_t cnt)
* o Must be part of the association
*/
for (i = 0; i < cnt; i++) {
- switch (sctp->sctp_family) {
+ switch (connp->conn_family) {
case AF_INET:
sin4 = (struct sockaddr_in *)addrs + i;
v4mapped = B_TRUE;
@@ -1538,6 +1542,7 @@ sctp_del_ip(sctp_t *sctp, const void *addrs, uint32_t cnt, uchar_t *ulist,
uchar_t *p = ulist;
boolean_t check_lport = B_FALSE;
sctp_stack_t *sctps = sctp->sctp_sctps;
+ conn_t *connp = sctp->sctp_connp;
/* Does the peer understand ASCONF and Add-IP? */
if (sctp->sctp_state <= SCTPS_LISTEN || !sctps->sctps_addip_enabled ||
@@ -1577,10 +1582,11 @@ sctp_del_ip(sctp_t *sctp, const void *addrs, uint32_t cnt, uchar_t *ulist,
for (i = 0; i < cnt; i++) {
ifindex = 0;
- switch (sctp->sctp_family) {
+ switch (connp->conn_family) {
case AF_INET:
sin4 = (struct sockaddr_in *)addrs + i;
- if (check_lport && sin4->sin_port != sctp->sctp_lport) {
+ if (check_lport &&
+ sin4->sin_port != connp->conn_lport) {
error = EINVAL;
goto fail;
}
@@ -1591,7 +1597,7 @@ sctp_del_ip(sctp_t *sctp, const void *addrs, uint32_t cnt, uchar_t *ulist,
case AF_INET6:
sin6 = (struct sockaddr_in6 *)addrs + i;
if (check_lport &&
- sin6->sin6_port != sctp->sctp_lport) {
+ sin6->sin6_port != connp->conn_lport) {
error = EINVAL;
goto fail;
}
@@ -1675,7 +1681,7 @@ fail:
for (i = 0; i < addrcnt; i++) {
ifindex = 0;
- switch (sctp->sctp_family) {
+ switch (connp->conn_family) {
case AF_INET:
sin4 = (struct sockaddr_in *)addrs + i;
IN6_INADDR_TO_V4MAPPED(&(sin4->sin_addr), &addr);
@@ -1697,7 +1703,7 @@ fail:
}
int
-sctp_set_peerprim(sctp_t *sctp, const void *inp, uint_t inlen)
+sctp_set_peerprim(sctp_t *sctp, const void *inp)
{
const struct sctp_setprim *prim = inp;
const struct sockaddr_storage *ss;
@@ -1717,9 +1723,6 @@ sctp_set_peerprim(sctp_t *sctp, const void *inp, uint_t inlen)
return (EOPNOTSUPP);
}
- if (inlen < sizeof (*prim))
- return (EINVAL);
-
/* Don't do anything if we are not connected */
if (sctp->sctp_state != SCTPS_ESTABLISHED)
return (EINVAL);
diff --git a/usr/src/uts/common/inet/sctp/sctp_asconf.h b/usr/src/uts/common/inet/sctp/sctp_asconf.h
index 8940aa00bc..221172d7bb 100644
--- a/usr/src/uts/common/inet/sctp/sctp_asconf.h
+++ b/usr/src/uts/common/inet/sctp/sctp_asconf.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _INET_SCTP_SCTP_ASCONF_H
#define _INET_SCTP_SCTP_ASCONF_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -57,7 +55,7 @@ extern int sctp_del_ip(sctp_t *, const void *, uint32_t, uchar_t *, size_t);
extern void sctp_asconf_free_cxmit(sctp_t *, sctp_chunk_hdr_t *);
extern void sctp_input_asconf(sctp_t *, sctp_chunk_hdr_t *, sctp_faddr_t *);
extern void sctp_input_asconf_ack(sctp_t *, sctp_chunk_hdr_t *, sctp_faddr_t *);
-extern int sctp_set_peerprim(sctp_t *, const void *, uint_t);
+extern int sctp_set_peerprim(sctp_t *, const void *);
extern void sctp_wput_asconf(sctp_t *, sctp_faddr_t *);
#ifdef __cplusplus
diff --git a/usr/src/uts/common/inet/sctp/sctp_bind.c b/usr/src/uts/common/inet/sctp/sctp_bind.c
index c0c1c7556e..9e0b0e7418 100644
--- a/usr/src/uts/common/inet/sctp/sctp_bind.c
+++ b/usr/src/uts/common/inet/sctp/sctp_bind.c
@@ -56,6 +56,7 @@ static int
sctp_select_port(sctp_t *sctp, in_port_t *requested_port, int *user_specified)
{
sctp_stack_t *sctps = sctp->sctp_sctps;
+ conn_t *connp = sctp->sctp_connp;
/*
* Get a valid port (within the anonymous range and should not
@@ -68,7 +69,7 @@ sctp_select_port(sctp_t *sctp, in_port_t *requested_port, int *user_specified)
if (*requested_port == 0) {
*requested_port = sctp_update_next_port(
sctps->sctps_next_port_to_try,
- crgetzone(sctp->sctp_credp), sctps);
+ crgetzone(connp->conn_cred), sctps);
if (*requested_port == 0)
return (EACCES);
*user_specified = 0;
@@ -101,7 +102,7 @@ sctp_select_port(sctp_t *sctp, in_port_t *requested_port, int *user_specified)
* sctp_bind() should take a cred_t argument so that
* we can use it here.
*/
- if (secpolicy_net_privaddr(sctp->sctp_credp,
+ if (secpolicy_net_privaddr(connp->conn_cred,
*requested_port, IPPROTO_SCTP) != 0) {
dprint(1,
("sctp_bind(x): no prive for port %d",
@@ -120,6 +121,7 @@ sctp_listen(sctp_t *sctp)
{
sctp_tf_t *tf;
sctp_stack_t *sctps = sctp->sctp_sctps;
+ conn_t *connp = sctp->sctp_connp;
RUN_SCTP(sctp);
/*
@@ -138,7 +140,7 @@ sctp_listen(sctp_t *sctp)
int ret;
bzero(&ss, sizeof (ss));
- ss.ss_family = sctp->sctp_family;
+ ss.ss_family = connp->conn_family;
WAKE_SCTP(sctp);
if ((ret = sctp_bind(sctp, (struct sockaddr *)&ss,
@@ -147,12 +149,18 @@ sctp_listen(sctp_t *sctp)
RUN_SCTP(sctp)
}
+ /* Cache things in the ixa without any refhold */
+ connp->conn_ixa->ixa_cred = connp->conn_cred;
+ connp->conn_ixa->ixa_cpid = connp->conn_cpid;
+ if (is_system_labeled())
+ connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred);
+
sctp->sctp_state = SCTPS_LISTEN;
(void) random_get_pseudo_bytes(sctp->sctp_secret, SCTP_SECRET_LEN);
sctp->sctp_last_secret_update = lbolt64;
bzero(sctp->sctp_old_secret, SCTP_SECRET_LEN);
tf = &sctps->sctps_listen_fanout[SCTP_LISTEN_HASH(
- ntohs(sctp->sctp_lport))];
+ ntohs(connp->conn_lport))];
sctp_listen_hash_insert(tf, sctp);
WAKE_SCTP(sctp);
return (0);
@@ -170,6 +178,10 @@ sctp_bind(sctp_t *sctp, struct sockaddr *sa, socklen_t len)
in_port_t requested_port;
in_port_t allocated_port;
int err = 0;
+ conn_t *connp = sctp->sctp_connp;
+ uint_t scope_id;
+ sin_t *sin;
+ sin6_t *sin6;
ASSERT(sctp != NULL);
@@ -188,25 +200,35 @@ sctp_bind(sctp_t *sctp, struct sockaddr *sa, socklen_t len)
switch (sa->sa_family) {
case AF_INET:
+ sin = (sin_t *)sa;
if (len < sizeof (struct sockaddr_in) ||
- sctp->sctp_family == AF_INET6) {
+ connp->conn_family == AF_INET6) {
err = EINVAL;
goto done;
}
- requested_port = ntohs(((struct sockaddr_in *)sa)->sin_port);
+ requested_port = ntohs(sin->sin_port);
break;
case AF_INET6:
+ sin6 = (sin6_t *)sa;
if (len < sizeof (struct sockaddr_in6) ||
- sctp->sctp_family == AF_INET) {
+ connp->conn_family == AF_INET) {
err = EINVAL;
goto done;
}
- requested_port = ntohs(((struct sockaddr_in6 *)sa)->sin6_port);
+ requested_port = ntohs(sin6->sin6_port);
/* Set the flowinfo. */
- sctp->sctp_ip6h->ip6_vcf =
- (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) |
- (((struct sockaddr_in6 *)sa)->sin6_flowinfo &
- ~IPV6_VERS_AND_FLOW_MASK);
+ connp->conn_flowinfo =
+ sin6->sin6_flowinfo & ~IPV6_VERS_AND_FLOW_MASK;
+
+ scope_id = sin6->sin6_scope_id;
+ if (scope_id != 0 && IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
+ connp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET;
+ connp->conn_ixa->ixa_scopeid = scope_id;
+ connp->conn_incoming_ifindex = scope_id;
+ } else {
+ connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
+ connp->conn_incoming_ifindex = connp->conn_bound_if;
+ }
break;
default:
err = EAFNOSUPPORT;
@@ -247,7 +269,7 @@ sctp_bindx(sctp_t *sctp, const void *addrs, int addrcnt, int bindop)
switch (bindop) {
case SCTP_BINDX_ADD_ADDR:
return (sctp_bind_add(sctp, addrs, addrcnt, B_FALSE,
- sctp->sctp_lport));
+ sctp->sctp_connp->conn_lport));
case SCTP_BINDX_REM_ADDR:
return (sctp_bind_del(sctp, addrs, addrcnt, B_FALSE));
default:
@@ -265,6 +287,7 @@ sctp_bind_add(sctp_t *sctp, const void *addrs, uint32_t addrcnt,
int err = 0;
boolean_t do_asconf = B_FALSE;
sctp_stack_t *sctps = sctp->sctp_sctps;
+ conn_t *connp = sctp->sctp_connp;
if (!caller_hold_lock)
RUN_SCTP(sctp);
@@ -329,7 +352,7 @@ sctp_bind_add(sctp_t *sctp, const void *addrs, uint32_t addrcnt,
return (err);
}
ASSERT(addrlist != NULL);
- (*cl_sctp_check_addrs)(sctp->sctp_family, port, &addrlist,
+ (*cl_sctp_check_addrs)(connp->conn_family, port, &addrlist,
size, &addrcnt, unspec == 1);
if (addrcnt == 0) {
/* We free the list */
@@ -345,8 +368,8 @@ sctp_bind_add(sctp_t *sctp, const void *addrs, uint32_t addrcnt,
err = sctp_valid_addr_list(sctp, addrlist, addrcnt, llist,
lsize);
if (err == 0 && do_listen) {
- (*cl_sctp_listen)(sctp->sctp_family, llist,
- addrcnt, sctp->sctp_lport);
+ (*cl_sctp_listen)(connp->conn_family, llist,
+ addrcnt, connp->conn_lport);
/* list will be freed by the clustering module */
} else if (err != 0 && llist != NULL) {
kmem_free(llist, lsize);
@@ -373,8 +396,6 @@ sctp_bind_add(sctp_t *sctp, const void *addrs, uint32_t addrcnt,
}
if (!caller_hold_lock)
WAKE_SCTP(sctp);
- if (do_asconf)
- sctp_process_sendq(sctp);
return (0);
}
@@ -390,6 +411,7 @@ sctp_bind_del(sctp_t *sctp, const void *addrs, uint32_t addrcnt,
uchar_t *ulist = NULL;
size_t usize = 0;
sctp_stack_t *sctps = sctp->sctp_sctps;
+ conn_t *connp = sctp->sctp_connp;
if (!caller_hold_lock)
RUN_SCTP(sctp);
@@ -439,14 +461,12 @@ sctp_bind_del(sctp_t *sctp, const void *addrs, uint32_t addrcnt,
/* ulist will be non-NULL only if cl_sctp_unlisten is non-NULL */
if (ulist != NULL) {
ASSERT(cl_sctp_unlisten != NULL);
- (*cl_sctp_unlisten)(sctp->sctp_family, ulist, addrcnt,
- sctp->sctp_lport);
+ (*cl_sctp_unlisten)(connp->conn_family, ulist, addrcnt,
+ connp->conn_lport);
/* ulist will be freed by the clustering module */
}
if (!caller_hold_lock)
WAKE_SCTP(sctp);
- if (do_asconf)
- sctp_process_sendq(sctp);
return (error);
}
@@ -473,9 +493,10 @@ sctp_bindi(sctp_t *sctp, in_port_t port, boolean_t bind_to_req_port_only,
int count = 0;
/* maximum number of times to run around the loop */
int loopmax;
- zoneid_t zoneid = sctp->sctp_zoneid;
- zone_t *zone = crgetzone(sctp->sctp_credp);
sctp_stack_t *sctps = sctp->sctp_sctps;
+ conn_t *connp = sctp->sctp_connp;
+ zone_t *zone = crgetzone(connp->conn_cred);
+ zoneid_t zoneid = connp->conn_zoneid;
/*
* Lookup for free addresses is done in a loop and "loopmax"
@@ -523,8 +544,9 @@ sctp_bindi(sctp_t *sctp, in_port_t port, boolean_t bind_to_req_port_only,
mutex_enter(&tbf->tf_lock);
for (lsctp = tbf->tf_sctp; lsctp != NULL;
lsctp = lsctp->sctp_bind_hash) {
+ conn_t *lconnp = lsctp->sctp_connp;
- if (lport != lsctp->sctp_lport ||
+ if (lport != lconnp->conn_lport ||
lsctp->sctp_state < SCTPS_BOUND)
continue;
@@ -534,14 +556,14 @@ sctp_bindi(sctp_t *sctp, in_port_t port, boolean_t bind_to_req_port_only,
* privilege as being in all zones, as there's
* otherwise no way to identify the right receiver.
*/
- if (lsctp->sctp_zoneid != zoneid &&
- lsctp->sctp_mac_mode == CONN_MAC_DEFAULT &&
- sctp->sctp_mac_mode == CONN_MAC_DEFAULT)
+ if (lconnp->conn_zoneid != zoneid &&
+ lconnp->conn_mac_mode == CONN_MAC_DEFAULT &&
+ connp->conn_mac_mode == CONN_MAC_DEFAULT)
continue;
addrcmp = sctp_compare_saddrs(sctp, lsctp);
if (addrcmp != SCTP_ADDR_DISJOINT) {
- if (!sctp->sctp_reuseaddr) {
+ if (!connp->conn_reuseaddr) {
/* in use */
break;
} else if (lsctp->sctp_state == SCTPS_BOUND ||
@@ -563,10 +585,9 @@ sctp_bindi(sctp_t *sctp, in_port_t port, boolean_t bind_to_req_port_only,
/* The port number is busy */
mutex_exit(&tbf->tf_lock);
} else {
- conn_t *connp = sctp->sctp_connp;
-
if (is_system_labeled()) {
mlp_type_t addrtype, mlptype;
+ uint_t ipversion;
/*
* On a labeled system we must check the type
@@ -575,11 +596,16 @@ sctp_bindi(sctp_t *sctp, in_port_t port, boolean_t bind_to_req_port_only,
* and that the user's requested binding
* is permitted.
*/
+ if (connp->conn_family == AF_INET)
+ ipversion = IPV4_VERSION;
+ else
+ ipversion = IPV6_VERSION;
+
addrtype = tsol_mlp_addr_type(
connp->conn_allzones ? ALL_ZONES :
zone->zone_id,
- sctp->sctp_ipversion,
- sctp->sctp_ipversion == IPV4_VERSION ?
+ ipversion,
+ connp->conn_family == AF_INET ?
(void *)&sctp->sctp_ipha->ipha_src :
(void *)&sctp->sctp_ip6h->ip6_src,
sctps->sctps_netstack->netstack_ip);
@@ -631,8 +657,7 @@ sctp_bindi(sctp_t *sctp, in_port_t port, boolean_t bind_to_req_port_only,
* number.
*/
sctp->sctp_state = SCTPS_BOUND;
- sctp->sctp_lport = lport;
- sctp->sctp_sctph->sh_sport = lport;
+ connp->conn_lport = lport;
ASSERT(&sctps->sctps_bind_fanout[
SCTP_BIND_HASH(port)] == tbf);
diff --git a/usr/src/uts/common/inet/sctp/sctp_common.c b/usr/src/uts/common/inet/sctp/sctp_common.c
index 3486ba1150..b518eb3981 100644
--- a/usr/src/uts/common/inet/sctp/sctp_common.c
+++ b/usr/src/uts/common/inet/sctp/sctp_common.c
@@ -44,6 +44,8 @@
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/ip_ire.h>
+#include <inet/ip_if.h>
+#include <inet/ip_ndp.h>
#include <inet/mib2.h>
#include <inet/nd.h>
#include <inet/optcom.h>
@@ -57,7 +59,7 @@
static struct kmem_cache *sctp_kmem_faddr_cache;
static void sctp_init_faddr(sctp_t *, sctp_faddr_t *, in6_addr_t *, mblk_t *);
-/* Set the source address. Refer to comments in sctp_get_ire(). */
+/* Set the source address. Refer to comments in sctp_get_dest(). */
void
sctp_set_saddr(sctp_t *sctp, sctp_faddr_t *fp)
{
@@ -68,7 +70,7 @@ sctp_set_saddr(sctp_t *sctp, sctp_faddr_t *fp)
/*
* If there is no source address avaialble, mark this peer address
* as unreachable for now. When the heartbeat timer fires, it will
- * call sctp_get_ire() to re-check if there is any source address
+ * call sctp_get_dest() to re-check if there is any source address
* available.
*/
if (!addr_set)
@@ -76,25 +78,31 @@ sctp_set_saddr(sctp_t *sctp, sctp_faddr_t *fp)
}
/*
- * Call this function to update the cached IRE of a peer addr fp.
+ * Call this function to get information about a peer addr fp.
+ *
+ * Uses ip_attr_connect to avoid explicit use of ire and source address
+ * selection.
*/
void
-sctp_get_ire(sctp_t *sctp, sctp_faddr_t *fp)
+sctp_get_dest(sctp_t *sctp, sctp_faddr_t *fp)
{
- ire_t *ire;
- ipaddr_t addr4;
in6_addr_t laddr;
+ in6_addr_t nexthop;
sctp_saddr_ipif_t *sp;
int hdrlen;
- ts_label_t *tsl;
sctp_stack_t *sctps = sctp->sctp_sctps;
- ip_stack_t *ipst = sctps->sctps_netstack->netstack_ip;
+ conn_t *connp = sctp->sctp_connp;
+ iulp_t uinfo;
+ uint_t pmtu;
+ int error;
+ uint32_t flags = IPDF_VERIFY_DST | IPDF_IPSEC |
+ IPDF_SELECT_SRC | IPDF_UNIQUE_DCE;
- /* Remove the previous cache IRE */
- if ((ire = fp->ire) != NULL) {
- IRE_REFRELE_NOTR(ire);
- fp->ire = NULL;
- }
+ /*
+ * Tell sctp_make_mp it needs to call us again should we not
+ * complete and set the saddr.
+ */
+ fp->saddr = ipv6_all_zeros;
/*
* If this addr is not reachable, mark it as unconfirmed for now, the
@@ -105,29 +113,28 @@ sctp_get_ire(sctp_t *sctp, sctp_faddr_t *fp)
fp->state = SCTP_FADDRS_UNCONFIRMED;
}
- tsl = crgetlabel(CONN_CRED(sctp->sctp_connp));
+ /*
+ * Socket is connected - enable PMTU discovery.
+ */
+ if (!sctps->sctps_ignore_path_mtu)
+ fp->ixa->ixa_flags |= IXAF_PMTU_DISCOVERY;
- if (fp->isv4) {
- IN6_V4MAPPED_TO_IPADDR(&fp->faddr, addr4);
- ire = ire_cache_lookup(addr4, sctp->sctp_zoneid, tsl, ipst);
- if (ire != NULL)
- IN6_IPADDR_TO_V4MAPPED(ire->ire_src_addr, &laddr);
- } else {
- ire = ire_cache_lookup_v6(&fp->faddr, sctp->sctp_zoneid, tsl,
- ipst);
- if (ire != NULL)
- laddr = ire->ire_src_addr_v6;
- }
+ ip_attr_nexthop(&connp->conn_xmit_ipp, fp->ixa, &fp->faddr,
+ &nexthop);
- if (ire == NULL) {
- dprint(3, ("ire2faddr: no ire for %x:%x:%x:%x\n",
+ laddr = fp->saddr;
+ error = ip_attr_connect(connp, fp->ixa, &laddr, &fp->faddr, &nexthop,
+ connp->conn_fport, &laddr, &uinfo, flags);
+
+ if (error != 0) {
+ dprint(3, ("sctp_get_dest: no ire for %x:%x:%x:%x\n",
SCTP_PRINTADDR(fp->faddr)));
/*
* It is tempting to just leave the src addr
* unspecified and let IP figure it out, but we
* *cannot* do this, since IP may choose a src addr
* that is not part of this association... unless
- * this sctp has bound to all addrs. So if the ire
+ * this sctp has bound to all addrs. So if the dest
* lookup fails, try to find one in our src addr
* list, unless the sctp has bound to all addrs, in
* which case we change the src addr to unspec.
@@ -144,56 +151,44 @@ sctp_get_ire(sctp_t *sctp, sctp_faddr_t *fp)
return;
goto check_current;
}
+ ASSERT(fp->ixa->ixa_ire != NULL);
+ ASSERT(!(fp->ixa->ixa_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)));
+
+ if (!sctp->sctp_loopback)
+ sctp->sctp_loopback = uinfo.iulp_loopback;
/* Make sure the laddr is part of this association */
- if ((sp = sctp_saddr_lookup(sctp, &ire->ire_ipif->ipif_v6lcl_addr,
- 0)) != NULL && !sp->saddr_ipif_dontsrc) {
+ if ((sp = sctp_saddr_lookup(sctp, &laddr, 0)) != NULL &&
+ !sp->saddr_ipif_dontsrc) {
if (sp->saddr_ipif_unconfirmed == 1)
sp->saddr_ipif_unconfirmed = 0;
+ /* We did IPsec policy lookup for laddr already */
fp->saddr = laddr;
} else {
- dprint(2, ("ire2faddr: src addr is not part of assc\n"));
+ dprint(2, ("sctp_get_dest: src addr is not part of assoc "
+ "%x:%x:%x:%x\n", SCTP_PRINTADDR(laddr)));
/*
* Set the src to the first saddr and hope for the best.
- * Note that we will still do the ire caching below.
- * Otherwise, whenever we send a packet, we need to do
- * the ire lookup again and still may not get the correct
- * source address. Note that this case should very seldomly
+ * Note that this case should very seldomly
* happen. One scenario this can happen is an app
* explicitly bind() to an address. But that address is
* not the preferred source address to send to the peer.
*/
sctp_set_saddr(sctp, fp);
if (fp->state == SCTP_FADDRS_UNREACH) {
- IRE_REFRELE(ire);
return;
}
}
/*
- * Note that ire_cache_lookup_*() returns an ire with the tracing
- * bits enabled. This requires the thread holding the ire also
- * do the IRE_REFRELE(). Thus we need to do IRE_REFHOLD_NOTR()
- * and then IRE_REFRELE() the ire here to make the tracing bits
- * work.
- */
- IRE_REFHOLD_NOTR(ire);
- IRE_REFRELE(ire);
-
- /* Cache the IRE */
- fp->ire = ire;
- if (fp->ire->ire_type == IRE_LOOPBACK && !sctp->sctp_loopback)
- sctp->sctp_loopback = 1;
-
- /*
* Pull out RTO information for this faddr and use it if we don't
* have any yet.
*/
- if (fp->srtt == -1 && ire->ire_uinfo.iulp_rtt != 0) {
+ if (fp->srtt == -1 && uinfo.iulp_rtt != 0) {
/* The cached value is in ms. */
- fp->srtt = MSEC_TO_TICK(ire->ire_uinfo.iulp_rtt);
- fp->rttvar = MSEC_TO_TICK(ire->ire_uinfo.iulp_rtt_sd);
+ fp->srtt = MSEC_TO_TICK(uinfo.iulp_rtt);
+ fp->rttvar = MSEC_TO_TICK(uinfo.iulp_rtt_sd);
fp->rto = 3 * fp->srtt;
/* Bound the RTO by configured min and max values */
@@ -205,6 +200,7 @@ sctp_get_ire(sctp_t *sctp, sctp_faddr_t *fp)
}
SCTP_MAX_RTO(sctp, fp);
}
+ pmtu = uinfo.iulp_mtu;
/*
* Record the MTU for this faddr. If the MTU for this faddr has
@@ -215,9 +211,9 @@ sctp_get_ire(sctp_t *sctp, sctp_faddr_t *fp)
} else {
hdrlen = sctp->sctp_hdr6_len;
}
- if ((fp->sfa_pmss + hdrlen) != ire->ire_max_frag) {
+ if ((fp->sfa_pmss + hdrlen) != pmtu) {
/* Make sure that sfa_pmss is a multiple of SCTP_ALIGN. */
- fp->sfa_pmss = (ire->ire_max_frag - hdrlen) & ~(SCTP_ALIGN - 1);
+ fp->sfa_pmss = (pmtu - hdrlen) & ~(SCTP_ALIGN - 1);
if (fp->cwnd < (fp->sfa_pmss * 2)) {
SET_CWND(fp, fp->sfa_pmss,
sctps->sctps_slow_start_initial);
@@ -230,28 +226,16 @@ check_current:
}
void
-sctp_update_ire(sctp_t *sctp)
+sctp_update_dce(sctp_t *sctp)
{
- ire_t *ire;
sctp_faddr_t *fp;
sctp_stack_t *sctps = sctp->sctp_sctps;
+ iulp_t uinfo;
+ ip_stack_t *ipst = sctps->sctps_netstack->netstack_ip;
+ uint_t ifindex;
for (fp = sctp->sctp_faddrs; fp != NULL; fp = fp->next) {
- if ((ire = fp->ire) == NULL)
- continue;
- mutex_enter(&ire->ire_lock);
-
- /*
- * If the cached IRE is going away, there is no point to
- * update it.
- */
- if (ire->ire_marks & IRE_MARK_CONDEMNED) {
- mutex_exit(&ire->ire_lock);
- IRE_REFRELE_NOTR(ire);
- fp->ire = NULL;
- continue;
- }
-
+ bzero(&uinfo, sizeof (uinfo));
/*
* Only record the PMTU for this faddr if we actually have
* done discovery. This prevents initialized default from
@@ -259,70 +243,60 @@ sctp_update_ire(sctp_t *sctp)
*/
if (fp->pmtu_discovered) {
if (fp->isv4) {
- ire->ire_max_frag = fp->sfa_pmss +
+ uinfo.iulp_mtu = fp->sfa_pmss +
sctp->sctp_hdr_len;
} else {
- ire->ire_max_frag = fp->sfa_pmss +
+ uinfo.iulp_mtu = fp->sfa_pmss +
sctp->sctp_hdr6_len;
}
}
-
if (sctps->sctps_rtt_updates != 0 &&
fp->rtt_updates >= sctps->sctps_rtt_updates) {
/*
- * If there is no old cached values, initialize them
- * conservatively. Set them to be (1.5 * new value).
- * This code copied from ip_ire_advise(). The cached
- * value is in ms.
+ * dce_update_uinfo() merges these values with the
+ * old values.
*/
- if (ire->ire_uinfo.iulp_rtt != 0) {
- ire->ire_uinfo.iulp_rtt =
- (ire->ire_uinfo.iulp_rtt +
- TICK_TO_MSEC(fp->srtt)) >> 1;
- } else {
- ire->ire_uinfo.iulp_rtt =
- TICK_TO_MSEC(fp->srtt + (fp->srtt >> 1));
- }
- if (ire->ire_uinfo.iulp_rtt_sd != 0) {
- ire->ire_uinfo.iulp_rtt_sd =
- (ire->ire_uinfo.iulp_rtt_sd +
- TICK_TO_MSEC(fp->rttvar)) >> 1;
+ uinfo.iulp_rtt = TICK_TO_MSEC(fp->srtt);
+ uinfo.iulp_rtt_sd = TICK_TO_MSEC(fp->rttvar);
+ fp->rtt_updates = 0;
+ }
+ ifindex = 0;
+ if (IN6_IS_ADDR_LINKSCOPE(&fp->faddr)) {
+ /*
+ * If we are going to create a DCE we'd better have
+ * an ifindex
+ */
+ if (fp->ixa->ixa_nce != NULL) {
+ ifindex = fp->ixa->ixa_nce->nce_common->
+ ncec_ill->ill_phyint->phyint_ifindex;
} else {
- ire->ire_uinfo.iulp_rtt_sd =
- TICK_TO_MSEC(fp->rttvar +
- (fp->rttvar >> 1));
+ continue;
}
- fp->rtt_updates = 0;
}
- mutex_exit(&ire->ire_lock);
+
+ (void) dce_update_uinfo(&fp->faddr, ifindex, &uinfo, ipst);
}
}
/*
- * The sender must set the total length in the IP header.
- * If sendto == NULL, the current will be used.
+ * The sender must later set the total length in the IP header.
*/
mblk_t *
-sctp_make_mp(sctp_t *sctp, sctp_faddr_t *sendto, int trailer)
+sctp_make_mp(sctp_t *sctp, sctp_faddr_t *fp, int trailer)
{
mblk_t *mp;
size_t ipsctplen;
int isv4;
- sctp_faddr_t *fp;
sctp_stack_t *sctps = sctp->sctp_sctps;
boolean_t src_changed = B_FALSE;
- ASSERT(sctp->sctp_current != NULL || sendto != NULL);
- if (sendto == NULL) {
- fp = sctp->sctp_current;
- } else {
- fp = sendto;
- }
+ ASSERT(fp != NULL);
isv4 = fp->isv4;
- /* Try to look for another IRE again. */
- if (fp->ire == NULL) {
- sctp_get_ire(sctp, fp);
+ if (SCTP_IS_ADDR_UNSPEC(isv4, fp->saddr) ||
+ (fp->ixa->ixa_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
+ /* Need to pick a source */
+ sctp_get_dest(sctp, fp);
/*
* Although we still may not get an IRE, the source address
* may be changed in sctp_get_ire(). Set src_changed to
@@ -334,7 +308,9 @@ sctp_make_mp(sctp_t *sctp, sctp_faddr_t *sendto, int trailer)
/* There is no suitable source address to use, return. */
if (fp->state == SCTP_FADDRS_UNREACH)
return (NULL);
- ASSERT(!SCTP_IS_ADDR_UNSPEC(fp->isv4, fp->saddr));
+
+ ASSERT(fp->ixa->ixa_ire != NULL);
+ ASSERT(!SCTP_IS_ADDR_UNSPEC(isv4, fp->saddr));
if (isv4) {
ipsctplen = sctp->sctp_hdr_len;
@@ -342,8 +318,7 @@ sctp_make_mp(sctp_t *sctp, sctp_faddr_t *sendto, int trailer)
ipsctplen = sctp->sctp_hdr6_len;
}
- mp = allocb_cred(ipsctplen + sctps->sctps_wroff_xtra + trailer,
- CONN_CRED(sctp->sctp_connp), sctp->sctp_cpid);
+ mp = allocb(ipsctplen + sctps->sctps_wroff_xtra + trailer, BPRI_MED);
if (mp == NULL) {
ip1dbg(("sctp_make_mp: error making mp..\n"));
return (NULL);
@@ -377,18 +352,6 @@ sctp_make_mp(sctp_t *sctp, sctp_faddr_t *sendto, int trailer)
}
}
ASSERT(sctp->sctp_connp != NULL);
-
- /*
- * IP will not free this IRE if it is condemned. SCTP needs to
- * free it.
- */
- if ((fp->ire != NULL) && (fp->ire->ire_marks & IRE_MARK_CONDEMNED)) {
- IRE_REFRELE_NOTR(fp->ire);
- fp->ire = NULL;
- }
- /* Stash the conn and ire ptr info. for IP */
- SCTP_STASH_IPINFO(mp, fp->ire);
-
return (mp);
}
@@ -410,17 +373,22 @@ sctp_set_ulp_prop(sctp_t *sctp)
}
ASSERT(sctp->sctp_ulpd);
+ sctp->sctp_connp->conn_wroff = sctps->sctps_wroff_xtra + hdrlen +
+ sizeof (sctp_data_hdr_t);
+
ASSERT(sctp->sctp_current->sfa_pmss == sctp->sctp_mss);
bzero(&sopp, sizeof (sopp));
sopp.sopp_flags = SOCKOPT_MAXBLK|SOCKOPT_WROFF;
- sopp.sopp_wroff = sctps->sctps_wroff_xtra + hdrlen +
- sizeof (sctp_data_hdr_t);
+ sopp.sopp_wroff = sctp->sctp_connp->conn_wroff;
sopp.sopp_maxblk = sctp->sctp_mss - sizeof (sctp_data_hdr_t);
sctp->sctp_ulp_prop(sctp->sctp_ulpd, &sopp);
}
+/*
+ * Set the lengths in the packet and the transmit attributes.
+ */
void
-sctp_set_iplen(sctp_t *sctp, mblk_t *mp)
+sctp_set_iplen(sctp_t *sctp, mblk_t *mp, ip_xmit_attr_t *ixa)
{
uint16_t sum = 0;
ipha_t *iph;
@@ -432,19 +400,15 @@ sctp_set_iplen(sctp_t *sctp, mblk_t *mp)
for (; pmp; pmp = pmp->b_cont)
sum += pmp->b_wptr - pmp->b_rptr;
+ ixa->ixa_pktlen = sum;
if (isv4) {
iph = (ipha_t *)mp->b_rptr;
iph->ipha_length = htons(sum);
+ ixa->ixa_ip_hdr_length = sctp->sctp_ip_hdr_len;
} else {
ip6h = (ip6_t *)mp->b_rptr;
- /*
- * If an ip6i_t is present, the real IPv6 header
- * immediately follows.
- */
- if (ip6h->ip6_nxt == IPPROTO_RAW)
- ip6h = (ip6_t *)&ip6h[1];
- ip6h->ip6_plen = htons(sum - ((char *)&sctp->sctp_ip6h[1] -
- sctp->sctp_iphc6));
+ ip6h->ip6_plen = htons(sum - IPV6_HDR_LEN);
+ ixa->ixa_ip_hdr_length = sctp->sctp_ip_hdr6_len;
}
}
@@ -501,21 +465,21 @@ sctp_add_faddr(sctp_t *sctp, in6_addr_t *addr, int sleep, boolean_t first)
sctp_faddr_t *faddr;
mblk_t *timer_mp;
int err;
+ conn_t *connp = sctp->sctp_connp;
if (is_system_labeled()) {
- cred_t *effective_cred;
+ ip_xmit_attr_t *ixa = connp->conn_ixa;
+ ts_label_t *effective_tsl = NULL;
+
+ ASSERT(ixa->ixa_tsl != NULL);
/*
* Verify the destination is allowed to receive packets
* at the security label of the connection we are initiating.
*
- * tsol_check_dest() will create a new effective cred for
+ * tsol_check_dest() will create a new effective label for
* this connection with a modified label or label flags only
- * if there are changes from the original cred.
- *
- * conn_effective_cred may be non-NULL if a previous
- * faddr was already added or if this is a server
- * accepting a connection on a multi-label port.
+ * if there are changes from the original label.
*
* Accept whatever label we get if this is the first
* destination address for this connection. The security
@@ -525,27 +489,28 @@ sctp_add_faddr(sctp_t *sctp, in6_addr_t *addr, int sleep, boolean_t first)
if (IN6_IS_ADDR_V4MAPPED(addr)) {
uint32_t dst;
IN6_V4MAPPED_TO_IPADDR(addr, dst);
- err = tsol_check_dest(CONN_CRED(sctp->sctp_connp),
- &dst, IPV4_VERSION, sctp->sctp_mac_mode,
- &effective_cred);
+ err = tsol_check_dest(ixa->ixa_tsl,
+ &dst, IPV4_VERSION, connp->conn_mac_mode,
+ connp->conn_zone_is_global, &effective_tsl);
} else {
- err = tsol_check_dest(CONN_CRED(sctp->sctp_connp),
- addr, IPV6_VERSION, sctp->sctp_mac_mode,
- &effective_cred);
+ err = tsol_check_dest(ixa->ixa_tsl,
+ addr, IPV6_VERSION, connp->conn_mac_mode,
+ connp->conn_zone_is_global, &effective_tsl);
}
if (err != 0)
return (err);
- if (sctp->sctp_faddrs == NULL &&
- sctp->sctp_connp->conn_effective_cred == NULL) {
- sctp->sctp_connp->conn_effective_cred = effective_cred;
- } else if (effective_cred != NULL) {
- crfree(effective_cred);
+
+ if (sctp->sctp_faddrs == NULL && effective_tsl != NULL) {
+ ip_xmit_attr_replace_tsl(ixa, effective_tsl);
+ } else if (effective_tsl != NULL) {
+ label_rele(effective_tsl);
return (EHOSTUNREACH);
}
}
if ((faddr = kmem_cache_alloc(sctp_kmem_faddr_cache, sleep)) == NULL)
return (ENOMEM);
+ bzero(faddr, sizeof (*faddr));
timer_mp = sctp_timer_alloc((sctp), sctp_rexmit_timer, sleep);
if (timer_mp == NULL) {
kmem_cache_free(sctp_kmem_faddr_cache, faddr);
@@ -553,16 +518,19 @@ sctp_add_faddr(sctp_t *sctp, in6_addr_t *addr, int sleep, boolean_t first)
}
((sctpt_t *)(timer_mp->b_rptr))->sctpt_faddr = faddr;
- sctp_init_faddr(sctp, faddr, addr, timer_mp);
-
- /* Check for subnet broadcast. */
- if (faddr->ire != NULL && faddr->ire->ire_type & IRE_BROADCAST) {
- IRE_REFRELE_NOTR(faddr->ire);
- sctp_timer_free(timer_mp);
- faddr->timer_mp = NULL;
+ /* Start with any options set on the conn */
+ faddr->ixa = conn_get_ixa_exclusive(connp);
+ if (faddr->ixa == NULL) {
+ freemsg(timer_mp);
kmem_cache_free(sctp_kmem_faddr_cache, faddr);
- return (EADDRNOTAVAIL);
+ return (ENOMEM);
}
+ faddr->ixa->ixa_notify_cookie = connp->conn_sctp;
+
+ sctp_init_faddr(sctp, faddr, addr, timer_mp);
+ ASSERT(faddr->ixa->ixa_cred != NULL);
+
+	/* ip_attr_connect didn't allow broadcast/multicast dest */
ASSERT(faddr->next == NULL);
if (sctp->sctp_faddrs == NULL) {
@@ -644,7 +612,7 @@ sctp_redo_faddr_srcs(sctp_t *sctp)
sctp_faddr_t *fp;
for (fp = sctp->sctp_faddrs; fp != NULL; fp = fp->next) {
- sctp_get_ire(sctp, fp);
+ sctp_get_dest(sctp, fp);
}
}
@@ -662,15 +630,17 @@ sctp_faddr_alive(sctp_t *sctp, sctp_faddr_t *fp)
fp->state = SCTP_FADDRS_ALIVE;
sctp_intf_event(sctp, fp->faddr, SCTP_ADDR_AVAILABLE, 0);
/* Should have a full IRE now */
- sctp_get_ire(sctp, fp);
+ sctp_get_dest(sctp, fp);
/*
* If this is the primary, switch back to it now. And
* we probably want to reset the source addr used to reach
* it.
+ * Note that if we didn't find a source in sctp_get_dest
+ * then we'd be unreachable at this point in time.
*/
- if (fp == sctp->sctp_primary) {
- ASSERT(fp->state != SCTP_FADDRS_UNREACH);
+ if (fp == sctp->sctp_primary &&
+ fp->state != SCTP_FADDRS_UNREACH) {
sctp_set_faddr_current(sctp, fp);
return;
}
@@ -816,9 +786,9 @@ sctp_unlink_faddr(sctp_t *sctp, sctp_faddr_t *fp)
fp->rc_timer_mp = NULL;
fp->rc_timer_running = 0;
}
- if (fp->ire != NULL) {
- IRE_REFRELE_NOTR(fp->ire);
- fp->ire = NULL;
+ if (fp->ixa != NULL) {
+ ixa_refrele(fp->ixa);
+ fp->ixa = NULL;
}
if (fp == sctp->sctp_faddrs) {
@@ -837,7 +807,6 @@ gotit:
fpp->next = fp->next;
}
mutex_exit(&sctp->sctp_conn_tfp->tf_lock);
- /* XXX faddr2ire? */
kmem_cache_free(sctp_kmem_faddr_cache, fp);
sctp->sctp_nfaddrs--;
}
@@ -866,8 +835,10 @@ sctp_zap_faddrs(sctp_t *sctp, int caller_holds_lock)
for (fp = sctp->sctp_faddrs; fp; fp = fpn) {
fpn = fp->next;
- if (fp->ire != NULL)
- IRE_REFRELE_NOTR(fp->ire);
+ if (fp->ixa != NULL) {
+ ixa_refrele(fp->ixa);
+ fp->ixa = NULL;
+ }
kmem_cache_free(sctp_kmem_faddr_cache, fp);
sctp->sctp_nfaddrs--;
}
@@ -888,242 +859,177 @@ sctp_zap_addrs(sctp_t *sctp)
}
/*
- * Initialize the IPv4 header. Loses any record of any IP options.
+ * Build two SCTP header templates; one for IPv4 and one for IPv6.
+ * Store them in sctp_iphc and sctp_iphc6 respectively (and related fields).
+ * There are no IP addresses in the templates, but the port numbers and
+ * verifier are filled in from the conn_t and sctp_t.
+ *
+ * Returns failure if can't allocate memory, or if there is a problem
+ * with a routing header/option.
+ *
+ * We allocate space for the minimum sctp header (sctp_hdr_t).
+ *
+ * We massage a routing option/header. There is no checksum implication
+ * for a routing header for sctp.
+ *
+ * Caller needs to update conn_wroff if desired.
+ *
+ * TSol notes: This assumes that a SCTP association has a single peer label
+ * since we only track a single pair of ipp_label_v4/v6 and not a separate one
+ * for each faddr.
*/
int
-sctp_header_init_ipv4(sctp_t *sctp, int sleep)
+sctp_build_hdrs(sctp_t *sctp, int sleep)
{
+ conn_t *connp = sctp->sctp_connp;
+ ip_pkt_t *ipp = &connp->conn_xmit_ipp;
+ uint_t ip_hdr_length;
+ uchar_t *hdrs;
+ uint_t hdrs_len;
+ uint_t ulp_hdr_length = sizeof (sctp_hdr_t);
+ ipha_t *ipha;
+ ip6_t *ip6h;
sctp_hdr_t *sctph;
- sctp_stack_t *sctps = sctp->sctp_sctps;
+ in6_addr_t v6src, v6dst;
+ ipaddr_t v4src, v4dst;
- /*
- * This is a simple initialization. If there's
- * already a template, it should never be too small,
- * so reuse it. Otherwise, allocate space for the new one.
- */
- if (sctp->sctp_iphc != NULL) {
- ASSERT(sctp->sctp_iphc_len >= SCTP_MAX_COMBINED_HEADER_LENGTH);
- bzero(sctp->sctp_iphc, sctp->sctp_iphc_len);
- } else {
- sctp->sctp_iphc_len = SCTP_MAX_COMBINED_HEADER_LENGTH;
- sctp->sctp_iphc = kmem_zalloc(sctp->sctp_iphc_len, sleep);
- if (sctp->sctp_iphc == NULL) {
- sctp->sctp_iphc_len = 0;
- return (ENOMEM);
- }
- }
+ v4src = connp->conn_saddr_v4;
+ v4dst = connp->conn_faddr_v4;
+ v6src = connp->conn_saddr_v6;
+ v6dst = connp->conn_faddr_v6;
- sctp->sctp_ipha = (ipha_t *)sctp->sctp_iphc;
+ /* First do IPv4 header */
+ ip_hdr_length = ip_total_hdrs_len_v4(ipp);
- sctp->sctp_hdr_len = sizeof (ipha_t) + sizeof (sctp_hdr_t);
- sctp->sctp_ip_hdr_len = sizeof (ipha_t);
- sctp->sctp_ipha->ipha_length = htons(sizeof (ipha_t) +
- sizeof (sctp_hdr_t));
- sctp->sctp_ipha->ipha_version_and_hdr_length =
- (IP_VERSION << 4) | IP_SIMPLE_HDR_LENGTH_IN_WORDS;
+ /* In case of TX label and IP options it can be too much */
+ if (ip_hdr_length > IP_MAX_HDR_LENGTH) {
+ /* Preserves existing TX errno for this */
+ return (EHOSTUNREACH);
+ }
+ hdrs_len = ip_hdr_length + ulp_hdr_length;
+ ASSERT(hdrs_len != 0);
- /*
- * These two fields should be zero, and are already set above.
- *
- * sctp->sctp_ipha->ipha_ident,
- * sctp->sctp_ipha->ipha_fragment_offset_and_flags.
- */
+ if (hdrs_len != sctp->sctp_iphc_len) {
+ /* Allocate new before we free any old */
+ hdrs = kmem_alloc(hdrs_len, sleep);
+ if (hdrs == NULL)
+ return (ENOMEM);
- sctp->sctp_ipha->ipha_ttl = sctps->sctps_ipv4_ttl;
- sctp->sctp_ipha->ipha_protocol = IPPROTO_SCTP;
+ if (sctp->sctp_iphc != NULL)
+ kmem_free(sctp->sctp_iphc, sctp->sctp_iphc_len);
+ sctp->sctp_iphc = hdrs;
+ sctp->sctp_iphc_len = hdrs_len;
+ } else {
+ hdrs = sctp->sctp_iphc;
+ }
+ sctp->sctp_hdr_len = sctp->sctp_iphc_len;
+ sctp->sctp_ip_hdr_len = ip_hdr_length;
- sctph = (sctp_hdr_t *)(sctp->sctp_iphc + sizeof (ipha_t));
+ sctph = (sctp_hdr_t *)(hdrs + ip_hdr_length);
sctp->sctp_sctph = sctph;
-
- return (0);
-}
-
-/*
- * Update sctp_sticky_hdrs based on sctp_sticky_ipp.
- * The headers include ip6i_t (if needed), ip6_t, any sticky extension
- * headers, and the maximum size sctp header (to avoid reallocation
- * on the fly for additional sctp options).
- * Returns failure if can't allocate memory.
- */
-int
-sctp_build_hdrs(sctp_t *sctp)
-{
- char *hdrs;
- uint_t hdrs_len;
- ip6i_t *ip6i;
- char buf[SCTP_MAX_HDR_LENGTH];
- ip6_pkt_t *ipp = &sctp->sctp_sticky_ipp;
- in6_addr_t src;
- in6_addr_t dst;
- sctp_stack_t *sctps = sctp->sctp_sctps;
-
- /*
- * save the existing sctp header and source/dest IP addresses
- */
- bcopy(sctp->sctp_sctph6, buf, sizeof (sctp_hdr_t));
- src = sctp->sctp_ip6h->ip6_src;
- dst = sctp->sctp_ip6h->ip6_dst;
- hdrs_len = ip_total_hdrs_len_v6(ipp) + SCTP_MAX_HDR_LENGTH;
+ sctph->sh_sport = connp->conn_lport;
+ sctph->sh_dport = connp->conn_fport;
+ sctph->sh_verf = sctp->sctp_fvtag;
+ sctph->sh_chksum = 0;
+
+ ipha = (ipha_t *)hdrs;
+ sctp->sctp_ipha = ipha;
+
+ ipha->ipha_src = v4src;
+ ipha->ipha_dst = v4dst;
+ ip_build_hdrs_v4(hdrs, ip_hdr_length, ipp, connp->conn_proto);
+ ipha->ipha_length = htons(hdrs_len);
+ ipha->ipha_fragment_offset_and_flags = 0;
+
+ if (ipp->ipp_fields & IPPF_IPV4_OPTIONS)
+ (void) ip_massage_options(ipha, connp->conn_netstack);
+
+ /* Now IPv6 */
+ ip_hdr_length = ip_total_hdrs_len_v6(ipp);
+ hdrs_len = ip_hdr_length + ulp_hdr_length;
ASSERT(hdrs_len != 0);
- if (hdrs_len > sctp->sctp_iphc6_len) {
- /* Need to reallocate */
- hdrs = kmem_zalloc(hdrs_len, KM_NOSLEEP);
+
+ if (hdrs_len != sctp->sctp_iphc6_len) {
+ /* Allocate new before we free any old */
+ hdrs = kmem_alloc(hdrs_len, sleep);
if (hdrs == NULL)
return (ENOMEM);
- if (sctp->sctp_iphc6_len != 0)
+ if (sctp->sctp_iphc6 != NULL)
kmem_free(sctp->sctp_iphc6, sctp->sctp_iphc6_len);
sctp->sctp_iphc6 = hdrs;
sctp->sctp_iphc6_len = hdrs_len;
- }
- ip_build_hdrs_v6((uchar_t *)sctp->sctp_iphc6,
- hdrs_len - SCTP_MAX_HDR_LENGTH, ipp, IPPROTO_SCTP);
-
- /* Set header fields not in ipp */
- if (ipp->ipp_fields & IPPF_HAS_IP6I) {
- ip6i = (ip6i_t *)sctp->sctp_iphc6;
- sctp->sctp_ip6h = (ip6_t *)&ip6i[1];
} else {
- sctp->sctp_ip6h = (ip6_t *)sctp->sctp_iphc6;
+ hdrs = sctp->sctp_iphc6;
}
- /*
- * sctp->sctp_ip_hdr_len will include ip6i_t if there is one.
- */
- sctp->sctp_ip_hdr6_len = hdrs_len - SCTP_MAX_HDR_LENGTH;
- sctp->sctp_sctph6 = (sctp_hdr_t *)(sctp->sctp_iphc6 +
- sctp->sctp_ip_hdr6_len);
- sctp->sctp_hdr6_len = sctp->sctp_ip_hdr6_len + sizeof (sctp_hdr_t);
-
- bcopy(buf, sctp->sctp_sctph6, sizeof (sctp_hdr_t));
+ sctp->sctp_hdr6_len = sctp->sctp_iphc6_len;
+ sctp->sctp_ip_hdr6_len = ip_hdr_length;
- sctp->sctp_ip6h->ip6_src = src;
- sctp->sctp_ip6h->ip6_dst = dst;
- /*
- * If the hoplimit was not set by ip_build_hdrs_v6(), we need to
- * set it to the default value for SCTP.
- */
- if (!(ipp->ipp_fields & IPPF_UNICAST_HOPS))
- sctp->sctp_ip6h->ip6_hops = sctps->sctps_ipv6_hoplimit;
- /*
- * If we're setting extension headers after a connection
- * has been established, and if we have a routing header
- * among the extension headers, call ip_massage_options_v6 to
- * manipulate the routing header/ip6_dst set the checksum
- * difference in the sctp header template.
- * (This happens in sctp_connect_ipv6 if the routing header
- * is set prior to the connect.)
- */
-
- if ((sctp->sctp_state >= SCTPS_COOKIE_WAIT) &&
- (sctp->sctp_sticky_ipp.ipp_fields & IPPF_RTHDR)) {
- ip6_rthdr_t *rth;
-
- rth = ip_find_rthdr_v6(sctp->sctp_ip6h,
- (uint8_t *)sctp->sctp_sctph6);
+ sctph = (sctp_hdr_t *)(hdrs + ip_hdr_length);
+ sctp->sctp_sctph6 = sctph;
+ sctph->sh_sport = connp->conn_lport;
+ sctph->sh_dport = connp->conn_fport;
+ sctph->sh_verf = sctp->sctp_fvtag;
+ sctph->sh_chksum = 0;
+
+ ip6h = (ip6_t *)hdrs;
+ sctp->sctp_ip6h = ip6h;
+
+ ip6h->ip6_src = v6src;
+ ip6h->ip6_dst = v6dst;
+ ip_build_hdrs_v6(hdrs, ip_hdr_length, ipp, connp->conn_proto,
+ connp->conn_flowinfo);
+ ip6h->ip6_plen = htons(hdrs_len - IPV6_HDR_LEN);
+
+ if (ipp->ipp_fields & IPPF_RTHDR) {
+ uint8_t *end;
+ ip6_rthdr_t *rth;
+
+ end = (uint8_t *)ip6h + ip_hdr_length;
+ rth = ip_find_rthdr_v6(ip6h, end);
if (rth != NULL) {
- (void) ip_massage_options_v6(sctp->sctp_ip6h, rth,
- sctps->sctps_netstack);
+ (void) ip_massage_options_v6(ip6h, rth,
+ connp->conn_netstack);
}
- }
- return (0);
-}
-/*
- * Initialize the IPv6 header. Loses any record of any IPv6 extension headers.
- */
-int
-sctp_header_init_ipv6(sctp_t *sctp, int sleep)
-{
- sctp_hdr_t *sctph;
- sctp_stack_t *sctps = sctp->sctp_sctps;
-
- /*
- * This is a simple initialization. If there's
- * already a template, it should never be too small,
- * so reuse it. Otherwise, allocate space for the new one.
- * Ensure that there is enough space to "downgrade" the sctp_t
- * to an IPv4 sctp_t. This requires having space for a full load
- * of IPv4 options
- */
- if (sctp->sctp_iphc6 != NULL) {
- ASSERT(sctp->sctp_iphc6_len >=
- SCTP_MAX_COMBINED_HEADER_LENGTH);
- bzero(sctp->sctp_iphc6, sctp->sctp_iphc6_len);
- } else {
- sctp->sctp_iphc6_len = SCTP_MAX_COMBINED_HEADER_LENGTH;
- sctp->sctp_iphc6 = kmem_zalloc(sctp->sctp_iphc_len, sleep);
- if (sctp->sctp_iphc6 == NULL) {
- sctp->sctp_iphc6_len = 0;
- return (ENOMEM);
- }
+ /*
+ * Verify that the first hop isn't a mapped address.
+ * Routers along the path need to do this verification
+ * for subsequent hops.
+ */
+ if (IN6_IS_ADDR_V4MAPPED(&ip6h->ip6_dst))
+ return (EADDRNOTAVAIL);
}
- sctp->sctp_hdr6_len = IPV6_HDR_LEN + sizeof (sctp_hdr_t);
- sctp->sctp_ip_hdr6_len = IPV6_HDR_LEN;
- sctp->sctp_ip6h = (ip6_t *)sctp->sctp_iphc6;
-
- /* Initialize the header template */
-
- sctp->sctp_ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
- sctp->sctp_ip6h->ip6_plen = ntohs(sizeof (sctp_hdr_t));
- sctp->sctp_ip6h->ip6_nxt = IPPROTO_SCTP;
- sctp->sctp_ip6h->ip6_hops = sctps->sctps_ipv6_hoplimit;
-
- sctph = (sctp_hdr_t *)(sctp->sctp_iphc6 + IPV6_HDR_LEN);
- sctp->sctp_sctph6 = sctph;
-
return (0);
}
static int
-sctp_v4_label(sctp_t *sctp)
+sctp_v4_label(sctp_t *sctp, sctp_faddr_t *fp)
{
- uchar_t optbuf[IP_MAX_OPT_LENGTH];
- const cred_t *cr = CONN_CRED(sctp->sctp_connp);
- int added;
+ conn_t *connp = sctp->sctp_connp;
- if (tsol_compute_label(cr, sctp->sctp_ipha->ipha_dst, optbuf,
- sctp->sctp_sctps->sctps_netstack->netstack_ip) != 0)
- return (EACCES);
-
- added = tsol_remove_secopt(sctp->sctp_ipha, sctp->sctp_hdr_len);
- if (added == -1)
- return (EACCES);
- sctp->sctp_hdr_len += added;
- sctp->sctp_sctph = (sctp_hdr_t *)((uchar_t *)sctp->sctp_sctph + added);
- sctp->sctp_ip_hdr_len += added;
- if ((sctp->sctp_v4label_len = optbuf[IPOPT_OLEN]) != 0) {
- sctp->sctp_v4label_len = (sctp->sctp_v4label_len + 3) & ~3;
- added = tsol_prepend_option(optbuf, sctp->sctp_ipha,
- sctp->sctp_hdr_len);
- if (added == -1)
- return (EACCES);
- sctp->sctp_hdr_len += added;
- sctp->sctp_sctph = (sctp_hdr_t *)((uchar_t *)sctp->sctp_sctph +
- added);
- sctp->sctp_ip_hdr_len += added;
- }
- return (0);
+ ASSERT(fp->ixa->ixa_flags & IXAF_IS_IPV4);
+ return (conn_update_label(connp, fp->ixa, &fp->faddr,
+ &connp->conn_xmit_ipp));
}
static int
-sctp_v6_label(sctp_t *sctp)
+sctp_v6_label(sctp_t *sctp, sctp_faddr_t *fp)
{
- uchar_t optbuf[TSOL_MAX_IPV6_OPTION];
- const cred_t *cr = CONN_CRED(sctp->sctp_connp);
+ conn_t *connp = sctp->sctp_connp;
- if (tsol_compute_label_v6(cr, &sctp->sctp_ip6h->ip6_dst, optbuf,
- sctp->sctp_sctps->sctps_netstack->netstack_ip) != 0)
- return (EACCES);
- if (tsol_update_sticky(&sctp->sctp_sticky_ipp, &sctp->sctp_v6label_len,
- optbuf) != 0)
- return (EACCES);
- if (sctp_build_hdrs(sctp) != 0)
- return (EACCES);
- return (0);
+ ASSERT(!(fp->ixa->ixa_flags & IXAF_IS_IPV4));
+ return (conn_update_label(connp, fp->ixa, &fp->faddr,
+ &connp->conn_xmit_ipp));
}
/*
* XXX implement more sophisticated logic
+ *
+ * Tsol note: We have already verified the addresses using tsol_check_dest
+ * in sctp_add_faddr, thus no need to redo that here.
+ * We do setup ipp_label_v4 and ipp_label_v6 based on which addresses
+ * we have.
*/
int
sctp_set_hdraddrs(sctp_t *sctp)
@@ -1131,50 +1037,43 @@ sctp_set_hdraddrs(sctp_t *sctp)
sctp_faddr_t *fp;
int gotv4 = 0;
int gotv6 = 0;
+ conn_t *connp = sctp->sctp_connp;
ASSERT(sctp->sctp_faddrs != NULL);
ASSERT(sctp->sctp_nsaddrs > 0);
/* Set up using the primary first */
+ connp->conn_faddr_v6 = sctp->sctp_primary->faddr;
+ /* saddr may be unspec; make_mp() will handle this */
+ connp->conn_saddr_v6 = sctp->sctp_primary->saddr;
+ connp->conn_laddr_v6 = connp->conn_saddr_v6;
if (IN6_IS_ADDR_V4MAPPED(&sctp->sctp_primary->faddr)) {
- IN6_V4MAPPED_TO_IPADDR(&sctp->sctp_primary->faddr,
- sctp->sctp_ipha->ipha_dst);
- /* saddr may be unspec; make_mp() will handle this */
- IN6_V4MAPPED_TO_IPADDR(&sctp->sctp_primary->saddr,
- sctp->sctp_ipha->ipha_src);
- if (!is_system_labeled() || sctp_v4_label(sctp) == 0) {
+ if (!is_system_labeled() ||
+ sctp_v4_label(sctp, sctp->sctp_primary) == 0) {
gotv4 = 1;
- if (sctp->sctp_ipversion == IPV4_VERSION) {
- goto copyports;
+ if (connp->conn_family == AF_INET) {
+ goto done;
}
}
} else {
- sctp->sctp_ip6h->ip6_dst = sctp->sctp_primary->faddr;
- /* saddr may be unspec; make_mp() will handle this */
- sctp->sctp_ip6h->ip6_src = sctp->sctp_primary->saddr;
- if (!is_system_labeled() || sctp_v6_label(sctp) == 0)
+ if (!is_system_labeled() ||
+ sctp_v6_label(sctp, sctp->sctp_primary) == 0) {
gotv6 = 1;
+ }
}
for (fp = sctp->sctp_faddrs; fp; fp = fp->next) {
if (!gotv4 && IN6_IS_ADDR_V4MAPPED(&fp->faddr)) {
- IN6_V4MAPPED_TO_IPADDR(&fp->faddr,
- sctp->sctp_ipha->ipha_dst);
- /* copy in the faddr_t's saddr */
- IN6_V4MAPPED_TO_IPADDR(&fp->saddr,
- sctp->sctp_ipha->ipha_src);
- if (!is_system_labeled() || sctp_v4_label(sctp) == 0) {
+ if (!is_system_labeled() ||
+ sctp_v4_label(sctp, fp) == 0) {
gotv4 = 1;
- if (sctp->sctp_ipversion == IPV4_VERSION ||
- gotv6) {
+ if (connp->conn_family == AF_INET || gotv6) {
break;
}
}
} else if (!gotv6 && !IN6_IS_ADDR_V4MAPPED(&fp->faddr)) {
- sctp->sctp_ip6h->ip6_dst = fp->faddr;
- /* copy in the faddr_t's saddr */
- sctp->sctp_ip6h->ip6_src = fp->saddr;
- if (!is_system_labeled() || sctp_v6_label(sctp) == 0) {
+ if (!is_system_labeled() ||
+ sctp_v6_label(sctp, fp) == 0) {
gotv6 = 1;
if (gotv4)
break;
@@ -1182,16 +1081,10 @@ sctp_set_hdraddrs(sctp_t *sctp)
}
}
-copyports:
+done:
if (!gotv4 && !gotv6)
return (EACCES);
- /* copy in the ports for good measure */
- sctp->sctp_sctph->sh_sport = sctp->sctp_lport;
- sctp->sctp_sctph->sh_dport = sctp->sctp_fport;
-
- sctp->sctp_sctph6->sh_sport = sctp->sctp_lport;
- sctp->sctp_sctph6->sh_dport = sctp->sctp_fport;
return (0);
}
@@ -1343,6 +1236,7 @@ sctp_get_addrparams(sctp_t *sctp, sctp_t *psctp, mblk_t *pkt,
boolean_t check_saddr = B_TRUE;
in6_addr_t curaddr;
sctp_stack_t *sctps = sctp->sctp_sctps;
+ conn_t *connp = sctp->sctp_connp;
if (sctp_options != NULL)
*sctp_options = 0;
@@ -1473,8 +1367,7 @@ sctp_get_addrparams(sctp_t *sctp, sctp_t *psctp, mblk_t *pkt,
if (ta == 0 ||
ta == INADDR_BROADCAST ||
ta == htonl(INADDR_LOOPBACK) ||
- CLASSD(ta) ||
- sctp->sctp_connp->conn_ipv6_v6only) {
+ CLASSD(ta) || connp->conn_ipv6_v6only) {
goto next;
}
IN6_INADDR_TO_V4MAPPED((struct in_addr *)
@@ -1492,7 +1385,7 @@ sctp_get_addrparams(sctp_t *sctp, sctp_t *psctp, mblk_t *pkt,
goto next;
}
} else if (ph->sph_type == htons(PARM_ADDR6) &&
- sctp->sctp_family == AF_INET6) {
+ connp->conn_family == AF_INET6) {
/* An v4 socket should not take v6 addresses. */
if (remaining >= PARM_ADDR6_LEN) {
in6_addr_t *addr6;
@@ -1567,7 +1460,7 @@ next:
}
bcopy(&curaddr, dlist, sizeof (curaddr));
sctp_get_faddr_list(sctp, alist, asize);
- (*cl_sctp_assoc_change)(sctp->sctp_family, alist, asize,
+ (*cl_sctp_assoc_change)(connp->conn_family, alist, asize,
sctp->sctp_nfaddrs, dlist, dsize, 1, SCTP_CL_PADDR,
(cl_sctp_handle_t)sctp);
/* alist and dlist will be freed by the clustering module */
@@ -1581,7 +1474,7 @@ next:
*/
int
sctp_secure_restart_check(mblk_t *pkt, sctp_chunk_hdr_t *ich, uint32_t ports,
- int sleep, sctp_stack_t *sctps)
+ int sleep, sctp_stack_t *sctps, ip_recv_attr_t *ira)
{
sctp_faddr_t *fp, *fphead = NULL;
sctp_parm_hdr_t *ph;
@@ -1696,7 +1589,7 @@ sctp_secure_restart_check(mblk_t *pkt, sctp_chunk_hdr_t *ich, uint32_t ports,
mutex_enter(&tf->tf_lock);
for (sctp = tf->tf_sctp; sctp; sctp = sctp->sctp_conn_hash_next) {
- if (ports != sctp->sctp_ports) {
+ if (ports != sctp->sctp_connp->conn_ports) {
continue;
}
compres = sctp_compare_faddrsets(fphead, sctp->sctp_faddrs);
@@ -1776,7 +1669,8 @@ done:
/* Send off the abort */
sctp_send_abort(sctp, sctp_init2vtag(ich),
- SCTP_ERR_RESTART_NEW_ADDRS, dtail, dlen, pkt, 0, B_TRUE);
+ SCTP_ERR_RESTART_NEW_ADDRS, dtail, dlen, pkt, 0, B_TRUE,
+ ira);
kmem_free(dtail, PARM_ADDR6_LEN * nadded);
}
@@ -1787,6 +1681,10 @@ cleanup:
sctp_faddr_t *fpn;
for (fp = fphead; fp; fp = fpn) {
fpn = fp->next;
+ if (fp->ixa != NULL) {
+ ixa_refrele(fp->ixa);
+ fp->ixa = NULL;
+ }
kmem_cache_free(sctp_kmem_faddr_cache, fp);
}
}
@@ -1850,6 +1748,8 @@ sctp_init_faddr(sctp_t *sctp, sctp_faddr_t *fp, in6_addr_t *addr,
{
sctp_stack_t *sctps = sctp->sctp_sctps;
+ ASSERT(fp->ixa != NULL);
+
bcopy(addr, &fp->faddr, sizeof (*addr));
if (IN6_IS_ADDR_V4MAPPED(addr)) {
fp->isv4 = 1;
@@ -1857,11 +1757,13 @@ sctp_init_faddr(sctp_t *sctp, sctp_faddr_t *fp, in6_addr_t *addr,
fp->sfa_pmss =
(sctps->sctps_initial_mtu - sctp->sctp_hdr_len) &
~(SCTP_ALIGN - 1);
+ fp->ixa->ixa_flags |= IXAF_IS_IPV4;
} else {
fp->isv4 = 0;
fp->sfa_pmss =
(sctps->sctps_initial_mtu - sctp->sctp_hdr6_len) &
~(SCTP_ALIGN - 1);
+ fp->ixa->ixa_flags &= ~IXAF_IS_IPV4;
}
fp->cwnd = sctps->sctps_slow_start_initial * fp->sfa_pmss;
fp->rto = MIN(sctp->sctp_rto_initial, sctp->sctp_init_rto_max);
@@ -1884,14 +1786,13 @@ sctp_init_faddr(sctp_t *sctp, sctp_faddr_t *fp, in6_addr_t *addr,
fp->df = 1;
fp->pmtu_discovered = 0;
fp->next = NULL;
- fp->ire = NULL;
fp->T3expire = 0;
(void) random_get_pseudo_bytes((uint8_t *)&fp->hb_secret,
sizeof (fp->hb_secret));
fp->hb_expiry = lbolt64;
fp->rxt_unacked = 0;
- sctp_get_ire(sctp, fp);
+ sctp_get_dest(sctp, fp);
}
/*ARGSUSED*/
diff --git a/usr/src/uts/common/inet/sctp/sctp_conn.c b/usr/src/uts/common/inet/sctp/sctp_conn.c
index 60c22a3673..7dc048f919 100644
--- a/usr/src/uts/common/inet/sctp/sctp_conn.c
+++ b/usr/src/uts/common/inet/sctp/sctp_conn.c
@@ -64,38 +64,19 @@ sctp_accept_comm(sctp_t *listener, sctp_t *acceptor, mblk_t *cr_pkt,
uint_t sctp_options;
conn_t *aconnp;
conn_t *lconnp;
- cred_t *credp;
- ts_label_t *tslp;
sctp_stack_t *sctps = listener->sctp_sctps;
sctph = (sctp_hdr_t *)(cr_pkt->b_rptr + ip_hdr_len);
ASSERT(OK_32PTR(sctph));
- acceptor->sctp_lport = listener->sctp_lport;
- acceptor->sctp_fport = sctph->sh_sport;
+ aconnp = acceptor->sctp_connp;
+ lconnp = listener->sctp_connp;
+ aconnp->conn_lport = lconnp->conn_lport;
+ aconnp->conn_fport = sctph->sh_sport;
ich = (sctp_chunk_hdr_t *)(iack + 1);
init = (sctp_init_chunk_t *)(ich + 1);
- /*
- * If this is an MLP connection, packets are to be
- * exchanged using the security label of the received
- * Cookie packet instead of the server application's label.
- * Create an effective cred for the connection by attaching
- * the received packet's security label to the server
- * application's cred.
- */
- aconnp = acceptor->sctp_connp;
- lconnp = listener->sctp_connp;
- ASSERT(aconnp->conn_effective_cred == NULL);
- if (lconnp->conn_mlp_type != mlptSingle &&
- (credp = msg_getcred(cr_pkt, NULL)) != NULL &&
- (tslp = crgetlabel(credp)) != NULL) {
- if ((aconnp->conn_effective_cred = copycred_from_tslabel(
- aconnp->conn_cred, tslp, KM_NOSLEEP)) == NULL)
- return (ENOMEM);
- }
-
/* acceptor isn't in any fanouts yet, so don't need to hold locks */
ASSERT(acceptor->sctp_faddrs == NULL);
err = sctp_get_addrparams(acceptor, listener, cr_pkt, ich,
@@ -106,14 +87,15 @@ sctp_accept_comm(sctp_t *listener, sctp_t *acceptor, mblk_t *cr_pkt,
if ((err = sctp_set_hdraddrs(acceptor)) != 0)
return (err);
+ if ((err = sctp_build_hdrs(acceptor, KM_NOSLEEP)) != 0)
+ return (err);
+
if ((sctp_options & SCTP_PRSCTP_OPTION) &&
listener->sctp_prsctp_aware && sctps->sctps_prsctp_enabled) {
acceptor->sctp_prsctp_aware = B_TRUE;
} else {
acceptor->sctp_prsctp_aware = B_FALSE;
}
- /* The new sctp_t is fully bound now. */
- acceptor->sctp_connp->conn_fully_bound = B_TRUE;
/* Get initial TSNs */
acceptor->sctp_ltsn = ntohl(iack->sic_inittsn);
@@ -142,9 +124,9 @@ sctp_accept_comm(sctp_t *listener, sctp_t *acceptor, mblk_t *cr_pkt,
RUN_SCTP(acceptor);
sctp_conn_hash_insert(&sctps->sctps_conn_fanout[
- SCTP_CONN_HASH(sctps, acceptor->sctp_ports)], acceptor, 0);
+ SCTP_CONN_HASH(sctps, aconnp->conn_ports)], acceptor, 0);
sctp_bind_hash_insert(&sctps->sctps_bind_fanout[
- SCTP_BIND_HASH(ntohs(acceptor->sctp_lport))], acceptor, 0);
+ SCTP_BIND_HASH(ntohs(aconnp->conn_lport))], acceptor, 0);
/*
* No need to check for multicast destination since ip will only pass
@@ -170,10 +152,9 @@ sctp_accept_comm(sctp_t *listener, sctp_t *acceptor, mblk_t *cr_pkt,
/* Process the COOKIE packet, mp, directed at the listener 'sctp' */
sctp_t *
sctp_conn_request(sctp_t *sctp, mblk_t *mp, uint_t ifindex, uint_t ip_hdr_len,
- sctp_init_chunk_t *iack, mblk_t *ipsec_mp)
+ sctp_init_chunk_t *iack, ip_recv_attr_t *ira)
{
sctp_t *eager;
- uint_t ipvers;
ip6_t *ip6h;
int err;
conn_t *connp, *econnp;
@@ -181,6 +162,8 @@ sctp_conn_request(sctp_t *sctp, mblk_t *mp, uint_t ifindex, uint_t ip_hdr_len,
struct sock_proto_props sopp;
cred_t *cr;
pid_t cpid;
+ in6_addr_t faddr, laddr;
+ ip_xmit_attr_t *ixa;
/*
* No need to check for duplicate as this is the listener
@@ -189,89 +172,116 @@ sctp_conn_request(sctp_t *sctp, mblk_t *mp, uint_t ifindex, uint_t ip_hdr_len,
* fanout already done cannot find a match, it means that
* there is no duplicate.
*/
- ipvers = IPH_HDR_VERSION(mp->b_rptr);
- ASSERT(ipvers == IPV6_VERSION || ipvers == IPV4_VERSION);
ASSERT(OK_32PTR(mp->b_rptr));
if ((eager = sctp_create_eager(sctp)) == NULL) {
return (NULL);
}
- if (ipvers != IPV4_VERSION) {
- ip6h = (ip6_t *)mp->b_rptr;
- if (IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src))
- eager->sctp_linklocal = 1;
- /*
- * Record ifindex (might be zero) to tie this connection to
- * that interface if either the listener was bound or
- * if the connection is using link-local addresses.
- */
- if (sctp->sctp_bound_if == ifindex ||
- IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src))
- eager->sctp_bound_if = ifindex;
- /*
- * XXX broken. bound_if is always overwritten by statement
- * below. What is the right thing to do here?
- */
- eager->sctp_bound_if = sctp->sctp_bound_if;
- }
-
connp = sctp->sctp_connp;
sctps = sctp->sctp_sctps;
econnp = eager->sctp_connp;
if (connp->conn_policy != NULL) {
- ipsec_in_t *ii;
-
- ASSERT(ipsec_mp != NULL);
- ii = (ipsec_in_t *)(ipsec_mp->b_rptr);
- ASSERT(ii->ipsec_in_policy == NULL);
- IPPH_REFHOLD(connp->conn_policy);
- ii->ipsec_in_policy = connp->conn_policy;
-
- ipsec_mp->b_datap->db_type = IPSEC_POLICY_SET;
- if (!ip_bind_ipsec_policy_set(econnp, ipsec_mp)) {
+ /* Inherit the policy from the listener; use actions from ira */
+ if (!ip_ipsec_policy_inherit(econnp, connp, ira)) {
sctp_close_eager(eager);
BUMP_MIB(&sctps->sctps_mib, sctpListenDrop);
return (NULL);
}
}
- if (ipsec_mp != NULL) {
+ ip6h = (ip6_t *)mp->b_rptr;
+ if (ira->ira_flags & IXAF_IS_IPV4) {
+ ipha_t *ipha;
+
+ ipha = (ipha_t *)ip6h;
+ IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &laddr);
+ IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &faddr);
+ } else {
+ laddr = ip6h->ip6_dst;
+ faddr = ip6h->ip6_src;
+ }
+
+ if (ira->ira_flags & IRAF_IPSEC_SECURE) {
/*
* XXX need to fix the cached policy issue here.
- * We temporarily set the conn_src/conn_rem here so
+ * We temporarily set the conn_laddr/conn_faddr here so
* that IPsec can use it for the latched policy
* selector. This is obvioursly wrong as SCTP can
* use different addresses...
*/
- if (ipvers == IPV4_VERSION) {
- ipha_t *ipha;
-
- ipha = (ipha_t *)mp->b_rptr;
- econnp->conn_src = ipha->ipha_dst;
- econnp->conn_rem = ipha->ipha_src;
- } else {
- econnp->conn_srcv6 = ip6h->ip6_dst;
- econnp->conn_remv6 = ip6h->ip6_src;
- }
+ econnp->conn_laddr_v6 = laddr;
+ econnp->conn_faddr_v6 = faddr;
+ econnp->conn_saddr_v6 = laddr;
}
- if (ipsec_conn_cache_policy(econnp, ipvers == IPV4_VERSION) != 0) {
+ if (ipsec_conn_cache_policy(econnp,
+ (ira->ira_flags & IRAF_IS_IPV4) != 0) != 0) {
sctp_close_eager(eager);
BUMP_MIB(&sctps->sctps_mib, sctpListenDrop);
return (NULL);
}
/* Save for getpeerucred */
- cr = msg_getcred(mp, &cpid);
+ cr = ira->ira_cred;
+ cpid = ira->ira_cpid;
+
+ if (is_system_labeled()) {
+ ip_xmit_attr_t *ixa = econnp->conn_ixa;
+
+ ASSERT(ira->ira_tsl != NULL);
+
+ /* Discard any old label */
+ if (ixa->ixa_free_flags & IXA_FREE_TSL) {
+ ASSERT(ixa->ixa_tsl != NULL);
+ label_rele(ixa->ixa_tsl);
+ ixa->ixa_free_flags &= ~IXA_FREE_TSL;
+ ixa->ixa_tsl = NULL;
+ }
+
+ if ((connp->conn_mlp_type != mlptSingle ||
+ connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
+ ira->ira_tsl != NULL) {
+ /*
+ * If this is an MLP connection or a MAC-Exempt
+ * connection with an unlabeled node, packets are to be
+ * exchanged using the security label of the received
+ * Cookie packet instead of the server application's
+ * label.
+ * tsol_check_dest called from ip_set_destination
+ * might later update TSF_UNLABELED by replacing
+ * ixa_tsl with a new label.
+ */
+ label_hold(ira->ira_tsl);
+ ip_xmit_attr_replace_tsl(ixa, ira->ira_tsl);
+ } else {
+ ixa->ixa_tsl = crgetlabel(econnp->conn_cred);
+ }
+ }
err = sctp_accept_comm(sctp, eager, mp, ip_hdr_len, iack);
- if (err) {
+ if (err != 0) {
sctp_close_eager(eager);
BUMP_MIB(&sctps->sctps_mib, sctpListenDrop);
return (NULL);
}
+ ASSERT(eager->sctp_current->ixa != NULL);
+
+ ixa = eager->sctp_current->ixa;
+ if (!(ira->ira_flags & IXAF_IS_IPV4)) {
+ ASSERT(!(ixa->ixa_flags & IXAF_IS_IPV4));
+
+ if (IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src) ||
+ IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_dst)) {
+ eager->sctp_linklocal = 1;
+
+ ixa->ixa_flags |= IXAF_SCOPEID_SET;
+ ixa->ixa_scopeid = ifindex;
+ econnp->conn_incoming_ifindex = ifindex;
+ }
+ }
+
/*
* On a clustered note send this notification to the clustering
* subsystem.
@@ -299,9 +309,9 @@ sctp_conn_request(sctp_t *sctp, mblk_t *mp, uint_t ifindex, uint_t ip_hdr_len,
/* The clustering module frees these list */
sctp_get_saddr_list(eager, slist, ssize);
sctp_get_faddr_list(eager, flist, fsize);
- (*cl_sctp_connect)(eager->sctp_family, slist,
- eager->sctp_nsaddrs, eager->sctp_lport, flist,
- eager->sctp_nfaddrs, eager->sctp_fport, B_FALSE,
+ (*cl_sctp_connect)(econnp->conn_family, slist,
+ eager->sctp_nsaddrs, econnp->conn_lport, flist,
+ eager->sctp_nfaddrs, econnp->conn_fport, B_FALSE,
(cl_sctp_handle_t)eager);
}
@@ -318,7 +328,7 @@ sctp_conn_request(sctp_t *sctp, mblk_t *mp, uint_t ifindex, uint_t ip_hdr_len,
bzero(&sopp, sizeof (sopp));
sopp.sopp_flags = SOCKOPT_MAXBLK|SOCKOPT_WROFF;
sopp.sopp_maxblk = strmsgsz;
- if (eager->sctp_family == AF_INET) {
+ if (econnp->conn_family == AF_INET) {
sopp.sopp_wroff = sctps->sctps_wroff_xtra +
sizeof (sctp_data_hdr_t) + sctp->sctp_hdr_len;
} else {
@@ -335,7 +345,8 @@ sctp_conn_request(sctp_t *sctp, mblk_t *mp, uint_t ifindex, uint_t ip_hdr_len,
* with an OK ack.
*/
int
-sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen)
+sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen,
+ cred_t *cr, pid_t pid)
{
sin_t *sin;
sin6_t *sin6;
@@ -346,18 +357,18 @@ sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen)
sctp_t *lsctp;
char buf[INET6_ADDRSTRLEN];
int sleep = sctp->sctp_cansleep ? KM_SLEEP : KM_NOSLEEP;
- int hdrlen;
- ip6_rthdr_t *rth;
int err;
sctp_faddr_t *cur_fp;
sctp_stack_t *sctps = sctp->sctp_sctps;
- struct sock_proto_props sopp;
+ conn_t *connp = sctp->sctp_connp;
+ uint_t scope_id = 0;
+ ip_xmit_attr_t *ixa;
/*
* Determine packet type based on type of address passed in
* the request should contain an IPv4 or IPv6 address.
* Make sure that address family matches the type of
- * family of the the address passed down
+ * family of the address passed down.
*/
if (addrlen < sizeof (sin_t)) {
return (EINVAL);
@@ -372,7 +383,7 @@ sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen)
ip0dbg(("sctp_connect: non-unicast\n"));
return (EINVAL);
}
- if (sctp->sctp_connp->conn_ipv6_v6only)
+ if (connp->conn_ipv6_v6only)
return (EAFNOSUPPORT);
/* convert to v6 mapped */
@@ -397,11 +408,6 @@ sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen)
IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &dstaddr);
}
dstport = sin->sin_port;
- if (sin->sin_family == AF_INET) {
- hdrlen = sctp->sctp_hdr_len;
- } else {
- hdrlen = sctp->sctp_hdr6_len;
- }
break;
case AF_INET6:
sin6 = (sin6_t *)dst;
@@ -411,7 +417,7 @@ sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen)
ip0dbg(("sctp_connect: non-unicast\n"));
return (EINVAL);
}
- if (sctp->sctp_connp->conn_ipv6_v6only &&
+ if (connp->conn_ipv6_v6only &&
IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
return (EAFNOSUPPORT);
}
@@ -420,11 +426,13 @@ sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen)
dstaddr = ipv6_loopback;
} else {
dstaddr = sin6->sin6_addr;
- if (IN6_IS_ADDR_LINKLOCAL(&dstaddr))
+ if (IN6_IS_ADDR_LINKLOCAL(&dstaddr)) {
sctp->sctp_linklocal = 1;
+ scope_id = sin6->sin6_scope_id;
+ }
}
dstport = sin6->sin6_port;
- hdrlen = sctp->sctp_hdr6_len;
+ connp->conn_flowinfo = sin6->sin6_flowinfo;
break;
default:
dprint(1, ("sctp_connect: unknown family %d\n",
@@ -437,12 +445,29 @@ sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen)
RUN_SCTP(sctp);
- if (sctp->sctp_family != dst->sa_family ||
- (sctp->sctp_connp->conn_state_flags & CONN_CLOSING)) {
+ if (connp->conn_family != dst->sa_family ||
+ (connp->conn_state_flags & CONN_CLOSING)) {
WAKE_SCTP(sctp);
return (EINVAL);
}
+ /* We update our cred/cpid based on the caller of connect */
+ if (connp->conn_cred != cr) {
+ crhold(cr);
+ crfree(connp->conn_cred);
+ connp->conn_cred = cr;
+ }
+ connp->conn_cpid = pid;
+
+ /* Cache things in conn_ixa without any refhold */
+ ixa = connp->conn_ixa;
+ ixa->ixa_cred = cr;
+ ixa->ixa_cpid = pid;
+ if (is_system_labeled()) {
+ /* We need to restart with a label based on the cred */
+ ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
+ }
+
switch (sctp->sctp_state) {
case SCTPS_IDLE: {
struct sockaddr_storage ss;
@@ -459,7 +484,7 @@ sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen)
ASSERT(sctp->sctp_nsaddrs == 0);
bzero(&ss, sizeof (ss));
- ss.ss_family = sctp->sctp_family;
+ ss.ss_family = connp->conn_family;
WAKE_SCTP(sctp);
if ((err = sctp_bind(sctp, (struct sockaddr *)&ss,
sizeof (ss))) != 0) {
@@ -474,7 +499,7 @@ sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen)
/* do the connect */
/* XXX check for attempt to connect to self */
- sctp->sctp_fport = dstport;
+ connp->conn_fport = dstport;
ASSERT(sctp->sctp_iphc);
ASSERT(sctp->sctp_iphc6);
@@ -487,9 +512,9 @@ sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen)
*/
sctp_conn_hash_remove(sctp);
tbf = &sctps->sctps_conn_fanout[SCTP_CONN_HASH(sctps,
- sctp->sctp_ports)];
+ connp->conn_ports)];
mutex_enter(&tbf->tf_lock);
- lsctp = sctp_lookup(sctp, &dstaddr, tbf, &sctp->sctp_ports,
+ lsctp = sctp_lookup(sctp, &dstaddr, tbf, &connp->conn_ports,
SCTPS_COOKIE_WAIT);
if (lsctp != NULL) {
/* found a duplicate connection */
@@ -498,6 +523,7 @@ sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen)
WAKE_SCTP(sctp);
return (EADDRINUSE);
}
+
/*
* OK; set up the peer addr (this may grow after we get
* the INIT ACK from the peer with additional addresses).
@@ -509,6 +535,7 @@ sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen)
return (err);
}
cur_fp = sctp->sctp_faddrs;
+ ASSERT(cur_fp->ixa != NULL);
/* No valid src addr, return. */
if (cur_fp->state == SCTP_FADDRS_UNREACH) {
@@ -523,6 +550,16 @@ sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen)
sctp_conn_hash_insert(tbf, sctp, 1);
mutex_exit(&tbf->tf_lock);
+ ixa = cur_fp->ixa;
+ ASSERT(ixa->ixa_cred != NULL);
+
+ if (scope_id != 0) {
+ ixa->ixa_flags |= IXAF_SCOPEID_SET;
+ ixa->ixa_scopeid = scope_id;
+ } else {
+ ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
+ }
+
/* initialize composite headers */
if ((err = sctp_set_hdraddrs(sctp)) != 0) {
sctp_conn_hash_remove(sctp);
@@ -530,15 +567,10 @@ sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen)
return (err);
}
- /*
- * Massage a routing header (if present) putting the first hop
- * in ip6_dst.
- */
- rth = ip_find_rthdr_v6(sctp->sctp_ip6h,
- (uint8_t *)sctp->sctp_sctph6);
- if (rth != NULL) {
- (void) ip_massage_options_v6(sctp->sctp_ip6h, rth,
- sctps->sctps_netstack);
+ if ((err = sctp_build_hdrs(sctp, KM_SLEEP)) != 0) {
+ sctp_conn_hash_remove(sctp);
+ WAKE_SCTP(sctp);
+ return (err);
}
/*
@@ -556,9 +588,6 @@ sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen)
/* Mark this address as alive */
cur_fp->state = SCTP_FADDRS_ALIVE;
- /* This sctp_t is fully bound now. */
- sctp->sctp_connp->conn_fully_bound = B_TRUE;
-
/* Send the INIT to the peer */
SCTP_FADDR_TIMER_RESTART(sctp, cur_fp, cur_fp->rto);
sctp->sctp_state = SCTPS_COOKIE_WAIT;
@@ -567,7 +596,7 @@ sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen)
* address list, so take the hash lock.
*/
mutex_enter(&tbf->tf_lock);
- initmp = sctp_init_mp(sctp);
+ initmp = sctp_init_mp(sctp, cur_fp);
if (initmp == NULL) {
mutex_exit(&tbf->tf_lock);
/*
@@ -605,24 +634,20 @@ sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen)
/* The clustering module frees the lists */
sctp_get_saddr_list(sctp, slist, ssize);
sctp_get_faddr_list(sctp, flist, fsize);
- (*cl_sctp_connect)(sctp->sctp_family, slist,
- sctp->sctp_nsaddrs, sctp->sctp_lport,
- flist, sctp->sctp_nfaddrs, sctp->sctp_fport,
+ (*cl_sctp_connect)(connp->conn_family, slist,
+ sctp->sctp_nsaddrs, connp->conn_lport,
+ flist, sctp->sctp_nfaddrs, connp->conn_fport,
B_TRUE, (cl_sctp_handle_t)sctp);
}
- WAKE_SCTP(sctp);
- /* OK to call IP_PUT() here instead of sctp_add_sendq(). */
- CONN_INC_REF(sctp->sctp_connp);
- initmp->b_flag |= MSGHASREF;
- IP_PUT(initmp, sctp->sctp_connp, sctp->sctp_current->isv4);
+ ASSERT(ixa->ixa_cred != NULL);
+ ASSERT(ixa->ixa_ire != NULL);
+
+ (void) conn_ip_output(initmp, ixa);
BUMP_LOCAL(sctp->sctp_opkts);
+ WAKE_SCTP(sctp);
notify_ulp:
- bzero(&sopp, sizeof (sopp));
- sopp.sopp_flags = SOCKOPT_WROFF;
- sopp.sopp_wroff = sctps->sctps_wroff_xtra + hdrlen +
- sizeof (sctp_data_hdr_t);
- sctp->sctp_ulp_prop(sctp->sctp_ulpd, &sopp);
+ sctp_set_ulp_prop(sctp);
return (0);
default:
diff --git a/usr/src/uts/common/inet/sctp/sctp_cookie.c b/usr/src/uts/common/inet/sctp/sctp_cookie.c
index 601938c928..4baf0a7147 100644
--- a/usr/src/uts/common/inet/sctp/sctp_cookie.c
+++ b/usr/src/uts/common/inet/sctp/sctp_cookie.c
@@ -40,6 +40,7 @@
#include <inet/common.h>
#include <inet/ip.h>
#include <inet/ip6.h>
+#include <inet/ipsec_impl.h>
#include <inet/sctp_ip.h>
#include <inet/ipclassifier.h>
#include "sctp_impl.h"
@@ -156,7 +157,7 @@ hmac_md5(uchar_t *text, size_t text_len, uchar_t *key, size_t key_len,
static int
validate_init_params(sctp_t *sctp, sctp_chunk_hdr_t *ch,
sctp_init_chunk_t *init, mblk_t *inmp, sctp_parm_hdr_t **want_cookie,
- mblk_t **errmp, int *supp_af, uint_t *sctp_options)
+ mblk_t **errmp, int *supp_af, uint_t *sctp_options, ip_recv_attr_t *ira)
{
sctp_parm_hdr_t *cph;
sctp_init_chunk_t *ic;
@@ -168,6 +169,7 @@ validate_init_params(sctp_t *sctp, sctp_chunk_hdr_t *ch,
boolean_t got_errchunk = B_FALSE;
uint16_t ptype;
sctp_mpc_t mpc;
+ conn_t *connp = sctp->sctp_connp;
ASSERT(errmp != NULL);
@@ -336,8 +338,8 @@ done:
* is NULL.
*/
if (want_cookie == NULL &&
- ((sctp->sctp_family == AF_INET && !(*supp_af & PARM_SUPP_V4)) ||
- (sctp->sctp_family == AF_INET6 && !(*supp_af & PARM_SUPP_V6) &&
+ ((connp->conn_family == AF_INET && !(*supp_af & PARM_SUPP_V4)) ||
+ (connp->conn_family == AF_INET6 && !(*supp_af & PARM_SUPP_V6) &&
sctp->sctp_connp->conn_ipv6_v6only))) {
dprint(1, ("sctp:validate_init_params: supp addr\n"));
serror = SCTP_ERR_BAD_ADDR;
@@ -353,7 +355,7 @@ cookie_abort:
dprint(1, ("validate_init_params: cookie absent\n"));
sctp_send_abort(sctp, sctp_init2vtag(ch), SCTP_ERR_MISSING_PARM,
- (char *)&mpc, sizeof (sctp_mpc_t), inmp, 0, B_FALSE);
+ (char *)&mpc, sizeof (sctp_mpc_t), inmp, 0, B_FALSE, ira);
return (0);
}
@@ -365,7 +367,7 @@ abort:
return (0);
sctp_send_abort(sctp, sctp_init2vtag(ch), serror, details,
- errlen, inmp, 0, B_FALSE);
+ errlen, inmp, 0, B_FALSE, ira);
return (0);
}
@@ -453,14 +455,17 @@ cl_sctp_cookie_paddr(sctp_chunk_hdr_t *ch, in6_addr_t *addr)
sizeof (sctp_parm_hdr_t) + /* param header */ \
16 /* MD5 hash */
+/*
+ * Note that sctp is the listener, hence we shouldn't modify it.
+ */
void
sctp_send_initack(sctp_t *sctp, sctp_hdr_t *initsh, sctp_chunk_hdr_t *ch,
- mblk_t *initmp)
+ mblk_t *initmp, ip_recv_attr_t *ira)
{
ipha_t *initiph;
ip6_t *initip6h;
- ipha_t *iackiph;
- ip6_t *iackip6h;
+ ipha_t *iackiph = NULL;
+ ip6_t *iackip6h = NULL;
sctp_chunk_hdr_t *iack_ch;
sctp_init_chunk_t *iack;
sctp_init_chunk_t *init;
@@ -485,10 +490,10 @@ sctp_send_initack(sctp_t *sctp, sctp_hdr_t *initsh, sctp_chunk_hdr_t *ch,
mblk_t *errmp = NULL;
boolean_t initcollision = B_FALSE;
boolean_t linklocal = B_FALSE;
- cred_t *cr;
- pid_t pid;
- ts_label_t *initlabel;
sctp_stack_t *sctps = sctp->sctp_sctps;
+ conn_t *connp = sctp->sctp_connp;
+ int err;
+ ip_xmit_attr_t *ixa = NULL;
BUMP_LOCAL(sctp->sctp_ibchunks);
isv4 = (IPH_HDR_VERSION(initmp->b_rptr) == IPV4_VERSION);
@@ -501,21 +506,24 @@ sctp_send_initack(sctp_t *sctp, sctp_hdr_t *initsh, sctp_chunk_hdr_t *ch,
} else {
initip6h = (ip6_t *)initmp->b_rptr;
ipsctplen = sctp->sctp_ip_hdr6_len;
- if (IN6_IS_ADDR_LINKLOCAL(&initip6h->ip6_src))
+ if (IN6_IS_ADDR_LINKLOCAL(&initip6h->ip6_src) ||
+ IN6_IS_ADDR_LINKLOCAL(&initip6h->ip6_dst))
linklocal = B_TRUE;
supp_af |= PARM_SUPP_V6;
+ if (!sctp->sctp_connp->conn_ipv6_v6only)
+ supp_af |= PARM_SUPP_V4;
}
ASSERT(OK_32PTR(initsh));
init = (sctp_init_chunk_t *)((char *)(initsh + 1) + sizeof (*iack_ch));
/* Make sure we like the peer's parameters */
if (validate_init_params(sctp, ch, init, initmp, NULL, &errmp,
- &supp_af, &sctp_options) == 0) {
+ &supp_af, &sctp_options, ira) == 0) {
return;
}
if (errmp != NULL)
errlen = msgdsize(errmp);
- if (sctp->sctp_family == AF_INET) {
+ if (connp->conn_family == AF_INET) {
/*
* Irregardless of the supported address in the INIT, v4
* must be supported.
@@ -580,43 +588,65 @@ sctp_send_initack(sctp_t *sctp, sctp_hdr_t *initsh, sctp_chunk_hdr_t *ch,
}
/*
- * If the listen socket is bound to a trusted extensions
- * multi-label port, attach a copy of the listener's cred
- * to the new INITACK mblk. Modify the cred to contain
+ * Base the transmission on any routing-related socket options
+ * that have been set on the listener.
+ */
+ ixa = conn_get_ixa_exclusive(connp);
+ if (ixa == NULL) {
+ sctp_send_abort(sctp, sctp_init2vtag(ch),
+ SCTP_ERR_NO_RESOURCES, NULL, 0, initmp, 0, B_FALSE, ira);
+ return;
+ }
+ ixa->ixa_flags &= ~IXAF_VERIFY_PMTU;
+
+ if (isv4)
+ ixa->ixa_flags |= IXAF_IS_IPV4;
+ else
+ ixa->ixa_flags &= ~IXAF_IS_IPV4;
+
+ /*
+ * If the listen socket is bound to a trusted extensions multi-label
+ * port, a MAC-Exempt connection with an unlabeled node, we use the
* the security label of the received INIT packet.
* If not a multi-label port, attach the unmodified
- * listener's cred directly.
+ * listener's label directly.
*
* We expect Sun developed kernel modules to properly set
* cred labels for sctp connections. We can't be so sure this
* will be done correctly when 3rd party kernel modules
- * directly use sctp. The initlabel panic guard logic was
- * added to cover this possibility.
+ * directly use sctp. We check for a NULL ira_tsl to cover this
+ * possibility.
*/
- if (sctp->sctp_connp->conn_mlp_type != mlptSingle) {
- cr = msg_getcred(initmp, &pid);
- if (cr == NULL || (initlabel = crgetlabel(cr)) == NULL) {
- sctp_send_abort(sctp, sctp_init2vtag(ch),
- SCTP_ERR_UNKNOWN, NULL, 0, initmp, 0, B_FALSE);
- return;
+ if (is_system_labeled()) {
+ /* Discard any old label */
+ if (ixa->ixa_free_flags & IXA_FREE_TSL) {
+ ASSERT(ixa->ixa_tsl != NULL);
+ label_rele(ixa->ixa_tsl);
+ ixa->ixa_free_flags &= ~IXA_FREE_TSL;
+ ixa->ixa_tsl = NULL;
}
- cr = copycred_from_bslabel(CONN_CRED(sctp->sctp_connp),
- &initlabel->tsl_label, initlabel->tsl_doi, KM_NOSLEEP);
- if (cr == NULL) {
- sctp_send_abort(sctp, sctp_init2vtag(ch),
- SCTP_ERR_NO_RESOURCES, NULL, 0, initmp, 0, B_FALSE);
- return;
+
+ if (connp->conn_mlp_type != mlptSingle ||
+ connp->conn_mac_mode != CONN_MAC_DEFAULT) {
+ if (ira->ira_tsl == NULL) {
+ sctp_send_abort(sctp, sctp_init2vtag(ch),
+ SCTP_ERR_UNKNOWN, NULL, 0, initmp, 0,
+ B_FALSE, ira);
+ ixa_refrele(ixa);
+ return;
+ }
+ label_hold(ira->ira_tsl);
+ ip_xmit_attr_replace_tsl(ixa, ira->ira_tsl);
+ } else {
+ ixa->ixa_tsl = crgetlabel(connp->conn_cred);
}
- iackmp = allocb_cred(ipsctplen + sctps->sctps_wroff_xtra,
- cr, pid);
- crfree(cr);
- } else {
- iackmp = allocb_cred(ipsctplen + sctps->sctps_wroff_xtra,
- CONN_CRED(sctp->sctp_connp), sctp->sctp_cpid);
}
+
+ iackmp = allocb(ipsctplen + sctps->sctps_wroff_xtra, BPRI_MED);
if (iackmp == NULL) {
sctp_send_abort(sctp, sctp_init2vtag(ch),
- SCTP_ERR_NO_RESOURCES, NULL, 0, initmp, 0, B_FALSE);
+ SCTP_ERR_NO_RESOURCES, NULL, 0, initmp, 0, B_FALSE, ira);
+ ixa_refrele(ixa);
return;
}
@@ -632,6 +662,7 @@ sctp_send_initack(sctp_t *sctp, sctp_hdr_t *initsh, sctp_chunk_hdr_t *ch,
iackiph->ipha_src = initiph->ipha_dst;
iackiph->ipha_length = htons(ipsctplen + errlen);
iacksh = (sctp_hdr_t *)(p + sctp->sctp_ip_hdr_len);
+ ixa->ixa_ip_hdr_length = sctp->sctp_ip_hdr_len;
} else {
bcopy(sctp->sctp_iphc6, p, sctp->sctp_hdr6_len);
iackip6h = (ip6_t *)p;
@@ -639,10 +670,12 @@ sctp_send_initack(sctp_t *sctp, sctp_hdr_t *initsh, sctp_chunk_hdr_t *ch,
/* Copy the peer's IP addr */
iackip6h->ip6_dst = initip6h->ip6_src;
iackip6h->ip6_src = initip6h->ip6_dst;
- iackip6h->ip6_plen = htons(ipsctplen - sizeof (*iackip6h) +
- errlen);
+ iackip6h->ip6_plen = htons(ipsctplen + errlen - IPV6_HDR_LEN);
iacksh = (sctp_hdr_t *)(p + sctp->sctp_ip_hdr6_len);
+ ixa->ixa_ip_hdr_length = sctp->sctp_ip_hdr6_len;
}
+ ixa->ixa_pktlen = ipsctplen + errlen;
+
ASSERT(OK_32PTR(iacksh));
/* Fill in the holes in the SCTP common header */
@@ -776,41 +809,58 @@ sctp_send_initack(sctp_t *sctp, sctp_hdr_t *initsh, sctp_chunk_hdr_t *ch,
iackmp->b_cont = errmp; /* OK if NULL */
- if (is_system_labeled() && (cr = msg_getcred(iackmp, &pid)) != NULL &&
- crgetlabel(cr) != NULL) {
- conn_t *connp = sctp->sctp_connp;
- int err;
-
- if (isv4)
- err = tsol_check_label(cr, &iackmp,
- connp->conn_mac_mode,
- sctps->sctps_netstack->netstack_ip, pid);
- else
- err = tsol_check_label_v6(cr, &iackmp,
- connp->conn_mac_mode,
- sctps->sctps_netstack->netstack_ip, pid);
+ if (is_system_labeled()) {
+ ts_label_t *effective_tsl = NULL;
+
+ ASSERT(ira->ira_tsl != NULL);
+
+ /* Discard any old label */
+ if (ixa->ixa_free_flags & IXA_FREE_TSL) {
+ ASSERT(ixa->ixa_tsl != NULL);
+ label_rele(ixa->ixa_tsl);
+ ixa->ixa_free_flags &= ~IXA_FREE_TSL;
+ }
+ ixa->ixa_tsl = ira->ira_tsl; /* A multi-level responder */
+
+ /*
+ * We need to check for label-related failures which implies
+ * an extra call to tsol_check_dest (as ip_output_simple
+ * also does a tsol_check_dest as part of computing the
+ * label for the packet, but ip_output_simple doesn't return
+ * a specific errno for that case so we can't rely on its
+ * check.)
+ */
+ if (isv4) {
+ err = tsol_check_dest(ixa->ixa_tsl, &iackiph->ipha_dst,
+ IPV4_VERSION, connp->conn_mac_mode,
+ connp->conn_zone_is_global, &effective_tsl);
+ } else {
+ err = tsol_check_dest(ixa->ixa_tsl, &iackip6h->ip6_dst,
+ IPV6_VERSION, connp->conn_mac_mode,
+ connp->conn_zone_is_global, &effective_tsl);
+ }
if (err != 0) {
sctp_send_abort(sctp, sctp_init2vtag(ch),
- SCTP_ERR_AUTH_ERR, NULL, 0, initmp, 0, B_FALSE);
+ SCTP_ERR_AUTH_ERR, NULL, 0, initmp, 0, B_FALSE,
+ ira);
+ ixa_refrele(ixa);
freemsg(iackmp);
return;
}
+ if (effective_tsl != NULL) {
+ /*
+ * Since ip_output_simple will redo the
+ * tsol_check_dest, we just drop the ref.
+ */
+ label_rele(effective_tsl);
+ }
}
- /*
- * Stash the conn ptr info. for IP only as e don't have any
- * cached IRE.
- */
- SCTP_STASH_IPINFO(iackmp, (ire_t *)NULL);
-
- /* XXX sctp == sctp_g_q, so using its obchunks is valid */
BUMP_LOCAL(sctp->sctp_opkts);
BUMP_LOCAL(sctp->sctp_obchunks);
- /* OK to call IP_PUT() here instead of sctp_add_sendq(). */
- CONN_INC_REF(sctp->sctp_connp);
- iackmp->b_flag |= MSGHASREF;
- IP_PUT(iackmp, sctp->sctp_connp, isv4);
+ (void) ip_output_simple(iackmp, ixa);
+ ixa_refrele(ixa);
}
void
@@ -820,7 +870,7 @@ sctp_send_cookie_ack(sctp_t *sctp)
mblk_t *camp;
sctp_stack_t *sctps = sctp->sctp_sctps;
- camp = sctp_make_mp(sctp, NULL, sizeof (*cach));
+ camp = sctp_make_mp(sctp, sctp->sctp_current, sizeof (*cach));
if (camp == NULL) {
/* XXX should abort, but don't have the inmp anymore */
SCTP_KSTAT(sctps, sctp_send_cookie_ack_failed);
@@ -833,11 +883,11 @@ sctp_send_cookie_ack(sctp_t *sctp)
cach->sch_flags = 0;
cach->sch_len = htons(sizeof (*cach));
- sctp_set_iplen(sctp, camp);
-
BUMP_LOCAL(sctp->sctp_obchunks);
- sctp_add_sendq(sctp, camp);
+ sctp_set_iplen(sctp, camp, sctp->sctp_current->ixa);
+ (void) conn_ip_output(camp, sctp->sctp_current->ixa);
+ BUMP_LOCAL(sctp->sctp_opkts);
}
static int
@@ -859,7 +909,8 @@ sctp_find_al_ind(sctp_parm_hdr_t *sph, ssize_t len, uint32_t *adaptation_code)
}
void
-sctp_send_cookie_echo(sctp_t *sctp, sctp_chunk_hdr_t *iackch, mblk_t *iackmp)
+sctp_send_cookie_echo(sctp_t *sctp, sctp_chunk_hdr_t *iackch, mblk_t *iackmp,
+ ip_recv_attr_t *ira)
{
mblk_t *cemp;
mblk_t *mp = NULL;
@@ -886,7 +937,7 @@ sctp_send_cookie_echo(sctp_t *sctp, sctp_chunk_hdr_t *iackch, mblk_t *iackmp)
cph = NULL;
if (validate_init_params(sctp, iackch, iack, iackmp, &cph, &errmp,
- &pad, &sctp_options) == 0) { /* result in 'pad' ignored */
+ &pad, &sctp_options, ira) == 0) { /* result in 'pad' ignored */
BUMP_MIB(&sctps->sctps_mib, sctpAborted);
sctp_assoc_event(sctp, SCTP_CANT_STR_ASSOC, 0, NULL);
sctp_clean_death(sctp, ECONNABORTED);
@@ -906,8 +957,8 @@ sctp_send_cookie_echo(sctp_t *sctp, sctp_chunk_hdr_t *iackch, mblk_t *iackmp)
else
hdrlen = sctp->sctp_hdr6_len;
- cemp = allocb_cred(sctps->sctps_wroff_xtra + hdrlen + ceclen + pad,
- CONN_CRED(sctp->sctp_connp), sctp->sctp_cpid);
+ cemp = allocb(sctps->sctps_wroff_xtra + hdrlen + ceclen + pad,
+ BPRI_MED);
if (cemp == NULL) {
SCTP_FADDR_TIMER_RESTART(sctp, sctp->sctp_current,
sctp->sctp_current->rto);
@@ -932,11 +983,13 @@ sctp_send_cookie_echo(sctp_t *sctp, sctp_chunk_hdr_t *iackch, mblk_t *iackmp)
* in sctp_connect().
*/
sctp->sctp_current->df = B_TRUE;
+ sctp->sctp_ipha->ipha_fragment_offset_and_flags |= IPH_DF_HTONS;
+
/*
* Since IP uses this info during the fanout process, we need to hold
* the lock for this hash line while performing this operation.
*/
- /* XXX sctp_conn_fanout + SCTP_CONN_HASH(sctps, sctp->sctp_ports); */
+ /* XXX sctp_conn_fanout + SCTP_CONN_HASH(sctps, connp->conn_ports); */
ASSERT(sctp->sctp_conn_tfp != NULL);
tf = sctp->sctp_conn_tfp;
/* sctp isn't a listener so only need to hold conn fanout lock */
@@ -1139,14 +1192,15 @@ sendcookie:
sctp->sctp_state = SCTPS_COOKIE_ECHOED;
SCTP_FADDR_TIMER_RESTART(sctp, fp, fp->rto);
- sctp_set_iplen(sctp, head);
- sctp_add_sendq(sctp, head);
+ sctp_set_iplen(sctp, head, fp->ixa);
+ (void) conn_ip_output(head, fp->ixa);
+ BUMP_LOCAL(sctp->sctp_opkts);
}
int
sctp_process_cookie(sctp_t *sctp, sctp_chunk_hdr_t *ch, mblk_t *cmp,
sctp_init_chunk_t **iackpp, sctp_hdr_t *insctph, int *recv_adaptation,
- in6_addr_t *peer_addr)
+ in6_addr_t *peer_addr, ip_recv_attr_t *ira)
{
int32_t clen;
size_t initplen;
@@ -1163,6 +1217,7 @@ sctp_process_cookie(sctp_t *sctp, sctp_chunk_hdr_t *ch, mblk_t *cmp,
uint32_t *fttag;
uint32_t ports;
sctp_stack_t *sctps = sctp->sctp_sctps;
+ conn_t *connp = sctp->sctp_connp;
BUMP_LOCAL(sctp->sctp_ibchunks);
/* Verify the ICV */
@@ -1232,7 +1287,8 @@ sctp_process_cookie(sctp_t *sctp, sctp_chunk_hdr_t *ch, mblk_t *cmp,
staleness = TICK_TO_USEC(diff);
staleness = htonl(staleness);
sctp_send_abort(sctp, init->sic_inittag, SCTP_ERR_STALE_COOKIE,
- (char *)&staleness, sizeof (staleness), cmp, 1, B_FALSE);
+ (char *)&staleness, sizeof (staleness), cmp, 1, B_FALSE,
+ ira);
dprint(1, ("stale cookie %d\n", staleness));
@@ -1242,7 +1298,7 @@ sctp_process_cookie(sctp_t *sctp, sctp_chunk_hdr_t *ch, mblk_t *cmp,
/* Check for attack by adding addresses to a restart */
bcopy(insctph, &ports, sizeof (ports));
if (sctp_secure_restart_check(cmp, initch, ports, KM_NOSLEEP,
- sctps) != 1) {
+ sctps, ira) != 1) {
return (-1);
}
@@ -1263,7 +1319,7 @@ sctp_process_cookie(sctp_t *sctp, sctp_chunk_hdr_t *ch, mblk_t *cmp,
dprint(1, ("duplicate cookie from %x:%x:%x:%x (%d)\n",
SCTP_PRINTADDR(sctp->sctp_current->faddr),
- (int)(sctp->sctp_fport)));
+ (int)(connp->conn_fport)));
return (-1);
}
@@ -1292,7 +1348,7 @@ sctp_process_cookie(sctp_t *sctp, sctp_chunk_hdr_t *ch, mblk_t *cmp,
dprint(1, ("sctp peer %x:%x:%x:%x (%d) restarted\n",
SCTP_PRINTADDR(sctp->sctp_current->faddr),
- (int)(sctp->sctp_fport)));
+ (int)(connp->conn_fport)));
/* reset parameters */
sctp_congest_reset(sctp);
@@ -1320,7 +1376,7 @@ sctp_process_cookie(sctp_t *sctp, sctp_chunk_hdr_t *ch, mblk_t *cmp,
dprint(1, ("init collision with %x:%x:%x:%x (%d)\n",
SCTP_PRINTADDR(sctp->sctp_current->faddr),
- (int)(sctp->sctp_fport)));
+ (int)(connp->conn_fport)));
return (0);
} else if (iack->sic_inittag != sctp->sctp_lvtag &&
@@ -1330,7 +1386,7 @@ sctp_process_cookie(sctp_t *sctp, sctp_chunk_hdr_t *ch, mblk_t *cmp,
/* Section 5.2.4 case C: late COOKIE */
dprint(1, ("late cookie from %x:%x:%x:%x (%d)\n",
SCTP_PRINTADDR(sctp->sctp_current->faddr),
- (int)(sctp->sctp_fport)));
+ (int)(connp->conn_fport)));
return (-1);
} else if (init->sic_inittag == sctp->sctp_fvtag &&
iack->sic_inittag == sctp->sctp_lvtag) {
@@ -1341,7 +1397,7 @@ sctp_process_cookie(sctp_t *sctp, sctp_chunk_hdr_t *ch, mblk_t *cmp,
*/
dprint(1, ("cookie tags match from %x:%x:%x:%x (%d)\n",
SCTP_PRINTADDR(sctp->sctp_current->faddr),
- (int)(sctp->sctp_fport)));
+ (int)(connp->conn_fport)));
if (sctp->sctp_state < SCTPS_ESTABLISHED) {
if (!sctp_initialize_params(sctp, init, iack))
return (-1); /* Drop? */
@@ -1412,13 +1468,17 @@ sctp_addrlist2sctp(mblk_t *mp, sctp_hdr_t *sctph, sctp_chunk_hdr_t *ich,
/*
* params have been put in host byteorder by
* sctp_check_input()
+ *
+ * For labeled systems, there's no need to check the
+ * label here. It's known to be good as we checked
+ * before allowing the connection to become bound.
*/
if (ph->sph_type == PARM_ADDR4) {
IN6_INADDR_TO_V4MAPPED((struct in_addr *)(ph + 1),
&src);
sctp = sctp_conn_match(&src, &dst, ports, zoneid,
- sctps);
+ 0, sctps);
dprint(1,
("sctp_addrlist2sctp: src=%x:%x:%x:%x, sctp=%p\n",
@@ -1431,7 +1491,7 @@ sctp_addrlist2sctp(mblk_t *mp, sctp_hdr_t *sctph, sctp_chunk_hdr_t *ich,
} else if (ph->sph_type == PARM_ADDR6) {
src = *(in6_addr_t *)(ph + 1);
sctp = sctp_conn_match(&src, &dst, ports, zoneid,
- sctps);
+ 0, sctps);
dprint(1,
("sctp_addrlist2sctp: src=%x:%x:%x:%x, sctp=%p\n",
diff --git a/usr/src/uts/common/inet/sctp/sctp_error.c b/usr/src/uts/common/inet/sctp/sctp_error.c
index 02d18cf78c..293ff5bd6e 100644
--- a/usr/src/uts/common/inet/sctp/sctp_error.c
+++ b/usr/src/uts/common/inet/sctp/sctp_error.c
@@ -35,9 +35,11 @@
#include <netinet/in.h>
#include <netinet/ip6.h>
+#include <inet/ipsec_impl.h>
#include <inet/common.h>
#include <inet/ip.h>
#include <inet/ip6.h>
+#include <inet/ipsec_impl.h>
#include <inet/mib2.h>
#include <inet/sctp_ip.h>
#include <inet/ipclassifier.h>
@@ -99,6 +101,7 @@ sctp_user_abort(sctp_t *sctp, mblk_t *data)
int len, hdrlen;
char *cause;
sctp_faddr_t *fp = sctp->sctp_current;
+ ip_xmit_attr_t *ixa = fp->ixa;
sctp_stack_t *sctps = sctp->sctp_sctps;
/*
@@ -147,14 +150,15 @@ sctp_user_abort(sctp_t *sctp, mblk_t *data)
freemsg(mp);
return;
}
- sctp_set_iplen(sctp, mp);
BUMP_MIB(&sctps->sctps_mib, sctpAborted);
BUMP_LOCAL(sctp->sctp_opkts);
BUMP_LOCAL(sctp->sctp_obchunks);
- CONN_INC_REF(sctp->sctp_connp);
- mp->b_flag |= MSGHASREF;
- IP_PUT(mp, sctp->sctp_connp, fp->isv4);
+ sctp_set_iplen(sctp, mp, ixa);
+ ASSERT(ixa->ixa_ire != NULL);
+ ASSERT(ixa->ixa_cred != NULL);
+
+ (void) conn_ip_output(mp, ixa);
sctp_assoc_event(sctp, SCTP_COMM_LOST, 0, NULL);
sctp_clean_death(sctp, ECONNABORTED);
@@ -165,29 +169,24 @@ sctp_user_abort(sctp_t *sctp, mblk_t *data)
*/
void
sctp_send_abort(sctp_t *sctp, uint32_t vtag, uint16_t serror, char *details,
- size_t len, mblk_t *inmp, int iserror, boolean_t tbit)
+ size_t len, mblk_t *inmp, int iserror, boolean_t tbit, ip_recv_attr_t *ira)
{
mblk_t *hmp;
uint32_t ip_hdr_len;
ipha_t *iniph;
- ipha_t *ahiph;
+ ipha_t *ahiph = NULL;
ip6_t *inip6h;
- ip6_t *ahip6h;
+ ip6_t *ahip6h = NULL;
sctp_hdr_t *sh;
sctp_hdr_t *insh;
size_t ahlen;
uchar_t *p;
ssize_t alen;
int isv4;
- ire_t *ire;
- irb_t *irb;
- ts_label_t *tsl;
- conn_t *connp;
- cred_t *cr = NULL;
- pid_t pid;
+ conn_t *connp = sctp->sctp_connp;
sctp_stack_t *sctps = sctp->sctp_sctps;
- ip_stack_t *ipst;
+ ip_xmit_attr_t *ixa;
isv4 = (IPH_HDR_VERSION(inmp->b_rptr) == IPV4_VERSION);
if (isv4) {
@@ -200,11 +199,10 @@ sctp_send_abort(sctp_t *sctp, uint32_t vtag, uint16_t serror, char *details,
* If this is a labeled system, then check to see if we're allowed to
* send a response to this particular sender. If not, then just drop.
*/
- if (is_system_labeled() && !tsol_can_reply_error(inmp))
+ if (is_system_labeled() && !tsol_can_reply_error(inmp, ira))
return;
- hmp = allocb_cred(sctps->sctps_wroff_xtra + ahlen,
- CONN_CRED(sctp->sctp_connp), sctp->sctp_cpid);
+ hmp = allocb(sctps->sctps_wroff_xtra + ahlen, BPRI_MED);
if (hmp == NULL) {
/* XXX no resources */
return;
@@ -262,75 +260,209 @@ sctp_send_abort(sctp_t *sctp, uint32_t vtag, uint16_t serror, char *details,
return;
}
+ /*
+ * Base the transmission on any routing-related socket options
+ * that have been set on the listener/connection.
+ */
+ ixa = conn_get_ixa_exclusive(connp);
+ if (ixa == NULL) {
+ freemsg(hmp);
+ return;
+ }
+ ixa->ixa_flags &= ~IXAF_VERIFY_PMTU;
+
+ ixa->ixa_pktlen = ahlen + alen;
if (isv4) {
- ahiph->ipha_length = htons(ahlen + alen);
+ ixa->ixa_flags |= IXAF_IS_IPV4;
+ ahiph->ipha_length = htons(ixa->ixa_pktlen);
+ ixa->ixa_ip_hdr_length = sctp->sctp_ip_hdr_len;
} else {
- ahip6h->ip6_plen = htons(alen + sizeof (*sh));
+ ixa->ixa_flags &= ~IXAF_IS_IPV4;
+ ahip6h->ip6_plen = htons(ixa->ixa_pktlen - IPV6_HDR_LEN);
+ ixa->ixa_ip_hdr_length = sctp->sctp_ip_hdr6_len;
}
BUMP_MIB(&sctps->sctps_mib, sctpAborted);
BUMP_LOCAL(sctp->sctp_obchunks);
- ipst = sctps->sctps_netstack->netstack_ip;
- connp = sctp->sctp_connp;
- if (is_system_labeled() && (cr = msg_getcred(inmp, &pid)) != NULL &&
- crgetlabel(cr) != NULL) {
- int err;
- uint_t mode = connp->conn_mac_mode;
+ if (is_system_labeled() && ixa->ixa_tsl != NULL) {
+ ASSERT(ira->ira_tsl != NULL);
- if (isv4)
- err = tsol_check_label(cr, &hmp, mode, ipst, pid);
- else
- err = tsol_check_label_v6(cr, &hmp, mode, ipst, pid);
- if (err != 0) {
- freemsg(hmp);
- return;
- }
+ ixa->ixa_tsl = ira->ira_tsl; /* A multi-level responder */
}
- /* Stash the conn ptr info. for IP */
- SCTP_STASH_IPINFO(hmp, NULL);
+ if (ira->ira_flags & IRAF_IPSEC_SECURE) {
+ /*
+ * Apply IPsec based on how IPsec was applied to
+ * the packet that caused the abort.
+ */
+ if (!ipsec_in_to_out(ira, ixa, hmp, ahiph, ahip6h)) {
+ ip_stack_t *ipst = sctps->sctps_netstack->netstack_ip;
- CONN_INC_REF(connp);
- hmp->b_flag |= MSGHASREF;
- IP_PUT(hmp, connp, sctp->sctp_current == NULL ? B_TRUE :
- sctp->sctp_current->isv4);
- /*
- * Let's just mark the IRE for this destination as temporary
- * to prevent any DoS attack.
- */
- tsl = cr == NULL ? NULL : crgetlabel(cr);
- if (isv4) {
- ire = ire_cache_lookup(iniph->ipha_src, sctp->sctp_zoneid, tsl,
- ipst);
+ BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
+ /* Note: mp already consumed and ip_drop_packet done */
+ ixa_refrele(ixa);
+ return;
+ }
} else {
- ire = ire_cache_lookup_v6(&inip6h->ip6_src, sctp->sctp_zoneid,
- tsl, ipst);
+ ixa->ixa_flags |= IXAF_NO_IPSEC;
}
+
+ BUMP_LOCAL(sctp->sctp_opkts);
+ BUMP_LOCAL(sctp->sctp_obchunks);
+
+ (void) ip_output_simple(hmp, ixa);
+ ixa_refrele(ixa);
+}
+
+/*
+ * OOTB version of the above.
+ * If iserror == 0, sends an abort. If iserror != 0, sends an error.
+ */
+void
+sctp_ootb_send_abort(uint32_t vtag, uint16_t serror, char *details,
+ size_t len, const mblk_t *inmp, int iserror, boolean_t tbit,
+ ip_recv_attr_t *ira, ip_stack_t *ipst)
+{
+ uint32_t ip_hdr_len;
+ size_t ahlen;
+ ipha_t *ipha = NULL;
+ ip6_t *ip6h = NULL;
+ sctp_hdr_t *insctph;
+ int i;
+ uint16_t port;
+ ssize_t alen;
+ int isv4;
+ mblk_t *mp;
+ netstack_t *ns = ipst->ips_netstack;
+ sctp_stack_t *sctps = ns->netstack_sctp;
+ ip_xmit_attr_t ixas;
+
+ bzero(&ixas, sizeof (ixas));
+
+ isv4 = (IPH_HDR_VERSION(inmp->b_rptr) == IPV4_VERSION);
+ ip_hdr_len = ira->ira_ip_hdr_length;
+ ahlen = ip_hdr_len + sizeof (sctp_hdr_t);
+
/*
- * In the normal case the ire would be non-null, however it could be
- * null, say, if IP needs to resolve the gateway for this address. We
- * only care about IRE_CACHE.
+ * If this is a labeled system, then check to see if we're allowed to
+ * send a response to this particular sender. If not, then just drop.
*/
- if (ire == NULL)
+ if (is_system_labeled() && !tsol_can_reply_error(inmp, ira))
return;
- if (ire->ire_type != IRE_CACHE) {
- ire_refrele(ire);
+
+ mp = allocb(ahlen + sctps->sctps_wroff_xtra, BPRI_MED);
+ if (mp == NULL) {
return;
}
- irb = ire->ire_bucket;
- /* ire_lock is not needed, as ire_marks is protected by irb_lock */
- rw_enter(&irb->irb_lock, RW_WRITER);
+ mp->b_rptr += sctps->sctps_wroff_xtra;
+ mp->b_wptr = mp->b_rptr + ahlen;
+ bcopy(inmp->b_rptr, mp->b_rptr, ahlen);
+
/*
- * Only increment the temporary IRE count if the original
- * IRE is not already marked temporary.
+ * We follow the logic in tcp_xmit_early_reset() in that we skip
+ * reversing source route (i.e. replace all IP options with EOL).
*/
- if (!(ire->ire_marks & IRE_MARK_TEMPORARY)) {
- irb->irb_tmp_ire_cnt++;
- ire->ire_marks |= IRE_MARK_TEMPORARY;
+ if (isv4) {
+ ipaddr_t v4addr;
+
+ ipha = (ipha_t *)mp->b_rptr;
+ for (i = IP_SIMPLE_HDR_LENGTH; i < (int)ip_hdr_len; i++)
+ mp->b_rptr[i] = IPOPT_EOL;
+ /* Swap addresses */
+ ipha->ipha_length = htons(ahlen);
+ v4addr = ipha->ipha_src;
+ ipha->ipha_src = ipha->ipha_dst;
+ ipha->ipha_dst = v4addr;
+ ipha->ipha_ident = 0;
+ ipha->ipha_ttl = (uchar_t)sctps->sctps_ipv4_ttl;
+
+ ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
+ } else {
+ in6_addr_t v6addr;
+
+ ip6h = (ip6_t *)mp->b_rptr;
+ /* Remove any extension headers assuming partial overlay */
+ if (ip_hdr_len > IPV6_HDR_LEN) {
+ uint8_t *to;
+
+ to = mp->b_rptr + ip_hdr_len - IPV6_HDR_LEN;
+ ovbcopy(ip6h, to, IPV6_HDR_LEN);
+ mp->b_rptr += ip_hdr_len - IPV6_HDR_LEN;
+ ip_hdr_len = IPV6_HDR_LEN;
+ ip6h = (ip6_t *)mp->b_rptr;
+ ip6h->ip6_nxt = IPPROTO_SCTP;
+ ahlen = ip_hdr_len + sizeof (sctp_hdr_t);
+ }
+ ip6h->ip6_plen = htons(ahlen - IPV6_HDR_LEN);
+ v6addr = ip6h->ip6_src;
+ ip6h->ip6_src = ip6h->ip6_dst;
+ ip6h->ip6_dst = v6addr;
+ ip6h->ip6_hops = (uchar_t)sctps->sctps_ipv6_hoplimit;
+
+ ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6;
+ if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_dst)) {
+ ixas.ixa_flags |= IXAF_SCOPEID_SET;
+ ixas.ixa_scopeid = ira->ira_ruifindex;
+ }
+ }
+ insctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_len);
+
+ /* Swap ports. Verification tag is reused. */
+ port = insctph->sh_sport;
+ insctph->sh_sport = insctph->sh_dport;
+ insctph->sh_dport = port;
+ insctph->sh_verf = vtag;
+
+ /* Link in the abort chunk */
+ if ((alen = sctp_link_abort(mp, serror, details, len, iserror, tbit))
+ < 0) {
+ freemsg(mp);
+ return;
+ }
+
+ ixas.ixa_pktlen = ahlen + alen;
+ ixas.ixa_ip_hdr_length = ip_hdr_len;
+
+ if (isv4) {
+ ipha->ipha_length = htons(ixas.ixa_pktlen);
+ } else {
+ ip6h->ip6_plen = htons(ixas.ixa_pktlen - IPV6_HDR_LEN);
}
- rw_exit(&irb->irb_lock);
- ire_refrele(ire);
+
+ ixas.ixa_protocol = IPPROTO_SCTP;
+ ixas.ixa_zoneid = ira->ira_zoneid;
+ ixas.ixa_ipst = ipst;
+ ixas.ixa_ifindex = 0;
+
+ BUMP_MIB(&sctps->sctps_mib, sctpAborted);
+
+ if (is_system_labeled()) {
+ ASSERT(ira->ira_tsl != NULL);
+
+ ixas.ixa_tsl = ira->ira_tsl; /* A multi-level responder */
+ }
+
+ if (ira->ira_flags & IRAF_IPSEC_SECURE) {
+ /*
+ * Apply IPsec based on how IPsec was applied to
+ * the packet that was out of the blue.
+ */
+ if (!ipsec_in_to_out(ira, &ixas, mp, ipha, ip6h)) {
+ BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
+ /* Note: mp already consumed and ip_drop_packet done */
+ return;
+ }
+ } else {
+ /*
+ * This is in clear. The abort message we are building
+ * here should go out in clear, independent of our policy.
+ */
+ ixas.ixa_flags |= IXAF_NO_IPSEC;
+ }
+
+ (void) ip_output_simple(mp, &ixas);
+ ixa_cleanup(&ixas);
}
/*ARGSUSED*/
@@ -418,8 +550,9 @@ sctp_add_err(sctp_t *sctp, uint16_t serror, void *details, size_t len,
return;
}
sendmp->b_cont = sctp->sctp_err_chunks;
- sctp_set_iplen(sctp, sendmp);
- sctp_add_sendq(sctp, sendmp);
+ sctp_set_iplen(sctp, sendmp, fp->ixa);
+ (void) conn_ip_output(sendmp, fp->ixa);
+ BUMP_LOCAL(sctp->sctp_opkts);
sctp->sctp_err_chunks = emp;
sctp->sctp_err_len = emp_len;
@@ -445,17 +578,20 @@ sctp_process_err(sctp_t *sctp)
sctp_stack_t *sctps = sctp->sctp_sctps;
mblk_t *errmp;
mblk_t *sendmp;
+ sctp_faddr_t *fp;
ASSERT(sctp->sctp_err_chunks != NULL);
errmp = sctp->sctp_err_chunks;
- if ((sendmp = sctp_make_mp(sctp, SCTP_CHUNK_DEST(errmp), 0)) == NULL) {
+ fp = SCTP_CHUNK_DEST(errmp);
+ if ((sendmp = sctp_make_mp(sctp, fp, 0)) == NULL) {
SCTP_KSTAT(sctps, sctp_send_err_failed);
freemsg(errmp);
goto done;
}
sendmp->b_cont = errmp;
- sctp_set_iplen(sctp, sendmp);
- sctp_add_sendq(sctp, sendmp);
+ sctp_set_iplen(sctp, sendmp, fp->ixa);
+ (void) conn_ip_output(sendmp, fp->ixa);
+ BUMP_LOCAL(sctp->sctp_opkts);
done:
sctp->sctp_err_chunks = NULL;
sctp->sctp_err_len = 0;
@@ -467,7 +603,7 @@ done:
*/
int
sctp_handle_error(sctp_t *sctp, sctp_hdr_t *sctph, sctp_chunk_hdr_t *ch,
- mblk_t *mp)
+ mblk_t *mp, ip_recv_attr_t *ira)
{
sctp_parm_hdr_t *errh;
sctp_chunk_hdr_t *uch;
@@ -487,11 +623,13 @@ sctp_handle_error(sctp_t *sctp, sctp_hdr_t *sctph, sctp_chunk_hdr_t *ch,
*/
case SCTP_ERR_BAD_SID:
cmn_err(CE_WARN, "BUG! send to invalid SID");
- sctp_send_abort(sctp, sctph->sh_verf, 0, NULL, 0, mp, 0, 0);
+ sctp_send_abort(sctp, sctph->sh_verf, 0, NULL, 0, mp, 0, 0,
+ ira);
return (ECONNABORTED);
case SCTP_ERR_NO_USR_DATA:
cmn_err(CE_WARN, "BUG! no usr data");
- sctp_send_abort(sctp, sctph->sh_verf, 0, NULL, 0, mp, 0, 0);
+ sctp_send_abort(sctp, sctph->sh_verf, 0, NULL, 0, mp, 0, 0,
+ ira);
return (ECONNABORTED);
case SCTP_ERR_UNREC_CHUNK:
/* Pull out the unrecognized chunk type */
diff --git a/usr/src/uts/common/inet/sctp/sctp_hash.c b/usr/src/uts/common/inet/sctp/sctp_hash.c
index 289dbc04e7..b5c838d297 100644
--- a/usr/src/uts/common/inet/sctp/sctp_hash.c
+++ b/usr/src/uts/common/inet/sctp/sctp_hash.c
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -82,7 +82,7 @@ sctp_hash_init(sctp_stack_t *sctps)
}
sctps->sctps_conn_fanout =
(sctp_tf_t *)kmem_zalloc(sctps->sctps_conn_hash_size *
- sizeof (sctp_tf_t), KM_SLEEP);
+ sizeof (sctp_tf_t), KM_SLEEP);
for (i = 0; i < sctps->sctps_conn_hash_size; i++) {
mutex_init(&sctps->sctps_conn_fanout[i].tf_lock, NULL,
MUTEX_DEFAULT, NULL);
@@ -129,87 +129,6 @@ sctp_hash_destroy(sctp_stack_t *sctps)
}
/*
- * Walk the SCTP global list and refrele the ire for this ipif
- * This is called when an address goes down, so that we release any reference
- * to the ire associated with this address. Additionally, for any SCTP if
- * this was the only/last address in its source list, we don't kill the
- * assoc., if there is no address added subsequently, or if this does not
- * come up, then the assoc. will die a natural death (i.e. timeout).
- */
-void
-sctp_ire_cache_flush(ipif_t *ipif)
-{
- sctp_t *sctp;
- sctp_t *sctp_prev = NULL;
- sctp_faddr_t *fp;
- conn_t *connp;
- ire_t *ire;
- sctp_stack_t *sctps = ipif->ipif_ill->ill_ipst->
- ips_netstack->netstack_sctp;
-
- sctp = sctps->sctps_gsctp;
- mutex_enter(&sctps->sctps_g_lock);
- while (sctp != NULL) {
- mutex_enter(&sctp->sctp_reflock);
- if (sctp->sctp_condemned) {
- mutex_exit(&sctp->sctp_reflock);
- sctp = list_next(&sctps->sctps_g_list, sctp);
- continue;
- }
- sctp->sctp_refcnt++;
- mutex_exit(&sctp->sctp_reflock);
- mutex_exit(&sctps->sctps_g_lock);
- if (sctp_prev != NULL)
- SCTP_REFRELE(sctp_prev);
-
- RUN_SCTP(sctp);
- connp = sctp->sctp_connp;
- mutex_enter(&connp->conn_lock);
- ire = connp->conn_ire_cache;
- if (ire != NULL && ire->ire_ipif == ipif) {
- connp->conn_ire_cache = NULL;
- mutex_exit(&connp->conn_lock);
- IRE_REFRELE_NOTR(ire);
- } else {
- mutex_exit(&connp->conn_lock);
- }
- /* check for ires cached in faddr */
- for (fp = sctp->sctp_faddrs; fp != NULL; fp = fp->next) {
- /*
- * If this ipif is being used as the source address
- * we need to update it as well, else we will end
- * up using the dead source address.
- */
- ire = fp->ire;
- if (ire != NULL && ire->ire_ipif == ipif) {
- fp->ire = NULL;
- IRE_REFRELE_NOTR(ire);
- }
- /*
- * This may result in setting the fp as unreachable,
- * i.e. if all the source addresses are down. In
- * that case the assoc. would timeout.
- */
- if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
- &fp->saddr)) {
- sctp_set_saddr(sctp, fp);
- if (fp == sctp->sctp_current &&
- fp->state != SCTP_FADDRS_UNREACH) {
- sctp_set_faddr_current(sctp, fp);
- }
- }
- }
- WAKE_SCTP(sctp);
- sctp_prev = sctp;
- mutex_enter(&sctps->sctps_g_lock);
- sctp = list_next(&sctps->sctps_g_list, sctp);
- }
- mutex_exit(&sctps->sctps_g_lock);
- if (sctp_prev != NULL)
- SCTP_REFRELE(sctp_prev);
-}
-
-/*
* Exported routine for extracting active SCTP associations.
* Like TCP, we terminate the walk if the callback returns non-zero.
*
@@ -244,9 +163,9 @@ cl_sctp_walk_list_stack(int (*cl_callback)(cl_sctp_info_t *, void *),
uchar_t *slist;
uchar_t *flist;
- sctp = sctps->sctps_gsctp;
sctp_prev = NULL;
mutex_enter(&sctps->sctps_g_lock);
+ sctp = list_head(&sctps->sctps_g_list);
while (sctp != NULL) {
size_t ssize;
size_t fsize;
@@ -282,11 +201,14 @@ cl_sctp_walk_list_stack(int (*cl_callback)(cl_sctp_info_t *, void *),
sctp_get_faddr_list(sctp, flist, fsize);
cl_sctpi.cl_sctpi_nladdr = sctp->sctp_nsaddrs;
cl_sctpi.cl_sctpi_nfaddr = sctp->sctp_nfaddrs;
- cl_sctpi.cl_sctpi_family = sctp->sctp_family;
- cl_sctpi.cl_sctpi_ipversion = sctp->sctp_ipversion;
+ cl_sctpi.cl_sctpi_family = sctp->sctp_connp->conn_family;
+ if (cl_sctpi.cl_sctpi_family == AF_INET)
+ cl_sctpi.cl_sctpi_ipversion = IPV4_VERSION;
+ else
+ cl_sctpi.cl_sctpi_ipversion = IPV6_VERSION;
cl_sctpi.cl_sctpi_state = sctp->sctp_state;
- cl_sctpi.cl_sctpi_lport = sctp->sctp_lport;
- cl_sctpi.cl_sctpi_fport = sctp->sctp_fport;
+ cl_sctpi.cl_sctpi_lport = sctp->sctp_connp->conn_lport;
+ cl_sctpi.cl_sctpi_fport = sctp->sctp_connp->conn_fport;
cl_sctpi.cl_sctpi_handle = (cl_sctp_handle_t)sctp;
WAKE_SCTP(sctp);
cl_sctpi.cl_sctpi_laddrp = slist;
@@ -310,20 +232,26 @@ cl_sctp_walk_list_stack(int (*cl_callback)(cl_sctp_info_t *, void *),
sctp_t *
sctp_conn_match(in6_addr_t *faddr, in6_addr_t *laddr, uint32_t ports,
- zoneid_t zoneid, sctp_stack_t *sctps)
+ zoneid_t zoneid, iaflags_t iraflags, sctp_stack_t *sctps)
{
sctp_tf_t *tf;
sctp_t *sctp;
sctp_faddr_t *fp;
+ conn_t *connp;
tf = &(sctps->sctps_conn_fanout[SCTP_CONN_HASH(sctps, ports)]);
mutex_enter(&tf->tf_lock);
for (sctp = tf->tf_sctp; sctp; sctp = sctp->sctp_conn_hash_next) {
- if (ports != sctp->sctp_ports ||
- !IPCL_ZONE_MATCH(sctp->sctp_connp, zoneid)) {
+ connp = sctp->sctp_connp;
+ if (ports != connp->conn_ports)
+ continue;
+ if (!(connp->conn_zoneid == zoneid ||
+ connp->conn_allzones ||
+ ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
+ (iraflags & IRAF_TX_MAC_EXEMPTABLE) &&
+ (iraflags & IRAF_TX_SHARED_ADDR))))
continue;
- }
/* check for faddr match */
for (fp = sctp->sctp_faddrs; fp; fp = fp->next) {
@@ -351,11 +279,12 @@ done:
static sctp_t *
listen_match(in6_addr_t *laddr, uint32_t ports, zoneid_t zoneid,
- sctp_stack_t *sctps)
+ iaflags_t iraflags, sctp_stack_t *sctps)
{
sctp_t *sctp;
sctp_tf_t *tf;
uint16_t lport;
+ conn_t *connp;
lport = ((uint16_t *)&ports)[1];
@@ -363,10 +292,16 @@ listen_match(in6_addr_t *laddr, uint32_t ports, zoneid_t zoneid,
mutex_enter(&tf->tf_lock);
for (sctp = tf->tf_sctp; sctp; sctp = sctp->sctp_listen_hash_next) {
- if (lport != sctp->sctp_lport ||
- !IPCL_ZONE_MATCH(sctp->sctp_connp, zoneid)) {
+ connp = sctp->sctp_connp;
+ if (lport != connp->conn_lport)
+ continue;
+
+ if (!(connp->conn_zoneid == zoneid ||
+ connp->conn_allzones ||
+ ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
+ (iraflags & IRAF_TX_MAC_EXEMPTABLE) &&
+ (iraflags & IRAF_TX_SHARED_ADDR))))
continue;
- }
if (sctp_saddr_lookup(sctp, laddr, 0) != NULL) {
SCTP_REFHOLD(sctp);
@@ -383,48 +318,36 @@ done:
/* called by ipsec_sctp_pol */
conn_t *
sctp_find_conn(in6_addr_t *src, in6_addr_t *dst, uint32_t ports,
- zoneid_t zoneid, sctp_stack_t *sctps)
+ zoneid_t zoneid, iaflags_t iraflags, sctp_stack_t *sctps)
{
sctp_t *sctp;
- if ((sctp = sctp_conn_match(src, dst, ports, zoneid, sctps)) == NULL) {
+ sctp = sctp_conn_match(src, dst, ports, zoneid, iraflags, sctps);
+ if (sctp == NULL) {
/* Not in conn fanout; check listen fanout */
- if ((sctp = listen_match(dst, ports, zoneid, sctps)) == NULL)
+ sctp = listen_match(dst, ports, zoneid, iraflags, sctps);
+ if (sctp == NULL)
return (NULL);
}
return (sctp->sctp_connp);
}
+/*
+ * Fanout to a sctp instance.
+ */
conn_t *
sctp_fanout(in6_addr_t *src, in6_addr_t *dst, uint32_t ports,
- zoneid_t zoneid, mblk_t *mp, sctp_stack_t *sctps)
-
+ ip_recv_attr_t *ira, mblk_t *mp, sctp_stack_t *sctps)
{
+ zoneid_t zoneid = ira->ira_zoneid;
+ iaflags_t iraflags = ira->ira_flags;
sctp_t *sctp;
- boolean_t shared_addr;
-
- if ((sctp = sctp_conn_match(src, dst, ports, zoneid, sctps)) == NULL) {
- shared_addr = (zoneid == ALL_ZONES);
- if (shared_addr) {
- /*
- * No need to handle exclusive-stack zones since
- * ALL_ZONES only applies to the shared stack.
- */
- zoneid = tsol_mlp_findzone(IPPROTO_SCTP,
- htons(ntohl(ports) & 0xFFFF));
- /*
- * If no shared MLP is found, tsol_mlp_findzone returns
- * ALL_ZONES. In that case, we assume it's SLP, and
- * search for the zone based on the packet label.
- * That will also return ALL_ZONES on failure.
- */
- if (zoneid == ALL_ZONES)
- zoneid = tsol_packet_to_zoneid(mp);
- if (zoneid == ALL_ZONES)
- return (NULL);
- }
+
+ sctp = sctp_conn_match(src, dst, ports, zoneid, iraflags, sctps);
+ if (sctp == NULL) {
/* Not in conn fanout; check listen fanout */
- if ((sctp = listen_match(dst, ports, zoneid, sctps)) == NULL)
+ sctp = listen_match(dst, ports, zoneid, iraflags, sctps);
+ if (sctp == NULL)
return (NULL);
/*
* On systems running trusted extensions, check if dst
@@ -432,9 +355,9 @@ sctp_fanout(in6_addr_t *src, in6_addr_t *dst, uint32_t ports,
* that dst is in 16 byte AF_INET6 format. IPv4-mapped
* IPv6 addresses are supported.
*/
- if (is_system_labeled() &&
- !tsol_receive_local(mp, dst, IPV6_VERSION,
- shared_addr, sctp->sctp_connp)) {
+ if ((iraflags & IRAF_SYSTEM_LABELED) &&
+ !tsol_receive_local(mp, dst, IPV6_VERSION, ira,
+ sctp->sctp_connp)) {
DTRACE_PROBE3(
tx__ip__log__info__classify__sctp,
char *,
@@ -444,145 +367,84 @@ sctp_fanout(in6_addr_t *src, in6_addr_t *dst, uint32_t ports,
return (NULL);
}
}
+ /*
+ * For labeled systems, there's no need to check the
+ * label here. It's known to be good as we checked
+ * before allowing the connection to become bound.
+ */
return (sctp->sctp_connp);
}
/*
- * Fanout for SCTP packets
+ * Fanout for ICMP errors for SCTP
* The caller puts <fport, lport> in the ports parameter.
*/
-/* ARGSUSED */
void
-ip_fanout_sctp(mblk_t *mp, ill_t *recv_ill, ipha_t *ipha,
- uint32_t ports, uint_t flags, boolean_t mctl_present, boolean_t ip_policy,
- zoneid_t zoneid)
+ip_fanout_sctp(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h, uint32_t ports,
+ ip_recv_attr_t *ira)
{
- sctp_t *sctp;
- boolean_t isv4;
- conn_t *connp;
- mblk_t *first_mp;
- ip6_t *ip6h;
- in6_addr_t map_src, map_dst;
- in6_addr_t *src, *dst;
- ip_stack_t *ipst;
- ipsec_stack_t *ipss;
- sctp_stack_t *sctps;
-
- ASSERT(recv_ill != NULL);
- ipst = recv_ill->ill_ipst;
- sctps = ipst->ips_netstack->netstack_sctp;
- ipss = ipst->ips_netstack->netstack_ipsec;
-
- first_mp = mp;
- if (mctl_present) {
- mp = first_mp->b_cont;
- ASSERT(mp != NULL);
- }
+ sctp_t *sctp;
+ conn_t *connp;
+ in6_addr_t map_src, map_dst;
+ in6_addr_t *src, *dst;
+ boolean_t secure;
+ ill_t *ill = ira->ira_ill;
+ ip_stack_t *ipst = ill->ill_ipst;
+ netstack_t *ns = ipst->ips_netstack;
+ ipsec_stack_t *ipss = ns->netstack_ipsec;
+ sctp_stack_t *sctps = ns->netstack_sctp;
+ iaflags_t iraflags = ira->ira_flags;
+ ill_t *rill = ira->ira_rill;
+
+ ASSERT(iraflags & IRAF_ICMP_ERROR);
+
+ secure = iraflags & IRAF_IPSEC_SECURE;
/* Assume IP provides aligned packets - otherwise toss */
if (!OK_32PTR(mp->b_rptr)) {
- BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsInDiscards);
- freemsg(first_mp);
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards", mp, ill);
+ freemsg(mp);
return;
}
- if (IPH_HDR_VERSION(ipha) == IPV6_VERSION) {
- ip6h = (ip6_t *)ipha;
+ if (!(iraflags & IRAF_IS_IPV4)) {
src = &ip6h->ip6_src;
dst = &ip6h->ip6_dst;
- isv4 = B_FALSE;
} else {
- ip6h = NULL;
IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &map_src);
IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &map_dst);
src = &map_src;
dst = &map_dst;
- isv4 = B_TRUE;
}
- connp = sctp_fanout(src, dst, ports, zoneid, mp, sctps);
+ connp = sctp_fanout(src, dst, ports, ira, mp, sctps);
if (connp == NULL) {
- ip_fanout_sctp_raw(first_mp, recv_ill, ipha, isv4,
- ports, mctl_present, flags, ip_policy, zoneid);
+ ip_fanout_sctp_raw(mp, ipha, ip6h, ports, ira);
return;
}
sctp = CONN2SCTP(connp);
- /* Found a client; up it goes */
- BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsHCInDelivers);
-
/*
* We check some fields in conn_t without holding a lock.
* This should be fine.
*/
- if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || mctl_present) {
- first_mp = ipsec_check_inbound_policy(first_mp, connp,
- ipha, NULL, mctl_present);
- if (first_mp == NULL) {
- SCTP_REFRELE(sctp);
- return;
- }
- }
-
- /* Initiate IPPF processing for fastpath */
- if (IPP_ENABLED(IPP_LOCAL_IN, ipst)) {
- ip_process(IPP_LOCAL_IN, &mp,
- recv_ill->ill_phyint->phyint_ifindex);
+ if (((iraflags & IRAF_IS_IPV4) ?
+ CONN_INBOUND_POLICY_PRESENT(connp, ipss) :
+ CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss)) ||
+ secure) {
+ mp = ipsec_check_inbound_policy(mp, connp, ipha,
+ ip6h, ira);
if (mp == NULL) {
SCTP_REFRELE(sctp);
- if (mctl_present)
- freeb(first_mp);
return;
- } else if (mctl_present) {
- /*
- * ip_process might return a new mp.
- */
- ASSERT(first_mp != mp);
- first_mp->b_cont = mp;
- } else {
- first_mp = mp;
}
}
- if (connp->conn_recvif || connp->conn_recvslla ||
- connp->conn_ip_recvpktinfo) {
- int in_flags = 0;
-
- if (connp->conn_recvif || connp->conn_ip_recvpktinfo) {
- in_flags = IPF_RECVIF;
- }
- if (connp->conn_recvslla) {
- in_flags |= IPF_RECVSLLA;
- }
- if (isv4) {
- mp = ip_add_info(mp, recv_ill, in_flags,
- IPCL_ZONEID(connp), ipst);
- } else {
- mp = ip_add_info_v6(mp, recv_ill, &ip6h->ip6_dst);
- }
- if (mp == NULL) {
- SCTP_REFRELE(sctp);
- if (mctl_present)
- freeb(first_mp);
- return;
- } else if (mctl_present) {
- /*
- * ip_add_info might return a new mp.
- */
- ASSERT(first_mp != mp);
- first_mp->b_cont = mp;
- } else {
- first_mp = mp;
- }
- }
+ ira->ira_ill = ira->ira_rill = NULL;
mutex_enter(&sctp->sctp_lock);
if (sctp->sctp_running) {
- if (mctl_present)
- mp->b_prev = first_mp;
- if (!sctp_add_recvq(sctp, mp, B_FALSE)) {
- BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsInDiscards);
- freemsg(first_mp);
- }
+ sctp_add_recvq(sctp, mp, B_FALSE, ira);
mutex_exit(&sctp->sctp_lock);
} else {
sctp->sctp_running = B_TRUE;
@@ -590,24 +452,22 @@ ip_fanout_sctp(mblk_t *mp, ill_t *recv_ill, ipha_t *ipha,
mutex_enter(&sctp->sctp_recvq_lock);
if (sctp->sctp_recvq != NULL) {
- if (mctl_present)
- mp->b_prev = first_mp;
- if (!sctp_add_recvq(sctp, mp, B_TRUE)) {
- BUMP_MIB(recv_ill->ill_ip_mib,
- ipIfStatsInDiscards);
- freemsg(first_mp);
- }
+ sctp_add_recvq(sctp, mp, B_TRUE, ira);
mutex_exit(&sctp->sctp_recvq_lock);
WAKE_SCTP(sctp);
} else {
mutex_exit(&sctp->sctp_recvq_lock);
- sctp_input_data(sctp, mp, (mctl_present ? first_mp :
- NULL));
+ if (ira->ira_flags & IRAF_ICMP_ERROR) {
+ sctp_icmp_error(sctp, mp);
+ } else {
+ sctp_input_data(sctp, mp, ira);
+ }
WAKE_SCTP(sctp);
- sctp_process_sendq(sctp);
}
}
SCTP_REFRELE(sctp);
+ ira->ira_ill = ill;
+ ira->ira_rill = rill;
}
void
@@ -623,7 +483,7 @@ sctp_conn_hash_remove(sctp_t *sctp)
* subsystem.
*/
if (cl_sctp_disconnect != NULL) {
- (*cl_sctp_disconnect)(sctp->sctp_family,
+ (*cl_sctp_disconnect)(sctp->sctp_connp->conn_family,
(cl_sctp_handle_t)sctp);
}
@@ -683,6 +543,7 @@ void
sctp_listen_hash_remove(sctp_t *sctp)
{
sctp_tf_t *tf = sctp->sctp_listen_tfp;
+ conn_t *connp = sctp->sctp_connp;
if (!tf) {
return;
@@ -698,8 +559,8 @@ sctp_listen_hash_remove(sctp_t *sctp)
ssize = sizeof (in6_addr_t) * sctp->sctp_nsaddrs;
slist = kmem_alloc(ssize, KM_SLEEP);
sctp_get_saddr_list(sctp, slist, ssize);
- (*cl_sctp_unlisten)(sctp->sctp_family, slist,
- sctp->sctp_nsaddrs, sctp->sctp_lport);
+ (*cl_sctp_unlisten)(connp->conn_family, slist,
+ sctp->sctp_nsaddrs, connp->conn_lport);
/* list will be freed by the clustering module */
}
@@ -722,7 +583,10 @@ sctp_listen_hash_remove(sctp_t *sctp)
sctp->sctp_listen_hash_next;
if (sctp->sctp_listen_hash_next != NULL) {
- sctp->sctp_listen_hash_next->sctp_listen_hash_prev =
+ sctp_t *next = sctp->sctp_listen_hash_next;
+
+ ASSERT(next->sctp_listen_hash_prev == sctp);
+ next->sctp_listen_hash_prev =
sctp->sctp_listen_hash_prev;
}
}
@@ -735,6 +599,8 @@ sctp_listen_hash_remove(sctp_t *sctp)
void
sctp_listen_hash_insert(sctp_tf_t *tf, sctp_t *sctp)
{
+ conn_t *connp = sctp->sctp_connp;
+
if (sctp->sctp_listen_tfp) {
sctp_listen_hash_remove(sctp);
}
@@ -759,8 +625,8 @@ sctp_listen_hash_insert(sctp_tf_t *tf, sctp_t *sctp)
ssize = sizeof (in6_addr_t) * sctp->sctp_nsaddrs;
slist = kmem_alloc(ssize, KM_SLEEP);
sctp_get_saddr_list(sctp, slist, ssize);
- (*cl_sctp_listen)(sctp->sctp_family, slist,
- sctp->sctp_nsaddrs, sctp->sctp_lport);
+ (*cl_sctp_listen)(connp->conn_family, slist,
+ sctp->sctp_nsaddrs, connp->conn_lport);
/* list will be freed by the clustering module */
}
}
@@ -850,8 +716,8 @@ sctp_lookup(sctp_t *sctp1, in6_addr_t *faddr, sctp_tf_t *tf, uint32_t *ports,
for (sctp = tf->tf_sctp; sctp != NULL;
sctp = sctp->sctp_conn_hash_next) {
- if (*ports != sctp->sctp_ports || sctp->sctp_state <
- min_state) {
+ if (*ports != sctp->sctp_connp->conn_ports ||
+ sctp->sctp_state < min_state) {
continue;
}
@@ -886,38 +752,3 @@ done:
}
return (sctp);
}
-
-boolean_t
-ip_fanout_sctp_raw_match(conn_t *connp, uint32_t ports, ipha_t *ipha)
-{
- uint16_t lport;
-
- if (connp->conn_fully_bound) {
- return (IPCL_CONN_MATCH(connp, IPPROTO_SCTP, ipha->ipha_src,
- ipha->ipha_dst, ports));
- } else {
- lport = htons(ntohl(ports) & 0xFFFF);
- return (IPCL_BIND_MATCH(connp, IPPROTO_SCTP, ipha->ipha_dst,
- lport));
- }
-}
-
-boolean_t
-ip_fanout_sctp_raw_match_v6(conn_t *connp, uint32_t ports, ip6_t *ip6h,
- boolean_t for_v4)
-{
- uint16_t lport;
- in6_addr_t v6dst;
-
- if (!for_v4 && connp->conn_fully_bound) {
- return (IPCL_CONN_MATCH_V6(connp, IPPROTO_SCTP, ip6h->ip6_src,
- ip6h->ip6_dst, ports));
- } else {
- lport = htons(ntohl(ports) & 0xFFFF);
- if (for_v4)
- v6dst = ipv6_all_zeros;
- else
- v6dst = ip6h->ip6_dst;
- return (IPCL_BIND_MATCH_V6(connp, IPPROTO_SCTP, v6dst, lport));
- }
-}
diff --git a/usr/src/uts/common/inet/sctp/sctp_heartbeat.c b/usr/src/uts/common/inet/sctp/sctp_heartbeat.c
index 914f1cac3f..2fbffee1c3 100644
--- a/usr/src/uts/common/inet/sctp/sctp_heartbeat.c
+++ b/usr/src/uts/common/inet/sctp/sctp_heartbeat.c
@@ -20,12 +20,10 @@
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/stream.h>
@@ -66,8 +64,14 @@ sctp_return_heartbeat(sctp_t *sctp, sctp_chunk_hdr_t *hbcp, mblk_t *mp)
addr = inip6h->ip6_src;
}
fp = sctp_lookup_faddr(sctp, &addr);
- ASSERT(fp != NULL);
-
+ /* If the source address is bogus we silently drop the packet */
+ if (fp == NULL) {
+ dprint(1,
+ ("sctp_return_heartbeat: %p bogus hb from %x:%x:%x:%x\n",
+ (void *)sctp, SCTP_PRINTADDR(addr)));
+ SCTP_KSTAT(sctps, sctp_return_hb_failed);
+ return;
+ }
dprint(3, ("sctp_return_heartbeat: %p got hb from %x:%x:%x:%x\n",
(void *)sctp, SCTP_PRINTADDR(addr)));
@@ -98,10 +102,11 @@ sctp_return_heartbeat(sctp_t *sctp, sctp_chunk_hdr_t *hbcp, mblk_t *mp)
smp->b_wptr += len;
- sctp_set_iplen(sctp, smp);
-
BUMP_LOCAL(sctp->sctp_obchunks);
- sctp_add_sendq(sctp, smp);
+
+ sctp_set_iplen(sctp, smp, fp->ixa);
+ (void) conn_ip_output(smp, fp->ixa);
+ BUMP_LOCAL(sctp->sctp_opkts);
}
/*
@@ -126,10 +131,10 @@ sctp_send_heartbeat(sctp_t *sctp, sctp_faddr_t *fp)
SCTP_PRINTADDR(fp->faddr), SCTP_PRINTADDR(fp->saddr)));
hblen = sizeof (*cp) +
- sizeof (*hpp) +
- sizeof (*t) +
- sizeof (fp->hb_secret) +
- sizeof (fp->faddr);
+ sizeof (*hpp) +
+ sizeof (*t) +
+ sizeof (fp->hb_secret) +
+ sizeof (fp->faddr);
hbmp = sctp_make_mp(sctp, fp, hblen);
if (hbmp == NULL) {
SCTP_KSTAT(sctps, sctp_send_hb_failed);
@@ -180,8 +185,6 @@ sctp_send_heartbeat(sctp_t *sctp, sctp_faddr_t *fp)
hbmp->b_wptr += hblen;
- sctp_set_iplen(sctp, hbmp);
-
/* Update the faddr's info */
fp->lastactive = now;
fp->hb_pending = B_TRUE;
@@ -189,7 +192,9 @@ sctp_send_heartbeat(sctp_t *sctp, sctp_faddr_t *fp)
BUMP_LOCAL(sctp->sctp_obchunks);
BUMP_MIB(&sctps->sctps_mib, sctpTimHeartBeatProbe);
- sctp_add_sendq(sctp, hbmp);
+ sctp_set_iplen(sctp, hbmp, fp->ixa);
+ (void) conn_ip_output(hbmp, fp->ixa);
+ BUMP_LOCAL(sctp->sctp_opkts);
}
/*
diff --git a/usr/src/uts/common/inet/sctp/sctp_impl.h b/usr/src/uts/common/inet/sctp/sctp_impl.h
index 32268648f6..d84c3762f3 100644
--- a/usr/src/uts/common/inet/sctp/sctp_impl.h
+++ b/usr/src/uts/common/inet/sctp/sctp_impl.h
@@ -191,7 +191,6 @@ typedef struct sctpparam_s {
#define SCTP_MAX_COMBINED_HEADER_LENGTH (60 + 12) /* Maxed out ip + sctp */
#define SCTP_MAX_IP_OPTIONS_LENGTH (60 - IP_SIMPLE_HDR_LENGTH)
#define SCTP_MAX_HDR_LENGTH 60
-#define ICMP_MIN_SCTP_HDR_LEN (ICMP_MIN_TP_HDR_LEN + sizeof (sctp_hdr_t))
#define SCTP_SECRET_LEN 16
@@ -213,27 +212,6 @@ typedef struct sctpparam_s {
} \
}
-#define SCTP_G_Q_REFHOLD(sctps) { \
- atomic_add_32(&(sctps)->sctps_g_q_ref, 1); \
- ASSERT((sctps)->sctps_g_q_ref != 0); \
- DTRACE_PROBE1(sctp__g__q__refhold, sctp_stack_t, sctps); \
-}
-
-/*
- * Decrement the reference count on sctp_g_q
- * In architectures e.g sun4u, where atomic_add_32_nv is just
- * a cas, we need to maintain the right memory barrier semantics
- * as that of mutex_exit i.e all the loads and stores should complete
- * before the cas is executed. membar_exit() does that here.
- */
-#define SCTP_G_Q_REFRELE(sctps) { \
- ASSERT((sctps)->sctps_g_q_ref != 0); \
- membar_exit(); \
- DTRACE_PROBE1(sctp__g__q__refrele, sctp_stack_t, sctps); \
- if (atomic_add_32_nv(&(sctps)->sctps_g_q_ref, -1) == 0) \
- sctp_g_q_inactive(sctps); \
-}
-
#define SCTP_PRINTADDR(a) (a).s6_addr32[0], (a).s6_addr32[1],\
(a).s6_addr32[2], (a).s6_addr32[3]
@@ -399,15 +377,6 @@ extern sin6_t sctp_sin6_null; /* Zero address for quick clears */
#define SCTP_IS_DETACHED(sctp) ((sctp)->sctp_detached)
-/*
- * Object to represent database of options to search passed to
- * {sock,tpi}optcom_req() interface routine to take care of option
- * management and associated methods.
- * XXX These and other externs should ideally move to a SCTP header
- */
-extern optdb_obj_t sctp_opt_obj;
-extern uint_t sctp_max_optbuf_len;
-
/* Data structure used to track received TSNs */
typedef struct sctp_set_s {
struct sctp_set_s *next;
@@ -528,7 +497,7 @@ typedef struct sctp_faddr_s {
hb_enabled : 1;
mblk_t *rc_timer_mp; /* reliable control chunk timer */
- ire_t *ire; /* cached IRE */
+ ip_xmit_attr_t *ixa; /* Transmit attributes */
uint32_t T3expire; /* # of times T3 timer expired */
uint64_t hb_secret; /* per addr "secret" in heartbeat */
@@ -600,25 +569,6 @@ typedef struct sctp_s {
sctp_ipif_hash_t sctp_saddrs[SCTP_IPIF_HASH];
int sctp_nsaddrs;
- /*
- * These fields contain the same information as sctp_sctph->th_*port.
- * However, the lookup functions can not use the header fields
- * since during IP option manipulation the sctp_sctph pointer
- * changes.
- */
- union {
- struct {
- in_port_t sctpu_fport; /* Remote port */
- in_port_t sctpu_lport; /* Local port */
- } sctpu_ports1;
- uint32_t sctpu_ports2; /* Rem port, */
- /* local port */
- /* Used for SCTP_MATCH performance */
- } sctp_sctpu;
-#define sctp_fport sctp_sctpu.sctpu_ports1.sctpu_fport
-#define sctp_lport sctp_sctpu.sctpu_ports1.sctpu_lport
-#define sctp_ports sctp_sctpu.sctpu_ports2
-
kmutex_t sctp_lock;
kcondvar_t sctp_cv;
boolean_t sctp_running;
@@ -637,12 +587,6 @@ typedef struct sctp_s {
int32_t sctp_state;
conn_t *sctp_connp; /* conn_t stuff */
-#define sctp_zoneid sctp_connp->conn_zoneid
-#define sctp_allzones sctp_connp->conn_allzones
-#define sctp_mac_mode sctp_connp->conn_mac_mode
-#define sctp_credp sctp_connp->conn_cred
-#define sctp_reuseaddr sctp_connp->conn_reuseaddr
-
sctp_stack_t *sctp_sctps;
/* Peer address tracking */
@@ -711,9 +655,6 @@ typedef struct sctp_s {
uint32_t sctp_T3expire; /* # of times T3timer expired */
uint32_t sctp_assoc_start_time; /* time when assoc was est. */
- /* Outbound flow control */
- int32_t sctp_xmit_hiwater; /* Send high water mark */
- int32_t sctp_xmit_lowater; /* Send low water mark */
uint32_t sctp_frwnd; /* Peer RWND */
uint32_t sctp_cwnd_max;
@@ -723,8 +664,8 @@ typedef struct sctp_s {
int32_t sctp_rxqueued; /* No. of bytes in RX q's */
/* Pre-initialized composite headers */
- char *sctp_iphc; /* v4 sctp/ip hdr template buffer */
- char *sctp_iphc6; /* v6 sctp/ip hdr template buffer */
+ uchar_t *sctp_iphc; /* v4 sctp/ip hdr template buffer */
+ uchar_t *sctp_iphc6; /* v6 sctp/ip hdr template buffer */
int32_t sctp_iphc_len; /* actual allocated v4 buffer size */
int32_t sctp_iphc6_len; /* actual allocated v6 buffer size */
@@ -754,17 +695,12 @@ typedef struct sctp_s {
uint32_t
sctp_understands_asconf : 1, /* Peer handles ASCONF chunks */
- sctp_debug : 1, /* SO_DEBUG "socket" option. */
sctp_cchunk_pend : 1, /* Control chunk in flight. */
- sctp_dgram_errind : 1, /* SO_DGRAM_ERRIND option */
-
- sctp_linger : 1, /* SO_LINGER turned on */
sctp_lingering : 1, /* Lingering in close */
sctp_loopback: 1, /* src and dst are the same machine */
- sctp_force_sack : 1,
+ sctp_force_sack : 1,
sctp_ack_timer_running: 1, /* Delayed ACK timer running */
- sctp_recvdstaddr : 1, /* return T_EXTCONN_IND with dstaddr */
sctp_hwcksum : 1, /* The NIC is capable of hwcksum */
sctp_understands_addip : 1,
@@ -802,15 +738,11 @@ typedef struct sctp_s {
} sctp_events;
#define sctp_priv_stream sctp_bits.sctp_priv_stream
#define sctp_understands_asconf sctp_bits.sctp_understands_asconf
-#define sctp_debug sctp_bits.sctp_debug
#define sctp_cchunk_pend sctp_bits.sctp_cchunk_pend
-#define sctp_dgram_errind sctp_bits.sctp_dgram_errind
-#define sctp_linger sctp_bits.sctp_linger
#define sctp_lingering sctp_bits.sctp_lingering
#define sctp_loopback sctp_bits.sctp_loopback
#define sctp_force_sack sctp_bits.sctp_force_sack
#define sctp_ack_timer_running sctp_bits.sctp_ack_timer_running
-#define sctp_recvdstaddr sctp_bits.sctp_recvdstaddr
#define sctp_hwcksum sctp_bits.sctp_hwcksum
#define sctp_understands_addip sctp_bits.sctp_understands_addip
#define sctp_bound_to_all sctp_bits.sctp_bound_to_all
@@ -853,15 +785,6 @@ typedef struct sctp_s {
uint8_t sctp_old_secret[SCTP_SECRET_LEN];
uint32_t sctp_cookie_lifetime; /* cookie lifetime in tick */
- /*
- * Address family that app wishes returned addrsses to be in.
- * Currently taken from address family used in T_BIND_REQ, but
- * should really come from family used in original socket() call.
- * Value can be AF_INET or AF_INET6.
- */
- uint_t sctp_family;
- ushort_t sctp_ipversion;
-
/* Bind hash tables */
kmutex_t *sctp_bind_lockp; /* Ptr to tf_lock */
struct sctp_s *sctp_bind_hash;
@@ -870,14 +793,10 @@ typedef struct sctp_s {
/* Shutdown / cleanup */
sctp_faddr_t *sctp_shutdown_faddr; /* rotate faddr during shutd */
int32_t sctp_client_errno; /* How the client screwed up */
- int sctp_lingertime; /* Close linger time (in seconds) */
kmutex_t sctp_reflock; /* Protects sctp_refcnt & timer mp */
ushort_t sctp_refcnt; /* No. of pending upstream msg */
mblk_t *sctp_timer_mp; /* List of fired timers. */
- /* Misc */
- uint_t sctp_bound_if; /* IPV6_BOUND_IF */
-
mblk_t *sctp_heartbeat_mp; /* Timer block for heartbeats */
uint32_t sctp_hb_interval; /* Default hb_interval */
@@ -897,47 +816,19 @@ typedef struct sctp_s {
mblk_t *sctp_recvq_tail;
taskq_t *sctp_recvq_tq;
- /* Send queue to IP */
- kmutex_t sctp_sendq_lock;
- mblk_t *sctp_sendq;
- mblk_t *sctp_sendq_tail;
- boolean_t sctp_sendq_sending;
-
/* IPv6 ancillary data */
- uint_t sctp_ipv6_recvancillary; /* flags */
-#define SCTP_IPV6_RECVPKTINFO 0x01 /* IPV6_RECVPKTINFO opt */
-#define SCTP_IPV6_RECVHOPLIMIT 0x02 /* IPV6_RECVHOPLIMIT opt */
-#define SCTP_IPV6_RECVHOPOPTS 0x04 /* IPV6_RECVHOPOPTS opt */
-#define SCTP_IPV6_RECVDSTOPTS 0x08 /* IPV6_RECVDSTOPTS opt */
-#define SCTP_IPV6_RECVRTHDR 0x10 /* IPV6_RECVRTHDR opt */
-#define SCTP_IPV6_RECVRTDSTOPTS 0x20 /* IPV6_RECVRTHDRDSTOPTS opt */
-
uint_t sctp_recvifindex; /* last rcvd IPV6_RCVPKTINFO */
uint_t sctp_recvhops; /* " IPV6_RECVHOPLIMIT */
+ uint_t sctp_recvtclass; /* " IPV6_RECVTCLASS */
ip6_hbh_t *sctp_hopopts; /* " IPV6_RECVHOPOPTS */
ip6_dest_t *sctp_dstopts; /* " IPV6_RECVDSTOPTS */
- ip6_dest_t *sctp_rtdstopts; /* " IPV6_RECVRTHDRDSTOPTS */
+ ip6_dest_t *sctp_rthdrdstopts; /* " IPV6_RECVRTHDRDSTOPTS */
ip6_rthdr_t *sctp_rthdr; /* " IPV6_RECVRTHDR */
uint_t sctp_hopoptslen;
uint_t sctp_dstoptslen;
- uint_t sctp_rtdstoptslen;
+ uint_t sctp_rthdrdstoptslen;
uint_t sctp_rthdrlen;
- ip6_pkt_t sctp_sticky_ipp; /* Sticky options */
-#define sctp_ipp_fields sctp_sticky_ipp.ipp_fields
-#define sctp_ipp_ifindex sctp_sticky_ipp.ipp_ifindex
-#define sctp_ipp_addr sctp_sticky_ipp.ipp_addr
-#define sctp_ipp_hoplimit sctp_sticky_ipp.ipp_hoplimit
-#define sctp_ipp_hopoptslen sctp_sticky_ipp.ipp_hopoptslen
-#define sctp_ipp_rtdstoptslen sctp_sticky_ipp.ipp_rtdstoptslen
-#define sctp_ipp_rthdrlen sctp_sticky_ipp.ipp_rthdrlen
-#define sctp_ipp_dstoptslen sctp_sticky_ipp.ipp_dstoptslen
-#define sctp_ipp_hopopts sctp_sticky_ipp.ipp_hopopts
-#define sctp_ipp_rtdstopts sctp_sticky_ipp.ipp_rtdstopts
-#define sctp_ipp_rthdr sctp_sticky_ipp.ipp_rthdr
-#define sctp_ipp_dstopts sctp_sticky_ipp.ipp_dstopts
-#define sctp_ipp_pathmtu sctp_sticky_ipp.ipp_pathmtu
-#define sctp_ipp_nexthop sctp_sticky_ipp.ipp_nexthop
/* Stats */
uint64_t sctp_msgcount;
uint64_t sctp_prsctpdrop;
@@ -951,9 +842,6 @@ typedef struct sctp_s {
mblk_t *sctp_err_chunks; /* Error chunks */
uint32_t sctp_err_len; /* Total error chunks length */
- pid_t sctp_cpid; /* Process id when this was opened */
- uint64_t sctp_open_time; /* time when this was opened */
-
/* additional source data for per endpoint association statistics */
uint64_t sctp_outseqtsns; /* TSN rx > expected TSN */
uint64_t sctp_osacks; /* total sacks sent */
@@ -988,7 +876,7 @@ typedef struct sctp_s {
#define SCTP_TXQ_LEN(sctp) ((sctp)->sctp_unsent + (sctp)->sctp_unacked)
#define SCTP_TXQ_UPDATE(sctp) \
if ((sctp)->sctp_txq_full && SCTP_TXQ_LEN(sctp) <= \
- (sctp)->sctp_xmit_lowater) { \
+ (sctp)->sctp_connp->conn_sndlowat) { \
(sctp)->sctp_txq_full = 0; \
(sctp)->sctp_ulp_xmitted((sctp)->sctp_ulpd, \
B_FALSE); \
@@ -1004,8 +892,8 @@ extern void sctp_add_err(sctp_t *, uint16_t, void *, size_t,
extern int sctp_add_faddr(sctp_t *, in6_addr_t *, int, boolean_t);
extern boolean_t sctp_add_ftsn_set(sctp_ftsn_set_t **, sctp_faddr_t *, mblk_t *,
uint_t *, uint32_t *);
-extern boolean_t sctp_add_recvq(sctp_t *, mblk_t *, boolean_t);
-extern void sctp_add_sendq(sctp_t *, mblk_t *);
+extern void sctp_add_recvq(sctp_t *, mblk_t *, boolean_t,
+ ip_recv_attr_t *);
extern void sctp_add_unrec_parm(sctp_parm_hdr_t *, mblk_t **, boolean_t);
extern size_t sctp_addr_params(sctp_t *, int, uchar_t *, boolean_t);
extern mblk_t *sctp_add_proto_hdr(sctp_t *, sctp_faddr_t *, mblk_t *, int,
@@ -1013,7 +901,6 @@ extern mblk_t *sctp_add_proto_hdr(sctp_t *, sctp_faddr_t *, mblk_t *, int,
extern void sctp_addr_req(sctp_t *, mblk_t *);
extern sctp_t *sctp_addrlist2sctp(mblk_t *, sctp_hdr_t *, sctp_chunk_hdr_t *,
zoneid_t, sctp_stack_t *);
-extern void sctp_add_hdr(sctp_t *, uchar_t *, size_t);
extern void sctp_check_adv_ack_pt(sctp_t *, mblk_t *, mblk_t *);
extern void sctp_assoc_event(sctp_t *, uint16_t, uint16_t,
sctp_chunk_hdr_t *);
@@ -1024,7 +911,7 @@ extern int sctp_bindi(sctp_t *, in_port_t, boolean_t, int, in_port_t *);
extern int sctp_bind_add(sctp_t *, const void *, uint32_t, boolean_t,
in_port_t);
extern int sctp_bind_del(sctp_t *, const void *, uint32_t, boolean_t);
-extern int sctp_build_hdrs(sctp_t *);
+extern int sctp_build_hdrs(sctp_t *, int);
extern int sctp_check_abandoned_msg(sctp_t *, mblk_t *);
extern void sctp_clean_death(sctp_t *, int);
@@ -1035,11 +922,9 @@ extern void sctp_conn_hash_insert(sctp_tf_t *, sctp_t *, int);
extern void sctp_conn_hash_remove(sctp_t *);
extern void sctp_conn_init(conn_t *);
extern sctp_t *sctp_conn_match(in6_addr_t *, in6_addr_t *, uint32_t,
- zoneid_t, sctp_stack_t *);
+ zoneid_t, iaflags_t, sctp_stack_t *);
extern sctp_t *sctp_conn_request(sctp_t *, mblk_t *, uint_t, uint_t,
- sctp_init_chunk_t *, mblk_t *);
-extern int sctp_conprim_opt_process(queue_t *, mblk_t *, int *, int *,
- int *);
+ sctp_init_chunk_t *, ip_recv_attr_t *);
extern uint32_t sctp_cumack(sctp_t *, uint32_t, mblk_t **);
extern sctp_t *sctp_create_eager(sctp_t *);
@@ -1066,10 +951,9 @@ extern void sctp_ftsn_sets_init(void);
extern int sctp_get_addrlist(sctp_t *, const void *, uint32_t *,
uchar_t **, int *, size_t *);
-extern void sctp_g_q_inactive(sctp_stack_t *);
extern int sctp_get_addrparams(sctp_t *, sctp_t *, mblk_t *,
sctp_chunk_hdr_t *, uint_t *);
-extern void sctp_get_ire(sctp_t *, sctp_faddr_t *);
+extern void sctp_get_dest(sctp_t *, sctp_faddr_t *);
extern void sctp_get_faddr_list(sctp_t *, uchar_t *, size_t);
extern mblk_t *sctp_get_first_sent(sctp_t *);
extern mblk_t *sctp_get_msg_to_send(sctp_t *, mblk_t **, mblk_t *, int *,
@@ -1077,22 +961,20 @@ extern mblk_t *sctp_get_msg_to_send(sctp_t *, mblk_t **, mblk_t *, int *,
extern void sctp_get_saddr_list(sctp_t *, uchar_t *, size_t);
extern int sctp_handle_error(sctp_t *, sctp_hdr_t *, sctp_chunk_hdr_t *,
- mblk_t *);
+ mblk_t *, ip_recv_attr_t *);
extern void sctp_hash_destroy(sctp_stack_t *);
extern void sctp_hash_init(sctp_stack_t *);
-extern int sctp_header_init_ipv4(sctp_t *, int);
-extern int sctp_header_init_ipv6(sctp_t *, int);
extern void sctp_heartbeat_timer(sctp_t *);
extern void sctp_icmp_error(sctp_t *, mblk_t *);
extern void sctp_inc_taskq(sctp_stack_t *);
extern void sctp_info_req(sctp_t *, mblk_t *);
-extern mblk_t *sctp_init_mp(sctp_t *);
+extern mblk_t *sctp_init_mp(sctp_t *, sctp_faddr_t *);
extern boolean_t sctp_initialize_params(sctp_t *, sctp_init_chunk_t *,
sctp_init_chunk_t *);
extern uint32_t sctp_init2vtag(sctp_chunk_hdr_t *);
extern void sctp_intf_event(sctp_t *, in6_addr_t, int, int);
-extern void sctp_input_data(sctp_t *, mblk_t *, mblk_t *);
+extern void sctp_input_data(sctp_t *, mblk_t *, ip_recv_attr_t *);
extern void sctp_instream_cleanup(sctp_t *, boolean_t);
extern int sctp_is_a_faddr_clean(sctp_t *);
@@ -1124,7 +1006,8 @@ extern int sctp_nd_getset(queue_t *, MBLKP);
extern boolean_t sctp_nd_init(sctp_stack_t *);
extern sctp_parm_hdr_t *sctp_next_parm(sctp_parm_hdr_t *, ssize_t *);
-extern void sctp_ootb_shutdown_ack(sctp_t *, mblk_t *, uint_t);
+extern void sctp_ootb_shutdown_ack(mblk_t *, uint_t, ip_recv_attr_t *,
+ ip_stack_t *);
extern size_t sctp_options_param(const sctp_t *, void *, int);
extern size_t sctp_options_param_len(const sctp_t *, int);
extern void sctp_output(sctp_t *, uint_t);
@@ -1132,10 +1015,10 @@ extern void sctp_output(sctp_t *, uint_t);
extern boolean_t sctp_param_register(IDP *, sctpparam_t *, int, sctp_stack_t *);
extern void sctp_partial_delivery_event(sctp_t *);
extern int sctp_process_cookie(sctp_t *, sctp_chunk_hdr_t *, mblk_t *,
- sctp_init_chunk_t **, sctp_hdr_t *, int *, in6_addr_t *);
+ sctp_init_chunk_t **, sctp_hdr_t *, int *, in6_addr_t *,
+ ip_recv_attr_t *);
extern void sctp_process_err(sctp_t *);
extern void sctp_process_heartbeat(sctp_t *, sctp_chunk_hdr_t *);
-extern void sctp_process_sendq(sctp_t *);
extern void sctp_process_timer(sctp_t *);
extern void sctp_redo_faddr_srcs(sctp_t *);
@@ -1149,13 +1032,17 @@ extern sctp_faddr_t *sctp_rotate_faddr(sctp_t *, sctp_faddr_t *);
extern boolean_t sctp_sack(sctp_t *, mblk_t *);
extern int sctp_secure_restart_check(mblk_t *, sctp_chunk_hdr_t *,
- uint32_t, int, sctp_stack_t *);
+ uint32_t, int, sctp_stack_t *, ip_recv_attr_t *);
extern void sctp_send_abort(sctp_t *, uint32_t, uint16_t, char *, size_t,
- mblk_t *, int, boolean_t);
+ mblk_t *, int, boolean_t, ip_recv_attr_t *);
+extern void sctp_ootb_send_abort(uint32_t, uint16_t, char *, size_t,
+ const mblk_t *, int, boolean_t, ip_recv_attr_t *,
+ ip_stack_t *);
extern void sctp_send_cookie_ack(sctp_t *);
-extern void sctp_send_cookie_echo(sctp_t *, sctp_chunk_hdr_t *, mblk_t *);
+extern void sctp_send_cookie_echo(sctp_t *, sctp_chunk_hdr_t *, mblk_t *,
+ ip_recv_attr_t *);
extern void sctp_send_initack(sctp_t *, sctp_hdr_t *, sctp_chunk_hdr_t *,
- mblk_t *);
+ mblk_t *, ip_recv_attr_t *);
extern void sctp_send_shutdown(sctp_t *, int);
extern void sctp_send_heartbeat(sctp_t *, sctp_faddr_t *);
extern void sctp_sendfail_event(sctp_t *, mblk_t *, int, boolean_t);
@@ -1170,7 +1057,7 @@ extern int sctp_shutdown_received(sctp_t *, sctp_chunk_hdr_t *, boolean_t,
boolean_t, sctp_faddr_t *);
extern void sctp_shutdown_complete(sctp_t *);
extern void sctp_set_if_mtu(sctp_t *);
-extern void sctp_set_iplen(sctp_t *, mblk_t *);
+extern void sctp_set_iplen(sctp_t *, mblk_t *, ip_xmit_attr_t *);
extern void sctp_set_ulp_prop(sctp_t *);
extern void sctp_ss_rexmit(sctp_t *);
extern size_t sctp_supaddr_param_len(sctp_t *);
@@ -1183,7 +1070,7 @@ extern void sctp_timer_free(mblk_t *);
extern void sctp_timer_stop(mblk_t *);
extern void sctp_unlink_faddr(sctp_t *, sctp_faddr_t *);
-extern void sctp_update_ire(sctp_t *sctp);
+extern void sctp_update_dce(sctp_t *sctp);
extern in_port_t sctp_update_next_port(in_port_t, zone_t *zone, sctp_stack_t *);
extern void sctp_update_rtt(sctp_t *, sctp_faddr_t *, clock_t);
extern void sctp_user_abort(sctp_t *, mblk_t *);
@@ -1209,17 +1096,6 @@ extern void (*cl_sctp_assoc_change)(sa_family_t, uchar_t *, size_t, uint_t,
extern void (*cl_sctp_check_addrs)(sa_family_t, in_port_t, uchar_t **,
size_t, uint_t *, boolean_t);
-/* Send a mp to IP. */
-#define IP_PUT(mp, conn, isv4) \
-{ \
- sctp_stack_t *sctps = conn->conn_netstack->netstack_sctp; \
- \
- if ((isv4)) \
- ip_output((conn), (mp), WR(sctps->sctps_g_q), IP_WPUT); \
- else \
- ip_output_v6((conn), (mp), WR(sctps->sctps_g_q), IP_WPUT);\
-}
-
#define RUN_SCTP(sctp) \
{ \
mutex_enter(&(sctp)->sctp_lock); \
diff --git a/usr/src/uts/common/inet/sctp/sctp_init.c b/usr/src/uts/common/inet/sctp/sctp_init.c
index 5547609c98..ff34147a65 100644
--- a/usr/src/uts/common/inet/sctp/sctp_init.c
+++ b/usr/src/uts/common/inet/sctp/sctp_init.c
@@ -20,12 +20,10 @@
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/stream.h>
#include <sys/ddi.h>
@@ -45,32 +43,6 @@
#include "sctp_impl.h"
#include "sctp_addr.h"
-/*
- * This will compute the checksum over the SCTP packet, so this
- * function should only be called after the whole packet has been
- * built.
- *
- * rptr should point to the IP / SCTP composite header.
- * len should be the length of the entire packet, including the IP
- * header.
- */
-void
-sctp_add_hdr(sctp_t *sctp, uchar_t *rptr, size_t len)
-{
- ipha_t *iphdr;
- short iplen;
-
- ASSERT(len >= sctp->sctp_hdr_len);
-
- /* Copy the common header from the template */
- bcopy(sctp->sctp_iphc, rptr, sctp->sctp_hdr_len);
-
- /* Set the total length in the IP hdr */
- iplen = (short)len;
- iphdr = (ipha_t *)rptr;
- U16_TO_ABE16(iplen, &iphdr->ipha_length);
-}
-
/*ARGSUSED*/
size_t
sctp_supaddr_param_len(sctp_t *sctp)
@@ -83,17 +55,18 @@ sctp_supaddr_param(sctp_t *sctp, uchar_t *p)
{
sctp_parm_hdr_t *sph;
uint16_t *addrtype;
+ conn_t *connp = sctp->sctp_connp;
sph = (sctp_parm_hdr_t *)p;
sph->sph_type = htons(PARM_SUPP_ADDRS);
addrtype = (uint16_t *)(sph + 1);
- switch (sctp->sctp_ipversion) {
- case IPV4_VERSION:
+ switch (connp->conn_family) {
+ case AF_INET:
*addrtype++ = htons(PARM_ADDR4);
*addrtype = 0;
sph->sph_len = htons(sizeof (*sph) + sizeof (*addrtype));
break;
- case IPV6_VERSION:
+ case AF_INET6:
*addrtype++ = htons(PARM_ADDR6);
if (!sctp->sctp_connp->conn_ipv6_v6only) {
*addrtype = htons(PARM_ADDR4);
@@ -167,7 +140,7 @@ sctp_adaptation_code_param(sctp_t *sctp, uchar_t *p)
}
mblk_t *
-sctp_init_mp(sctp_t *sctp)
+sctp_init_mp(sctp_t *sctp, sctp_faddr_t *fp)
{
mblk_t *mp;
uchar_t *p;
@@ -176,12 +149,12 @@ sctp_init_mp(sctp_t *sctp)
sctp_chunk_hdr_t *chp;
uint16_t schlen;
int supp_af;
- sctp_stack_t *sctps = sctp->sctp_sctps;
+ sctp_stack_t *sctps = sctp->sctp_sctps;
+ conn_t *connp = sctp->sctp_connp;
- if (sctp->sctp_family == AF_INET) {
+ if (connp->conn_family == AF_INET) {
supp_af = PARM_SUPP_V4;
} else {
- /* Assume here that a v6 endpoint supports v4 address. */
if (sctp->sctp_connp->conn_ipv6_v6only)
supp_af = PARM_SUPP_V6;
else
@@ -203,11 +176,17 @@ sctp_init_mp(sctp_t *sctp)
sctp->sctp_sctph->sh_verf = 0;
sctp->sctp_sctph6->sh_verf = 0;
- mp = sctp_make_mp(sctp, NULL, initlen);
+ mp = sctp_make_mp(sctp, fp, initlen);
if (mp == NULL) {
SCTP_KSTAT(sctps, sctp_send_init_failed);
return (NULL);
}
+ /* sctp_make_mp could have discovered we have no usable sources */
+ if (sctp->sctp_nsaddrs == 0) {
+ freemsg(mp);
+ SCTP_KSTAT(sctps, sctp_send_init_failed);
+ return (NULL);
+ }
/* Lay in a new INIT chunk, starting with the chunk header */
chp = (sctp_chunk_hdr_t *)mp->b_wptr;
@@ -242,7 +221,7 @@ sctp_init_mp(sctp_t *sctp)
BUMP_LOCAL(sctp->sctp_obchunks);
- sctp_set_iplen(sctp, mp);
+ sctp_set_iplen(sctp, mp, fp->ixa);
return (mp);
}
diff --git a/usr/src/uts/common/inet/sctp/sctp_input.c b/usr/src/uts/common/inet/sctp/sctp_input.c
index e18bfeacdd..e4a5ef5c5b 100644
--- a/usr/src/uts/common/inet/sctp/sctp_input.c
+++ b/usr/src/uts/common/inet/sctp/sctp_input.c
@@ -42,6 +42,7 @@
#include <inet/common.h>
#include <inet/ip.h>
+#include <inet/ip_if.h>
#include <inet/ip6.h>
#include <inet/mib2.h>
#include <inet/ipclassifier.h>
@@ -318,7 +319,7 @@ sctp_next_chunk(sctp_chunk_hdr_t *ch, ssize_t *remaining)
*/
static int
sctp_input_add_ancillary(sctp_t *sctp, mblk_t **mp, sctp_data_hdr_t *dcp,
- sctp_faddr_t *fp, ip6_pkt_t *ipp)
+ sctp_faddr_t *fp, ip_pkt_t *ipp, ip_recv_attr_t *ira)
{
struct T_unitdata_ind *tudi;
int optlen;
@@ -329,57 +330,61 @@ sctp_input_add_ancillary(sctp_t *sctp, mblk_t **mp, sctp_data_hdr_t *dcp,
struct sockaddr_in6 sin_buf[1];
struct sockaddr_in6 *sin6;
struct sockaddr_in *sin4;
- uint_t addflag = 0;
+ crb_t addflag; /* Which pieces to add */
+ conn_t *connp = sctp->sctp_connp;
sin4 = NULL;
sin6 = NULL;
optlen = hdrlen = 0;
+ addflag.crb_all = 0;
/* Figure out address size */
- if (sctp->sctp_ipversion == IPV4_VERSION) {
+ if (connp->conn_family == AF_INET) {
sin4 = (struct sockaddr_in *)sin_buf;
sin4->sin_family = AF_INET;
- sin4->sin_port = sctp->sctp_fport;
+ sin4->sin_port = connp->conn_fport;
IN6_V4MAPPED_TO_IPADDR(&fp->faddr, sin4->sin_addr.s_addr);
hdrlen = sizeof (*tudi) + sizeof (*sin4);
} else {
sin6 = sin_buf;
sin6->sin6_family = AF_INET6;
- sin6->sin6_port = sctp->sctp_fport;
+ sin6->sin6_port = connp->conn_fport;
sin6->sin6_addr = fp->faddr;
hdrlen = sizeof (*tudi) + sizeof (*sin6);
}
-
/* If app asked to receive send / recv info */
- if (sctp->sctp_recvsndrcvinfo) {
+ if (sctp->sctp_recvsndrcvinfo)
optlen += sizeof (*cmsg) + sizeof (struct sctp_sndrcvinfo);
- if (hdrlen == 0)
- hdrlen = sizeof (struct T_optdata_ind);
- }
- if (sctp->sctp_ipv6_recvancillary == 0)
+ if (connp->conn_recv_ancillary.crb_all == 0)
goto noancillary;
- if ((ipp->ipp_fields & IPPF_IFINDEX) &&
- ipp->ipp_ifindex != sctp->sctp_recvifindex &&
- (sctp->sctp_ipv6_recvancillary & SCTP_IPV6_RECVPKTINFO)) {
+ if (connp->conn_recv_ancillary.crb_ip_recvpktinfo &&
+ ira->ira_ruifindex != sctp->sctp_recvifindex) {
optlen += sizeof (*cmsg) + sizeof (struct in6_pktinfo);
if (hdrlen == 0)
hdrlen = sizeof (struct T_unitdata_ind);
- addflag |= SCTP_IPV6_RECVPKTINFO;
+ addflag.crb_ip_recvpktinfo = 1;
}
/* If app asked for hoplimit and it has changed ... */
- if ((ipp->ipp_fields & IPPF_HOPLIMIT) &&
- ipp->ipp_hoplimit != sctp->sctp_recvhops &&
- (sctp->sctp_ipv6_recvancillary & SCTP_IPV6_RECVHOPLIMIT)) {
+ if (connp->conn_recv_ancillary.crb_ipv6_recvhoplimit &&
+ ipp->ipp_hoplimit != sctp->sctp_recvhops) {
optlen += sizeof (*cmsg) + sizeof (uint_t);
if (hdrlen == 0)
hdrlen = sizeof (struct T_unitdata_ind);
- addflag |= SCTP_IPV6_RECVHOPLIMIT;
+ addflag.crb_ipv6_recvhoplimit = 1;
+ }
+ /* If app asked for tclass and it has changed ... */
+ if (connp->conn_recv_ancillary.crb_ipv6_recvtclass &&
+ ipp->ipp_tclass != sctp->sctp_recvtclass) {
+ optlen += sizeof (struct T_opthdr) + sizeof (uint_t);
+ if (hdrlen == 0)
+ hdrlen = sizeof (struct T_unitdata_ind);
+ addflag.crb_ipv6_recvtclass = 1;
}
/* If app asked for hopbyhop headers and it has changed ... */
- if ((sctp->sctp_ipv6_recvancillary & SCTP_IPV6_RECVHOPOPTS) &&
+ if (connp->conn_recv_ancillary.crb_ipv6_recvhopopts &&
ip_cmpbuf(sctp->sctp_hopopts, sctp->sctp_hopoptslen,
(ipp->ipp_fields & IPPF_HOPOPTS),
ipp->ipp_hopopts, ipp->ipp_hopoptslen)) {
@@ -387,7 +392,7 @@ sctp_input_add_ancillary(sctp_t *sctp, mblk_t **mp, sctp_data_hdr_t *dcp,
sctp->sctp_v6label_len;
if (hdrlen == 0)
hdrlen = sizeof (struct T_unitdata_ind);
- addflag |= SCTP_IPV6_RECVHOPOPTS;
+ addflag.crb_ipv6_recvhopopts = 1;
if (!ip_allocbuf((void **)&sctp->sctp_hopopts,
&sctp->sctp_hopoptslen,
(ipp->ipp_fields & IPPF_HOPOPTS),
@@ -395,45 +400,44 @@ sctp_input_add_ancillary(sctp_t *sctp, mblk_t **mp, sctp_data_hdr_t *dcp,
return (-1);
}
/* If app asked for dst headers before routing headers ... */
- if ((sctp->sctp_ipv6_recvancillary & SCTP_IPV6_RECVRTDSTOPTS) &&
- ip_cmpbuf(sctp->sctp_rtdstopts, sctp->sctp_rtdstoptslen,
- (ipp->ipp_fields & IPPF_RTDSTOPTS),
- ipp->ipp_rtdstopts, ipp->ipp_rtdstoptslen)) {
- optlen += sizeof (*cmsg) + ipp->ipp_rtdstoptslen;
+ if (connp->conn_recv_ancillary.crb_ipv6_recvrthdrdstopts &&
+ ip_cmpbuf(sctp->sctp_rthdrdstopts, sctp->sctp_rthdrdstoptslen,
+ (ipp->ipp_fields & IPPF_RTHDRDSTOPTS),
+ ipp->ipp_rthdrdstopts, ipp->ipp_rthdrdstoptslen)) {
+ optlen += sizeof (*cmsg) + ipp->ipp_rthdrdstoptslen;
if (hdrlen == 0)
hdrlen = sizeof (struct T_unitdata_ind);
- addflag |= SCTP_IPV6_RECVRTDSTOPTS;
- if (!ip_allocbuf((void **)&sctp->sctp_rtdstopts,
- &sctp->sctp_rtdstoptslen,
- (ipp->ipp_fields & IPPF_RTDSTOPTS),
- ipp->ipp_rtdstopts, ipp->ipp_rtdstoptslen))
+ addflag.crb_ipv6_recvrthdrdstopts = 1;
+ if (!ip_allocbuf((void **)&sctp->sctp_rthdrdstopts,
+ &sctp->sctp_rthdrdstoptslen,
+ (ipp->ipp_fields & IPPF_RTHDRDSTOPTS),
+ ipp->ipp_rthdrdstopts, ipp->ipp_rthdrdstoptslen))
return (-1);
}
/* If app asked for routing headers and it has changed ... */
- if (sctp->sctp_ipv6_recvancillary & SCTP_IPV6_RECVRTHDR) {
- if (ip_cmpbuf(sctp->sctp_rthdr, sctp->sctp_rthdrlen,
+ if (connp->conn_recv_ancillary.crb_ipv6_recvrthdr &&
+ ip_cmpbuf(sctp->sctp_rthdr, sctp->sctp_rthdrlen,
+ (ipp->ipp_fields & IPPF_RTHDR),
+ ipp->ipp_rthdr, ipp->ipp_rthdrlen)) {
+ optlen += sizeof (*cmsg) + ipp->ipp_rthdrlen;
+ if (hdrlen == 0)
+ hdrlen = sizeof (struct T_unitdata_ind);
+ addflag.crb_ipv6_recvrthdr = 1;
+ if (!ip_allocbuf((void **)&sctp->sctp_rthdr,
+ &sctp->sctp_rthdrlen,
(ipp->ipp_fields & IPPF_RTHDR),
- ipp->ipp_rthdr, ipp->ipp_rthdrlen)) {
- optlen += sizeof (*cmsg) + ipp->ipp_rthdrlen;
- if (hdrlen == 0)
- hdrlen = sizeof (struct T_unitdata_ind);
- addflag |= SCTP_IPV6_RECVRTHDR;
- if (!ip_allocbuf((void **)&sctp->sctp_rthdr,
- &sctp->sctp_rthdrlen,
- (ipp->ipp_fields & IPPF_RTHDR),
- ipp->ipp_rthdr, ipp->ipp_rthdrlen))
- return (-1);
- }
+ ipp->ipp_rthdr, ipp->ipp_rthdrlen))
+ return (-1);
}
/* If app asked for dest headers and it has changed ... */
- if ((sctp->sctp_ipv6_recvancillary & SCTP_IPV6_RECVDSTOPTS) &&
+ if (connp->conn_recv_ancillary.crb_ipv6_recvdstopts &&
ip_cmpbuf(sctp->sctp_dstopts, sctp->sctp_dstoptslen,
(ipp->ipp_fields & IPPF_DSTOPTS),
ipp->ipp_dstopts, ipp->ipp_dstoptslen)) {
optlen += sizeof (*cmsg) + ipp->ipp_dstoptslen;
if (hdrlen == 0)
hdrlen = sizeof (struct T_unitdata_ind);
- addflag |= SCTP_IPV6_RECVDSTOPTS;
+ addflag.crb_ipv6_recvdstopts = 1;
if (!ip_allocbuf((void **)&sctp->sctp_dstopts,
&sctp->sctp_dstoptslen,
(ipp->ipp_fields & IPPF_DSTOPTS),
@@ -499,9 +503,11 @@ noancillary:
* If app asked for pktinfo and the index has changed ...
* Note that the local address never changes for the connection.
*/
- if (addflag & SCTP_IPV6_RECVPKTINFO) {
+ if (addflag.crb_ip_recvpktinfo) {
struct in6_pktinfo *pkti;
+ uint_t ifindex;
+ ifindex = ira->ira_ruifindex;
cmsg = (struct cmsghdr *)optptr;
cmsg->cmsg_level = IPPROTO_IPV6;
cmsg->cmsg_type = IPV6_PKTINFO;
@@ -509,19 +515,20 @@ noancillary:
optptr += sizeof (*cmsg);
pkti = (struct in6_pktinfo *)optptr;
- if (sctp->sctp_ipversion == IPV6_VERSION)
+ if (connp->conn_family == AF_INET6)
pkti->ipi6_addr = sctp->sctp_ip6h->ip6_src;
else
IN6_IPADDR_TO_V4MAPPED(sctp->sctp_ipha->ipha_src,
&pkti->ipi6_addr);
- pkti->ipi6_ifindex = ipp->ipp_ifindex;
+
+ pkti->ipi6_ifindex = ifindex;
optptr += sizeof (*pkti);
ASSERT(OK_32PTR(optptr));
/* Save as "last" value */
- sctp->sctp_recvifindex = ipp->ipp_ifindex;
+ sctp->sctp_recvifindex = ifindex;
}
/* If app asked for hoplimit and it has changed ... */
- if (addflag & SCTP_IPV6_RECVHOPLIMIT) {
+ if (addflag.crb_ipv6_recvhoplimit) {
cmsg = (struct cmsghdr *)optptr;
cmsg->cmsg_level = IPPROTO_IPV6;
cmsg->cmsg_type = IPV6_HOPLIMIT;
@@ -534,7 +541,21 @@ noancillary:
/* Save as "last" value */
sctp->sctp_recvhops = ipp->ipp_hoplimit;
}
- if (addflag & SCTP_IPV6_RECVHOPOPTS) {
+ /* If app asked for tclass and it has changed ... */
+ if (addflag.crb_ipv6_recvtclass) {
+ cmsg = (struct cmsghdr *)optptr;
+ cmsg->cmsg_level = IPPROTO_IPV6;
+ cmsg->cmsg_type = IPV6_TCLASS;
+ cmsg->cmsg_len = sizeof (*cmsg) + sizeof (uint_t);
+ optptr += sizeof (*cmsg);
+
+ *(uint_t *)optptr = ipp->ipp_tclass;
+ optptr += sizeof (uint_t);
+ ASSERT(OK_32PTR(optptr));
+ /* Save as "last" value */
+ sctp->sctp_recvtclass = ipp->ipp_tclass;
+ }
+ if (addflag.crb_ipv6_recvhopopts) {
cmsg = (struct cmsghdr *)optptr;
cmsg->cmsg_level = IPPROTO_IPV6;
cmsg->cmsg_type = IPV6_HOPOPTS;
@@ -550,23 +571,23 @@ noancillary:
(ipp->ipp_fields & IPPF_HOPOPTS),
ipp->ipp_hopopts, ipp->ipp_hopoptslen);
}
- if (addflag & SCTP_IPV6_RECVRTDSTOPTS) {
+ if (addflag.crb_ipv6_recvrthdrdstopts) {
cmsg = (struct cmsghdr *)optptr;
cmsg->cmsg_level = IPPROTO_IPV6;
cmsg->cmsg_type = IPV6_RTHDRDSTOPTS;
- cmsg->cmsg_len = sizeof (*cmsg) + ipp->ipp_rtdstoptslen;
+ cmsg->cmsg_len = sizeof (*cmsg) + ipp->ipp_rthdrdstoptslen;
optptr += sizeof (*cmsg);
- bcopy(ipp->ipp_rtdstopts, optptr, ipp->ipp_rtdstoptslen);
- optptr += ipp->ipp_rtdstoptslen;
+ bcopy(ipp->ipp_rthdrdstopts, optptr, ipp->ipp_rthdrdstoptslen);
+ optptr += ipp->ipp_rthdrdstoptslen;
ASSERT(OK_32PTR(optptr));
/* Save as last value */
- ip_savebuf((void **)&sctp->sctp_rtdstopts,
- &sctp->sctp_rtdstoptslen,
- (ipp->ipp_fields & IPPF_RTDSTOPTS),
- ipp->ipp_rtdstopts, ipp->ipp_rtdstoptslen);
+ ip_savebuf((void **)&sctp->sctp_rthdrdstopts,
+ &sctp->sctp_rthdrdstoptslen,
+ (ipp->ipp_fields & IPPF_RTHDRDSTOPTS),
+ ipp->ipp_rthdrdstopts, ipp->ipp_rthdrdstoptslen);
}
- if (addflag & SCTP_IPV6_RECVRTHDR) {
+ if (addflag.crb_ipv6_recvrthdr) {
cmsg = (struct cmsghdr *)optptr;
cmsg->cmsg_level = IPPROTO_IPV6;
cmsg->cmsg_type = IPV6_RTHDR;
@@ -582,7 +603,7 @@ noancillary:
(ipp->ipp_fields & IPPF_RTHDR),
ipp->ipp_rthdr, ipp->ipp_rthdrlen);
}
- if (addflag & SCTP_IPV6_RECVDSTOPTS) {
+ if (addflag.crb_ipv6_recvdstopts) {
cmsg = (struct cmsghdr *)optptr;
cmsg->cmsg_level = IPPROTO_IPV6;
cmsg->cmsg_type = IPV6_DSTOPTS;
@@ -778,7 +799,6 @@ static mblk_t *
sctp_try_partial_delivery(sctp_t *sctp, mblk_t *hmp, sctp_reass_t *srp,
sctp_data_hdr_t **dc)
{
- mblk_t *first_mp;
mblk_t *mp;
mblk_t *dmp;
mblk_t *qmp;
@@ -791,8 +811,7 @@ sctp_try_partial_delivery(sctp_t *sctp, mblk_t *hmp, sctp_reass_t *srp,
dprint(4, ("trypartial: got=%d, needed=%d\n",
(int)(srp->got), (int)(srp->needed)));
- first_mp = hmp->b_cont;
- mp = first_mp;
+ mp = hmp->b_cont;
qdc = (sctp_data_hdr_t *)mp->b_rptr;
ASSERT(SCTP_DATA_GET_BBIT(qdc) && srp->hasBchunk);
@@ -1175,7 +1194,7 @@ sctp_add_dup(uint32_t tsn, mblk_t **dups)
static void
sctp_data_chunk(sctp_t *sctp, sctp_chunk_hdr_t *ch, mblk_t *mp, mblk_t **dups,
- sctp_faddr_t *fp, ip6_pkt_t *ipp)
+ sctp_faddr_t *fp, ip_pkt_t *ipp, ip_recv_attr_t *ira)
{
sctp_data_hdr_t *dc;
mblk_t *dmp, *pmp;
@@ -1419,7 +1438,8 @@ sctp_data_chunk(sctp_t *sctp, sctp_chunk_hdr_t *ch, mblk_t *mp, mblk_t **dups,
if (can_deliver) {
dmp->b_rptr = (uchar_t *)(dc + 1);
- if (sctp_input_add_ancillary(sctp, &dmp, dc, fp, ipp) == 0) {
+ if (sctp_input_add_ancillary(sctp, &dmp, dc, fp,
+ ipp, ira) == 0) {
dprint(1, ("sctp_data_chunk: delivering %lu bytes\n",
msgdsize(dmp)));
sctp->sctp_rwnd -= dlen;
@@ -1507,7 +1527,7 @@ sctp_data_chunk(sctp_t *sctp, sctp_chunk_hdr_t *ch, mblk_t *mp, mblk_t **dups,
if (can_deliver) {
dmp->b_rptr = (uchar_t *)(dc + 1);
if (sctp_input_add_ancillary(sctp, &dmp, dc, fp,
- ipp) == 0) {
+ ipp, ira) == 0) {
dprint(1, ("sctp_data_chunk: delivering %lu "
"bytes\n", msgdsize(dmp)));
sctp->sctp_rwnd -= dlen;
@@ -1646,6 +1666,8 @@ sctp_make_sack(sctp_t *sctp, sctp_faddr_t *sendto, mblk_t *dups)
uint32_t dups_len;
sctp_faddr_t *fp;
+ ASSERT(sendto != NULL);
+
if (sctp->sctp_force_sack) {
sctp->sctp_force_sack = 0;
goto checks_done;
@@ -1696,8 +1718,9 @@ checks_done:
return (NULL);
}
smp->b_cont = sctp->sctp_err_chunks;
- sctp_set_iplen(sctp, smp);
- sctp_add_sendq(sctp, smp);
+ sctp_set_iplen(sctp, smp, fp->ixa);
+ (void) conn_ip_output(smp, fp->ixa);
+ BUMP_LOCAL(sctp->sctp_opkts);
sctp->sctp_err_chunks = NULL;
sctp->sctp_err_len = 0;
}
@@ -1749,8 +1772,6 @@ sctp_sack(sctp_t *sctp, mblk_t *dups)
freeb(dups);
return (B_FALSE);
}
- sctp_set_iplen(sctp, smp);
-
dprint(2, ("sctp_sack: sending to %p %x:%x:%x:%x\n",
(void *)sctp->sctp_lastdata,
SCTP_PRINTADDR(sctp->sctp_lastdata->faddr)));
@@ -1758,7 +1779,10 @@ sctp_sack(sctp_t *sctp, mblk_t *dups)
sctp->sctp_active = lbolt64;
BUMP_MIB(&sctps->sctps_mib, sctpOutAck);
- sctp_add_sendq(sctp, smp);
+
+ sctp_set_iplen(sctp, smp, sctp->sctp_lastdata->ixa);
+ (void) conn_ip_output(smp, sctp->sctp_lastdata->ixa);
+ BUMP_LOCAL(sctp->sctp_opkts);
return (B_TRUE);
}
@@ -1813,8 +1837,9 @@ sctp_check_abandoned_msg(sctp_t *sctp, mblk_t *meta)
return (ENOMEM);
}
SCTP_MSG_SET_ABANDONED(meta);
- sctp_set_iplen(sctp, head);
- sctp_add_sendq(sctp, head);
+ sctp_set_iplen(sctp, head, fp->ixa);
+ (void) conn_ip_output(head, fp->ixa);
+ BUMP_LOCAL(sctp->sctp_opkts);
if (!fp->timer_running)
SCTP_FADDR_TIMER_RESTART(sctp, fp, fp->rto);
mp1 = mp1->b_next;
@@ -2080,13 +2105,13 @@ sctp_ftsn_check_frag(sctp_t *sctp, uint16_t ssn, sctp_instr_t *sip)
* messages, if any, from the instream queue (that were waiting for this
* sid-ssn message to show up). Once we are done try to update the SACK
* info. We could get a duplicate Forward TSN, in which case just send
- * a SACK. If any of the sid values in the the Forward TSN is invalid,
+ * a SACK. If any of the sid values in the Forward TSN is invalid,
* send back an "Invalid Stream Identifier" error and continue processing
* the rest.
*/
static void
sctp_process_forward_tsn(sctp_t *sctp, sctp_chunk_hdr_t *ch, sctp_faddr_t *fp,
- ip6_pkt_t *ipp)
+ ip_pkt_t *ipp, ip_recv_attr_t *ira)
{
uint32_t *ftsn = (uint32_t *)(ch + 1);
ftsn_entry_t *ftsn_entry;
@@ -2171,7 +2196,7 @@ sctp_process_forward_tsn(sctp_t *sctp, sctp_chunk_hdr_t *ch, sctp_faddr_t *fp,
dmp->b_next = NULL;
ASSERT(dmp->b_prev == NULL);
if (sctp_input_add_ancillary(sctp,
- &dmp, dc, fp, ipp) == 0) {
+ &dmp, dc, fp, ipp, ira) == 0) {
sctp->sctp_rxqueued -= dlen;
sctp->sctp_rwnd -= dlen;
/*
@@ -2280,8 +2305,9 @@ sctp_check_abandoned_data(sctp_t *sctp, sctp_faddr_t *fp)
SCTP_FADDR_TIMER_RESTART(sctp, fp, fp->rto);
return;
}
- sctp_set_iplen(sctp, nmp);
- sctp_add_sendq(sctp, nmp);
+ sctp_set_iplen(sctp, nmp, fp->ixa);
+ (void) conn_ip_output(nmp, fp->ixa);
+ BUMP_LOCAL(sctp->sctp_opkts);
if (!fp->timer_running)
SCTP_FADDR_TIMER_RESTART(sctp, fp, fp->rto);
}
@@ -2604,8 +2630,9 @@ sctp_got_sack(sctp_t *sctp, sctp_chunk_hdr_t *sch)
sctp->sctp_zero_win_probe = B_FALSE;
sctp->sctp_rxt_nxttsn = sctp->sctp_ltsn;
sctp->sctp_rxt_maxtsn = sctp->sctp_ltsn;
- sctp_set_iplen(sctp, pkt);
- sctp_add_sendq(sctp, pkt);
+ sctp_set_iplen(sctp, pkt, fp->ixa);
+ (void) conn_ip_output(pkt, fp->ixa);
+ BUMP_LOCAL(sctp->sctp_opkts);
}
} else {
if (sctp->sctp_zero_win_probe) {
@@ -3160,97 +3187,15 @@ sctp_check_input(sctp_t *sctp, sctp_chunk_hdr_t *ch, ssize_t len, int first)
return (1);
}
-/* ARGSUSED */
-static sctp_hdr_t *
-find_sctp_hdrs(mblk_t *mp, in6_addr_t *src, in6_addr_t *dst,
- uint_t *ifindex, uint_t *ip_hdr_len, ip6_pkt_t *ipp, ip_pktinfo_t *pinfo)
-{
- uchar_t *rptr;
- ipha_t *ip4h;
- ip6_t *ip6h;
- mblk_t *mp1;
-
- rptr = mp->b_rptr;
- if (IPH_HDR_VERSION(rptr) == IPV4_VERSION) {
- *ip_hdr_len = IPH_HDR_LENGTH(rptr);
- ip4h = (ipha_t *)rptr;
- IN6_IPADDR_TO_V4MAPPED(ip4h->ipha_src, src);
- IN6_IPADDR_TO_V4MAPPED(ip4h->ipha_dst, dst);
-
- ipp->ipp_fields |= IPPF_HOPLIMIT;
- ipp->ipp_hoplimit = ((ipha_t *)rptr)->ipha_ttl;
- if (pinfo != NULL && (pinfo->ip_pkt_flags & IPF_RECVIF)) {
- ipp->ipp_fields |= IPPF_IFINDEX;
- ipp->ipp_ifindex = pinfo->ip_pkt_ifindex;
- }
- } else {
- ASSERT(IPH_HDR_VERSION(rptr) == IPV6_VERSION);
- ip6h = (ip6_t *)rptr;
- ipp->ipp_fields = IPPF_HOPLIMIT;
- ipp->ipp_hoplimit = ip6h->ip6_hops;
-
- if (ip6h->ip6_nxt != IPPROTO_SCTP) {
- /* Look for ifindex information */
- if (ip6h->ip6_nxt == IPPROTO_RAW) {
- ip6i_t *ip6i = (ip6i_t *)ip6h;
-
- if (ip6i->ip6i_flags & IP6I_IFINDEX) {
- ASSERT(ip6i->ip6i_ifindex != 0);
- ipp->ipp_fields |= IPPF_IFINDEX;
- ipp->ipp_ifindex = ip6i->ip6i_ifindex;
- }
- rptr = (uchar_t *)&ip6i[1];
- mp->b_rptr = rptr;
- if (rptr == mp->b_wptr) {
- mp1 = mp->b_cont;
- freeb(mp);
- mp = mp1;
- rptr = mp->b_rptr;
- }
- ASSERT(mp->b_wptr - rptr >=
- IPV6_HDR_LEN + sizeof (sctp_hdr_t));
- ip6h = (ip6_t *)rptr;
- }
- /*
- * Find any potentially interesting extension headers
- * as well as the length of the IPv6 + extension
- * headers.
- */
- *ip_hdr_len = ip_find_hdr_v6(mp, ip6h, ipp, NULL);
- } else {
- *ip_hdr_len = IPV6_HDR_LEN;
- }
- *src = ip6h->ip6_src;
- *dst = ip6h->ip6_dst;
- }
- ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX);
- return ((sctp_hdr_t *)&rptr[*ip_hdr_len]);
-#undef IPVER
-}
-
static mblk_t *
-sctp_check_in_policy(mblk_t *mp, mblk_t *ipsec_mp)
+sctp_check_in_policy(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst)
{
- ipsec_in_t *ii;
- boolean_t check = B_TRUE;
boolean_t policy_present;
ipha_t *ipha;
ip6_t *ip6h;
- netstack_t *ns;
- ipsec_stack_t *ipss;
-
- ii = (ipsec_in_t *)ipsec_mp->b_rptr;
- ASSERT(ii->ipsec_in_type == IPSEC_IN);
- ns = ii->ipsec_in_ns;
- ipss = ns->netstack_ipsec;
-
- if (ii->ipsec_in_dont_check) {
- check = B_FALSE;
- if (!ii->ipsec_in_secure) {
- freeb(ipsec_mp);
- ipsec_mp = NULL;
- }
- }
+ netstack_t *ns = ipst->ips_netstack;
+ ipsec_stack_t *ipss = ns->netstack_ipsec;
+
if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) {
policy_present = ipss->ipsec_inbound_v4_policy_present;
ipha = (ipha_t *)mp->b_rptr;
@@ -3261,109 +3206,88 @@ sctp_check_in_policy(mblk_t *mp, mblk_t *ipsec_mp)
ip6h = (ip6_t *)mp->b_rptr;
}
- if (check && policy_present) {
+ if (policy_present) {
/*
* The conn_t parameter is NULL because we already know
* nobody's home.
*/
- ipsec_mp = ipsec_check_global_policy(ipsec_mp, (conn_t *)NULL,
- ipha, ip6h, B_TRUE, ns);
- if (ipsec_mp == NULL)
+ mp = ipsec_check_global_policy(mp, (conn_t *)NULL,
+ ipha, ip6h, ira, ns);
+ if (mp == NULL)
return (NULL);
}
- if (ipsec_mp != NULL)
- freeb(ipsec_mp);
return (mp);
}
/* Handle out-of-the-blue packets */
void
-sctp_ootb_input(mblk_t *mp, ill_t *recv_ill, zoneid_t zoneid,
- boolean_t mctl_present)
+sctp_ootb_input(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst)
{
sctp_t *sctp;
sctp_chunk_hdr_t *ch;
sctp_hdr_t *sctph;
in6_addr_t src, dst;
- uint_t ip_hdr_len;
- uint_t ifindex;
- ip6_pkt_t ipp;
+ uint_t ip_hdr_len = ira->ira_ip_hdr_length;
ssize_t mlen;
- ip_pktinfo_t *pinfo = NULL;
- mblk_t *first_mp;
sctp_stack_t *sctps;
- ip_stack_t *ipst;
+ boolean_t secure;
+ zoneid_t zoneid = ira->ira_zoneid;
+ uchar_t *rptr;
+
+ ASSERT(ira->ira_ill == NULL);
+
+ secure = ira->ira_flags & IRAF_IPSEC_SECURE;
- ASSERT(recv_ill != NULL);
- ipst = recv_ill->ill_ipst;
sctps = ipst->ips_netstack->netstack_sctp;
BUMP_MIB(&sctps->sctps_mib, sctpOutOfBlue);
BUMP_MIB(&sctps->sctps_mib, sctpInSCTPPkts);
- if (sctps->sctps_gsctp == NULL) {
- /*
- * For non-zero stackids the default queue isn't created
- * until the first open, thus there can be a need to send
- * an error before then. But we can't do that, hence we just
- * drop the packet. Later during boot, when the default queue
- * has been setup, a retransmitted packet from the peer
- * will result in a error.
- */
- ASSERT(sctps->sctps_netstack->netstack_stackid !=
- GLOBAL_NETSTACKID);
- freemsg(mp);
- return;
- }
-
- first_mp = mp;
- if (mctl_present)
- mp = mp->b_cont;
-
- /* Initiate IPPf processing, if needed. */
- if (IPP_ENABLED(IPP_LOCAL_IN, ipst)) {
- ip_process(IPP_LOCAL_IN, &mp,
- recv_ill->ill_phyint->phyint_ifindex);
- if (mp == NULL) {
- if (mctl_present)
- freeb(first_mp);
- return;
- }
- }
-
if (mp->b_cont != NULL) {
/*
* All subsequent code is vastly simplified if it can
* assume a single contiguous chunk of data.
*/
if (pullupmsg(mp, -1) == 0) {
- BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsInDiscards);
- freemsg(first_mp);
+ BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards", mp, NULL);
+ freemsg(mp);
return;
}
}
- /*
- * We don't really need to call this function... Need to
- * optimize later.
- */
- sctph = find_sctp_hdrs(mp, &src, &dst, &ifindex, &ip_hdr_len,
- &ipp, pinfo);
+ rptr = mp->b_rptr;
+ sctph = ((sctp_hdr_t *)&rptr[ip_hdr_len]);
+ if (ira->ira_flags & IRAF_IS_IPV4) {
+ ipha_t *ipha;
+
+ ipha = (ipha_t *)rptr;
+ IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &src);
+ IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &dst);
+ } else {
+ ip6_t *ip6h;
+
+ ip6h = (ip6_t *)rptr;
+ src = ip6h->ip6_src;
+ dst = ip6h->ip6_dst;
+ }
+
mlen = mp->b_wptr - (uchar_t *)(sctph + 1);
if ((ch = sctp_first_chunk((uchar_t *)(sctph + 1), mlen)) == NULL) {
dprint(3, ("sctp_ootb_input: invalid packet\n"));
- BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsInDiscards);
- freemsg(first_mp);
+ BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards", mp, NULL);
+ freemsg(mp);
return;
}
switch (ch->sch_id) {
case CHUNK_INIT:
/* no listener; send abort */
- if (mctl_present && sctp_check_in_policy(mp, first_mp) == NULL)
+ if (secure && sctp_check_in_policy(mp, ira, ipst) == NULL)
return;
- sctp_send_abort(sctps->sctps_gsctp, sctp_init2vtag(ch), 0,
- NULL, 0, mp, 0, B_TRUE);
+ sctp_ootb_send_abort(sctp_init2vtag(ch), 0,
+ NULL, 0, mp, 0, B_TRUE, ira, ipst);
break;
case CHUNK_INIT_ACK:
/* check for changed src addr */
@@ -3372,11 +3296,7 @@ sctp_ootb_input(mblk_t *mp, ill_t *recv_ill, zoneid_t zoneid,
/* success; proceed to normal path */
mutex_enter(&sctp->sctp_lock);
if (sctp->sctp_running) {
- if (!sctp_add_recvq(sctp, mp, B_FALSE)) {
- BUMP_MIB(recv_ill->ill_ip_mib,
- ipIfStatsInDiscards);
- freemsg(mp);
- }
+ sctp_add_recvq(sctp, mp, B_FALSE, ira);
mutex_exit(&sctp->sctp_lock);
} else {
/*
@@ -3387,152 +3307,101 @@ sctp_ootb_input(mblk_t *mp, ill_t *recv_ill, zoneid_t zoneid,
*/
sctp->sctp_running = B_TRUE;
mutex_exit(&sctp->sctp_lock);
- sctp_input_data(sctp, mp, NULL);
+ sctp_input_data(sctp, mp, ira);
WAKE_SCTP(sctp);
- sctp_process_sendq(sctp);
}
SCTP_REFRELE(sctp);
return;
}
- if (mctl_present)
- freeb(first_mp);
/* else bogus init ack; drop it */
break;
case CHUNK_SHUTDOWN_ACK:
- if (mctl_present && sctp_check_in_policy(mp, first_mp) == NULL)
+ if (secure && sctp_check_in_policy(mp, ira, ipst) == NULL)
return;
- sctp_ootb_shutdown_ack(sctps->sctps_gsctp, mp, ip_hdr_len);
- sctp_process_sendq(sctps->sctps_gsctp);
+ sctp_ootb_shutdown_ack(mp, ip_hdr_len, ira, ipst);
return;
case CHUNK_ERROR:
case CHUNK_ABORT:
case CHUNK_COOKIE_ACK:
case CHUNK_SHUTDOWN_COMPLETE:
- if (mctl_present)
- freeb(first_mp);
break;
default:
- if (mctl_present && sctp_check_in_policy(mp, first_mp) == NULL)
+ if (secure && sctp_check_in_policy(mp, ira, ipst) == NULL)
return;
- sctp_send_abort(sctps->sctps_gsctp, sctph->sh_verf, 0,
- NULL, 0, mp, 0, B_TRUE);
+ sctp_ootb_send_abort(sctph->sh_verf, 0,
+ NULL, 0, mp, 0, B_TRUE, ira, ipst);
break;
}
- sctp_process_sendq(sctps->sctps_gsctp);
freemsg(mp);
}
+/*
+ * Handle sctp packets.
+ * Note that we rele the sctp_t (the caller got a reference on it).
+ */
void
-sctp_input(conn_t *connp, ipha_t *ipha, mblk_t *mp, mblk_t *first_mp,
- ill_t *recv_ill, boolean_t isv4, boolean_t mctl_present)
+sctp_input(conn_t *connp, ipha_t *ipha, ip6_t *ip6h, mblk_t *mp,
+ ip_recv_attr_t *ira)
{
- sctp_t *sctp = CONN2SCTP(connp);
- ip_stack_t *ipst = recv_ill->ill_ipst;
+ sctp_t *sctp = CONN2SCTP(connp);
+ boolean_t secure;
+ ill_t *ill = ira->ira_ill;
+ ip_stack_t *ipst = ill->ill_ipst;
ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec;
+ iaflags_t iraflags = ira->ira_flags;
+ ill_t *rill = ira->ira_rill;
+
+ secure = iraflags & IRAF_IPSEC_SECURE;
/*
* We check some fields in conn_t without holding a lock.
* This should be fine.
*/
- if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || mctl_present) {
- first_mp = ipsec_check_inbound_policy(first_mp, connp,
- ipha, NULL, mctl_present);
- if (first_mp == NULL) {
- BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsInDiscards);
- SCTP_REFRELE(sctp);
- return;
- }
- }
-
- /* Initiate IPPF processing for fastpath */
- if (IPP_ENABLED(IPP_LOCAL_IN, ipst)) {
- ip_process(IPP_LOCAL_IN, &mp,
- recv_ill->ill_phyint->phyint_ifindex);
+ if (((iraflags & IRAF_IS_IPV4) ?
+ CONN_INBOUND_POLICY_PRESENT(connp, ipss) :
+ CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss)) ||
+ secure) {
+ mp = ipsec_check_inbound_policy(mp, connp, ipha,
+ ip6h, ira);
if (mp == NULL) {
+ BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
+ /* Note that mp is NULL */
+ ip_drop_input("ipIfStatsInDiscards", mp, ill);
SCTP_REFRELE(sctp);
- if (mctl_present)
- freeb(first_mp);
return;
- } else if (mctl_present) {
- /*
- * ip_process might return a new mp.
- */
- ASSERT(first_mp != mp);
- first_mp->b_cont = mp;
- } else {
- first_mp = mp;
}
}
- if (connp->conn_recvif || connp->conn_recvslla ||
- connp->conn_ip_recvpktinfo) {
- int in_flags = 0;
-
- if (connp->conn_recvif || connp->conn_ip_recvpktinfo) {
- in_flags = IPF_RECVIF;
- }
- if (connp->conn_recvslla) {
- in_flags |= IPF_RECVSLLA;
- }
- if (isv4) {
- mp = ip_add_info(mp, recv_ill, in_flags,
- IPCL_ZONEID(connp), ipst);
- } else {
- mp = ip_add_info_v6(mp, recv_ill,
- &(((ip6_t *)ipha)->ip6_dst));
- }
- if (mp == NULL) {
- BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsInDiscards);
- SCTP_REFRELE(sctp);
- if (mctl_present)
- freeb(first_mp);
- return;
- } else if (mctl_present) {
- /*
- * ip_add_info might return a new mp.
- */
- ASSERT(first_mp != mp);
- first_mp->b_cont = mp;
- } else {
- first_mp = mp;
- }
- }
+ ira->ira_ill = ira->ira_rill = NULL;
mutex_enter(&sctp->sctp_lock);
if (sctp->sctp_running) {
- if (mctl_present)
- mp->b_prev = first_mp;
- if (!sctp_add_recvq(sctp, mp, B_FALSE)) {
- BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsInDiscards);
- freemsg(first_mp);
- }
+ sctp_add_recvq(sctp, mp, B_FALSE, ira);
mutex_exit(&sctp->sctp_lock);
- SCTP_REFRELE(sctp);
- return;
+ goto done;
} else {
sctp->sctp_running = B_TRUE;
mutex_exit(&sctp->sctp_lock);
mutex_enter(&sctp->sctp_recvq_lock);
if (sctp->sctp_recvq != NULL) {
- if (mctl_present)
- mp->b_prev = first_mp;
- if (!sctp_add_recvq(sctp, mp, B_TRUE)) {
- BUMP_MIB(recv_ill->ill_ip_mib,
- ipIfStatsInDiscards);
- freemsg(first_mp);
- }
+ sctp_add_recvq(sctp, mp, B_TRUE, ira);
mutex_exit(&sctp->sctp_recvq_lock);
WAKE_SCTP(sctp);
- SCTP_REFRELE(sctp);
- return;
+ goto done;
}
}
mutex_exit(&sctp->sctp_recvq_lock);
- sctp_input_data(sctp, mp, (mctl_present ? first_mp : NULL));
+ if (ira->ira_flags & IRAF_ICMP_ERROR)
+ sctp_icmp_error(sctp, mp);
+ else
+ sctp_input_data(sctp, mp, ira);
WAKE_SCTP(sctp);
- sctp_process_sendq(sctp);
+
+done:
SCTP_REFRELE(sctp);
+ ira->ira_ill = ill;
+ ira->ira_rill = rill;
}
static void
@@ -3549,7 +3418,7 @@ sctp_process_abort(sctp_t *sctp, sctp_chunk_hdr_t *ch, int err)
}
void
-sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp)
+sctp_input_data(sctp_t *sctp, mblk_t *mp, ip_recv_attr_t *ira)
{
sctp_chunk_hdr_t *ch;
ssize_t mlen;
@@ -3559,17 +3428,15 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp)
sctp_init_chunk_t *iack;
uint32_t tsn;
sctp_data_hdr_t *sdc;
- ip6_pkt_t ipp;
+ ip_pkt_t ipp;
in6_addr_t src;
in6_addr_t dst;
uint_t ifindex;
sctp_hdr_t *sctph;
- uint_t ip_hdr_len;
+ uint_t ip_hdr_len = ira->ira_ip_hdr_length;
mblk_t *dups = NULL;
int recv_adaptation;
boolean_t wake_eager = B_FALSE;
- mblk_t *pinfo_mp;
- ip_pktinfo_t *pinfo = NULL;
in6_addr_t peer_src;
int64_t now;
sctp_stack_t *sctps = sctp->sctp_sctps;
@@ -3577,23 +3444,11 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp)
boolean_t hb_already = B_FALSE;
cred_t *cr;
pid_t cpid;
+ uchar_t *rptr;
+ conn_t *connp = sctp->sctp_connp;
- if (DB_TYPE(mp) != M_DATA) {
- ASSERT(DB_TYPE(mp) == M_CTL);
- if (MBLKL(mp) == sizeof (ip_pktinfo_t) &&
- ((ip_pktinfo_t *)mp->b_rptr)->ip_pkt_ulp_type ==
- IN_PKTINFO) {
- pinfo = (ip_pktinfo_t *)mp->b_rptr;
- pinfo_mp = mp;
- mp = mp->b_cont;
- } else {
- if (ipsec_mp != NULL)
- freeb(ipsec_mp);
- sctp_icmp_error(sctp, mp);
- return;
- }
- }
ASSERT(DB_TYPE(mp) == M_DATA);
+ ASSERT(ira->ira_ill == NULL);
if (mp->b_cont != NULL) {
/*
@@ -3602,32 +3457,72 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp)
*/
if (pullupmsg(mp, -1) == 0) {
BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards);
- if (ipsec_mp != NULL)
- freeb(ipsec_mp);
- if (pinfo != NULL)
- freeb(pinfo_mp);
+ ip_drop_input("ipIfStatsInDiscards", mp, NULL);
freemsg(mp);
return;
}
}
BUMP_LOCAL(sctp->sctp_ipkts);
- sctph = find_sctp_hdrs(mp, &src, &dst, &ifindex, &ip_hdr_len,
- &ipp, pinfo);
- if (pinfo != NULL)
- freeb(pinfo_mp);
+ ifindex = ira->ira_ruifindex;
+
+ rptr = mp->b_rptr;
+
+ ipp.ipp_fields = 0;
+ if (connp->conn_recv_ancillary.crb_all != 0) {
+ /*
+ * Record packet information in the ip_pkt_t
+ */
+ if (ira->ira_flags & IRAF_IS_IPV4) {
+ (void) ip_find_hdr_v4((ipha_t *)rptr, &ipp,
+ B_FALSE);
+ } else {
+ uint8_t nexthdrp;
+
+ /*
+ * IPv6 packets can only be received by applications
+ * that are prepared to receive IPv6 addresses.
+ * The IP fanout must ensure this.
+ */
+ ASSERT(connp->conn_family == AF_INET6);
+
+ (void) ip_find_hdr_v6(mp, (ip6_t *)rptr, B_TRUE, &ipp,
+ &nexthdrp);
+ ASSERT(nexthdrp == IPPROTO_SCTP);
+
+ /* Could have caused a pullup? */
+ rptr = mp->b_rptr;
+ }
+ }
+
+ sctph = ((sctp_hdr_t *)&rptr[ip_hdr_len]);
+
+ if (ira->ira_flags & IRAF_IS_IPV4) {
+ ipha_t *ipha;
+
+ ipha = (ipha_t *)rptr;
+ IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &src);
+ IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &dst);
+ } else {
+ ip6_t *ip6h;
+
+ ip6h = (ip6_t *)rptr;
+ src = ip6h->ip6_src;
+ dst = ip6h->ip6_dst;
+ }
+
mlen = mp->b_wptr - (uchar_t *)(sctph + 1);
ch = sctp_first_chunk((uchar_t *)(sctph + 1), mlen);
if (ch == NULL) {
BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards);
- if (ipsec_mp != NULL)
- freeb(ipsec_mp);
+ ip_drop_input("ipIfStatsInDiscards", mp, NULL);
freemsg(mp);
return;
}
if (!sctp_check_input(sctp, ch, mlen, 1)) {
BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards);
+ ip_drop_input("ipIfStatsInDiscards", mp, NULL);
goto done;
}
/*
@@ -3661,9 +3556,7 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp)
if (sctp->sctp_state > SCTPS_BOUND &&
sctp->sctp_state < SCTPS_ESTABLISHED) {
/* treat as OOTB */
- sctp_ootb_shutdown_ack(sctp, mp, ip_hdr_len);
- if (ipsec_mp != NULL)
- freeb(ipsec_mp);
+ sctp_ootb_shutdown_ack(mp, ip_hdr_len, ira, ipst);
return;
}
/* else fallthru */
@@ -3717,7 +3610,7 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp)
tsn = sdc->sdh_tsn;
sctp_send_abort(sctp, sctp->sctp_fvtag,
SCTP_ERR_NO_USR_DATA, (char *)&tsn,
- sizeof (tsn), mp, 0, B_FALSE);
+ sizeof (tsn), mp, 0, B_FALSE, ira);
sctp_assoc_event(sctp, SCTP_COMM_LOST,
0, NULL);
sctp_clean_death(sctp, ECONNABORTED);
@@ -3726,7 +3619,8 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp)
ASSERT(fp != NULL);
sctp->sctp_lastdata = fp;
- sctp_data_chunk(sctp, ch, mp, &dups, fp, &ipp);
+ sctp_data_chunk(sctp, ch, mp, &dups, fp,
+ &ipp, ira);
gotdata = 1;
/* Restart shutdown timer if shutting down */
if (sctp->sctp_state == SCTPS_SHUTDOWN_SENT) {
@@ -3743,7 +3637,7 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp)
sctps->sctps_shutack_wait_bound) {
sctp_send_abort(sctp,
sctp->sctp_fvtag, 0, NULL,
- 0, mp, 0, B_FALSE);
+ 0, mp, 0, B_FALSE, ira);
sctp_assoc_event(sctp,
SCTP_COMM_LOST, 0, NULL);
sctp_clean_death(sctp,
@@ -3764,7 +3658,7 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp)
trysend = sctp_got_sack(sctp, ch);
if (trysend < 0) {
sctp_send_abort(sctp, sctph->sh_verf,
- 0, NULL, 0, mp, 0, B_FALSE);
+ 0, NULL, 0, mp, 0, B_FALSE, ira);
sctp_assoc_event(sctp,
SCTP_COMM_LOST, 0, NULL);
sctp_clean_death(sctp,
@@ -3820,11 +3714,11 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp)
goto done;
}
case CHUNK_INIT:
- sctp_send_initack(sctp, sctph, ch, mp);
+ sctp_send_initack(sctp, sctph, ch, mp, ira);
break;
case CHUNK_COOKIE:
if (sctp_process_cookie(sctp, ch, mp, &iack,
- sctph, &recv_adaptation, NULL) != -1) {
+ sctph, &recv_adaptation, NULL, ira) != -1) {
sctp_send_cookie_ack(sctp);
sctp_assoc_event(sctp, SCTP_RESTART,
0, NULL);
@@ -3841,7 +3735,8 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp)
int error;
BUMP_LOCAL(sctp->sctp_ibchunks);
- error = sctp_handle_error(sctp, sctph, ch, mp);
+ error = sctp_handle_error(sctp, sctph, ch, mp,
+ ira);
if (error != 0) {
sctp_assoc_event(sctp, SCTP_COMM_LOST,
0, NULL);
@@ -3864,7 +3759,8 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp)
case CHUNK_FORWARD_TSN:
ASSERT(fp != NULL);
sctp->sctp_lastdata = fp;
- sctp_process_forward_tsn(sctp, ch, fp, &ipp);
+ sctp_process_forward_tsn(sctp, ch, fp,
+ &ipp, ira);
gotdata = 1;
BUMP_LOCAL(sctp->sctp_ibchunks);
break;
@@ -3879,13 +3775,14 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp)
case SCTPS_LISTEN:
switch (ch->sch_id) {
case CHUNK_INIT:
- sctp_send_initack(sctp, sctph, ch, mp);
+ sctp_send_initack(sctp, sctph, ch, mp, ira);
break;
case CHUNK_COOKIE: {
sctp_t *eager;
if (sctp_process_cookie(sctp, ch, mp, &iack,
- sctph, &recv_adaptation, &peer_src) == -1) {
+ sctph, &recv_adaptation, &peer_src,
+ ira) == -1) {
BUMP_MIB(&sctps->sctps_mib,
sctpInInvalidCookie);
goto done;
@@ -3900,11 +3797,11 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp)
goto done;
eager = sctp_conn_request(sctp, mp, ifindex,
- ip_hdr_len, iack, ipsec_mp);
+ ip_hdr_len, iack, ira);
if (eager == NULL) {
sctp_send_abort(sctp, sctph->sh_verf,
SCTP_ERR_NO_RESOURCES, NULL, 0, mp,
- 0, B_FALSE);
+ 0, B_FALSE, ira);
goto done;
}
@@ -3933,9 +3830,6 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp)
BUMP_MIB(&sctps->sctps_mib, sctpPassiveEstab);
if (mlen > ntohs(ch->sch_len)) {
eager->sctp_cookie_mp = dupb(mp);
- mblk_setcred(eager->sctp_cookie_mp,
- CONN_CRED(eager->sctp_connp),
- eager->sctp_cpid);
/*
* If no mem, just let
* the peer retransmit.
@@ -3986,7 +3880,7 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp)
default:
BUMP_LOCAL(sctp->sctp_ibchunks);
sctp_send_abort(sctp, sctph->sh_verf, 0, NULL,
- 0, mp, 0, B_TRUE);
+ 0, mp, 0, B_TRUE, ira);
goto done;
}
break;
@@ -3996,20 +3890,21 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp)
case CHUNK_INIT_ACK:
sctp_stop_faddr_timers(sctp);
sctp_faddr_alive(sctp, sctp->sctp_current);
- sctp_send_cookie_echo(sctp, ch, mp);
+ sctp_send_cookie_echo(sctp, ch, mp, ira);
BUMP_LOCAL(sctp->sctp_ibchunks);
break;
case CHUNK_ABORT:
sctp_process_abort(sctp, ch, ECONNREFUSED);
goto done;
case CHUNK_INIT:
- sctp_send_initack(sctp, sctph, ch, mp);
+ sctp_send_initack(sctp, sctph, ch, mp, ira);
break;
case CHUNK_COOKIE:
- cr = msg_getcred(mp, &cpid);
+ cr = ira->ira_cred;
+ cpid = ira->ira_cpid;
if (sctp_process_cookie(sctp, ch, mp, &iack,
- sctph, &recv_adaptation, NULL) == -1) {
+ sctph, &recv_adaptation, NULL, ira) == -1) {
BUMP_MIB(&sctps->sctps_mib,
sctpInInvalidCookie);
break;
@@ -4053,7 +3948,8 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp)
case SCTPS_COOKIE_ECHOED:
switch (ch->sch_id) {
case CHUNK_COOKIE_ACK:
- cr = msg_getcred(mp, &cpid);
+ cr = ira->ira_cred;
+ cpid = ira->ira_cpid;
if (!SCTP_IS_DETACHED(sctp)) {
sctp->sctp_ulp_connected(
@@ -4084,10 +3980,11 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp)
sctp_process_abort(sctp, ch, ECONNREFUSED);
goto done;
case CHUNK_COOKIE:
- cr = msg_getcred(mp, &cpid);
+ cr = ira->ira_cred;
+ cpid = ira->ira_cpid;
if (sctp_process_cookie(sctp, ch, mp, &iack,
- sctph, &recv_adaptation, NULL) == -1) {
+ sctph, &recv_adaptation, NULL, ira) == -1) {
BUMP_MIB(&sctps->sctps_mib,
sctpInInvalidCookie);
break;
@@ -4122,7 +4019,7 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp)
trysend = 1;
break;
case CHUNK_INIT:
- sctp_send_initack(sctp, sctph, ch, mp);
+ sctp_send_initack(sctp, sctph, ch, mp, ira);
break;
case CHUNK_ERROR: {
sctp_parm_hdr_t *p;
@@ -4165,7 +4062,7 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp)
switch (ch->sch_id) {
case CHUNK_ABORT:
/* Pass gathered wisdom to IP for keeping */
- sctp_update_ire(sctp);
+ sctp_update_dce(sctp);
sctp_process_abort(sctp, ch, 0);
goto done;
case CHUNK_SHUTDOWN_COMPLETE:
@@ -4175,7 +4072,7 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp)
NULL);
/* Pass gathered wisdom to IP for keeping */
- sctp_update_ire(sctp);
+ sctp_update_dce(sctp);
sctp_clean_death(sctp, 0);
goto done;
case CHUNK_SHUTDOWN_ACK:
@@ -4215,7 +4112,7 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp)
trysend = sctp_got_sack(sctp, ch);
if (trysend < 0) {
sctp_send_abort(sctp, sctph->sh_verf,
- 0, NULL, 0, mp, 0, B_FALSE);
+ 0, NULL, 0, mp, 0, B_FALSE, ira);
sctp_assoc_event(sctp,
SCTP_COMM_LOST, 0, NULL);
sctp_clean_death(sctp,
@@ -4287,8 +4184,6 @@ nomorechunks:
done:
if (dups != NULL)
freeb(dups);
- if (ipsec_mp != NULL)
- freeb(ipsec_mp);
freemsg(mp);
if (sctp->sctp_err_chunks != NULL)
@@ -4297,15 +4192,9 @@ done:
if (wake_eager) {
/*
* sctp points to newly created control block, need to
- * release it before exiting. Before releasing it and
- * processing the sendq, need to grab a hold on it.
- * Otherwise, another thread can close it while processing
- * the sendq.
+ * release it before exiting.
*/
- SCTP_REFHOLD(sctp);
WAKE_SCTP(sctp);
- sctp_process_sendq(sctp);
- SCTP_REFRELE(sctp);
}
}
@@ -4340,12 +4229,6 @@ sctp_recvd(sctp_t *sctp, int len)
sctp->sctp_force_sack = 1;
BUMP_MIB(&sctps->sctps_mib, sctpOutWinUpdate);
(void) sctp_sack(sctp, NULL);
- old = 1;
- } else {
- old = 0;
}
WAKE_SCTP(sctp);
- if (old > 0) {
- sctp_process_sendq(sctp);
- }
}
diff --git a/usr/src/uts/common/inet/sctp/sctp_ioc.c b/usr/src/uts/common/inet/sctp/sctp_ioc.c
index 7150c48c4b..5f5c2ee629 100644
--- a/usr/src/uts/common/inet/sctp/sctp_ioc.c
+++ b/usr/src/uts/common/inet/sctp/sctp_ioc.c
@@ -49,69 +49,7 @@
#include "sctp_impl.h"
/*
- * We need a stream q for sending packets to IP. This q should
- * be set in strplumb() time. Once it is set, it will never
- * be removed. Since it is done in strplumb() time, there is
- * no need to have a lock on the default q.
- */
-static void
-sctp_def_q_set(queue_t *q, mblk_t *mp)
-{
- conn_t *connp = (conn_t *)q->q_ptr;
- struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
- mblk_t *mp1;
- hrtime_t t;
- sctp_stack_t *sctps = connp->conn_netstack->
- netstack_sctp;
-
- if ((mp1 = mp->b_cont) == NULL) {
- iocp->ioc_error = EINVAL;
- ip0dbg(("sctp_def_q_set: no file descriptor\n"));
- goto done;
- }
-
- mutex_enter(&sctps->sctps_g_q_lock);
- if (sctps->sctps_g_q != NULL) {
- mutex_exit(&sctps->sctps_g_q_lock);
- ip0dbg(("sctp_def_q_set: already set\n"));
- iocp->ioc_error = EALREADY;
- goto done;
- }
-
- sctps->sctps_g_q = q;
- mutex_exit(&sctps->sctps_g_q_lock);
- sctps->sctps_gsctp = (sctp_t *)sctp_create(NULL, NULL, AF_INET6,
- SCTP_CAN_BLOCK, NULL, NULL, connp->conn_cred);
- mutex_enter(&sctps->sctps_g_q_lock);
- if (sctps->sctps_gsctp == NULL) {
- sctps->sctps_g_q = NULL;
- mutex_exit(&sctps->sctps_g_q_lock);
- iocp->ioc_error = ENOMEM;
- goto done;
- }
- mutex_exit(&sctps->sctps_g_q_lock);
- ASSERT(sctps->sctps_g_q_ref >= 1);
- ASSERT(list_head(&sctps->sctps_g_list) == sctps->sctps_gsctp);
-
- /*
- * As a good citizen of using /dev/urandom, add some entropy
- * to the random number pool.
- */
- t = gethrtime();
- (void) random_add_entropy((uint8_t *)&t, sizeof (t), 0);
-done:
- if (mp1 != NULL) {
- freemsg(mp1);
- mp->b_cont = NULL;
- }
- iocp->ioc_count = 0;
- mp->b_datap->db_type = M_IOCACK;
- qreply(q, mp);
-}
-
-
-/*
- * sctp_wput_ioctl is called by sctp_wput_slow to handle all
+ * sctp_wput_ioctl is called by sctp_wput to handle all
* M_IOCTL messages.
*/
void
@@ -119,7 +57,6 @@ sctp_wput_ioctl(queue_t *q, mblk_t *mp)
{
conn_t *connp = (conn_t *)q->q_ptr;
struct iocblk *iocp;
- cred_t *cr;
if (connp == NULL) {
ip0dbg(("sctp_wput_ioctl: null conn\n"));
@@ -127,24 +64,7 @@ sctp_wput_ioctl(queue_t *q, mblk_t *mp)
}
iocp = (struct iocblk *)mp->b_rptr;
- /*
- * prefer credential from mblk over ioctl;
- * see ip_sioctl_copyin_setup
- */
- cr = msg_getcred(mp, NULL);
- if (cr == NULL)
- cr = iocp->ioc_cr;
-
switch (iocp->ioc_cmd) {
- case SCTP_IOC_DEFAULT_Q:
- /* Wants to be the default wq. */
- if (cr != NULL && secpolicy_ip_config(cr, B_FALSE) != 0) {
- iocp->ioc_error = EPERM;
- goto err_ret;
- }
- sctp_def_q_set(q, mp);
- return;
-
case ND_SET:
/* sctp_nd_getset() -> nd_getset() does the checking. */
case ND_GET:
@@ -244,6 +164,9 @@ sctp_str_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
netstack_rele(ns);
connp->conn_zoneid = zoneid;
+ connp->conn_ixa->ixa_flags |= IXAF_SET_ULP_CKSUM;
+ /* conn_allzones can not be set this early, hence no IPCL_ZONEID */
+ connp->conn_ixa->ixa_zoneid = zoneid;
connp->conn_rq = q;
connp->conn_wq = WR(q);
@@ -276,6 +199,12 @@ sctp_str_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
ASSERT(connp->conn_cred == NULL);
connp->conn_cred = credp;
crhold(connp->conn_cred);
+ connp->conn_cpid = curproc->p_pid;
+ /* Cache things in ixa without an extra refhold */
+ connp->conn_ixa->ixa_cred = connp->conn_cred;
+ connp->conn_ixa->ixa_cpid = connp->conn_cpid;
+ if (is_system_labeled())
+ connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred);
/*
* Make the conn globally visible to walkers
diff --git a/usr/src/uts/common/inet/sctp/sctp_notify.c b/usr/src/uts/common/inet/sctp/sctp_notify.c
index 3ede878954..ea46e0bbd2 100644
--- a/usr/src/uts/common/inet/sctp/sctp_notify.c
+++ b/usr/src/uts/common/inet/sctp/sctp_notify.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -51,6 +51,7 @@ sctp_notify(sctp_t *sctp, mblk_t *emp, size_t len)
sctp_faddr_t *fp;
int32_t rwnd = 0;
int error;
+ conn_t *connp = sctp->sctp_connp;
if ((mp = allocb(sizeof (*tudi) + sizeof (void *) +
sizeof (struct sockaddr_in6), BPRI_HI)) == NULL) {
@@ -82,7 +83,7 @@ sctp_notify(sctp_t *sctp, mblk_t *emp, size_t len)
tudi->SRC_length = sizeof (*sin4);
sin4 = (struct sockaddr_in *)(tudi + 1);
sin4->sin_family = AF_INET;
- sin4->sin_port = sctp->sctp_fport;
+ sin4->sin_port = connp->conn_fport;
IN6_V4MAPPED_TO_IPADDR(&fp->faddr, sin4->sin_addr.s_addr);
mp->b_wptr = (uchar_t *)(sin4 + 1);
} else {
@@ -91,7 +92,7 @@ sctp_notify(sctp_t *sctp, mblk_t *emp, size_t len)
tudi->SRC_length = sizeof (*sin6);
sin6 = (struct sockaddr_in6 *)(tudi + 1);
sin6->sin6_family = AF_INET6;
- sin6->sin6_port = sctp->sctp_fport;
+ sin6->sin6_port = connp->conn_fport;
sin6->sin6_addr = fp->faddr;
mp->b_wptr = (uchar_t *)(sin6 + 1);
}
diff --git a/usr/src/uts/common/inet/sctp/sctp_opt_data.c b/usr/src/uts/common/inet/sctp/sctp_opt_data.c
index 322e4d461e..ee5eb445af 100644
--- a/usr/src/uts/common/inet/sctp/sctp_opt_data.c
+++ b/usr/src/uts/common/inet/sctp/sctp_opt_data.c
@@ -43,6 +43,7 @@
#include <inet/ip.h>
#include <inet/ip_ire.h>
#include <inet/ip_if.h>
+#include <inet/proto_set.h>
#include <inet/ipclassifier.h>
#include <inet/ipsec_impl.h>
@@ -60,68 +61,6 @@
static int sctp_getpeeraddrs(sctp_t *, void *, int *);
-/*
- * Copy the standard header into its new location,
- * lay in the new options and then update the relevant
- * fields in both sctp_t and the standard header.
- * Returns 0 on success, errno otherwise.
- */
-static int
-sctp_opt_set_header(sctp_t *sctp, const void *ptr, uint_t len)
-{
- uint8_t *ip_optp;
- sctp_hdr_t *new_sctph;
-
- if ((len > SCTP_MAX_IP_OPTIONS_LENGTH) || (len & 0x3))
- return (EINVAL);
-
- if (len > IP_MAX_OPT_LENGTH - sctp->sctp_v4label_len)
- return (EINVAL);
-
- ip_optp = (uint8_t *)sctp->sctp_ipha + IP_SIMPLE_HDR_LENGTH;
-
- if (sctp->sctp_v4label_len > 0) {
- int padlen;
- uint8_t opt;
-
- /* convert list termination to no-ops as needed */
- padlen = sctp->sctp_v4label_len - ip_optp[IPOPT_OLEN];
- ip_optp += ip_optp[IPOPT_OLEN];
- opt = len > 0 ? IPOPT_NOP : IPOPT_EOL;
- while (--padlen >= 0)
- *ip_optp++ = opt;
- ASSERT(ip_optp == (uint8_t *)sctp->sctp_ipha +
- IP_SIMPLE_HDR_LENGTH + sctp->sctp_v4label_len);
- }
-
- /*
- * Move the existing SCTP header out where it belongs.
- */
- new_sctph = (sctp_hdr_t *)(ip_optp + len);
- ovbcopy(sctp->sctp_sctph, new_sctph, sizeof (sctp_hdr_t));
- sctp->sctp_sctph = new_sctph;
-
- /*
- * Insert the new user-supplied IP options.
- */
- if (len > 0)
- bcopy(ptr, ip_optp, len);
-
- len += sctp->sctp_v4label_len;
- sctp->sctp_ip_hdr_len = len;
- sctp->sctp_ipha->ipha_version_and_hdr_length =
- (IP_VERSION << 4) | (len >> 2);
- sctp->sctp_hdr_len = len + sizeof (sctp_hdr_t);
-
- if (sctp->sctp_current) {
- /*
- * Could be setting options before setting up connection.
- */
- sctp_set_ulp_prop(sctp);
- }
- return (0);
-}
-
static int
sctp_get_status(sctp_t *sctp, void *ptr)
{
@@ -132,6 +71,7 @@ sctp_get_status(sctp_t *sctp, void *ptr)
struct sctp_paddrinfo *sp;
mblk_t *meta, *mp;
int i;
+ conn_t *connp = sctp->sctp_connp;
sstat->sstat_state = sctp->sctp_state;
sstat->sstat_rwnd = sctp->sctp_frwnd;
@@ -146,13 +86,13 @@ sctp_get_status(sctp_t *sctp, void *ptr)
if (fp->isv4) {
sin = (struct sockaddr_in *)&sp->spinfo_address;
sin->sin_family = AF_INET;
- sin->sin_port = sctp->sctp_fport;
+ sin->sin_port = connp->conn_fport;
IN6_V4MAPPED_TO_INADDR(&fp->faddr, &sin->sin_addr);
sp->spinfo_mtu = sctp->sctp_hdr_len;
} else {
sin6 = (struct sockaddr_in6 *)&sp->spinfo_address;
sin6->sin6_family = AF_INET6;
- sin6->sin6_port = sctp->sctp_fport;
+ sin6->sin6_port = connp->conn_fport;
sin6->sin6_addr = fp->faddr;
sp->spinfo_mtu = sctp->sctp_hdr6_len;
}
@@ -261,18 +201,16 @@ sctp_get_rtoinfo(sctp_t *sctp, void *ptr)
}
static int
-sctp_set_rtoinfo(sctp_t *sctp, const void *invalp, uint_t inlen)
+sctp_set_rtoinfo(sctp_t *sctp, const void *invalp)
{
const struct sctp_rtoinfo *srto;
boolean_t ispriv;
sctp_stack_t *sctps = sctp->sctp_sctps;
+ conn_t *connp = sctp->sctp_connp;
- if (inlen < sizeof (*srto)) {
- return (EINVAL);
- }
srto = invalp;
- ispriv = secpolicy_ip_config(sctp->sctp_credp, B_TRUE) == 0;
+ ispriv = secpolicy_ip_config(connp->conn_cred, B_TRUE) == 0;
/*
* Bounds checking. Priviledged user can set the RTO initial
@@ -334,17 +272,13 @@ sctp_get_assocparams(sctp_t *sctp, void *ptr)
}
static int
-sctp_set_assocparams(sctp_t *sctp, const void *invalp, uint_t inlen)
+sctp_set_assocparams(sctp_t *sctp, const void *invalp)
{
const struct sctp_assocparams *sap = invalp;
uint32_t sum = 0;
sctp_faddr_t *fp;
sctp_stack_t *sctps = sctp->sctp_sctps;
- if (inlen < sizeof (*sap)) {
- return (EINVAL);
- }
-
if (sap->sasoc_asocmaxrxt) {
if (sctp->sctp_faddrs) {
/*
@@ -403,6 +337,7 @@ sctp_set_initmsg(sctp_t *sctp, const void *invalp, uint_t inlen)
{
const struct sctp_initmsg *si = invalp;
sctp_stack_t *sctps = sctp->sctp_sctps;
+ conn_t *connp = sctp->sctp_connp;
if (sctp->sctp_state > SCTPS_LISTEN) {
return (EINVAL);
@@ -430,7 +365,7 @@ sctp_set_initmsg(sctp_t *sctp, const void *invalp, uint_t inlen)
return (EINVAL);
}
if (si->sinit_max_init_timeo != 0 &&
- (secpolicy_ip_config(sctp->sctp_credp, B_TRUE) != 0 &&
+ (secpolicy_ip_config(connp->conn_cred, B_TRUE) != 0 &&
(si->sinit_max_init_timeo < sctps->sctps_rto_maxg_low ||
si->sinit_max_init_timeo > sctps->sctps_rto_maxg_high))) {
return (EINVAL);
@@ -506,7 +441,7 @@ sctp_get_peer_addr_params(sctp_t *sctp, void *ptr)
}
static int
-sctp_set_peer_addr_params(sctp_t *sctp, const void *invalp, uint_t inlen)
+sctp_set_peer_addr_params(sctp_t *sctp, const void *invalp)
{
const struct sctp_paddrparams *spp = invalp;
sctp_faddr_t *fp, *fp2;
@@ -515,10 +450,6 @@ sctp_set_peer_addr_params(sctp_t *sctp, const void *invalp, uint_t inlen)
int64_t now;
sctp_stack_t *sctps = sctp->sctp_sctps;
- if (inlen < sizeof (*spp)) {
- return (EINVAL);
- }
-
retval = sctp_find_peer_fp(sctp, &spp->spp_address, &fp);
if (retval != 0) {
return (retval);
@@ -620,13 +551,10 @@ sctp_get_def_send_params(sctp_t *sctp, void *ptr)
}
static int
-sctp_set_def_send_params(sctp_t *sctp, const void *invalp, uint_t inlen)
+sctp_set_def_send_params(sctp_t *sctp, const void *invalp)
{
const struct sctp_sndrcvinfo *sinfo = invalp;
- if (inlen < sizeof (*sinfo)) {
- return (EINVAL);
- }
if (sinfo->sinfo_stream >= sctp->sctp_num_ostr) {
return (EINVAL);
}
@@ -641,16 +569,12 @@ sctp_set_def_send_params(sctp_t *sctp, const void *invalp, uint_t inlen)
}
static int
-sctp_set_prim(sctp_t *sctp, const void *invalp, uint_t inlen)
+sctp_set_prim(sctp_t *sctp, const void *invalp)
{
const struct sctp_setpeerprim *pp = invalp;
int retval;
sctp_faddr_t *fp;
- if (inlen < sizeof (*pp)) {
- return (EINVAL);
- }
-
retval = sctp_find_peer_fp(sctp, &pp->sspp_addr, &fp);
if (retval)
return (retval);
@@ -670,6 +594,183 @@ sctp_set_prim(sctp_t *sctp, const void *invalp, uint_t inlen)
return (0);
}
+/*
+ * Table of all known options handled on a SCTP protocol stack.
+ *
+ * Note: This table contains options processed by both SCTP and IP levels
+ * and is the superset of options that can be performed on a SCTP and IP
+ * stack.
+ */
+opdes_t sctp_opt_arr[] = {
+
+{ SO_LINGER, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
+ sizeof (struct linger), 0 },
+
+{ SO_DEBUG, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SO_KEEPALIVE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SO_DONTROUTE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
+ },
+{ SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
+{ SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
+ },
+{ SO_SND_COPYAVOID, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SO_ANON_MLP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
+ 0 },
+{ SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
+ 0 },
+{ SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int),
+ 0 },
+{ SO_EXCLBIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+
+{ SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
+
+{ SO_PROTOTYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
+
+{ SCTP_ADAPTATION_LAYER, IPPROTO_SCTP, OA_RW, OA_RW, OP_NP, 0,
+ sizeof (struct sctp_setadaptation), 0 },
+{ SCTP_ADD_ADDR, IPPROTO_SCTP, OA_RW, OA_RW, OP_NP, OP_VARLEN,
+ sizeof (int), 0 },
+{ SCTP_ASSOCINFO, IPPROTO_SCTP, OA_RW, OA_RW, OP_NP, 0,
+ sizeof (struct sctp_assocparams), 0 },
+{ SCTP_AUTOCLOSE, IPPROTO_SCTP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SCTP_DEFAULT_SEND_PARAM, IPPROTO_SCTP, OA_RW, OA_RW, OP_NP, 0,
+ sizeof (struct sctp_sndrcvinfo), 0 },
+{ SCTP_DISABLE_FRAGMENTS, IPPROTO_SCTP, OA_RW, OA_RW, OP_NP, 0,
+ sizeof (int), 0 },
+{ SCTP_EVENTS, IPPROTO_SCTP, OA_RW, OA_RW, OP_NP, 0,
+ sizeof (struct sctp_event_subscribe), 0 },
+{ SCTP_GET_LADDRS, IPPROTO_SCTP, OA_R, OA_R, OP_NP, OP_VARLEN,
+ sizeof (int), 0 },
+{ SCTP_GET_NLADDRS, IPPROTO_SCTP, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
+{ SCTP_GET_NPADDRS, IPPROTO_SCTP, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
+{ SCTP_GET_PADDRS, IPPROTO_SCTP, OA_R, OA_R, OP_NP, OP_VARLEN,
+ sizeof (int), 0 },
+{ SCTP_GET_PEER_ADDR_INFO, IPPROTO_SCTP, OA_R, OA_R, OP_NP, 0,
+ sizeof (struct sctp_paddrinfo), 0 },
+{ SCTP_INITMSG, IPPROTO_SCTP, OA_RW, OA_RW, OP_NP, 0,
+ sizeof (struct sctp_initmsg), 0 },
+{ SCTP_I_WANT_MAPPED_V4_ADDR, IPPROTO_SCTP, OA_RW, OA_RW, OP_NP, 0,
+ sizeof (int), 0 },
+{ SCTP_MAXSEG, IPPROTO_SCTP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SCTP_NODELAY, IPPROTO_SCTP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SCTP_PEER_ADDR_PARAMS, IPPROTO_SCTP, OA_RW, OA_RW, OP_NP, 0,
+ sizeof (struct sctp_paddrparams), 0 },
+{ SCTP_PRIMARY_ADDR, IPPROTO_SCTP, OA_W, OA_W, OP_NP, 0,
+ sizeof (struct sctp_setpeerprim), 0 },
+{ SCTP_PRSCTP, IPPROTO_SCTP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SCTP_GET_ASSOC_STATS, IPPROTO_SCTP, OA_R, OA_R, OP_NP, 0,
+ sizeof (sctp_assoc_stats_t), 0 },
+{ SCTP_REM_ADDR, IPPROTO_SCTP, OA_RW, OA_RW, OP_NP, OP_VARLEN,
+ sizeof (int), 0 },
+{ SCTP_RTOINFO, IPPROTO_SCTP, OA_RW, OA_RW, OP_NP, 0,
+ sizeof (struct sctp_rtoinfo), 0 },
+{ SCTP_SET_PEER_PRIMARY_ADDR, IPPROTO_SCTP, OA_W, OA_W, OP_NP, 0,
+ sizeof (struct sctp_setprim), 0 },
+{ SCTP_STATUS, IPPROTO_SCTP, OA_R, OA_R, OP_NP, 0,
+ sizeof (struct sctp_status), 0 },
+{ SCTP_UC_SWAP, IPPROTO_SCTP, OA_W, OA_W, OP_NP, 0,
+ sizeof (struct sctp_uc_swap), 0 },
+
+{ IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP,
+ (OP_VARLEN|OP_NODEFAULT),
+ 40, -1 /* not initialized */ },
+{ T_IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP,
+ (OP_VARLEN|OP_NODEFAULT),
+ 40, -1 /* not initialized */ },
+
+{ IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ T_IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ IP_TTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
+ sizeof (int), -1 /* not initialized */ },
+
+{ IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
+ sizeof (ipsec_req_t), -1 /* not initialized */ },
+
+{ IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0,
+ sizeof (int), 0 /* no ifindex */ },
+
+{ IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0,
+ sizeof (int), 0 },
+
+{ IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
+ sizeof (int), -1 /* not initialized */ },
+
+{ IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
+ sizeof (int), 0 /* no ifindex */ },
+
+{ IP_DONTFRAG, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+
+{ IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0,
+ sizeof (in_addr_t), -1 /* not initialized */ },
+
+{ IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0,
+ sizeof (int), 0 },
+
+{ IPV6_PKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
+ (OP_NODEFAULT|OP_VARLEN),
+ sizeof (struct in6_pktinfo), -1 /* not initialized */ },
+{ IPV6_NEXTHOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
+ OP_NODEFAULT,
+ sizeof (sin6_t), -1 /* not initialized */ },
+{ IPV6_HOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
+ (OP_VARLEN|OP_NODEFAULT), 255*8,
+ -1 /* not initialized */ },
+{ IPV6_DSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
+ (OP_VARLEN|OP_NODEFAULT), 255*8,
+ -1 /* not initialized */ },
+{ IPV6_RTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
+ (OP_VARLEN|OP_NODEFAULT), 255*8,
+ -1 /* not initialized */ },
+{ IPV6_RTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
+ (OP_VARLEN|OP_NODEFAULT), 255*8,
+ -1 /* not initialized */ },
+{ IPV6_TCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
+ OP_NODEFAULT,
+ sizeof (int), -1 /* not initialized */ },
+{ IPV6_PATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
+ OP_NODEFAULT,
+ sizeof (struct ip6_mtuinfo), -1 /* not initialized */ },
+{ IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
+ sizeof (int), 0 },
+{ IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
+ sizeof (int), 0 },
+{ IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
+ sizeof (int), 0 },
+
+/* Enable receipt of ancillary data */
+{ IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
+ sizeof (int), 0 },
+{ IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
+ sizeof (int), 0 },
+{ IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
+ sizeof (int), 0 },
+{ IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
+ sizeof (int), 0 },
+{ _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
+ sizeof (int), 0 },
+{ IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
+ sizeof (int), 0 },
+{ IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
+ sizeof (int), 0 },
+{ IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
+ sizeof (int), 0 },
+{ IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
+ sizeof (int), 0 },
+
+{ IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
+ sizeof (ipsec_req_t), -1 /* not initialized */ },
+{ IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
+ sizeof (uint32_t), IPV6_PREFER_SRC_DEFAULT },
+};
+
+uint_t sctp_opt_arr_size = A_CNT(sctp_opt_arr);
+
/* Handy on off switch for socket option processing. */
#define ONOFF(x) ((x) == 0 ? 0 : 1)
@@ -682,8 +783,12 @@ sctp_get_opt(sctp_t *sctp, int level, int name, void *ptr, socklen_t *optlen)
int *i1 = (int *)ptr;
int retval = 0;
int buflen = *optlen;
- conn_t *connp = sctp->sctp_connp;
- ip6_pkt_t *ipp = &sctp->sctp_sticky_ipp;
+ conn_t *connp = sctp->sctp_connp;
+ conn_opt_arg_t coas;
+
+ coas.coa_connp = connp;
+ coas.coa_ixa = connp->conn_ixa;
+ coas.coa_ipp = &connp->conn_xmit_ipp;
/* In most cases, the return buffer is just an int */
*optlen = sizeof (int32_t);
@@ -695,83 +800,30 @@ sctp_get_opt(sctp_t *sctp, int level, int name, void *ptr, socklen_t *optlen)
return (EINVAL);
}
- switch (level) {
- case SOL_SOCKET:
- switch (name) {
- case SO_LINGER: {
- struct linger *lgr = (struct linger *)ptr;
-
- lgr->l_onoff = sctp->sctp_linger ? SO_LINGER : 0;
- lgr->l_linger = TICK_TO_MSEC(sctp->sctp_lingertime);
- *optlen = sizeof (struct linger);
- break;
- }
- case SO_DEBUG:
- *i1 = sctp->sctp_debug ? SO_DEBUG : 0;
- break;
- case SO_DONTROUTE:
- *i1 = connp->conn_dontroute ? SO_DONTROUTE : 0;
- break;
- case SO_USELOOPBACK:
- *i1 = connp->conn_loopback ? SO_USELOOPBACK : 0;
- break;
- case SO_BROADCAST:
- *i1 = connp->conn_broadcast ? SO_BROADCAST : 0;
- break;
- case SO_REUSEADDR:
- *i1 = connp->conn_reuseaddr ? SO_REUSEADDR : 0;
- break;
- case SO_DGRAM_ERRIND:
- *i1 = sctp->sctp_dgram_errind ? SO_DGRAM_ERRIND : 0;
- break;
- case SO_SNDBUF:
- *i1 = sctp->sctp_xmit_hiwater;
- break;
- case SO_RCVBUF:
- *i1 = sctp->sctp_rwnd;
- break;
- case SO_ALLZONES:
- *i1 = connp->conn_allzones;
- break;
- case SO_MAC_EXEMPT:
- *i1 = (connp->conn_mac_mode == CONN_MAC_AWARE);
- break;
- case SO_MAC_IMPLICIT:
- *i1 = (connp->conn_mac_mode == CONN_MAC_IMPLICIT);
- break;
- case SO_PROTOTYPE:
- *i1 = IPPROTO_SCTP;
- break;
- case SO_DOMAIN:
- *i1 = sctp->sctp_family;
- break;
- default:
- retval = ENOPROTOOPT;
- break;
+ /*
+ * Check that the level and name are supported by SCTP, and that
+ * the length and credentials are ok.
+ */
+ retval = proto_opt_check(level, name, buflen, NULL, sctp_opt_arr,
+ sctp_opt_arr_size, B_FALSE, B_TRUE, connp->conn_cred);
+ if (retval != 0) {
+ WAKE_SCTP(sctp);
+ if (retval < 0) {
+ retval = proto_tlitosyserr(-retval);
}
- break;
+ return (retval);
+ }
+ switch (level) {
case IPPROTO_SCTP:
switch (name) {
case SCTP_RTOINFO:
- if (buflen < sizeof (struct sctp_rtoinfo)) {
- retval = EINVAL;
- break;
- }
*optlen = sctp_get_rtoinfo(sctp, ptr);
break;
case SCTP_ASSOCINFO:
- if (buflen < sizeof (struct sctp_assocparams)) {
- retval = EINVAL;
- break;
- }
*optlen = sctp_get_assocparams(sctp, ptr);
break;
case SCTP_INITMSG:
- if (buflen < sizeof (struct sctp_initmsg)) {
- retval = EINVAL;
- break;
- }
*optlen = sctp_get_initmsg(sctp, ptr);
break;
case SCTP_NODELAY:
@@ -781,34 +833,18 @@ sctp_get_opt(sctp_t *sctp, int level, int name, void *ptr, socklen_t *optlen)
*i1 = TICK_TO_SEC(sctp->sctp_autoclose);
break;
case SCTP_ADAPTATION_LAYER:
- if (buflen < sizeof (struct sctp_setadaptation)) {
- retval = EINVAL;
- break;
- }
((struct sctp_setadaptation *)ptr)->ssb_adaptation_ind =
sctp->sctp_tx_adaptation_code;
break;
case SCTP_PEER_ADDR_PARAMS:
- if (buflen < sizeof (struct sctp_paddrparams)) {
- retval = EINVAL;
- break;
- }
*optlen = sctp_get_peer_addr_params(sctp, ptr);
break;
case SCTP_DEFAULT_SEND_PARAM:
- if (buflen < sizeof (struct sctp_sndrcvinfo)) {
- retval = EINVAL;
- break;
- }
*optlen = sctp_get_def_send_params(sctp, ptr);
break;
case SCTP_EVENTS: {
struct sctp_event_subscribe *ev;
- if (buflen < sizeof (struct sctp_event_subscribe)) {
- retval = EINVAL;
- break;
- }
ev = (struct sctp_event_subscribe *)ptr;
ev->sctp_data_io_event =
ONOFF(sctp->sctp_recvsndrcvinfo);
@@ -830,17 +866,9 @@ sctp_get_opt(sctp_t *sctp, int level, int name, void *ptr, socklen_t *optlen)
break;
}
case SCTP_STATUS:
- if (buflen < sizeof (struct sctp_status)) {
- retval = EINVAL;
- break;
- }
*optlen = sctp_get_status(sctp, ptr);
break;
case SCTP_GET_PEER_ADDR_INFO:
- if (buflen < sizeof (struct sctp_paddrinfo)) {
- retval = EINVAL;
- break;
- }
retval = sctp_get_paddrinfo(sctp, ptr, optlen);
break;
case SCTP_GET_NLADDRS:
@@ -850,7 +878,7 @@ sctp_get_opt(sctp_t *sctp, int level, int name, void *ptr, socklen_t *optlen)
int addr_cnt;
int addr_size;
- if (sctp->sctp_family == AF_INET)
+ if (connp->conn_family == AF_INET)
addr_size = sizeof (struct sockaddr_in);
else
addr_size = sizeof (struct sockaddr_in6);
@@ -874,7 +902,7 @@ sctp_get_opt(sctp_t *sctp, int level, int name, void *ptr, socklen_t *optlen)
int addr_cnt;
int addr_size;
- if (sctp->sctp_family == AF_INET)
+ if (connp->conn_family == AF_INET)
addr_size = sizeof (struct sockaddr_in);
else
addr_size = sizeof (struct sockaddr_in6);
@@ -891,11 +919,6 @@ sctp_get_opt(sctp_t *sctp, int level, int name, void *ptr, socklen_t *optlen)
case SCTP_GET_ASSOC_STATS: {
sctp_assoc_stats_t *sas;
- if (buflen < sizeof (sctp_assoc_stats_t)) {
- retval = EINVAL;
- break;
- }
-
sas = (sctp_assoc_stats_t *)ptr;
/*
@@ -947,15 +970,15 @@ sctp_get_opt(sctp_t *sctp, int level, int name, void *ptr, socklen_t *optlen)
case SCTP_I_WANT_MAPPED_V4_ADDR:
case SCTP_MAXSEG:
case SCTP_DISABLE_FRAGMENTS:
- /* Not yet supported. */
default:
+ /* Not yet supported. */
retval = ENOPROTOOPT;
break;
}
- break;
-
+ WAKE_SCTP(sctp);
+ return (retval);
case IPPROTO_IP:
- if (sctp->sctp_family != AF_INET) {
+ if (connp->conn_family != AF_INET) {
retval = EINVAL;
break;
}
@@ -972,231 +995,52 @@ sctp_get_opt(sctp_t *sctp, int level, int name, void *ptr, socklen_t *optlen)
* ip_opt_get_user() adds the final destination
* at the start.
*/
- char *opt_ptr;
int opt_len;
uchar_t obuf[SCTP_MAX_IP_OPTIONS_LENGTH + IP_ADDR_LEN];
- opt_ptr = (char *)sctp->sctp_ipha +
- IP_SIMPLE_HDR_LENGTH;
- opt_len = (char *)sctp->sctp_sctph - opt_ptr;
- /* Caller ensures enough space */
- if (opt_len > 0) {
- /*
- * TODO: Do we have to handle getsockopt on an
- * initiator as well?
- */
- opt_len = ip_opt_get_user(sctp->sctp_ipha,
- obuf);
- ASSERT(opt_len <= sizeof (obuf));
- } else {
- opt_len = 0;
- }
+ opt_len = ip_opt_get_user(connp, obuf);
+ ASSERT(opt_len <= sizeof (obuf));
+
if (buflen < opt_len) {
/* Silently truncate */
opt_len = buflen;
}
*optlen = opt_len;
bcopy(obuf, ptr, opt_len);
- break;
- }
- case IP_TOS:
- case T_IP_TOS:
- *i1 = (int)sctp->sctp_ipha->ipha_type_of_service;
- break;
- case IP_TTL:
- *i1 = (int)sctp->sctp_ipha->ipha_ttl;
- break;
- case IP_NEXTHOP:
- if (connp->conn_nexthop_set) {
- *(ipaddr_t *)ptr = connp->conn_nexthop_v4;
- *optlen = sizeof (ipaddr_t);
- } else {
- *optlen = 0;
- }
- break;
- default:
- retval = ENOPROTOOPT;
- break;
- }
- break;
- case IPPROTO_IPV6:
- if (sctp->sctp_family != AF_INET6) {
- retval = EINVAL;
- break;
- }
- switch (name) {
- case IPV6_UNICAST_HOPS:
- *i1 = (unsigned int) sctp->sctp_ip6h->ip6_hops;
- break; /* goto sizeof (int) option return */
- case IPV6_RECVPKTINFO:
- if (sctp->sctp_ipv6_recvancillary &
- SCTP_IPV6_RECVPKTINFO) {
- *i1 = 1;
- } else {
- *i1 = 0;
- }
- break; /* goto sizeof (int) option return */
- case IPV6_RECVHOPLIMIT:
- if (sctp->sctp_ipv6_recvancillary &
- SCTP_IPV6_RECVHOPLIMIT) {
- *i1 = 1;
- } else {
- *i1 = 0;
- }
- break; /* goto sizeof (int) option return */
- case IPV6_RECVHOPOPTS:
- if (sctp->sctp_ipv6_recvancillary &
- SCTP_IPV6_RECVHOPOPTS) {
- *i1 = 1;
- } else {
- *i1 = 0;
- }
- break; /* goto sizeof (int) option return */
- case IPV6_RECVDSTOPTS:
- if (sctp->sctp_ipv6_recvancillary &
- SCTP_IPV6_RECVDSTOPTS) {
- *i1 = 1;
- } else {
- *i1 = 0;
- }
- break; /* goto sizeof (int) option return */
- case IPV6_RECVRTHDR:
- if (sctp->sctp_ipv6_recvancillary &
- SCTP_IPV6_RECVRTHDR) {
- *i1 = 1;
- } else {
- *i1 = 0;
- }
- break; /* goto sizeof (int) option return */
- case IPV6_RECVRTHDRDSTOPTS:
- if (sctp->sctp_ipv6_recvancillary &
- SCTP_IPV6_RECVRTDSTOPTS) {
- *i1 = 1;
- } else {
- *i1 = 0;
- }
- break; /* goto sizeof (int) option return */
- case IPV6_PKTINFO: {
- struct in6_pktinfo *pkti;
-
- if (buflen < sizeof (struct in6_pktinfo)) {
- retval = EINVAL;
- break;
- }
- pkti = (struct in6_pktinfo *)ptr;
- if (ipp->ipp_fields & IPPF_IFINDEX)
- pkti->ipi6_ifindex = ipp->ipp_ifindex;
- else
- pkti->ipi6_ifindex = 0;
- if (ipp->ipp_fields & IPPF_ADDR)
- pkti->ipi6_addr = ipp->ipp_addr;
- else
- pkti->ipi6_addr = ipv6_all_zeros;
- *optlen = sizeof (struct in6_pktinfo);
- break;
- }
- case IPV6_NEXTHOP: {
- sin6_t *sin6;
-
- if (buflen < sizeof (sin6_t)) {
- retval = EINVAL;
- break;
- }
- sin6 = (sin6_t *)ptr;
- if (!(ipp->ipp_fields & IPPF_NEXTHOP))
- break;
- *sin6 = sctp_sin6_null;
- sin6->sin6_family = AF_INET6;
- sin6->sin6_addr = ipp->ipp_nexthop;
- *optlen = sizeof (sin6_t);
- break;
+ WAKE_SCTP(sctp);
+ return (0);
}
- case IPV6_HOPOPTS: {
- int len;
-
- if (!(ipp->ipp_fields & IPPF_HOPOPTS))
- break;
- len = ipp->ipp_hopoptslen - sctp->sctp_v6label_len;
- if (len <= 0)
- break;
- if (buflen < len) {
- retval = EINVAL;
- break;
- }
- bcopy((char *)ipp->ipp_hopopts +
- sctp->sctp_v6label_len, ptr, len);
- if (sctp->sctp_v6label_len > 0) {
- char *cptr = ptr;
-
- /*
- * If the label length is greater than zero,
- * then we need to hide the label from user.
- * Make it look as though a normal Hop-By-Hop
- * Options Header is present here.
- */
- cptr[0] = ((char *)ipp->ipp_hopopts)[0];
- cptr[1] = (len + 7) / 8 - 1;
- }
- *optlen = len;
- break;
- }
- case IPV6_RTHDRDSTOPTS:
- if (!(ipp->ipp_fields & IPPF_RTDSTOPTS))
- break;
- if (buflen < ipp->ipp_rtdstoptslen) {
- retval = EINVAL;
- break;
- }
- bcopy(ipp->ipp_rtdstopts, ptr, ipp->ipp_rtdstoptslen);
- *optlen = ipp->ipp_rtdstoptslen;
- break;
- case IPV6_RTHDR:
- if (!(ipp->ipp_fields & IPPF_RTHDR))
- break;
- if (buflen < ipp->ipp_rthdrlen) {
- retval = EINVAL;
- break;
- }
- bcopy(ipp->ipp_rthdr, ptr, ipp->ipp_rthdrlen);
- *optlen = ipp->ipp_rthdrlen;
- break;
- case IPV6_DSTOPTS:
- if (!(ipp->ipp_fields & IPPF_DSTOPTS))
- break;
- if (buflen < ipp->ipp_dstoptslen) {
- retval = EINVAL;
- break;
- }
- bcopy(ipp->ipp_dstopts, ptr, ipp->ipp_dstoptslen);
- *optlen = ipp->ipp_dstoptslen;
- break;
- case IPV6_V6ONLY:
- *i1 = sctp->sctp_connp->conn_ipv6_v6only;
- break;
default:
- retval = ENOPROTOOPT;
break;
}
break;
-
- default:
- retval = ENOPROTOOPT;
- break;
}
+ mutex_enter(&connp->conn_lock);
+ retval = conn_opt_get(&coas, level, name, ptr);
+ mutex_exit(&connp->conn_lock);
WAKE_SCTP(sctp);
- return (retval);
+ if (retval == -1)
+ return (EINVAL);
+ *optlen = retval;
+ return (0);
}
int
sctp_set_opt(sctp_t *sctp, int level, int name, const void *invalp,
socklen_t inlen)
{
- ip6_pkt_t *ipp = &sctp->sctp_sticky_ipp;
int *i1 = (int *)invalp;
boolean_t onoff;
int retval = 0, addrcnt;
conn_t *connp = sctp->sctp_connp;
sctp_stack_t *sctps = sctp->sctp_sctps;
+ conn_opt_arg_t coas;
+
+ coas.coa_connp = connp;
+ coas.coa_ixa = connp->conn_ixa;
+ coas.coa_ipp = &connp->conn_xmit_ipp;
+ coas.coa_ancillary = B_FALSE;
+ coas.coa_changed = 0;
/* In all cases, the size of the option must be bigger than int */
if (inlen >= sizeof (int32_t)) {
@@ -1211,74 +1055,42 @@ sctp_set_opt(sctp_t *sctp, int level, int name, const void *invalp,
return (EINVAL);
}
+ /*
+ * Check that the level and name are supported by SCTP, and that
+ * the length an credentials are ok.
+ */
+ retval = proto_opt_check(level, name, inlen, NULL, sctp_opt_arr,
+ sctp_opt_arr_size, B_TRUE, B_FALSE, connp->conn_cred);
+ if (retval != 0) {
+ if (retval < 0) {
+ retval = proto_tlitosyserr(-retval);
+ }
+ goto done;
+ }
+
+ /* Note: both SCTP and TCP interpret l_linger as being in seconds */
switch (level) {
case SOL_SOCKET:
- if (inlen < sizeof (int32_t)) {
- retval = EINVAL;
- break;
- }
switch (name) {
- case SO_LINGER: {
- struct linger *lgr;
-
- if (inlen != sizeof (struct linger)) {
- retval = EINVAL;
- break;
- }
- lgr = (struct linger *)invalp;
- if (lgr->l_onoff != 0) {
- sctp->sctp_linger = 1;
- sctp->sctp_lingertime = MSEC_TO_TICK(
- lgr->l_linger);
- } else {
- sctp->sctp_linger = 0;
- sctp->sctp_lingertime = 0;
- }
- break;
- }
- case SO_DEBUG:
- sctp->sctp_debug = onoff;
- break;
- case SO_KEEPALIVE:
- break;
- case SO_DONTROUTE:
- /*
- * SO_DONTROUTE, SO_USELOOPBACK and SO_BROADCAST are
- * only of interest to IP.
- */
- connp->conn_dontroute = onoff;
- break;
- case SO_USELOOPBACK:
- connp->conn_loopback = onoff;
- break;
- case SO_BROADCAST:
- connp->conn_broadcast = onoff;
- break;
- case SO_REUSEADDR:
- connp->conn_reuseaddr = onoff;
- break;
- case SO_DGRAM_ERRIND:
- sctp->sctp_dgram_errind = onoff;
- break;
case SO_SNDBUF:
if (*i1 > sctps->sctps_max_buf) {
retval = ENOBUFS;
- break;
+ goto done;
}
if (*i1 < 0) {
retval = EINVAL;
- break;
+ goto done;
}
- sctp->sctp_xmit_hiwater = *i1;
- if (sctps->sctps_snd_lowat_fraction != 0)
- sctp->sctp_xmit_lowater =
- sctp->sctp_xmit_hiwater /
+ connp->conn_sndbuf = *i1;
+ if (sctps->sctps_snd_lowat_fraction != 0) {
+ connp->conn_sndlowat = connp->conn_sndbuf /
sctps->sctps_snd_lowat_fraction;
- break;
+ }
+ goto done;
case SO_RCVBUF:
if (*i1 > sctps->sctps_max_buf) {
retval = ENOBUFS;
- break;
+ goto done;
}
/* Silently ignore zero */
if (*i1 != 0) {
@@ -1294,12 +1106,16 @@ sctp_set_opt(sctp_t *sctp, int level, int name, const void *invalp,
*i1 = MAX(*i1,
sctps->sctps_recv_hiwat_minmss *
sctp->sctp_mss);
- sctp->sctp_rwnd = *i1;
+ /*
+ * Note that sctp_rwnd is modified by the
+ * protocol and here we just whack it.
+ */
+ connp->conn_rcvbuf = sctp->sctp_rwnd = *i1;
sctp->sctp_irwnd = sctp->sctp_rwnd;
sctp->sctp_pd_point = sctp->sctp_rwnd;
sopp.sopp_flags = SOCKOPT_RCVHIWAT;
- sopp.sopp_rxhiwat = *i1;
+ sopp.sopp_rxhiwat = connp->conn_rcvbuf;
sctp->sctp_ulp_prop(sctp->sctp_ulpd, &sopp);
}
@@ -1307,60 +1123,29 @@ sctp_set_opt(sctp_t *sctp, int level, int name, const void *invalp,
* XXX should we return the rwnd here
* and sctp_opt_get ?
*/
- break;
+ goto done;
case SO_ALLZONES:
- if (secpolicy_ip(sctp->sctp_credp, OP_CONFIG,
- B_TRUE)) {
- retval = EACCES;
- break;
- }
if (sctp->sctp_state >= SCTPS_BOUND) {
retval = EINVAL;
- break;
+ goto done;
}
- sctp->sctp_allzones = onoff;
break;
case SO_MAC_EXEMPT:
- if (secpolicy_net_mac_aware(sctp->sctp_credp) != 0) {
- retval = EACCES;
- break;
- }
- if (sctp->sctp_state >= SCTPS_BOUND) {
- retval = EINVAL;
- break;
- }
- connp->conn_mac_mode = onoff ?
- CONN_MAC_AWARE : CONN_MAC_DEFAULT;
- break;
- case SO_MAC_IMPLICIT:
- if (secpolicy_net_mac_implicit(sctp->sctp_credp) != 0) {
- retval = EACCES;
- break;
- }
if (sctp->sctp_state >= SCTPS_BOUND) {
retval = EINVAL;
- break;
+ goto done;
}
- connp->conn_mac_mode = onoff ?
- CONN_MAC_AWARE : CONN_MAC_IMPLICIT;
- break;
- default:
- retval = ENOPROTOOPT;
break;
}
break;
case IPPROTO_SCTP:
- if (inlen < sizeof (int32_t)) {
- retval = EINVAL;
- break;
- }
switch (name) {
case SCTP_RTOINFO:
- retval = sctp_set_rtoinfo(sctp, invalp, inlen);
+ retval = sctp_set_rtoinfo(sctp, invalp);
break;
case SCTP_ASSOCINFO:
- retval = sctp_set_assocparams(sctp, invalp, inlen);
+ retval = sctp_set_assocparams(sctp, invalp);
break;
case SCTP_INITMSG:
retval = sctp_set_initmsg(sctp, invalp, inlen);
@@ -1378,37 +1163,28 @@ sctp_set_opt(sctp_t *sctp, int level, int name, const void *invalp,
sctp_heartbeat_timer(sctp);
break;
case SCTP_SET_PEER_PRIMARY_ADDR:
- retval = sctp_set_peerprim(sctp, invalp, inlen);
+ retval = sctp_set_peerprim(sctp, invalp);
break;
case SCTP_PRIMARY_ADDR:
- retval = sctp_set_prim(sctp, invalp, inlen);
+ retval = sctp_set_prim(sctp, invalp);
break;
case SCTP_ADAPTATION_LAYER: {
struct sctp_setadaptation *ssb;
- if (inlen < sizeof (struct sctp_setadaptation)) {
- retval = EINVAL;
- break;
- }
ssb = (struct sctp_setadaptation *)invalp;
sctp->sctp_send_adaptation = 1;
sctp->sctp_tx_adaptation_code = ssb->ssb_adaptation_ind;
break;
}
case SCTP_PEER_ADDR_PARAMS:
- retval = sctp_set_peer_addr_params(sctp, invalp,
- inlen);
+ retval = sctp_set_peer_addr_params(sctp, invalp);
break;
case SCTP_DEFAULT_SEND_PARAM:
- retval = sctp_set_def_send_params(sctp, invalp, inlen);
+ retval = sctp_set_def_send_params(sctp, invalp);
break;
case SCTP_EVENTS: {
struct sctp_event_subscribe *ev;
- if (inlen < sizeof (struct sctp_event_subscribe)) {
- retval = EINVAL;
- break;
- }
ev = (struct sctp_event_subscribe *)invalp;
sctp->sctp_recvsndrcvinfo =
ONOFF(ev->sctp_data_io_event);
@@ -1438,15 +1214,15 @@ sctp_set_opt(sctp_t *sctp, int level, int name, const void *invalp,
retval = EINVAL;
break;
}
- if (sctp->sctp_family == AF_INET) {
+ if (connp->conn_family == AF_INET) {
addrcnt = inlen / sizeof (struct sockaddr_in);
} else {
- ASSERT(sctp->sctp_family == AF_INET6);
+ ASSERT(connp->conn_family == AF_INET6);
addrcnt = inlen / sizeof (struct sockaddr_in6);
}
if (name == SCTP_ADD_ADDR) {
retval = sctp_bind_add(sctp, invalp, addrcnt,
- B_TRUE, sctp->sctp_lport);
+ B_TRUE, connp->conn_lport);
} else {
retval = sctp_bind_del(sctp, invalp, addrcnt,
B_TRUE);
@@ -1458,10 +1234,6 @@ sctp_set_opt(sctp_t *sctp, int level, int name, const void *invalp,
/*
* Change handle & upcalls.
*/
- if (inlen < sizeof (*us)) {
- retval = EINVAL;
- break;
- }
us = (struct sctp_uc_swap *)invalp;
sctp->sctp_ulpd = us->sus_handle;
sctp->sctp_upcalls = us->sus_upcalls;
@@ -1474,33 +1246,17 @@ sctp_set_opt(sctp_t *sctp, int level, int name, const void *invalp,
case SCTP_MAXSEG:
case SCTP_DISABLE_FRAGMENTS:
/* Not yet supported. */
- default:
retval = ENOPROTOOPT;
break;
}
- break;
+ goto done;
case IPPROTO_IP:
- if (sctp->sctp_family != AF_INET) {
+ if (connp->conn_family != AF_INET) {
retval = ENOPROTOOPT;
- break;
- }
- if ((name != IP_OPTIONS) && (inlen < sizeof (int32_t))) {
- retval = EINVAL;
- break;
+ goto done;
}
switch (name) {
- case IP_OPTIONS:
- case T_IP_OPTIONS:
- retval = sctp_opt_set_header(sctp, invalp, inlen);
- break;
- case IP_TOS:
- case T_IP_TOS:
- sctp->sctp_ipha->ipha_type_of_service = (uchar_t)*i1;
- break;
- case IP_TTL:
- sctp->sctp_ipha->ipha_ttl = (uchar_t)*i1;
- break;
case IP_SEC_OPT:
/*
* We should not allow policy setting after
@@ -1508,319 +1264,30 @@ sctp_set_opt(sctp_t *sctp, int level, int name, const void *invalp,
*/
if (sctp->sctp_state >= SCTPS_LISTEN) {
retval = EINVAL;
- } else {
- retval = ipsec_set_req(sctp->sctp_credp,
- sctp->sctp_connp, (ipsec_req_t *)invalp);
- }
- break;
- /* IP level options */
- case IP_UNSPEC_SRC:
- connp->conn_unspec_src = onoff;
- break;
- case IP_NEXTHOP: {
- ipaddr_t addr = *i1;
- ipif_t *ipif = NULL;
- ill_t *ill;
- ip_stack_t *ipst = sctps->sctps_netstack->netstack_ip;
-
- if (secpolicy_ip(sctp->sctp_credp, OP_CONFIG,
- B_TRUE) == 0) {
- ipif = ipif_lookup_onlink_addr(addr,
- connp->conn_zoneid, ipst);
- if (ipif == NULL) {
- retval = EHOSTUNREACH;
- break;
- }
- ill = ipif->ipif_ill;
- mutex_enter(&ill->ill_lock);
- if ((ill->ill_state_flags & ILL_CONDEMNED) ||
- (ipif->ipif_state_flags & IPIF_CONDEMNED)) {
- mutex_exit(&ill->ill_lock);
- ipif_refrele(ipif);
- retval = EHOSTUNREACH;
- break;
- }
- mutex_exit(&ill->ill_lock);
- ipif_refrele(ipif);
- mutex_enter(&connp->conn_lock);
- connp->conn_nexthop_v4 = addr;
- connp->conn_nexthop_set = B_TRUE;
- mutex_exit(&connp->conn_lock);
+ goto done;
}
break;
}
- default:
- retval = ENOPROTOOPT;
- break;
- }
break;
- case IPPROTO_IPV6: {
- if (sctp->sctp_family != AF_INET6) {
- retval = ENOPROTOOPT;
- break;
+ case IPPROTO_IPV6:
+ if (connp->conn_family != AF_INET6) {
+ retval = EINVAL;
+ goto done;
}
switch (name) {
- case IPV6_UNICAST_HOPS:
- if (inlen < sizeof (int32_t)) {
- retval = EINVAL;
- break;
- }
- if (*i1 < -1 || *i1 > IPV6_MAX_HOPS) {
- retval = EINVAL;
- break;
- }
- if (*i1 == -1) {
- ipp->ipp_unicast_hops =
- sctps->sctps_ipv6_hoplimit;
- ipp->ipp_fields &= ~IPPF_UNICAST_HOPS;
- } else {
- ipp->ipp_unicast_hops = (uint8_t)*i1;
- ipp->ipp_fields |= IPPF_UNICAST_HOPS;
- }
- retval = sctp_build_hdrs(sctp);
- break;
- case IPV6_UNSPEC_SRC:
- if (inlen < sizeof (int32_t)) {
- retval = EINVAL;
- break;
- }
- connp->conn_unspec_src = onoff;
- break;
case IPV6_RECVPKTINFO:
- if (inlen < sizeof (int32_t)) {
- retval = EINVAL;
- break;
- }
- if (onoff)
- sctp->sctp_ipv6_recvancillary |=
- SCTP_IPV6_RECVPKTINFO;
- else
- sctp->sctp_ipv6_recvancillary &=
- ~SCTP_IPV6_RECVPKTINFO;
/* Send it with the next msg */
sctp->sctp_recvifindex = 0;
- connp->conn_ip_recvpktinfo = onoff;
+ break;
+ case IPV6_RECVTCLASS:
+ /* Force it to be sent up with the next msg */
+ sctp->sctp_recvtclass = 0xffffffffU;
break;
case IPV6_RECVHOPLIMIT:
- if (inlen < sizeof (int32_t)) {
- retval = EINVAL;
- break;
- }
- if (onoff)
- sctp->sctp_ipv6_recvancillary |=
- SCTP_IPV6_RECVHOPLIMIT;
- else
- sctp->sctp_ipv6_recvancillary &=
- ~SCTP_IPV6_RECVHOPLIMIT;
+ /* Force it to be sent up with the next msg */
sctp->sctp_recvhops = 0xffffffffU;
- connp->conn_ipv6_recvhoplimit = onoff;
- break;
- case IPV6_RECVHOPOPTS:
- if (inlen < sizeof (int32_t)) {
- retval = EINVAL;
- break;
- }
- if (onoff)
- sctp->sctp_ipv6_recvancillary |=
- SCTP_IPV6_RECVHOPOPTS;
- else
- sctp->sctp_ipv6_recvancillary &=
- ~SCTP_IPV6_RECVHOPOPTS;
- connp->conn_ipv6_recvhopopts = onoff;
- break;
- case IPV6_RECVDSTOPTS:
- if (inlen < sizeof (int32_t)) {
- retval = EINVAL;
- break;
- }
- if (onoff)
- sctp->sctp_ipv6_recvancillary |=
- SCTP_IPV6_RECVDSTOPTS;
- else
- sctp->sctp_ipv6_recvancillary &=
- ~SCTP_IPV6_RECVDSTOPTS;
- connp->conn_ipv6_recvdstopts = onoff;
- break;
- case IPV6_RECVRTHDR:
- if (inlen < sizeof (int32_t)) {
- retval = EINVAL;
- break;
- }
- if (onoff)
- sctp->sctp_ipv6_recvancillary |=
- SCTP_IPV6_RECVRTHDR;
- else
- sctp->sctp_ipv6_recvancillary &=
- ~SCTP_IPV6_RECVRTHDR;
- connp->conn_ipv6_recvrthdr = onoff;
- break;
- case IPV6_RECVRTHDRDSTOPTS:
- if (inlen < sizeof (int32_t)) {
- retval = EINVAL;
- break;
- }
- if (onoff)
- sctp->sctp_ipv6_recvancillary |=
- SCTP_IPV6_RECVRTDSTOPTS;
- else
- sctp->sctp_ipv6_recvancillary &=
- ~SCTP_IPV6_RECVRTDSTOPTS;
- connp->conn_ipv6_recvrtdstopts = onoff;
- break;
- case IPV6_PKTINFO:
- if (inlen != 0 &&
- inlen != sizeof (struct in6_pktinfo)) {
- retval = EINVAL;
- break;
- }
-
- if (inlen == 0) {
- ipp->ipp_fields &= ~(IPPF_IFINDEX |IPPF_ADDR);
- } else {
- struct in6_pktinfo *pkti;
-
- pkti = (struct in6_pktinfo *)invalp;
- /* XXX Need to check if the index exists */
- ipp->ipp_ifindex = pkti->ipi6_ifindex;
- ipp->ipp_addr = pkti->ipi6_addr;
- if (ipp->ipp_ifindex != 0)
- ipp->ipp_fields |= IPPF_IFINDEX;
- else
- ipp->ipp_fields &= ~IPPF_IFINDEX;
- if (!IN6_IS_ADDR_UNSPECIFIED(&ipp->ipp_addr))
- ipp->ipp_fields |= IPPF_ADDR;
- else
- ipp->ipp_fields &= ~IPPF_ADDR;
- }
- retval = sctp_build_hdrs(sctp);
- break;
- case IPV6_NEXTHOP: {
- struct sockaddr_in6 *sin6;
- ip_stack_t *ipst = sctps->sctps_netstack->netstack_ip;
-
- if (inlen != 0 && inlen != sizeof (sin6_t)) {
- retval = EINVAL;
- break;
- }
-
- if (inlen == 0) {
- ipp->ipp_fields &= ~IPPF_NEXTHOP;
- } else {
- sin6 = (struct sockaddr_in6 *)invalp;
- if (sin6->sin6_family != AF_INET6) {
- retval = EAFNOSUPPORT;
- break;
- }
- if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
- retval = EADDRNOTAVAIL;
- break;
- }
- ipp->ipp_nexthop = sin6->sin6_addr;
- if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
- ipp->ipp_fields &= ~IPPF_NEXTHOP;
- } else {
- ire_t *ire;
-
- ire = ire_route_lookup_v6(
- &sin6->sin6_addr, NULL, NULL, 0,
- NULL, NULL, ALL_ZONES, NULL,
- MATCH_IRE_DEFAULT, ipst);
- if (ire == NULL) {
- retval = EHOSTUNREACH;
- break;
- }
- ire_refrele(ire);
- ipp->ipp_fields |= IPPF_NEXTHOP;
- }
- }
- retval = sctp_build_hdrs(sctp);
- break;
- }
- case IPV6_HOPOPTS: {
- ip6_hbh_t *hopts = (ip6_hbh_t *)invalp;
-
- if (inlen != 0 &&
- inlen != (8 * (hopts->ip6h_len + 1))) {
- retval = EINVAL;
- break;
- }
-
- retval = optcom_pkt_set((uchar_t *)invalp, inlen,
- B_TRUE, (uchar_t **)&ipp->ipp_hopopts,
- &ipp->ipp_hopoptslen, sctp->sctp_v6label_len);
- if (retval != 0)
- break;
- if (ipp->ipp_hopoptslen == 0)
- ipp->ipp_fields &= ~IPPF_HOPOPTS;
- else
- ipp->ipp_fields |= IPPF_HOPOPTS;
- retval = sctp_build_hdrs(sctp);
- break;
- }
- case IPV6_RTHDRDSTOPTS: {
- ip6_dest_t *dopts = (ip6_dest_t *)invalp;
-
- if (inlen != 0 &&
- inlen != (8 * (dopts->ip6d_len + 1))) {
- retval = EINVAL;
- break;
- }
-
- retval = optcom_pkt_set((uchar_t *)invalp, inlen,
- B_TRUE, (uchar_t **)&ipp->ipp_rtdstopts,
- &ipp->ipp_rtdstoptslen, 0);
- if (retval != 0)
- break;
- if (ipp->ipp_rtdstoptslen == 0)
- ipp->ipp_fields &= ~IPPF_RTDSTOPTS;
- else
- ipp->ipp_fields |= IPPF_RTDSTOPTS;
- retval = sctp_build_hdrs(sctp);
- break;
- }
- case IPV6_DSTOPTS: {
- ip6_dest_t *dopts = (ip6_dest_t *)invalp;
-
- if (inlen != 0 &&
- inlen != (8 * (dopts->ip6d_len + 1))) {
- retval = EINVAL;
- break;
- }
-
- retval = optcom_pkt_set((uchar_t *)invalp, inlen,
- B_TRUE, (uchar_t **)&ipp->ipp_dstopts,
- &ipp->ipp_dstoptslen, 0);
- if (retval != 0)
- break;
- if (ipp->ipp_dstoptslen == 0)
- ipp->ipp_fields &= ~IPPF_DSTOPTS;
- else
- ipp->ipp_fields |= IPPF_DSTOPTS;
- retval = sctp_build_hdrs(sctp);
break;
- }
- case IPV6_RTHDR: {
- ip6_rthdr_t *rt = (ip6_rthdr_t *)invalp;
-
- if (inlen != 0 &&
- inlen != (8 * (rt->ip6r_len + 1))) {
- retval = EINVAL;
- break;
- }
-
- retval = optcom_pkt_set((uchar_t *)invalp, inlen,
- B_TRUE, (uchar_t **)&ipp->ipp_rthdr,
- &ipp->ipp_rthdrlen, 0);
- if (retval != 0)
- break;
- if (ipp->ipp_rthdrlen == 0)
- ipp->ipp_fields &= ~IPPF_RTHDR;
- else
- ipp->ipp_fields |= IPPF_RTHDR;
- retval = sctp_build_hdrs(sctp);
- break;
- }
case IPV6_SEC_OPT:
/*
* We should not allow policy setting after
@@ -1828,9 +1295,7 @@ sctp_set_opt(sctp_t *sctp, int level, int name, const void *invalp,
*/
if (sctp->sctp_state >= SCTPS_LISTEN) {
retval = EINVAL;
- } else {
- retval = ipsec_set_req(sctp->sctp_credp,
- sctp->sctp_connp, (ipsec_req_t *)invalp);
+ goto done;
}
break;
case IPV6_V6ONLY:
@@ -1840,21 +1305,44 @@ sctp_set_opt(sctp_t *sctp, int level, int name, const void *invalp,
*/
if (sctp->sctp_state >= SCTPS_BOUND) {
retval = EINVAL;
- } else {
- sctp->sctp_connp->conn_ipv6_v6only = onoff;
+ goto done;
}
break;
- default:
- retval = ENOPROTOOPT;
- break;
}
break;
}
- default:
- retval = ENOPROTOOPT;
- break;
- }
+ retval = conn_opt_set(&coas, level, name, inlen, (uchar_t *)invalp,
+ B_FALSE, connp->conn_cred);
+ if (retval != 0)
+ goto done;
+
+ if (coas.coa_changed & COA_ROUTE_CHANGED) {
+ sctp_faddr_t *fp;
+ /*
+ * We recache the information which might pick a different
+ * source and redo IPsec as a result.
+ */
+ for (fp = sctp->sctp_faddrs; fp != NULL; fp = fp->next)
+ sctp_get_dest(sctp, fp);
+ }
+ if (coas.coa_changed & COA_HEADER_CHANGED) {
+ retval = sctp_build_hdrs(sctp, KM_NOSLEEP);
+ if (retval != 0)
+ goto done;
+ }
+ if (coas.coa_changed & COA_WROFF_CHANGED) {
+ connp->conn_wroff = connp->conn_ht_iphc_allocated +
+ sctps->sctps_wroff_xtra;
+ if (sctp->sctp_current != NULL) {
+ /*
+ * Could be setting options before setting up
+ * connection.
+ */
+ sctp_set_ulp_prop(sctp);
+ }
+ }
+done:
WAKE_SCTP(sctp);
return (retval);
}
@@ -1871,18 +1359,19 @@ sctp_getsockname(sctp_t *sctp, struct sockaddr *addr, socklen_t *addrlen)
int addrcnt = 1;
sin_t *sin4;
sin6_t *sin6;
+ conn_t *connp = sctp->sctp_connp;
ASSERT(sctp != NULL);
RUN_SCTP(sctp);
- addr->sa_family = sctp->sctp_family;
- switch (sctp->sctp_family) {
+ addr->sa_family = connp->conn_family;
+ switch (connp->conn_family) {
case AF_INET:
sin4 = (sin_t *)addr;
if ((sctp->sctp_state <= SCTPS_LISTEN) &&
sctp->sctp_bound_to_all) {
sin4->sin_addr.s_addr = INADDR_ANY;
- sin4->sin_port = sctp->sctp_lport;
+ sin4->sin_port = connp->conn_lport;
} else {
err = sctp_getmyaddrs(sctp, sin4, &addrcnt);
if (err != 0) {
@@ -1897,7 +1386,7 @@ sctp_getsockname(sctp_t *sctp, struct sockaddr *addr, socklen_t *addrlen)
if ((sctp->sctp_state <= SCTPS_LISTEN) &&
sctp->sctp_bound_to_all) {
bzero(&sin6->sin6_addr, sizeof (sin6->sin6_addr));
- sin6->sin6_port = sctp->sctp_lport;
+ sin6->sin6_port = connp->conn_lport;
} else {
err = sctp_getmyaddrs(sctp, sin6, &addrcnt);
if (err != 0) {
@@ -1906,10 +1395,7 @@ sctp_getsockname(sctp_t *sctp, struct sockaddr *addr, socklen_t *addrlen)
}
}
*addrlen = sizeof (struct sockaddr_in6);
- sin6->sin6_flowinfo = sctp->sctp_ip6h->ip6_vcf &
- ~IPV6_VERS_AND_FLOW_MASK;
- sin6->sin6_scope_id = 0;
- sin6->__sin6_src_id = 0;
+ /* Note that flowinfo is only returned for getpeername */
break;
}
WAKE_SCTP(sctp);
@@ -1927,12 +1413,13 @@ sctp_getpeername(sctp_t *sctp, struct sockaddr *addr, socklen_t *addrlen)
int err = 0;
int addrcnt = 1;
sin6_t *sin6;
+ conn_t *connp = sctp->sctp_connp;
ASSERT(sctp != NULL);
RUN_SCTP(sctp);
- addr->sa_family = sctp->sctp_family;
- switch (sctp->sctp_family) {
+ addr->sa_family = connp->conn_family;
+ switch (connp->conn_family) {
case AF_INET:
err = sctp_getpeeraddrs(sctp, addr, &addrcnt);
if (err != 0) {
@@ -1949,9 +1436,6 @@ sctp_getpeername(sctp_t *sctp, struct sockaddr *addr, socklen_t *addrlen)
break;
}
*addrlen = sizeof (struct sockaddr_in6);
- sin6->sin6_flowinfo = 0;
- sin6->sin6_scope_id = 0;
- sin6->__sin6_src_id = 0;
break;
}
WAKE_SCTP(sctp);
@@ -1973,13 +1457,14 @@ sctp_getpeeraddrs(sctp_t *sctp, void *paddrs, int *addrcnt)
int cnt;
sctp_faddr_t *fp = sctp->sctp_faddrs;
in6_addr_t addr;
+ conn_t *connp = sctp->sctp_connp;
ASSERT(sctp != NULL);
if (sctp->sctp_faddrs == NULL)
return (ENOTCONN);
- family = sctp->sctp_family;
+ family = connp->conn_family;
max = *addrcnt;
/* If we want only one, give the primary */
@@ -1989,15 +1474,26 @@ sctp_getpeeraddrs(sctp_t *sctp, void *paddrs, int *addrcnt)
case AF_INET:
sin4 = paddrs;
IN6_V4MAPPED_TO_INADDR(&addr, &sin4->sin_addr);
- sin4->sin_port = sctp->sctp_fport;
+ sin4->sin_port = connp->conn_fport;
sin4->sin_family = AF_INET;
break;
case AF_INET6:
sin6 = paddrs;
sin6->sin6_addr = addr;
- sin6->sin6_port = sctp->sctp_fport;
+ sin6->sin6_port = connp->conn_fport;
sin6->sin6_family = AF_INET6;
+ sin6->sin6_flowinfo = connp->conn_flowinfo;
+ if (IN6_IS_ADDR_LINKSCOPE(&addr) &&
+ sctp->sctp_primary != NULL &&
+ (sctp->sctp_primary->ixa->ixa_flags &
+ IXAF_SCOPEID_SET)) {
+ sin6->sin6_scope_id =
+ sctp->sctp_primary->ixa->ixa_scopeid;
+ } else {
+ sin6->sin6_scope_id = 0;
+ }
+ sin6->__sin6_src_id = 0;
break;
}
return (0);
@@ -2010,14 +1506,21 @@ sctp_getpeeraddrs(sctp_t *sctp, void *paddrs, int *addrcnt)
ASSERT(IN6_IS_ADDR_V4MAPPED(&addr));
sin4 = (struct sockaddr_in *)paddrs + cnt;
IN6_V4MAPPED_TO_INADDR(&addr, &sin4->sin_addr);
- sin4->sin_port = sctp->sctp_fport;
+ sin4->sin_port = connp->conn_fport;
sin4->sin_family = AF_INET;
break;
case AF_INET6:
sin6 = (struct sockaddr_in6 *)paddrs + cnt;
sin6->sin6_addr = addr;
- sin6->sin6_port = sctp->sctp_fport;
+ sin6->sin6_port = connp->conn_fport;
sin6->sin6_family = AF_INET6;
+ sin6->sin6_flowinfo = connp->conn_flowinfo;
+ if (IN6_IS_ADDR_LINKSCOPE(&addr) &&
+ (fp->ixa->ixa_flags & IXAF_SCOPEID_SET))
+ sin6->sin6_scope_id = fp->ixa->ixa_scopeid;
+ else
+ sin6->sin6_scope_id = 0;
+ sin6->__sin6_src_id = 0;
break;
}
}
diff --git a/usr/src/uts/common/inet/sctp/sctp_output.c b/usr/src/uts/common/inet/sctp/sctp_output.c
index c16a1166fa..1a50097260 100644
--- a/usr/src/uts/common/inet/sctp/sctp_output.c
+++ b/usr/src/uts/common/inet/sctp/sctp_output.c
@@ -38,6 +38,7 @@
#include <inet/common.h>
#include <inet/mi.h>
#include <inet/ip.h>
+#include <inet/ip_ire.h>
#include <inet/ip6.h>
#include <inet/sctp_ip.h>
#include <inet/ipclassifier.h>
@@ -140,6 +141,7 @@ sctp_sendmsg(sctp_t *sctp, mblk_t *mp, int flags)
sctp_msg_hdr_t *sctp_msg_hdr;
uint32_t msg_len = 0;
uint32_t timetolive = sctp->sctp_def_timetolive;
+ conn_t *connp = sctp->sctp_connp;
ASSERT(DB_TYPE(mproto) == M_PROTO);
@@ -228,7 +230,7 @@ sctp_sendmsg(sctp_t *sctp, mblk_t *mp, int flags)
RUN_SCTP(sctp);
sctp_user_abort(sctp, mp);
freemsg(mproto);
- goto process_sendq;
+ goto done2;
}
if (mp == NULL)
goto done;
@@ -292,15 +294,14 @@ sctp_sendmsg(sctp_t *sctp, mblk_t *mp, int flags)
/*
* Notify sockfs if the tx queue is full.
*/
- if (SCTP_TXQ_LEN(sctp) >= sctp->sctp_xmit_hiwater) {
+ if (SCTP_TXQ_LEN(sctp) >= connp->conn_sndbuf) {
sctp->sctp_txq_full = 1;
sctp->sctp_ulp_xmitted(sctp->sctp_ulpd, B_TRUE);
}
if (sctp->sctp_state == SCTPS_ESTABLISHED)
sctp_output(sctp, UINT_MAX);
-process_sendq:
+done2:
WAKE_SCTP(sctp);
- sctp_process_sendq(sctp);
return (0);
unlock_done:
WAKE_SCTP(sctp);
@@ -569,7 +570,7 @@ sctp_add_proto_hdr(sctp_t *sctp, sctp_faddr_t *fp, mblk_t *mp, int sacklen,
int *error)
{
int hdrlen;
- char *hdr;
+ uchar_t *hdr;
int isv4 = fp->isv4;
sctp_stack_t *sctps = sctp->sctp_sctps;
@@ -584,17 +585,19 @@ sctp_add_proto_hdr(sctp_t *sctp, sctp_faddr_t *fp, mblk_t *mp, int sacklen,
hdr = sctp->sctp_iphc6;
}
/*
- * A null fp->ire could mean that the address is 'down'. Similarly,
+ * A reject|blackhole could mean that the address is 'down'. Similarly,
* it is possible that the address went down, we tried to send an
* heartbeat and ended up setting fp->saddr as unspec because we
* didn't have any usable source address. In either case
- * sctp_get_ire() will try find an IRE, if available, and set
+ * sctp_get_dest() will try to find an IRE, if available, and set
* the source address, if needed. If we still don't have any
* usable source address, fp->state will be SCTP_FADDRS_UNREACH and
* we return EHOSTUNREACH.
*/
- if (fp->ire == NULL || SCTP_IS_ADDR_UNSPEC(fp->isv4, fp->saddr)) {
- sctp_get_ire(sctp, fp);
+ ASSERT(fp->ixa->ixa_ire != NULL);
+ if ((fp->ixa->ixa_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
+ SCTP_IS_ADDR_UNSPEC(fp->isv4, fp->saddr)) {
+ sctp_get_dest(sctp, fp);
if (fp->state == SCTP_FADDRS_UNREACH) {
if (error != NULL)
*error = EHOSTUNREACH;
@@ -603,8 +606,7 @@ sctp_add_proto_hdr(sctp_t *sctp, sctp_faddr_t *fp, mblk_t *mp, int sacklen,
}
/* Copy in IP header. */
if ((mp->b_rptr - mp->b_datap->db_base) <
- (sctps->sctps_wroff_xtra + hdrlen + sacklen) || DB_REF(mp) > 2 ||
- !IS_P2ALIGNED(DB_BASE(mp), sizeof (ire_t *))) {
+ (sctps->sctps_wroff_xtra + hdrlen + sacklen) || DB_REF(mp) > 2) {
mblk_t *nmp;
/*
@@ -612,8 +614,8 @@ sctp_add_proto_hdr(sctp_t *sctp, sctp_faddr_t *fp, mblk_t *mp, int sacklen,
* data was moved into chunks, or during retransmission,
* or things like snoop is running.
*/
- nmp = allocb_cred(sctps->sctps_wroff_xtra + hdrlen + sacklen,
- CONN_CRED(sctp->sctp_connp), sctp->sctp_cpid);
+ nmp = allocb(sctps->sctps_wroff_xtra + hdrlen + sacklen,
+ BPRI_MED);
if (nmp == NULL) {
if (error != NULL)
*error = ENOMEM;
@@ -625,7 +627,6 @@ sctp_add_proto_hdr(sctp_t *sctp, sctp_faddr_t *fp, mblk_t *mp, int sacklen,
mp = nmp;
} else {
mp->b_rptr -= (hdrlen + sacklen);
- mblk_setcred(mp, CONN_CRED(sctp->sctp_connp), sctp->sctp_cpid);
}
bcopy(hdr, mp->b_rptr, hdrlen);
if (sacklen) {
@@ -644,26 +645,16 @@ sctp_add_proto_hdr(sctp_t *sctp, sctp_faddr_t *fp, mblk_t *mp, int sacklen,
iph->ipha_src = INADDR_ANY;
}
} else {
- ((ip6_t *)(mp->b_rptr))->ip6_dst = fp->faddr;
+ ip6_t *ip6h = (ip6_t *)mp->b_rptr;
+
+ ip6h->ip6_dst = fp->faddr;
if (!IN6_IS_ADDR_UNSPECIFIED(&fp->saddr)) {
- ((ip6_t *)(mp->b_rptr))->ip6_src = fp->saddr;
+ ip6h->ip6_src = fp->saddr;
} else if (sctp->sctp_bound_to_all) {
- V6_SET_ZERO(((ip6_t *)(mp->b_rptr))->ip6_src);
+ ip6h->ip6_src = ipv6_all_zeros;
}
}
}
- /*
- * IP will not free this IRE if it is condemned. SCTP needs to
- * free it.
- */
- if ((fp->ire != NULL) && (fp->ire->ire_marks & IRE_MARK_CONDEMNED)) {
- IRE_REFRELE_NOTR(fp->ire);
- fp->ire = NULL;
- }
-
- /* Stash the conn and ire ptr info for IP */
- SCTP_STASH_IPINFO(mp, fp->ire);
-
return (mp);
}
@@ -985,8 +976,9 @@ sctp_fast_rexmit(sctp_t *sctp)
iph->ipha_fragment_offset_and_flags = 0;
}
- sctp_set_iplen(sctp, head);
- sctp_add_sendq(sctp, head);
+ sctp_set_iplen(sctp, head, fp->ixa);
+ (void) conn_ip_output(head, fp->ixa);
+ BUMP_LOCAL(sctp->sctp_opkts);
sctp->sctp_active = fp->lastactive = lbolt64;
}
@@ -1280,8 +1272,9 @@ sctp_output(sctp_t *sctp, uint_t num_pkt)
seglen - xtralen, ntohl(sdc->sdh_tsn),
ntohs(sdc->sdh_ssn), (void *)fp, sctp->sctp_frwnd,
cansend, sctp->sctp_lastack_rxd));
- sctp_set_iplen(sctp, head);
- sctp_add_sendq(sctp, head);
+ sctp_set_iplen(sctp, head, fp->ixa);
+ (void) conn_ip_output(head, fp->ixa);
+ BUMP_LOCAL(sctp->sctp_opkts);
/* arm rto timer (if not set) */
if (!fp->timer_running)
SCTP_FADDR_TIMER_RESTART(sctp, fp, fp->rto);
@@ -1415,8 +1408,7 @@ sctp_make_ftsn_chunk(sctp_t *sctp, sctp_faddr_t *fp, sctp_ftsn_set_t *sets,
xtralen = sctp->sctp_hdr_len + sctps->sctps_wroff_xtra;
else
xtralen = sctp->sctp_hdr6_len + sctps->sctps_wroff_xtra;
- ftsn_mp = allocb_cred(xtralen + seglen, CONN_CRED(sctp->sctp_connp),
- sctp->sctp_cpid);
+ ftsn_mp = allocb(xtralen + seglen, BPRI_MED);
if (ftsn_mp == NULL)
return (NULL);
ftsn_mp->b_rptr += xtralen;
@@ -1804,8 +1796,9 @@ out:
pkt = sctp_rexmit_packet(sctp, &meta, &mp, fp, &pkt_len);
if (pkt != NULL) {
ASSERT(pkt_len <= fp->sfa_pmss);
- sctp_set_iplen(sctp, pkt);
- sctp_add_sendq(sctp, pkt);
+ sctp_set_iplen(sctp, pkt, fp->ixa);
+ (void) conn_ip_output(pkt, fp->ixa);
+ BUMP_LOCAL(sctp->sctp_opkts);
} else {
SCTP_KSTAT(sctps, sctp_ss_rexmit_failed);
}
@@ -2022,8 +2015,9 @@ done_bundle:
sctp->sctp_rexmitting = B_TRUE;
sctp->sctp_rxt_nxttsn = first_ua_tsn;
sctp->sctp_rxt_maxtsn = sctp->sctp_ltsn - 1;
- sctp_set_iplen(sctp, head);
- sctp_add_sendq(sctp, head);
+ sctp_set_iplen(sctp, head, fp->ixa);
+ (void) conn_ip_output(head, fp->ixa);
+ BUMP_LOCAL(sctp->sctp_opkts);
/*
* Restart the oldfp timer with exponential backoff and
@@ -2305,8 +2299,9 @@ found_msg:
*/
iph->ipha_fragment_offset_and_flags = 0;
}
- sctp_set_iplen(sctp, pkt);
- sctp_add_sendq(sctp, pkt);
+ sctp_set_iplen(sctp, pkt, fp->ixa);
+ (void) conn_ip_output(pkt, fp->ixa);
+ BUMP_LOCAL(sctp->sctp_opkts);
/* Check and see if there is more chunk to be retransmitted. */
if (tot_wnd <= pkt_len || tot_wnd - pkt_len < fp->sfa_pmss ||
diff --git a/usr/src/uts/common/inet/sctp/sctp_param.c b/usr/src/uts/common/inet/sctp/sctp_param.c
index 5d5ed19676..26365c5a06 100644
--- a/usr/src/uts/common/inet/sctp/sctp_param.c
+++ b/usr/src/uts/common/inet/sctp/sctp_param.c
@@ -20,12 +20,10 @@
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/stream.h>
#include <sys/socket.h>
#include <sys/ddi.h>
@@ -72,11 +70,8 @@
/*
* sctp_wroff_xtra is the extra space in front of SCTP/IP header for link
* layer header. It has to be a multiple of 4.
- * Also there has to be enough space to stash in information passed between
- * IP and SCTP.
*/
-sctpparam_t lcl_sctp_wroff_xtra_param = { sizeof (conn_t *) + sizeof (ire_t *),
- 256, 32, "sctp_wroff_xtra" };
+sctpparam_t lcl_sctp_wroff_xtra_param = { 0, 256, 32, "sctp_wroff_xtra" };
/*
* All of these are alterable, within the min/max values given, at run time.
@@ -343,7 +338,7 @@ sctp_nd_init(sctp_stack_t *sctps)
bcopy(lcl_sctp_param_arr, pa, sizeof (lcl_sctp_param_arr));
sctps->sctps_params = pa;
return (sctp_param_register(&sctps->sctps_g_nd, pa,
- A_CNT(lcl_sctp_param_arr), sctps));
+ A_CNT(lcl_sctp_param_arr), sctps));
}
int
diff --git a/usr/src/uts/common/inet/sctp/sctp_shutdown.c b/usr/src/uts/common/inet/sctp/sctp_shutdown.c
index b58016eb15..ff835a60c0 100644
--- a/usr/src/uts/common/inet/sctp/sctp_shutdown.c
+++ b/usr/src/uts/common/inet/sctp/sctp_shutdown.c
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -35,6 +35,7 @@
#include <netinet/in.h>
#include <netinet/ip6.h>
+#include <inet/ipsec_impl.h>
#include <inet/common.h>
#include <inet/ip.h>
#include <inet/ip6.h>
@@ -129,12 +130,12 @@ sctp_send_shutdown(sctp_t *sctp, int rexmit)
/* Link the shutdown chunk in after the IP/SCTP header */
- sctp_set_iplen(sctp, sendmp);
-
BUMP_LOCAL(sctp->sctp_obchunks);
/* Send the shutdown and restart the timer */
- sctp_add_sendq(sctp, sendmp);
+ sctp_set_iplen(sctp, sendmp, fp->ixa);
+ (void) conn_ip_output(sendmp, fp->ixa);
+ BUMP_LOCAL(sctp->sctp_opkts);
done:
sctp->sctp_state = SCTPS_SHUTDOWN_SENT;
@@ -211,11 +212,11 @@ sctp_shutdown_received(sctp_t *sctp, sctp_chunk_hdr_t *sch, boolean_t crwsd,
}
}
- sctp_set_iplen(sctp, samp);
-
BUMP_LOCAL(sctp->sctp_obchunks);
- sctp_add_sendq(sctp, samp);
+ sctp_set_iplen(sctp, samp, fp->ixa);
+ (void) conn_ip_output(samp, fp->ixa);
+ BUMP_LOCAL(sctp->sctp_opkts);
dotimer:
sctp->sctp_state = SCTPS_SHUTDOWN_ACK_SENT;
@@ -232,7 +233,7 @@ sctp_shutdown_complete(sctp_t *sctp)
sctp_chunk_hdr_t *scch;
sctp_stack_t *sctps = sctp->sctp_sctps;
- scmp = sctp_make_mp(sctp, NULL, sizeof (*scch));
+ scmp = sctp_make_mp(sctp, sctp->sctp_current, sizeof (*scch));
if (scmp == NULL) {
/* XXX use timer approach */
SCTP_KSTAT(sctps, sctp_send_shutdown_comp_failed);
@@ -246,11 +247,11 @@ sctp_shutdown_complete(sctp_t *sctp)
scmp->b_wptr += sizeof (*scch);
- sctp_set_iplen(sctp, scmp);
-
BUMP_LOCAL(sctp->sctp_obchunks);
- sctp_add_sendq(sctp, scmp);
+ sctp_set_iplen(sctp, scmp, sctp->sctp_current->ixa);
+ (void) conn_ip_output(scmp, sctp->sctp_current->ixa);
+ BUMP_LOCAL(sctp->sctp_opkts);
}
/*
@@ -259,91 +260,99 @@ sctp_shutdown_complete(sctp_t *sctp)
* and instead must draw all necessary info from the incoming packet.
*/
void
-sctp_ootb_shutdown_ack(sctp_t *gsctp, mblk_t *inmp, uint_t ip_hdr_len)
+sctp_ootb_shutdown_ack(mblk_t *mp, uint_t ip_hdr_len, ip_recv_attr_t *ira,
+ ip_stack_t *ipst)
{
boolean_t isv4;
- ipha_t *inip4h;
- ip6_t *inip6h;
+ ipha_t *ipha = NULL;
+ ip6_t *ip6h = NULL;
sctp_hdr_t *insctph;
sctp_chunk_hdr_t *scch;
int i;
uint16_t port;
mblk_t *mp1;
- sctp_stack_t *sctps = gsctp->sctp_sctps;
+ netstack_t *ns = ipst->ips_netstack;
+ sctp_stack_t *sctps = ns->netstack_sctp;
+ ip_xmit_attr_t ixas;
- isv4 = (IPH_HDR_VERSION(inmp->b_rptr) == IPV4_VERSION);
+ bzero(&ixas, sizeof (ixas));
- /*
- * The gsctp should contain the minimal IP header. So the
- * incoming mblk should be able to hold the new SCTP packet.
- */
- ASSERT(MBLKL(inmp) >= sizeof (*insctph) + sizeof (*scch) +
- (isv4 ? gsctp->sctp_ip_hdr_len : gsctp->sctp_ip_hdr6_len));
+ isv4 = (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION);
+
+ ASSERT(MBLKL(mp) >= sizeof (*insctph) + sizeof (*scch) +
+ (isv4 ? sizeof (ipha_t) : sizeof (ip6_t)));
/*
* Check to see if we can reuse the incoming mblk. There should
- * not be other reference and the db_base of the mblk should be
- * properly aligned. Since this packet comes from below,
+ * not be other reference. Since this packet comes from below,
* there should be enough header space to fill in what the lower
- * layers want to add. And we will not stash anything there.
+ * layers want to add.
*/
- if (!IS_P2ALIGNED(DB_BASE(inmp), sizeof (ire_t *)) ||
- DB_REF(inmp) != 1) {
- mp1 = allocb(MBLKL(inmp) + sctps->sctps_wroff_xtra, BPRI_MED);
+ if (DB_REF(mp) != 1) {
+ mp1 = allocb(MBLKL(mp) + sctps->sctps_wroff_xtra, BPRI_MED);
if (mp1 == NULL) {
- freeb(inmp);
+ freeb(mp);
return;
}
mp1->b_rptr += sctps->sctps_wroff_xtra;
- mp1->b_wptr = mp1->b_rptr + MBLKL(inmp);
- bcopy(inmp->b_rptr, mp1->b_rptr, MBLKL(inmp));
- freeb(inmp);
- inmp = mp1;
+ mp1->b_wptr = mp1->b_rptr + MBLKL(mp);
+ bcopy(mp->b_rptr, mp1->b_rptr, MBLKL(mp));
+ freeb(mp);
+ mp = mp1;
} else {
- ASSERT(DB_CKSUMFLAGS(inmp) == 0);
+ DB_CKSUMFLAGS(mp) = 0;
}
+ ixas.ixa_pktlen = ip_hdr_len + sizeof (*insctph) + sizeof (*scch);
+ ixas.ixa_ip_hdr_length = ip_hdr_len;
/*
* We follow the logic in tcp_xmit_early_reset() in that we skip
- * reversing source route (i.e. relpace all IP options with EOL).
+ * reversing source route (i.e. replace all IP options with EOL).
*/
if (isv4) {
ipaddr_t v4addr;
- inip4h = (ipha_t *)inmp->b_rptr;
+ ipha = (ipha_t *)mp->b_rptr;
for (i = IP_SIMPLE_HDR_LENGTH; i < (int)ip_hdr_len; i++)
- inmp->b_rptr[i] = IPOPT_EOL;
+ mp->b_rptr[i] = IPOPT_EOL;
/* Swap addresses */
- inip4h->ipha_length = htons(ip_hdr_len + sizeof (*insctph) +
- sizeof (*scch));
- v4addr = inip4h->ipha_src;
- inip4h->ipha_src = inip4h->ipha_dst;
- inip4h->ipha_dst = v4addr;
- inip4h->ipha_ident = 0;
- inip4h->ipha_ttl = (uchar_t)sctps->sctps_ipv4_ttl;
+ ipha->ipha_length = htons(ixas.ixa_pktlen);
+ v4addr = ipha->ipha_src;
+ ipha->ipha_src = ipha->ipha_dst;
+ ipha->ipha_dst = v4addr;
+ ipha->ipha_ident = 0;
+ ipha->ipha_ttl = (uchar_t)sctps->sctps_ipv4_ttl;
+
+ ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
} else {
in6_addr_t v6addr;
- inip6h = (ip6_t *)inmp->b_rptr;
+ ip6h = (ip6_t *)mp->b_rptr;
/* Remove any extension headers assuming partial overlay */
if (ip_hdr_len > IPV6_HDR_LEN) {
uint8_t *to;
- to = inmp->b_rptr + ip_hdr_len - IPV6_HDR_LEN;
- ovbcopy(inip6h, to, IPV6_HDR_LEN);
- inmp->b_rptr += ip_hdr_len - IPV6_HDR_LEN;
+ to = mp->b_rptr + ip_hdr_len - IPV6_HDR_LEN;
+ ovbcopy(ip6h, to, IPV6_HDR_LEN);
+ mp->b_rptr += ip_hdr_len - IPV6_HDR_LEN;
ip_hdr_len = IPV6_HDR_LEN;
- inip6h = (ip6_t *)inmp->b_rptr;
- inip6h->ip6_nxt = IPPROTO_SCTP;
+ ip6h = (ip6_t *)mp->b_rptr;
+ ip6h->ip6_nxt = IPPROTO_SCTP;
+ }
+ ip6h->ip6_plen = htons(ixas.ixa_pktlen - IPV6_HDR_LEN);
+ v6addr = ip6h->ip6_src;
+ ip6h->ip6_src = ip6h->ip6_dst;
+ ip6h->ip6_dst = v6addr;
+ ip6h->ip6_hops = (uchar_t)sctps->sctps_ipv6_hoplimit;
+
+ ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6;
+ if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_dst)) {
+ ixas.ixa_flags |= IXAF_SCOPEID_SET;
+ ixas.ixa_scopeid = ira->ira_ruifindex;
}
- inip6h->ip6_plen = htons(ip_hdr_len + sizeof (*insctph) +
- sizeof (*scch) - IPV6_HDR_LEN);
- v6addr = inip6h->ip6_src;
- inip6h->ip6_src = inip6h->ip6_dst;
- inip6h->ip6_dst = v6addr;
- inip6h->ip6_hops = (uchar_t)sctps->sctps_ipv6_hoplimit;
}
- insctph = (sctp_hdr_t *)(inmp->b_rptr + ip_hdr_len);
+
+ insctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_len);
/* Swap ports. Verification tag is reused. */
port = insctph->sh_sport;
@@ -359,9 +368,29 @@ sctp_ootb_shutdown_ack(sctp_t *gsctp, mblk_t *inmp, uint_t ip_hdr_len)
/* Set the T-bit */
SCTP_SET_TBIT(scch);
- BUMP_LOCAL(gsctp->sctp_obchunks);
- /* Nothing to stash... */
- SCTP_STASH_IPINFO(inmp, (ire_t *)NULL);
+ ixas.ixa_protocol = IPPROTO_SCTP;
+ ixas.ixa_zoneid = ira->ira_zoneid;
+ ixas.ixa_ipst = ipst;
+ ixas.ixa_ifindex = 0;
+
+ if (ira->ira_flags & IRAF_IPSEC_SECURE) {
+ /*
+ * Apply IPsec based on how IPsec was applied to
+ * the packet that was out of the blue.
+ */
+ if (!ipsec_in_to_out(ira, &ixas, mp, ipha, ip6h)) {
+ BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
+ /* Note: mp already consumed and ip_drop_packet done */
+ return;
+ }
+ } else {
+ /*
+ * This is in clear. The message we are building
+ * here should go out in clear, independent of our policy.
+ */
+ ixas.ixa_flags |= IXAF_NO_IPSEC;
+ }
- sctp_add_sendq(gsctp, inmp);
+ (void) ip_output_simple(mp, &ixas);
+ ixa_cleanup(&ixas);
}
diff --git a/usr/src/uts/common/inet/sctp/sctp_snmp.c b/usr/src/uts/common/inet/sctp/sctp_snmp.c
index f859cd6ba5..f1e7deceae 100644
--- a/usr/src/uts/common/inet/sctp/sctp_snmp.c
+++ b/usr/src/uts/common/inet/sctp/sctp_snmp.c
@@ -78,9 +78,9 @@ sctp_kstat_update(kstat_t *kp, int rw)
* individual set of statistics.
*/
SET_MIB(sctps->sctps_mib.sctpCurrEstab, 0);
- sctp = sctps->sctps_gsctp;
sctp_prev = NULL;
mutex_enter(&sctps->sctps_g_lock);
+ sctp = list_head(&sctps->sctps_g_list);
while (sctp != NULL) {
mutex_enter(&sctp->sctp_reflock);
if (sctp->sctp_condemned) {
@@ -471,8 +471,8 @@ sctp_snmp_get_mib2(queue_t *q, mblk_t *mpctl, sctp_stack_t *sctps)
SET_MIB(sctps->sctps_mib.sctpCurrEstab, 0);
idx = 0;
- sctp = sctps->sctps_gsctp;
mutex_enter(&sctps->sctps_g_lock);
+ sctp = list_head(&sctps->sctps_g_list);
while (sctp != NULL) {
mutex_enter(&sctp->sctp_reflock);
if (sctp->sctp_condemned) {
@@ -541,8 +541,8 @@ sctp_snmp_get_mib2(queue_t *q, mblk_t *mpctl, sctp_stack_t *sctps)
sctp->sctp_reassmsgs = 0;
sce.sctpAssocId = ntohl(sctp->sctp_lvtag);
- sce.sctpAssocLocalPort = ntohs(sctp->sctp_lport);
- sce.sctpAssocRemPort = ntohs(sctp->sctp_fport);
+ sce.sctpAssocLocalPort = ntohs(sctp->sctp_connp->conn_lport);
+ sce.sctpAssocRemPort = ntohs(sctp->sctp_connp->conn_fport);
RUN_SCTP(sctp);
if (sctp->sctp_primary != NULL) {
@@ -659,11 +659,10 @@ done:
needattr = B_TRUE;
break;
}
- if (connp->conn_fully_bound &&
- connp->conn_effective_cred != NULL) {
+ if (sctp->sctp_connp->conn_ixa->ixa_tsl != NULL) {
ts_label_t *tsl;
- tsl = crgetlabel(connp->conn_effective_cred);
+ tsl = sctp->sctp_connp->conn_ixa->ixa_tsl;
mlp.tme_flags |= MIB2_TMEF_IS_LABELED;
mlp.tme_doi = label2doi(tsl);
mlp.tme_label = *label2bslabel(tsl);
diff --git a/usr/src/uts/common/inet/sctp/sctp_stack.h b/usr/src/uts/common/inet/sctp/sctp_stack.h
index d467b38a17..e9ad5cf9c7 100644
--- a/usr/src/uts/common/inet/sctp/sctp_stack.h
+++ b/usr/src/uts/common/inet/sctp/sctp_stack.h
@@ -20,15 +20,13 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _INET_SCTP_SCTP_STACK_H
#define _INET_SCTP_SCTP_STACK_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/netstack.h>
#include <sys/taskq.h>
@@ -76,17 +74,6 @@ struct sctp_stack {
mib2_sctp_t sctps_mib;
- /* Protected by sctps_g_q_lock */
- queue_t *sctps_g_q;
- uint_t sctps_g_q_ref; /* Number of sctp_t's that use it */
- kmutex_t sctps_g_q_lock;
- kcondvar_t sctps_g_q_cv;
- kthread_t *sctps_g_q_creator;
- struct __ldi_handle *sctps_g_q_lh;
- cred_t *sctps_g_q_cr; /* For _inactive close call */
- /* The default sctp_t for responding out of the blue packets. */
- struct sctp_s *sctps_gsctp;
-
/* Protected by sctps_g_lock */
struct list sctps_g_list; /* SCTP instance data chain */
kmutex_t sctps_g_lock;
diff --git a/usr/src/uts/common/inet/sctp/sctp_timer.c b/usr/src/uts/common/inet/sctp/sctp_timer.c
index c6fd4a5c71..24b46ad6f0 100644
--- a/usr/src/uts/common/inet/sctp/sctp_timer.c
+++ b/usr/src/uts/common/inet/sctp/sctp_timer.c
@@ -220,7 +220,6 @@ sctp_timer_fire(sctp_tb_t *sctp_tb)
sctp_timer_call(sctp, mp);
WAKE_SCTP(sctp);
- sctp_process_sendq(sctp);
}
SCTP_REFRELE(sctp);
}
@@ -429,7 +428,7 @@ sctp_heartbeat_timer(sctp_t *sctp)
for (fp = sctp->sctp_faddrs; fp != NULL; fp = fp->next) {
/*
* If the peer is unreachable because there is no available
- * source address, call sctp_get_ire() to see if it is
+ * source address, call sctp_get_dest() to see if it is
* reachable now. If it is OK, the state will become
* unconfirmed. And the following code to handle unconfirmed
* address will be executed. If it is still not OK,
@@ -438,7 +437,7 @@ sctp_heartbeat_timer(sctp_t *sctp)
* is disable, this retry may go on forever.
*/
if (fp->state == SCTP_FADDRS_UNREACH) {
- sctp_get_ire(sctp, fp);
+ sctp_get_dest(sctp, fp);
if (fp->state == SCTP_FADDRS_UNREACH) {
if (fp->hb_enabled &&
++fp->strikes > fp->max_retr &&
@@ -642,15 +641,14 @@ rxmit_init:
* address list won't be modified (it would have been done
* the first time around).
*/
- mp = sctp_init_mp(sctp);
+ mp = sctp_init_mp(sctp, fp);
if (mp != NULL) {
BUMP_MIB(&sctps->sctps_mib, sctpTimRetrans);
- sctp_add_sendq(sctp, mp);
+ (void) conn_ip_output(mp, fp->ixa);
+ BUMP_LOCAL(sctp->sctp_opkts);
}
break;
- case SCTPS_COOKIE_ECHOED: {
- ipha_t *iph;
-
+ case SCTPS_COOKIE_ECHOED:
BUMP_LOCAL(sctp->sctp_T1expire);
if (sctp->sctp_cookie_mp == NULL) {
sctp->sctp_state = SCTPS_COOKIE_WAIT;
@@ -659,14 +657,10 @@ rxmit_init:
mp = dupmsg(sctp->sctp_cookie_mp);
if (mp == NULL)
break;
- iph = (ipha_t *)mp->b_rptr;
- /* Reset the IP ident. */
- if (IPH_HDR_VERSION(iph) == IPV4_VERSION)
- iph->ipha_ident = 0;
- sctp_add_sendq(sctp, mp);
+ (void) conn_ip_output(mp, fp->ixa);
+ BUMP_LOCAL(sctp->sctp_opkts);
BUMP_MIB(&sctps->sctps_mib, sctpTimRetrans);
break;
- }
case SCTPS_SHUTDOWN_SENT:
BUMP_LOCAL(sctp->sctp_T2expire);
sctp_send_shutdown(sctp, 1);
diff --git a/usr/src/uts/common/inet/sctp_ip.h b/usr/src/uts/common/inet/sctp_ip.h
index 7b20d3fd2b..9e4c2ef7ec 100644
--- a/usr/src/uts/common/inet/sctp_ip.h
+++ b/usr/src/uts/common/inet/sctp_ip.h
@@ -35,40 +35,24 @@ extern "C" {
#define SCTP_COMMON_HDR_LENGTH 12 /* SCTP common header length */
/* SCTP routines for IP to call. */
-extern void ip_fanout_sctp(mblk_t *, ill_t *, ipha_t *, uint32_t,
- uint_t, boolean_t, boolean_t, zoneid_t);
+extern void ip_fanout_sctp(mblk_t *, ipha_t *, ip6_t *, uint32_t,
+ ip_recv_attr_t *);
extern void sctp_ddi_g_init(void);
extern void sctp_ddi_g_destroy(void);
extern conn_t *sctp_find_conn(in6_addr_t *, in6_addr_t *, uint32_t,
- zoneid_t, sctp_stack_t *);
+ zoneid_t, iaflags_t, sctp_stack_t *);
extern conn_t *sctp_fanout(in6_addr_t *, in6_addr_t *, uint32_t,
- zoneid_t, mblk_t *, sctp_stack_t *);
+ ip_recv_attr_t *, mblk_t *, sctp_stack_t *);
-extern void sctp_input(conn_t *, ipha_t *, mblk_t *, mblk_t *, ill_t *,
- boolean_t, boolean_t);
+extern void sctp_input(conn_t *, ipha_t *, ip6_t *, mblk_t *, ip_recv_attr_t *);
extern void sctp_wput(queue_t *, mblk_t *);
-extern void sctp_ootb_input(mblk_t *, ill_t *, zoneid_t, boolean_t);
+extern void sctp_ootb_input(mblk_t *, ip_recv_attr_t *, ip_stack_t *);
extern void sctp_hash_init(sctp_stack_t *);
extern void sctp_hash_destroy(sctp_stack_t *);
extern uint32_t sctp_cksum(mblk_t *, int);
extern mblk_t *sctp_snmp_get_mib2(queue_t *, mblk_t *, sctp_stack_t *);
extern void sctp_free(conn_t *);
-#define SCTP_STASH_IPINFO(mp, ire) \
-{ \
- unsigned char *stp; \
- stp = DB_BASE((mp)); \
- ASSERT(stp + sizeof (ire_t *) < (mp)->b_rptr); \
- *(ire_t **)stp = (ire); \
-}
-
-#define SCTP_EXTRACT_IPINFO(mp, ire) \
-{ \
- unsigned char *stp; \
- stp = (mp)->b_datap->db_base; \
- (ire) = *(ire_t **)stp; \
-}
-
/*
* SCTP maintains a list of ILLs/IPIFs, these functions are provided by
* SCTP to keep its interface list up to date.
@@ -87,16 +71,8 @@ extern void sctp_ill_reindex(ill_t *, uint_t);
#define SCTP_IPIF_UPDATE 6
/* IP routines for SCTP to call. */
-extern void ip_fanout_sctp_raw(mblk_t *, ill_t *, ipha_t *, boolean_t,
- uint32_t, boolean_t, uint_t, boolean_t, zoneid_t);
-extern void sctp_ire_cache_flush(ipif_t *);
-
-/*
- * Private (and possibly temporary) ioctls. It is a large number
- * to avoid conflict with other ioctls, which are normally smaller
- * than 2^16.
- */
-#define SCTP_IOC_DEFAULT_Q (('S' << 16) | 1024)
+extern void ip_fanout_sctp_raw(mblk_t *, ipha_t *, ip6_t *, uint32_t,
+ ip_recv_attr_t *);
#ifdef __cplusplus
}
diff --git a/usr/src/uts/common/inet/sctp_itf.h b/usr/src/uts/common/inet/sctp_itf.h
index 9ce69fdaf0..2ae6d3669f 100644
--- a/usr/src/uts/common/inet/sctp_itf.h
+++ b/usr/src/uts/common/inet/sctp_itf.h
@@ -83,9 +83,9 @@ extern int sctp_bindx(struct sctp_s *conn, const void *addrs, int addrcnt,
int flags);
extern void sctp_close(struct sctp_s *conn);
extern int sctp_connect(struct sctp_s *conn, const struct sockaddr *dst,
- socklen_t addrlen);
+ socklen_t addrlen, cred_t *cr, pid_t pid);
extern struct sctp_s *sctp_create(void *newhandle, struct sctp_s *parent,
- int family, int flags, struct sock_upcalls_s *su,
+ int family, int type, int flags, struct sock_upcalls_s *su,
sctp_sockbuf_limits_t *sbl, cred_t *cr);
extern int sctp_disconnect(struct sctp_s *conn);
extern int sctp_get_opt(struct sctp_s *conn, int level, int opt, void *opts,
diff --git a/usr/src/uts/common/inet/sockmods/socksctp.c b/usr/src/uts/common/inet/sockmods/socksctp.c
index 7da9f92dde..4df7e33501 100644
--- a/usr/src/uts/common/inet/sockmods/socksctp.c
+++ b/usr/src/uts/common/inet/sockmods/socksctp.c
@@ -207,7 +207,7 @@ sosctp_init(struct sonode *so, struct sonode *pso, struct cred *cr, int flags)
upcalls = &sosctp_assoc_upcalls;
}
so->so_proto_handle = (sock_lower_handle_t)sctp_create(so, NULL,
- so->so_family, SCTP_CAN_BLOCK, upcalls, &sbl, cr);
+ so->so_family, so->so_type, SCTP_CAN_BLOCK, upcalls, &sbl, cr);
if (so->so_proto_handle == NULL)
return (ENOMEM);
@@ -350,6 +350,7 @@ sosctp_connect(struct sonode *so, const struct sockaddr *name,
socklen_t namelen, int fflag, int flags, struct cred *cr)
{
int error = 0;
+ pid_t pid = curproc->p_pid;
ASSERT(so->so_type == SOCK_STREAM);
@@ -404,7 +405,7 @@ sosctp_connect(struct sonode *so, const struct sockaddr *name,
mutex_exit(&so->so_lock);
error = sctp_connect((struct sctp_s *)so->so_proto_handle,
- name, namelen);
+ name, namelen, cr, pid);
mutex_enter(&so->so_lock);
if (error == 0) {
@@ -662,7 +663,7 @@ done:
int
sosctp_uiomove(mblk_t *hdr_mp, ssize_t count, ssize_t blk_size, int wroff,
- struct uio *uiop, int flags, cred_t *cr)
+ struct uio *uiop, int flags)
{
ssize_t size;
int error;
@@ -683,8 +684,7 @@ sosctp_uiomove(mblk_t *hdr_mp, ssize_t count, ssize_t blk_size, int wroff,
* packets, each mblk will have the extra space before
* data to accommodate what SCTP wants to put in there.
*/
- while ((mp = allocb_cred(size + wroff, cr,
- curproc->p_pid)) == NULL) {
+ while ((mp = allocb(size + wroff, BPRI_MED)) == NULL) {
if ((uiop->uio_fmode & (FNDELAY|FNONBLOCK)) ||
(flags & MSG_DONTWAIT)) {
return (EAGAIN);
@@ -887,7 +887,7 @@ sosctp_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
/* Copy in the message. */
if ((error = sosctp_uiomove(mctl, count, ss->ss_wrsize, ss->ss_wroff,
- uiop, flags, cr)) != 0) {
+ uiop, flags)) != 0) {
goto error_ret;
}
error = sctp_sendmsg((struct sctp_s *)so->so_proto_handle, mctl, 0);
@@ -1091,7 +1091,7 @@ sosctp_seq_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
/* Copy in the message. */
if ((error = sosctp_uiomove(mctl, count, ssa->ssa_wrsize,
- ssa->ssa_wroff, uiop, flags, cr)) != 0) {
+ ssa->ssa_wroff, uiop, flags)) != 0) {
goto lock_rele;
}
error = sctp_sendmsg((struct sctp_s *)ssa->ssa_conn, mctl, 0);
diff --git a/usr/src/uts/common/inet/sockmods/socksctp.h b/usr/src/uts/common/inet/sockmods/socksctp.h
index b02622c994..2ac7058821 100644
--- a/usr/src/uts/common/inet/sockmods/socksctp.h
+++ b/usr/src/uts/common/inet/sockmods/socksctp.h
@@ -116,7 +116,7 @@ extern void sosctp_assoc_isdisconnected(struct sctp_soassoc *ssa, int error);
extern int sosctp_waitconnected(struct sonode *so, int fmode);
extern int sosctp_uiomove(mblk_t *hdr_mp, ssize_t count, ssize_t blk_size,
- int wroff, struct uio *uiop, int flags, cred_t *cr);
+ int wroff, struct uio *uiop, int flags);
/*
* Data structure types.
diff --git a/usr/src/uts/common/inet/sockmods/socksctpsubr.c b/usr/src/uts/common/inet/sockmods/socksctpsubr.c
index 4a4cb08007..a647cbe4f2 100644
--- a/usr/src/uts/common/inet/sockmods/socksctpsubr.c
+++ b/usr/src/uts/common/inet/sockmods/socksctpsubr.c
@@ -367,6 +367,7 @@ sosctp_assoc_createconn(struct sctp_sonode *ss, const struct sockaddr *name,
sctp_assoc_t id;
int error;
struct cmsghdr *cmsg;
+ pid_t pid = curproc->p_pid;
ASSERT(MUTEX_HELD(&so->so_lock));
@@ -407,7 +408,8 @@ sosctp_assoc_createconn(struct sctp_sonode *ss, const struct sockaddr *name,
ssa->ssa_wroff = ss->ss_wroff;
ssa->ssa_wrsize = ss->ss_wrsize;
ssa->ssa_conn = sctp_create(ssa, (struct sctp_s *)so->so_proto_handle,
- so->so_family, SCTP_CAN_BLOCK, &sosctp_assoc_upcalls, &sbl, cr);
+ so->so_family, so->so_type, SCTP_CAN_BLOCK, &sosctp_assoc_upcalls,
+ &sbl, cr);
mutex_enter(&so->so_lock);
ss->ss_assocs[id].ssi_assoc = ssa;
@@ -435,7 +437,7 @@ sosctp_assoc_createconn(struct sctp_sonode *ss, const struct sockaddr *name,
goto ret_err;
}
- if ((error = sctp_connect(ssa->ssa_conn, name, namelen)) != 0)
+ if ((error = sctp_connect(ssa->ssa_conn, name, namelen, cr, pid)) != 0)
goto ret_err;
mutex_enter(&so->so_lock);
diff --git a/usr/src/uts/common/inet/spdsock.h b/usr/src/uts/common/inet/spdsock.h
index 7622e56a45..64c63cdd71 100644
--- a/usr/src/uts/common/inet/spdsock.h
+++ b/usr/src/uts/common/inet/spdsock.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -110,7 +110,7 @@ extern uint_t spdsock_max_optsize;
extern int spdsock_opt_get(queue_t *, int, int, uchar_t *);
extern int spdsock_opt_set(queue_t *, uint_t, int, int, uint_t, uchar_t *,
- uint_t *, uchar_t *, void *, cred_t *, mblk_t *);
+ uint_t *, uchar_t *, void *, cred_t *);
#ifdef __cplusplus
}
diff --git a/usr/src/uts/common/inet/squeue.c b/usr/src/uts/common/inet/squeue.c
index e46293d820..db11ef79ae 100644
--- a/usr/src/uts/common/inet/squeue.c
+++ b/usr/src/uts/common/inet/squeue.c
@@ -39,8 +39,8 @@
* parallelization (on a per H/W execution pipeline basis) with at
* most one queuing.
*
- * The modules needing protection typically calls squeue_enter() or
- * squeue_enter_chain() routine as soon as a thread enter the module
+ * The modules needing protection typically calls SQUEUE_ENTER_ONE() or
+ * SQUEUE_ENTER() macro as soon as a thread enter the module
* from either direction. For each packet, the processing function
* and argument is stored in the mblk itself. When the packet is ready
* to be processed, the squeue retrieves the stored function and calls
@@ -406,11 +406,15 @@ squeue_worker_wakeup(squeue_t *sqp)
* and drain in the entering thread context. If process_flag is
* SQ_FILL, then we just queue the mblk and return (after signaling
* the worker thread if no one else is processing the squeue).
+ *
+ * The ira argument can be used when the count is one.
+ * For a chain the caller needs to prepend any needed mblks from
+ * ip_recv_attr_to_mblk().
*/
/* ARGSUSED */
void
squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt,
- int process_flag, uint8_t tag)
+ ip_recv_attr_t *ira, int process_flag, uint8_t tag)
{
conn_t *connp;
sqproc_t proc;
@@ -421,6 +425,7 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt,
ASSERT(tail != NULL);
ASSERT(cnt > 0);
ASSERT(MUTEX_NOT_HELD(&sqp->sq_lock));
+ ASSERT(ira == NULL || cnt == 1);
mutex_enter(&sqp->sq_lock);
@@ -467,7 +472,7 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt,
connp->conn_on_sqp = B_TRUE;
DTRACE_PROBE3(squeue__proc__start, squeue_t *,
sqp, mblk_t *, mp, conn_t *, connp);
- (*proc)(connp, mp, sqp);
+ (*proc)(connp, mp, sqp, ira);
DTRACE_PROBE2(squeue__proc__end, squeue_t *,
sqp, conn_t *, connp);
connp->conn_on_sqp = B_FALSE;
@@ -475,7 +480,7 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt,
CONN_DEC_REF(connp);
} else {
SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc,
- connp, SQ_FILL, SQTAG_SQUEUE_CHANGE);
+ connp, ira, SQ_FILL, SQTAG_SQUEUE_CHANGE);
}
ASSERT(MUTEX_NOT_HELD(&sqp->sq_lock));
mutex_enter(&sqp->sq_lock);
@@ -499,6 +504,33 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt,
return;
}
} else {
+ if (ira != NULL) {
+ mblk_t *attrmp;
+
+ ASSERT(cnt == 1);
+ attrmp = ip_recv_attr_to_mblk(ira);
+ if (attrmp == NULL) {
+ mutex_exit(&sqp->sq_lock);
+ ip_drop_input("squeue: "
+ "ip_recv_attr_to_mblk",
+ mp, NULL);
+ /* Caller already set b_prev/b_next */
+ mp->b_prev = mp->b_next = NULL;
+ freemsg(mp);
+ return;
+ }
+ ASSERT(attrmp->b_cont == NULL);
+ attrmp->b_cont = mp;
+ /* Move connp and func to new */
+ attrmp->b_queue = mp->b_queue;
+ mp->b_queue = NULL;
+ attrmp->b_prev = mp->b_prev;
+ mp->b_prev = NULL;
+
+ ASSERT(mp == tail);
+ tail = mp = attrmp;
+ }
+
ENQUEUE_CHAIN(sqp, mp, tail, cnt);
#ifdef DEBUG
mp->b_tag = tag;
@@ -564,14 +596,14 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt,
connp->conn_on_sqp = B_TRUE;
DTRACE_PROBE3(squeue__proc__start, squeue_t *,
sqp, mblk_t *, mp, conn_t *, connp);
- (*proc)(connp, mp, sqp);
+ (*proc)(connp, mp, sqp, ira);
DTRACE_PROBE2(squeue__proc__end, squeue_t *,
sqp, conn_t *, connp);
connp->conn_on_sqp = B_FALSE;
CONN_DEC_REF(connp);
} else {
SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc,
- connp, SQ_FILL, SQTAG_SQUEUE_CHANGE);
+ connp, ira, SQ_FILL, SQTAG_SQUEUE_CHANGE);
}
mutex_enter(&sqp->sq_lock);
@@ -589,7 +621,31 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt,
#ifdef DEBUG
mp->b_tag = tag;
#endif
+ if (ira != NULL) {
+ mblk_t *attrmp;
+ ASSERT(cnt == 1);
+ attrmp = ip_recv_attr_to_mblk(ira);
+ if (attrmp == NULL) {
+ mutex_exit(&sqp->sq_lock);
+ ip_drop_input("squeue: ip_recv_attr_to_mblk",
+ mp, NULL);
+ /* Caller already set b_prev/b_next */
+ mp->b_prev = mp->b_next = NULL;
+ freemsg(mp);
+ return;
+ }
+ ASSERT(attrmp->b_cont == NULL);
+ attrmp->b_cont = mp;
+ /* Move connp and func to new */
+ attrmp->b_queue = mp->b_queue;
+ mp->b_queue = NULL;
+ attrmp->b_prev = mp->b_prev;
+ mp->b_prev = NULL;
+
+ ASSERT(mp == tail);
+ tail = mp = attrmp;
+ }
ENQUEUE_CHAIN(sqp, mp, tail, cnt);
if (!(sqp->sq_state & SQS_PROC)) {
squeue_worker_wakeup(sqp);
@@ -653,6 +709,7 @@ squeue_drain(squeue_t *sqp, uint_t proc_type, hrtime_t expire)
hrtime_t now;
boolean_t did_wakeup = B_FALSE;
boolean_t sq_poll_capable;
+ ip_recv_attr_t *ira, iras;
sq_poll_capable = (sqp->sq_state & SQS_POLL_CAPAB) != 0;
again:
@@ -697,6 +754,31 @@ again:
connp = (conn_t *)mp->b_prev;
mp->b_prev = NULL;
+ /* Is there an ip_recv_attr_t to handle? */
+ if (ip_recv_attr_is_mblk(mp)) {
+ mblk_t *attrmp = mp;
+
+ ASSERT(attrmp->b_cont != NULL);
+
+ mp = attrmp->b_cont;
+ attrmp->b_cont = NULL;
+ ASSERT(mp->b_queue == NULL);
+ ASSERT(mp->b_prev == NULL);
+
+ if (!ip_recv_attr_from_mblk(attrmp, &iras)) {
+ /* The ill or ip_stack_t disappeared on us */
+ ip_drop_input("ip_recv_attr_from_mblk",
+ mp, NULL);
+ ira_cleanup(&iras, B_TRUE);
+ CONN_DEC_REF(connp);
+ continue;
+ }
+ ira = &iras;
+ } else {
+ ira = NULL;
+ }
+
+
/*
* Handle squeue switching. More details in the
* block comment at the top of the file
@@ -707,15 +789,17 @@ again:
connp->conn_on_sqp = B_TRUE;
DTRACE_PROBE3(squeue__proc__start, squeue_t *,
sqp, mblk_t *, mp, conn_t *, connp);
- (*proc)(connp, mp, sqp);
+ (*proc)(connp, mp, sqp, ira);
DTRACE_PROBE2(squeue__proc__end, squeue_t *,
sqp, conn_t *, connp);
connp->conn_on_sqp = B_FALSE;
CONN_DEC_REF(connp);
} else {
- SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, connp,
+ SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, connp, ira,
SQ_FILL, SQTAG_SQUEUE_CHANGE);
}
+ if (ira != NULL)
+ ira_cleanup(ira, B_TRUE);
}
SQUEUE_DBG_CLEAR(sqp);
@@ -991,9 +1075,13 @@ poll_again:
&tail, &cnt);
}
mutex_enter(lock);
- if (mp != NULL)
+ if (mp != NULL) {
+ /*
+ * The ip_accept function has already added an
+ * ip_recv_attr_t mblk if that is needed.
+ */
ENQUEUE_CHAIN(sqp, mp, tail, cnt);
-
+ }
ASSERT((sqp->sq_state &
(SQS_PROC|SQS_POLLING|SQS_GET_PKTS)) ==
(SQS_PROC|SQS_POLLING|SQS_GET_PKTS));
@@ -1263,7 +1351,7 @@ squeue_getprivate(squeue_t *sqp, sqprivate_t p)
/* ARGSUSED */
void
-squeue_wakeup_conn(void *arg, mblk_t *mp, void *arg2)
+squeue_wakeup_conn(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
{
conn_t *connp = (conn_t *)arg;
squeue_t *sqp = connp->conn_sqp;
diff --git a/usr/src/uts/common/inet/tcp.h b/usr/src/uts/common/inet/tcp.h
index 8442c4f384..321d0756fc 100644
--- a/usr/src/uts/common/inet/tcp.h
+++ b/usr/src/uts/common/inet/tcp.h
@@ -36,7 +36,6 @@ extern "C" {
#include <netinet/tcp.h>
#include <sys/socket.h>
#include <sys/socket_proto.h>
-#include <sys/multidata.h>
#include <sys/md5.h>
#include <inet/common.h>
#include <inet/ip.h>
@@ -47,12 +46,6 @@ extern "C" {
#include <inet/tcp_sack.h>
#include <inet/kssl/ksslapi.h>
-/*
- * Private (and possibly temporary) ioctl used by configuration code
- * to lock in the "default" stream for detached closes.
- */
-#define TCP_IOC_DEFAULT_Q (('T' << 8) + 51)
-
/* TCP states */
#define TCPS_CLOSED -6
#define TCPS_IDLE -5 /* idle (opened, but not bound) */
@@ -73,7 +66,7 @@ extern "C" {
/*
* Internal flags used in conjunction with the packet header flags.
- * Used in tcp_rput_data to keep track of what needs to be done.
+ * Used in tcp_input_data to keep track of what needs to be done.
*/
#define TH_LIMIT_XMIT 0x0400 /* Limited xmit is needed */
#define TH_XMIT_NEEDED 0x0800 /* Window opened - send queued data */
@@ -108,11 +101,12 @@ typedef struct tcphdr_s {
uint8_t th_urp[2]; /* Urgent pointer */
} tcph_t;
-#define TCP_HDR_LENGTH(tcph) (((tcph)->th_offset_and_rsrvd[0] >>2) &(0xF << 2))
+#define TCP_HDR_LENGTH(tcph) \
+ ((((tcph_t *)tcph)->th_offset_and_rsrvd[0] >>2) &(0xF << 2))
#define TCP_MAX_COMBINED_HEADER_LENGTH (60 + 60) /* Maxed out ip + tcp */
#define TCP_MAX_IP_OPTIONS_LENGTH (60 - IP_SIMPLE_HDR_LENGTH)
#define TCP_MAX_HDR_LENGTH 60
-#define TCP_MAX_TCP_OPTIONS_LENGTH (60 - sizeof (tcph_t))
+#define TCP_MAX_TCP_OPTIONS_LENGTH (60 - sizeof (tcpha_t))
#define TCP_MIN_HEADER_LENGTH 20
#define TCP_MAXWIN 65535
#define TCP_PORT_LEN sizeof (in_port_t)
@@ -122,7 +116,7 @@ typedef struct tcphdr_s {
#define TCPIP_HDR_LENGTH(mp, n) \
(n) = IPH_HDR_LENGTH((mp)->b_rptr), \
- (n) += TCP_HDR_LENGTH((tcph_t *)&(mp)->b_rptr[(n)])
+ (n) += TCP_HDR_LENGTH((tcpha_t *)&(mp)->b_rptr[(n)])
/* TCP Protocol header (used if the header is known to be 32-bit aligned) */
typedef struct tcphdra_s {
@@ -173,9 +167,6 @@ typedef struct tcp_s {
uint32_t tcp_rnxt; /* Seq we expect to recv next */
uint32_t tcp_rwnd;
- queue_t *tcp_rq; /* Our upstream neighbor (client) */
- queue_t *tcp_wq; /* Our downstream neighbor */
-
/* Fields arranged in approximate access order along main paths */
mblk_t *tcp_xmit_head; /* Head of rexmit list */
mblk_t *tcp_xmit_last; /* last valid data seen by tcp_wput */
@@ -207,46 +198,16 @@ typedef struct tcp_s {
int64_t tcp_last_recv_time; /* Last time we receive a segment. */
uint32_t tcp_init_cwnd; /* Initial cwnd (start/restart) */
- /*
- * Following socket options are set by sockfs outside the squeue
- * and we want to separate these bit fields from the other bit fields
- * set by TCP to avoid grabbing locks. sockfs ensures that only one
- * thread in sockfs can set a socket option at a time on a conn_t.
- * However TCP may read these options concurrently. The linger option
- * needs atomicity since tcp_lingertime also needs to be in sync.
- * However TCP uses it only during close, and by then no socket option
- * can come down. So we don't need any locks, instead just separating
- * the sockfs settable bit fields from the other bit fields is
- * sufficient.
- */
- uint32_t
- tcp_debug : 1, /* SO_DEBUG "socket" option. */
- tcp_dontroute : 1, /* SO_DONTROUTE "socket" option. */
- tcp_broadcast : 1, /* SO_BROADCAST "socket" option. */
- tcp_useloopback : 1, /* SO_USELOOPBACK "socket" option. */
-
- tcp_oobinline : 1, /* SO_OOBINLINE "socket" option. */
- tcp_dgram_errind : 1, /* SO_DGRAM_ERRIND option */
- tcp_linger : 1, /* SO_LINGER turned on */
- tcp_reuseaddr : 1, /* SO_REUSEADDR "socket" option. */
-
- tcp_junk_to_bit_31 : 24;
-
/* Following manipulated by TCP under squeue protection */
uint32_t
tcp_urp_last_valid : 1, /* Is tcp_urp_last valid? */
- tcp_hard_binding : 1, /* If we've started a full bind */
- tcp_hard_bound : 1, /* If we've done a full bind with IP */
+ tcp_hard_binding : 1, /* TCP_DETACHED_NONEAGER */
tcp_fin_acked : 1, /* Has our FIN been acked? */
-
tcp_fin_rcvd : 1, /* Have we seen a FIN? */
+
tcp_fin_sent : 1, /* Have we sent our FIN yet? */
tcp_ordrel_done : 1, /* Have we sent the ord_rel upstream? */
tcp_detached : 1, /* If we're detached from a stream */
-
- tcp_bind_pending : 1, /* Client is waiting for bind ack */
- tcp_unbind_pending : 1, /* Client sent T_UNBIND_REQ */
- tcp_ka_enabled: 1, /* Connection KeepAlive Timer needed */
tcp_zero_win_probe: 1, /* Zero win probing is in progress */
tcp_loopback: 1, /* src and dst are the same machine */
@@ -258,44 +219,40 @@ typedef struct tcp_s {
tcp_active_open: 1, /* This is a active open */
tcp_rexmit : 1, /* TCP is retransmitting */
tcp_snd_sack_ok : 1, /* Can use SACK for this connection */
- tcp_empty_flag : 1, /* Empty flag for future use */
-
- tcp_recvdstaddr : 1, /* return T_EXTCONN_IND with dst addr */
tcp_hwcksum : 1, /* The NIC is capable of hwcksum */
- tcp_ip_forward_progress : 1,
- tcp_anon_priv_bind : 1,
+ tcp_ip_forward_progress : 1,
tcp_ecn_ok : 1, /* Can use ECN for this connection */
tcp_ecn_echo_on : 1, /* Need to do ECN echo */
tcp_ecn_cwr_sent : 1, /* ECN_CWR has been sent */
+
tcp_cwr : 1, /* Cwnd has reduced recently */
- tcp_pad_to_bit31 : 4;
+ tcp_pad_to_bit31 : 11;
+
/* Following manipulated by TCP under squeue protection */
uint32_t
- tcp_mdt : 1, /* Lower layer is capable of MDT */
tcp_snd_ts_ok : 1,
tcp_snd_ws_ok : 1,
- tcp_exclbind : 1, /* ``exclusive'' binding */
-
- tcp_hdr_grown : 1,
+ tcp_reserved_port : 1,
tcp_in_free_list : 1,
- tcp_snd_zcopy_on : 1, /* xmit zero-copy enabled */
+ tcp_snd_zcopy_on : 1, /* xmit zero-copy enabled */
tcp_snd_zcopy_aware : 1, /* client is zero-copy aware */
tcp_xmit_zc_clean : 1, /* the xmit list is free of zc-mblk */
tcp_wait_for_eagers : 1, /* Wait for eagers to disappear */
- tcp_accept_error : 1, /* Error during TLI accept */
+ tcp_accept_error : 1, /* Error during TLI accept */
tcp_send_discon_ind : 1, /* TLI accept err, send discon ind */
tcp_cork : 1, /* tcp_cork option */
tcp_tconnind_started : 1, /* conn_ind message is being sent */
+
tcp_lso :1, /* Lower layer is capable of LSO */
- tcp_refuse :1, /* Connection needs refusing */
tcp_is_wnd_shrnk : 1, /* Window has shrunk */
- tcp_pad_to_bit_31 : 15;
- uint32_t tcp_if_mtu; /* Outgoing interface MTU. */
+ tcp_pad_to_bit_31 : 18;
+
+ uint32_t tcp_initial_pmtu; /* Initial outgoing Path MTU. */
mblk_t *tcp_reass_head; /* Out of order reassembly list head */
mblk_t *tcp_reass_tail; /* Out of order reassembly list tail */
@@ -340,11 +297,6 @@ typedef struct tcp_s {
struct tcp_s *tcp_listener; /* Our listener */
- size_t tcp_xmit_hiwater; /* Send buffer high water mark. */
- size_t tcp_xmit_lowater; /* Send buffer low water mark. */
- size_t tcp_recv_hiwater; /* Recv high water mark */
- size_t tcp_recv_lowater; /* Recv low water mark */
-
uint32_t tcp_irs; /* Initial recv seq num */
uint32_t tcp_fss; /* Final/fin send seq num */
uint32_t tcp_urg; /* Urgent data seq num */
@@ -354,8 +306,6 @@ typedef struct tcp_s {
clock_t tcp_first_ctimer_threshold; /* 1st threshold while connecting */
clock_t tcp_second_ctimer_threshold; /* 2nd ... while connecting */
- int tcp_lingertime; /* Close linger time (in seconds) */
-
uint32_t tcp_urp_last; /* Last urp for which signal sent */
mblk_t *tcp_urp_mp; /* T_EXDATA_IND for urgent byte */
mblk_t *tcp_urp_mark_mp; /* zero-length marked/unmarked msg */
@@ -389,21 +339,14 @@ typedef struct tcp_s {
int32_t tcp_client_errno; /* How the client screwed up */
- char *tcp_iphc; /* Buffer holding tcp/ip hdr template */
- int tcp_iphc_len; /* actual allocated buffer size */
- int32_t tcp_hdr_len; /* Byte len of combined TCP/IP hdr */
- ipha_t *tcp_ipha; /* IPv4 header in the buffer */
- ip6_t *tcp_ip6h; /* IPv6 header in the buffer */
- int tcp_ip_hdr_len; /* Byte len of our current IPvx hdr */
- tcph_t *tcp_tcph; /* tcp header within combined hdr */
- int32_t tcp_tcp_hdr_len; /* tcp header len within combined */
- /* Saved peer headers in the case of re-fusion */
- ipha_t tcp_saved_ipha;
- ip6_t tcp_saved_ip6h;
- tcph_t tcp_saved_tcph;
-
- uint32_t tcp_sum; /* checksum to compensate for source */
- /* routed packets. Host byte order */
+ /*
+ * The header template lives in conn_ht_iphc allocated by tcp_build_hdrs
+ * We maintain three pointers into conn_ht_iphc.
+ */
+ ipha_t *tcp_ipha; /* IPv4 header in conn_ht_iphc */
+ ip6_t *tcp_ip6h; /* IPv6 header in conn_ht_iphc */
+ tcpha_t *tcp_tcpha; /* TCP header in conn_ht_iphc */
+
uint16_t tcp_last_sent_len; /* Record length for nagle */
uint16_t tcp_dupack_cnt; /* # of consequtive duplicate acks */
@@ -413,75 +356,20 @@ typedef struct tcp_s {
t_uscalar_t tcp_acceptor_id; /* ACCEPTOR_id */
int tcp_ipsec_overhead;
- /*
- * Address family that app wishes returned addrsses to be in.
- * Currently taken from address family used in T_BIND_REQ, but
- * should really come from family used in original socket() call.
- * Value can be AF_INET or AF_INET6.
- */
- uint_t tcp_family;
- /*
- * used for a quick test to determine if any ancillary bits are
- * set
- */
- uint_t tcp_ipv6_recvancillary; /* Flags */
-#define TCP_IPV6_RECVPKTINFO 0x01 /* IPV6_RECVPKTINFO option */
-#define TCP_IPV6_RECVHOPLIMIT 0x02 /* IPV6_RECVHOPLIMIT option */
-#define TCP_IPV6_RECVHOPOPTS 0x04 /* IPV6_RECVHOPOPTS option */
-#define TCP_IPV6_RECVDSTOPTS 0x08 /* IPV6_RECVDSTOPTS option */
-#define TCP_IPV6_RECVRTHDR 0x10 /* IPV6_RECVRTHDR option */
-#define TCP_IPV6_RECVRTDSTOPTS 0x20 /* IPV6_RECVRTHDRDSTOPTS option */
-#define TCP_IPV6_RECVTCLASS 0x40 /* IPV6_RECVTCLASS option */
-#define TCP_OLD_IPV6_RECVDSTOPTS 0x80 /* old IPV6_RECVDSTOPTS option */
uint_t tcp_recvifindex; /* Last received IPV6_RCVPKTINFO */
uint_t tcp_recvhops; /* Last received IPV6_RECVHOPLIMIT */
uint_t tcp_recvtclass; /* Last received IPV6_RECVTCLASS */
ip6_hbh_t *tcp_hopopts; /* Last received IPV6_RECVHOPOPTS */
ip6_dest_t *tcp_dstopts; /* Last received IPV6_RECVDSTOPTS */
- ip6_dest_t *tcp_rtdstopts; /* Last recvd IPV6_RECVRTHDRDSTOPTS */
+ ip6_dest_t *tcp_rthdrdstopts; /* Last recv IPV6_RECVRTHDRDSTOPTS */
ip6_rthdr_t *tcp_rthdr; /* Last received IPV6_RECVRTHDR */
uint_t tcp_hopoptslen;
uint_t tcp_dstoptslen;
- uint_t tcp_rtdstoptslen;
+ uint_t tcp_rthdrdstoptslen;
uint_t tcp_rthdrlen;
mblk_t *tcp_timercache;
- cred_t *tcp_cred; /* Credentials when this was opened */
- pid_t tcp_cpid; /* Process id when this was opened */
- uint64_t tcp_open_time; /* time when this was opened */
-
-
- union {
- struct {
- uchar_t v4_ttl;
- /* Dup of tcp_ipha.iph_type_of_service */
- uchar_t v4_tos; /* Dup of tcp_ipha.iph_ttl */
- } v4_hdr_info;
- struct {
- uint_t v6_vcf; /* Dup of tcp_ip6h.ip6h_vcf */
- uchar_t v6_hops; /* Dup of tcp_ip6h.ip6h_hops */
- } v6_hdr_info;
- } tcp_hdr_info;
-#define tcp_ttl tcp_hdr_info.v4_hdr_info.v4_ttl
-#define tcp_tos tcp_hdr_info.v4_hdr_info.v4_tos
-#define tcp_ip6_vcf tcp_hdr_info.v6_hdr_info.v6_vcf
-#define tcp_ip6_hops tcp_hdr_info.v6_hdr_info.v6_hops
-
- ushort_t tcp_ipversion;
- uint_t tcp_bound_if; /* IPV6_BOUND_IF */
-
-#define tcp_lport tcp_connp->conn_lport
-#define tcp_fport tcp_connp->conn_fport
-#define tcp_ports tcp_connp->conn_ports
-
-#define tcp_remote tcp_connp->conn_rem
-#define tcp_ip_src tcp_connp->conn_src
-
-#define tcp_remote_v6 tcp_connp->conn_remv6
-#define tcp_ip_src_v6 tcp_connp->conn_srcv6
-#define tcp_bound_source_v6 tcp_connp->conn_bound_source_v6
-#define tcp_bound_source tcp_connp->conn_bound_source
kmutex_t tcp_closelock;
kcondvar_t tcp_closecv;
@@ -497,36 +385,13 @@ typedef struct tcp_s {
struct tcp_s *tcp_bind_hash_port; /* tcp_t's bound to the same lport */
struct tcp_s **tcp_ptpbhn;
- boolean_t tcp_ire_ill_check_done;
- uint_t tcp_maxpsz;
-
- /*
- * used for Multidata Transmit
- */
- uint_t tcp_mdt_hdr_head; /* leading header fragment extra space */
- uint_t tcp_mdt_hdr_tail; /* trailing header fragment extra space */
- int tcp_mdt_max_pld; /* maximum payload buffers per Multidata */
+ uint_t tcp_maxpsz_multiplier;
uint32_t tcp_lso_max; /* maximum LSO payload */
uint32_t tcp_ofo_fin_seq; /* Recv out of order FIN seq num */
uint32_t tcp_cwr_snd_max;
- uint_t tcp_drop_opt_ack_cnt; /* # tcp generated optmgmt */
- ip6_pkt_t tcp_sticky_ipp; /* Sticky options */
-#define tcp_ipp_fields tcp_sticky_ipp.ipp_fields /* valid fields */
-#define tcp_ipp_ifindex tcp_sticky_ipp.ipp_ifindex /* pktinfo ifindex */
-#define tcp_ipp_addr tcp_sticky_ipp.ipp_addr /* pktinfo src/dst addr */
-#define tcp_ipp_hoplimit tcp_sticky_ipp.ipp_hoplimit
-#define tcp_ipp_hopoptslen tcp_sticky_ipp.ipp_hopoptslen
-#define tcp_ipp_rtdstoptslen tcp_sticky_ipp.ipp_rtdstoptslen
-#define tcp_ipp_rthdrlen tcp_sticky_ipp.ipp_rthdrlen
-#define tcp_ipp_dstoptslen tcp_sticky_ipp.ipp_dstoptslen
-#define tcp_ipp_hopopts tcp_sticky_ipp.ipp_hopopts
-#define tcp_ipp_rtdstopts tcp_sticky_ipp.ipp_rtdstopts
-#define tcp_ipp_rthdr tcp_sticky_ipp.ipp_rthdr
-#define tcp_ipp_dstopts tcp_sticky_ipp.ipp_dstopts
-#define tcp_ipp_nexthop tcp_sticky_ipp.ipp_nexthop
-#define tcp_ipp_use_min_mtu tcp_sticky_ipp.ipp_use_min_mtu
+
struct tcp_s *tcp_saved_listener; /* saved value of listener */
uint32_t tcp_in_ack_unsent; /* ACK for unsent data cnt. */
@@ -562,7 +427,6 @@ typedef struct tcp_s {
boolean_t tcp_kssl_inhandshake; /* during SSL handshake */
kssl_ent_t tcp_kssl_ent; /* SSL table entry */
kssl_ctx_t tcp_kssl_ctx; /* SSL session */
- uint_t tcp_label_len; /* length of cached label */
/*
* tcp_closemp_used is protected by listener's tcp_eager_lock
@@ -620,47 +484,17 @@ typedef struct tcp_s {
#define TCP_DEBUG_GETPCSTACK(buffer, depth)
#endif
-/*
- * Track a reference count on the tcps in order to know when
- * the tcps_g_q can be removed. As long as there is any
- * tcp_t, other that the tcps_g_q itself, in the tcp_stack_t we
- * need to keep tcps_g_q around so that a closing connection can
- * switch to using tcps_g_q as part of it closing.
- */
-#define TCPS_REFHOLD(tcps) { \
- atomic_add_32(&(tcps)->tcps_refcnt, 1); \
- ASSERT((tcps)->tcps_refcnt != 0); \
- DTRACE_PROBE1(tcps__refhold, tcp_stack_t, tcps); \
-}
-
-/*
- * Decrement the reference count on the tcp_stack_t.
- * In architectures e.g sun4u, where atomic_add_32_nv is just
- * a cas, we need to maintain the right memory barrier semantics
- * as that of mutex_exit i.e all the loads and stores should complete
- * before the cas is executed. membar_exit() does that here.
- */
-#define TCPS_REFRELE(tcps) { \
- ASSERT((tcps)->tcps_refcnt != 0); \
- membar_exit(); \
- DTRACE_PROBE1(tcps__refrele, tcp_stack_t, tcps); \
- if (atomic_add_32_nv(&(tcps)->tcps_refcnt, -1) == 0 && \
- (tcps)->tcps_g_q != NULL) { \
- /* Only tcps_g_q left */ \
- tcp_g_q_inactive(tcps); \
- } \
-}
-
extern void tcp_free(tcp_t *tcp);
extern void tcp_ddi_g_init(void);
extern void tcp_ddi_g_destroy(void);
-extern void tcp_g_q_inactive(tcp_stack_t *);
-extern void tcp_xmit_listeners_reset(mblk_t *mp, uint_t ip_hdr_len,
- zoneid_t zoneid, tcp_stack_t *, conn_t *connp);
-extern void tcp_conn_request(void *arg, mblk_t *mp, void *arg2);
-extern void tcp_conn_request_unbound(void *arg, mblk_t *mp, void *arg2);
-extern void tcp_input(void *arg, mblk_t *mp, void *arg2);
-extern void tcp_rput_data(void *arg, mblk_t *mp, void *arg2);
+extern void tcp_xmit_listeners_reset(mblk_t *, ip_recv_attr_t *,
+ ip_stack_t *, conn_t *);
+extern void tcp_input_listener(void *arg, mblk_t *mp, void *arg2,
+ ip_recv_attr_t *);
+extern void tcp_input_listener_unbound(void *arg, mblk_t *mp, void *arg2,
+ ip_recv_attr_t *);
+extern void tcp_input_data(void *arg, mblk_t *mp, void *arg2,
+ ip_recv_attr_t *);
extern void *tcp_get_conn(void *arg, tcp_stack_t *);
extern void tcp_time_wait_collector(void *arg);
extern mblk_t *tcp_snmp_get(queue_t *, mblk_t *);
@@ -668,7 +502,6 @@ extern int tcp_snmp_set(queue_t *, int, int, uchar_t *, int len);
extern mblk_t *tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send,
int32_t *offset, mblk_t **end_mp, uint32_t seq,
boolean_t sendall, uint32_t *seg_len, boolean_t rexmit);
-extern void tcp_xmit_reset(void *arg, mblk_t *mp, void *arg2);
/*
* The TCP Fanout structure.
@@ -706,6 +539,15 @@ typedef struct cl_tcp_info_s {
} cl_tcp_info_t;
/*
+ * Hook functions to enable cluster networking
+ * On non-clustered systems these vectors must always be NULL.
+ */
+extern void (*cl_inet_listen)(netstackid_t, uint8_t, sa_family_t,
+ uint8_t *, in_port_t, void *);
+extern void (*cl_inet_unlisten)(netstackid_t, uint8_t, sa_family_t,
+ uint8_t *, in_port_t, void *);
+
+/*
* Contracted Consolidation Private ioctl for aborting TCP connections.
* In order to keep the offsets and size of the structure the same between
* a 32-bit application and a 64-bit amd64 kernel, we use a #pragma
@@ -729,25 +571,6 @@ typedef struct tcp_ioc_abort_conn_s {
#pragma pack()
#endif
-#if (defined(_KERNEL) || defined(_KMEMUSER))
-extern void tcp_rput_other(tcp_t *tcp, mblk_t *mp);
-#endif
-
-#if (defined(_KERNEL))
-#define TCP_XRE_EVENT_IP_FANOUT_TCP 1
-
-/*
- * This is a private structure used to pass data to an squeue function during
- * tcp's listener reset sending path.
- */
-typedef struct tcp_xmit_reset_event {
- int tcp_xre_event;
- int tcp_xre_iphdrlen;
- zoneid_t tcp_xre_zoneid;
- tcp_stack_t *tcp_xre_tcps;
-} tcp_xmit_reset_event_t;
-#endif
-
#ifdef __cplusplus
}
#endif
diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c
index c9a941eab2..0e1ef43cfb 100644
--- a/usr/src/uts/common/inet/tcp/tcp.c
+++ b/usr/src/uts/common/inet/tcp/tcp.c
@@ -46,8 +46,6 @@
#include <sys/ethernet.h>
#include <sys/cpuvar.h>
#include <sys/dlpi.h>
-#include <sys/multidata.h>
-#include <sys/multidata_impl.h>
#include <sys/pattr.h>
#include <sys/policy.h>
#include <sys/priv.h>
@@ -87,7 +85,6 @@
#include <inet/tcp_impl.h>
#include <inet/udp_impl.h>
#include <net/pfkeyv2.h>
-#include <inet/ipsec_info.h>
#include <inet/ipdrop.h>
#include <inet/ipclassifier.h>
@@ -95,6 +92,7 @@
#include <inet/ip_ftable.h>
#include <inet/ip_if.h>
#include <inet/ipp_common.h>
+#include <inet/ip_rts.h>
#include <inet/ip_netinfo.h>
#include <sys/squeue_impl.h>
#include <sys/squeue.h>
@@ -111,7 +109,7 @@
*
* The entire tcp state is contained in tcp_t and conn_t structure
* which are allocated in tandem using ipcl_conn_create() and passing
- * IPCL_CONNTCP as a flag. We use 'conn_ref' and 'conn_lock' to protect
+ * IPCL_TCPCONN as a flag. We use 'conn_ref' and 'conn_lock' to protect
* the references on the tcp_t. The tcp_t structure is never compressed
* and packets always land on the correct TCP perimeter from the time
* eager is created till the time tcp_t dies (as such the old mentat
@@ -172,8 +170,8 @@
*
* This is a more interesting case because of various races involved in
* establishing a eager in its own perimeter. Read the meta comment on
- * top of tcp_conn_request(). But briefly, the squeue is picked by
- * ip_tcp_input()/ip_fanout_tcp_v6() based on the interrupted CPU.
+ * top of tcp_input_listener(). But briefly, the squeue is picked by
+ * ip_fanout based on the ring or the sender (if loopback).
*
* Closing a connection:
*
@@ -198,20 +196,13 @@
*
* Special provisions and fast paths:
*
- * We make special provision for (AF_INET, SOCK_STREAM) sockets which
- * can't have 'ipv6_recvpktinfo' set and for these type of sockets, IP
- * will never send a M_CTL to TCP. As such, ip_tcp_input() which handles
- * all TCP packets from the wire makes a IPCL_IS_TCP4_CONNECTED_NO_POLICY
- * check to send packets directly to tcp_rput_data via squeue. Everyone
- * else comes through tcp_input() on the read side.
- *
- * We also make special provisions for sockfs by marking tcp_issocket
+ * We make special provisions for sockfs by marking tcp_issocket
* whenever we have only sockfs on top of TCP. This allows us to skip
* putting the tcp in acceptor hash since a sockfs listener can never
* become acceptor and also avoid allocating a tcp_t for acceptor STREAM
* since eager has already been allocated and the accept now happens
* on acceptor STREAM. There is a big blob of comment on top of
- * tcp_conn_request explaining the new accept. When socket is POP'd,
+ * tcp_input_listener explaining the new accept. When socket is POP'd,
* sockfs sends us an ioctl to mark the fact and we go back to old
* behaviour. Once tcp_issocket is unset, its never set for the
* life of that connection.
@@ -224,13 +215,6 @@
* only exception is tcp_xmit_listeners_reset() which is called
* directly from IP and needs to policy check to see if TH_RST
* can be sent out.
- *
- * PFHooks notes :
- *
- * For mdt case, one meta buffer contains multiple packets. Mblks for every
- * packet are assembled and passed to the hooks. When packets are blocked,
- * or boundary of any packet is changed, the mdt processing is stopped, and
- * packets of the meta buffer are send to the IP path one by one.
*/
/*
@@ -244,7 +228,7 @@ int tcp_squeue_flag;
/*
* This controls how tiny a write must be before we try to copy it
- * into the the mblk on the tail of the transmit queue. Not much
+ * into the mblk on the tail of the transmit queue. Not much
* speedup is observed for values larger than sixteen. Zero will
* disable the optimisation.
*/
@@ -333,16 +317,6 @@ static uint_t tcp_clean_death_stat[TCP_MAX_CLEAN_DEATH_TAG];
tcp_g_stat_t tcp_g_statistics;
kstat_t *tcp_g_kstat;
-/*
- * Call either ip_output or ip_output_v6. This replaces putnext() calls on the
- * tcp write side.
- */
-#define CALL_IP_WPUT(connp, q, mp) { \
- ASSERT(((q)->q_flag & QREADR) == 0); \
- TCP_DBGSTAT(connp->conn_netstack->netstack_tcp, tcp_ip_output); \
- connp->conn_send(connp, (mp), (q), IP_WPUT); \
-}
-
/* Macros for timestamp comparisons */
#define TSTMP_GEQ(a, b) ((int32_t)((a)-(b)) >= 0)
#define TSTMP_LT(a, b) ((int32_t)((a)-(b)) < 0)
@@ -354,7 +328,7 @@ kstat_t *tcp_g_kstat;
* nanoseconds (versus every 4 microseconds suggested by RFC 793, page 27);
* a per-connection component which grows by 125000 for every new connection;
* and an "extra" component that grows by a random amount centered
- * approximately on 64000. This causes the the ISS generator to cycle every
+ * approximately on 64000. This causes the ISS generator to cycle every
* 4.89 hours if no TCP connections are made, and faster if connections are
* made.
*
@@ -381,8 +355,13 @@ static sin6_t sin6_null; /* Zero address for quick clears */
*/
#define TCP_OLD_URP_INTERPRETATION 1
+/*
+ * Since tcp_listener is not cleared atomically with tcp_detached
+ * being cleared we need this extra bit to tell a detached connection
+ * apart from one that is in the process of being accepted.
+ */
#define TCP_IS_DETACHED_NONEAGER(tcp) \
- (TCP_IS_DETACHED(tcp) && \
+ (TCP_IS_DETACHED(tcp) && \
(!(tcp)->tcp_hard_binding))
/*
@@ -495,7 +474,6 @@ typedef struct tcp_timer_s {
static kmem_cache_t *tcp_timercache;
kmem_cache_t *tcp_sack_info_cache;
-kmem_cache_t *tcp_iphc_cache;
/*
* For scalability, we must not run a timer for every TCP connection
@@ -592,17 +570,6 @@ typedef struct tcp_opt_s {
} tcp_opt_t;
/*
- * TCP option struct passing information b/w lisenter and eager.
- */
-struct tcp_options {
- uint_t to_flags;
- ssize_t to_boundif; /* IPV6_BOUND_IF */
-};
-
-#define TCPOPT_BOUNDIF 0x00000001 /* set IPV6_BOUND_IF */
-#define TCPOPT_RECVPKTINFO 0x00000002 /* set IPV6_RECVPKTINFO */
-
-/*
* RFC1323-recommended phrasing of TSTAMP option, for easier parsing
*/
@@ -673,43 +640,53 @@ typedef struct tcpt_s {
/*
* Functions called directly via squeue having a prototype of edesc_t.
*/
-void tcp_conn_request(void *arg, mblk_t *mp, void *arg2);
-static void tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2);
-void tcp_accept_finish(void *arg, mblk_t *mp, void *arg2);
-static void tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2);
-static void tcp_wput_proto(void *arg, mblk_t *mp, void *arg2);
-void tcp_input(void *arg, mblk_t *mp, void *arg2);
-void tcp_rput_data(void *arg, mblk_t *mp, void *arg2);
-static void tcp_close_output(void *arg, mblk_t *mp, void *arg2);
-void tcp_output(void *arg, mblk_t *mp, void *arg2);
-void tcp_output_urgent(void *arg, mblk_t *mp, void *arg2);
-static void tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2);
-static void tcp_timer_handler(void *arg, mblk_t *mp, void *arg2);
-static void tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2);
+void tcp_input_listener(void *arg, mblk_t *mp, void *arg2,
+ ip_recv_attr_t *ira);
+static void tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2,
+ ip_recv_attr_t *dummy);
+void tcp_accept_finish(void *arg, mblk_t *mp, void *arg2,
+ ip_recv_attr_t *dummy);
+static void tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2,
+ ip_recv_attr_t *dummy);
+static void tcp_wput_proto(void *arg, mblk_t *mp, void *arg2,
+ ip_recv_attr_t *dummy);
+void tcp_input_data(void *arg, mblk_t *mp, void *arg2,
+ ip_recv_attr_t *ira);
+static void tcp_close_output(void *arg, mblk_t *mp, void *arg2,
+ ip_recv_attr_t *dummy);
+void tcp_output(void *arg, mblk_t *mp, void *arg2,
+ ip_recv_attr_t *dummy);
+void tcp_output_urgent(void *arg, mblk_t *mp, void *arg2,
+ ip_recv_attr_t *dummy);
+static void tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2,
+ ip_recv_attr_t *dummy);
+static void tcp_timer_handler(void *arg, mblk_t *mp, void *arg2,
+ ip_recv_attr_t *dummy);
+static void tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2,
+ ip_recv_attr_t *dummy);
/* Prototype for TCP functions */
static void tcp_random_init(void);
int tcp_random(void);
static void tcp_tli_accept(tcp_t *tcp, mblk_t *mp);
-static int tcp_accept_swap(tcp_t *listener, tcp_t *acceptor,
+static void tcp_accept_swap(tcp_t *listener, tcp_t *acceptor,
tcp_t *eager);
-static int tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp);
+static int tcp_set_destination(tcp_t *tcp);
static in_port_t tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
int reuseaddr, boolean_t quick_connect, boolean_t bind_to_req_port_only,
boolean_t user_specified);
static void tcp_closei_local(tcp_t *tcp);
static void tcp_close_detached(tcp_t *tcp);
-static boolean_t tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph,
- mblk_t *idmp, mblk_t **defermp);
+static boolean_t tcp_conn_con(tcp_t *tcp, uchar_t *iphdr,
+ mblk_t *idmp, mblk_t **defermp, ip_recv_attr_t *ira);
static void tcp_tpi_connect(tcp_t *tcp, mblk_t *mp);
static int tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp,
- in_port_t dstport, uint_t srcid, cred_t *cr, pid_t pid);
-static int tcp_connect_ipv6(tcp_t *tcp, in6_addr_t *dstaddrp,
- in_port_t dstport, uint32_t flowinfo, uint_t srcid,
- uint32_t scope_id, cred_t *cr, pid_t pid);
+ in_port_t dstport, uint_t srcid);
+static int tcp_connect_ipv6(tcp_t *tcp, in6_addr_t *dstaddrp,
+ in_port_t dstport, uint32_t flowinfo,
+ uint_t srcid, uint32_t scope_id);
static int tcp_clean_death(tcp_t *tcp, int err, uint8_t tag);
-static void tcp_def_q_set(tcp_t *tcp, mblk_t *mp);
static void tcp_disconnect(tcp_t *tcp, mblk_t *mp);
static char *tcp_display(tcp_t *tcp, char *, char);
static boolean_t tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum);
@@ -735,34 +712,16 @@ static void tcp_acceptor_hash_remove(tcp_t *tcp);
static void tcp_capability_req(tcp_t *tcp, mblk_t *mp);
static void tcp_info_req(tcp_t *tcp, mblk_t *mp);
static void tcp_addr_req(tcp_t *tcp, mblk_t *mp);
-static void tcp_addr_req_ipv6(tcp_t *tcp, mblk_t *mp);
-void tcp_g_q_setup(tcp_stack_t *);
-void tcp_g_q_create(tcp_stack_t *);
-void tcp_g_q_destroy(tcp_stack_t *);
-static int tcp_header_init_ipv4(tcp_t *tcp);
-static int tcp_header_init_ipv6(tcp_t *tcp);
-int tcp_init(tcp_t *tcp, queue_t *q);
-static int tcp_init_values(tcp_t *tcp);
-static mblk_t *tcp_ip_advise_mblk(void *addr, int addr_len, ipic_t **ipic);
-static void tcp_ip_ire_mark_advice(tcp_t *tcp);
+static void tcp_init_values(tcp_t *tcp);
static void tcp_ip_notify(tcp_t *tcp);
-static mblk_t *tcp_ire_mp(mblk_t **mpp);
static void tcp_iss_init(tcp_t *tcp);
static void tcp_keepalive_killer(void *arg);
-static int tcp_parse_options(tcph_t *tcph, tcp_opt_t *tcpopt);
-static void tcp_mss_set(tcp_t *tcp, uint32_t size, boolean_t do_ss);
+static int tcp_parse_options(tcpha_t *tcpha, tcp_opt_t *tcpopt);
+static void tcp_mss_set(tcp_t *tcp, uint32_t size);
static int tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp,
int *do_disconnectp, int *t_errorp, int *sys_errorp);
static boolean_t tcp_allow_connopt_set(int level, int name);
int tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr);
-int tcp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr);
-int tcp_tpi_opt_set(queue_t *q, uint_t optset_context, int level,
- int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp,
- uchar_t *outvalp, void *thisdg_attrs, cred_t *cr,
- mblk_t *mblk);
-static void tcp_opt_reverse(tcp_t *tcp, ipha_t *ipha);
-static int tcp_opt_set_header(tcp_t *tcp, boolean_t checkonly,
- uchar_t *ptr, uint_t len);
static int tcp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr);
static boolean_t tcp_param_register(IDP *ndp, tcpparam_t *tcppa, int cnt,
tcp_stack_t *);
@@ -785,9 +744,9 @@ static uint_t tcp_rcv_drain(tcp_t *tcp);
static void tcp_sack_rxmit(tcp_t *tcp, uint_t *flags);
static boolean_t tcp_send_rst_chk(tcp_stack_t *);
static void tcp_ss_rexmit(tcp_t *tcp);
-static mblk_t *tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp);
-static void tcp_process_options(tcp_t *, tcph_t *);
-static void tcp_rput_common(tcp_t *tcp, mblk_t *mp);
+static mblk_t *tcp_input_add_ancillary(tcp_t *tcp, mblk_t *mp, ip_pkt_t *ipp,
+ ip_recv_attr_t *);
+static void tcp_process_options(tcp_t *, tcpha_t *);
static void tcp_rsrv(queue_t *q);
static int tcp_snmp_state(tcp_t *tcp);
static void tcp_timer(void *arg);
@@ -801,16 +760,10 @@ void tcp_tpi_accept(queue_t *q, mblk_t *mp);
static void tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent);
static void tcp_wput_flush(tcp_t *tcp, mblk_t *mp);
static void tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp);
-static int tcp_send(queue_t *q, tcp_t *tcp, const int mss,
- const int tcp_hdr_len, const int tcp_tcp_hdr_len,
+static int tcp_send(tcp_t *tcp, const int mss,
+ const int total_hdr_len, const int tcp_hdr_len,
const int num_sack_blk, int *usable, uint_t *snxt,
- int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time,
- const int mdt_thres);
-static int tcp_multisend(queue_t *q, tcp_t *tcp, const int mss,
- const int tcp_hdr_len, const int tcp_tcp_hdr_len,
- const int num_sack_blk, int *usable, uint_t *snxt,
- int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time,
- const int mdt_thres);
+ int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time);
static void tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now,
int num_sack_blk);
static void tcp_wsrv(queue_t *q);
@@ -818,38 +771,36 @@ static int tcp_xmit_end(tcp_t *tcp);
static void tcp_ack_timer(void *arg);
static mblk_t *tcp_ack_mp(tcp_t *tcp);
static void tcp_xmit_early_reset(char *str, mblk_t *mp,
- uint32_t seq, uint32_t ack, int ctl, uint_t ip_hdr_len,
- zoneid_t zoneid, tcp_stack_t *, conn_t *connp);
+ uint32_t seq, uint32_t ack, int ctl, ip_recv_attr_t *,
+ ip_stack_t *, conn_t *);
static void tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq,
uint32_t ack, int ctl);
-static int setmaxps(queue_t *q, int maxpsz);
static void tcp_set_rto(tcp_t *, time_t);
-static boolean_t tcp_check_policy(tcp_t *, mblk_t *, ipha_t *, ip6_t *,
- boolean_t, boolean_t);
-static void tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp,
- boolean_t ipsec_mctl);
+static void tcp_icmp_input(void *, mblk_t *, void *, ip_recv_attr_t *);
+static void tcp_icmp_error_ipv6(tcp_t *, mblk_t *, ip_recv_attr_t *);
+static boolean_t tcp_verifyicmp(conn_t *, void *, icmph_t *, icmp6_t *,
+ ip_recv_attr_t *);
static int tcp_build_hdrs(tcp_t *);
static void tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp,
- uint32_t seg_seq, uint32_t seg_ack, int seg_len,
- tcph_t *tcph);
-boolean_t tcp_paws_check(tcp_t *tcp, tcph_t *tcph, tcp_opt_t *tcpoptp);
-static mblk_t *tcp_mdt_info_mp(mblk_t *);
-static void tcp_mdt_update(tcp_t *, ill_mdt_capab_t *, boolean_t);
-static int tcp_mdt_add_attrs(multidata_t *, const mblk_t *,
- const boolean_t, const uint32_t, const uint32_t,
- const uint32_t, const uint32_t, tcp_stack_t *);
-static void tcp_multisend_data(tcp_t *, ire_t *, const ill_t *, mblk_t *,
- const uint_t, const uint_t, boolean_t *);
-static mblk_t *tcp_lso_info_mp(mblk_t *);
-static void tcp_lso_update(tcp_t *, ill_lso_capab_t *);
-static void tcp_send_data(tcp_t *, queue_t *, mblk_t *);
+ uint32_t seg_seq, uint32_t seg_ack, int seg_len, tcpha_t *tcpha,
+ ip_recv_attr_t *ira);
+boolean_t tcp_paws_check(tcp_t *tcp, tcpha_t *tcpha, tcp_opt_t *tcpoptp);
+static boolean_t tcp_zcopy_check(tcp_t *);
+static void tcp_zcopy_notify(tcp_t *);
+static mblk_t *tcp_zcopy_backoff(tcp_t *, mblk_t *, boolean_t);
+static void tcp_update_lso(tcp_t *tcp, ip_xmit_attr_t *ixa);
+static void tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only);
+static void tcp_update_zcopy(tcp_t *tcp);
+static void tcp_notify(void *, ip_xmit_attr_t *, ixa_notify_type_t,
+ ixa_notify_arg_t);
+static void tcp_rexmit_after_error(tcp_t *tcp);
+static void tcp_send_data(tcp_t *, mblk_t *);
extern mblk_t *tcp_timermp_alloc(int);
extern void tcp_timermp_free(tcp_t *);
static void tcp_timer_free(tcp_t *tcp, mblk_t *mp);
static void tcp_stop_lingering(tcp_t *tcp);
static void tcp_close_linger_timeout(void *arg);
static void *tcp_stack_init(netstackid_t stackid, netstack_t *ns);
-static void tcp_stack_shutdown(netstackid_t stackid, void *arg);
static void tcp_stack_fini(netstackid_t stackid, void *arg);
static void *tcp_g_kstat_init(tcp_g_stat_t *);
static void tcp_g_kstat_fini(kstat_t *);
@@ -858,11 +809,10 @@ static void tcp_kstat_fini(netstackid_t, kstat_t *);
static void *tcp_kstat2_init(netstackid_t, tcp_stat_t *);
static void tcp_kstat2_fini(netstackid_t, kstat_t *);
static int tcp_kstat_update(kstat_t *kp, int rw);
-void tcp_reinput(conn_t *connp, mblk_t *mp, squeue_t *sqp);
-static int tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp,
- tcph_t *tcph, uint_t ipvers, mblk_t *idmp);
-static int tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, ipha_t *ipha,
- tcph_t *tcph, mblk_t *idmp);
+static mblk_t *tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp,
+ ip_recv_attr_t *ira);
+static mblk_t *tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, mblk_t *mp,
+ ip_recv_attr_t *ira);
static int tcp_squeue_switch(int);
static int tcp_open(queue_t *, dev_t *, int, int, cred_t *, boolean_t);
@@ -872,21 +822,17 @@ static int tcp_tpi_close(queue_t *, int);
static int tcp_tpi_close_accept(queue_t *);
static void tcp_squeue_add(squeue_t *);
-static boolean_t tcp_zcopy_check(tcp_t *);
-static void tcp_zcopy_notify(tcp_t *);
-static mblk_t *tcp_zcopy_disable(tcp_t *, mblk_t *);
-static mblk_t *tcp_zcopy_backoff(tcp_t *, mblk_t *, int);
-static void tcp_ire_ill_check(tcp_t *, ire_t *, ill_t *, boolean_t);
+static void tcp_setcred_data(mblk_t *, ip_recv_attr_t *);
-extern void tcp_kssl_input(tcp_t *, mblk_t *);
+extern void tcp_kssl_input(tcp_t *, mblk_t *, cred_t *);
-void tcp_eager_kill(void *arg, mblk_t *mp, void *arg2);
-void tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2);
+void tcp_eager_kill(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy);
+void tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2,
+ ip_recv_attr_t *dummy);
static int tcp_accept(sock_lower_handle_t, sock_lower_handle_t,
sock_upper_handle_t, cred_t *);
static int tcp_listen(sock_lower_handle_t, int, cred_t *);
-static int tcp_post_ip_bind(tcp_t *, mblk_t *, int, cred_t *, pid_t);
static int tcp_do_listen(conn_t *, struct sockaddr *, socklen_t, int, cred_t *,
boolean_t);
static int tcp_do_connect(conn_t *, const struct sockaddr *, socklen_t,
@@ -922,7 +868,8 @@ static void tcp_ulp_newconn(conn_t *, conn_t *, mblk_t *);
*/
static mblk_t *tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *, tcp_t *);
static void tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *);
-static void tcp_ioctl_abort_handler(tcp_t *, mblk_t *);
+static void tcp_ioctl_abort_handler(void *arg, mblk_t *mp, void *arg2,
+ ip_recv_attr_t *dummy);
static int tcp_ioctl_abort(tcp_ioc_abort_conn_t *, tcp_stack_t *tcps);
static void tcp_ioctl_abort_conn(queue_t *, mblk_t *);
static int tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *, int, int *,
@@ -988,12 +935,6 @@ struct streamtab tcpinfov6 = {
sock_downcalls_t sock_tcp_downcalls;
-/*
- * Have to ensure that tcp_g_q_close is not done by an
- * interrupt thread.
- */
-static taskq_t *tcp_taskq;
-
/* Setable only in /etc/system. Move to ndd? */
boolean_t tcp_icmp_source_quench = B_FALSE;
@@ -1042,8 +983,8 @@ static struct T_info_ack tcp_g_t_info_ack_v6 = {
#define PARAM_MAX (~(uint32_t)0)
/* Max size IP datagram is 64k - 1 */
-#define TCP_MSS_MAX_IPV4 (IP_MAXPACKET - (sizeof (ipha_t) + sizeof (tcph_t)))
-#define TCP_MSS_MAX_IPV6 (IP_MAXPACKET - (sizeof (ip6_t) + sizeof (tcph_t)))
+#define TCP_MSS_MAX_IPV4 (IP_MAXPACKET - (sizeof (ipha_t) + sizeof (tcpha_t)))
+#define TCP_MSS_MAX_IPV6 (IP_MAXPACKET - (sizeof (ip6_t) + sizeof (tcpha_t)))
/* Max of the above */
#define TCP_MSS_MAX TCP_MSS_MAX_IPV4
@@ -1128,29 +1069,10 @@ static tcpparam_t lcl_tcp_param_arr[] = {
{ 0, 100*MS, 50*MS, "tcp_push_timer_interval"},
{ 0, 1, 0, "tcp_use_smss_as_mss_opt"},
{ 0, PARAM_MAX, 8*MINUTES, "tcp_keepalive_abort_interval"},
+ { 0, 1, 0, "tcp_dev_flow_ctl"},
};
/* END CSTYLED */
-/*
- * tcp_mdt_hdr_{head,tail}_min are the leading and trailing spaces of
- * each header fragment in the header buffer. Each parameter value has
- * to be a multiple of 4 (32-bit aligned).
- */
-static tcpparam_t lcl_tcp_mdt_head_param =
- { 32, 256, 32, "tcp_mdt_hdr_head_min" };
-static tcpparam_t lcl_tcp_mdt_tail_param =
- { 0, 256, 32, "tcp_mdt_hdr_tail_min" };
-#define tcps_mdt_hdr_head_min tcps_mdt_head_param->tcp_param_val
-#define tcps_mdt_hdr_tail_min tcps_mdt_tail_param->tcp_param_val
-
-/*
- * tcp_mdt_max_pbufs is the upper limit value that tcp uses to figure out
- * the maximum number of payload buffers associated per Multidata.
- */
-static tcpparam_t lcl_tcp_mdt_max_pbufs_param =
- { 1, MULTIDATA_MAX_PBUFS, MULTIDATA_MAX_PBUFS, "tcp_mdt_max_pbufs" };
-#define tcps_mdt_max_pbufs tcps_mdt_max_pbufs_param->tcp_param_val
-
/* Round up the value to the nearest mss. */
#define MSS_ROUNDUP(value, mss) ((((value) - 1) / (mss) + 1) * (mss))
@@ -1162,7 +1084,7 @@ static tcpparam_t lcl_tcp_mdt_max_pbufs_param =
* point ECT(0) for TCP as described in RFC 2481.
*/
#define SET_ECT(tcp, iph) \
- if ((tcp)->tcp_ipversion == IPV4_VERSION) { \
+ if ((tcp)->tcp_connp->conn_ipversion == IPV4_VERSION) { \
/* We need to clear the code point first. */ \
((ipha_t *)(iph))->ipha_type_of_service &= 0xFC; \
((ipha_t *)(iph))->ipha_type_of_service |= IPH_ECN_ECT0; \
@@ -1183,23 +1105,12 @@ static tcpparam_t lcl_tcp_mdt_max_pbufs_param =
#define IS_VMLOANED_MBLK(mp) \
(((mp)->b_datap->db_struioflag & STRUIO_ZC) != 0)
-
-/* Enable or disable b_cont M_MULTIDATA chaining for MDT. */
-boolean_t tcp_mdt_chain = B_TRUE;
-
-/*
- * MDT threshold in the form of effective send MSS multiplier; we take
- * the MDT path if the amount of unsent data exceeds the threshold value
- * (default threshold is 1*SMSS).
- */
-uint_t tcp_mdt_smss_threshold = 1;
-
uint32_t do_tcpzcopy = 1; /* 0: disable, 1: enable, 2: force */
/*
* Forces all connections to obey the value of the tcps_maxpsz_multiplier
* tunable settable via NDD. Otherwise, the per-connection behavior is
- * determined dynamically during tcp_adapt_ire(), which is the default.
+ * determined dynamically during tcp_set_destination(), which is the default.
*/
boolean_t tcp_static_maxpsz = B_FALSE;
@@ -1273,84 +1184,73 @@ int (*cl_inet_connect2)(netstackid_t stack_id, uint8_t protocol,
uint8_t *laddrp, in_port_t lport,
uint8_t *faddrp, in_port_t fport,
void *args) = NULL;
-
void (*cl_inet_disconnect)(netstackid_t stack_id, uint8_t protocol,
sa_family_t addr_family, uint8_t *laddrp,
in_port_t lport, uint8_t *faddrp,
in_port_t fport, void *args) = NULL;
-/*
- * The following are defined in ip.c
- */
-extern int (*cl_inet_isclusterwide)(netstackid_t stack_id, uint8_t protocol,
- sa_family_t addr_family, uint8_t *laddrp,
- void *args);
-extern uint32_t (*cl_inet_ipident)(netstackid_t stack_id, uint8_t protocol,
- sa_family_t addr_family, uint8_t *laddrp,
- uint8_t *faddrp, void *args);
-
/*
* int CL_INET_CONNECT(conn_t *cp, tcp_t *tcp, boolean_t is_outgoing, int err)
*/
-#define CL_INET_CONNECT(connp, tcp, is_outgoing, err) { \
+#define CL_INET_CONNECT(connp, is_outgoing, err) { \
(err) = 0; \
if (cl_inet_connect2 != NULL) { \
/* \
* Running in cluster mode - register active connection \
* information \
*/ \
- if ((tcp)->tcp_ipversion == IPV4_VERSION) { \
- if ((tcp)->tcp_ipha->ipha_src != 0) { \
+ if ((connp)->conn_ipversion == IPV4_VERSION) { \
+ if ((connp)->conn_laddr_v4 != 0) { \
(err) = (*cl_inet_connect2)( \
(connp)->conn_netstack->netstack_stackid,\
IPPROTO_TCP, is_outgoing, AF_INET, \
- (uint8_t *)(&((tcp)->tcp_ipha->ipha_src)),\
- (in_port_t)(tcp)->tcp_lport, \
- (uint8_t *)(&((tcp)->tcp_ipha->ipha_dst)),\
- (in_port_t)(tcp)->tcp_fport, NULL); \
+ (uint8_t *)(&((connp)->conn_laddr_v4)),\
+ (in_port_t)(connp)->conn_lport, \
+ (uint8_t *)(&((connp)->conn_faddr_v4)),\
+ (in_port_t)(connp)->conn_fport, NULL); \
} \
} else { \
if (!IN6_IS_ADDR_UNSPECIFIED( \
- &(tcp)->tcp_ip6h->ip6_src)) { \
+ &(connp)->conn_laddr_v6)) { \
(err) = (*cl_inet_connect2)( \
(connp)->conn_netstack->netstack_stackid,\
IPPROTO_TCP, is_outgoing, AF_INET6, \
- (uint8_t *)(&((tcp)->tcp_ip6h->ip6_src)),\
- (in_port_t)(tcp)->tcp_lport, \
- (uint8_t *)(&((tcp)->tcp_ip6h->ip6_dst)),\
- (in_port_t)(tcp)->tcp_fport, NULL); \
+ (uint8_t *)(&((connp)->conn_laddr_v6)),\
+ (in_port_t)(connp)->conn_lport, \
+ (uint8_t *)(&((connp)->conn_faddr_v6)), \
+ (in_port_t)(connp)->conn_fport, NULL); \
} \
} \
} \
}
-#define CL_INET_DISCONNECT(connp, tcp) { \
+#define CL_INET_DISCONNECT(connp) { \
if (cl_inet_disconnect != NULL) { \
/* \
* Running in cluster mode - deregister active \
* connection information \
*/ \
- if ((tcp)->tcp_ipversion == IPV4_VERSION) { \
- if ((tcp)->tcp_ip_src != 0) { \
+ if ((connp)->conn_ipversion == IPV4_VERSION) { \
+ if ((connp)->conn_laddr_v4 != 0) { \
(*cl_inet_disconnect)( \
(connp)->conn_netstack->netstack_stackid,\
IPPROTO_TCP, AF_INET, \
- (uint8_t *)(&((tcp)->tcp_ip_src)), \
- (in_port_t)(tcp)->tcp_lport, \
- (uint8_t *)(&((tcp)->tcp_ipha->ipha_dst)),\
- (in_port_t)(tcp)->tcp_fport, NULL); \
+ (uint8_t *)(&((connp)->conn_laddr_v4)),\
+ (in_port_t)(connp)->conn_lport, \
+ (uint8_t *)(&((connp)->conn_faddr_v4)),\
+ (in_port_t)(connp)->conn_fport, NULL); \
} \
} else { \
if (!IN6_IS_ADDR_UNSPECIFIED( \
- &(tcp)->tcp_ip_src_v6)) { \
+ &(connp)->conn_laddr_v6)) { \
(*cl_inet_disconnect)( \
(connp)->conn_netstack->netstack_stackid,\
IPPROTO_TCP, AF_INET6, \
- (uint8_t *)(&((tcp)->tcp_ip_src_v6)),\
- (in_port_t)(tcp)->tcp_lport, \
- (uint8_t *)(&((tcp)->tcp_ip6h->ip6_dst)),\
- (in_port_t)(tcp)->tcp_fport, NULL); \
+ (uint8_t *)(&((connp)->conn_laddr_v6)),\
+ (in_port_t)(connp)->conn_lport, \
+ (uint8_t *)(&((connp)->conn_faddr_v6)), \
+ (in_port_t)(connp)->conn_fport, NULL); \
} \
} \
} \
@@ -1367,11 +1267,6 @@ int cl_tcp_walk_list(netstackid_t stack_id,
static int cl_tcp_walk_list_stack(int (*callback)(cl_tcp_info_t *, void *),
void *arg, tcp_stack_t *tcps);
-#define DTRACE_IP_FASTPATH(mp, iph, ill, ipha, ip6h) \
- DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *, \
- iph, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, \
- ip6_t *, ip6h, int, 0);
-
static void
tcp_set_recv_threshold(tcp_t *tcp, uint32_t new_rcvthresh)
{
@@ -1540,7 +1435,7 @@ tcp_time_wait_append(tcp_t *tcp)
/* ARGSUSED */
void
-tcp_timewait_output(void *arg, mblk_t *mp, void *arg2)
+tcp_timewait_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
{
conn_t *connp = (conn_t *)arg;
tcp_t *tcp = connp->conn_tcp;
@@ -1551,11 +1446,11 @@ tcp_timewait_output(void *arg, mblk_t *mp, void *arg2)
return;
}
- ASSERT((tcp->tcp_family == AF_INET &&
- tcp->tcp_ipversion == IPV4_VERSION) ||
- (tcp->tcp_family == AF_INET6 &&
- (tcp->tcp_ipversion == IPV4_VERSION ||
- tcp->tcp_ipversion == IPV6_VERSION)));
+ ASSERT((connp->conn_family == AF_INET &&
+ connp->conn_ipversion == IPV4_VERSION) ||
+ (connp->conn_family == AF_INET6 &&
+ (connp->conn_ipversion == IPV4_VERSION ||
+ connp->conn_ipversion == IPV6_VERSION)));
ASSERT(!tcp->tcp_listener);
TCP_STAT(tcps, tcp_time_wait_reap);
@@ -1579,10 +1474,17 @@ tcp_ipsec_cleanup(tcp_t *tcp)
ASSERT(connp->conn_flags & IPCL_TCPCONN);
if (connp->conn_latch != NULL) {
- IPLATCH_REFRELE(connp->conn_latch,
- connp->conn_netstack);
+ IPLATCH_REFRELE(connp->conn_latch);
connp->conn_latch = NULL;
}
+ if (connp->conn_latch_in_policy != NULL) {
+ IPPOL_REFRELE(connp->conn_latch_in_policy);
+ connp->conn_latch_in_policy = NULL;
+ }
+ if (connp->conn_latch_in_action != NULL) {
+ IPACT_REFRELE(connp->conn_latch_in_action);
+ connp->conn_latch_in_action = NULL;
+ }
if (connp->conn_policy != NULL) {
IPPH_REFRELE(connp->conn_policy, connp->conn_netstack);
connp->conn_policy = NULL;
@@ -1598,9 +1500,6 @@ void
tcp_cleanup(tcp_t *tcp)
{
mblk_t *mp;
- char *tcp_iphc;
- int tcp_iphc_len;
- int tcp_hdr_grown;
tcp_sack_info_t *tcp_sack_info;
conn_t *connp = tcp->tcp_connp;
tcp_stack_t *tcps = tcp->tcp_tcps;
@@ -1611,6 +1510,22 @@ tcp_cleanup(tcp_t *tcp)
/* Cleanup that which needs the netstack first */
tcp_ipsec_cleanup(tcp);
+ ixa_cleanup(connp->conn_ixa);
+
+ if (connp->conn_ht_iphc != NULL) {
+ kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated);
+ connp->conn_ht_iphc = NULL;
+ connp->conn_ht_iphc_allocated = 0;
+ connp->conn_ht_iphc_len = 0;
+ connp->conn_ht_ulp = NULL;
+ connp->conn_ht_ulp_len = 0;
+ tcp->tcp_ipha = NULL;
+ tcp->tcp_ip6h = NULL;
+ tcp->tcp_tcpha = NULL;
+ }
+
+ /* We clear any IP_OPTIONS and extension headers */
+ ip_pkt_free(&connp->conn_xmit_ipp);
tcp_free(tcp);
@@ -1626,8 +1541,6 @@ tcp_cleanup(tcp_t *tcp)
}
tcp->tcp_kssl_pending = B_FALSE;
- conn_delete_ire(connp, NULL);
-
/*
* Since we will bzero the entire structure, we need to
* remove it and reinsert it in global hash list. We
@@ -1639,46 +1552,36 @@ tcp_cleanup(tcp_t *tcp)
*/
ipcl_globalhash_remove(connp);
- /*
- * Now it is safe to decrement the reference counts.
- * This might be the last reference on the netstack and TCPS
- * in which case it will cause the tcp_g_q_close and
- * the freeing of the IP Instance.
- */
- connp->conn_netstack = NULL;
- netstack_rele(ns);
- ASSERT(tcps != NULL);
- tcp->tcp_tcps = NULL;
- TCPS_REFRELE(tcps);
-
/* Save some state */
mp = tcp->tcp_timercache;
tcp_sack_info = tcp->tcp_sack_info;
- tcp_iphc = tcp->tcp_iphc;
- tcp_iphc_len = tcp->tcp_iphc_len;
- tcp_hdr_grown = tcp->tcp_hdr_grown;
tcp_rsrv_mp = tcp->tcp_rsrv_mp;
if (connp->conn_cred != NULL) {
crfree(connp->conn_cred);
connp->conn_cred = NULL;
}
- if (connp->conn_effective_cred != NULL) {
- crfree(connp->conn_effective_cred);
- connp->conn_effective_cred = NULL;
- }
ipcl_conn_cleanup(connp);
connp->conn_flags = IPCL_TCPCONN;
+
+ /*
+ * Now it is safe to decrement the reference counts.
+ * This might be the last reference on the netstack
+ * in which case it will cause the freeing of the IP Instance.
+ */
+ connp->conn_netstack = NULL;
+ connp->conn_ixa->ixa_ipst = NULL;
+ netstack_rele(ns);
+ ASSERT(tcps != NULL);
+ tcp->tcp_tcps = NULL;
+
bzero(tcp, sizeof (tcp_t));
/* restore the state */
tcp->tcp_timercache = mp;
tcp->tcp_sack_info = tcp_sack_info;
- tcp->tcp_iphc = tcp_iphc;
- tcp->tcp_iphc_len = tcp_iphc_len;
- tcp->tcp_hdr_grown = tcp_hdr_grown;
tcp->tcp_rsrv_mp = tcp_rsrv_mp;
tcp->tcp_connp = connp;
@@ -1686,7 +1589,7 @@ tcp_cleanup(tcp_t *tcp)
ASSERT(connp->conn_tcp == tcp);
ASSERT(connp->conn_flags & IPCL_TCPCONN);
connp->conn_state_flags = CONN_INCIPIENT;
- ASSERT(connp->conn_ulp == IPPROTO_TCP);
+ ASSERT(connp->conn_proto == IPPROTO_TCP);
ASSERT(connp->conn_ref == 1);
}
@@ -1777,11 +1680,7 @@ tcp_time_wait_collector(void *arg)
/*
* Set the CONDEMNED flag now itself so that
* the refcnt cannot increase due to any
- * walker. But we have still not cleaned up
- * conn_ire_cache. This is still ok since
- * we are going to clean it up in tcp_cleanup
- * immediately and any interface unplumb
- * thread will wait till the ire is blown away
+ * walker.
*/
connp->conn_state_flags |= CONN_CONDEMNED;
mutex_exit(lock);
@@ -1809,7 +1708,7 @@ tcp_time_wait_collector(void *arg)
mutex_exit(
&tcp_time_wait->tcp_time_wait_lock);
tcp_bind_hash_remove(tcp);
- conn_delete_ire(tcp->tcp_connp, NULL);
+ ixa_cleanup(tcp->tcp_connp->conn_ixa);
tcp_ipsec_cleanup(tcp);
CONN_DEC_REF(tcp->tcp_connp);
}
@@ -1839,7 +1738,7 @@ tcp_time_wait_collector(void *arg)
TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
mp = &tcp->tcp_closemp;
SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
- tcp_timewait_output, connp,
+ tcp_timewait_output, connp, NULL,
SQ_FILL, SQTAG_TCP_TIMEWAIT);
}
} else {
@@ -1867,7 +1766,7 @@ tcp_time_wait_collector(void *arg)
TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
mp = &tcp->tcp_closemp;
SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
- tcp_timewait_output, connp,
+ tcp_timewait_output, connp, NULL,
SQ_FILL, SQTAG_TCP_TIMEWAIT);
}
mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
@@ -1886,24 +1785,23 @@ tcp_time_wait_collector(void *arg)
/*
* Reply to a clients T_CONN_RES TPI message. This function
* is used only for TLI/XTI listener. Sockfs sends T_CONN_RES
- * on the acceptor STREAM and processed in tcp_wput_accept().
- * Read the block comment on top of tcp_conn_request().
+ * on the acceptor STREAM and processed in tcp_accept_common().
+ * Read the block comment on top of tcp_input_listener().
*/
static void
tcp_tli_accept(tcp_t *listener, mblk_t *mp)
{
- tcp_t *acceptor;
- tcp_t *eager;
- tcp_t *tcp;
+ tcp_t *acceptor;
+ tcp_t *eager;
+ tcp_t *tcp;
struct T_conn_res *tcr;
t_uscalar_t acceptor_id;
t_scalar_t seqnum;
- mblk_t *opt_mp = NULL; /* T_OPTMGMT_REQ messages */
- struct tcp_options *tcpopt;
- mblk_t *ok_mp;
- mblk_t *mp1;
+ mblk_t *discon_mp = NULL;
+ mblk_t *ok_mp;
+ mblk_t *mp1;
tcp_stack_t *tcps = listener->tcp_tcps;
- int error;
+ conn_t *econnp;
if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) {
tcp_err_ack(listener, mp, TPROTO, 0);
@@ -1922,8 +1820,8 @@ tcp_tli_accept(tcp_t *listener, mblk_t *mp)
* fanout hash lock is held.
* This prevents any thread from entering the acceptor queue from
* below (since it has not been hard bound yet i.e. any inbound
- * packets will arrive on the listener or default tcp queue and
- * go through tcp_lookup).
+ * packets will arrive on the listener conn_t and
+ * go through the classifier).
* The CONN_INC_REF will prevent the acceptor from closing.
*
* XXX It is still possible for a tli application to send down data
@@ -1974,7 +1872,7 @@ tcp_tli_accept(tcp_t *listener, mblk_t *mp)
} else {
acceptor = tcp_acceptor_hash_lookup(acceptor_id, tcps);
if (acceptor == NULL) {
- if (listener->tcp_debug) {
+ if (listener->tcp_connp->conn_debug) {
(void) strlog(TCP_MOD_ID, 0, 1,
SL_ERROR|SL_TRACE,
"tcp_accept: did not find acceptor 0x%x\n",
@@ -2013,7 +1911,7 @@ tcp_tli_accept(tcp_t *listener, mblk_t *mp)
* Rendezvous with an eager connection request packet hanging off
* 'tcp' that has the 'seqnum' tag. We tagged the detached open
* tcp structure when the connection packet arrived in
- * tcp_conn_request().
+ * tcp_input_listener().
*/
seqnum = tcr->SEQ_number;
eager = listener;
@@ -2047,37 +1945,26 @@ tcp_tli_accept(tcp_t *listener, mblk_t *mp)
*/
ASSERT(eager->tcp_connp->conn_ref >= 1);
- /* Pre allocate the stroptions mblk also */
- opt_mp = allocb(MAX(sizeof (struct tcp_options),
- sizeof (struct T_conn_res)), BPRI_HI);
- if (opt_mp == NULL) {
+ /*
+ * Pre allocate the discon_ind mblk also. tcp_accept_finish will
+ * use it if something failed.
+ */
+ discon_mp = allocb(MAX(sizeof (struct T_discon_ind),
+ sizeof (struct stroptions)), BPRI_HI);
+ if (discon_mp == NULL) {
CONN_DEC_REF(acceptor->tcp_connp);
CONN_DEC_REF(eager->tcp_connp);
tcp_err_ack(listener, mp, TSYSERR, ENOMEM);
return;
}
- DB_TYPE(opt_mp) = M_SETOPTS;
- opt_mp->b_wptr += sizeof (struct tcp_options);
- tcpopt = (struct tcp_options *)opt_mp->b_rptr;
- tcpopt->to_flags = 0;
- /*
- * Prepare for inheriting IPV6_BOUND_IF and IPV6_RECVPKTINFO
- * from listener to acceptor.
- */
- if (listener->tcp_bound_if != 0) {
- tcpopt->to_flags |= TCPOPT_BOUNDIF;
- tcpopt->to_boundif = listener->tcp_bound_if;
- }
- if (listener->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO) {
- tcpopt->to_flags |= TCPOPT_RECVPKTINFO;
- }
+ econnp = eager->tcp_connp;
- /* Re-use mp1 to hold a copy of mp, in case reallocb fails */
+ /* Hold a copy of mp, in case reallocb fails */
if ((mp1 = copymsg(mp)) == NULL) {
CONN_DEC_REF(acceptor->tcp_connp);
CONN_DEC_REF(eager->tcp_connp);
- freemsg(opt_mp);
+ freemsg(discon_mp);
tcp_err_ack(listener, mp, TSYSERR, ENOMEM);
return;
}
@@ -2093,7 +1980,7 @@ tcp_tli_accept(tcp_t *listener, mblk_t *mp)
{
int extra;
- extra = (eager->tcp_family == AF_INET) ?
+ extra = (econnp->conn_family == AF_INET) ?
sizeof (sin_t) : sizeof (sin6_t);
/*
@@ -2104,7 +1991,7 @@ tcp_tli_accept(tcp_t *listener, mblk_t *mp)
if ((ok_mp = mi_tpi_ok_ack_alloc_extra(mp, extra)) == NULL) {
CONN_DEC_REF(acceptor->tcp_connp);
CONN_DEC_REF(eager->tcp_connp);
- freemsg(opt_mp);
+ freemsg(discon_mp);
/* Original mp has been freed by now, so use mp1 */
tcp_err_ack(listener, mp1, TSYSERR, ENOMEM);
return;
@@ -2114,38 +2001,32 @@ tcp_tli_accept(tcp_t *listener, mblk_t *mp)
switch (extra) {
case sizeof (sin_t): {
- sin_t *sin = (sin_t *)ok_mp->b_wptr;
+ sin_t *sin = (sin_t *)ok_mp->b_wptr;
- ok_mp->b_wptr += extra;
- sin->sin_family = AF_INET;
- sin->sin_port = eager->tcp_lport;
- sin->sin_addr.s_addr =
- eager->tcp_ipha->ipha_src;
- break;
- }
+ ok_mp->b_wptr += extra;
+ sin->sin_family = AF_INET;
+ sin->sin_port = econnp->conn_lport;
+ sin->sin_addr.s_addr = econnp->conn_laddr_v4;
+ break;
+ }
case sizeof (sin6_t): {
- sin6_t *sin6 = (sin6_t *)ok_mp->b_wptr;
+ sin6_t *sin6 = (sin6_t *)ok_mp->b_wptr;
- ok_mp->b_wptr += extra;
- sin6->sin6_family = AF_INET6;
- sin6->sin6_port = eager->tcp_lport;
- if (eager->tcp_ipversion == IPV4_VERSION) {
- sin6->sin6_flowinfo = 0;
- IN6_IPADDR_TO_V4MAPPED(
- eager->tcp_ipha->ipha_src,
- &sin6->sin6_addr);
- } else {
- ASSERT(eager->tcp_ip6h != NULL);
- sin6->sin6_flowinfo =
- eager->tcp_ip6h->ip6_vcf &
- ~IPV6_VERS_AND_FLOW_MASK;
- sin6->sin6_addr =
- eager->tcp_ip6h->ip6_src;
- }
+ ok_mp->b_wptr += extra;
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_port = econnp->conn_lport;
+ sin6->sin6_addr = econnp->conn_laddr_v6;
+ sin6->sin6_flowinfo = econnp->conn_flowinfo;
+ if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6) &&
+ (econnp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) {
+ sin6->sin6_scope_id =
+ econnp->conn_ixa->ixa_scopeid;
+ } else {
sin6->sin6_scope_id = 0;
- sin6->__sin6_src_id = 0;
- break;
}
+ sin6->__sin6_src_id = 0;
+ break;
+ }
default:
break;
}
@@ -2158,15 +2039,7 @@ tcp_tli_accept(tcp_t *listener, mblk_t *mp)
* the tcp_accept_swap is done since it would be dangerous to
* let the application start using the new fd prior to the swap.
*/
- error = tcp_accept_swap(listener, acceptor, eager);
- if (error != 0) {
- CONN_DEC_REF(acceptor->tcp_connp);
- CONN_DEC_REF(eager->tcp_connp);
- freemsg(ok_mp);
- /* Original mp has been freed by now, so use mp1 */
- tcp_err_ack(listener, mp1, TSYSERR, error);
- return;
- }
+ tcp_accept_swap(listener, acceptor, eager);
/*
* tcp_accept_swap unlinks eager from listener but does not drop
@@ -2244,7 +2117,7 @@ tcp_tli_accept(tcp_t *listener, mblk_t *mp)
/* We no longer need mp1, since all options processing has passed */
freemsg(mp1);
- putnext(listener->tcp_rq, ok_mp);
+ putnext(listener->tcp_connp->conn_rq, ok_mp);
mutex_enter(&listener->tcp_eager_lock);
if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) {
@@ -2305,7 +2178,7 @@ tcp_tli_accept(tcp_t *listener, mblk_t *mp)
listener->tcp_eager_last_q = tcp;
tcp->tcp_eager_next_q = NULL;
mutex_exit(&listener->tcp_eager_lock);
- putnext(tcp->tcp_rq, conn_ind);
+ putnext(tcp->tcp_connp->conn_rq, conn_ind);
} else {
mutex_exit(&listener->tcp_eager_lock);
}
@@ -2318,26 +2191,20 @@ tcp_tli_accept(tcp_t *listener, mblk_t *mp)
*/
finish:
ASSERT(acceptor->tcp_detached);
- ASSERT(tcps->tcps_g_q != NULL);
+ acceptor->tcp_connp->conn_rq = NULL;
ASSERT(!IPCL_IS_NONSTR(acceptor->tcp_connp));
- acceptor->tcp_rq = tcps->tcps_g_q;
- acceptor->tcp_wq = WR(tcps->tcps_g_q);
+ acceptor->tcp_connp->conn_wq = NULL;
(void) tcp_clean_death(acceptor, 0, 2);
CONN_DEC_REF(acceptor->tcp_connp);
/*
- * In case we already received a FIN we have to make tcp_rput send
- * the ordrel_ind. This will also send up a window update if the window
- * has opened up.
- *
- * In the normal case of a successful connection acceptance
- * we give the O_T_BIND_REQ to the read side put procedure as an
- * indication that this was just accepted. This tells tcp_rput to
- * pass up any data queued in tcp_rcv_list.
+ * We pass discon_mp to tcp_accept_finish to get on the right squeue.
*
- * In the fringe case where options sent with T_CONN_RES failed and
- * we required, we would be indicating a T_DISCON_IND to blow
- * away this connection.
+ * It will update the setting for sockfs/stream head and also take
+ * care of any data that arrived before accept() was called.
+ * In case we already received a FIN then tcp_accept_finish will send up
+ * the ordrel. It will also send up a window update if the window
+ * has opened up.
*/
/*
@@ -2346,7 +2213,7 @@ finish:
* and is well know but nothing can be done short of major rewrite
* to fix it. Now it is possible to take care of it by assigning TLI/XTI
* eager same squeue as listener (we can distinguish non socket
- * listeners at the time of handling a SYN in tcp_conn_request)
+ * listeners at the time of handling a SYN in tcp_input_listener)
* and do most of the work that tcp_accept_finish does here itself
* and then get behind the acceptor squeue to access the acceptor
* queue.
@@ -2354,52 +2221,38 @@ finish:
/*
* We already have a ref on tcp so no need to do one before squeue_enter
*/
- SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, opt_mp, tcp_accept_finish,
- eager->tcp_connp, SQ_FILL, SQTAG_TCP_ACCEPT_FINISH);
+ SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, discon_mp,
+ tcp_accept_finish, eager->tcp_connp, NULL, SQ_FILL,
+ SQTAG_TCP_ACCEPT_FINISH);
}
/*
* Swap information between the eager and acceptor for a TLI/XTI client.
* The sockfs accept is done on the acceptor stream and control goes
- * through tcp_wput_accept() and tcp_accept()/tcp_accept_swap() is not
+ * through tcp_tli_accept() and tcp_accept()/tcp_accept_swap() is not
* called. In either case, both the eager and listener are in their own
* perimeter (squeue) and the code has to deal with potential race.
*
- * See the block comment on top of tcp_accept() and tcp_wput_accept().
+ * See the block comment on top of tcp_accept() and tcp_tli_accept().
*/
-static int
+static void
tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager)
{
conn_t *econnp, *aconnp;
- cred_t *effective_cred = NULL;
- ASSERT(eager->tcp_rq == listener->tcp_rq);
+ ASSERT(eager->tcp_connp->conn_rq == listener->tcp_connp->conn_rq);
ASSERT(eager->tcp_detached && !acceptor->tcp_detached);
- ASSERT(!eager->tcp_hard_bound);
ASSERT(!TCP_IS_SOCKET(acceptor));
ASSERT(!TCP_IS_SOCKET(eager));
ASSERT(!TCP_IS_SOCKET(listener));
- econnp = eager->tcp_connp;
- aconnp = acceptor->tcp_connp;
-
/*
* Trusted Extensions may need to use a security label that is
* different from the acceptor's label on MLP and MAC-Exempt
* sockets. If this is the case, the required security label
- * already exists in econnp->conn_effective_cred. Use this label
- * to generate a new effective cred for the acceptor.
- *
- * We allow for potential application level retry attempts by
- * checking for transient errors before modifying eager.
+ * already exists in econnp->conn_ixa->ixa_tsl. Since we make the
+ * acceptor stream refer to econnp, we automatically get that label.
*/
- if (is_system_labeled() &&
- aconnp->conn_cred != NULL && econnp->conn_effective_cred != NULL) {
- effective_cred = copycred_from_tslabel(aconnp->conn_cred,
- crgetlabel(econnp->conn_effective_cred), KM_NOSLEEP);
- if (effective_cred == NULL)
- return (ENOMEM);
- }
acceptor->tcp_detached = B_TRUE;
/*
@@ -2416,18 +2269,20 @@ tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager)
ASSERT(eager->tcp_eager_next_q0 == NULL &&
eager->tcp_eager_prev_q0 == NULL);
mutex_exit(&listener->tcp_eager_lock);
- eager->tcp_rq = acceptor->tcp_rq;
- eager->tcp_wq = acceptor->tcp_wq;
- eager->tcp_rq->q_ptr = econnp;
- eager->tcp_wq->q_ptr = econnp;
+ econnp = eager->tcp_connp;
+ aconnp = acceptor->tcp_connp;
+ econnp->conn_rq = aconnp->conn_rq;
+ econnp->conn_wq = aconnp->conn_wq;
+ econnp->conn_rq->q_ptr = econnp;
+ econnp->conn_wq->q_ptr = econnp;
/*
* In the TLI/XTI loopback case, we are inside the listener's squeue,
* which might be a different squeue from our peer TCP instance.
* For TCP Fusion, the peer expects that whenever tcp_detached is
* clear, our TCP queues point to the acceptor's queues. Thus, use
- * membar_producer() to ensure that the assignments of tcp_rq/tcp_wq
+ * membar_producer() to ensure that the assignments of conn_rq/conn_wq
* above reach global visibility prior to the clearing of tcp_detached.
*/
membar_producer();
@@ -2439,419 +2294,187 @@ tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager)
econnp->conn_minor_arena = aconnp->conn_minor_arena;
ASSERT(econnp->conn_minor_arena != NULL);
- if (eager->tcp_cred != NULL)
- crfree(eager->tcp_cred);
- eager->tcp_cred = econnp->conn_cred = aconnp->conn_cred;
- if (econnp->conn_effective_cred != NULL)
- crfree(econnp->conn_effective_cred);
- econnp->conn_effective_cred = effective_cred;
+ if (econnp->conn_cred != NULL)
+ crfree(econnp->conn_cred);
+ econnp->conn_cred = aconnp->conn_cred;
aconnp->conn_cred = NULL;
- ASSERT(aconnp->conn_effective_cred == NULL);
-
+ econnp->conn_cpid = aconnp->conn_cpid;
ASSERT(econnp->conn_netstack == aconnp->conn_netstack);
ASSERT(eager->tcp_tcps == acceptor->tcp_tcps);
econnp->conn_zoneid = aconnp->conn_zoneid;
econnp->conn_allzones = aconnp->conn_allzones;
+ econnp->conn_ixa->ixa_zoneid = aconnp->conn_ixa->ixa_zoneid;
+ econnp->conn_mac_mode = aconnp->conn_mac_mode;
+ econnp->conn_zone_is_global = aconnp->conn_zone_is_global;
aconnp->conn_mac_mode = CONN_MAC_DEFAULT;
/* Do the IPC initialization */
CONN_INC_REF(econnp);
- econnp->conn_multicast_loop = aconnp->conn_multicast_loop;
- econnp->conn_af_isv6 = aconnp->conn_af_isv6;
- econnp->conn_pkt_isv6 = aconnp->conn_pkt_isv6;
+ econnp->conn_family = aconnp->conn_family;
+ econnp->conn_ipversion = aconnp->conn_ipversion;
/* Done with old IPC. Drop its ref on its connp */
CONN_DEC_REF(aconnp);
- return (0);
}
/*
* Adapt to the information, such as rtt and rtt_sd, provided from the
- * ire cached in conn_cache_ire. If no ire cached, do a ire lookup.
+ * DCE and IRE maintained by IP.
*
* Checks for multicast and broadcast destination address.
- * Returns zero on failure; non-zero if ok.
+ * Returns zero if ok; an errno on failure.
*
* Note that the MSS calculation here is based on the info given in
- * the IRE. We do not do any calculation based on TCP options. They
- * will be handled in tcp_rput_other() and tcp_rput_data() when TCP
- * knows which options to use.
+ * the DCE and IRE. We do not do any calculation based on TCP options. They
+ * will be handled in tcp_input_data() when TCP knows which options to use.
*
* Note on how TCP gets its parameters for a connection.
*
* When a tcp_t structure is allocated, it gets all the default parameters.
- * In tcp_adapt_ire(), it gets those metric parameters, like rtt, rtt_sd,
+ * In tcp_set_destination(), it gets those metric parameters, like rtt, rtt_sd,
* spipe, rpipe, ... from the route metrics. Route metric overrides the
* default.
*
- * An incoming SYN with a multicast or broadcast destination address, is dropped
- * in 1 of 2 places.
- *
- * 1. If the packet was received over the wire it is dropped in
- * ip_rput_process_broadcast()
- *
- * 2. If the packet was received through internal IP loopback, i.e. the packet
- * was generated and received on the same machine, it is dropped in
- * ip_wput_local()
+ * An incoming SYN with a multicast or broadcast destination address is dropped
+ * in ip_fanout_v4/v6.
*
* An incoming SYN with a multicast or broadcast source address is always
- * dropped in tcp_adapt_ire. The same logic in tcp_adapt_ire also serves to
+ * dropped in tcp_set_destination, since IPDF_ALLOW_MCBC is not set in
+ * conn_connect.
+ * The same logic in tcp_set_destination also serves to
* reject an attempt to connect to a broadcast or multicast (destination)
* address.
*/
static int
-tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp)
+tcp_set_destination(tcp_t *tcp)
{
- ire_t *ire;
- ire_t *sire = NULL;
- iulp_t *ire_uinfo = NULL;
uint32_t mss_max;
uint32_t mss;
boolean_t tcp_detached = TCP_IS_DETACHED(tcp);
conn_t *connp = tcp->tcp_connp;
- boolean_t ire_cacheable = B_FALSE;
- zoneid_t zoneid = connp->conn_zoneid;
- int match_flags = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT |
- MATCH_IRE_SECATTR;
- ts_label_t *tsl = crgetlabel(CONN_CRED(connp));
- ill_t *ill = NULL;
- boolean_t incoming = (ire_mp == NULL);
tcp_stack_t *tcps = tcp->tcp_tcps;
- ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;
-
- ASSERT(connp->conn_ire_cache == NULL);
-
- if (tcp->tcp_ipversion == IPV4_VERSION) {
+ iulp_t uinfo;
+ int error;
+ uint32_t flags;
- if (CLASSD(tcp->tcp_connp->conn_rem)) {
- BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards);
- return (0);
- }
- /*
- * If IP_NEXTHOP is set, then look for an IRE_CACHE
- * for the destination with the nexthop as gateway.
- * ire_ctable_lookup() is used because this particular
- * ire, if it exists, will be marked private.
- * If that is not available, use the interface ire
- * for the nexthop.
- *
- * TSol: tcp_update_label will detect label mismatches based
- * only on the destination's label, but that would not
- * detect label mismatches based on the security attributes
- * of routes or next hop gateway. Hence we need to pass the
- * label to ire_ftable_lookup below in order to locate the
- * right prefix (and/or) ire cache. Similarly we also need
- * pass the label to the ire_cache_lookup below to locate
- * the right ire that also matches on the label.
- */
- if (tcp->tcp_connp->conn_nexthop_set) {
- ire = ire_ctable_lookup(tcp->tcp_connp->conn_rem,
- tcp->tcp_connp->conn_nexthop_v4, 0, NULL, zoneid,
- tsl, MATCH_IRE_MARK_PRIVATE_ADDR | MATCH_IRE_GW,
- ipst);
- if (ire == NULL) {
- ire = ire_ftable_lookup(
- tcp->tcp_connp->conn_nexthop_v4,
- 0, 0, IRE_INTERFACE, NULL, NULL, zoneid, 0,
- tsl, match_flags, ipst);
- if (ire == NULL)
- return (0);
- } else {
- ire_uinfo = &ire->ire_uinfo;
- }
- } else {
- ire = ire_cache_lookup(tcp->tcp_connp->conn_rem,
- zoneid, tsl, ipst);
- if (ire != NULL) {
- ire_cacheable = B_TRUE;
- ire_uinfo = (ire_mp != NULL) ?
- &((ire_t *)ire_mp->b_rptr)->ire_uinfo:
- &ire->ire_uinfo;
+ flags = IPDF_LSO | IPDF_ZCOPY;
+ /*
+ * Make sure we have a dce for the destination to avoid dce_ident
+ * contention for connected sockets.
+ */
+ flags |= IPDF_UNIQUE_DCE;
- } else {
- if (ire_mp == NULL) {
- ire = ire_ftable_lookup(
- tcp->tcp_connp->conn_rem,
- 0, 0, 0, NULL, &sire, zoneid, 0,
- tsl, (MATCH_IRE_RECURSIVE |
- MATCH_IRE_DEFAULT), ipst);
- if (ire == NULL)
- return (0);
- ire_uinfo = (sire != NULL) ?
- &sire->ire_uinfo :
- &ire->ire_uinfo;
- } else {
- ire = (ire_t *)ire_mp->b_rptr;
- ire_uinfo =
- &((ire_t *)
- ire_mp->b_rptr)->ire_uinfo;
- }
- }
- }
- ASSERT(ire != NULL);
+ if (!tcps->tcps_ignore_path_mtu)
+ connp->conn_ixa->ixa_flags |= IXAF_PMTU_DISCOVERY;
- if ((ire->ire_src_addr == INADDR_ANY) ||
- (ire->ire_type & IRE_BROADCAST)) {
- /*
- * ire->ire_mp is non null when ire_mp passed in is used
- * ire->ire_mp is set in ip_bind_insert_ire[_v6]().
- */
- if (ire->ire_mp == NULL)
- ire_refrele(ire);
- if (sire != NULL)
- ire_refrele(sire);
- return (0);
- }
-
- if (tcp->tcp_ipha->ipha_src == INADDR_ANY) {
- ipaddr_t src_addr;
+ /* Use conn_lock to satisfy ASSERT; tcp is already serialized */
+ mutex_enter(&connp->conn_lock);
+ error = conn_connect(connp, &uinfo, flags);
+ mutex_exit(&connp->conn_lock);
+ if (error != 0)
+ return (error);
- /*
- * ip_bind_connected() has stored the correct source
- * address in conn_src.
- */
- src_addr = tcp->tcp_connp->conn_src;
- tcp->tcp_ipha->ipha_src = src_addr;
- /*
- * Copy of the src addr. in tcp_t is needed
- * for the lookup funcs.
- */
- IN6_IPADDR_TO_V4MAPPED(src_addr, &tcp->tcp_ip_src_v6);
- }
- /*
- * Set the fragment bit so that IP will tell us if the MTU
- * should change. IP tells us the latest setting of
- * ip_path_mtu_discovery through ire_frag_flag.
- */
- if (ipst->ips_ip_path_mtu_discovery) {
- tcp->tcp_ipha->ipha_fragment_offset_and_flags =
- htons(IPH_DF);
- }
- /*
- * If ire_uinfo is NULL, this is the IRE_INTERFACE case
- * for IP_NEXTHOP. No cache ire has been found for the
- * destination and we are working with the nexthop's
- * interface ire. Since we need to forward all packets
- * to the nexthop first, we "blindly" set tcp_localnet
- * to false, eventhough the destination may also be
- * onlink.
- */
- if (ire_uinfo == NULL)
- tcp->tcp_localnet = 0;
- else
- tcp->tcp_localnet = (ire->ire_gateway_addr == 0);
- } else {
- /*
- * For incoming connection ire_mp = NULL
- * For outgoing connection ire_mp != NULL
- * Technically we should check conn_incoming_ill
- * when ire_mp is NULL and conn_outgoing_ill when
- * ire_mp is non-NULL. But this is performance
- * critical path and for IPV*_BOUND_IF, outgoing
- * and incoming ill are always set to the same value.
- */
- ill_t *dst_ill = NULL;
- ipif_t *dst_ipif = NULL;
+ error = tcp_build_hdrs(tcp);
+ if (error != 0)
+ return (error);
- ASSERT(connp->conn_outgoing_ill == connp->conn_incoming_ill);
+ tcp->tcp_localnet = uinfo.iulp_localnet;
- if (connp->conn_outgoing_ill != NULL) {
- /* Outgoing or incoming path */
- int err;
+ if (uinfo.iulp_rtt != 0) {
+ clock_t rto;
- dst_ill = conn_get_held_ill(connp,
- &connp->conn_outgoing_ill, &err);
- if (err == ILL_LOOKUP_FAILED || dst_ill == NULL) {
- ip1dbg(("tcp_adapt_ire: ill_lookup failed\n"));
- return (0);
- }
- match_flags |= MATCH_IRE_ILL;
- dst_ipif = dst_ill->ill_ipif;
- }
- ire = ire_ctable_lookup_v6(&tcp->tcp_connp->conn_remv6,
- 0, 0, dst_ipif, zoneid, tsl, match_flags, ipst);
+ tcp->tcp_rtt_sa = uinfo.iulp_rtt;
+ tcp->tcp_rtt_sd = uinfo.iulp_rtt_sd;
+ rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
+ tcps->tcps_rexmit_interval_extra +
+ (tcp->tcp_rtt_sa >> 5);
- if (ire != NULL) {
- ire_cacheable = B_TRUE;
- ire_uinfo = (ire_mp != NULL) ?
- &((ire_t *)ire_mp->b_rptr)->ire_uinfo:
- &ire->ire_uinfo;
+ if (rto > tcps->tcps_rexmit_interval_max) {
+ tcp->tcp_rto = tcps->tcps_rexmit_interval_max;
+ } else if (rto < tcps->tcps_rexmit_interval_min) {
+ tcp->tcp_rto = tcps->tcps_rexmit_interval_min;
} else {
- if (ire_mp == NULL) {
- ire = ire_ftable_lookup_v6(
- &tcp->tcp_connp->conn_remv6,
- 0, 0, 0, dst_ipif, &sire, zoneid,
- 0, tsl, match_flags, ipst);
- if (ire == NULL) {
- if (dst_ill != NULL)
- ill_refrele(dst_ill);
- return (0);
- }
- ire_uinfo = (sire != NULL) ? &sire->ire_uinfo :
- &ire->ire_uinfo;
- } else {
- ire = (ire_t *)ire_mp->b_rptr;
- ire_uinfo =
- &((ire_t *)ire_mp->b_rptr)->ire_uinfo;
- }
- }
- if (dst_ill != NULL)
- ill_refrele(dst_ill);
-
- ASSERT(ire != NULL);
- ASSERT(ire_uinfo != NULL);
-
- if (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6) ||
- IN6_IS_ADDR_MULTICAST(&ire->ire_addr_v6)) {
- /*
- * ire->ire_mp is non null when ire_mp passed in is used
- * ire->ire_mp is set in ip_bind_insert_ire[_v6]().
- */
- if (ire->ire_mp == NULL)
- ire_refrele(ire);
- if (sire != NULL)
- ire_refrele(sire);
- return (0);
+ tcp->tcp_rto = rto;
}
-
- if (IN6_IS_ADDR_UNSPECIFIED(&tcp->tcp_ip6h->ip6_src)) {
- in6_addr_t src_addr;
-
- /*
- * ip_bind_connected_v6() has stored the correct source
- * address per IPv6 addr. selection policy in
- * conn_src_v6.
- */
- src_addr = tcp->tcp_connp->conn_srcv6;
-
- tcp->tcp_ip6h->ip6_src = src_addr;
- /*
- * Copy of the src addr. in tcp_t is needed
- * for the lookup funcs.
- */
- tcp->tcp_ip_src_v6 = src_addr;
- ASSERT(IN6_ARE_ADDR_EQUAL(&tcp->tcp_ip6h->ip6_src,
- &connp->conn_srcv6));
+ }
+ if (uinfo.iulp_ssthresh != 0)
+ tcp->tcp_cwnd_ssthresh = uinfo.iulp_ssthresh;
+ else
+ tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN;
+ if (uinfo.iulp_spipe > 0) {
+ connp->conn_sndbuf = MIN(uinfo.iulp_spipe,
+ tcps->tcps_max_buf);
+ if (tcps->tcps_snd_lowat_fraction != 0) {
+ connp->conn_sndlowat = connp->conn_sndbuf /
+ tcps->tcps_snd_lowat_fraction;
}
- tcp->tcp_localnet =
- IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6);
+ (void) tcp_maxpsz_set(tcp, B_TRUE);
}
-
/*
- * This allows applications to fail quickly when connections are made
- * to dead hosts. Hosts can be labeled dead by adding a reject route
- * with both the RTF_REJECT and RTF_PRIVATE flags set.
+ * Note that up till now, acceptor always inherits receive
+ * window from the listener. But if there is a metrics
+ * associated with a host, we should use that instead of
+ * inheriting it from listener. Thus we need to pass this
+ * info back to the caller.
*/
- if ((ire->ire_flags & RTF_REJECT) &&
- (ire->ire_flags & RTF_PRIVATE))
- goto error;
+ if (uinfo.iulp_rpipe > 0) {
+ tcp->tcp_rwnd = MIN(uinfo.iulp_rpipe,
+ tcps->tcps_max_buf);
+ }
+
+ if (uinfo.iulp_rtomax > 0) {
+ tcp->tcp_second_timer_threshold =
+ uinfo.iulp_rtomax;
+ }
/*
- * Make use of the cached rtt and rtt_sd values to calculate the
- * initial RTO. Note that they are already initialized in
- * tcp_init_values().
- * If ire_uinfo is NULL, i.e., we do not have a cache ire for
- * IP_NEXTHOP, but instead are using the interface ire for the
- * nexthop, then we do not use the ire_uinfo from that ire to
- * do any initializations.
+ * Use the metric option settings, iulp_tstamp_ok and
+ * iulp_wscale_ok, only for active open. What this means
+ * is that if the other side uses timestamp or window
+ * scale option, TCP will also use those options. That
+ * is for passive open. If the application sets a
+ * large window, window scale is enabled regardless of
+ * the value in iulp_wscale_ok. This is the behavior
+ * since 2.6. So we keep it.
+ * The only case left in passive open processing is the
+ * check for SACK.
+ * For ECN, it should probably be like SACK. But the
+ * current value is binary, so we treat it like the other
+ * cases. The metric only controls active open.For passive
+ * open, the ndd param, tcp_ecn_permitted, controls the
+ * behavior.
*/
- if (ire_uinfo != NULL) {
- if (ire_uinfo->iulp_rtt != 0) {
- clock_t rto;
-
- tcp->tcp_rtt_sa = ire_uinfo->iulp_rtt;
- tcp->tcp_rtt_sd = ire_uinfo->iulp_rtt_sd;
- rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
- tcps->tcps_rexmit_interval_extra +
- (tcp->tcp_rtt_sa >> 5);
-
- if (rto > tcps->tcps_rexmit_interval_max) {
- tcp->tcp_rto = tcps->tcps_rexmit_interval_max;
- } else if (rto < tcps->tcps_rexmit_interval_min) {
- tcp->tcp_rto = tcps->tcps_rexmit_interval_min;
- } else {
- tcp->tcp_rto = rto;
- }
- }
- if (ire_uinfo->iulp_ssthresh != 0)
- tcp->tcp_cwnd_ssthresh = ire_uinfo->iulp_ssthresh;
- else
- tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN;
- if (ire_uinfo->iulp_spipe > 0) {
- tcp->tcp_xmit_hiwater = MIN(ire_uinfo->iulp_spipe,
- tcps->tcps_max_buf);
- if (tcps->tcps_snd_lowat_fraction != 0)
- tcp->tcp_xmit_lowater = tcp->tcp_xmit_hiwater /
- tcps->tcps_snd_lowat_fraction;
- (void) tcp_maxpsz_set(tcp, B_TRUE);
- }
+ if (!tcp_detached) {
/*
- * Note that up till now, acceptor always inherits receive
- * window from the listener. But if there is a metrics
- * associated with a host, we should use that instead of
- * inheriting it from listener. Thus we need to pass this
- * info back to the caller.
+ * The if check means that the following can only
+ * be turned on by the metrics only IRE, but not off.
*/
- if (ire_uinfo->iulp_rpipe > 0) {
- tcp->tcp_rwnd = MIN(ire_uinfo->iulp_rpipe,
- tcps->tcps_max_buf);
- }
-
- if (ire_uinfo->iulp_rtomax > 0) {
- tcp->tcp_second_timer_threshold =
- ire_uinfo->iulp_rtomax;
- }
-
+ if (uinfo.iulp_tstamp_ok)
+ tcp->tcp_snd_ts_ok = B_TRUE;
+ if (uinfo.iulp_wscale_ok)
+ tcp->tcp_snd_ws_ok = B_TRUE;
+ if (uinfo.iulp_sack == 2)
+ tcp->tcp_snd_sack_ok = B_TRUE;
+ if (uinfo.iulp_ecn_ok)
+ tcp->tcp_ecn_ok = B_TRUE;
+ } else {
/*
- * Use the metric option settings, iulp_tstamp_ok and
- * iulp_wscale_ok, only for active open. What this means
- * is that if the other side uses timestamp or window
- * scale option, TCP will also use those options. That
- * is for passive open. If the application sets a
- * large window, window scale is enabled regardless of
- * the value in iulp_wscale_ok. This is the behavior
- * since 2.6. So we keep it.
- * The only case left in passive open processing is the
- * check for SACK.
- * For ECN, it should probably be like SACK. But the
- * current value is binary, so we treat it like the other
- * cases. The metric only controls active open.For passive
- * open, the ndd param, tcp_ecn_permitted, controls the
- * behavior.
+ * Passive open.
+ *
+ * As above, the if check means that SACK can only be
+ * turned on by the metric only IRE.
*/
- if (!tcp_detached) {
- /*
- * The if check means that the following can only
- * be turned on by the metrics only IRE, but not off.
- */
- if (ire_uinfo->iulp_tstamp_ok)
- tcp->tcp_snd_ts_ok = B_TRUE;
- if (ire_uinfo->iulp_wscale_ok)
- tcp->tcp_snd_ws_ok = B_TRUE;
- if (ire_uinfo->iulp_sack == 2)
- tcp->tcp_snd_sack_ok = B_TRUE;
- if (ire_uinfo->iulp_ecn_ok)
- tcp->tcp_ecn_ok = B_TRUE;
- } else {
- /*
- * Passive open.
- *
- * As above, the if check means that SACK can only be
- * turned on by the metric only IRE.
- */
- if (ire_uinfo->iulp_sack > 0) {
- tcp->tcp_snd_sack_ok = B_TRUE;
- }
+ if (uinfo.iulp_sack > 0) {
+ tcp->tcp_snd_sack_ok = B_TRUE;
}
}
-
/*
- * XXX: Note that currently, ire_max_frag can be as small as 68
+ * XXX Note that currently, iulp_mtu can be as small as 68
* because of PMTUd. So tcp_mss may go to negative if combined
* length of all those options exceeds 28 bytes. But because
* of the tcp_mss_min check below, we may not have a problem if
@@ -2864,31 +2487,15 @@ tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp)
* We do not deal with that now. All those problems related to
* PMTUd will be fixed later.
*/
- ASSERT(ire->ire_max_frag != 0);
- mss = tcp->tcp_if_mtu = ire->ire_max_frag;
- if (tcp->tcp_ipp_fields & IPPF_USE_MIN_MTU) {
- if (tcp->tcp_ipp_use_min_mtu == IPV6_USE_MIN_MTU_NEVER) {
- mss = MIN(mss, IPV6_MIN_MTU);
- }
- }
+ ASSERT(uinfo.iulp_mtu != 0);
+ mss = tcp->tcp_initial_pmtu = uinfo.iulp_mtu;
/* Sanity check for MSS value. */
- if (tcp->tcp_ipversion == IPV4_VERSION)
+ if (connp->conn_ipversion == IPV4_VERSION)
mss_max = tcps->tcps_mss_max_ipv4;
else
mss_max = tcps->tcps_mss_max_ipv6;
- if (tcp->tcp_ipversion == IPV6_VERSION &&
- (ire->ire_frag_flag & IPH_FRAG_HDR)) {
- /*
- * After receiving an ICMPv6 "packet too big" message with a
- * MTU < 1280, and for multirouted IPv6 packets, the IP layer
- * will insert a 8-byte fragment header in every packet; we
- * reduce the MSS by that amount here.
- */
- mss -= sizeof (ip6_frag_t);
- }
-
if (tcp->tcp_ipsec_overhead == 0)
tcp->tcp_ipsec_overhead = conn_ipsec_length(connp);
@@ -2903,71 +2510,28 @@ tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp)
tcp->tcp_mss = mss;
/*
+ * Update the tcp connection with LSO capability.
+ */
+ tcp_update_lso(tcp, connp->conn_ixa);
+
+ /*
* Initialize the ISS here now that we have the full connection ID.
* The RFC 1948 method of initial sequence number generation requires
* knowledge of the full connection ID before setting the ISS.
*/
-
tcp_iss_init(tcp);
- if (ire->ire_type & (IRE_LOOPBACK | IRE_LOCAL))
- tcp->tcp_loopback = B_TRUE;
-
- if (sire != NULL)
- IRE_REFRELE(sire);
-
- /*
- * If we got an IRE_CACHE and an ILL, go through their properties;
- * otherwise, this is deferred until later when we have an IRE_CACHE.
- */
- if (tcp->tcp_loopback ||
- (ire_cacheable && (ill = ire_to_ill(ire)) != NULL)) {
- /*
- * For incoming, see if this tcp may be MDT-capable. For
- * outgoing, this process has been taken care of through
- * tcp_rput_other.
- */
- tcp_ire_ill_check(tcp, ire, ill, incoming);
- tcp->tcp_ire_ill_check_done = B_TRUE;
- }
+ tcp->tcp_loopback = (uinfo.iulp_loopback | uinfo.iulp_local);
- mutex_enter(&connp->conn_lock);
/*
* Make sure that conn is not marked incipient
* for incoming connections. A blind
* removal of incipient flag is cheaper than
* check and removal.
*/
+ mutex_enter(&connp->conn_lock);
connp->conn_state_flags &= ~CONN_INCIPIENT;
-
- /*
- * Must not cache forwarding table routes
- * or recache an IRE after the conn_t has
- * had conn_ire_cache cleared and is flagged
- * unusable, (see the CONN_CACHE_IRE() macro).
- */
- if (ire_cacheable && CONN_CACHE_IRE(connp)) {
- rw_enter(&ire->ire_bucket->irb_lock, RW_READER);
- if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) {
- connp->conn_ire_cache = ire;
- IRE_UNTRACE_REF(ire);
- rw_exit(&ire->ire_bucket->irb_lock);
- mutex_exit(&connp->conn_lock);
- return (1);
- }
- rw_exit(&ire->ire_bucket->irb_lock);
- }
mutex_exit(&connp->conn_lock);
-
- if (ire->ire_mp == NULL)
- ire_refrele(ire);
- return (1);
-
-error:
- if (ire->ire_mp == NULL)
- ire_refrele(ire);
- if (sire != NULL)
- ire_refrele(sire);
return (0);
}
@@ -3001,7 +2565,7 @@ tcp_tpi_bind(tcp_t *tcp, mblk_t *mp)
ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
- if (tcp->tcp_debug) {
+ if (connp->conn_debug) {
(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
"tcp_tpi_bind: bad req, len %u",
(uint_t)(mp->b_wptr - mp->b_rptr));
@@ -3010,7 +2574,7 @@ tcp_tpi_bind(tcp_t *tcp, mblk_t *mp)
return;
}
/* Make sure the largest address fits */
- mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t) + 1, 1);
+ mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1);
if (mp1 == NULL) {
tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
return;
@@ -3024,7 +2588,7 @@ tcp_tpi_bind(tcp_t *tcp, mblk_t *mp)
switch (len) {
case 0: /* request for a generic port */
tbr->ADDR_offset = sizeof (struct T_bind_req);
- if (tcp->tcp_family == AF_INET) {
+ if (connp->conn_family == AF_INET) {
tbr->ADDR_length = sizeof (sin_t);
sin = (sin_t *)&tbr[1];
*sin = sin_null;
@@ -3033,7 +2597,7 @@ tcp_tpi_bind(tcp_t *tcp, mblk_t *mp)
len = sizeof (sin_t);
mp->b_wptr = (uchar_t *)&sin[1];
} else {
- ASSERT(tcp->tcp_family == AF_INET6);
+ ASSERT(connp->conn_family == AF_INET6);
tbr->ADDR_length = sizeof (sin6_t);
sin6 = (sin6_t *)&tbr[1];
*sin6 = sin6_null;
@@ -3055,7 +2619,7 @@ tcp_tpi_bind(tcp_t *tcp, mblk_t *mp)
break;
default:
- if (tcp->tcp_debug) {
+ if (connp->conn_debug) {
(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
"tcp_tpi_bind: bad address length, %d",
tbr->ADDR_length);
@@ -3080,16 +2644,16 @@ done:
/*
* Update port information as sockfs/tpi needs it for checking
*/
- if (tcp->tcp_family == AF_INET) {
+ if (connp->conn_family == AF_INET) {
sin = (sin_t *)sa;
- sin->sin_port = tcp->tcp_lport;
+ sin->sin_port = connp->conn_lport;
} else {
sin6 = (sin6_t *)sa;
- sin6->sin6_port = tcp->tcp_lport;
+ sin6->sin6_port = connp->conn_lport;
}
mp->b_datap->db_type = M_PCPROTO;
tbr->PRIM_type = T_BIND_ACK;
- putnext(tcp->tcp_rq, mp);
+ putnext(connp->conn_rq, mp);
}
}
@@ -3139,7 +2703,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
* Set loopmax appropriately so that one does not look
* forever in the case all of the anonymous ports are in use.
*/
- if (tcp->tcp_anon_priv_bind) {
+ if (connp->conn_anon_priv_bind) {
/*
* loopmax =
* (IPPORT_RESERVED-1) - tcp_min_anonpriv_port + 1
@@ -3175,7 +2739,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
mutex_enter(&tbf->tf_lock);
for (ltcp = tbf->tf_tcp; ltcp != NULL;
ltcp = ltcp->tcp_bind_hash) {
- if (lport == ltcp->tcp_lport)
+ if (lport == ltcp->tcp_connp->conn_lport)
break;
}
@@ -3191,7 +2755,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
* privilege as being in all zones, as there's
* otherwise no way to identify the right receiver.
*/
- if (!IPCL_BIND_ZONE_MATCH(ltcp->tcp_connp, connp))
+ if (!IPCL_BIND_ZONE_MATCH(lconnp, connp))
continue;
/*
@@ -3227,7 +2791,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
* added.
*
* if (ltcp->tcp_state == TCPS_LISTEN ||
- * !reuseaddr || !ltcp->tcp_reuseaddr) {
+ * !reuseaddr || !lconnp->conn_reuseaddr) {
* ...
* }
*
@@ -3243,17 +2807,18 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
*/
not_socket = !(TCP_IS_SOCKET(ltcp) &&
TCP_IS_SOCKET(tcp));
- exclbind = ltcp->tcp_exclbind || tcp->tcp_exclbind;
+ exclbind = lconnp->conn_exclbind ||
+ connp->conn_exclbind;
if ((lconnp->conn_mac_mode != CONN_MAC_DEFAULT) ||
(connp->conn_mac_mode != CONN_MAC_DEFAULT) ||
(exclbind && (not_socket ||
ltcp->tcp_state <= TCPS_ESTABLISHED))) {
if (V6_OR_V4_INADDR_ANY(
- ltcp->tcp_bound_source_v6) ||
+ lconnp->conn_bound_addr_v6) ||
V6_OR_V4_INADDR_ANY(*laddr) ||
IN6_ARE_ADDR_EQUAL(laddr,
- &ltcp->tcp_bound_source_v6)) {
+ &lconnp->conn_bound_addr_v6)) {
break;
}
continue;
@@ -3266,7 +2831,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
* specific port. We use the same autoassigned port
* number space for IPv4 and IPv6 sockets.
*/
- if (tcp->tcp_ipversion != ltcp->tcp_ipversion &&
+ if (connp->conn_ipversion != lconnp->conn_ipversion &&
bind_to_req_port_only)
continue;
@@ -3281,9 +2846,9 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
*/
if (quick_connect &&
(ltcp->tcp_state > TCPS_LISTEN) &&
- ((tcp->tcp_fport != ltcp->tcp_fport) ||
- !IN6_ARE_ADDR_EQUAL(&tcp->tcp_remote_v6,
- &ltcp->tcp_remote_v6)))
+ ((connp->conn_fport != lconnp->conn_fport) ||
+ !IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6,
+ &lconnp->conn_faddr_v6)))
continue;
if (!reuseaddr) {
@@ -3299,9 +2864,9 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
*/
if (!V6_OR_V4_INADDR_ANY(*laddr) &&
!V6_OR_V4_INADDR_ANY(
- ltcp->tcp_bound_source_v6) &&
+ lconnp->conn_bound_addr_v6) &&
!IN6_ARE_ADDR_EQUAL(laddr,
- &ltcp->tcp_bound_source_v6))
+ &lconnp->conn_bound_addr_v6))
continue;
if (ltcp->tcp_state >= TCPS_BOUND) {
/*
@@ -3327,7 +2892,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
* SO_REUSEADDR setting, so we break.
*/
if (IN6_ARE_ADDR_EQUAL(laddr,
- &ltcp->tcp_bound_source_v6) &&
+ &lconnp->conn_bound_addr_v6) &&
(ltcp->tcp_state == TCPS_LISTEN ||
ltcp->tcp_state == TCPS_BOUND))
break;
@@ -3343,11 +2908,10 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
* number.
*/
tcp->tcp_state = TCPS_BOUND;
- tcp->tcp_lport = htons(port);
- *(uint16_t *)tcp->tcp_tcph->th_lport = tcp->tcp_lport;
+ connp->conn_lport = htons(port);
ASSERT(&tcps->tcps_bind_fanout[TCP_BIND_HASH(
- tcp->tcp_lport)] == tbf);
+ connp->conn_lport)] == tbf);
tcp_bind_hash_insert(tbf, tcp, 1);
mutex_exit(&tbf->tf_lock);
@@ -3364,12 +2928,12 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
* is updated. After the update, it may or may not
* be in the valid range.
*/
- if (!tcp->tcp_anon_priv_bind)
+ if (!connp->conn_anon_priv_bind)
tcps->tcps_next_port_to_try = port + 1;
return (port);
}
- if (tcp->tcp_anon_priv_bind) {
+ if (connp->conn_anon_priv_bind) {
port = tcp_get_next_priv_port(tcp);
} else {
if (count == 0 && user_specified) {
@@ -3402,12 +2966,13 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
* tcp_clean_death / tcp_close_detached must not be called more than once
* on a tcp. Thus every function that potentially calls tcp_clean_death
* must check for the tcp state before calling tcp_clean_death.
- * Eg. tcp_input, tcp_rput_data, tcp_eager_kill, tcp_clean_death_wrapper,
+ * Eg. tcp_input_data, tcp_eager_kill, tcp_clean_death_wrapper,
* tcp_timer_handler, all check for the tcp state.
*/
/* ARGSUSED */
void
-tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2)
+tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2,
+ ip_recv_attr_t *dummy)
{
tcp_t *tcp = ((conn_t *)arg)->conn_tcp;
@@ -3449,11 +3014,11 @@ tcp_clean_death(tcp_t *tcp, int err, uint8_t tag)
}
ASSERT(tcp != NULL);
- ASSERT((tcp->tcp_family == AF_INET &&
- tcp->tcp_ipversion == IPV4_VERSION) ||
- (tcp->tcp_family == AF_INET6 &&
- (tcp->tcp_ipversion == IPV4_VERSION ||
- tcp->tcp_ipversion == IPV6_VERSION)));
+ ASSERT((connp->conn_family == AF_INET &&
+ connp->conn_ipversion == IPV4_VERSION) ||
+ (connp->conn_family == AF_INET6 &&
+ (connp->conn_ipversion == IPV4_VERSION ||
+ connp->conn_ipversion == IPV6_VERSION)));
if (TCP_IS_DETACHED(tcp)) {
if (tcp->tcp_hard_binding) {
@@ -3483,7 +3048,7 @@ tcp_clean_death(tcp_t *tcp, int err, uint8_t tag)
TCP_STAT(tcps, tcp_clean_death_nondetached);
- q = tcp->tcp_rq;
+ q = connp->conn_rq;
/* Trash all inbound data */
if (!IPCL_IS_NONSTR(connp)) {
@@ -3506,7 +3071,7 @@ tcp_clean_death(tcp_t *tcp, int err, uint8_t tag)
*/
(void) putnextctl1(q, M_FLUSH, FLUSHR);
}
- if (tcp->tcp_debug) {
+ if (connp->conn_debug) {
(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR,
"tcp_clean_death: discon err %d", err);
}
@@ -3519,7 +3084,7 @@ tcp_clean_death(tcp_t *tcp, int err, uint8_t tag)
if (mp != NULL) {
putnext(q, mp);
} else {
- if (tcp->tcp_debug) {
+ if (connp->conn_debug) {
(void) strlog(TCP_MOD_ID, 0, 1,
SL_ERROR|SL_TRACE,
"tcp_clean_death, sending M_ERROR");
@@ -3552,6 +3117,7 @@ tcp_stop_lingering(tcp_t *tcp)
{
clock_t delta = 0;
tcp_stack_t *tcps = tcp->tcp_tcps;
+ conn_t *connp = tcp->tcp_connp;
tcp->tcp_linger_tid = 0;
if (tcp->tcp_state > TCPS_LISTEN) {
@@ -3568,15 +3134,14 @@ tcp_stop_lingering(tcp_t *tcp)
}
/*
* Need to cancel those timers which will not be used when
- * TCP is detached. This has to be done before the tcp_wq
- * is set to the global queue.
+ * TCP is detached. This has to be done before the conn_wq
+ * is cleared.
*/
tcp_timers_stop(tcp);
tcp->tcp_detached = B_TRUE;
- ASSERT(tcps->tcps_g_q != NULL);
- tcp->tcp_rq = tcps->tcps_g_q;
- tcp->tcp_wq = WR(tcps->tcps_g_q);
+ connp->conn_rq = NULL;
+ connp->conn_wq = NULL;
if (tcp->tcp_state == TCPS_TIME_WAIT) {
tcp_time_wait_append(tcp);
@@ -3595,16 +3160,14 @@ tcp_stop_lingering(tcp_t *tcp)
}
} else {
tcp_closei_local(tcp);
- CONN_DEC_REF(tcp->tcp_connp);
+ CONN_DEC_REF(connp);
}
finish:
/* Signal closing thread that it can complete close */
mutex_enter(&tcp->tcp_closelock);
tcp->tcp_detached = B_TRUE;
- ASSERT(tcps->tcps_g_q != NULL);
-
- tcp->tcp_rq = tcps->tcps_g_q;
- tcp->tcp_wq = WR(tcps->tcps_g_q);
+ connp->conn_rq = NULL;
+ connp->conn_wq = NULL;
tcp->tcp_closed = 1;
cv_signal(&tcp->tcp_closecv);
@@ -3636,9 +3199,9 @@ tcp_close_common(conn_t *connp, int flags)
ASSERT(connp->conn_ref >= 2);
/*
- * Mark the conn as closing. ill_pending_mp_add will not
+ * Mark the conn as closing. ipsq_pending_mp_add will not
* add any mp to the pending mp list, after this conn has
- * started closing. Same for sq_pending_mp_add
+ * started closing.
*/
mutex_enter(&connp->conn_lock);
connp->conn_state_flags |= CONN_CLOSING;
@@ -3664,7 +3227,7 @@ tcp_close_common(conn_t *connp, int flags)
TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_close_output, connp,
- tcp_squeue_flag, SQTAG_IP_TCP_CLOSE);
+ NULL, tcp_squeue_flag, SQTAG_IP_TCP_CLOSE);
mutex_enter(&tcp->tcp_closelock);
while (!tcp->tcp_closed) {
@@ -3684,13 +3247,13 @@ tcp_close_common(conn_t *connp, int flags)
* thread is higher priority than the squeue worker
* thread and is bound to the same cpu.
*/
- if (tcp->tcp_linger && tcp->tcp_lingertime > 0) {
+ if (connp->conn_linger && connp->conn_lingertime > 0) {
mutex_exit(&tcp->tcp_closelock);
/* Entering squeue, bump ref count. */
CONN_INC_REF(connp);
bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL);
SQUEUE_ENTER_ONE(connp->conn_sqp, bp,
- tcp_linger_interrupted, connp,
+ tcp_linger_interrupted, connp, NULL,
tcp_squeue_flag, SQTAG_IP_TCP_CLOSE);
mutex_enter(&tcp->tcp_closelock);
}
@@ -3703,8 +3266,8 @@ tcp_close_common(conn_t *connp, int flags)
/*
* In the case of listener streams that have eagers in the q or q0
- * we wait for the eagers to drop their reference to us. tcp_rq and
- * tcp_wq of the eagers point to our queues. By waiting for the
+ * we wait for the eagers to drop their reference to us. conn_rq and
+ * conn_wq of the eagers point to our queues. By waiting for the
* refcnt to drop to 1, we are sure that the eagers have cleaned
* up their queue pointers and also dropped their references to us.
*/
@@ -3716,13 +3279,12 @@ tcp_close_common(conn_t *connp, int flags)
mutex_exit(&connp->conn_lock);
}
/*
- * ioctl cleanup. The mp is queued in the
- * ill_pending_mp or in the sq_pending_mp.
+ * ioctl cleanup. The mp is queued in the ipx_pending_mp.
*/
if (conn_ioctl_cleanup_reqd)
conn_ioctl_cleanup(connp);
- tcp->tcp_cpid = -1;
+ connp->conn_cpid = NOPID;
}
static int
@@ -3799,7 +3361,7 @@ tcp_tpi_close_accept(queue_t *q)
/* ARGSUSED */
static void
-tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2)
+tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
{
conn_t *connp = (conn_t *)arg;
tcp_t *tcp = connp->conn_tcp;
@@ -3828,7 +3390,7 @@ tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2)
/* ARGSUSED */
static void
-tcp_close_output(void *arg, mblk_t *mp, void *arg2)
+tcp_close_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
{
char *msg;
conn_t *connp = (conn_t *)arg;
@@ -3847,10 +3409,6 @@ tcp_close_output(void *arg, mblk_t *mp, void *arg2)
}
mutex_exit(&tcp->tcp_eager_lock);
- connp->conn_mdt_ok = B_FALSE;
- tcp->tcp_mdt = B_FALSE;
-
- connp->conn_lso_ok = B_FALSE;
tcp->tcp_lso = B_FALSE;
msg = NULL;
@@ -3879,12 +3437,11 @@ tcp_close_output(void *arg, mblk_t *mp, void *arg2)
* If SO_LINGER has set a zero linger time, abort the
* connection with a reset.
*/
- if (tcp->tcp_linger && tcp->tcp_lingertime == 0) {
+ if (connp->conn_linger && connp->conn_lingertime == 0) {
msg = "tcp_close, zero lingertime";
break;
}
- ASSERT(tcp->tcp_hard_bound || tcp->tcp_hard_binding);
/*
* Abort connection if there is unread data queued.
*/
@@ -3893,9 +3450,6 @@ tcp_close_output(void *arg, mblk_t *mp, void *arg2)
break;
}
/*
- * tcp_hard_bound is now cleared thus all packets go through
- * tcp_lookup. This fact is used by tcp_detach below.
- *
* We have done a qwait() above which could have possibly
* drained more messages in turn causing transition to a
* different state. Check whether we have to do the rest
@@ -3915,7 +3469,7 @@ tcp_close_output(void *arg, mblk_t *mp, void *arg2)
* If lingering on close then wait until the fin is acked,
* the SO_LINGER time passes, or a reset is sent/received.
*/
- if (tcp->tcp_linger && tcp->tcp_lingertime > 0 &&
+ if (connp->conn_linger && connp->conn_lingertime > 0 &&
!(tcp->tcp_fin_acked) &&
tcp->tcp_state >= TCPS_ESTABLISHED) {
if (tcp->tcp_closeflags & (FNDELAY|FNONBLOCK)) {
@@ -3926,7 +3480,7 @@ tcp_close_output(void *arg, mblk_t *mp, void *arg2)
tcp->tcp_linger_tid = TCP_TIMER(tcp,
tcp_close_linger_timeout,
- tcp->tcp_lingertime * hz);
+ connp->conn_lingertime * hz);
/* tcp_close_linger_timeout will finish close */
if (tcp->tcp_linger_tid == 0)
@@ -3944,8 +3498,8 @@ tcp_close_output(void *arg, mblk_t *mp, void *arg2)
}
/*
- * Make sure that no other thread will access the tcp_rq of
- * this instance (through lookups etc.) as tcp_rq will go
+ * Make sure that no other thread will access the conn_rq of
+ * this instance (through lookups etc.) as conn_rq will go
* away shortly.
*/
tcp_acceptor_hash_remove(tcp);
@@ -3962,8 +3516,8 @@ tcp_close_output(void *arg, mblk_t *mp, void *arg2)
}
/*
* Need to cancel those timers which will not be used when
- * TCP is detached. This has to be done before the tcp_wq
- * is set to the global queue.
+ * TCP is detached. This has to be done before the conn_wq
+ * is set to NULL.
*/
tcp_timers_stop(tcp);
@@ -4004,18 +3558,6 @@ tcp_close_output(void *arg, mblk_t *mp, void *arg2)
ASSERT(connp->conn_ref >= 2);
finish:
- /*
- * Although packets are always processed on the correct
- * tcp's perimeter and access is serialized via squeue's,
- * IP still needs a queue when sending packets in time_wait
- * state so use WR(tcps_g_q) till ip_output() can be
- * changed to deal with just connp. For read side, we
- * could have set tcp_rq to NULL but there are some cases
- * in tcp_rput_data() from early days of this code which
- * do a putnext without checking if tcp is closed. Those
- * need to be identified before both tcp_rq and tcp_wq
- * can be set to NULL and tcps_g_q can disappear forever.
- */
mutex_enter(&tcp->tcp_closelock);
/*
* Don't change the queues in the case of a listener that has
@@ -4024,13 +3566,8 @@ finish:
*/
if (!tcp->tcp_wait_for_eagers) {
tcp->tcp_detached = B_TRUE;
- /*
- * When default queue is closing we set tcps_g_q to NULL
- * after the close is done.
- */
- ASSERT(tcps->tcps_g_q != NULL);
- tcp->tcp_rq = tcps->tcps_g_q;
- tcp->tcp_wq = WR(tcps->tcps_g_q);
+ connp->conn_rq = NULL;
+ connp->conn_wq = NULL;
}
/* Signal tcp_close() to finish closing. */
@@ -4112,8 +3649,7 @@ tcp_timers_stop(tcp_t *tcp)
static void
tcp_closei_local(tcp_t *tcp)
{
- ire_t *ire;
- conn_t *connp = tcp->tcp_connp;
+ conn_t *connp = tcp->tcp_connp;
tcp_stack_t *tcps = tcp->tcp_tcps;
if (!TCP_IS_SOCKET(tcp))
@@ -4138,7 +3674,7 @@ tcp_closei_local(tcp_t *tcp)
* this point, eager will be closed but we
* leave it in listeners eager list so that
* if listener decides to close without doing
- * accept, we can clean this up. In tcp_wput_accept
+ * accept, we can clean this up. In tcp_tli_accept
* we take care of the case of accept on closed
* eager.
*/
@@ -4150,9 +3686,9 @@ tcp_closei_local(tcp_t *tcp)
* listener queue, after we have released our
* reference on the listener
*/
- ASSERT(tcps->tcps_g_q != NULL);
- tcp->tcp_rq = tcps->tcps_g_q;
- tcp->tcp_wq = WR(tcps->tcps_g_q);
+ ASSERT(tcp->tcp_detached);
+ connp->conn_rq = NULL;
+ connp->conn_wq = NULL;
CONN_DEC_REF(listener->tcp_connp);
} else {
mutex_exit(&listener->tcp_eager_lock);
@@ -4185,20 +3721,16 @@ tcp_closei_local(tcp_t *tcp)
*/
if (tcp->tcp_state == TCPS_TIME_WAIT)
(void) tcp_time_wait_remove(tcp, NULL);
- CL_INET_DISCONNECT(connp, tcp);
+ CL_INET_DISCONNECT(connp);
ipcl_hash_remove(connp);
+ ixa_cleanup(connp->conn_ixa);
/*
- * Delete the cached ire in conn_ire_cache and also mark
- * the conn as CONDEMNED
+ * Mark the conn as CONDEMNED
*/
mutex_enter(&connp->conn_lock);
connp->conn_state_flags |= CONN_CONDEMNED;
- ire = connp->conn_ire_cache;
- connp->conn_ire_cache = NULL;
mutex_exit(&connp->conn_lock);
- if (ire != NULL)
- IRE_REFRELE_NOTR(ire);
/* Need to cleanup any pending ioctls */
ASSERT(tcp->tcp_time_wait_next == NULL);
@@ -4227,14 +3759,14 @@ tcp_closei_local(tcp_t *tcp)
void
tcp_free(tcp_t *tcp)
{
- mblk_t *mp;
- ip6_pkt_t *ipp;
+ mblk_t *mp;
+ conn_t *connp = tcp->tcp_connp;
ASSERT(tcp != NULL);
ASSERT(tcp->tcp_ptpahn == NULL && tcp->tcp_acceptor_hash == NULL);
- tcp->tcp_rq = NULL;
- tcp->tcp_wq = NULL;
+ connp->conn_rq = NULL;
+ connp->conn_wq = NULL;
tcp_close_mpp(&tcp->tcp_xmit_head);
tcp_close_mpp(&tcp->tcp_reass_head);
@@ -4281,12 +3813,12 @@ tcp_free(tcp_t *tcp)
tcp->tcp_dstoptslen = 0;
}
ASSERT(tcp->tcp_dstoptslen == 0);
- if (tcp->tcp_rtdstopts != NULL) {
- mi_free(tcp->tcp_rtdstopts);
- tcp->tcp_rtdstopts = NULL;
- tcp->tcp_rtdstoptslen = 0;
+ if (tcp->tcp_rthdrdstopts != NULL) {
+ mi_free(tcp->tcp_rthdrdstopts);
+ tcp->tcp_rthdrdstopts = NULL;
+ tcp->tcp_rthdrdstoptslen = 0;
}
- ASSERT(tcp->tcp_rtdstoptslen == 0);
+ ASSERT(tcp->tcp_rthdrdstoptslen == 0);
if (tcp->tcp_rthdr != NULL) {
mi_free(tcp->tcp_rthdr);
tcp->tcp_rthdr = NULL;
@@ -4294,18 +3826,6 @@ tcp_free(tcp_t *tcp)
}
ASSERT(tcp->tcp_rthdrlen == 0);
- ipp = &tcp->tcp_sticky_ipp;
- if (ipp->ipp_fields & (IPPF_HOPOPTS | IPPF_RTDSTOPTS | IPPF_DSTOPTS |
- IPPF_RTHDR))
- ip6_pkt_free(ipp);
-
- /*
- * Free memory associated with the tcp/ip header template.
- */
-
- if (tcp->tcp_iphc != NULL)
- bzero(tcp->tcp_iphc, tcp->tcp_iphc_len);
-
/*
* Following is really a blowing away a union.
* It happens to have exactly two members of identical size
@@ -4317,17 +3837,19 @@ tcp_free(tcp_t *tcp)
/*
* Put a connection confirmation message upstream built from the
- * address information within 'iph' and 'tcph'. Report our success or failure.
+ * address/flowid information with the conn and iph. Report our success or
+ * failure.
*/
static boolean_t
-tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph, mblk_t *idmp,
- mblk_t **defermp)
+tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, mblk_t *idmp,
+ mblk_t **defermp, ip_recv_attr_t *ira)
{
sin_t sin;
sin6_t sin6;
mblk_t *mp;
char *optp = NULL;
int optlen = 0;
+ conn_t *connp = tcp->tcp_connp;
if (defermp != NULL)
*defermp = NULL;
@@ -4352,20 +3874,19 @@ tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph, mblk_t *idmp,
}
if (IPH_HDR_VERSION(iphdr) == IPV4_VERSION) {
- ipha_t *ipha = (ipha_t *)iphdr;
/* packet is IPv4 */
- if (tcp->tcp_family == AF_INET) {
+ if (connp->conn_family == AF_INET) {
sin = sin_null;
- sin.sin_addr.s_addr = ipha->ipha_src;
- sin.sin_port = *(uint16_t *)tcph->th_lport;
+ sin.sin_addr.s_addr = connp->conn_faddr_v4;
+ sin.sin_port = connp->conn_fport;
sin.sin_family = AF_INET;
mp = mi_tpi_conn_con(NULL, (char *)&sin,
(int)sizeof (sin_t), optp, optlen);
} else {
sin6 = sin6_null;
- IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &sin6.sin6_addr);
- sin6.sin6_port = *(uint16_t *)tcph->th_lport;
+ sin6.sin6_addr = connp->conn_faddr_v6;
+ sin6.sin6_port = connp->conn_fport;
sin6.sin6_family = AF_INET6;
mp = mi_tpi_conn_con(NULL, (char *)&sin6,
(int)sizeof (sin6_t), optp, optlen);
@@ -4375,10 +3896,10 @@ tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph, mblk_t *idmp,
ip6_t *ip6h = (ip6_t *)iphdr;
ASSERT(IPH_HDR_VERSION(iphdr) == IPV6_VERSION);
- ASSERT(tcp->tcp_family == AF_INET6);
+ ASSERT(connp->conn_family == AF_INET6);
sin6 = sin6_null;
- sin6.sin6_addr = ip6h->ip6_src;
- sin6.sin6_port = *(uint16_t *)tcph->th_lport;
+ sin6.sin6_addr = connp->conn_faddr_v6;
+ sin6.sin6_port = connp->conn_fport;
sin6.sin6_family = AF_INET6;
sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
mp = mi_tpi_conn_con(NULL, (char *)&sin6,
@@ -4393,16 +3914,16 @@ tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph, mblk_t *idmp,
if (defermp == NULL) {
conn_t *connp = tcp->tcp_connp;
if (IPCL_IS_NONSTR(connp)) {
- cred_t *cr;
- pid_t cpid;
-
- cr = msg_getcred(mp, &cpid);
(*connp->conn_upcalls->su_connected)
- (connp->conn_upper_handle, tcp->tcp_connid, cr,
- cpid);
+ (connp->conn_upper_handle, tcp->tcp_connid,
+ ira->ira_cred, ira->ira_cpid);
freemsg(mp);
} else {
- putnext(tcp->tcp_rq, mp);
+ if (ira->ira_cred != NULL) {
+ /* So that getpeerucred works for TPI sockfs */
+ mblk_setcred(mp, ira->ira_cred, ira->ira_cpid);
+ }
+ putnext(connp->conn_rq, mp);
}
} else {
*defermp = mp;
@@ -4456,7 +3977,7 @@ tcp_drop_q0(tcp_t *tcp)
*/
MAKE_UNDROPPABLE(eager);
- if (tcp->tcp_debug) {
+ if (tcp->tcp_connp->conn_debug) {
(void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE,
"tcp_drop_q0: listen half-open queue (max=%d) overflow"
" (%d pending) on %s, drop one", tcps->tcps_conn_req_max_q0,
@@ -4469,18 +3990,19 @@ tcp_drop_q0(tcp_t *tcp)
/* Put a reference on the conn as we are enqueueing it in the sqeue */
CONN_INC_REF(eager->tcp_connp);
- /* Mark the IRE created for this SYN request temporary */
- tcp_ip_ire_mark_advice(eager);
SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp,
- tcp_clean_death_wrapper, eager->tcp_connp,
+ tcp_clean_death_wrapper, eager->tcp_connp, NULL,
SQ_FILL, SQTAG_TCP_DROP_Q0);
return (B_TRUE);
}
-int
+/*
+ * Handle a SYN on an AF_INET6 socket; can be either IPv4 or IPv6
+ */
+static mblk_t *
tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp,
- tcph_t *tcph, uint_t ipvers, mblk_t *idmp)
+ ip_recv_attr_t *ira)
{
tcp_t *ltcp = lconnp->conn_tcp;
tcp_t *tcp = connp->conn_tcp;
@@ -4488,36 +4010,30 @@ tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp,
ipha_t *ipha;
ip6_t *ip6h;
sin6_t sin6;
- in6_addr_t v6dst;
- int err;
- int ifindex = 0;
+ uint_t ifindex = ira->ira_ruifindex;
tcp_stack_t *tcps = tcp->tcp_tcps;
- if (ipvers == IPV4_VERSION) {
+ if (ira->ira_flags & IRAF_IS_IPV4) {
ipha = (ipha_t *)mp->b_rptr;
- connp->conn_send = ip_output;
- connp->conn_recv = tcp_input;
-
- IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst,
- &connp->conn_bound_source_v6);
- IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_srcv6);
- IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_remv6);
+ connp->conn_ipversion = IPV4_VERSION;
+ IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_laddr_v6);
+ IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_faddr_v6);
+ connp->conn_saddr_v6 = connp->conn_laddr_v6;
sin6 = sin6_null;
- IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &sin6.sin6_addr);
- IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst);
- sin6.sin6_port = *(uint16_t *)tcph->th_lport;
+ sin6.sin6_addr = connp->conn_faddr_v6;
+ sin6.sin6_port = connp->conn_fport;
sin6.sin6_family = AF_INET6;
- sin6.__sin6_src_id = ip_srcid_find_addr(&v6dst,
- lconnp->conn_zoneid, tcps->tcps_netstack);
- if (tcp->tcp_recvdstaddr) {
+ sin6.__sin6_src_id = ip_srcid_find_addr(&connp->conn_laddr_v6,
+ IPCL_ZONEID(lconnp), tcps->tcps_netstack);
+
+ if (connp->conn_recv_ancillary.crb_recvdstaddr) {
sin6_t sin6d;
sin6d = sin6_null;
- IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst,
- &sin6d.sin6_addr);
- sin6d.sin6_port = *(uint16_t *)tcph->th_fport;
+ sin6d.sin6_addr = connp->conn_laddr_v6;
+ sin6d.sin6_port = connp->conn_lport;
sin6d.sin6_family = AF_INET;
tpi_mp = mi_tpi_extconn_ind(NULL,
(char *)&sin6d, sizeof (sin6_t),
@@ -4534,24 +4050,18 @@ tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp,
} else {
ip6h = (ip6_t *)mp->b_rptr;
- connp->conn_send = ip_output_v6;
- connp->conn_recv = tcp_input;
-
- connp->conn_bound_source_v6 = ip6h->ip6_dst;
- connp->conn_srcv6 = ip6h->ip6_dst;
- connp->conn_remv6 = ip6h->ip6_src;
-
- /* db_cksumstuff is set at ip_fanout_tcp_v6 */
- ifindex = (int)DB_CKSUMSTUFF(mp);
- DB_CKSUMSTUFF(mp) = 0;
+ connp->conn_ipversion = IPV6_VERSION;
+ connp->conn_laddr_v6 = ip6h->ip6_dst;
+ connp->conn_faddr_v6 = ip6h->ip6_src;
+ connp->conn_saddr_v6 = connp->conn_laddr_v6;
sin6 = sin6_null;
- sin6.sin6_addr = ip6h->ip6_src;
- sin6.sin6_port = *(uint16_t *)tcph->th_lport;
+ sin6.sin6_addr = connp->conn_faddr_v6;
+ sin6.sin6_port = connp->conn_fport;
sin6.sin6_family = AF_INET6;
sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
- sin6.__sin6_src_id = ip_srcid_find_addr(&ip6h->ip6_dst,
- lconnp->conn_zoneid, tcps->tcps_netstack);
+ sin6.__sin6_src_id = ip_srcid_find_addr(&connp->conn_laddr_v6,
+ IPCL_ZONEID(lconnp), tcps->tcps_netstack);
if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) {
/* Pass up the scope_id of remote addr */
@@ -4559,13 +4069,16 @@ tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp,
} else {
sin6.sin6_scope_id = 0;
}
- if (tcp->tcp_recvdstaddr) {
+ if (connp->conn_recv_ancillary.crb_recvdstaddr) {
sin6_t sin6d;
sin6d = sin6_null;
- sin6.sin6_addr = ip6h->ip6_dst;
- sin6d.sin6_port = *(uint16_t *)tcph->th_fport;
- sin6d.sin6_family = AF_INET;
+ sin6.sin6_addr = connp->conn_laddr_v6;
+ sin6d.sin6_port = connp->conn_lport;
+ sin6d.sin6_family = AF_INET6;
+ if (IN6_IS_ADDR_LINKSCOPE(&connp->conn_laddr_v6))
+ sin6d.sin6_scope_id = ifindex;
+
tpi_mp = mi_tpi_extconn_ind(NULL,
(char *)&sin6d, sizeof (sin6_t),
(char *)&tcp, (t_scalar_t)sizeof (intptr_t),
@@ -4579,194 +4092,40 @@ tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp,
}
}
- if (tpi_mp == NULL)
- return (ENOMEM);
-
- connp->conn_fport = *(uint16_t *)tcph->th_lport;
- connp->conn_lport = *(uint16_t *)tcph->th_fport;
- connp->conn_flags |= (IPCL_TCP6|IPCL_EAGER);
- connp->conn_fully_bound = B_FALSE;
-
- /* Inherit information from the "parent" */
- tcp->tcp_ipversion = ltcp->tcp_ipversion;
- tcp->tcp_family = ltcp->tcp_family;
-
- tcp->tcp_wq = ltcp->tcp_wq;
- tcp->tcp_rq = ltcp->tcp_rq;
-
tcp->tcp_mss = tcps->tcps_mss_def_ipv6;
- tcp->tcp_detached = B_TRUE;
- SOCK_CONNID_INIT(tcp->tcp_connid);
- if ((err = tcp_init_values(tcp)) != 0) {
- freemsg(tpi_mp);
- return (err);
- }
-
- if (ipvers == IPV4_VERSION) {
- if ((err = tcp_header_init_ipv4(tcp)) != 0) {
- freemsg(tpi_mp);
- return (err);
- }
- ASSERT(tcp->tcp_ipha != NULL);
- } else {
- /* ifindex must be already set */
- ASSERT(ifindex != 0);
-
- if (ltcp->tcp_bound_if != 0)
- tcp->tcp_bound_if = ltcp->tcp_bound_if;
- else if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src))
- tcp->tcp_bound_if = ifindex;
-
- tcp->tcp_ipv6_recvancillary = ltcp->tcp_ipv6_recvancillary;
- tcp->tcp_recvifindex = 0;
- tcp->tcp_recvhops = 0xffffffffU;
- ASSERT(tcp->tcp_ip6h != NULL);
- }
-
- tcp->tcp_lport = ltcp->tcp_lport;
-
- if (ltcp->tcp_ipversion == tcp->tcp_ipversion) {
- if (tcp->tcp_iphc_len != ltcp->tcp_iphc_len) {
- /*
- * Listener had options of some sort; eager inherits.
- * Free up the eager template and allocate one
- * of the right size.
- */
- if (tcp->tcp_hdr_grown) {
- kmem_free(tcp->tcp_iphc, tcp->tcp_iphc_len);
- } else {
- bzero(tcp->tcp_iphc, tcp->tcp_iphc_len);
- kmem_cache_free(tcp_iphc_cache, tcp->tcp_iphc);
- }
- tcp->tcp_iphc = kmem_zalloc(ltcp->tcp_iphc_len,
- KM_NOSLEEP);
- if (tcp->tcp_iphc == NULL) {
- tcp->tcp_iphc_len = 0;
- freemsg(tpi_mp);
- return (ENOMEM);
- }
- tcp->tcp_iphc_len = ltcp->tcp_iphc_len;
- tcp->tcp_hdr_grown = B_TRUE;
- }
- tcp->tcp_hdr_len = ltcp->tcp_hdr_len;
- tcp->tcp_ip_hdr_len = ltcp->tcp_ip_hdr_len;
- tcp->tcp_tcp_hdr_len = ltcp->tcp_tcp_hdr_len;
- tcp->tcp_ip6_hops = ltcp->tcp_ip6_hops;
- tcp->tcp_ip6_vcf = ltcp->tcp_ip6_vcf;
-
- /*
- * Copy the IP+TCP header template from listener to eager
- */
- bcopy(ltcp->tcp_iphc, tcp->tcp_iphc, ltcp->tcp_hdr_len);
- if (tcp->tcp_ipversion == IPV6_VERSION) {
- if (((ip6i_t *)(tcp->tcp_iphc))->ip6i_nxt ==
- IPPROTO_RAW) {
- tcp->tcp_ip6h =
- (ip6_t *)(tcp->tcp_iphc +
- sizeof (ip6i_t));
- } else {
- tcp->tcp_ip6h =
- (ip6_t *)(tcp->tcp_iphc);
- }
- tcp->tcp_ipha = NULL;
- } else {
- tcp->tcp_ipha = (ipha_t *)tcp->tcp_iphc;
- tcp->tcp_ip6h = NULL;
- }
- tcp->tcp_tcph = (tcph_t *)(tcp->tcp_iphc +
- tcp->tcp_ip_hdr_len);
- } else {
- /*
- * only valid case when ipversion of listener and
- * eager differ is when listener is IPv6 and
- * eager is IPv4.
- * Eager header template has been initialized to the
- * maximum v4 header sizes, which includes space for
- * TCP and IP options.
- */
- ASSERT((ltcp->tcp_ipversion == IPV6_VERSION) &&
- (tcp->tcp_ipversion == IPV4_VERSION));
- ASSERT(tcp->tcp_iphc_len >=
- TCP_MAX_COMBINED_HEADER_LENGTH);
- tcp->tcp_tcp_hdr_len = ltcp->tcp_tcp_hdr_len;
- /* copy IP header fields individually */
- tcp->tcp_ipha->ipha_ttl =
- ltcp->tcp_ip6h->ip6_hops;
- bcopy(ltcp->tcp_tcph->th_lport,
- tcp->tcp_tcph->th_lport, sizeof (ushort_t));
- }
-
- bcopy(tcph->th_lport, tcp->tcp_tcph->th_fport, sizeof (in_port_t));
- bcopy(tcp->tcp_tcph->th_fport, &tcp->tcp_fport,
- sizeof (in_port_t));
-
- if (ltcp->tcp_lport == 0) {
- tcp->tcp_lport = *(in_port_t *)tcph->th_fport;
- bcopy(tcph->th_fport, tcp->tcp_tcph->th_lport,
- sizeof (in_port_t));
- }
-
- if (tcp->tcp_ipversion == IPV4_VERSION) {
- ASSERT(ipha != NULL);
- tcp->tcp_ipha->ipha_dst = ipha->ipha_src;
- tcp->tcp_ipha->ipha_src = ipha->ipha_dst;
-
- /* Source routing option copyover (reverse it) */
- if (tcps->tcps_rev_src_routes)
- tcp_opt_reverse(tcp, ipha);
- } else {
- ASSERT(ip6h != NULL);
- tcp->tcp_ip6h->ip6_dst = ip6h->ip6_src;
- tcp->tcp_ip6h->ip6_src = ip6h->ip6_dst;
- }
-
- ASSERT(tcp->tcp_conn.tcp_eager_conn_ind == NULL);
- ASSERT(!tcp->tcp_tconnind_started);
- /*
- * If the SYN contains a credential, it's a loopback packet; attach
- * the credential to the TPI message.
- */
- mblk_copycred(tpi_mp, idmp);
-
- tcp->tcp_conn.tcp_eager_conn_ind = tpi_mp;
-
- /* Inherit the listener's SSL protection state */
-
- if ((tcp->tcp_kssl_ent = ltcp->tcp_kssl_ent) != NULL) {
- kssl_hold_ent(tcp->tcp_kssl_ent);
- tcp->tcp_kssl_pending = B_TRUE;
- }
-
- /* Inherit the listener's non-STREAMS flag */
- if (IPCL_IS_NONSTR(lconnp)) {
- connp->conn_flags |= IPCL_NONSTR;
- }
-
- return (0);
+ return (tpi_mp);
}
-
-int
-tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, ipha_t *ipha,
- tcph_t *tcph, mblk_t *idmp)
+/* Handle a SYN on an AF_INET socket */
+mblk_t *
+tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, mblk_t *mp,
+ ip_recv_attr_t *ira)
{
tcp_t *ltcp = lconnp->conn_tcp;
tcp_t *tcp = connp->conn_tcp;
sin_t sin;
mblk_t *tpi_mp = NULL;
- int err;
tcp_stack_t *tcps = tcp->tcp_tcps;
+ ipha_t *ipha;
+
+ ASSERT(ira->ira_flags & IRAF_IS_IPV4);
+ ipha = (ipha_t *)mp->b_rptr;
+
+ connp->conn_ipversion = IPV4_VERSION;
+ IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_laddr_v6);
+ IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_faddr_v6);
+ connp->conn_saddr_v6 = connp->conn_laddr_v6;
sin = sin_null;
- sin.sin_addr.s_addr = ipha->ipha_src;
- sin.sin_port = *(uint16_t *)tcph->th_lport;
+ sin.sin_addr.s_addr = connp->conn_faddr_v4;
+ sin.sin_port = connp->conn_fport;
sin.sin_family = AF_INET;
- if (ltcp->tcp_recvdstaddr) {
+ if (lconnp->conn_recv_ancillary.crb_recvdstaddr) {
sin_t sind;
sind = sin_null;
- sind.sin_addr.s_addr = ipha->ipha_dst;
- sind.sin_port = *(uint16_t *)tcph->th_fport;
+ sind.sin_addr.s_addr = connp->conn_laddr_v4;
+ sind.sin_port = connp->conn_lport;
sind.sin_family = AF_INET;
tpi_mp = mi_tpi_extconn_ind(NULL,
(char *)&sind, sizeof (sin_t), (char *)&tcp,
@@ -4779,214 +4138,8 @@ tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, ipha_t *ipha,
(t_scalar_t)ltcp->tcp_conn_req_seqnum);
}
- if (tpi_mp == NULL) {
- return (ENOMEM);
- }
-
- connp->conn_flags |= (IPCL_TCP4|IPCL_EAGER);
- connp->conn_send = ip_output;
- connp->conn_recv = tcp_input;
- connp->conn_fully_bound = B_FALSE;
-
- IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_bound_source_v6);
- IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_srcv6);
- IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_remv6);
- connp->conn_fport = *(uint16_t *)tcph->th_lport;
- connp->conn_lport = *(uint16_t *)tcph->th_fport;
-
- /* Inherit information from the "parent" */
- tcp->tcp_ipversion = ltcp->tcp_ipversion;
- tcp->tcp_family = ltcp->tcp_family;
- tcp->tcp_wq = ltcp->tcp_wq;
- tcp->tcp_rq = ltcp->tcp_rq;
tcp->tcp_mss = tcps->tcps_mss_def_ipv4;
- tcp->tcp_detached = B_TRUE;
- SOCK_CONNID_INIT(tcp->tcp_connid);
- if ((err = tcp_init_values(tcp)) != 0) {
- freemsg(tpi_mp);
- return (err);
- }
-
- /*
- * Let's make sure that eager tcp template has enough space to
- * copy IPv4 listener's tcp template. Since the conn_t structure is
- * preserved and tcp_iphc_len is also preserved, an eager conn_t may
- * have a tcp_template of total len TCP_MAX_COMBINED_HEADER_LENGTH or
- * more (in case of re-allocation of conn_t with tcp-IPv6 template with
- * extension headers or with ip6i_t struct). Note that bcopy() below
- * copies listener tcp's hdr_len which cannot be greater than TCP_MAX_
- * COMBINED_HEADER_LENGTH as this listener must be a IPv4 listener.
- */
- ASSERT(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH);
- ASSERT(ltcp->tcp_hdr_len <= TCP_MAX_COMBINED_HEADER_LENGTH);
-
- tcp->tcp_hdr_len = ltcp->tcp_hdr_len;
- tcp->tcp_ip_hdr_len = ltcp->tcp_ip_hdr_len;
- tcp->tcp_tcp_hdr_len = ltcp->tcp_tcp_hdr_len;
- tcp->tcp_ttl = ltcp->tcp_ttl;
- tcp->tcp_tos = ltcp->tcp_tos;
-
- /* Copy the IP+TCP header template from listener to eager */
- bcopy(ltcp->tcp_iphc, tcp->tcp_iphc, ltcp->tcp_hdr_len);
- tcp->tcp_ipha = (ipha_t *)tcp->tcp_iphc;
- tcp->tcp_ip6h = NULL;
- tcp->tcp_tcph = (tcph_t *)(tcp->tcp_iphc +
- tcp->tcp_ip_hdr_len);
-
- /* Initialize the IP addresses and Ports */
- tcp->tcp_ipha->ipha_dst = ipha->ipha_src;
- tcp->tcp_ipha->ipha_src = ipha->ipha_dst;
- bcopy(tcph->th_lport, tcp->tcp_tcph->th_fport, sizeof (in_port_t));
- bcopy(tcph->th_fport, tcp->tcp_tcph->th_lport, sizeof (in_port_t));
-
- /* Source routing option copyover (reverse it) */
- if (tcps->tcps_rev_src_routes)
- tcp_opt_reverse(tcp, ipha);
-
- ASSERT(tcp->tcp_conn.tcp_eager_conn_ind == NULL);
- ASSERT(!tcp->tcp_tconnind_started);
-
- /*
- * If the SYN contains a credential, it's a loopback packet; attach
- * the credential to the TPI message.
- */
- mblk_copycred(tpi_mp, idmp);
-
- tcp->tcp_conn.tcp_eager_conn_ind = tpi_mp;
-
- /* Inherit the listener's SSL protection state */
- if ((tcp->tcp_kssl_ent = ltcp->tcp_kssl_ent) != NULL) {
- kssl_hold_ent(tcp->tcp_kssl_ent);
- tcp->tcp_kssl_pending = B_TRUE;
- }
-
- /* Inherit the listener's non-STREAMS flag */
- if (IPCL_IS_NONSTR(lconnp)) {
- connp->conn_flags |= IPCL_NONSTR;
- }
-
- return (0);
-}
-
-/*
- * sets up conn for ipsec.
- * if the first mblk is M_CTL it is consumed and mpp is updated.
- * in case of error mpp is freed.
- */
-conn_t *
-tcp_get_ipsec_conn(tcp_t *tcp, squeue_t *sqp, mblk_t **mpp)
-{
- conn_t *connp = tcp->tcp_connp;
- conn_t *econnp;
- squeue_t *new_sqp;
- mblk_t *first_mp = *mpp;
- mblk_t *mp = *mpp;
- boolean_t mctl_present = B_FALSE;
- uint_t ipvers;
-
- econnp = tcp_get_conn(sqp, tcp->tcp_tcps);
- if (econnp == NULL) {
- freemsg(first_mp);
- return (NULL);
- }
- if (DB_TYPE(mp) == M_CTL) {
- if (mp->b_cont == NULL ||
- mp->b_cont->b_datap->db_type != M_DATA) {
- freemsg(first_mp);
- return (NULL);
- }
- mp = mp->b_cont;
- if ((mp->b_datap->db_struioflag & STRUIO_EAGER) == 0) {
- freemsg(first_mp);
- return (NULL);
- }
-
- mp->b_datap->db_struioflag &= ~STRUIO_EAGER;
- first_mp->b_datap->db_struioflag &= ~STRUIO_POLICY;
- mctl_present = B_TRUE;
- } else {
- ASSERT(mp->b_datap->db_struioflag & STRUIO_POLICY);
- mp->b_datap->db_struioflag &= ~STRUIO_POLICY;
- }
-
- new_sqp = (squeue_t *)DB_CKSUMSTART(mp);
- DB_CKSUMSTART(mp) = 0;
-
- ASSERT(OK_32PTR(mp->b_rptr));
- ipvers = IPH_HDR_VERSION(mp->b_rptr);
- if (ipvers == IPV4_VERSION) {
- uint16_t *up;
- uint32_t ports;
- ipha_t *ipha;
-
- ipha = (ipha_t *)mp->b_rptr;
- up = (uint16_t *)((uchar_t *)ipha +
- IPH_HDR_LENGTH(ipha) + TCP_PORTS_OFFSET);
- ports = *(uint32_t *)up;
- IPCL_TCP_EAGER_INIT(econnp, IPPROTO_TCP,
- ipha->ipha_dst, ipha->ipha_src, ports);
- } else {
- uint16_t *up;
- uint32_t ports;
- uint16_t ip_hdr_len;
- uint8_t *nexthdrp;
- ip6_t *ip6h;
- tcph_t *tcph;
-
- ip6h = (ip6_t *)mp->b_rptr;
- if (ip6h->ip6_nxt == IPPROTO_TCP) {
- ip_hdr_len = IPV6_HDR_LEN;
- } else if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &ip_hdr_len,
- &nexthdrp) || *nexthdrp != IPPROTO_TCP) {
- CONN_DEC_REF(econnp);
- freemsg(first_mp);
- return (NULL);
- }
- tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len];
- up = (uint16_t *)tcph->th_lport;
- ports = *(uint32_t *)up;
- IPCL_TCP_EAGER_INIT_V6(econnp, IPPROTO_TCP,
- ip6h->ip6_dst, ip6h->ip6_src, ports);
- }
-
- /*
- * The caller already ensured that there is a sqp present.
- */
- econnp->conn_sqp = new_sqp;
- econnp->conn_initial_sqp = new_sqp;
-
- if (connp->conn_policy != NULL) {
- ipsec_in_t *ii;
- ii = (ipsec_in_t *)(first_mp->b_rptr);
- ASSERT(ii->ipsec_in_policy == NULL);
- IPPH_REFHOLD(connp->conn_policy);
- ii->ipsec_in_policy = connp->conn_policy;
-
- first_mp->b_datap->db_type = IPSEC_POLICY_SET;
- if (!ip_bind_ipsec_policy_set(econnp, first_mp)) {
- CONN_DEC_REF(econnp);
- freemsg(first_mp);
- return (NULL);
- }
- }
-
- if (ipsec_conn_cache_policy(econnp, ipvers == IPV4_VERSION) != 0) {
- CONN_DEC_REF(econnp);
- freemsg(first_mp);
- return (NULL);
- }
-
- /*
- * If we know we have some policy, pass the "IPSEC"
- * options size TCP uses this adjust the MSS.
- */
- econnp->conn_tcp->tcp_ipsec_overhead = conn_ipsec_length(econnp);
- if (mctl_present) {
- freeb(first_mp);
- *mpp = mp;
- }
-
- return (econnp);
+ return (tpi_mp);
}
/*
@@ -5002,10 +4155,8 @@ tcp_get_ipsec_conn(tcp_t *tcp, squeue_t *sqp, mblk_t **mpp)
* connection sitting in the freelist. Obviously, this buys us
* performance.
*
- * 2) Defence against DOS attack. Allocating a tcp/conn in tcp_conn_request
- * has multiple disadvantages - tying up the squeue during alloc, and the
- * fact that IPSec policy initialization has to happen here which
- * requires us sending a M_CTL and checking for it i.e. real ugliness.
+ * 2) Defence against DOS attack. Allocating a tcp/conn in tcp_input_listener
+ * has multiple disadvantages - tying up the squeue during alloc.
* But allocating the conn/tcp in IP land is also not the best since
* we can't check the 'q' and 'q0' which are protected by squeue and
* blindly allocate memory which might have to be freed here if we are
@@ -5050,9 +4201,15 @@ tcp_get_conn(void *arg, tcp_stack_t *tcps)
ns = tcps->tcps_netstack;
netstack_hold(ns);
connp->conn_netstack = ns;
+ connp->conn_ixa->ixa_ipst = ns->netstack_ip;
tcp->tcp_tcps = tcps;
- TCPS_REFHOLD(tcps);
ipcl_globalhash_insert(connp);
+
+ connp->conn_ixa->ixa_notify_cookie = tcp;
+ ASSERT(connp->conn_ixa->ixa_notify == tcp_notify);
+ connp->conn_recv = tcp_input_data;
+ ASSERT(connp->conn_recvicmp == tcp_icmp_input);
+ ASSERT(connp->conn_verifyicmp == tcp_verifyicmp);
return ((void *)connp);
}
mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
@@ -5075,62 +4232,20 @@ tcp_get_conn(void *arg, tcp_stack_t *tcps)
mutex_init(&tcp->tcp_rsrv_mp_lock, NULL, MUTEX_DEFAULT, NULL);
tcp->tcp_tcps = tcps;
- TCPS_REFHOLD(tcps);
- return ((void *)connp);
-}
+ connp->conn_recv = tcp_input_data;
+ connp->conn_recvicmp = tcp_icmp_input;
+ connp->conn_verifyicmp = tcp_verifyicmp;
-/*
- * Update the cached label for the given tcp_t. This should be called once per
- * connection, and before any packets are sent or tcp_process_options is
- * invoked. Returns B_FALSE if the correct label could not be constructed.
- */
-static boolean_t
-tcp_update_label(tcp_t *tcp, const cred_t *cr)
-{
- conn_t *connp = tcp->tcp_connp;
-
- if (tcp->tcp_ipversion == IPV4_VERSION) {
- uchar_t optbuf[IP_MAX_OPT_LENGTH];
- int added;
-
- if (tsol_compute_label(cr, tcp->tcp_remote, optbuf,
- tcp->tcp_tcps->tcps_netstack->netstack_ip) != 0)
- return (B_FALSE);
-
- added = tsol_remove_secopt(tcp->tcp_ipha, tcp->tcp_hdr_len);
- if (added == -1)
- return (B_FALSE);
- tcp->tcp_hdr_len += added;
- tcp->tcp_tcph = (tcph_t *)((uchar_t *)tcp->tcp_tcph + added);
- tcp->tcp_ip_hdr_len += added;
- if ((tcp->tcp_label_len = optbuf[IPOPT_OLEN]) != 0) {
- tcp->tcp_label_len = (tcp->tcp_label_len + 3) & ~3;
- added = tsol_prepend_option(optbuf, tcp->tcp_ipha,
- tcp->tcp_hdr_len);
- if (added == -1)
- return (B_FALSE);
- tcp->tcp_hdr_len += added;
- tcp->tcp_tcph = (tcph_t *)
- ((uchar_t *)tcp->tcp_tcph + added);
- tcp->tcp_ip_hdr_len += added;
- }
- } else {
- uchar_t optbuf[TSOL_MAX_IPV6_OPTION];
-
- if (tsol_compute_label_v6(cr, &tcp->tcp_remote_v6, optbuf,
- tcp->tcp_tcps->tcps_netstack->netstack_ip) != 0)
- return (B_FALSE);
- if (tsol_update_sticky(&tcp->tcp_sticky_ipp,
- &tcp->tcp_label_len, optbuf) != 0)
- return (B_FALSE);
- if (tcp_build_hdrs(tcp) != 0)
- return (B_FALSE);
- }
-
- connp->conn_ulp_labeled = 1;
+ /*
+ * Register tcp_notify to listen to capability changes detected by IP.
+ * This upcall is made in the context of the call to conn_ip_output
+ * thus it is inside the squeue.
+ */
+ connp->conn_ixa->ixa_notify = tcp_notify;
+ connp->conn_ixa->ixa_notify_cookie = tcp;
- return (B_TRUE);
+ return ((void *)connp);
}
/* BEGIN CSTYLED */
@@ -5140,7 +4255,7 @@ tcp_update_label(tcp_t *tcp, const cred_t *cr)
* =======================
*
* The eager is now established in its own perimeter as soon as SYN is
- * received in tcp_conn_request(). When sockfs receives conn_ind, it
+ * received in tcp_input_listener(). When sockfs receives conn_ind, it
* completes the accept processing on the acceptor STREAM. The sending
* of conn_ind part is common for both sockfs listener and a TLI/XTI
* listener but a TLI/XTI listener completes the accept processing
@@ -5149,29 +4264,28 @@ tcp_update_label(tcp_t *tcp, const cred_t *cr)
* Common control flow for 3 way handshake:
* ----------------------------------------
*
- * incoming SYN (listener perimeter) -> tcp_rput_data()
- * -> tcp_conn_request()
+ * incoming SYN (listener perimeter) -> tcp_input_listener()
*
- * incoming SYN-ACK-ACK (eager perim) -> tcp_rput_data()
+ * incoming SYN-ACK-ACK (eager perim) -> tcp_input_data()
* send T_CONN_IND (listener perim) -> tcp_send_conn_ind()
*
* Sockfs ACCEPT Path:
* -------------------
*
- * open acceptor stream (tcp_open allocates tcp_wput_accept()
+ * open acceptor stream (tcp_open allocates tcp_tli_accept()
* as STREAM entry point)
*
- * soaccept() sends T_CONN_RES on the acceptor STREAM to tcp_wput_accept()
+ * soaccept() sends T_CONN_RES on the acceptor STREAM to tcp_tli_accept()
*
- * tcp_wput_accept() extracts the eager and makes the q->q_ptr <-> eager
+ * tcp_tli_accept() extracts the eager and makes the q->q_ptr <-> eager
* association (we are not behind eager's squeue but sockfs is protecting us
* and no one knows about this stream yet. The STREAMS entry point q->q_info
* is changed to point at tcp_wput().
*
- * tcp_wput_accept() sends any deferred eagers via tcp_send_pending() to
+ * tcp_accept_common() sends any deferred eagers via tcp_send_pending() to
* listener (done on listener's perimeter).
*
- * tcp_wput_accept() calls tcp_accept_finish() on eagers perimeter to finish
+ * tcp_tli_accept() calls tcp_accept_finish() on eagers perimeter to finish
* accept.
*
* TLI/XTI client ACCEPT path:
@@ -5179,8 +4293,8 @@ tcp_update_label(tcp_t *tcp, const cred_t *cr)
*
* soaccept() sends T_CONN_RES on the listener STREAM.
*
- * tcp_accept() -> tcp_accept_swap() complete the processing and send
- * the bind_mp to eager perimeter to finish accept (tcp_rput_other()).
+ * tcp_tli_accept() -> tcp_accept_swap() complete the processing and send
+ * a M_SETOPS mblk to eager perimeter to finish accept (tcp_accept_finish()).
*
* Locks:
* ======
@@ -5191,7 +4305,7 @@ tcp_update_label(tcp_t *tcp, const cred_t *cr)
* Referencing:
* ============
*
- * 1) We start out in tcp_conn_request by eager placing a ref on
+ * 1) We start out in tcp_input_listener by eager placing a ref on
* listener and listener adding eager to listeners->tcp_eager_next_q0.
*
* 2) When a SYN-ACK-ACK arrives, we send the conn_ind to listener. Before
@@ -5249,51 +4363,71 @@ tcp_update_label(tcp_t *tcp, const cred_t *cr)
/*
* THIS FUNCTION IS DIRECTLY CALLED BY IP VIA SQUEUE FOR SYN.
- * tcp_rput_data will not see any SYN packets.
+ * tcp_input_data will not see any packets for listeners since the listener
+ * has conn_recv set to tcp_input_listener.
*/
/* ARGSUSED */
void
-tcp_conn_request(void *arg, mblk_t *mp, void *arg2)
+tcp_input_listener(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
{
- tcph_t *tcph;
+ tcpha_t *tcpha;
uint32_t seg_seq;
tcp_t *eager;
- uint_t ipvers;
- ipha_t *ipha;
- ip6_t *ip6h;
int err;
conn_t *econnp = NULL;
squeue_t *new_sqp;
mblk_t *mp1;
uint_t ip_hdr_len;
- conn_t *connp = (conn_t *)arg;
- tcp_t *tcp = connp->conn_tcp;
- cred_t *credp;
- tcp_stack_t *tcps = tcp->tcp_tcps;
- ip_stack_t *ipst;
+ conn_t *lconnp = (conn_t *)arg;
+ tcp_t *listener = lconnp->conn_tcp;
+ tcp_stack_t *tcps = listener->tcp_tcps;
+ ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;
+ uint_t flags;
+ mblk_t *tpi_mp;
+ uint_t ifindex = ira->ira_ruifindex;
- if (tcp->tcp_state != TCPS_LISTEN)
+ ip_hdr_len = ira->ira_ip_hdr_length;
+ tcpha = (tcpha_t *)&mp->b_rptr[ip_hdr_len];
+ flags = (unsigned int)tcpha->tha_flags & 0xFF;
+
+ if (!(flags & TH_SYN)) {
+ if ((flags & TH_RST) || (flags & TH_URG)) {
+ freemsg(mp);
+ return;
+ }
+ if (flags & TH_ACK) {
+ /* Note this executes in listener's squeue */
+ tcp_xmit_listeners_reset(mp, ira, ipst, lconnp);
+ return;
+ }
+
+ freemsg(mp);
+ return;
+ }
+
+ if (listener->tcp_state != TCPS_LISTEN)
goto error2;
- ASSERT((tcp->tcp_connp->conn_flags & IPCL_BOUND) != 0);
+ ASSERT(IPCL_IS_BOUND(lconnp));
- mutex_enter(&tcp->tcp_eager_lock);
- if (tcp->tcp_conn_req_cnt_q >= tcp->tcp_conn_req_max) {
- mutex_exit(&tcp->tcp_eager_lock);
+ mutex_enter(&listener->tcp_eager_lock);
+ if (listener->tcp_conn_req_cnt_q >= listener->tcp_conn_req_max) {
+ mutex_exit(&listener->tcp_eager_lock);
TCP_STAT(tcps, tcp_listendrop);
BUMP_MIB(&tcps->tcps_mib, tcpListenDrop);
- if (tcp->tcp_debug) {
+ if (lconnp->conn_debug) {
(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR,
- "tcp_conn_request: listen backlog (max=%d) "
+ "tcp_input_listener: listen backlog (max=%d) "
"overflow (%d pending) on %s",
- tcp->tcp_conn_req_max, tcp->tcp_conn_req_cnt_q,
- tcp_display(tcp, NULL, DISP_PORT_ONLY));
+ listener->tcp_conn_req_max,
+ listener->tcp_conn_req_cnt_q,
+ tcp_display(listener, NULL, DISP_PORT_ONLY));
}
goto error2;
}
- if (tcp->tcp_conn_req_cnt_q0 >=
- tcp->tcp_conn_req_max + tcps->tcps_conn_req_max_q0) {
+ if (listener->tcp_conn_req_cnt_q0 >=
+ listener->tcp_conn_req_max + tcps->tcps_conn_req_max_q0) {
/*
* Q0 is full. Drop a pending half-open req from the queue
* to make room for the new SYN req. Also mark the time we
@@ -5303,83 +4437,127 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2)
* be to set the "tcp_syn_defense" flag now.
*/
TCP_STAT(tcps, tcp_listendropq0);
- tcp->tcp_last_rcv_lbolt = lbolt64;
- if (!tcp_drop_q0(tcp)) {
- mutex_exit(&tcp->tcp_eager_lock);
+ listener->tcp_last_rcv_lbolt = lbolt64;
+ if (!tcp_drop_q0(listener)) {
+ mutex_exit(&listener->tcp_eager_lock);
BUMP_MIB(&tcps->tcps_mib, tcpListenDropQ0);
- if (tcp->tcp_debug) {
+ if (lconnp->conn_debug) {
(void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE,
- "tcp_conn_request: listen half-open queue "
- "(max=%d) full (%d pending) on %s",
+ "tcp_input_listener: listen half-open "
+ "queue (max=%d) full (%d pending) on %s",
tcps->tcps_conn_req_max_q0,
- tcp->tcp_conn_req_cnt_q0,
- tcp_display(tcp, NULL,
+ listener->tcp_conn_req_cnt_q0,
+ tcp_display(listener, NULL,
DISP_PORT_ONLY));
}
goto error2;
}
}
- mutex_exit(&tcp->tcp_eager_lock);
+ mutex_exit(&listener->tcp_eager_lock);
/*
- * IP adds STRUIO_EAGER and ensures that the received packet is
- * M_DATA even if conn_ipv6_recvpktinfo is enabled or for ip6
- * link local address. If IPSec is enabled, db_struioflag has
- * STRUIO_POLICY set (mutually exclusive from STRUIO_EAGER);
- * otherwise an error case if neither of them is set.
+ * IP sets ira_sqp to either the senders conn_sqp (for loopback)
+ * or based on the ring (for packets from GLD). Otherwise it is
+ * set based on lbolt i.e., a somewhat random number.
*/
- if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) {
- new_sqp = (squeue_t *)DB_CKSUMSTART(mp);
- DB_CKSUMSTART(mp) = 0;
- mp->b_datap->db_struioflag &= ~STRUIO_EAGER;
- econnp = (conn_t *)tcp_get_conn(arg2, tcps);
- if (econnp == NULL)
- goto error2;
- ASSERT(econnp->conn_netstack == connp->conn_netstack);
- econnp->conn_sqp = new_sqp;
- econnp->conn_initial_sqp = new_sqp;
- } else if ((mp->b_datap->db_struioflag & STRUIO_POLICY) != 0) {
- /*
- * mp is updated in tcp_get_ipsec_conn().
- */
- econnp = tcp_get_ipsec_conn(tcp, arg2, &mp);
- if (econnp == NULL) {
- /*
- * mp freed by tcp_get_ipsec_conn.
- */
- return;
- }
- ASSERT(econnp->conn_netstack == connp->conn_netstack);
- } else {
+ ASSERT(ira->ira_sqp != NULL);
+ new_sqp = ira->ira_sqp;
+
+ econnp = (conn_t *)tcp_get_conn(arg2, tcps);
+ if (econnp == NULL)
goto error2;
- }
- ASSERT(DB_TYPE(mp) == M_DATA);
+ ASSERT(econnp->conn_netstack == lconnp->conn_netstack);
+ econnp->conn_sqp = new_sqp;
+ econnp->conn_initial_sqp = new_sqp;
+ econnp->conn_ixa->ixa_sqp = new_sqp;
+
+ econnp->conn_fport = tcpha->tha_lport;
+ econnp->conn_lport = tcpha->tha_fport;
+
+ err = conn_inherit_parent(lconnp, econnp);
+ if (err != 0)
+ goto error3;
- ipvers = IPH_HDR_VERSION(mp->b_rptr);
- ASSERT(ipvers == IPV6_VERSION || ipvers == IPV4_VERSION);
ASSERT(OK_32PTR(mp->b_rptr));
- if (ipvers == IPV4_VERSION) {
- ipha = (ipha_t *)mp->b_rptr;
- ip_hdr_len = IPH_HDR_LENGTH(ipha);
- tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len];
- } else {
- ip6h = (ip6_t *)mp->b_rptr;
- ip_hdr_len = ip_hdr_length_v6(mp, ip6h);
- tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len];
- }
+ ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION ||
+ IPH_HDR_VERSION(mp->b_rptr) == IPV6_VERSION);
- if (tcp->tcp_family == AF_INET) {
- ASSERT(ipvers == IPV4_VERSION);
- err = tcp_conn_create_v4(connp, econnp, ipha, tcph, mp);
+ if (lconnp->conn_family == AF_INET) {
+ ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION);
+ tpi_mp = tcp_conn_create_v4(lconnp, econnp, mp, ira);
} else {
- err = tcp_conn_create_v6(connp, econnp, mp, tcph, ipvers, mp);
+ tpi_mp = tcp_conn_create_v6(lconnp, econnp, mp, ira);
}
- if (err)
+ if (tpi_mp == NULL)
goto error3;
eager = econnp->conn_tcp;
+ eager->tcp_detached = B_TRUE;
+ SOCK_CONNID_INIT(eager->tcp_connid);
+
+ tcp_init_values(eager);
+
+ ASSERT((econnp->conn_ixa->ixa_flags &
+ (IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE |
+ IXAF_VERIFY_PMTU | IXAF_VERIFY_LSO)) ==
+ (IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE |
+ IXAF_VERIFY_PMTU | IXAF_VERIFY_LSO));
+
+ if (!tcps->tcps_dev_flow_ctl)
+ econnp->conn_ixa->ixa_flags |= IXAF_NO_DEV_FLOW_CTL;
+
+ /* Prepare for diffing against previous packets */
+ eager->tcp_recvifindex = 0;
+ eager->tcp_recvhops = 0xffffffffU;
+
+ if (!(ira->ira_flags & IRAF_IS_IPV4) && econnp->conn_bound_if == 0) {
+ if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_faddr_v6) ||
+ IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6)) {
+ econnp->conn_incoming_ifindex = ifindex;
+ econnp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET;
+ econnp->conn_ixa->ixa_scopeid = ifindex;
+ }
+ }
+
+ if ((ira->ira_flags & (IRAF_IS_IPV4|IRAF_IPV4_OPTIONS)) ==
+ (IRAF_IS_IPV4|IRAF_IPV4_OPTIONS) &&
+ tcps->tcps_rev_src_routes) {
+ ipha_t *ipha = (ipha_t *)mp->b_rptr;
+ ip_pkt_t *ipp = &econnp->conn_xmit_ipp;
+
+ /* Source routing option copyover (reverse it) */
+ err = ip_find_hdr_v4(ipha, ipp, B_TRUE);
+ if (err != 0) {
+ freemsg(tpi_mp);
+ goto error3;
+ }
+ ip_pkt_source_route_reverse_v4(ipp);
+ }
+
+ ASSERT(eager->tcp_conn.tcp_eager_conn_ind == NULL);
+ ASSERT(!eager->tcp_tconnind_started);
+ /*
+ * If the SYN came with a credential, it's a loopback packet or a
+ * labeled packet; attach the credential to the TPI message.
+ */
+ if (ira->ira_cred != NULL)
+ mblk_setcred(tpi_mp, ira->ira_cred, ira->ira_cpid);
+
+ eager->tcp_conn.tcp_eager_conn_ind = tpi_mp;
+
+ /* Inherit the listener's SSL protection state */
+ if ((eager->tcp_kssl_ent = listener->tcp_kssl_ent) != NULL) {
+ kssl_hold_ent(eager->tcp_kssl_ent);
+ eager->tcp_kssl_pending = B_TRUE;
+ }
+
+ /* Inherit the listener's non-STREAMS flag */
+ if (IPCL_IS_NONSTR(lconnp)) {
+ econnp->conn_flags |= IPCL_NONSTR;
+ }
+
ASSERT(eager->tcp_ordrel_mp == NULL);
if (!IPCL_IS_NONSTR(econnp)) {
@@ -5392,127 +4570,103 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2)
if ((eager->tcp_ordrel_mp = mi_tpi_ordrel_ind()) == NULL)
goto error3;
}
- /* Inherit various TCP parameters from the listener */
- eager->tcp_naglim = tcp->tcp_naglim;
- eager->tcp_first_timer_threshold = tcp->tcp_first_timer_threshold;
- eager->tcp_second_timer_threshold = tcp->tcp_second_timer_threshold;
-
- eager->tcp_first_ctimer_threshold = tcp->tcp_first_ctimer_threshold;
- eager->tcp_second_ctimer_threshold = tcp->tcp_second_ctimer_threshold;
-
/*
- * tcp_adapt_ire() may change tcp_rwnd according to the ire metrics.
- * If it does not, the eager's receive window will be set to the
- * listener's receive window later in this function.
+ * Now that the IP addresses and ports are setup in econnp we
+ * can do the IPsec policy work.
*/
- eager->tcp_rwnd = 0;
+ if (ira->ira_flags & IRAF_IPSEC_SECURE) {
+ if (lconnp->conn_policy != NULL) {
+ /*
+ * Inherit the policy from the listener; use
+ * actions from ira
+ */
+ if (!ip_ipsec_policy_inherit(econnp, lconnp, ira)) {
+ CONN_DEC_REF(econnp);
+ freemsg(mp);
+ goto error3;
+ }
+ }
+ }
- /*
- * Inherit listener's tcp_init_cwnd. Need to do this before
- * calling tcp_process_options() where tcp_mss_set() is called
- * to set the initial cwnd.
- */
- eager->tcp_init_cwnd = tcp->tcp_init_cwnd;
+ /* Inherit various TCP parameters from the listener */
+ eager->tcp_naglim = listener->tcp_naglim;
+ eager->tcp_first_timer_threshold = listener->tcp_first_timer_threshold;
+ eager->tcp_second_timer_threshold =
+ listener->tcp_second_timer_threshold;
+ eager->tcp_first_ctimer_threshold =
+ listener->tcp_first_ctimer_threshold;
+ eager->tcp_second_ctimer_threshold =
+ listener->tcp_second_ctimer_threshold;
/*
- * Zones: tcp_adapt_ire() and tcp_send_data() both need the
- * zone id before the accept is completed in tcp_wput_accept().
+ * tcp_set_destination() may set tcp_rwnd according to the route
+ * metrics. If it does not, the eager's receive window will be set
+ * to the listener's receive window later in this function.
*/
- econnp->conn_zoneid = connp->conn_zoneid;
- econnp->conn_allzones = connp->conn_allzones;
-
- /* Copy nexthop information from listener to eager */
- if (connp->conn_nexthop_set) {
- econnp->conn_nexthop_set = connp->conn_nexthop_set;
- econnp->conn_nexthop_v4 = connp->conn_nexthop_v4;
- }
+ eager->tcp_rwnd = 0;
/*
- * TSOL: tsol_input_proc() needs the eager's cred before the
- * eager is accepted
+ * Inherit listener's tcp_init_cwnd. Need to do this before
+ * calling tcp_process_options() which set the initial cwnd.
*/
- econnp->conn_cred = eager->tcp_cred = credp = connp->conn_cred;
- crhold(credp);
+ eager->tcp_init_cwnd = listener->tcp_init_cwnd;
- ASSERT(econnp->conn_effective_cred == NULL);
if (is_system_labeled()) {
- cred_t *cr;
- ts_label_t *tsl;
-
- /*
- * If this is an MLP connection or a MAC-Exempt connection
- * with an unlabeled node, packets are to be
- * exchanged using the security label of the received
- * SYN packet instead of the server application's label.
- */
- if ((cr = msg_getcred(mp, NULL)) != NULL &&
- (tsl = crgetlabel(cr)) != NULL &&
- (connp->conn_mlp_type != mlptSingle ||
- (connp->conn_mac_mode != CONN_MAC_AWARE &&
- (tsl->tsl_flags & TSLF_UNLABELED)))) {
- if ((econnp->conn_effective_cred =
- copycred_from_tslabel(econnp->conn_cred,
- tsl, KM_NOSLEEP)) != NULL) {
- DTRACE_PROBE2(
- syn_accept_peerlabel,
- conn_t *, econnp, cred_t *,
- econnp->conn_effective_cred);
- } else {
- DTRACE_PROBE3(
- tx__ip__log__error__set__eagercred__tcp,
- char *,
- "SYN mp(1) label on eager connp(2) failed",
- mblk_t *, mp, conn_t *, econnp);
- goto error3;
- }
+ ip_xmit_attr_t *ixa = econnp->conn_ixa;
+
+ ASSERT(ira->ira_tsl != NULL);
+ /* Discard any old label */
+ if (ixa->ixa_free_flags & IXA_FREE_TSL) {
+ ASSERT(ixa->ixa_tsl != NULL);
+ label_rele(ixa->ixa_tsl);
+ ixa->ixa_free_flags &= ~IXA_FREE_TSL;
+ ixa->ixa_tsl = NULL;
+ }
+ if ((lconnp->conn_mlp_type != mlptSingle ||
+ lconnp->conn_mac_mode != CONN_MAC_DEFAULT) &&
+ ira->ira_tsl != NULL) {
+ /*
+ * If this is an MLP connection or a MAC-Exempt
+ * connection with an unlabeled node, packets are to be
+ * exchanged using the security label of the received
+ * SYN packet instead of the server application's label.
+ * tsol_check_dest called from ip_set_destination
+ * might later update TSF_UNLABELED by replacing
+ * ixa_tsl with a new label.
+ */
+ label_hold(ira->ira_tsl);
+ ip_xmit_attr_replace_tsl(ixa, ira->ira_tsl);
+ DTRACE_PROBE2(mlp_syn_accept, conn_t *,
+ econnp, ts_label_t *, ixa->ixa_tsl)
} else {
+ ixa->ixa_tsl = crgetlabel(econnp->conn_cred);
DTRACE_PROBE2(syn_accept, conn_t *,
- econnp, cred_t *, econnp->conn_cred)
+ econnp, ts_label_t *, ixa->ixa_tsl)
}
-
/*
- * Verify the destination is allowed to receive packets
- * at the security label of the SYN-ACK we are generating.
- * tsol_check_dest() may create a new effective cred for
- * this connection with a modified label or label flags.
+ * conn_connect() called from tcp_set_destination will verify
+ * the destination is allowed to receive packets at the
+ * security label of the SYN-ACK we are generating. As part of
+ * that, tsol_check_dest() may create a new effective label for
+ * this connection.
+ * Finally conn_connect() will call conn_update_label.
+ * All that remains for TCP to do is to call
+ * conn_build_hdr_template which is done as part of
+ * tcp_set_destination.
*/
- if (IN6_IS_ADDR_V4MAPPED(&econnp->conn_remv6)) {
- uint32_t dst;
- IN6_V4MAPPED_TO_IPADDR(&econnp->conn_remv6, dst);
- err = tsol_check_dest(CONN_CRED(econnp), &dst,
- IPV4_VERSION, B_FALSE, &cr);
- } else {
- err = tsol_check_dest(CONN_CRED(econnp),
- &econnp->conn_remv6, IPV6_VERSION,
- B_FALSE, &cr);
- }
- if (err != 0)
- goto error3;
- if (cr != NULL) {
- if (econnp->conn_effective_cred != NULL)
- crfree(econnp->conn_effective_cred);
- econnp->conn_effective_cred = cr;
- }
-
- /*
- * Generate the security label to be used in the text of
- * this connection's outgoing packets.
- */
- if (!tcp_update_label(eager, CONN_CRED(econnp))) {
- DTRACE_PROBE3(
- tx__ip__log__error__connrequest__tcp,
- char *, "eager connp(1) label on SYN mp(2) failed",
- conn_t *, econnp, mblk_t *, mp);
- goto error3;
- }
}
+ /*
+ * Since we will clear tcp_listener before we clear tcp_detached
+ * in the accept code we need tcp_hard_binding aka tcp_accept_inprogress
+ * so we can tell a TCP_DETACHED_NONEAGER apart.
+ */
eager->tcp_hard_binding = B_TRUE;
tcp_bind_hash_insert(&tcps->tcps_bind_fanout[
- TCP_BIND_HASH(eager->tcp_lport)], eager, 0);
+ TCP_BIND_HASH(econnp->conn_lport)], eager, 0);
- CL_INET_CONNECT(connp, eager, B_FALSE, err);
+ CL_INET_CONNECT(econnp, B_FALSE, err);
if (err != 0) {
tcp_bind_hash_remove(eager);
goto error3;
@@ -5528,32 +4682,27 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2)
SOCK_CONNID_BUMP(eager->tcp_connid);
/*
- * There should be no ire in the mp as we are being called after
- * receiving the SYN.
- */
- ASSERT(tcp_ire_mp(&mp) == NULL);
-
- /*
- * Adapt our mss, ttl, ... according to information provided in IRE.
+ * Adapt our mss, ttl, ... based on the remote address.
*/
- if (tcp_adapt_ire(eager, NULL) == 0) {
+ if (tcp_set_destination(eager) != 0) {
+ BUMP_MIB(&tcps->tcps_mib, tcpAttemptFails);
/* Undo the bind_hash_insert */
tcp_bind_hash_remove(eager);
goto error3;
}
/* Process all TCP options. */
- tcp_process_options(eager, tcph);
+ tcp_process_options(eager, tcpha);
/* Is the other end ECN capable? */
if (tcps->tcps_ecn_permitted >= 1 &&
- (tcph->th_flags[0] & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) {
+ (tcpha->tha_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) {
eager->tcp_ecn_ok = B_TRUE;
}
/*
- * listeners tcp_recv_hiwater should be the default window size or a
+ * The listener's conn_rcvbuf should be the default window size or a
* window size changed via SO_RCVBUF option. First round up the
* eager's tcp_rwnd to the nearest MSS. Then find out the window
* scale option value if needed. Call tcp_rwnd_set() to finish the
@@ -5563,7 +4712,7 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2)
* we should not inherit receive window size from listener.
*/
eager->tcp_rwnd = MSS_ROUNDUP(
- (eager->tcp_rwnd == 0 ? tcp->tcp_recv_hiwater:
+ (eager->tcp_rwnd == 0 ? econnp->conn_rcvbuf :
eager->tcp_rwnd), eager->tcp_mss);
if (eager->tcp_snd_ws_ok)
tcp_set_ws_value(eager);
@@ -5575,77 +4724,46 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2)
*/
(void) tcp_rwnd_set(eager, eager->tcp_rwnd);
- /*
- * We eliminate the need for sockfs to send down a T_SVR4_OPTMGMT_REQ
- * via soaccept()->soinheritoptions() which essentially applies
- * all the listener options to the new STREAM. The options that we
- * need to take care of are:
- * SO_DEBUG, SO_REUSEADDR, SO_KEEPALIVE, SO_DONTROUTE, SO_BROADCAST,
- * SO_USELOOPBACK, SO_OOBINLINE, SO_DGRAM_ERRIND, SO_LINGER,
- * SO_SNDBUF, SO_RCVBUF.
- *
- * SO_RCVBUF: tcp_rwnd_set() above takes care of it.
- * SO_SNDBUF: Set the tcp_xmit_hiwater for the eager. When
- * tcp_maxpsz_set() gets called later from
- * tcp_accept_finish(), the option takes effect.
- *
- */
- /* Set the TCP options */
- eager->tcp_recv_lowater = tcp->tcp_recv_lowater;
- eager->tcp_xmit_hiwater = tcp->tcp_xmit_hiwater;
- eager->tcp_dgram_errind = tcp->tcp_dgram_errind;
- eager->tcp_oobinline = tcp->tcp_oobinline;
- eager->tcp_reuseaddr = tcp->tcp_reuseaddr;
- eager->tcp_broadcast = tcp->tcp_broadcast;
- eager->tcp_useloopback = tcp->tcp_useloopback;
- eager->tcp_dontroute = tcp->tcp_dontroute;
- eager->tcp_debug = tcp->tcp_debug;
- eager->tcp_linger = tcp->tcp_linger;
- eager->tcp_lingertime = tcp->tcp_lingertime;
- if (tcp->tcp_ka_enabled)
- eager->tcp_ka_enabled = 1;
-
- ASSERT(eager->tcp_recv_hiwater != 0 &&
- eager->tcp_recv_hiwater == eager->tcp_rwnd);
-
- /* Set the IP options */
- econnp->conn_broadcast = connp->conn_broadcast;
- econnp->conn_loopback = connp->conn_loopback;
- econnp->conn_dontroute = connp->conn_dontroute;
- econnp->conn_reuseaddr = connp->conn_reuseaddr;
+ ASSERT(eager->tcp_connp->conn_rcvbuf != 0 &&
+ eager->tcp_connp->conn_rcvbuf == eager->tcp_rwnd);
+
+ ASSERT(econnp->conn_rcvbuf != 0 &&
+ econnp->conn_rcvbuf == eager->tcp_rwnd);
/* Put a ref on the listener for the eager. */
- CONN_INC_REF(connp);
- mutex_enter(&tcp->tcp_eager_lock);
- tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = eager;
- eager->tcp_eager_next_q0 = tcp->tcp_eager_next_q0;
- tcp->tcp_eager_next_q0 = eager;
- eager->tcp_eager_prev_q0 = tcp;
+ CONN_INC_REF(lconnp);
+ mutex_enter(&listener->tcp_eager_lock);
+ listener->tcp_eager_next_q0->tcp_eager_prev_q0 = eager;
+ eager->tcp_eager_next_q0 = listener->tcp_eager_next_q0;
+ listener->tcp_eager_next_q0 = eager;
+ eager->tcp_eager_prev_q0 = listener;
/* Set tcp_listener before adding it to tcp_conn_fanout */
- eager->tcp_listener = tcp;
- eager->tcp_saved_listener = tcp;
+ eager->tcp_listener = listener;
+ eager->tcp_saved_listener = listener;
/*
* Tag this detached tcp vector for later retrieval
* by our listener client in tcp_accept().
*/
- eager->tcp_conn_req_seqnum = tcp->tcp_conn_req_seqnum;
- tcp->tcp_conn_req_cnt_q0++;
- if (++tcp->tcp_conn_req_seqnum == -1) {
+ eager->tcp_conn_req_seqnum = listener->tcp_conn_req_seqnum;
+ listener->tcp_conn_req_cnt_q0++;
+ if (++listener->tcp_conn_req_seqnum == -1) {
/*
* -1 is "special" and defined in TPI as something
* that should never be used in T_CONN_IND
*/
- ++tcp->tcp_conn_req_seqnum;
+ ++listener->tcp_conn_req_seqnum;
}
- mutex_exit(&tcp->tcp_eager_lock);
+ mutex_exit(&listener->tcp_eager_lock);
- if (tcp->tcp_syn_defense) {
+ if (listener->tcp_syn_defense) {
/* Don't drop the SYN that comes from a good IP source */
- ipaddr_t *addr_cache = (ipaddr_t *)(tcp->tcp_ip_addr_cache);
- if (addr_cache != NULL && eager->tcp_remote ==
- addr_cache[IP_ADDR_CACHE_HASH(eager->tcp_remote)]) {
+ ipaddr_t *addr_cache;
+
+ addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache);
+ if (addr_cache != NULL && econnp->conn_faddr_v4 ==
+ addr_cache[IP_ADDR_CACHE_HASH(econnp->conn_faddr_v4)]) {
eager->tcp_dontdrop = B_TRUE;
}
}
@@ -5655,14 +4773,14 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2)
* as we do that, we expose the eager to the classifier and
* should not touch any field outside the eager's perimeter.
* So do all the work necessary before inserting the eager
- * in its own perimeter. Be optimistic that ipcl_conn_insert()
+ * in its own perimeter. Be optimistic that conn_connect()
* will succeed but undo everything if it fails.
*/
- seg_seq = ABE32_TO_U32(tcph->th_seq);
+ seg_seq = ntohl(tcpha->tha_seq);
eager->tcp_irs = seg_seq;
eager->tcp_rack = seg_seq;
eager->tcp_rnxt = seg_seq + 1;
- U32_TO_ABE32(eager->tcp_rnxt, eager->tcp_tcph->th_ack);
+ eager->tcp_tcpha->tha_ack = htonl(eager->tcp_rnxt);
BUMP_MIB(&tcps->tcps_mib, tcpPassiveOpens);
eager->tcp_state = TCPS_SYN_RCVD;
mp1 = tcp_xmit_mp(eager, eager->tcp_xmit_head, eager->tcp_mss,
@@ -5677,24 +4795,10 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2)
}
/*
- * Note that in theory this should use the current pid
- * so that getpeerucred on the client returns the actual listener
- * that does accept. But accept() hasn't been called yet. We could use
- * the pid of the process that did bind/listen on the server.
- * However, with common usage like inetd() the bind/listen can be done
- * by a different process than the accept().
- * Hence we do the simple thing of using the open pid here.
- * Note that db_credp is set later in tcp_send_data().
- */
- mblk_setcred(mp1, credp, tcp->tcp_cpid);
- eager->tcp_cpid = tcp->tcp_cpid;
- eager->tcp_open_time = lbolt64;
-
- /*
* We need to start the rto timer. In normal case, we start
* the timer after sending the packet on the wire (or at
* least believing that packet was sent by waiting for
- * CALL_IP_WPUT() to return). Since this is the first packet
+ * conn_ip_output() to return). Since this is the first packet
* being sent on the wire for the eager, our initial tcp_rto
* is at least tcp_rexmit_interval_min which is a fairly
* large value to allow the algorithm to adjust slowly to large
@@ -5716,7 +4820,7 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2)
* ensure against an eager close race.
*/
- CONN_INC_REF(eager->tcp_connp);
+ CONN_INC_REF(econnp);
TCP_TIMER_RESTART(eager, eager->tcp_rto);
@@ -5724,22 +4828,16 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2)
* Insert the eager in its own perimeter now. We are ready to deal
* with any packets on eager.
*/
- if (eager->tcp_ipversion == IPV4_VERSION) {
- if (ipcl_conn_insert(econnp, IPPROTO_TCP, 0, 0, 0) != 0) {
- goto error;
- }
- } else {
- if (ipcl_conn_insert_v6(econnp, IPPROTO_TCP, 0, 0, 0, 0) != 0) {
- goto error;
- }
- }
-
- /* mark conn as fully-bound */
- econnp->conn_fully_bound = B_TRUE;
+ if (ipcl_conn_insert(econnp) != 0)
+ goto error;
- /* Send the SYN-ACK */
- tcp_send_data(eager, eager->tcp_wq, mp1);
- CONN_DEC_REF(eager->tcp_connp);
+ /*
+ * Send the SYN-ACK. Can't use tcp_send_data since we can't update
+ * pmtu etc; we are not on the eager's squeue
+ */
+ ASSERT(econnp->conn_ixa->ixa_notify_cookie == econnp->conn_tcp);
+ (void) conn_ip_output(mp1, econnp->conn_ixa);
+ CONN_DEC_REF(econnp);
freemsg(mp);
return;
@@ -5749,7 +4847,7 @@ error:
TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15);
mp1 = &eager->tcp_closemp;
SQUEUE_ENTER_ONE(econnp->conn_sqp, mp1, tcp_eager_kill,
- econnp, SQ_FILL, SQTAG_TCP_CONN_REQ_2);
+ econnp, NULL, SQ_FILL, SQTAG_TCP_CONN_REQ_2);
/*
* If a connection already exists, send the mp to that connections so
@@ -5757,7 +4855,7 @@ error:
*/
ipst = tcps->tcps_netstack->netstack_ip;
- if ((econnp = ipcl_classify(mp, connp->conn_zoneid, ipst)) != NULL) {
+ if ((econnp = ipcl_classify(mp, ira, ipst)) != NULL) {
if (!IPCL_IS_CONNECTED(econnp)) {
/*
* Something bad happened. ipcl_conn_insert()
@@ -5772,8 +4870,8 @@ error:
CONN_DEC_REF(econnp);
freemsg(mp);
} else {
- SQUEUE_ENTER_ONE(econnp->conn_sqp, mp,
- tcp_input, econnp, SQ_FILL, SQTAG_TCP_CONN_REQ_1);
+ SQUEUE_ENTER_ONE(econnp->conn_sqp, mp, tcp_input_data,
+ econnp, ira, SQ_FILL, SQTAG_TCP_CONN_REQ_1);
}
} else {
/* Nobody wants this packet */
@@ -5803,18 +4901,21 @@ error2:
* very first time and there is no attempt to rebind them.
*/
void
-tcp_conn_request_unbound(void *arg, mblk_t *mp, void *arg2)
+tcp_input_listener_unbound(void *arg, mblk_t *mp, void *arg2,
+ ip_recv_attr_t *ira)
{
conn_t *connp = (conn_t *)arg;
squeue_t *sqp = (squeue_t *)arg2;
squeue_t *new_sqp;
uint32_t conn_flags;
- if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) {
- new_sqp = (squeue_t *)DB_CKSUMSTART(mp);
- } else {
- goto done;
- }
+ /*
+ * IP sets ira_sqp to either the senders conn_sqp (for loopback)
+ * or based on the ring (for packets from GLD). Otherwise it is
+ * set based on lbolt i.e., a somewhat random number.
+ */
+ ASSERT(ira->ira_sqp != NULL);
+ new_sqp = ira->ira_sqp;
if (connp->conn_fanout == NULL)
goto done;
@@ -5849,6 +4950,8 @@ tcp_conn_request_unbound(void *arg, mblk_t *mp, void *arg2)
if (connp->conn_sqp != new_sqp) {
while (connp->conn_sqp != new_sqp)
(void) casptr(&connp->conn_sqp, sqp, new_sqp);
+ /* No special MT issues for outbound ixa_sqp hint */
+ connp->conn_ixa->ixa_sqp = new_sqp;
}
do {
@@ -5860,49 +4963,47 @@ tcp_conn_request_unbound(void *arg, mblk_t *mp, void *arg2)
mutex_exit(&connp->conn_fanout->connf_lock);
mutex_exit(&connp->conn_lock);
+
+ /*
+ * Assume we have picked a good squeue for the listener. Make
+ * subsequent SYNs not try to change the squeue.
+ */
+ connp->conn_recv = tcp_input_listener;
}
done:
if (connp->conn_sqp != sqp) {
CONN_INC_REF(connp);
SQUEUE_ENTER_ONE(connp->conn_sqp, mp, connp->conn_recv, connp,
- SQ_FILL, SQTAG_TCP_CONN_REQ_UNBOUND);
+ ira, SQ_FILL, SQTAG_TCP_CONN_REQ_UNBOUND);
} else {
- tcp_conn_request(connp, mp, sqp);
+ tcp_input_listener(connp, mp, sqp, ira);
}
}
/*
* Successful connect request processing begins when our client passes
- * a T_CONN_REQ message into tcp_wput() and ends when tcp_rput() passes
- * our T_OK_ACK reply message upstream. The control flow looks like this:
- * upstream -> tcp_wput() -> tcp_wput_proto() -> tcp_tpi_connect() -> IP
- * upstream <- tcp_rput() <- IP
+ * a T_CONN_REQ message into tcp_wput(), which performs function calls into
+ * IP and the passes a T_OK_ACK (or T_ERROR_ACK upstream).
+ *
* After various error checks are completed, tcp_tpi_connect() lays
- * the target address and port into the composite header template,
- * preallocates the T_OK_ACK reply message, construct a full 12 byte bind
- * request followed by an IRE request, and passes the three mblk message
- * down to IP looking like this:
- * O_T_BIND_REQ for IP --> IRE req --> T_OK_ACK for our client
- * Processing continues in tcp_rput() when we receive the following message:
- * T_BIND_ACK from IP --> IRE ack --> T_OK_ACK for our client
- * After consuming the first two mblks, tcp_rput() calls tcp_timer(),
- * to fire off the connection request, and then passes the T_OK_ACK mblk
- * upstream that we filled in below. There are, of course, numerous
- * error conditions along the way which truncate the processing described
- * above.
+ * the target address and port into the composite header template.
+ * Then we ask IP for information, including a source address if we didn't
+ * already have one. Finally we prepare to send the SYN packet, and then
+ * send up the T_OK_ACK reply message.
*/
static void
tcp_tpi_connect(tcp_t *tcp, mblk_t *mp)
{
sin_t *sin;
- queue_t *q = tcp->tcp_wq;
struct T_conn_req *tcr;
struct sockaddr *sa;
socklen_t len;
int error;
cred_t *cr;
pid_t cpid;
+ conn_t *connp = tcp->tcp_connp;
+ queue_t *q = connp->conn_wq;
/*
* All Solaris components should pass a db_credp
@@ -5944,7 +5045,7 @@ tcp_tpi_connect(tcp_t *tcp, mblk_t *mp)
* Determine packet type based on type of address passed in
* the request should contain an IPv4 or IPv6 address.
* Make sure that address family matches the type of
- * family of the the address passed down
+ * family of the address passed down.
*/
switch (tcr->DEST_length) {
default:
@@ -6022,7 +5123,7 @@ tcp_tpi_connect(tcp_t *tcp, mblk_t *mp)
break;
}
- error = proto_verify_ip_addr(tcp->tcp_family, sa, len);
+ error = proto_verify_ip_addr(connp->conn_family, sa, len);
if (error != 0) {
tcp_err_ack(tcp, mp, TSYSERR, error);
return;
@@ -6111,7 +5212,7 @@ tcp_tpi_connect(tcp_t *tcp, mblk_t *mp)
/* return error ack and blow away saved option results if any */
connect_failed:
if (mp != NULL)
- putnext(tcp->tcp_rq, mp);
+ putnext(connp->conn_rq, mp);
else {
tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
TSYSERR, ENOMEM);
@@ -6121,20 +5222,19 @@ connect_failed:
/*
* Handle connect to IPv4 destinations, including connections for AF_INET6
* sockets connecting to IPv4 mapped IPv6 destinations.
+ * Returns zero if OK, a positive errno, or a negative TLI error.
*/
static int
tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp, in_port_t dstport,
- uint_t srcid, cred_t *cr, pid_t pid)
+ uint_t srcid)
{
- tcph_t *tcph;
- mblk_t *mp;
- ipaddr_t dstaddr = *dstaddrp;
- int32_t oldstate;
- uint16_t lport;
- int error = 0;
+ ipaddr_t dstaddr = *dstaddrp;
+ uint16_t lport;
+ conn_t *connp = tcp->tcp_connp;
tcp_stack_t *tcps = tcp->tcp_tcps;
+ int error;
- ASSERT(tcp->tcp_ipversion == IPV4_VERSION);
+ ASSERT(connp->conn_ipversion == IPV4_VERSION);
/* Check for attempt to connect to INADDR_ANY */
if (dstaddr == INADDR_ANY) {
@@ -6157,74 +5257,21 @@ tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp, in_port_t dstport,
}
/* Handle __sin6_src_id if socket not bound to an IP address */
- if (srcid != 0 && tcp->tcp_ipha->ipha_src == INADDR_ANY) {
- ip_srcid_find_id(srcid, &tcp->tcp_ip_src_v6,
- tcp->tcp_connp->conn_zoneid, tcps->tcps_netstack);
- IN6_V4MAPPED_TO_IPADDR(&tcp->tcp_ip_src_v6,
- tcp->tcp_ipha->ipha_src);
+ if (srcid != 0 && connp->conn_laddr_v4 == INADDR_ANY) {
+ ip_srcid_find_id(srcid, &connp->conn_laddr_v6,
+ IPCL_ZONEID(connp), tcps->tcps_netstack);
+ connp->conn_saddr_v6 = connp->conn_laddr_v6;
}
- /*
- * Don't let an endpoint connect to itself. Note that
- * the test here does not catch the case where the
- * source IP addr was left unspecified by the user. In
- * this case, the source addr is set in tcp_adapt_ire()
- * using the reply to the T_BIND message that we send
- * down to IP here and the check is repeated in tcp_rput_other.
- */
- if (dstaddr == tcp->tcp_ipha->ipha_src &&
- dstport == tcp->tcp_lport) {
- error = -TBADADDR;
- goto failed;
- }
+ IN6_IPADDR_TO_V4MAPPED(dstaddr, &connp->conn_faddr_v6);
+ connp->conn_fport = dstport;
/*
- * Verify the destination is allowed to receive packets
- * at the security label of the connection we are initiating.
- * tsol_check_dest() may create a new effective cred for this
- * connection with a modified label or label flags.
- */
- if (is_system_labeled()) {
- ASSERT(tcp->tcp_connp->conn_effective_cred == NULL);
- if ((error = tsol_check_dest(CONN_CRED(tcp->tcp_connp),
- &dstaddr, IPV4_VERSION, tcp->tcp_connp->conn_mac_mode,
- &tcp->tcp_connp->conn_effective_cred)) != 0) {
- if (error != EHOSTUNREACH)
- error = -TSYSERR;
- goto failed;
- }
- }
-
- tcp->tcp_ipha->ipha_dst = dstaddr;
- IN6_IPADDR_TO_V4MAPPED(dstaddr, &tcp->tcp_remote_v6);
-
- /*
- * Massage a source route if any putting the first hop
- * in iph_dst. Compute a starting value for the checksum which
- * takes into account that the original iph_dst should be
- * included in the checksum but that ip will include the
- * first hop in the source route in the tcp checksum.
- */
- tcp->tcp_sum = ip_massage_options(tcp->tcp_ipha, tcps->tcps_netstack);
- tcp->tcp_sum = (tcp->tcp_sum & 0xFFFF) + (tcp->tcp_sum >> 16);
- tcp->tcp_sum -= ((tcp->tcp_ipha->ipha_dst >> 16) +
- (tcp->tcp_ipha->ipha_dst & 0xffff));
- if ((int)tcp->tcp_sum < 0)
- tcp->tcp_sum--;
- tcp->tcp_sum = (tcp->tcp_sum & 0xFFFF) + (tcp->tcp_sum >> 16);
- tcp->tcp_sum = ntohs((tcp->tcp_sum & 0xFFFF) +
- (tcp->tcp_sum >> 16));
- tcph = tcp->tcp_tcph;
- *(uint16_t *)tcph->th_fport = dstport;
- tcp->tcp_fport = dstport;
-
- oldstate = tcp->tcp_state;
- /*
* At this point the remote destination address and remote port fields
* in the tcp-four-tuple have been filled in the tcp structure. Now we
- * have to see which state tcp was in so we can take apropriate action.
+ * have to see which state tcp was in so we can take appropriate action.
*/
- if (oldstate == TCPS_IDLE) {
+ if (tcp->tcp_state == TCPS_IDLE) {
/*
* We support a quick connect capability here, allowing
* clients to transition directly from IDLE to SYN_SENT
@@ -6233,203 +5280,93 @@ tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp, in_port_t dstport,
*/
lport = tcp_update_next_port(tcps->tcps_next_port_to_try,
tcp, B_TRUE);
- lport = tcp_bindi(tcp, lport, &tcp->tcp_ip_src_v6, 0, B_TRUE,
+ lport = tcp_bindi(tcp, lport, &connp->conn_laddr_v6, 0, B_TRUE,
B_FALSE, B_FALSE);
- if (lport == 0) {
- error = -TNOADDR;
- goto failed;
- }
- }
- tcp->tcp_state = TCPS_SYN_SENT;
-
- mp = allocb(sizeof (ire_t), BPRI_HI);
- if (mp == NULL) {
- tcp->tcp_state = oldstate;
- error = ENOMEM;
- goto failed;
+ if (lport == 0)
+ return (-TNOADDR);
}
- mp->b_wptr += sizeof (ire_t);
- mp->b_datap->db_type = IRE_DB_REQ_TYPE;
- tcp->tcp_hard_binding = 1;
-
/*
- * We need to make sure that the conn_recv is set to a non-null
- * value before we insert the conn_t into the classifier table.
- * This is to avoid a race with an incoming packet which does
- * an ipcl_classify().
+ * Lookup the route to determine a source address and the uinfo.
+ * If there was a source route we have tcp_ipha->ipha_dst as the first
+ * hop.
+ * Setup TCP parameters based on the metrics/DCE.
*/
- tcp->tcp_connp->conn_recv = tcp_input;
+ error = tcp_set_destination(tcp);
+ if (error != 0)
+ return (error);
- if (tcp->tcp_family == AF_INET) {
- error = ip_proto_bind_connected_v4(tcp->tcp_connp, &mp,
- IPPROTO_TCP, &tcp->tcp_ipha->ipha_src, tcp->tcp_lport,
- tcp->tcp_remote, tcp->tcp_fport, B_TRUE, B_TRUE, cr);
- } else {
- in6_addr_t v6src;
- if (tcp->tcp_ipversion == IPV4_VERSION) {
- IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src, &v6src);
- } else {
- v6src = tcp->tcp_ip6h->ip6_src;
- }
- error = ip_proto_bind_connected_v6(tcp->tcp_connp, &mp,
- IPPROTO_TCP, &v6src, tcp->tcp_lport, &tcp->tcp_remote_v6,
- &tcp->tcp_sticky_ipp, tcp->tcp_fport, B_TRUE, B_TRUE, cr);
- }
- BUMP_MIB(&tcps->tcps_mib, tcpActiveOpens);
- tcp->tcp_active_open = 1;
+ /*
+ * Don't let an endpoint connect to itself.
+ */
+ if (connp->conn_faddr_v4 == connp->conn_laddr_v4 &&
+ connp->conn_fport == connp->conn_lport)
+ return (-TBADADDR);
+ tcp->tcp_state = TCPS_SYN_SENT;
- return (tcp_post_ip_bind(tcp, mp, error, cr, pid));
-failed:
- /* return error ack and blow away saved option results if any */
- if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
- tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
- return (error);
+ return (ipcl_conn_insert_v4(connp));
}
/*
* Handle connect to IPv6 destinations.
+ * Returns zero if OK, a positive errno, or a negative TLI error.
*/
static int
tcp_connect_ipv6(tcp_t *tcp, in6_addr_t *dstaddrp, in_port_t dstport,
- uint32_t flowinfo, uint_t srcid, uint32_t scope_id, cred_t *cr, pid_t pid)
+ uint32_t flowinfo, uint_t srcid, uint32_t scope_id)
{
- tcph_t *tcph;
- mblk_t *mp;
- ip6_rthdr_t *rth;
- int32_t oldstate;
- uint16_t lport;
+ uint16_t lport;
+ conn_t *connp = tcp->tcp_connp;
tcp_stack_t *tcps = tcp->tcp_tcps;
- int error = 0;
- conn_t *connp = tcp->tcp_connp;
+ int error;
- ASSERT(tcp->tcp_family == AF_INET6);
+ ASSERT(connp->conn_family == AF_INET6);
/*
* If we're here, it means that the destination address is a native
- * IPv6 address. Return an error if tcp_ipversion is not IPv6. A
+ * IPv6 address. Return an error if conn_ipversion is not IPv6. A
* reason why it might not be IPv6 is if the socket was bound to an
* IPv4-mapped IPv6 address.
*/
- if (tcp->tcp_ipversion != IPV6_VERSION) {
+ if (connp->conn_ipversion != IPV6_VERSION)
return (-TBADADDR);
- }
/*
* Interpret a zero destination to mean loopback.
* Update the T_CONN_REQ (sin/sin6) since it is used to
* generate the T_CONN_CON.
*/
- if (IN6_IS_ADDR_UNSPECIFIED(dstaddrp)) {
+ if (IN6_IS_ADDR_UNSPECIFIED(dstaddrp))
*dstaddrp = ipv6_loopback;
- }
/* Handle __sin6_src_id if socket not bound to an IP address */
- if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&tcp->tcp_ip6h->ip6_src)) {
- ip_srcid_find_id(srcid, &tcp->tcp_ip6h->ip6_src,
- connp->conn_zoneid, tcps->tcps_netstack);
- tcp->tcp_ip_src_v6 = tcp->tcp_ip6h->ip6_src;
- }
-
- /*
- * Take care of the scope_id now and add ip6i_t
- * if ip6i_t is not already allocated through TCP
- * sticky options. At this point tcp_ip6h does not
- * have dst info, thus use dstaddrp.
- */
- if (scope_id != 0 &&
- IN6_IS_ADDR_LINKSCOPE(dstaddrp)) {
- ip6_pkt_t *ipp = &tcp->tcp_sticky_ipp;
- ip6i_t *ip6i;
-
- ipp->ipp_ifindex = scope_id;
- ip6i = (ip6i_t *)tcp->tcp_iphc;
-
- if ((ipp->ipp_fields & IPPF_HAS_IP6I) &&
- ip6i != NULL && (ip6i->ip6i_nxt == IPPROTO_RAW)) {
- /* Already allocated */
- ip6i->ip6i_flags |= IP6I_IFINDEX;
- ip6i->ip6i_ifindex = ipp->ipp_ifindex;
- ipp->ipp_fields |= IPPF_SCOPE_ID;
- } else {
- int reterr;
-
- ipp->ipp_fields |= IPPF_SCOPE_ID;
- if (ipp->ipp_fields & IPPF_HAS_IP6I)
- ip2dbg(("tcp_connect_v6: SCOPE_ID set\n"));
- reterr = tcp_build_hdrs(tcp);
- if (reterr != 0)
- goto failed;
- ip1dbg(("tcp_connect_ipv6: tcp_bld_hdrs returned\n"));
- }
- }
-
- /*
- * Don't let an endpoint connect to itself. Note that
- * the test here does not catch the case where the
- * source IP addr was left unspecified by the user. In
- * this case, the source addr is set in tcp_adapt_ire()
- * using the reply to the T_BIND message that we send
- * down to IP here and the check is repeated in tcp_rput_other.
- */
- if (IN6_ARE_ADDR_EQUAL(dstaddrp, &tcp->tcp_ip6h->ip6_src) &&
- (dstport == tcp->tcp_lport)) {
- error = -TBADADDR;
- goto failed;
+ if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
+ ip_srcid_find_id(srcid, &connp->conn_laddr_v6,
+ IPCL_ZONEID(connp), tcps->tcps_netstack);
+ connp->conn_saddr_v6 = connp->conn_laddr_v6;
}
/*
- * Verify the destination is allowed to receive packets
- * at the security label of the connection we are initiating.
- * check_dest may create a new effective cred for this
- * connection with a modified label or label flags.
+ * Take care of the scope_id now.
*/
- if (is_system_labeled()) {
- ASSERT(tcp->tcp_connp->conn_effective_cred == NULL);
- if ((error = tsol_check_dest(CONN_CRED(tcp->tcp_connp),
- dstaddrp, IPV6_VERSION, tcp->tcp_connp->conn_mac_mode,
- &tcp->tcp_connp->conn_effective_cred)) != 0) {
- if (error != EHOSTUNREACH)
- error = -TSYSERR;
- goto failed;
- }
- }
-
- tcp->tcp_ip6h->ip6_dst = *dstaddrp;
- tcp->tcp_remote_v6 = *dstaddrp;
- tcp->tcp_ip6h->ip6_vcf =
- (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) |
- (flowinfo & ~IPV6_VERS_AND_FLOW_MASK);
-
- /*
- * Massage a routing header (if present) putting the first hop
- * in ip6_dst. Compute a starting value for the checksum which
- * takes into account that the original ip6_dst should be
- * included in the checksum but that ip will include the
- * first hop in the source route in the tcp checksum.
- */
- rth = ip_find_rthdr_v6(tcp->tcp_ip6h, (uint8_t *)tcp->tcp_tcph);
- if (rth != NULL) {
- tcp->tcp_sum = ip_massage_options_v6(tcp->tcp_ip6h, rth,
- tcps->tcps_netstack);
- tcp->tcp_sum = ntohs((tcp->tcp_sum & 0xFFFF) +
- (tcp->tcp_sum >> 16));
+ if (scope_id != 0 && IN6_IS_ADDR_LINKSCOPE(dstaddrp)) {
+ connp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET;
+ connp->conn_ixa->ixa_scopeid = scope_id;
} else {
- tcp->tcp_sum = 0;
+ connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
}
- tcph = tcp->tcp_tcph;
- *(uint16_t *)tcph->th_fport = dstport;
- tcp->tcp_fport = dstport;
+ connp->conn_flowinfo = flowinfo;
+ connp->conn_faddr_v6 = *dstaddrp;
+ connp->conn_fport = dstport;
- oldstate = tcp->tcp_state;
/*
* At this point the remote destination address and remote port fields
* in the tcp-four-tuple have been filled in the tcp structure. Now we
- * have to see which state tcp was in so we can take apropriate action.
+ * have to see which state tcp was in so we can take appropriate action.
*/
- if (oldstate == TCPS_IDLE) {
+ if (tcp->tcp_state == TCPS_IDLE) {
/*
* We support a quick connect capability here, allowing
* clients to transition directly from IDLE to SYN_SENT
@@ -6438,128 +5375,55 @@ tcp_connect_ipv6(tcp_t *tcp, in6_addr_t *dstaddrp, in_port_t dstport,
*/
lport = tcp_update_next_port(tcps->tcps_next_port_to_try,
tcp, B_TRUE);
- lport = tcp_bindi(tcp, lport, &tcp->tcp_ip_src_v6, 0, B_TRUE,
+ lport = tcp_bindi(tcp, lport, &connp->conn_laddr_v6, 0, B_TRUE,
B_FALSE, B_FALSE);
- if (lport == 0) {
- error = -TNOADDR;
- goto failed;
- }
+ if (lport == 0)
+ return (-TNOADDR);
}
- tcp->tcp_state = TCPS_SYN_SENT;
-
- mp = allocb(sizeof (ire_t), BPRI_HI);
- if (mp != NULL) {
- in6_addr_t v6src;
-
- mp->b_wptr += sizeof (ire_t);
- mp->b_datap->db_type = IRE_DB_REQ_TYPE;
- tcp->tcp_hard_binding = 1;
-
- /*
- * We need to make sure that the conn_recv is set to a non-null
- * value before we insert the conn_t into the classifier table.
- * This is to avoid a race with an incoming packet which does
- * an ipcl_classify().
- */
- tcp->tcp_connp->conn_recv = tcp_input;
+ /*
+ * Lookup the route to determine a source address and the uinfo.
+ * If there was a source route we have tcp_ip6h->ip6_dst as the first
+ * hop.
+ * Setup TCP parameters based on the metrics/DCE.
+ */
+ error = tcp_set_destination(tcp);
+ if (error != 0)
+ return (error);
- if (tcp->tcp_ipversion == IPV4_VERSION) {
- IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src, &v6src);
- } else {
- v6src = tcp->tcp_ip6h->ip6_src;
- }
- error = ip_proto_bind_connected_v6(connp, &mp, IPPROTO_TCP,
- &v6src, tcp->tcp_lport, &tcp->tcp_remote_v6,
- &tcp->tcp_sticky_ipp, tcp->tcp_fport, B_TRUE, B_TRUE, cr);
- BUMP_MIB(&tcps->tcps_mib, tcpActiveOpens);
- tcp->tcp_active_open = 1;
+ /*
+ * Don't let an endpoint connect to itself.
+ */
+ if (IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6, &connp->conn_laddr_v6) &&
+ connp->conn_fport == connp->conn_lport)
+ return (-TBADADDR);
- return (tcp_post_ip_bind(tcp, mp, error, cr, pid));
- }
- /* Error case */
- tcp->tcp_state = oldstate;
- error = ENOMEM;
+ tcp->tcp_state = TCPS_SYN_SENT;
-failed:
- /* return error ack and blow away saved option results if any */
- if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
- tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
- return (error);
+ return (ipcl_conn_insert_v6(connp));
}
/*
- * We need a stream q for detached closing tcp connections
- * to use. Our client hereby indicates that this q is the
- * one to use.
+ * Disconnect
+ * Note that unlike other functions this returns a positive tli error
+ * when it fails; it never returns an errno.
*/
-static void
-tcp_def_q_set(tcp_t *tcp, mblk_t *mp)
-{
- struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
- queue_t *q = tcp->tcp_wq;
- tcp_stack_t *tcps = tcp->tcp_tcps;
-
-#ifdef NS_DEBUG
- (void) printf("TCP_IOC_DEFAULT_Q for stack %d\n",
- tcps->tcps_netstack->netstack_stackid);
-#endif
- mp->b_datap->db_type = M_IOCACK;
- iocp->ioc_count = 0;
- mutex_enter(&tcps->tcps_g_q_lock);
- if (tcps->tcps_g_q != NULL) {
- mutex_exit(&tcps->tcps_g_q_lock);
- iocp->ioc_error = EALREADY;
- } else {
- int error = 0;
- conn_t *connp = tcp->tcp_connp;
- ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
-
- tcps->tcps_g_q = tcp->tcp_rq;
- mutex_exit(&tcps->tcps_g_q_lock);
- iocp->ioc_error = 0;
- iocp->ioc_rval = 0;
- /*
- * We are passing tcp_sticky_ipp as NULL
- * as it is not useful for tcp_default queue
- *
- * Set conn_recv just in case.
- */
- tcp->tcp_connp->conn_recv = tcp_conn_request;
-
- ASSERT(connp->conn_af_isv6);
- connp->conn_ulp = IPPROTO_TCP;
-
- if (ipst->ips_ipcl_proto_fanout_v6[IPPROTO_TCP].connf_head !=
- NULL || (connp->conn_mac_mode != CONN_MAC_DEFAULT)) {
- error = -TBADADDR;
- } else {
- connp->conn_srcv6 = ipv6_all_zeros;
- ipcl_proto_insert_v6(connp, IPPROTO_TCP);
- }
-
- (void) tcp_post_ip_bind(tcp, NULL, error, NULL, 0);
- }
- qreply(q, mp);
-}
-
static int
tcp_disconnect_common(tcp_t *tcp, t_scalar_t seqnum)
{
tcp_t *ltcp = NULL;
- conn_t *connp;
+ conn_t *lconnp;
tcp_stack_t *tcps = tcp->tcp_tcps;
+ conn_t *connp = tcp->tcp_connp;
/*
* Right now, upper modules pass down a T_DISCON_REQ to TCP,
* when the stream is in BOUND state. Do not send a reset,
* since the destination IP address is not valid, and it can
* be the initialized value of all zeros (broadcast address).
- *
- * XXX There won't be any pending bind request to IP.
*/
- if (tcp->tcp_state <= TCPS_BOUND) {
- if (tcp->tcp_debug) {
+ if (tcp->tcp_state <= TCPS_BOUND || tcp->tcp_hard_binding) {
+ if (connp->conn_debug) {
(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
"tcp_disconnect: bad state, %d", tcp->tcp_state);
}
@@ -6595,19 +5459,23 @@ tcp_disconnect_common(tcp_t *tcp, t_scalar_t seqnum)
* If it used to be a listener, check to make sure no one else
* has taken the port before switching back to LISTEN state.
*/
- if (tcp->tcp_ipversion == IPV4_VERSION) {
- connp = ipcl_lookup_listener_v4(tcp->tcp_lport,
- tcp->tcp_ipha->ipha_src,
- tcp->tcp_connp->conn_zoneid, ipst);
- if (connp != NULL)
- ltcp = connp->conn_tcp;
+ if (connp->conn_ipversion == IPV4_VERSION) {
+ lconnp = ipcl_lookup_listener_v4(connp->conn_lport,
+ connp->conn_laddr_v4, IPCL_ZONEID(connp), ipst);
+ if (lconnp != NULL)
+ ltcp = lconnp->conn_tcp;
} else {
- /* Allow tcp_bound_if listeners? */
- connp = ipcl_lookup_listener_v6(tcp->tcp_lport,
- &tcp->tcp_ip6h->ip6_src, 0,
- tcp->tcp_connp->conn_zoneid, ipst);
- if (connp != NULL)
- ltcp = connp->conn_tcp;
+ uint_t ifindex = 0;
+
+ if (connp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)
+ ifindex = connp->conn_ixa->ixa_scopeid;
+
+ /* Allow conn_bound_if listeners? */
+ lconnp = ipcl_lookup_listener_v6(connp->conn_lport,
+ &connp->conn_laddr_v6, ifindex, IPCL_ZONEID(connp),
+ ipst);
+ if (lconnp != NULL)
+ ltcp = lconnp->conn_tcp;
}
if (tcp->tcp_conn_req_max && ltcp == NULL) {
tcp->tcp_state = TCPS_LISTEN;
@@ -6616,7 +5484,7 @@ tcp_disconnect_common(tcp_t *tcp, t_scalar_t seqnum)
tcp->tcp_state = TCPS_BOUND;
}
if (ltcp != NULL)
- CONN_DEC_REF(ltcp->tcp_connp);
+ CONN_DEC_REF(lconnp);
if (old_state == TCPS_SYN_SENT || old_state == TCPS_SYN_RCVD) {
BUMP_MIB(&tcps->tcps_mib, tcpAttemptFails);
} else if (old_state == TCPS_ESTABLISHED ||
@@ -6648,7 +5516,7 @@ tcp_disconnect_common(tcp_t *tcp, t_scalar_t seqnum)
/*
* Our client hereby directs us to reject the connection request
- * that tcp_conn_request() marked with 'seqnum'. Rejection consists
+ * that tcp_input_listener() marked with 'seqnum'. Rejection consists
* of sending the appropriate RST, not an ICMP error.
*/
static void
@@ -6656,6 +5524,7 @@ tcp_disconnect(tcp_t *tcp, mblk_t *mp)
{
t_scalar_t seqnum;
int error;
+ conn_t *connp = tcp->tcp_connp;
ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_discon_req)) {
@@ -6669,11 +5538,11 @@ tcp_disconnect(tcp_t *tcp, mblk_t *mp)
else {
if (tcp->tcp_state >= TCPS_ESTABLISHED) {
/* Send M_FLUSH according to TPI */
- (void) putnextctl1(tcp->tcp_rq, M_FLUSH, FLUSHRW);
+ (void) putnextctl1(connp->conn_rq, M_FLUSH, FLUSHRW);
}
mp = mi_tpi_ok_ack_alloc(mp);
- if (mp)
- putnext(tcp->tcp_rq, mp);
+ if (mp != NULL)
+ putnext(connp->conn_rq, mp);
}
}
@@ -6695,6 +5564,7 @@ tcp_display(tcp_t *tcp, char *sup_buf, char format)
in6_addr_t local, remote;
char local_addrbuf[INET6_ADDRSTRLEN];
char remote_addrbuf[INET6_ADDRSTRLEN];
+ conn_t *connp;
if (sup_buf != NULL)
buf = sup_buf;
@@ -6703,6 +5573,8 @@ tcp_display(tcp_t *tcp, char *sup_buf, char format)
if (tcp == NULL)
return ("NULL_TCP");
+
+ connp = tcp->tcp_connp;
switch (tcp->tcp_state) {
case TCPS_CLOSED:
cp = "TCP_CLOSED";
@@ -6750,32 +5622,32 @@ tcp_display(tcp_t *tcp, char *sup_buf, char format)
}
switch (format) {
case DISP_ADDR_AND_PORT:
- if (tcp->tcp_ipversion == IPV4_VERSION) {
+ if (connp->conn_ipversion == IPV4_VERSION) {
/*
* Note that we use the remote address in the tcp_b
* structure. This means that it will print out
* the real destination address, not the next hop's
* address if source routing is used.
*/
- IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ip_src, &local);
- IN6_IPADDR_TO_V4MAPPED(tcp->tcp_remote, &remote);
+ IN6_IPADDR_TO_V4MAPPED(connp->conn_laddr_v4, &local);
+ IN6_IPADDR_TO_V4MAPPED(connp->conn_faddr_v4, &remote);
} else {
- local = tcp->tcp_ip_src_v6;
- remote = tcp->tcp_remote_v6;
+ local = connp->conn_laddr_v6;
+ remote = connp->conn_faddr_v6;
}
(void) inet_ntop(AF_INET6, &local, local_addrbuf,
sizeof (local_addrbuf));
(void) inet_ntop(AF_INET6, &remote, remote_addrbuf,
sizeof (remote_addrbuf));
(void) mi_sprintf(buf, "[%s.%u, %s.%u] %s",
- local_addrbuf, ntohs(tcp->tcp_lport), remote_addrbuf,
- ntohs(tcp->tcp_fport), cp);
+ local_addrbuf, ntohs(connp->conn_lport), remote_addrbuf,
+ ntohs(connp->conn_fport), cp);
break;
case DISP_PORT_ONLY:
default:
(void) mi_sprintf(buf, "[%u, %u] %s",
- ntohs(tcp->tcp_lport), ntohs(tcp->tcp_fport), cp);
+ ntohs(connp->conn_lport), ntohs(connp->conn_fport), cp);
break;
}
@@ -6788,26 +5660,24 @@ tcp_display(tcp_t *tcp, char *sup_buf, char format)
* eager to disappear either by means of tcp_eager_blowoff() or
* tcp_eager_cleanup() being called. tcp_eager_kill() can also be
* called (via squeue) if the eager cannot be inserted in the
- * fanout table in tcp_conn_request().
+ * fanout table in tcp_input_listener().
*/
/* ARGSUSED */
void
-tcp_eager_kill(void *arg, mblk_t *mp, void *arg2)
+tcp_eager_kill(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
{
conn_t *econnp = (conn_t *)arg;
tcp_t *eager = econnp->conn_tcp;
tcp_t *listener = eager->tcp_listener;
- tcp_stack_t *tcps = eager->tcp_tcps;
/*
* We could be called because listener is closing. Since
- * the eager is using listener's queue's, its not safe.
- * Better use the default queue just to send the TH_RST
- * out.
+ * the eager was using listener's queue's, we avoid
+ * using the listeners queues from now on.
*/
- ASSERT(tcps->tcps_g_q != NULL);
- eager->tcp_rq = tcps->tcps_g_q;
- eager->tcp_wq = WR(tcps->tcps_g_q);
+ ASSERT(eager->tcp_detached);
+ econnp->conn_rq = NULL;
+ econnp->conn_wq = NULL;
/*
* An eager's conn_fanout will be NULL if it's a duplicate
@@ -6828,7 +5698,7 @@ tcp_eager_kill(void *arg, mblk_t *mp, void *arg2)
* The eager has sent a conn_ind up to the
* listener but listener decides to close
* instead. We need to drop the extra ref
- * placed on eager in tcp_rput_data() before
+ * placed on eager in tcp_input_data() before
* sending the conn_ind to listener.
*/
CONN_DEC_REF(econnp);
@@ -6873,7 +5743,7 @@ tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum)
mutex_exit(&listener->tcp_eager_lock);
mp = &eager->tcp_closemp;
SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp, tcp_eager_kill,
- eager->tcp_connp, SQ_FILL, SQTAG_TCP_EAGER_BLOWOFF);
+ eager->tcp_connp, NULL, SQ_FILL, SQTAG_TCP_EAGER_BLOWOFF);
return (B_TRUE);
}
@@ -6901,7 +5771,7 @@ tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only)
CONN_INC_REF(eager->tcp_connp);
mp = &eager->tcp_closemp;
SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp,
- tcp_eager_kill, eager->tcp_connp,
+ tcp_eager_kill, eager->tcp_connp, NULL,
SQ_FILL, SQTAG_TCP_EAGER_CLEANUP);
}
eager = eager->tcp_eager_next_q;
@@ -6917,7 +5787,7 @@ tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only)
CONN_INC_REF(eager->tcp_connp);
mp = &eager->tcp_closemp;
SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp,
- tcp_eager_kill, eager->tcp_connp, SQ_FILL,
+ tcp_eager_kill, eager->tcp_connp, NULL, SQ_FILL,
SQTAG_TCP_EAGER_CLEANUP_Q0);
}
eager = eager->tcp_eager_next_q0;
@@ -7008,7 +5878,7 @@ static void
tcp_err_ack(tcp_t *tcp, mblk_t *mp, int t_error, int sys_error)
{
if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
- putnext(tcp->tcp_rq, mp);
+ putnext(tcp->tcp_connp->conn_rq, mp);
}
/* Shorthand to generate and send TPI error acks to our client */
@@ -7024,7 +5894,7 @@ tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive,
teackp->ERROR_prim = primitive;
teackp->TLI_error = t_error;
teackp->UNIX_error = sys_error;
- putnext(tcp->tcp_rq, mp);
+ putnext(tcp->tcp_connp->conn_rq, mp);
}
}
@@ -7194,8 +6064,9 @@ static void
tcp_copy_info(struct T_info_ack *tia, tcp_t *tcp)
{
tcp_stack_t *tcps = tcp->tcp_tcps;
+ conn_t *connp = tcp->tcp_connp;
- if (tcp->tcp_family == AF_INET6)
+ if (connp->conn_family == AF_INET6)
*tia = tcp_g_t_info_ack_v6;
else
*tia = tcp_g_t_info_ack;
@@ -7203,7 +6074,7 @@ tcp_copy_info(struct T_info_ack *tia, tcp_t *tcp)
tia->OPT_size = tcp_max_optsize;
if (tcp->tcp_mss == 0) {
/* Not yet set - tcp_open does not set mss */
- if (tcp->tcp_ipversion == IPV4_VERSION)
+ if (connp->conn_ipversion == IPV4_VERSION)
tia->TIDU_size = tcps->tcps_mss_def_ipv4;
else
tia->TIDU_size = tcps->tcps_mss_def_ipv6;
@@ -7258,7 +6129,7 @@ tcp_capability_req(tcp_t *tcp, mblk_t *mp)
tcap = (struct T_capability_ack *)mp->b_rptr;
tcp_do_capability_ack(tcp, tcap, cap_bits1);
- putnext(tcp->tcp_rq, mp);
+ putnext(tcp->tcp_connp->conn_rq, mp);
}
/*
@@ -7276,16 +6147,18 @@ tcp_info_req(tcp_t *tcp, mblk_t *mp)
return;
}
tcp_copy_info((struct T_info_ack *)mp->b_rptr, tcp);
- putnext(tcp->tcp_rq, mp);
+ putnext(tcp->tcp_connp->conn_rq, mp);
}
/* Respond to the TPI addr request */
static void
tcp_addr_req(tcp_t *tcp, mblk_t *mp)
{
- sin_t *sin;
+ struct sockaddr *sa;
mblk_t *ackmp;
struct T_addr_ack *taa;
+ conn_t *connp = tcp->tcp_connp;
+ uint_t addrlen;
/* Make it large enough for worst case */
ackmp = reallocb(mp, sizeof (struct T_addr_ack) +
@@ -7295,10 +6168,6 @@ tcp_addr_req(tcp_t *tcp, mblk_t *mp)
return;
}
- if (tcp->tcp_ipversion == IPV6_VERSION) {
- tcp_addr_req_ipv6(tcp, ackmp);
- return;
- }
taa = (struct T_addr_ack *)ackmp->b_rptr;
bzero(taa, sizeof (struct T_addr_ack));
@@ -7307,110 +6176,38 @@ tcp_addr_req(tcp_t *tcp, mblk_t *mp)
taa->PRIM_type = T_ADDR_ACK;
ackmp->b_datap->db_type = M_PCPROTO;
+ if (connp->conn_family == AF_INET)
+ addrlen = sizeof (sin_t);
+ else
+ addrlen = sizeof (sin6_t);
+
/*
* Note: Following code assumes 32 bit alignment of basic
* data structures like sin_t and struct T_addr_ack.
*/
if (tcp->tcp_state >= TCPS_BOUND) {
/*
- * Fill in local address
+ * Fill in local address first
*/
- taa->LOCADDR_length = sizeof (sin_t);
taa->LOCADDR_offset = sizeof (*taa);
-
- sin = (sin_t *)&taa[1];
-
- /* Fill zeroes and then intialize non-zero fields */
- *sin = sin_null;
-
- sin->sin_family = AF_INET;
-
- sin->sin_addr.s_addr = tcp->tcp_ipha->ipha_src;
- sin->sin_port = *(uint16_t *)tcp->tcp_tcph->th_lport;
-
- ackmp->b_wptr = (uchar_t *)&sin[1];
-
- if (tcp->tcp_state >= TCPS_SYN_RCVD) {
- /*
- * Fill in Remote address
- */
- taa->REMADDR_length = sizeof (sin_t);
- taa->REMADDR_offset = ROUNDUP32(taa->LOCADDR_offset +
- taa->LOCADDR_length);
-
- sin = (sin_t *)(ackmp->b_rptr + taa->REMADDR_offset);
- *sin = sin_null;
- sin->sin_family = AF_INET;
- sin->sin_addr.s_addr = tcp->tcp_remote;
- sin->sin_port = tcp->tcp_fport;
-
- ackmp->b_wptr = (uchar_t *)&sin[1];
- }
+ taa->LOCADDR_length = addrlen;
+ sa = (struct sockaddr *)&taa[1];
+ (void) conn_getsockname(connp, sa, &addrlen);
+ ackmp->b_wptr += addrlen;
}
- putnext(tcp->tcp_rq, ackmp);
-}
-
-/* Assumes that tcp_addr_req gets enough space and alignment */
-static void
-tcp_addr_req_ipv6(tcp_t *tcp, mblk_t *ackmp)
-{
- sin6_t *sin6;
- struct T_addr_ack *taa;
-
- ASSERT(tcp->tcp_ipversion == IPV6_VERSION);
- ASSERT(OK_32PTR(ackmp->b_rptr));
- ASSERT(ackmp->b_wptr - ackmp->b_rptr >= sizeof (struct T_addr_ack) +
- 2 * sizeof (sin6_t));
-
- taa = (struct T_addr_ack *)ackmp->b_rptr;
-
- bzero(taa, sizeof (struct T_addr_ack));
- ackmp->b_wptr = (uchar_t *)&taa[1];
-
- taa->PRIM_type = T_ADDR_ACK;
- ackmp->b_datap->db_type = M_PCPROTO;
-
- /*
- * Note: Following code assumes 32 bit alignment of basic
- * data structures like sin6_t and struct T_addr_ack.
- */
- if (tcp->tcp_state >= TCPS_BOUND) {
+ if (tcp->tcp_state >= TCPS_SYN_RCVD) {
/*
- * Fill in local address
+ * Fill in Remote address
*/
- taa->LOCADDR_length = sizeof (sin6_t);
- taa->LOCADDR_offset = sizeof (*taa);
-
- sin6 = (sin6_t *)&taa[1];
- *sin6 = sin6_null;
-
- sin6->sin6_family = AF_INET6;
- sin6->sin6_addr = tcp->tcp_ip6h->ip6_src;
- sin6->sin6_port = tcp->tcp_lport;
-
- ackmp->b_wptr = (uchar_t *)&sin6[1];
-
- if (tcp->tcp_state >= TCPS_SYN_RCVD) {
- /*
- * Fill in Remote address
- */
- taa->REMADDR_length = sizeof (sin6_t);
- taa->REMADDR_offset = ROUNDUP32(taa->LOCADDR_offset +
- taa->LOCADDR_length);
-
- sin6 = (sin6_t *)(ackmp->b_rptr + taa->REMADDR_offset);
- *sin6 = sin6_null;
- sin6->sin6_family = AF_INET6;
- sin6->sin6_flowinfo =
- tcp->tcp_ip6h->ip6_vcf &
- ~IPV6_VERS_AND_FLOW_MASK;
- sin6->sin6_addr = tcp->tcp_remote_v6;
- sin6->sin6_port = tcp->tcp_fport;
-
- ackmp->b_wptr = (uchar_t *)&sin6[1];
- }
+ taa->REMADDR_length = addrlen;
+ /* assumed 32-bit alignment */
+ taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length;
+ sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset);
+ (void) conn_getpeername(connp, sa, &addrlen);
+ ackmp->b_wptr += addrlen;
}
- putnext(tcp->tcp_rq, ackmp);
+ ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim);
+ putnext(tcp->tcp_connp->conn_rq, ackmp);
}
/*
@@ -7420,19 +6217,19 @@ tcp_addr_req_ipv6(tcp_t *tcp, mblk_t *ackmp)
static void
tcp_reinit(tcp_t *tcp)
{
- mblk_t *mp;
- int err;
+ mblk_t *mp;
tcp_stack_t *tcps = tcp->tcp_tcps;
+ conn_t *connp = tcp->tcp_connp;
TCP_STAT(tcps, tcp_reinit_calls);
/* tcp_reinit should never be called for detached tcp_t's */
ASSERT(tcp->tcp_listener == NULL);
- ASSERT((tcp->tcp_family == AF_INET &&
- tcp->tcp_ipversion == IPV4_VERSION) ||
- (tcp->tcp_family == AF_INET6 &&
- (tcp->tcp_ipversion == IPV4_VERSION ||
- tcp->tcp_ipversion == IPV6_VERSION)));
+ ASSERT((connp->conn_family == AF_INET &&
+ connp->conn_ipversion == IPV4_VERSION) ||
+ (connp->conn_family == AF_INET6 &&
+ (connp->conn_ipversion == IPV4_VERSION ||
+ connp->conn_ipversion == IPV6_VERSION)));
/* Cancel outstanding timers */
tcp_timers_stop(tcp);
@@ -7453,7 +6250,7 @@ tcp_reinit(tcp_t *tcp)
tcp->tcp_unsent = tcp->tcp_xmit_tail_unsent = 0;
mutex_enter(&tcp->tcp_non_sq_lock);
if (tcp->tcp_flow_stopped &&
- TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) {
+ TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) {
tcp_clrqfull(tcp);
}
mutex_exit(&tcp->tcp_non_sq_lock);
@@ -7494,7 +6291,7 @@ tcp_reinit(tcp_t *tcp)
*/
tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind);
- CL_INET_DISCONNECT(tcp->tcp_connp, tcp);
+ CL_INET_DISCONNECT(connp);
/*
* The connection can't be on the tcp_time_wait_head list
@@ -7522,14 +6319,12 @@ tcp_reinit(tcp_t *tcp)
* Reset/preserve other values
*/
tcp_reinit_values(tcp);
- ipcl_hash_remove(tcp->tcp_connp);
- conn_delete_ire(tcp->tcp_connp, NULL);
+ ipcl_hash_remove(connp);
+ ixa_cleanup(connp->conn_ixa);
tcp_ipsec_cleanup(tcp);
- if (tcp->tcp_connp->conn_effective_cred != NULL) {
- crfree(tcp->tcp_connp->conn_effective_cred);
- tcp->tcp_connp->conn_effective_cred = NULL;
- }
+ connp->conn_laddr_v6 = connp->conn_bound_addr_v6;
+ connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
if (tcp->tcp_conn_req_max != 0) {
/*
@@ -7553,44 +6348,31 @@ tcp_reinit(tcp_t *tcp)
tcp->tcp_eager_next_q0 = tcp->tcp_eager_prev_q0 = tcp;
tcp->tcp_eager_next_drop_q0 = tcp;
tcp->tcp_eager_prev_drop_q0 = tcp;
- tcp->tcp_connp->conn_recv = tcp_conn_request;
- if (tcp->tcp_family == AF_INET6) {
- ASSERT(tcp->tcp_connp->conn_af_isv6);
- (void) ipcl_bind_insert_v6(tcp->tcp_connp, IPPROTO_TCP,
- &tcp->tcp_ip6h->ip6_src, tcp->tcp_lport);
- } else {
- ASSERT(!tcp->tcp_connp->conn_af_isv6);
- (void) ipcl_bind_insert(tcp->tcp_connp, IPPROTO_TCP,
- tcp->tcp_ipha->ipha_src, tcp->tcp_lport);
- }
+ /*
+ * Initially set conn_recv to tcp_input_listener_unbound to try
+ * to pick a good squeue for the listener when the first SYN
+ * arrives. tcp_input_listener_unbound sets it to
+ * tcp_input_listener on that first SYN.
+ */
+ connp->conn_recv = tcp_input_listener_unbound;
+
+ connp->conn_proto = IPPROTO_TCP;
+ connp->conn_faddr_v6 = ipv6_all_zeros;
+ connp->conn_fport = 0;
+
+ (void) ipcl_bind_insert(connp);
} else {
tcp->tcp_state = TCPS_BOUND;
}
/*
* Initialize to default values
- * Can't fail since enough header template space already allocated
- * at open().
- */
- err = tcp_init_values(tcp);
- ASSERT(err == 0);
- /* Restore state in tcp_tcph */
- bcopy(&tcp->tcp_lport, tcp->tcp_tcph->th_lport, TCP_PORT_LEN);
- if (tcp->tcp_ipversion == IPV4_VERSION)
- tcp->tcp_ipha->ipha_src = tcp->tcp_bound_source;
- else
- tcp->tcp_ip6h->ip6_src = tcp->tcp_bound_source_v6;
- /*
- * Copy of the src addr. in tcp_t is needed in tcp_t
- * since the lookup funcs can only lookup on tcp_t
*/
- tcp->tcp_ip_src_v6 = tcp->tcp_bound_source_v6;
+ tcp_init_values(tcp);
ASSERT(tcp->tcp_ptpbhn != NULL);
- tcp->tcp_recv_hiwater = tcps->tcps_recv_hiwat;
- tcp->tcp_recv_lowater = tcp_rinfo.mi_lowat;
- tcp->tcp_rwnd = tcps->tcps_recv_hiwat;
- tcp->tcp_mss = tcp->tcp_ipversion != IPV4_VERSION ?
+ tcp->tcp_rwnd = connp->conn_rcvbuf;
+ tcp->tcp_mss = connp->conn_ipversion != IPV4_VERSION ?
tcps->tcps_mss_def_ipv6 : tcps->tcps_mss_def_ipv4;
}
@@ -7606,6 +6388,7 @@ tcp_reinit_values(tcp)
tcp_t *tcp;
{
tcp_stack_t *tcps = tcp->tcp_tcps;
+ conn_t *connp = tcp->tcp_connp;
#ifndef lint
#define DONTCARE(x)
@@ -7626,8 +6409,8 @@ tcp_reinit_values(tcp)
ASSERT(tcp->tcp_time_wait_prev == NULL);
ASSERT(tcp->tcp_time_wait_expire == 0);
PRESERVE(tcp->tcp_state);
- PRESERVE(tcp->tcp_rq);
- PRESERVE(tcp->tcp_wq);
+ PRESERVE(connp->conn_rq);
+ PRESERVE(connp->conn_wq);
ASSERT(tcp->tcp_xmit_head == NULL);
ASSERT(tcp->tcp_xmit_last == NULL);
@@ -7638,26 +6421,32 @@ tcp_reinit_values(tcp)
tcp->tcp_snxt = 0; /* Displayed in mib */
tcp->tcp_suna = 0; /* Displayed in mib */
tcp->tcp_swnd = 0;
- DONTCARE(tcp->tcp_cwnd); /* Init in tcp_mss_set */
+ DONTCARE(tcp->tcp_cwnd); /* Init in tcp_process_options */
ASSERT(tcp->tcp_ibsegs == 0);
ASSERT(tcp->tcp_obsegs == 0);
- if (tcp->tcp_iphc != NULL) {
- ASSERT(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH);
- bzero(tcp->tcp_iphc, tcp->tcp_iphc_len);
+ if (connp->conn_ht_iphc != NULL) {
+ kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated);
+ connp->conn_ht_iphc = NULL;
+ connp->conn_ht_iphc_allocated = 0;
+ connp->conn_ht_iphc_len = 0;
+ connp->conn_ht_ulp = NULL;
+ connp->conn_ht_ulp_len = 0;
+ tcp->tcp_ipha = NULL;
+ tcp->tcp_ip6h = NULL;
+ tcp->tcp_tcpha = NULL;
}
+ /* We clear any IP_OPTIONS and extension headers */
+ ip_pkt_free(&connp->conn_xmit_ipp);
+
DONTCARE(tcp->tcp_naglim); /* Init in tcp_init_values */
- DONTCARE(tcp->tcp_hdr_len); /* Init in tcp_init_values */
DONTCARE(tcp->tcp_ipha);
DONTCARE(tcp->tcp_ip6h);
- DONTCARE(tcp->tcp_ip_hdr_len);
- DONTCARE(tcp->tcp_tcph);
- DONTCARE(tcp->tcp_tcp_hdr_len); /* Init in tcp_init_values */
+ DONTCARE(tcp->tcp_tcpha);
tcp->tcp_valid_bits = 0;
- DONTCARE(tcp->tcp_xmit_hiwater); /* Init in tcp_init_values */
DONTCARE(tcp->tcp_timer_backoff); /* Init in tcp_init_values */
DONTCARE(tcp->tcp_last_recv_time); /* Init in tcp_init_values */
tcp->tcp_last_rcv_lbolt = 0;
@@ -7666,38 +6455,19 @@ tcp_reinit_values(tcp)
tcp->tcp_urp_last_valid = 0;
tcp->tcp_hard_binding = 0;
- tcp->tcp_hard_bound = 0;
- PRESERVE(tcp->tcp_cred);
- PRESERVE(tcp->tcp_cpid);
- PRESERVE(tcp->tcp_open_time);
- PRESERVE(tcp->tcp_exclbind);
tcp->tcp_fin_acked = 0;
tcp->tcp_fin_rcvd = 0;
tcp->tcp_fin_sent = 0;
tcp->tcp_ordrel_done = 0;
- tcp->tcp_debug = 0;
- tcp->tcp_dontroute = 0;
- tcp->tcp_broadcast = 0;
-
- tcp->tcp_useloopback = 0;
- tcp->tcp_reuseaddr = 0;
- tcp->tcp_oobinline = 0;
- tcp->tcp_dgram_errind = 0;
-
tcp->tcp_detached = 0;
- tcp->tcp_bind_pending = 0;
- tcp->tcp_unbind_pending = 0;
tcp->tcp_snd_ws_ok = B_FALSE;
tcp->tcp_snd_ts_ok = B_FALSE;
- tcp->tcp_linger = 0;
- tcp->tcp_ka_enabled = 0;
tcp->tcp_zero_win_probe = 0;
tcp->tcp_loopback = 0;
- tcp->tcp_refuse = 0;
tcp->tcp_localnet = 0;
tcp->tcp_syn_defense = 0;
tcp->tcp_set_timer = 0;
@@ -7707,19 +6477,12 @@ tcp_reinit_values(tcp)
tcp->tcp_xmit_zc_clean = B_FALSE;
tcp->tcp_snd_sack_ok = B_FALSE;
- PRESERVE(tcp->tcp_recvdstaddr);
tcp->tcp_hwcksum = B_FALSE;
- tcp->tcp_ire_ill_check_done = B_FALSE;
- DONTCARE(tcp->tcp_maxpsz); /* Init in tcp_init_values */
-
- tcp->tcp_mdt = B_FALSE;
- tcp->tcp_mdt_hdr_head = 0;
- tcp->tcp_mdt_hdr_tail = 0;
+ DONTCARE(tcp->tcp_maxpsz_multiplier); /* Init in tcp_init_values */
tcp->tcp_conn_def_q0 = 0;
tcp->tcp_ip_forward_progress = B_FALSE;
- tcp->tcp_anon_priv_bind = 0;
tcp->tcp_ecn_ok = B_FALSE;
tcp->tcp_cwr = B_FALSE;
@@ -7740,7 +6503,7 @@ tcp_reinit_values(tcp)
tcp->tcp_ts_recent = 0;
tcp->tcp_rnxt = 0; /* Displayed in mib */
DONTCARE(tcp->tcp_rwnd); /* Set in tcp_reinit() */
- tcp->tcp_if_mtu = 0;
+ tcp->tcp_initial_pmtu = 0;
ASSERT(tcp->tcp_reass_head == NULL);
ASSERT(tcp->tcp_reass_tail == NULL);
@@ -7752,7 +6515,7 @@ tcp_reinit_values(tcp)
ASSERT(tcp->tcp_rcv_last_tail == NULL);
ASSERT(tcp->tcp_rcv_cnt == 0);
- DONTCARE(tcp->tcp_cwnd_ssthresh); /* Init in tcp_adapt_ire */
+ DONTCARE(tcp->tcp_cwnd_ssthresh); /* Init in tcp_set_destination */
DONTCARE(tcp->tcp_cwnd_max); /* Init in tcp_init_values */
tcp->tcp_csuna = 0;
@@ -7773,8 +6536,6 @@ tcp_reinit_values(tcp)
ASSERT(tcp->tcp_listener == NULL);
- DONTCARE(tcp->tcp_xmit_lowater); /* Init in tcp_init_values */
-
DONTCARE(tcp->tcp_irs); /* tcp_valid_bits cleared */
DONTCARE(tcp->tcp_iss); /* tcp_valid_bits cleared */
DONTCARE(tcp->tcp_fss); /* tcp_valid_bits cleared */
@@ -7785,14 +6546,11 @@ tcp_reinit_values(tcp)
PRESERVE(tcp->tcp_conn_req_max);
PRESERVE(tcp->tcp_conn_req_seqnum);
- DONTCARE(tcp->tcp_ip_hdr_len); /* Init in tcp_init_values */
DONTCARE(tcp->tcp_first_timer_threshold); /* Init in tcp_init_values */
DONTCARE(tcp->tcp_second_timer_threshold); /* Init in tcp_init_values */
DONTCARE(tcp->tcp_first_ctimer_threshold); /* Init in tcp_init_values */
DONTCARE(tcp->tcp_second_ctimer_threshold); /* in tcp_init_values */
- tcp->tcp_lingertime = 0;
-
DONTCARE(tcp->tcp_urp_last); /* tcp_urp_last_valid is cleared */
ASSERT(tcp->tcp_urp_mp == NULL);
ASSERT(tcp->tcp_urp_mark_mp == NULL);
@@ -7811,16 +6569,16 @@ tcp_reinit_values(tcp)
tcp->tcp_client_errno = 0;
- DONTCARE(tcp->tcp_sum); /* Init in tcp_init_values */
+ DONTCARE(connp->conn_sum); /* Init in tcp_init_values */
- tcp->tcp_remote_v6 = ipv6_all_zeros; /* Displayed in MIB */
+ connp->conn_faddr_v6 = ipv6_all_zeros; /* Displayed in MIB */
- PRESERVE(tcp->tcp_bound_source_v6);
+ PRESERVE(connp->conn_bound_addr_v6);
tcp->tcp_last_sent_len = 0;
tcp->tcp_dupack_cnt = 0;
- tcp->tcp_fport = 0; /* Displayed in MIB */
- PRESERVE(tcp->tcp_lport);
+ connp->conn_fport = 0; /* Displayed in MIB */
+ PRESERVE(connp->conn_lport);
PRESERVE(tcp->tcp_acceptor_lockp);
@@ -7828,16 +6586,18 @@ tcp_reinit_values(tcp)
PRESERVE(tcp->tcp_acceptor_id);
DONTCARE(tcp->tcp_ipsec_overhead);
- PRESERVE(tcp->tcp_family);
- if (tcp->tcp_family == AF_INET6) {
+ PRESERVE(connp->conn_family);
+ /* Remove any remnants of mapped address binding */
+ if (connp->conn_family == AF_INET6) {
+ connp->conn_ipversion = IPV6_VERSION;
tcp->tcp_mss = tcps->tcps_mss_def_ipv6;
} else {
+ connp->conn_ipversion = IPV4_VERSION;
tcp->tcp_mss = tcps->tcps_mss_def_ipv4;
}
- PRESERVE(tcp->tcp_ipversion); /* Init in tcp_init_values */
- tcp->tcp_bound_if = 0;
- tcp->tcp_ipv6_recvancillary = 0;
+ connp->conn_bound_if = 0;
+ connp->conn_recv_ancillary.crb_all = 0;
tcp->tcp_recvifindex = 0;
tcp->tcp_recvhops = 0;
tcp->tcp_closed = 0;
@@ -7854,19 +6614,18 @@ tcp_reinit_values(tcp)
tcp->tcp_dstoptslen = 0;
}
ASSERT(tcp->tcp_dstoptslen == 0);
- if (tcp->tcp_rtdstopts != NULL) {
- mi_free(tcp->tcp_rtdstopts);
- tcp->tcp_rtdstopts = NULL;
- tcp->tcp_rtdstoptslen = 0;
+ if (tcp->tcp_rthdrdstopts != NULL) {
+ mi_free(tcp->tcp_rthdrdstopts);
+ tcp->tcp_rthdrdstopts = NULL;
+ tcp->tcp_rthdrdstoptslen = 0;
}
- ASSERT(tcp->tcp_rtdstoptslen == 0);
+ ASSERT(tcp->tcp_rthdrdstoptslen == 0);
if (tcp->tcp_rthdr != NULL) {
mi_free(tcp->tcp_rthdr);
tcp->tcp_rthdr = NULL;
tcp->tcp_rthdrlen = 0;
}
ASSERT(tcp->tcp_rthdrlen == 0);
- PRESERVE(tcp->tcp_drop_opt_ack_cnt);
/* Reset fusion-related fields */
tcp->tcp_fused = B_FALSE;
@@ -7902,35 +6661,17 @@ tcp_reinit_values(tcp)
#undef PRESERVE
}
-/*
- * Allocate necessary resources and initialize state vector.
- * Guaranteed not to fail so that when an error is returned,
- * the caller doesn't need to do any additional cleanup.
- */
-int
-tcp_init(tcp_t *tcp, queue_t *q)
-{
- int err;
-
- tcp->tcp_rq = q;
- tcp->tcp_wq = WR(q);
- tcp->tcp_state = TCPS_IDLE;
- if ((err = tcp_init_values(tcp)) != 0)
- tcp_timers_stop(tcp);
- return (err);
-}
-
-static int
+static void
tcp_init_values(tcp_t *tcp)
{
- int err;
tcp_stack_t *tcps = tcp->tcp_tcps;
+ conn_t *connp = tcp->tcp_connp;
- ASSERT((tcp->tcp_family == AF_INET &&
- tcp->tcp_ipversion == IPV4_VERSION) ||
- (tcp->tcp_family == AF_INET6 &&
- (tcp->tcp_ipversion == IPV4_VERSION ||
- tcp->tcp_ipversion == IPV6_VERSION)));
+ ASSERT((connp->conn_family == AF_INET &&
+ connp->conn_ipversion == IPV4_VERSION) ||
+ (connp->conn_family == AF_INET6 &&
+ (connp->conn_ipversion == IPV4_VERSION ||
+ connp->conn_ipversion == IPV6_VERSION)));
/*
* Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO
@@ -7953,7 +6694,7 @@ tcp_init_values(tcp_t *tcp)
tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN;
tcp->tcp_snd_burst = TCP_CWND_INFINITE;
- tcp->tcp_maxpsz = tcps->tcps_maxpsz_multiplier;
+ tcp->tcp_maxpsz_multiplier = tcps->tcps_maxpsz_multiplier;
tcp->tcp_first_timer_threshold = tcps->tcps_ip_notify_interval;
tcp->tcp_first_ctimer_threshold = tcps->tcps_ip_notify_cinterval;
@@ -7966,10 +6707,7 @@ tcp_init_values(tcp_t *tcp)
tcp->tcp_naglim = tcps->tcps_naglim_def;
- /* NOTE: ISS is now set in tcp_adapt_ire(). */
-
- tcp->tcp_mdt_hdr_head = 0;
- tcp->tcp_mdt_hdr_tail = 0;
+ /* NOTE: ISS is now set in tcp_set_destination(). */
/* Reset fusion-related fields */
tcp->tcp_fused = B_FALSE;
@@ -7977,280 +6715,84 @@ tcp_init_values(tcp_t *tcp)
tcp->tcp_fused_sigurg = B_FALSE;
tcp->tcp_loopback_peer = NULL;
- /* Initialize the header template */
- if (tcp->tcp_family == AF_INET) {
- err = tcp_header_init_ipv4(tcp);
- } else {
- err = tcp_header_init_ipv6(tcp);
- }
- if (err)
- return (err);
+ /* We rebuild the header template on the next connect/conn_request */
+
+ connp->conn_mlp_type = mlptSingle;
/*
* Init the window scale to the max so tcp_rwnd_set() won't pare
- * down tcp_rwnd. tcp_adapt_ire() will set the right value later.
+ * down tcp_rwnd. tcp_set_destination() will set the right value later.
*/
tcp->tcp_rcv_ws = TCP_MAX_WINSHIFT;
- tcp->tcp_xmit_lowater = tcps->tcps_xmit_lowat;
- tcp->tcp_xmit_hiwater = tcps->tcps_xmit_hiwat;
- tcp->tcp_recv_hiwater = tcps->tcps_recv_hiwat;
- tcp->tcp_rwnd = tcps->tcps_recv_hiwat;
- tcp->tcp_recv_lowater = tcp_rinfo.mi_lowat;
+ tcp->tcp_rwnd = connp->conn_rcvbuf;
tcp->tcp_cork = B_FALSE;
/*
- * Init the tcp_debug option. This value determines whether TCP
+ * Init the tcp_debug option if it wasn't already set. This value
+ * determines whether TCP
* calls strlog() to print out debug messages. Doing this
* initialization here means that this value is not inherited thru
* tcp_reinit().
*/
- tcp->tcp_debug = tcps->tcps_dbg;
+ if (!connp->conn_debug)
+ connp->conn_debug = tcps->tcps_dbg;
tcp->tcp_ka_interval = tcps->tcps_keepalive_interval;
tcp->tcp_ka_abort_thres = tcps->tcps_keepalive_abort_interval;
-
- return (0);
-}
-
-/*
- * Initialize the IPv4 header. Loses any record of any IP options.
- */
-static int
-tcp_header_init_ipv4(tcp_t *tcp)
-{
- tcph_t *tcph;
- uint32_t sum;
- conn_t *connp;
- tcp_stack_t *tcps = tcp->tcp_tcps;
-
- /*
- * This is a simple initialization. If there's
- * already a template, it should never be too small,
- * so reuse it. Otherwise, allocate space for the new one.
- */
- if (tcp->tcp_iphc == NULL) {
- ASSERT(tcp->tcp_iphc_len == 0);
- tcp->tcp_iphc_len = TCP_MAX_COMBINED_HEADER_LENGTH;
- tcp->tcp_iphc = kmem_cache_alloc(tcp_iphc_cache, KM_NOSLEEP);
- if (tcp->tcp_iphc == NULL) {
- tcp->tcp_iphc_len = 0;
- return (ENOMEM);
- }
- }
-
- /* options are gone; may need a new label */
- connp = tcp->tcp_connp;
- connp->conn_mlp_type = mlptSingle;
- connp->conn_ulp_labeled = !is_system_labeled();
- ASSERT(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH);
-
- /*
- * tcp_do_get{sock,peer}name constructs the sockaddr from the
- * ip header, and decides which header to use based on ip version.
- * That operation happens outside the squeue, so we hold the lock
- * here to ensure that the ip version and header remain consistent.
- */
- mutex_enter(&connp->conn_lock);
- tcp->tcp_ipversion = IPV4_VERSION;
- tcp->tcp_ipha = (ipha_t *)tcp->tcp_iphc;
- tcp->tcp_ip6h = NULL;
- mutex_exit(&connp->conn_lock);
-
- tcp->tcp_hdr_len = sizeof (ipha_t) + sizeof (tcph_t);
- tcp->tcp_tcp_hdr_len = sizeof (tcph_t);
- tcp->tcp_ip_hdr_len = sizeof (ipha_t);
- tcp->tcp_ipha->ipha_length = htons(sizeof (ipha_t) + sizeof (tcph_t));
- tcp->tcp_ipha->ipha_version_and_hdr_length
- = (IP_VERSION << 4) | IP_SIMPLE_HDR_LENGTH_IN_WORDS;
- tcp->tcp_ipha->ipha_ident = 0;
-
- tcp->tcp_ttl = (uchar_t)tcps->tcps_ipv4_ttl;
- tcp->tcp_tos = 0;
- tcp->tcp_ipha->ipha_fragment_offset_and_flags = 0;
- tcp->tcp_ipha->ipha_ttl = (uchar_t)tcps->tcps_ipv4_ttl;
- tcp->tcp_ipha->ipha_protocol = IPPROTO_TCP;
-
- tcph = (tcph_t *)(tcp->tcp_iphc + sizeof (ipha_t));
- tcp->tcp_tcph = tcph;
- tcph->th_offset_and_rsrvd[0] = (5 << 4);
- /*
- * IP wants our header length in the checksum field to
- * allow it to perform a single pseudo-header+checksum
- * calculation on behalf of TCP.
- * Include the adjustment for a source route once IP_OPTIONS is set.
- */
- sum = sizeof (tcph_t) + tcp->tcp_sum;
- sum = (sum >> 16) + (sum & 0xFFFF);
- U16_TO_ABE16(sum, tcph->th_sum);
- return (0);
-}
-
-/*
- * Initialize the IPv6 header. Loses any record of any IPv6 extension headers.
- */
-static int
-tcp_header_init_ipv6(tcp_t *tcp)
-{
- tcph_t *tcph;
- uint32_t sum;
- conn_t *connp;
- tcp_stack_t *tcps = tcp->tcp_tcps;
-
- /*
- * This is a simple initialization. If there's
- * already a template, it should never be too small,
- * so reuse it. Otherwise, allocate space for the new one.
- * Ensure that there is enough space to "downgrade" the tcp_t
- * to an IPv4 tcp_t. This requires having space for a full load
- * of IPv4 options, as well as a full load of TCP options
- * (TCP_MAX_COMBINED_HEADER_LENGTH, 120 bytes); this is more space
- * than a v6 header and a TCP header with a full load of TCP options
- * (IPV6_HDR_LEN is 40 bytes; TCP_MAX_HDR_LENGTH is 60 bytes).
- * We want to avoid reallocation in the "downgraded" case when
- * processing outbound IPv4 options.
- */
- if (tcp->tcp_iphc == NULL) {
- ASSERT(tcp->tcp_iphc_len == 0);
- tcp->tcp_iphc_len = TCP_MAX_COMBINED_HEADER_LENGTH;
- tcp->tcp_iphc = kmem_cache_alloc(tcp_iphc_cache, KM_NOSLEEP);
- if (tcp->tcp_iphc == NULL) {
- tcp->tcp_iphc_len = 0;
- return (ENOMEM);
- }
- }
-
- /* options are gone; may need a new label */
- connp = tcp->tcp_connp;
- connp->conn_mlp_type = mlptSingle;
- connp->conn_ulp_labeled = !is_system_labeled();
-
- ASSERT(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH);
- tcp->tcp_hdr_len = IPV6_HDR_LEN + sizeof (tcph_t);
- tcp->tcp_tcp_hdr_len = sizeof (tcph_t);
- tcp->tcp_ip_hdr_len = IPV6_HDR_LEN;
-
- /*
- * tcp_do_get{sock,peer}name constructs the sockaddr from the
- * ip header, and decides which header to use based on ip version.
- * That operation happens outside the squeue, so we hold the lock
- * here to ensure that the ip version and header remain consistent.
- */
- mutex_enter(&connp->conn_lock);
- tcp->tcp_ipversion = IPV6_VERSION;
- tcp->tcp_ip6h = (ip6_t *)tcp->tcp_iphc;
- tcp->tcp_ipha = NULL;
- mutex_exit(&connp->conn_lock);
-
- /* Initialize the header template */
-
- tcp->tcp_ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
- tcp->tcp_ip6h->ip6_plen = ntohs(sizeof (tcph_t));
- tcp->tcp_ip6h->ip6_nxt = IPPROTO_TCP;
- tcp->tcp_ip6h->ip6_hops = (uint8_t)tcps->tcps_ipv6_hoplimit;
-
- tcph = (tcph_t *)(tcp->tcp_iphc + IPV6_HDR_LEN);
- tcp->tcp_tcph = tcph;
- tcph->th_offset_and_rsrvd[0] = (5 << 4);
- /*
- * IP wants our header length in the checksum field to
- * allow it to perform a single psuedo-header+checksum
- * calculation on behalf of TCP.
- * Include the adjustment for a source route when IPV6_RTHDR is set.
- */
- sum = sizeof (tcph_t) + tcp->tcp_sum;
- sum = (sum >> 16) + (sum & 0xFFFF);
- U16_TO_ABE16(sum, tcph->th_sum);
- return (0);
}
/* At minimum we need 8 bytes in the TCP header for the lookup */
#define ICMP_MIN_TCP_HDR 8
/*
- * tcp_icmp_error is called by tcp_rput_other to process ICMP error messages
+ * tcp_icmp_input is called as conn_recvicmp to process ICMP error messages
* passed up by IP. The message is always received on the correct tcp_t.
* Assumes that IP has pulled up everything up to and including the ICMP header.
*/
-void
-tcp_icmp_error(tcp_t *tcp, mblk_t *mp)
+/* ARGSUSED2 */
+static void
+tcp_icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
{
- icmph_t *icmph;
- ipha_t *ipha;
- int iph_hdr_length;
- tcph_t *tcph;
- boolean_t ipsec_mctl = B_FALSE;
- boolean_t secure;
- mblk_t *first_mp = mp;
- int32_t new_mss;
- uint32_t ratio;
- size_t mp_size = MBLKL(mp);
- uint32_t seg_seq;
- tcp_stack_t *tcps = tcp->tcp_tcps;
- ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;
-
- /* Assume IP provides aligned packets - otherwise toss */
- if (!OK_32PTR(mp->b_rptr)) {
- freemsg(mp);
- return;
- }
-
- /*
- * Since ICMP errors are normal data marked with M_CTL when sent
- * to TCP or UDP, we have to look for a IPSEC_IN value to identify
- * packets starting with an ipsec_info_t, see ipsec_info.h.
- */
- if ((mp_size == sizeof (ipsec_info_t)) &&
- (((ipsec_info_t *)mp->b_rptr)->ipsec_info_type == IPSEC_IN)) {
- ASSERT(mp->b_cont != NULL);
- mp = mp->b_cont;
- /* IP should have done this */
- ASSERT(OK_32PTR(mp->b_rptr));
- mp_size = MBLKL(mp);
- ipsec_mctl = B_TRUE;
- }
+ conn_t *connp = (conn_t *)arg1;
+ icmph_t *icmph;
+ ipha_t *ipha;
+ int iph_hdr_length;
+ tcpha_t *tcpha;
+ uint32_t seg_seq;
+ tcp_t *tcp = connp->conn_tcp;
- /*
- * Verify that we have a complete outer IP header. If not, drop it.
- */
- if (mp_size < sizeof (ipha_t)) {
-noticmpv4:
- freemsg(first_mp);
- return;
- }
+ /* Assume IP provides aligned packets */
+ ASSERT(OK_32PTR(mp->b_rptr));
+ ASSERT((MBLKL(mp) >= sizeof (ipha_t)));
- ipha = (ipha_t *)mp->b_rptr;
/*
* Verify IP version. Anything other than IPv4 or IPv6 packet is sent
* upstream. ICMPv6 is handled in tcp_icmp_error_ipv6.
*/
- switch (IPH_HDR_VERSION(ipha)) {
- case IPV6_VERSION:
- tcp_icmp_error_ipv6(tcp, first_mp, ipsec_mctl);
+ if (!(ira->ira_flags & IRAF_IS_IPV4)) {
+ tcp_icmp_error_ipv6(tcp, mp, ira);
return;
- case IPV4_VERSION:
- break;
- default:
- goto noticmpv4;
}
/* Skip past the outer IP and ICMP headers */
- iph_hdr_length = IPH_HDR_LENGTH(ipha);
+ iph_hdr_length = ira->ira_ip_hdr_length;
icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
/*
- * If we don't have the correct outer IP header length or if the ULP
- * is not IPPROTO_ICMP or if we don't have a complete inner IP header
- * send it upstream.
+ * If we don't have the correct outer IP header length
+ * or if we don't have a complete inner IP header
+ * drop it.
*/
if (iph_hdr_length < sizeof (ipha_t) ||
- ipha->ipha_protocol != IPPROTO_ICMP ||
(ipha_t *)&icmph[1] + 1 > (ipha_t *)mp->b_wptr) {
- goto noticmpv4;
+noticmpv4:
+ freemsg(mp);
+ return;
}
ipha = (ipha_t *)&icmph[1];
/* Skip past the inner IP and find the ULP header */
iph_hdr_length = IPH_HDR_LENGTH(ipha);
- tcph = (tcph_t *)((char *)ipha + iph_hdr_length);
+ tcpha = (tcpha_t *)((char *)ipha + iph_hdr_length);
/*
* If we don't have the correct inner IP header length or if the ULP
* is not IPPROTO_TCP or if we don't have at least ICMP_MIN_TCP_HDR
@@ -8258,166 +6800,20 @@ noticmpv4:
*/
if (iph_hdr_length < sizeof (ipha_t) ||
ipha->ipha_protocol != IPPROTO_TCP ||
- (uchar_t *)tcph + ICMP_MIN_TCP_HDR > mp->b_wptr) {
- goto noticmpv4;
- }
-
- if (TCP_IS_DETACHED_NONEAGER(tcp)) {
- if (ipsec_mctl) {
- secure = ipsec_in_is_secure(first_mp);
- } else {
- secure = B_FALSE;
- }
- if (secure) {
- /*
- * If we are willing to accept this in clear
- * we don't have to verify policy.
- */
- if (!ipsec_inbound_accept_clear(mp, ipha, NULL)) {
- if (!tcp_check_policy(tcp, first_mp,
- ipha, NULL, secure, ipsec_mctl)) {
- /*
- * tcp_check_policy called
- * ip_drop_packet() on failure.
- */
- return;
- }
- }
- }
- } else if (ipsec_mctl) {
- /*
- * This is a hard_bound connection. IP has already
- * verified policy. We don't have to do it again.
- */
- freeb(first_mp);
- first_mp = mp;
- ipsec_mctl = B_FALSE;
- }
-
- seg_seq = ABE32_TO_U32(tcph->th_seq);
- /*
- * TCP SHOULD check that the TCP sequence number contained in
- * payload of the ICMP error message is within the range
- * SND.UNA <= SEG.SEQ < SND.NXT.
- */
- if (SEQ_LT(seg_seq, tcp->tcp_suna) || SEQ_GEQ(seg_seq, tcp->tcp_snxt)) {
- /*
- * The ICMP message is bogus, just drop it. But if this is
- * an ICMP too big message, IP has already changed
- * the ire_max_frag to the bogus value. We need to change
- * it back.
- */
- if (icmph->icmph_type == ICMP_DEST_UNREACHABLE &&
- icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED) {
- conn_t *connp = tcp->tcp_connp;
- ire_t *ire;
- int flag;
-
- if (tcp->tcp_ipversion == IPV4_VERSION) {
- flag = tcp->tcp_ipha->
- ipha_fragment_offset_and_flags;
- } else {
- flag = 0;
- }
- mutex_enter(&connp->conn_lock);
- if ((ire = connp->conn_ire_cache) != NULL) {
- mutex_enter(&ire->ire_lock);
- mutex_exit(&connp->conn_lock);
- ire->ire_max_frag = tcp->tcp_if_mtu;
- ire->ire_frag_flag |= flag;
- mutex_exit(&ire->ire_lock);
- } else {
- mutex_exit(&connp->conn_lock);
- }
- }
+ (uchar_t *)tcpha + ICMP_MIN_TCP_HDR > mp->b_wptr) {
goto noticmpv4;
}
+ seg_seq = ntohl(tcpha->tha_seq);
switch (icmph->icmph_type) {
case ICMP_DEST_UNREACHABLE:
switch (icmph->icmph_code) {
case ICMP_FRAGMENTATION_NEEDED:
/*
- * Reduce the MSS based on the new MTU. This will
- * eliminate any fragmentation locally.
- * N.B. There may well be some funny side-effects on
- * the local send policy and the remote receive policy.
- * Pending further research, we provide
- * tcp_ignore_path_mtu just in case this proves
- * disastrous somewhere.
- *
- * After updating the MSS, retransmit part of the
- * dropped segment using the new mss by calling
- * tcp_wput_data(). Need to adjust all those
- * params to make sure tcp_wput_data() work properly.
- */
- if (tcps->tcps_ignore_path_mtu ||
- tcp->tcp_ipha->ipha_fragment_offset_and_flags == 0)
- break;
-
- /*
- * Decrease the MSS by time stamp options
- * IP options and IPSEC options. tcp_hdr_len
- * includes time stamp option and IP option
- * length. Note that new_mss may be negative
- * if tcp_ipsec_overhead is large and the
- * icmph_du_mtu is the minimum value, which is 68.
- */
- new_mss = ntohs(icmph->icmph_du_mtu) -
- tcp->tcp_hdr_len - tcp->tcp_ipsec_overhead;
-
- DTRACE_PROBE2(tcp__pmtu__change, tcp_t *, tcp, int,
- new_mss);
-
- /*
- * Only update the MSS if the new one is
- * smaller than the previous one. This is
- * to avoid problems when getting multiple
- * ICMP errors for the same MTU.
- */
- if (new_mss >= tcp->tcp_mss)
- break;
-
- /*
- * Note that we are using the template header's DF
- * bit in the fast path sending. So we need to compare
- * the new mss with both tcps_mss_min and ip_pmtu_min.
- * And stop doing IPv4 PMTUd if new_mss is less than
- * MAX(tcps_mss_min, ip_pmtu_min).
- */
- if (new_mss < tcps->tcps_mss_min ||
- new_mss < ipst->ips_ip_pmtu_min) {
- tcp->tcp_ipha->ipha_fragment_offset_and_flags =
- 0;
- }
-
- ratio = tcp->tcp_cwnd / tcp->tcp_mss;
- ASSERT(ratio >= 1);
- tcp_mss_set(tcp, new_mss, B_TRUE);
-
- /*
- * Make sure we have something to
- * send.
+ * Update Path MTU, then try to send something out.
*/
- if (SEQ_LT(tcp->tcp_suna, tcp->tcp_snxt) &&
- (tcp->tcp_xmit_head != NULL)) {
- /*
- * Shrink tcp_cwnd in
- * proportion to the old MSS/new MSS.
- */
- tcp->tcp_cwnd = ratio * tcp->tcp_mss;
- if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
- (tcp->tcp_unsent == 0)) {
- tcp->tcp_rexmit_max = tcp->tcp_fss;
- } else {
- tcp->tcp_rexmit_max = tcp->tcp_snxt;
- }
- tcp->tcp_rexmit_nxt = tcp->tcp_suna;
- tcp->tcp_rexmit = B_TRUE;
- tcp->tcp_dupack_cnt = 0;
- tcp->tcp_snd_burst = TCP_CWND_SS;
- tcp_ss_rexmit(tcp);
- }
+ tcp_update_pmtu(tcp, B_TRUE);
+ tcp_rexmit_after_error(tcp);
break;
case ICMP_PORT_UNREACHABLE:
case ICMP_PROTOCOL_UNREACHABLE:
@@ -8451,7 +6847,6 @@ noticmpv4:
* Ditch the half-open connection if we
* suspect a SYN attack is under way.
*/
- tcp_ip_ire_mark_advice(tcp);
(void) tcp_clean_death(tcp,
tcp->tcp_client_errno, 7);
}
@@ -8483,67 +6878,191 @@ noticmpv4:
break;
}
}
- freemsg(first_mp);
+ freemsg(mp);
}
/*
- * tcp_icmp_error_ipv6 is called by tcp_rput_other to process ICMPv6
- * error messages passed up by IP.
- * Assumes that IP has pulled up all the extension headers as well
- * as the ICMPv6 header.
+ * CALLED OUTSIDE OF SQUEUE! It can not follow any pointers that tcp might
+ * change. But it can refer to fields like tcp_suna and tcp_snxt.
+ *
+ * Function tcp_verifyicmp is called as conn_verifyicmp to verify the ICMP
+ * error messages received by IP. The message is always received on the correct
+ * tcp_t.
+ */
+/* ARGSUSED */
+static boolean_t
+tcp_verifyicmp(conn_t *connp, void *arg2, icmph_t *icmph, icmp6_t *icmp6,
+ ip_recv_attr_t *ira)
+{
+ tcpha_t *tcpha = (tcpha_t *)arg2;
+ uint32_t seq = ntohl(tcpha->tha_seq);
+ tcp_t *tcp = connp->conn_tcp;
+
+ /*
+ * TCP sequence number contained in payload of the ICMP error message
+ * should be within the range SND.UNA <= SEG.SEQ < SND.NXT. Otherwise,
+ * the message is either a stale ICMP error, or an attack from the
+ * network. Fail the verification.
+ */
+ if (SEQ_LT(seq, tcp->tcp_suna) || SEQ_GEQ(seq, tcp->tcp_snxt))
+ return (B_FALSE);
+
+ /* For "too big" we also check the ignore flag */
+ if (ira->ira_flags & IRAF_IS_IPV4) {
+ ASSERT(icmph != NULL);
+ if (icmph->icmph_type == ICMP_DEST_UNREACHABLE &&
+ icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED &&
+ tcp->tcp_tcps->tcps_ignore_path_mtu)
+ return (B_FALSE);
+ } else {
+ ASSERT(icmp6 != NULL);
+ if (icmp6->icmp6_type == ICMP6_PACKET_TOO_BIG &&
+ tcp->tcp_tcps->tcps_ignore_path_mtu)
+ return (B_FALSE);
+ }
+ return (B_TRUE);
+}
+
+/*
+ * Update the TCP connection according to change of PMTU.
+ *
+ * Path MTU might have changed by either increase or decrease, so need to
+ * adjust the MSS based on the value of ixa_pmtu. No need to handle tiny
+ * or negative MSS, since tcp_mss_set() will do it.
*/
static void
-tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp, boolean_t ipsec_mctl)
+tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only)
{
- icmp6_t *icmp6;
- ip6_t *ip6h;
- uint16_t iph_hdr_length;
- tcpha_t *tcpha;
- uint8_t *nexthdrp;
- uint32_t new_mss;
- uint32_t ratio;
- boolean_t secure;
- mblk_t *first_mp = mp;
- size_t mp_size;
- uint32_t seg_seq;
- tcp_stack_t *tcps = tcp->tcp_tcps;
+ uint32_t pmtu;
+ int32_t mss;
+ conn_t *connp = tcp->tcp_connp;
+ ip_xmit_attr_t *ixa = connp->conn_ixa;
+ iaflags_t ixaflags;
+
+ if (tcp->tcp_tcps->tcps_ignore_path_mtu)
+ return;
+
+ if (tcp->tcp_state < TCPS_ESTABLISHED)
+ return;
/*
- * The caller has determined if this is an IPSEC_IN packet and
- * set ipsec_mctl appropriately (see tcp_icmp_error).
+ * Always call ip_get_pmtu() to make sure that IP has updated
+ * ixa_flags properly.
*/
- if (ipsec_mctl)
- mp = mp->b_cont;
+ pmtu = ip_get_pmtu(ixa);
+ ixaflags = ixa->ixa_flags;
- mp_size = MBLKL(mp);
+ /*
+ * Calculate the MSS by decreasing the PMTU by conn_ht_iphc_len and
+ * IPsec overhead if applied. Make sure to use the most recent
+ * IPsec information.
+ */
+ mss = pmtu - connp->conn_ht_iphc_len - conn_ipsec_length(connp);
/*
- * Verify that we have a complete IP header. If not, send it upstream.
+ * Nothing to change, so just return.
*/
- if (mp_size < sizeof (ip6_t)) {
-noticmpv6:
- freemsg(first_mp);
+ if (mss == tcp->tcp_mss)
return;
- }
/*
- * Verify this is an ICMPV6 packet, else send it upstream.
+ * Currently, for ICMP errors, only PMTU decrease is handled.
*/
- ip6h = (ip6_t *)mp->b_rptr;
- if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
- iph_hdr_length = IPV6_HDR_LEN;
- } else if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length,
- &nexthdrp) ||
- *nexthdrp != IPPROTO_ICMPV6) {
- goto noticmpv6;
+ if (mss > tcp->tcp_mss && decrease_only)
+ return;
+
+ DTRACE_PROBE2(tcp_update_pmtu, int32_t, tcp->tcp_mss, uint32_t, mss);
+
+ /*
+ * Update ixa_fragsize and ixa_pmtu.
+ */
+ ixa->ixa_fragsize = ixa->ixa_pmtu = pmtu;
+
+ /*
+ * Adjust MSS and all relevant variables.
+ */
+ tcp_mss_set(tcp, mss);
+
+ /*
+ * If the PMTU is below the min size maintained by IP, then ip_get_pmtu
+ * has set IXAF_PMTU_TOO_SMALL and cleared IXAF_PMTU_IPV4_DF. Since TCP
+ * has a (potentially different) min size we do the same. Make sure to
+ * clear IXAF_DONTFRAG, which is used by IP to decide whether to
+ * fragment the packet.
+ *
+ * LSO over IPv6 can not be fragmented. So need to disable LSO
+ * when IPv6 fragmentation is needed.
+ */
+ if (mss < tcp->tcp_tcps->tcps_mss_min)
+ ixaflags |= IXAF_PMTU_TOO_SMALL;
+
+ if (ixaflags & IXAF_PMTU_TOO_SMALL)
+ ixaflags &= ~(IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF);
+
+ if ((connp->conn_ipversion == IPV4_VERSION) &&
+ !(ixaflags & IXAF_PMTU_IPV4_DF)) {
+ tcp->tcp_ipha->ipha_fragment_offset_and_flags = 0;
}
+ ixa->ixa_flags = ixaflags;
+}
+
+/*
+ * Do slow start retransmission after ICMP errors of PMTU changes.
+ */
+static void
+tcp_rexmit_after_error(tcp_t *tcp)
+{
+ /*
+ * All sent data has been acknowledged or no data left to send, just
+ * to return.
+ */
+ if (!SEQ_LT(tcp->tcp_suna, tcp->tcp_snxt) ||
+ (tcp->tcp_xmit_head == NULL))
+ return;
+
+ if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && (tcp->tcp_unsent == 0))
+ tcp->tcp_rexmit_max = tcp->tcp_fss;
+ else
+ tcp->tcp_rexmit_max = tcp->tcp_snxt;
+
+ tcp->tcp_rexmit_nxt = tcp->tcp_suna;
+ tcp->tcp_rexmit = B_TRUE;
+ tcp->tcp_dupack_cnt = 0;
+ tcp->tcp_snd_burst = TCP_CWND_SS;
+ tcp_ss_rexmit(tcp);
+}
+
+/*
+ * tcp_icmp_error_ipv6 is called from tcp_icmp_input to process ICMPv6
+ * error messages passed up by IP.
+ * Assumes that IP has pulled up all the extension headers as well
+ * as the ICMPv6 header.
+ */
+static void
+tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp, ip_recv_attr_t *ira)
+{
+ icmp6_t *icmp6;
+ ip6_t *ip6h;
+ uint16_t iph_hdr_length = ira->ira_ip_hdr_length;
+ tcpha_t *tcpha;
+ uint8_t *nexthdrp;
+ uint32_t seg_seq;
+
+ /*
+ * Verify that we have a complete IP header.
+ */
+ ASSERT((MBLKL(mp) >= sizeof (ip6_t)));
+
icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length];
ip6h = (ip6_t *)&icmp6[1];
/*
* Verify if we have a complete ICMP and inner IP header.
*/
- if ((uchar_t *)&ip6h[1] > mp->b_wptr)
- goto noticmpv6;
+ if ((uchar_t *)&ip6h[1] > mp->b_wptr) {
+noticmpv6:
+ freemsg(mp);
+ return;
+ }
if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp))
goto noticmpv6;
@@ -8558,130 +7077,15 @@ noticmpv6:
goto noticmpv6;
}
- /*
- * ICMP errors come on the right queue or come on
- * listener/global queue for detached connections and
- * get switched to the right queue. If it comes on the
- * right queue, policy check has already been done by IP
- * and thus free the first_mp without verifying the policy.
- * If it has come for a non-hard bound connection, we need
- * to verify policy as IP may not have done it.
- */
- if (!tcp->tcp_hard_bound) {
- if (ipsec_mctl) {
- secure = ipsec_in_is_secure(first_mp);
- } else {
- secure = B_FALSE;
- }
- if (secure) {
- /*
- * If we are willing to accept this in clear
- * we don't have to verify policy.
- */
- if (!ipsec_inbound_accept_clear(mp, NULL, ip6h)) {
- if (!tcp_check_policy(tcp, first_mp,
- NULL, ip6h, secure, ipsec_mctl)) {
- /*
- * tcp_check_policy called
- * ip_drop_packet() on failure.
- */
- return;
- }
- }
- }
- } else if (ipsec_mctl) {
- /*
- * This is a hard_bound connection. IP has already
- * verified policy. We don't have to do it again.
- */
- freeb(first_mp);
- first_mp = mp;
- ipsec_mctl = B_FALSE;
- }
-
seg_seq = ntohl(tcpha->tha_seq);
- /*
- * TCP SHOULD check that the TCP sequence number contained in
- * payload of the ICMP error message is within the range
- * SND.UNA <= SEG.SEQ < SND.NXT.
- */
- if (SEQ_LT(seg_seq, tcp->tcp_suna) || SEQ_GEQ(seg_seq, tcp->tcp_snxt)) {
- /*
- * If the ICMP message is bogus, should we kill the
- * connection, or should we just drop the bogus ICMP
- * message? It would probably make more sense to just
- * drop the message so that if this one managed to get
- * in, the real connection should not suffer.
- */
- goto noticmpv6;
- }
-
switch (icmp6->icmp6_type) {
case ICMP6_PACKET_TOO_BIG:
/*
- * Reduce the MSS based on the new MTU. This will
- * eliminate any fragmentation locally.
- * N.B. There may well be some funny side-effects on
- * the local send policy and the remote receive policy.
- * Pending further research, we provide
- * tcp_ignore_path_mtu just in case this proves
- * disastrous somewhere.
- *
- * After updating the MSS, retransmit part of the
- * dropped segment using the new mss by calling
- * tcp_wput_data(). Need to adjust all those
- * params to make sure tcp_wput_data() work properly.
- */
- if (tcps->tcps_ignore_path_mtu)
- break;
-
- /*
- * Decrease the MSS by time stamp options
- * IP options and IPSEC options. tcp_hdr_len
- * includes time stamp option and IP option
- * length.
- */
- new_mss = ntohs(icmp6->icmp6_mtu) - tcp->tcp_hdr_len -
- tcp->tcp_ipsec_overhead;
-
- /*
- * Only update the MSS if the new one is
- * smaller than the previous one. This is
- * to avoid problems when getting multiple
- * ICMP errors for the same MTU.
- */
- if (new_mss >= tcp->tcp_mss)
- break;
-
- ratio = tcp->tcp_cwnd / tcp->tcp_mss;
- ASSERT(ratio >= 1);
- tcp_mss_set(tcp, new_mss, B_TRUE);
-
- /*
- * Make sure we have something to
- * send.
+ * Update Path MTU, then try to send something out.
*/
- if (SEQ_LT(tcp->tcp_suna, tcp->tcp_snxt) &&
- (tcp->tcp_xmit_head != NULL)) {
- /*
- * Shrink tcp_cwnd in
- * proportion to the old MSS/new MSS.
- */
- tcp->tcp_cwnd = ratio * tcp->tcp_mss;
- if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
- (tcp->tcp_unsent == 0)) {
- tcp->tcp_rexmit_max = tcp->tcp_fss;
- } else {
- tcp->tcp_rexmit_max = tcp->tcp_snxt;
- }
- tcp->tcp_rexmit_nxt = tcp->tcp_suna;
- tcp->tcp_rexmit = B_TRUE;
- tcp->tcp_dupack_cnt = 0;
- tcp->tcp_snd_burst = TCP_CWND_SS;
- tcp_ss_rexmit(tcp);
- }
+ tcp_update_pmtu(tcp, B_TRUE);
+ tcp_rexmit_after_error(tcp);
break;
-
case ICMP6_DST_UNREACH:
switch (icmp6->icmp6_code) {
case ICMP6_DST_UNREACH_NOPORT:
@@ -8692,7 +7096,6 @@ noticmpv6:
ECONNREFUSED, 8);
}
break;
-
case ICMP6_DST_UNREACH_ADMIN:
case ICMP6_DST_UNREACH_NOROUTE:
case ICMP6_DST_UNREACH_BEYONDSCOPE:
@@ -8708,7 +7111,6 @@ noticmpv6:
* Ditch the half-open connection if we
* suspect a SYN attack is under way.
*/
- tcp_ip_ire_mark_advice(tcp);
(void) tcp_clean_death(tcp,
tcp->tcp_client_errno, 9);
}
@@ -8720,7 +7122,6 @@ noticmpv6:
break;
}
break;
-
case ICMP6_PARAM_PROB:
/* If this corresponds to an ICMP_PROTOCOL_UNREACHABLE */
if (icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER &&
@@ -8739,83 +7140,42 @@ noticmpv6:
default:
break;
}
- freemsg(first_mp);
+ freemsg(mp);
}
/*
* Notify IP that we are having trouble with this connection. IP should
- * blow the IRE away and start over.
+ * make note so it can potentially use a different IRE.
*/
static void
tcp_ip_notify(tcp_t *tcp)
{
- struct iocblk *iocp;
- ipid_t *ipid;
- mblk_t *mp;
-
- /* IPv6 has NUD thus notification to delete the IRE is not needed */
- if (tcp->tcp_ipversion == IPV6_VERSION)
- return;
-
- mp = mkiocb(IP_IOCTL);
- if (mp == NULL)
- return;
-
- iocp = (struct iocblk *)mp->b_rptr;
- iocp->ioc_count = sizeof (ipid_t) + sizeof (tcp->tcp_ipha->ipha_dst);
-
- mp->b_cont = allocb(iocp->ioc_count, BPRI_HI);
- if (!mp->b_cont) {
- freeb(mp);
- return;
- }
+ conn_t *connp = tcp->tcp_connp;
+ ire_t *ire;
- ipid = (ipid_t *)mp->b_cont->b_rptr;
- mp->b_cont->b_wptr += iocp->ioc_count;
- bzero(ipid, sizeof (*ipid));
- ipid->ipid_cmd = IP_IOC_IRE_DELETE_NO_REPLY;
- ipid->ipid_ire_type = IRE_CACHE;
- ipid->ipid_addr_offset = sizeof (ipid_t);
- ipid->ipid_addr_length = sizeof (tcp->tcp_ipha->ipha_dst);
/*
* Note: in the case of source routing we want to blow away the
* route to the first source route hop.
*/
- bcopy(&tcp->tcp_ipha->ipha_dst, &ipid[1],
- sizeof (tcp->tcp_ipha->ipha_dst));
-
- CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, mp);
-}
-
-/* Unlink and return any mblk that looks like it contains an ire */
-static mblk_t *
-tcp_ire_mp(mblk_t **mpp)
-{
- mblk_t *mp = *mpp;
- mblk_t *prev_mp = NULL;
-
- for (;;) {
- switch (DB_TYPE(mp)) {
- case IRE_DB_TYPE:
- case IRE_DB_REQ_TYPE:
- if (mp == *mpp) {
- *mpp = mp->b_cont;
- } else {
- prev_mp->b_cont = mp->b_cont;
- }
- mp->b_cont = NULL;
- return (mp);
- default:
- break;
+ ire = connp->conn_ixa->ixa_ire;
+ if (ire != NULL && !(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
+ if (ire->ire_ipversion == IPV4_VERSION) {
+ /*
+ * As per RFC 1122, we send an RTM_LOSING to inform
+ * routing protocols.
+ */
+ ip_rts_change(RTM_LOSING, ire->ire_addr,
+ ire->ire_gateway_addr, ire->ire_mask,
+ connp->conn_laddr_v4, 0, 0, 0,
+ (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_IFA),
+ ire->ire_ipst);
}
- prev_mp = mp;
- mp = mp->b_cont;
- if (mp == NULL)
- break;
+ (void) ire_no_good(ire);
}
- return (mp);
}
+#pragma inline(tcp_send_data)
+
/*
* Timer callback routine for keepalive probe. We do a fake resend of
* last ACKed byte. Then set a timer using RTO. When the timer expires,
@@ -8890,7 +7250,7 @@ tcp_keepalive_killer(void *arg)
* timer back.
*/
if (mp != NULL) {
- tcp_send_data(tcp, tcp->tcp_wq, mp);
+ tcp_send_data(tcp, mp);
BUMP_MIB(&tcps->tcps_mib,
tcpTimKeepaliveProbe);
if (tcp->tcp_ka_last_intrvl != 0) {
@@ -8930,17 +7290,17 @@ tcp_keepalive_killer(void *arg)
int
tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk)
{
- queue_t *q = tcp->tcp_rq;
+ conn_t *connp = tcp->tcp_connp;
+ queue_t *q = connp->conn_rq;
int32_t mss = tcp->tcp_mss;
int maxpsz;
- conn_t *connp = tcp->tcp_connp;
if (TCP_IS_DETACHED(tcp))
return (mss);
if (tcp->tcp_fused) {
maxpsz = tcp_fuse_maxpsz(tcp);
mss = INFPSZ;
- } else if (tcp->tcp_mdt || tcp->tcp_lso || tcp->tcp_maxpsz == 0) {
+ } else if (tcp->tcp_maxpsz_multiplier == 0) {
/*
* Set the sd_qn_maxpsz according to the socket send buffer
* size, and sd_maxblk to INFPSZ (-1). This will essentially
@@ -8948,7 +7308,7 @@ tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk)
* kernel-allocated buffers without breaking it up into smaller
* chunks. We round up the buffer size to the nearest SMSS.
*/
- maxpsz = MSS_ROUNDUP(tcp->tcp_xmit_hiwater, mss);
+ maxpsz = MSS_ROUNDUP(connp->conn_sndbuf, mss);
if (tcp->tcp_kssl_ctx == NULL)
mss = INFPSZ;
else
@@ -8960,21 +7320,17 @@ tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk)
* head to break down larger than SMSS writes into SMSS-
* size mblks, up to tcp_maxpsz_multiplier mblks at a time.
*/
- /* XXX tune this with ndd tcp_maxpsz_multiplier */
- maxpsz = tcp->tcp_maxpsz * mss;
- if (maxpsz > tcp->tcp_xmit_hiwater/2) {
- maxpsz = tcp->tcp_xmit_hiwater/2;
+ maxpsz = tcp->tcp_maxpsz_multiplier * mss;
+ if (maxpsz > connp->conn_sndbuf / 2) {
+ maxpsz = connp->conn_sndbuf / 2;
/* Round up to nearest mss */
maxpsz = MSS_ROUNDUP(maxpsz, mss);
}
}
(void) proto_set_maxpsz(q, connp, maxpsz);
- if (!(IPCL_IS_NONSTR(connp))) {
- /* XXX do it in set_maxpsz()? */
- tcp->tcp_wq->q_maxpsz = maxpsz;
- }
-
+ if (!(IPCL_IS_NONSTR(connp)))
+ connp->conn_wq->q_maxpsz = maxpsz;
if (set_maxblk)
(void) proto_set_tx_maxblk(q, connp, mss);
return (mss);
@@ -8985,18 +7341,18 @@ tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk)
* tcpopt struct and return a bitmask saying which options were found.
*/
static int
-tcp_parse_options(tcph_t *tcph, tcp_opt_t *tcpopt)
+tcp_parse_options(tcpha_t *tcpha, tcp_opt_t *tcpopt)
{
uchar_t *endp;
int len;
uint32_t mss;
- uchar_t *up = (uchar_t *)tcph;
+ uchar_t *up = (uchar_t *)tcpha;
int found = 0;
int32_t sack_len;
tcp_seq sack_begin, sack_end;
tcp_t *tcp;
- endp = up + TCP_HDR_LENGTH(tcph);
+ endp = up + TCP_HDR_LENGTH(tcpha);
up += TCP_MIN_HEADER_LENGTH;
while (up < endp) {
len = endp - up;
@@ -9135,28 +7491,20 @@ tcp_parse_options(tcph_t *tcph, tcp_opt_t *tcpopt)
}
/*
- * Set the mss associated with a particular tcp based on its current value,
- * and a new one passed in. Observe minimums and maximums, and reset
- * other state variables that we want to view as multiples of mss.
- *
- * This function is called mainly because values like tcp_mss, tcp_cwnd,
- * highwater marks etc. need to be initialized or adjusted.
- * 1) From tcp_process_options() when the other side's SYN/SYN-ACK
- * packet arrives.
- * 2) We need to set a new MSS when ICMP_FRAGMENTATION_NEEDED or
- * ICMP6_PACKET_TOO_BIG arrives.
- * 3) From tcp_paws_check() if the other side stops sending the timestamp,
- * to increase the MSS to use the extra bytes available.
+ * Set the MSS associated with a particular tcp based on its current value,
+ * and a new one passed in. Observe minimums and maximums, and reset other
+ * state variables that we want to view as multiples of MSS.
*
- * Callers except tcp_paws_check() ensure that they only reduce mss.
+ * The value of MSS could be either increased or descreased.
*/
static void
-tcp_mss_set(tcp_t *tcp, uint32_t mss, boolean_t do_ss)
+tcp_mss_set(tcp_t *tcp, uint32_t mss)
{
uint32_t mss_max;
tcp_stack_t *tcps = tcp->tcp_tcps;
+ conn_t *connp = tcp->tcp_connp;
- if (tcp->tcp_ipversion == IPV4_VERSION)
+ if (connp->conn_ipversion == IPV4_VERSION)
mss_max = tcps->tcps_mss_max_ipv4;
else
mss_max = tcps->tcps_mss_max_ipv6;
@@ -9176,34 +7524,22 @@ tcp_mss_set(tcp_t *tcp, uint32_t mss, boolean_t do_ss)
* TCP should be able to buffer at least 4 MSS data for obvious
* performance reason.
*/
- if ((mss << 2) > tcp->tcp_xmit_hiwater)
- tcp->tcp_xmit_hiwater = mss << 2;
+ if ((mss << 2) > connp->conn_sndbuf)
+ connp->conn_sndbuf = mss << 2;
/*
- * Set the xmit_lowater to at least twice of MSS.
+ * Set the send lowater to at least twice of MSS.
*/
- if ((mss << 1) > tcp->tcp_xmit_lowater)
- tcp->tcp_xmit_lowater = mss << 1;
+ if ((mss << 1) > connp->conn_sndlowat)
+ connp->conn_sndlowat = mss << 1;
+
+ /*
+ * Update tcp_cwnd according to the new value of MSS. Keep the
+ * previous ratio to preserve the transmit rate.
+ */
+ tcp->tcp_cwnd = (tcp->tcp_cwnd / tcp->tcp_mss) * mss;
+ tcp->tcp_cwnd_cnt = 0;
- if (do_ss) {
- /*
- * Either the tcp_cwnd is as yet uninitialized, or mss is
- * changing due to a reduction in MTU, presumably as a
- * result of a new path component, reset cwnd to its
- * "initial" value, as a multiple of the new mss.
- */
- SET_TCP_INIT_CWND(tcp, mss, tcps->tcps_slow_start_initial);
- } else {
- /*
- * Called by tcp_paws_check(), the mss increased
- * marginally to allow use of space previously taken
- * by the timestamp option. It would be inappropriate
- * to apply slow start or tcp_init_cwnd values to
- * tcp_cwnd, simply adjust to a multiple of the new mss.
- */
- tcp->tcp_cwnd = (tcp->tcp_cwnd / tcp->tcp_mss) * mss;
- tcp->tcp_cwnd_cnt = 0;
- }
tcp->tcp_mss = mss;
(void) tcp_maxpsz_set(tcp, B_TRUE);
}
@@ -9223,12 +7559,11 @@ tcp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
}
static conn_t *
-tcp_create_common(queue_t *q, cred_t *credp, boolean_t isv6,
- boolean_t issocket, int *errorp)
+tcp_create_common(cred_t *credp, boolean_t isv6, boolean_t issocket,
+ int *errorp)
{
tcp_t *tcp = NULL;
conn_t *connp;
- int err;
zoneid_t zoneid;
tcp_stack_t *tcps;
squeue_t *sqp;
@@ -9265,15 +7600,6 @@ tcp_create_common(queue_t *q, cred_t *credp, boolean_t isv6,
else
zoneid = crgetzoneid(credp);
}
- /*
- * For stackid zero this is done from strplumb.c, but
- * non-zero stackids are handled here.
- */
- if (tcps->tcps_g_q == NULL &&
- tcps->tcps_netstack->netstack_stackid !=
- GLOBAL_NETSTACKID) {
- tcp_g_q_setup(tcps);
- }
sqp = IP_SQUEUE_GET((uint_t)gethrtime());
connp = (conn_t *)tcp_get_conn(sqp, tcps);
@@ -9286,41 +7612,50 @@ tcp_create_common(queue_t *q, cred_t *credp, boolean_t isv6,
*errorp = ENOSR;
return (NULL);
}
+ ASSERT(connp->conn_ixa->ixa_protocol == connp->conn_proto);
+
connp->conn_sqp = sqp;
connp->conn_initial_sqp = connp->conn_sqp;
+ connp->conn_ixa->ixa_sqp = connp->conn_sqp;
tcp = connp->conn_tcp;
+ /*
+ * Besides asking IP to set the checksum for us, have conn_ip_output
+ * to do the following checks when necessary:
+ *
+ * IXAF_VERIFY_SOURCE: drop packets when our outer source goes invalid
+ * IXAF_VERIFY_PMTU: verify PMTU changes
+ * IXAF_VERIFY_LSO: verify LSO capability changes
+ */
+ connp->conn_ixa->ixa_flags |= IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE |
+ IXAF_VERIFY_PMTU | IXAF_VERIFY_LSO;
+
+ if (!tcps->tcps_dev_flow_ctl)
+ connp->conn_ixa->ixa_flags |= IXAF_NO_DEV_FLOW_CTL;
+
if (isv6) {
- connp->conn_flags |= IPCL_TCP6;
- connp->conn_send = ip_output_v6;
- connp->conn_af_isv6 = B_TRUE;
- connp->conn_pkt_isv6 = B_TRUE;
- connp->conn_src_preferences = IPV6_PREFER_SRC_DEFAULT;
- tcp->tcp_ipversion = IPV6_VERSION;
- tcp->tcp_family = AF_INET6;
+ connp->conn_ixa->ixa_src_preferences = IPV6_PREFER_SRC_DEFAULT;
+ connp->conn_ipversion = IPV6_VERSION;
+ connp->conn_family = AF_INET6;
tcp->tcp_mss = tcps->tcps_mss_def_ipv6;
+ connp->conn_default_ttl = tcps->tcps_ipv6_hoplimit;
} else {
- connp->conn_flags |= IPCL_TCP4;
- connp->conn_send = ip_output;
- connp->conn_af_isv6 = B_FALSE;
- connp->conn_pkt_isv6 = B_FALSE;
- tcp->tcp_ipversion = IPV4_VERSION;
- tcp->tcp_family = AF_INET;
+ connp->conn_ipversion = IPV4_VERSION;
+ connp->conn_family = AF_INET;
tcp->tcp_mss = tcps->tcps_mss_def_ipv4;
+ connp->conn_default_ttl = tcps->tcps_ipv4_ttl;
}
+ connp->conn_xmit_ipp.ipp_unicast_hops = connp->conn_default_ttl;
+
+ crhold(credp);
+ connp->conn_cred = credp;
+ connp->conn_cpid = curproc->p_pid;
+ connp->conn_open_time = lbolt64;
- /*
- * TCP keeps a copy of cred for cache locality reasons but
- * we put a reference only once. If connp->conn_cred
- * becomes invalid, tcp_cred should also be set to NULL.
- */
- tcp->tcp_cred = connp->conn_cred = credp;
- crhold(connp->conn_cred);
- tcp->tcp_cpid = curproc->p_pid;
- tcp->tcp_open_time = lbolt64;
connp->conn_zoneid = zoneid;
+ /* conn_allzones can not be set this early, hence no IPCL_ZONEID */
+ connp->conn_ixa->ixa_zoneid = zoneid;
connp->conn_mlp_type = mlptSingle;
- connp->conn_ulp_labeled = !is_system_labeled();
ASSERT(connp->conn_netstack == tcps->tcps_netstack);
ASSERT(tcp->tcp_tcps == tcps);
@@ -9331,38 +7666,22 @@ tcp_create_common(queue_t *q, cred_t *credp, boolean_t isv6,
if (getpflags(NET_MAC_AWARE, credp) != 0)
connp->conn_mac_mode = CONN_MAC_AWARE;
- connp->conn_dev = NULL;
+ connp->conn_zone_is_global = (crgetzoneid(credp) == GLOBAL_ZONEID);
+
if (issocket) {
- connp->conn_flags |= IPCL_SOCKET;
tcp->tcp_issocket = 1;
}
- /* Non-zero default values */
- connp->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
-
- if (q == NULL) {
- /*
- * Create a helper stream for non-STREAMS socket.
- */
- err = ip_create_helper_stream(connp, tcps->tcps_ldi_ident);
- if (err != 0) {
- ip1dbg(("tcp_create_common: create of IP helper stream "
- "failed\n"));
- CONN_DEC_REF(connp);
- *errorp = err;
- return (NULL);
- }
- q = connp->conn_rq;
- }
+ connp->conn_rcvbuf = tcps->tcps_recv_hiwat;
+ connp->conn_sndbuf = tcps->tcps_xmit_hiwat;
+ connp->conn_sndlowat = tcps->tcps_xmit_lowat;
+ connp->conn_so_type = SOCK_STREAM;
+ connp->conn_wroff = connp->conn_ht_iphc_allocated +
+ tcps->tcps_wroff_xtra;
SOCK_CONNID_INIT(tcp->tcp_connid);
- err = tcp_init(tcp, q);
- if (err != 0) {
- CONN_DEC_REF(connp);
- *errorp = err;
- return (NULL);
- }
-
+ tcp->tcp_state = TCPS_IDLE;
+ tcp_init_values(tcp);
return (connp);
}
@@ -9415,7 +7734,7 @@ tcp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
q->q_qinfo = &tcp_acceptor_rinit;
/*
* the conn_dev and minor_arena will be subsequently used by
- * tcp_wput_accept() and tcp_tpi_close_accept() to figure out
+ * tcp_tli_accept() and tcp_tpi_close_accept() to figure out
* the minor device number for this connection from the q_ptr.
*/
RD(q)->q_ptr = (void *)conn_dev;
@@ -9426,7 +7745,7 @@ tcp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
}
issocket = flag & SO_SOCKSTR;
- connp = tcp_create_common(q, credp, isv6, issocket, &err);
+ connp = tcp_create_common(credp, isv6, issocket, &err);
if (connp == NULL) {
inet_minor_free(minor_arena, conn_dev);
@@ -9434,6 +7753,8 @@ tcp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
return (err);
}
+ connp->conn_rq = q;
+ connp->conn_wq = WR(q);
q->q_ptr = WR(q)->q_ptr = connp;
connp->conn_dev = conn_dev;
@@ -9500,7 +7821,7 @@ tcp_allow_connopt_set(int level, int name)
}
/*
- * this routine gets default values of certain options whose default
+ * This routine gets default values of certain options whose default
* values are maintained by protocol specific code
*/
/* ARGSUSED */
@@ -9553,321 +7874,102 @@ tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr)
return (sizeof (int));
}
+/*
+ * TCP routine to get the values of options.
+ */
static int
tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
{
int *i1 = (int *)ptr;
tcp_t *tcp = connp->conn_tcp;
- ip6_pkt_t *ipp = &tcp->tcp_sticky_ipp;
+ conn_opt_arg_t coas;
+ int retval;
+
+ coas.coa_connp = connp;
+ coas.coa_ixa = connp->conn_ixa;
+ coas.coa_ipp = &connp->conn_xmit_ipp;
+ coas.coa_ancillary = B_FALSE;
+ coas.coa_changed = 0;
switch (level) {
case SOL_SOCKET:
switch (name) {
- case SO_LINGER: {
- struct linger *lgr = (struct linger *)ptr;
-
- lgr->l_onoff = tcp->tcp_linger ? SO_LINGER : 0;
- lgr->l_linger = tcp->tcp_lingertime;
- }
- return (sizeof (struct linger));
- case SO_DEBUG:
- *i1 = tcp->tcp_debug ? SO_DEBUG : 0;
- break;
- case SO_KEEPALIVE:
- *i1 = tcp->tcp_ka_enabled ? SO_KEEPALIVE : 0;
- break;
- case SO_DONTROUTE:
- *i1 = tcp->tcp_dontroute ? SO_DONTROUTE : 0;
- break;
- case SO_USELOOPBACK:
- *i1 = tcp->tcp_useloopback ? SO_USELOOPBACK : 0;
- break;
- case SO_BROADCAST:
- *i1 = tcp->tcp_broadcast ? SO_BROADCAST : 0;
- break;
- case SO_REUSEADDR:
- *i1 = tcp->tcp_reuseaddr ? SO_REUSEADDR : 0;
- break;
- case SO_OOBINLINE:
- *i1 = tcp->tcp_oobinline ? SO_OOBINLINE : 0;
- break;
- case SO_DGRAM_ERRIND:
- *i1 = tcp->tcp_dgram_errind ? SO_DGRAM_ERRIND : 0;
- break;
- case SO_TYPE:
- *i1 = SOCK_STREAM;
- break;
- case SO_SNDBUF:
- *i1 = tcp->tcp_xmit_hiwater;
- break;
- case SO_RCVBUF:
- *i1 = tcp->tcp_recv_hiwater;
- break;
case SO_SND_COPYAVOID:
*i1 = tcp->tcp_snd_zcopy_on ?
SO_SND_COPYAVOID : 0;
- break;
- case SO_ALLZONES:
- *i1 = connp->conn_allzones ? 1 : 0;
- break;
- case SO_ANON_MLP:
- *i1 = connp->conn_anon_mlp;
- break;
- case SO_MAC_EXEMPT:
- *i1 = (connp->conn_mac_mode == CONN_MAC_AWARE);
- break;
- case SO_MAC_IMPLICIT:
- *i1 = (connp->conn_mac_mode == CONN_MAC_IMPLICIT);
- break;
- case SO_EXCLBIND:
- *i1 = tcp->tcp_exclbind ? SO_EXCLBIND : 0;
- break;
- case SO_PROTOTYPE:
- *i1 = IPPROTO_TCP;
- break;
- case SO_DOMAIN:
- *i1 = tcp->tcp_family;
- break;
+ return (sizeof (int));
case SO_ACCEPTCONN:
*i1 = (tcp->tcp_state == TCPS_LISTEN);
- default:
- return (-1);
+ return (sizeof (int));
}
break;
case IPPROTO_TCP:
switch (name) {
case TCP_NODELAY:
*i1 = (tcp->tcp_naglim == 1) ? TCP_NODELAY : 0;
- break;
+ return (sizeof (int));
case TCP_MAXSEG:
*i1 = tcp->tcp_mss;
- break;
+ return (sizeof (int));
case TCP_NOTIFY_THRESHOLD:
*i1 = (int)tcp->tcp_first_timer_threshold;
- break;
+ return (sizeof (int));
case TCP_ABORT_THRESHOLD:
*i1 = tcp->tcp_second_timer_threshold;
- break;
+ return (sizeof (int));
case TCP_CONN_NOTIFY_THRESHOLD:
*i1 = tcp->tcp_first_ctimer_threshold;
- break;
+ return (sizeof (int));
case TCP_CONN_ABORT_THRESHOLD:
*i1 = tcp->tcp_second_ctimer_threshold;
- break;
- case TCP_RECVDSTADDR:
- *i1 = tcp->tcp_recvdstaddr;
- break;
- case TCP_ANONPRIVBIND:
- *i1 = tcp->tcp_anon_priv_bind;
- break;
- case TCP_EXCLBIND:
- *i1 = tcp->tcp_exclbind ? TCP_EXCLBIND : 0;
- break;
+ return (sizeof (int));
case TCP_INIT_CWND:
*i1 = tcp->tcp_init_cwnd;
- break;
+ return (sizeof (int));
case TCP_KEEPALIVE_THRESHOLD:
*i1 = tcp->tcp_ka_interval;
- break;
+ return (sizeof (int));
case TCP_KEEPALIVE_ABORT_THRESHOLD:
*i1 = tcp->tcp_ka_abort_thres;
- break;
+ return (sizeof (int));
case TCP_CORK:
*i1 = tcp->tcp_cork;
- break;
- default:
- return (-1);
+ return (sizeof (int));
}
break;
case IPPROTO_IP:
- if (tcp->tcp_family != AF_INET)
+ if (connp->conn_family != AF_INET)
return (-1);
switch (name) {
case IP_OPTIONS:
- case T_IP_OPTIONS: {
- /*
- * This is compatible with BSD in that in only return
- * the reverse source route with the final destination
- * as the last entry. The first 4 bytes of the option
- * will contain the final destination.
- */
- int opt_len;
-
- opt_len = (char *)tcp->tcp_tcph - (char *)tcp->tcp_ipha;
- opt_len -= tcp->tcp_label_len + IP_SIMPLE_HDR_LENGTH;
- ASSERT(opt_len >= 0);
+ case T_IP_OPTIONS:
/* Caller ensures enough space */
- if (opt_len > 0) {
- /*
- * TODO: Do we have to handle getsockopt on an
- * initiator as well?
- */
- return (ip_opt_get_user(tcp->tcp_ipha, ptr));
- }
- return (0);
- }
- case IP_TOS:
- case T_IP_TOS:
- *i1 = (int)tcp->tcp_ipha->ipha_type_of_service;
- break;
- case IP_TTL:
- *i1 = (int)tcp->tcp_ipha->ipha_ttl;
- break;
- case IP_NEXTHOP:
- /* Handled at IP level */
- return (-EINVAL);
+ return (ip_opt_get_user(connp, ptr));
default:
- return (-1);
+ break;
}
break;
+
case IPPROTO_IPV6:
/*
* IPPROTO_IPV6 options are only supported for sockets
* that are using IPv6 on the wire.
*/
- if (tcp->tcp_ipversion != IPV6_VERSION) {
+ if (connp->conn_ipversion != IPV6_VERSION) {
return (-1);
}
switch (name) {
- case IPV6_UNICAST_HOPS:
- *i1 = (unsigned int) tcp->tcp_ip6h->ip6_hops;
- break; /* goto sizeof (int) option return */
- case IPV6_BOUND_IF:
- /* Zero if not set */
- *i1 = tcp->tcp_bound_if;
- break; /* goto sizeof (int) option return */
- case IPV6_RECVPKTINFO:
- if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO)
- *i1 = 1;
- else
- *i1 = 0;
- break; /* goto sizeof (int) option return */
- case IPV6_RECVTCLASS:
- if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVTCLASS)
- *i1 = 1;
- else
- *i1 = 0;
- break; /* goto sizeof (int) option return */
- case IPV6_RECVHOPLIMIT:
- if (tcp->tcp_ipv6_recvancillary &
- TCP_IPV6_RECVHOPLIMIT)
- *i1 = 1;
- else
- *i1 = 0;
- break; /* goto sizeof (int) option return */
- case IPV6_RECVHOPOPTS:
- if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVHOPOPTS)
- *i1 = 1;
- else
- *i1 = 0;
- break; /* goto sizeof (int) option return */
- case IPV6_RECVDSTOPTS:
- if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVDSTOPTS)
- *i1 = 1;
- else
- *i1 = 0;
- break; /* goto sizeof (int) option return */
- case _OLD_IPV6_RECVDSTOPTS:
- if (tcp->tcp_ipv6_recvancillary &
- TCP_OLD_IPV6_RECVDSTOPTS)
- *i1 = 1;
- else
- *i1 = 0;
- break; /* goto sizeof (int) option return */
- case IPV6_RECVRTHDR:
- if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVRTHDR)
- *i1 = 1;
- else
- *i1 = 0;
- break; /* goto sizeof (int) option return */
- case IPV6_RECVRTHDRDSTOPTS:
- if (tcp->tcp_ipv6_recvancillary &
- TCP_IPV6_RECVRTDSTOPTS)
- *i1 = 1;
- else
- *i1 = 0;
- break; /* goto sizeof (int) option return */
- case IPV6_PKTINFO: {
- /* XXX assumes that caller has room for max size! */
- struct in6_pktinfo *pkti;
-
- pkti = (struct in6_pktinfo *)ptr;
- if (ipp->ipp_fields & IPPF_IFINDEX)
- pkti->ipi6_ifindex = ipp->ipp_ifindex;
- else
- pkti->ipi6_ifindex = 0;
- if (ipp->ipp_fields & IPPF_ADDR)
- pkti->ipi6_addr = ipp->ipp_addr;
- else
- pkti->ipi6_addr = ipv6_all_zeros;
- return (sizeof (struct in6_pktinfo));
- }
- case IPV6_TCLASS:
- if (ipp->ipp_fields & IPPF_TCLASS)
- *i1 = ipp->ipp_tclass;
- else
- *i1 = IPV6_FLOW_TCLASS(
- IPV6_DEFAULT_VERS_AND_FLOW);
- break; /* goto sizeof (int) option return */
- case IPV6_NEXTHOP: {
- sin6_t *sin6 = (sin6_t *)ptr;
-
- if (!(ipp->ipp_fields & IPPF_NEXTHOP))
- return (0);
- *sin6 = sin6_null;
- sin6->sin6_family = AF_INET6;
- sin6->sin6_addr = ipp->ipp_nexthop;
- return (sizeof (sin6_t));
- }
- case IPV6_HOPOPTS:
- if (!(ipp->ipp_fields & IPPF_HOPOPTS))
- return (0);
- if (ipp->ipp_hopoptslen <= tcp->tcp_label_len)
- return (0);
- bcopy((char *)ipp->ipp_hopopts + tcp->tcp_label_len,
- ptr, ipp->ipp_hopoptslen - tcp->tcp_label_len);
- if (tcp->tcp_label_len > 0) {
- ptr[0] = ((char *)ipp->ipp_hopopts)[0];
- ptr[1] = (ipp->ipp_hopoptslen -
- tcp->tcp_label_len + 7) / 8 - 1;
- }
- return (ipp->ipp_hopoptslen - tcp->tcp_label_len);
- case IPV6_RTHDRDSTOPTS:
- if (!(ipp->ipp_fields & IPPF_RTDSTOPTS))
- return (0);
- bcopy(ipp->ipp_rtdstopts, ptr, ipp->ipp_rtdstoptslen);
- return (ipp->ipp_rtdstoptslen);
- case IPV6_RTHDR:
- if (!(ipp->ipp_fields & IPPF_RTHDR))
- return (0);
- bcopy(ipp->ipp_rthdr, ptr, ipp->ipp_rthdrlen);
- return (ipp->ipp_rthdrlen);
- case IPV6_DSTOPTS:
- if (!(ipp->ipp_fields & IPPF_DSTOPTS))
- return (0);
- bcopy(ipp->ipp_dstopts, ptr, ipp->ipp_dstoptslen);
- return (ipp->ipp_dstoptslen);
- case IPV6_SRC_PREFERENCES:
- return (ip6_get_src_preferences(connp,
- (uint32_t *)ptr));
- case IPV6_PATHMTU: {
- struct ip6_mtuinfo *mtuinfo = (struct ip6_mtuinfo *)ptr;
-
+ case IPV6_PATHMTU:
if (tcp->tcp_state < TCPS_ESTABLISHED)
return (-1);
-
- return (ip_fill_mtuinfo(&connp->conn_remv6,
- connp->conn_fport, mtuinfo,
- connp->conn_netstack));
- }
- default:
- return (-1);
+ break;
}
break;
- default:
- return (-1);
}
- return (sizeof (int));
+ mutex_enter(&connp->conn_lock);
+ retval = conn_opt_get(&coas, level, name, ptr);
+ mutex_exit(&connp->conn_lock);
+ return (retval);
}
/*
@@ -9896,7 +7998,6 @@ tcp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len,
tcp_opt_obj.odb_opt_des_arr,
tcp_opt_obj.odb_opt_arr_cnt,
- tcp_opt_obj.odb_topmost_tpiprovider,
B_FALSE, B_TRUE, cr);
if (error != 0) {
if (error < 0) {
@@ -9909,30 +8010,28 @@ tcp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
error = squeue_synch_enter(sqp, connp, NULL);
if (error == ENOMEM) {
+ kmem_free(optvalp_buf, max_optbuf_len);
return (ENOMEM);
}
len = tcp_opt_get(connp, level, option_name, optvalp_buf);
squeue_synch_exit(sqp, connp);
- if (len < 0) {
- /*
- * Pass on to IP
- */
+ if (len == -1) {
kmem_free(optvalp_buf, max_optbuf_len);
- return (ip_get_options(connp, level, option_name,
- optvalp, optlen, cr));
- } else {
- /*
- * update optlen and copy option value
- */
- t_uscalar_t size = MIN(len, *optlen);
- bcopy(optvalp_buf, optvalp, size);
- bcopy(&size, optlen, sizeof (size));
-
- kmem_free(optvalp_buf, max_optbuf_len);
- return (0);
+ return (EINVAL);
}
+
+ /*
+ * update optlen and copy option value
+ */
+ t_uscalar_t size = MIN(len, *optlen);
+
+ bcopy(optvalp_buf, optvalp, size);
+ bcopy(&size, optlen, sizeof (size));
+
+ kmem_free(optvalp_buf, max_optbuf_len);
+ return (0);
}
/*
@@ -9943,7 +8042,7 @@ tcp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
int
tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
- void *thisdg_attrs, cred_t *cr, mblk_t *mblk)
+ void *thisdg_attrs, cred_t *cr)
{
tcp_t *tcp = connp->conn_tcp;
int *i1 = (int *)invalp;
@@ -9951,6 +8050,13 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
boolean_t checkonly;
int reterr;
tcp_stack_t *tcps = tcp->tcp_tcps;
+ conn_opt_arg_t coas;
+
+ coas.coa_connp = connp;
+ coas.coa_ixa = connp->conn_ixa;
+ coas.coa_ipp = &connp->conn_xmit_ipp;
+ coas.coa_ancillary = B_FALSE;
+ coas.coa_changed = 0;
switch (optset_context) {
case SETFN_OPTCOM_CHECKONLY:
@@ -10016,37 +8122,6 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
switch (level) {
case SOL_SOCKET:
switch (name) {
- case SO_LINGER: {
- struct linger *lgr = (struct linger *)invalp;
-
- if (!checkonly) {
- if (lgr->l_onoff) {
- tcp->tcp_linger = 1;
- tcp->tcp_lingertime = lgr->l_linger;
- } else {
- tcp->tcp_linger = 0;
- tcp->tcp_lingertime = 0;
- }
- /* struct copy */
- *(struct linger *)outvalp = *lgr;
- } else {
- if (!lgr->l_onoff) {
- ((struct linger *)
- outvalp)->l_onoff = 0;
- ((struct linger *)
- outvalp)->l_linger = 0;
- } else {
- /* struct copy */
- *(struct linger *)outvalp = *lgr;
- }
- }
- *outlenp = sizeof (struct linger);
- return (0);
- }
- case SO_DEBUG:
- if (!checkonly)
- tcp->tcp_debug = onoff;
- break;
case SO_KEEPALIVE:
if (checkonly) {
/* check only case */
@@ -10054,65 +8129,25 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
}
if (!onoff) {
- if (tcp->tcp_ka_enabled) {
+ if (connp->conn_keepalive) {
if (tcp->tcp_ka_tid != 0) {
(void) TCP_TIMER_CANCEL(tcp,
tcp->tcp_ka_tid);
tcp->tcp_ka_tid = 0;
}
- tcp->tcp_ka_enabled = 0;
+ connp->conn_keepalive = 0;
}
break;
}
- if (!tcp->tcp_ka_enabled) {
+ if (!connp->conn_keepalive) {
/* Crank up the keepalive timer */
tcp->tcp_ka_last_intrvl = 0;
tcp->tcp_ka_tid = TCP_TIMER(tcp,
tcp_keepalive_killer,
MSEC_TO_TICK(tcp->tcp_ka_interval));
- tcp->tcp_ka_enabled = 1;
- }
- break;
- case SO_DONTROUTE:
- /*
- * SO_DONTROUTE, SO_USELOOPBACK, and SO_BROADCAST are
- * only of interest to IP. We track them here only so
- * that we can report their current value.
- */
- if (!checkonly) {
- tcp->tcp_dontroute = onoff;
- tcp->tcp_connp->conn_dontroute = onoff;
+ connp->conn_keepalive = 1;
}
break;
- case SO_USELOOPBACK:
- if (!checkonly) {
- tcp->tcp_useloopback = onoff;
- tcp->tcp_connp->conn_loopback = onoff;
- }
- break;
- case SO_BROADCAST:
- if (!checkonly) {
- tcp->tcp_broadcast = onoff;
- tcp->tcp_connp->conn_broadcast = onoff;
- }
- break;
- case SO_REUSEADDR:
- if (!checkonly) {
- tcp->tcp_reuseaddr = onoff;
- tcp->tcp_connp->conn_reuseaddr = onoff;
- }
- break;
- case SO_OOBINLINE:
- if (!checkonly) {
- tcp->tcp_oobinline = onoff;
- if (IPCL_IS_NONSTR(tcp->tcp_connp))
- proto_set_rx_oob_opt(connp, onoff);
- }
- break;
- case SO_DGRAM_ERRIND:
- if (!checkonly)
- tcp->tcp_dgram_errind = onoff;
- break;
case SO_SNDBUF: {
if (*i1 > tcps->tcps_max_buf) {
*outlenp = 0;
@@ -10121,11 +8156,11 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
if (checkonly)
break;
- tcp->tcp_xmit_hiwater = *i1;
- if (tcps->tcps_snd_lowat_fraction != 0)
- tcp->tcp_xmit_lowater =
- tcp->tcp_xmit_hiwater /
+ connp->conn_sndbuf = *i1;
+ if (tcps->tcps_snd_lowat_fraction != 0) {
+ connp->conn_sndlowat = connp->conn_sndbuf /
tcps->tcps_snd_lowat_fraction;
+ }
(void) tcp_maxpsz_set(tcp, B_TRUE);
/*
* If we are flow-controlled, recheck the condition.
@@ -10135,11 +8170,12 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
*/
mutex_enter(&tcp->tcp_non_sq_lock);
if (tcp->tcp_flow_stopped &&
- TCP_UNSENT_BYTES(tcp) < tcp->tcp_xmit_hiwater) {
+ TCP_UNSENT_BYTES(tcp) < connp->conn_sndbuf) {
tcp_clrqfull(tcp);
}
mutex_exit(&tcp->tcp_non_sq_lock);
- break;
+ *outlenp = inlen;
+ return (0);
}
case SO_RCVBUF:
if (*i1 > tcps->tcps_max_buf) {
@@ -10155,43 +8191,20 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
* XXX should we return the rwnd here
* and tcp_opt_get ?
*/
- break;
+ *outlenp = inlen;
+ return (0);
case SO_SND_COPYAVOID:
if (!checkonly) {
- /* we only allow enable at most once for now */
if (tcp->tcp_loopback ||
(tcp->tcp_kssl_ctx != NULL) ||
- (!tcp->tcp_snd_zcopy_aware &&
- (onoff != 1 || !tcp_zcopy_check(tcp)))) {
+ (onoff != 1) || !tcp_zcopy_check(tcp)) {
*outlenp = 0;
return (EOPNOTSUPP);
}
tcp->tcp_snd_zcopy_aware = 1;
}
- break;
- case SO_RCVTIMEO:
- case SO_SNDTIMEO:
- /*
- * Pass these two options in order for third part
- * protocol usage. Here just return directly.
- */
+ *outlenp = inlen;
return (0);
- case SO_ALLZONES:
- /* Pass option along to IP level for handling */
- return (-EINVAL);
- case SO_ANON_MLP:
- /* Pass option along to IP level for handling */
- return (-EINVAL);
- case SO_MAC_EXEMPT:
- /* Pass option along to IP level for handling */
- return (-EINVAL);
- case SO_EXCLBIND:
- if (!checkonly)
- tcp->tcp_exclbind = onoff;
- break;
- default:
- *outlenp = 0;
- return (EINVAL);
}
break;
case IPPROTO_TCP:
@@ -10217,25 +8230,12 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
tcp->tcp_second_ctimer_threshold = *i1;
break;
case TCP_RECVDSTADDR:
- if (tcp->tcp_state > TCPS_LISTEN)
- return (EOPNOTSUPP);
- if (!checkonly)
- tcp->tcp_recvdstaddr = onoff;
- break;
- case TCP_ANONPRIVBIND:
- if ((reterr = secpolicy_net_privaddr(cr, 0,
- IPPROTO_TCP)) != 0) {
+ if (tcp->tcp_state > TCPS_LISTEN) {
*outlenp = 0;
- return (reterr);
- }
- if (!checkonly) {
- tcp->tcp_anon_priv_bind = onoff;
+ return (EOPNOTSUPP);
}
+ /* Setting done in conn_opt_set */
break;
- case TCP_EXCLBIND:
- if (!checkonly)
- tcp->tcp_exclbind = onoff;
- break; /* goto sizeof (int) option return */
case TCP_INIT_CWND: {
uint32_t init_cwnd = *((uint32_t *)invalp);
@@ -10278,7 +8278,7 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
* keepalive timer.
*/
if (tcp->tcp_ka_tid != 0) {
- ASSERT(tcp->tcp_ka_enabled);
+ ASSERT(connp->conn_keepalive);
(void) TCP_TIMER_CANCEL(tcp,
tcp->tcp_ka_tid);
tcp->tcp_ka_last_intrvl = 0;
@@ -10318,49 +8318,15 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
}
break;
default:
- *outlenp = 0;
- return (EINVAL);
+ break;
}
break;
case IPPROTO_IP:
- if (tcp->tcp_family != AF_INET) {
+ if (connp->conn_family != AF_INET) {
*outlenp = 0;
- return (ENOPROTOOPT);
+ return (EINVAL);
}
switch (name) {
- case IP_OPTIONS:
- case T_IP_OPTIONS:
- reterr = tcp_opt_set_header(tcp, checkonly,
- invalp, inlen);
- if (reterr) {
- *outlenp = 0;
- return (reterr);
- }
- /* OK return - copy input buffer into output buffer */
- if (invalp != outvalp) {
- /* don't trust bcopy for identical src/dst */
- bcopy(invalp, outvalp, inlen);
- }
- *outlenp = inlen;
- return (0);
- case IP_TOS:
- case T_IP_TOS:
- if (!checkonly) {
- tcp->tcp_ipha->ipha_type_of_service =
- (uchar_t)*i1;
- tcp->tcp_tos = (uchar_t)*i1;
- }
- break;
- case IP_TTL:
- if (!checkonly) {
- tcp->tcp_ipha->ipha_ttl = (uchar_t)*i1;
- tcp->tcp_ttl = (uchar_t)*i1;
- }
- break;
- case IP_BOUND_IF:
- case IP_NEXTHOP:
- /* Handled at the IP level */
- return (-EINVAL);
case IP_SEC_OPT:
/*
* We should not allow policy setting after
@@ -10368,166 +8334,42 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
*/
if (tcp->tcp_state == TCPS_LISTEN) {
return (EINVAL);
- } else {
- /* Handled at the IP level */
- return (-EINVAL);
}
- default:
- *outlenp = 0;
- return (EINVAL);
+ break;
}
break;
- case IPPROTO_IPV6: {
- ip6_pkt_t *ipp;
-
+ case IPPROTO_IPV6:
/*
* IPPROTO_IPV6 options are only supported for sockets
* that are using IPv6 on the wire.
*/
- if (tcp->tcp_ipversion != IPV6_VERSION) {
+ if (connp->conn_ipversion != IPV6_VERSION) {
*outlenp = 0;
- return (ENOPROTOOPT);
+ return (EINVAL);
}
- /*
- * Only sticky options; no ancillary data
- */
- ipp = &tcp->tcp_sticky_ipp;
switch (name) {
- case IPV6_UNICAST_HOPS:
- /* -1 means use default */
- if (*i1 < -1 || *i1 > IPV6_MAX_HOPS) {
- *outlenp = 0;
- return (EINVAL);
- }
- if (!checkonly) {
- if (*i1 == -1) {
- tcp->tcp_ip6h->ip6_hops =
- ipp->ipp_unicast_hops =
- (uint8_t)tcps->tcps_ipv6_hoplimit;
- ipp->ipp_fields &= ~IPPF_UNICAST_HOPS;
- /* Pass modified value to IP. */
- *i1 = tcp->tcp_ip6h->ip6_hops;
- } else {
- tcp->tcp_ip6h->ip6_hops =
- ipp->ipp_unicast_hops =
- (uint8_t)*i1;
- ipp->ipp_fields |= IPPF_UNICAST_HOPS;
- }
- reterr = tcp_build_hdrs(tcp);
- if (reterr != 0)
- return (reterr);
- }
- break;
- case IPV6_BOUND_IF:
- if (!checkonly) {
- tcp->tcp_bound_if = *i1;
- PASS_OPT_TO_IP(connp);
- }
- break;
- /*
- * Set boolean switches for ancillary data delivery
- */
case IPV6_RECVPKTINFO:
if (!checkonly) {
- if (onoff)
- tcp->tcp_ipv6_recvancillary |=
- TCP_IPV6_RECVPKTINFO;
- else
- tcp->tcp_ipv6_recvancillary &=
- ~TCP_IPV6_RECVPKTINFO;
/* Force it to be sent up with the next msg */
tcp->tcp_recvifindex = 0;
- PASS_OPT_TO_IP(connp);
}
break;
case IPV6_RECVTCLASS:
if (!checkonly) {
- if (onoff)
- tcp->tcp_ipv6_recvancillary |=
- TCP_IPV6_RECVTCLASS;
- else
- tcp->tcp_ipv6_recvancillary &=
- ~TCP_IPV6_RECVTCLASS;
- PASS_OPT_TO_IP(connp);
+ /* Force it to be sent up with the next msg */
+ tcp->tcp_recvtclass = 0xffffffffU;
}
break;
case IPV6_RECVHOPLIMIT:
if (!checkonly) {
- if (onoff)
- tcp->tcp_ipv6_recvancillary |=
- TCP_IPV6_RECVHOPLIMIT;
- else
- tcp->tcp_ipv6_recvancillary &=
- ~TCP_IPV6_RECVHOPLIMIT;
/* Force it to be sent up with the next msg */
tcp->tcp_recvhops = 0xffffffffU;
- PASS_OPT_TO_IP(connp);
- }
- break;
- case IPV6_RECVHOPOPTS:
- if (!checkonly) {
- if (onoff)
- tcp->tcp_ipv6_recvancillary |=
- TCP_IPV6_RECVHOPOPTS;
- else
- tcp->tcp_ipv6_recvancillary &=
- ~TCP_IPV6_RECVHOPOPTS;
- PASS_OPT_TO_IP(connp);
- }
- break;
- case IPV6_RECVDSTOPTS:
- if (!checkonly) {
- if (onoff)
- tcp->tcp_ipv6_recvancillary |=
- TCP_IPV6_RECVDSTOPTS;
- else
- tcp->tcp_ipv6_recvancillary &=
- ~TCP_IPV6_RECVDSTOPTS;
- PASS_OPT_TO_IP(connp);
- }
- break;
- case _OLD_IPV6_RECVDSTOPTS:
- if (!checkonly) {
- if (onoff)
- tcp->tcp_ipv6_recvancillary |=
- TCP_OLD_IPV6_RECVDSTOPTS;
- else
- tcp->tcp_ipv6_recvancillary &=
- ~TCP_OLD_IPV6_RECVDSTOPTS;
- }
- break;
- case IPV6_RECVRTHDR:
- if (!checkonly) {
- if (onoff)
- tcp->tcp_ipv6_recvancillary |=
- TCP_IPV6_RECVRTHDR;
- else
- tcp->tcp_ipv6_recvancillary &=
- ~TCP_IPV6_RECVRTHDR;
- PASS_OPT_TO_IP(connp);
- }
- break;
- case IPV6_RECVRTHDRDSTOPTS:
- if (!checkonly) {
- if (onoff)
- tcp->tcp_ipv6_recvancillary |=
- TCP_IPV6_RECVRTDSTOPTS;
- else
- tcp->tcp_ipv6_recvancillary &=
- ~TCP_IPV6_RECVRTDSTOPTS;
- PASS_OPT_TO_IP(connp);
}
break;
case IPV6_PKTINFO:
- if (inlen != 0 && inlen != sizeof (struct in6_pktinfo))
- return (EINVAL);
- if (checkonly)
- break;
-
- if (inlen == 0) {
- ipp->ipp_fields &= ~(IPPF_IFINDEX|IPPF_ADDR);
- } else {
+ /* This is an extra check for TCP */
+ if (inlen == sizeof (struct in6_pktinfo)) {
struct in6_pktinfo *pkti;
pkti = (struct in6_pktinfo *)invalp;
@@ -10539,219 +8381,8 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
*/
if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr))
return (EINVAL);
- /*
- * IP will validate the source address and
- * interface index.
- */
- if (IPCL_IS_NONSTR(tcp->tcp_connp)) {
- reterr = ip_set_options(tcp->tcp_connp,
- level, name, invalp, inlen, cr);
- } else {
- reterr = ip6_set_pktinfo(cr,
- tcp->tcp_connp, pkti);
- }
- if (reterr != 0)
- return (reterr);
- ipp->ipp_ifindex = pkti->ipi6_ifindex;
- ipp->ipp_addr = pkti->ipi6_addr;
- if (ipp->ipp_ifindex != 0)
- ipp->ipp_fields |= IPPF_IFINDEX;
- else
- ipp->ipp_fields &= ~IPPF_IFINDEX;
- if (!IN6_IS_ADDR_UNSPECIFIED(&ipp->ipp_addr))
- ipp->ipp_fields |= IPPF_ADDR;
- else
- ipp->ipp_fields &= ~IPPF_ADDR;
- }
- reterr = tcp_build_hdrs(tcp);
- if (reterr != 0)
- return (reterr);
- break;
- case IPV6_TCLASS:
- if (inlen != 0 && inlen != sizeof (int))
- return (EINVAL);
- if (checkonly)
- break;
-
- if (inlen == 0) {
- ipp->ipp_fields &= ~IPPF_TCLASS;
- } else {
- if (*i1 > 255 || *i1 < -1)
- return (EINVAL);
- if (*i1 == -1) {
- ipp->ipp_tclass = 0;
- *i1 = 0;
- } else {
- ipp->ipp_tclass = *i1;
- }
- ipp->ipp_fields |= IPPF_TCLASS;
- }
- reterr = tcp_build_hdrs(tcp);
- if (reterr != 0)
- return (reterr);
- break;
- case IPV6_NEXTHOP:
- /*
- * IP will verify that the nexthop is reachable
- * and fail for sticky options.
- */
- if (inlen != 0 && inlen != sizeof (sin6_t))
- return (EINVAL);
- if (checkonly)
- break;
-
- if (inlen == 0) {
- ipp->ipp_fields &= ~IPPF_NEXTHOP;
- } else {
- sin6_t *sin6 = (sin6_t *)invalp;
-
- if (sin6->sin6_family != AF_INET6)
- return (EAFNOSUPPORT);
- if (IN6_IS_ADDR_V4MAPPED(
- &sin6->sin6_addr))
- return (EADDRNOTAVAIL);
- ipp->ipp_nexthop = sin6->sin6_addr;
- if (!IN6_IS_ADDR_UNSPECIFIED(
- &ipp->ipp_nexthop))
- ipp->ipp_fields |= IPPF_NEXTHOP;
- else
- ipp->ipp_fields &= ~IPPF_NEXTHOP;
- }
- reterr = tcp_build_hdrs(tcp);
- if (reterr != 0)
- return (reterr);
- PASS_OPT_TO_IP(connp);
- break;
- case IPV6_HOPOPTS: {
- ip6_hbh_t *hopts = (ip6_hbh_t *)invalp;
-
- /*
- * Sanity checks - minimum size, size a multiple of
- * eight bytes, and matching size passed in.
- */
- if (inlen != 0 &&
- inlen != (8 * (hopts->ip6h_len + 1)))
- return (EINVAL);
-
- if (checkonly)
- break;
-
- reterr = optcom_pkt_set(invalp, inlen, B_TRUE,
- (uchar_t **)&ipp->ipp_hopopts,
- &ipp->ipp_hopoptslen, tcp->tcp_label_len);
- if (reterr != 0)
- return (reterr);
- if (ipp->ipp_hopoptslen == 0)
- ipp->ipp_fields &= ~IPPF_HOPOPTS;
- else
- ipp->ipp_fields |= IPPF_HOPOPTS;
- reterr = tcp_build_hdrs(tcp);
- if (reterr != 0)
- return (reterr);
- break;
- }
- case IPV6_RTHDRDSTOPTS: {
- ip6_dest_t *dopts = (ip6_dest_t *)invalp;
-
- /*
- * Sanity checks - minimum size, size a multiple of
- * eight bytes, and matching size passed in.
- */
- if (inlen != 0 &&
- inlen != (8 * (dopts->ip6d_len + 1)))
- return (EINVAL);
-
- if (checkonly)
- break;
-
- reterr = optcom_pkt_set(invalp, inlen, B_TRUE,
- (uchar_t **)&ipp->ipp_rtdstopts,
- &ipp->ipp_rtdstoptslen, 0);
- if (reterr != 0)
- return (reterr);
- if (ipp->ipp_rtdstoptslen == 0)
- ipp->ipp_fields &= ~IPPF_RTDSTOPTS;
- else
- ipp->ipp_fields |= IPPF_RTDSTOPTS;
- reterr = tcp_build_hdrs(tcp);
- if (reterr != 0)
- return (reterr);
- break;
- }
- case IPV6_DSTOPTS: {
- ip6_dest_t *dopts = (ip6_dest_t *)invalp;
-
- /*
- * Sanity checks - minimum size, size a multiple of
- * eight bytes, and matching size passed in.
- */
- if (inlen != 0 &&
- inlen != (8 * (dopts->ip6d_len + 1)))
- return (EINVAL);
-
- if (checkonly)
- break;
-
- reterr = optcom_pkt_set(invalp, inlen, B_TRUE,
- (uchar_t **)&ipp->ipp_dstopts,
- &ipp->ipp_dstoptslen, 0);
- if (reterr != 0)
- return (reterr);
- if (ipp->ipp_dstoptslen == 0)
- ipp->ipp_fields &= ~IPPF_DSTOPTS;
- else
- ipp->ipp_fields |= IPPF_DSTOPTS;
- reterr = tcp_build_hdrs(tcp);
- if (reterr != 0)
- return (reterr);
- break;
- }
- case IPV6_RTHDR: {
- ip6_rthdr_t *rt = (ip6_rthdr_t *)invalp;
-
- /*
- * Sanity checks - minimum size, size a multiple of
- * eight bytes, and matching size passed in.
- */
- if (inlen != 0 &&
- inlen != (8 * (rt->ip6r_len + 1)))
- return (EINVAL);
-
- if (checkonly)
- break;
-
- reterr = optcom_pkt_set(invalp, inlen, B_TRUE,
- (uchar_t **)&ipp->ipp_rthdr,
- &ipp->ipp_rthdrlen, 0);
- if (reterr != 0)
- return (reterr);
- if (ipp->ipp_rthdrlen == 0)
- ipp->ipp_fields &= ~IPPF_RTHDR;
- else
- ipp->ipp_fields |= IPPF_RTHDR;
- reterr = tcp_build_hdrs(tcp);
- if (reterr != 0)
- return (reterr);
- break;
- }
- case IPV6_V6ONLY:
- if (!checkonly) {
- tcp->tcp_connp->conn_ipv6_v6only = onoff;
}
break;
- case IPV6_USE_MIN_MTU:
- if (inlen != sizeof (int))
- return (EINVAL);
-
- if (*i1 < -1 || *i1 > 1)
- return (EINVAL);
-
- if (checkonly)
- break;
-
- ipp->ipp_fields |= IPPF_USE_MIN_MTU;
- ipp->ipp_use_min_mtu = *i1;
- break;
case IPV6_SEC_OPT:
/*
* We should not allow policy setting after
@@ -10759,30 +8390,18 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
*/
if (tcp->tcp_state == TCPS_LISTEN) {
return (EINVAL);
- } else {
- /* Handled at the IP level */
- return (-EINVAL);
- }
- case IPV6_SRC_PREFERENCES:
- if (inlen != sizeof (uint32_t))
- return (EINVAL);
- reterr = ip6_set_src_preferences(tcp->tcp_connp,
- *(uint32_t *)invalp);
- if (reterr != 0) {
- *outlenp = 0;
- return (reterr);
}
break;
- default:
- *outlenp = 0;
- return (EINVAL);
}
break;
- } /* end IPPROTO_IPV6 */
- default:
+ }
+ reterr = conn_opt_set(&coas, level, name, inlen, invalp,
+ checkonly, cr);
+ if (reterr != 0) {
*outlenp = 0;
- return (EINVAL);
+ return (reterr);
}
+
/*
* Common case of OK return with outval same as inval
*/
@@ -10791,6 +8410,45 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
(void) bcopy(invalp, outvalp, inlen);
}
*outlenp = inlen;
+
+ if (coas.coa_changed & COA_HEADER_CHANGED) {
+ reterr = tcp_build_hdrs(tcp);
+ if (reterr != 0)
+ return (reterr);
+ }
+ if (coas.coa_changed & COA_ROUTE_CHANGED) {
+ in6_addr_t nexthop;
+
+ /*
+ * If we are connected we re-cache the information.
+ * We ignore errors to preserve BSD behavior.
+ * Note that we don't redo IPsec policy lookup here
+ * since the final destination (or source) didn't change.
+ */
+ ip_attr_nexthop(&connp->conn_xmit_ipp, connp->conn_ixa,
+ &connp->conn_faddr_v6, &nexthop);
+
+ if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
+ !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
+ (void) ip_attr_connect(connp, connp->conn_ixa,
+ &connp->conn_laddr_v6, &connp->conn_faddr_v6,
+ &nexthop, connp->conn_fport, NULL, NULL,
+ IPDF_VERIFY_DST);
+ }
+ }
+ if ((coas.coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) {
+ connp->conn_wq->q_hiwat = connp->conn_sndbuf;
+ }
+ if (coas.coa_changed & COA_WROFF_CHANGED) {
+ connp->conn_wroff = connp->conn_ht_iphc_allocated +
+ tcps->tcps_wroff_xtra;
+ (void) proto_set_tx_wroff(connp->conn_rq, connp,
+ connp->conn_wroff);
+ }
+ if (coas.coa_changed & COA_OOBINLINE_CHANGED) {
+ if (IPCL_IS_NONSTR(connp))
+ proto_set_rx_oob_opt(connp, onoff);
+ }
return (0);
}
@@ -10798,12 +8456,12 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
int
tcp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name,
uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
- void *thisdg_attrs, cred_t *cr, mblk_t *mblk)
+ void *thisdg_attrs, cred_t *cr)
{
conn_t *connp = Q_TO_CONN(q);
return (tcp_opt_set(connp, optset_context, level, name, inlen, invalp,
- outlenp, outvalp, thisdg_attrs, cr, mblk));
+ outlenp, outvalp, thisdg_attrs, cr));
}
int
@@ -10843,7 +8501,6 @@ tcp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
error = proto_opt_check(level, option_name, optlen, NULL,
tcp_opt_obj.odb_opt_des_arr,
tcp_opt_obj.odb_opt_arr_cnt,
- tcp_opt_obj.odb_topmost_tpiprovider,
B_TRUE, B_FALSE, cr);
if (error != 0) {
@@ -10856,292 +8513,75 @@ tcp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
error = tcp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, option_name,
optlen, (uchar_t *)optvalp, (uint_t *)&optlen, (uchar_t *)optvalp,
- NULL, cr, NULL);
+ NULL, cr);
squeue_synch_exit(sqp, connp);
- if (error < 0) {
- /*
- * Pass on to ip
- */
- error = ip_set_options(connp, level, option_name, optvalp,
- optlen, cr);
- }
+ ASSERT(error >= 0);
+
return (error);
}
/*
- * Update tcp_sticky_hdrs based on tcp_sticky_ipp.
- * The headers include ip6i_t (if needed), ip6_t, any sticky extension
+ * Build/update the tcp header template (in conn_ht_iphc) based on
+ * conn_xmit_ipp. The headers include ip6_t, any extension
* headers, and the maximum size tcp header (to avoid reallocation
* on the fly for additional tcp options).
+ *
+ * Assumes the caller has already set conn_{faddr,laddr,fport,lport,flowinfo}.
* Returns failure if can't allocate memory.
*/
static int
tcp_build_hdrs(tcp_t *tcp)
{
- char *hdrs;
- uint_t hdrs_len;
- ip6i_t *ip6i;
- char buf[TCP_MAX_HDR_LENGTH];
- ip6_pkt_t *ipp = &tcp->tcp_sticky_ipp;
- in6_addr_t src, dst;
tcp_stack_t *tcps = tcp->tcp_tcps;
- conn_t *connp = tcp->tcp_connp;
+ conn_t *connp = tcp->tcp_connp;
+ tcpha_t *tcpha;
+ uint32_t cksum;
+ int error;
- /*
- * save the existing tcp header and source/dest IP addresses
- */
- bcopy(tcp->tcp_tcph, buf, tcp->tcp_tcp_hdr_len);
- src = tcp->tcp_ip6h->ip6_src;
- dst = tcp->tcp_ip6h->ip6_dst;
- hdrs_len = ip_total_hdrs_len_v6(ipp) + TCP_MAX_HDR_LENGTH;
- ASSERT(hdrs_len != 0);
- if (hdrs_len > tcp->tcp_iphc_len) {
- /* Need to reallocate */
- hdrs = kmem_zalloc(hdrs_len, KM_NOSLEEP);
- if (hdrs == NULL)
- return (ENOMEM);
- if (tcp->tcp_iphc != NULL) {
- if (tcp->tcp_hdr_grown) {
- kmem_free(tcp->tcp_iphc, tcp->tcp_iphc_len);
- } else {
- bzero(tcp->tcp_iphc, tcp->tcp_iphc_len);
- kmem_cache_free(tcp_iphc_cache, tcp->tcp_iphc);
- }
- tcp->tcp_iphc_len = 0;
- }
- ASSERT(tcp->tcp_iphc_len == 0);
- tcp->tcp_iphc = hdrs;
- tcp->tcp_iphc_len = hdrs_len;
- tcp->tcp_hdr_grown = B_TRUE;
- }
- ip_build_hdrs_v6((uchar_t *)tcp->tcp_iphc,
- hdrs_len - TCP_MAX_HDR_LENGTH, ipp, IPPROTO_TCP);
+ /* Grab lock to satisfy ASSERT; TCP is serialized using squeue */
+ mutex_enter(&connp->conn_lock);
+ error = conn_build_hdr_template(connp, TCP_MIN_HEADER_LENGTH,
+ TCP_MAX_TCP_OPTIONS_LENGTH, &connp->conn_laddr_v6,
+ &connp->conn_faddr_v6, connp->conn_flowinfo);
+ mutex_exit(&connp->conn_lock);
+ if (error != 0)
+ return (error);
- /* Set header fields not in ipp */
- if (ipp->ipp_fields & IPPF_HAS_IP6I) {
- ip6i = (ip6i_t *)tcp->tcp_iphc;
- tcp->tcp_ip6h = (ip6_t *)&ip6i[1];
- } else {
- tcp->tcp_ip6h = (ip6_t *)tcp->tcp_iphc;
- }
/*
- * tcp->tcp_ip_hdr_len will include ip6i_t if there is one.
- *
- * tcp->tcp_tcp_hdr_len doesn't change here.
+ * Any routing header/option has been massaged. The checksum difference
+ * is stored in conn_sum for later use.
*/
- tcp->tcp_ip_hdr_len = hdrs_len - TCP_MAX_HDR_LENGTH;
- tcp->tcp_tcph = (tcph_t *)(tcp->tcp_iphc + tcp->tcp_ip_hdr_len);
- tcp->tcp_hdr_len = tcp->tcp_ip_hdr_len + tcp->tcp_tcp_hdr_len;
+ tcpha = (tcpha_t *)connp->conn_ht_ulp;
+ tcp->tcp_tcpha = tcpha;
- bcopy(buf, tcp->tcp_tcph, tcp->tcp_tcp_hdr_len);
-
- tcp->tcp_ip6h->ip6_src = src;
- tcp->tcp_ip6h->ip6_dst = dst;
+ tcpha->tha_lport = connp->conn_lport;
+ tcpha->tha_fport = connp->conn_fport;
+ tcpha->tha_sum = 0;
+ tcpha->tha_offset_and_reserved = (5 << 4);
/*
- * If the hop limit was not set by ip_build_hdrs_v6(), set it to
- * the default value for TCP.
- */
- if (!(ipp->ipp_fields & IPPF_UNICAST_HOPS))
- tcp->tcp_ip6h->ip6_hops = tcps->tcps_ipv6_hoplimit;
-
- /*
- * If we're setting extension headers after a connection
- * has been established, and if we have a routing header
- * among the extension headers, call ip_massage_options_v6 to
- * manipulate the routing header/ip6_dst set the checksum
- * difference in the tcp header template.
- * (This happens in tcp_connect_ipv6 if the routing header
- * is set prior to the connect.)
- * Set the tcp_sum to zero first in case we've cleared a
- * routing header or don't have one at all.
+ * IP wants our header length in the checksum field to
+ * allow it to perform a single pseudo-header+checksum
+ * calculation on behalf of TCP.
+ * Include the adjustment for a source route once IP_OPTIONS is set.
*/
- tcp->tcp_sum = 0;
- if ((tcp->tcp_state >= TCPS_SYN_SENT) &&
- (tcp->tcp_ipp_fields & IPPF_RTHDR)) {
- ip6_rthdr_t *rth = ip_find_rthdr_v6(tcp->tcp_ip6h,
- (uint8_t *)tcp->tcp_tcph);
- if (rth != NULL) {
- tcp->tcp_sum = ip_massage_options_v6(tcp->tcp_ip6h,
- rth, tcps->tcps_netstack);
- tcp->tcp_sum = ntohs((tcp->tcp_sum & 0xFFFF) +
- (tcp->tcp_sum >> 16));
- }
- }
-
- /* Try to get everything in a single mblk */
- (void) proto_set_tx_wroff(tcp->tcp_rq, connp,
- hdrs_len + tcps->tcps_wroff_xtra);
- return (0);
-}
-
-/*
- * Transfer any source route option from ipha to buf/dst in reversed form.
- */
-static int
-tcp_opt_rev_src_route(ipha_t *ipha, char *buf, uchar_t *dst)
-{
- ipoptp_t opts;
- uchar_t *opt;
- uint8_t optval;
- uint8_t optlen;
- uint32_t len = 0;
-
- for (optval = ipoptp_first(&opts, ipha);
- optval != IPOPT_EOL;
- optval = ipoptp_next(&opts)) {
- opt = opts.ipoptp_cur;
- optlen = opts.ipoptp_len;
- switch (optval) {
- int off1, off2;
- case IPOPT_SSRR:
- case IPOPT_LSRR:
-
- /* Reverse source route */
- /*
- * First entry should be the next to last one in the
- * current source route (the last entry is our
- * address.)
- * The last entry should be the final destination.
- */
- buf[IPOPT_OPTVAL] = (uint8_t)optval;
- buf[IPOPT_OLEN] = (uint8_t)optlen;
- off1 = IPOPT_MINOFF_SR - 1;
- off2 = opt[IPOPT_OFFSET] - IP_ADDR_LEN - 1;
- if (off2 < 0) {
- /* No entries in source route */
- break;
- }
- bcopy(opt + off2, dst, IP_ADDR_LEN);
- /*
- * Note: use src since ipha has not had its src
- * and dst reversed (it is in the state it was
- * received.
- */
- bcopy(&ipha->ipha_src, buf + off2,
- IP_ADDR_LEN);
- off2 -= IP_ADDR_LEN;
-
- while (off2 > 0) {
- bcopy(opt + off2, buf + off1,
- IP_ADDR_LEN);
- off1 += IP_ADDR_LEN;
- off2 -= IP_ADDR_LEN;
- }
- buf[IPOPT_OFFSET] = IPOPT_MINOFF_SR;
- buf += optlen;
- len += optlen;
- break;
- }
- }
-done:
- /* Pad the resulting options */
- while (len & 0x3) {
- *buf++ = IPOPT_EOL;
- len++;
- }
- return (len);
-}
-
-
-/*
- * Extract and revert a source route from ipha (if any)
- * and then update the relevant fields in both tcp_t and the standard header.
- */
-static void
-tcp_opt_reverse(tcp_t *tcp, ipha_t *ipha)
-{
- char buf[TCP_MAX_HDR_LENGTH];
- uint_t tcph_len;
- int len;
-
- ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);
- len = IPH_HDR_LENGTH(ipha);
- if (len == IP_SIMPLE_HDR_LENGTH)
- /* Nothing to do */
- return;
- if (len > IP_SIMPLE_HDR_LENGTH + TCP_MAX_IP_OPTIONS_LENGTH ||
- (len & 0x3))
- return;
-
- tcph_len = tcp->tcp_tcp_hdr_len;
- bcopy(tcp->tcp_tcph, buf, tcph_len);
- tcp->tcp_sum = (tcp->tcp_ipha->ipha_dst >> 16) +
- (tcp->tcp_ipha->ipha_dst & 0xffff);
- len = tcp_opt_rev_src_route(ipha, (char *)tcp->tcp_ipha +
- IP_SIMPLE_HDR_LENGTH, (uchar_t *)&tcp->tcp_ipha->ipha_dst);
- len += IP_SIMPLE_HDR_LENGTH;
- tcp->tcp_sum -= ((tcp->tcp_ipha->ipha_dst >> 16) +
- (tcp->tcp_ipha->ipha_dst & 0xffff));
- if ((int)tcp->tcp_sum < 0)
- tcp->tcp_sum--;
- tcp->tcp_sum = (tcp->tcp_sum & 0xFFFF) + (tcp->tcp_sum >> 16);
- tcp->tcp_sum = ntohs((tcp->tcp_sum & 0xFFFF) + (tcp->tcp_sum >> 16));
- tcp->tcp_tcph = (tcph_t *)((char *)tcp->tcp_ipha + len);
- bcopy(buf, tcp->tcp_tcph, tcph_len);
- tcp->tcp_ip_hdr_len = len;
- tcp->tcp_ipha->ipha_version_and_hdr_length =
- (IP_VERSION << 4) | (len >> 2);
- len += tcph_len;
- tcp->tcp_hdr_len = len;
-}
-
-/*
- * Copy the standard header into its new location,
- * lay in the new options and then update the relevant
- * fields in both tcp_t and the standard header.
- */
-static int
-tcp_opt_set_header(tcp_t *tcp, boolean_t checkonly, uchar_t *ptr, uint_t len)
-{
- uint_t tcph_len;
- uint8_t *ip_optp;
- tcph_t *new_tcph;
- tcp_stack_t *tcps = tcp->tcp_tcps;
- conn_t *connp = tcp->tcp_connp;
-
- if ((len > TCP_MAX_IP_OPTIONS_LENGTH) || (len & 0x3))
- return (EINVAL);
-
- if (len > IP_MAX_OPT_LENGTH - tcp->tcp_label_len)
- return (EINVAL);
-
- if (checkonly) {
- /*
- * do not really set, just pretend to - T_CHECK
- */
- return (0);
- }
+ cksum = sizeof (tcpha_t) + connp->conn_sum;
+ cksum = (cksum >> 16) + (cksum & 0xFFFF);
+ ASSERT(cksum < 0x10000);
+ tcpha->tha_sum = htons(cksum);
- ip_optp = (uint8_t *)tcp->tcp_ipha + IP_SIMPLE_HDR_LENGTH;
- if (tcp->tcp_label_len > 0) {
- int padlen;
- uint8_t opt;
+ if (connp->conn_ipversion == IPV4_VERSION)
+ tcp->tcp_ipha = (ipha_t *)connp->conn_ht_iphc;
+ else
+ tcp->tcp_ip6h = (ip6_t *)connp->conn_ht_iphc;
- /* convert list termination to no-ops */
- padlen = tcp->tcp_label_len - ip_optp[IPOPT_OLEN];
- ip_optp += ip_optp[IPOPT_OLEN];
- opt = len > 0 ? IPOPT_NOP : IPOPT_EOL;
- while (--padlen >= 0)
- *ip_optp++ = opt;
- }
- tcph_len = tcp->tcp_tcp_hdr_len;
- new_tcph = (tcph_t *)(ip_optp + len);
- ovbcopy(tcp->tcp_tcph, new_tcph, tcph_len);
- tcp->tcp_tcph = new_tcph;
- bcopy(ptr, ip_optp, len);
-
- len += IP_SIMPLE_HDR_LENGTH + tcp->tcp_label_len;
-
- tcp->tcp_ip_hdr_len = len;
- tcp->tcp_ipha->ipha_version_and_hdr_length =
- (IP_VERSION << 4) | (len >> 2);
- tcp->tcp_hdr_len = len + tcph_len;
- if (!TCP_IS_DETACHED(tcp)) {
- /* Always allocate room for all options. */
- (void) proto_set_tx_wroff(tcp->tcp_rq, connp,
- TCP_MAX_COMBINED_HEADER_LENGTH + tcps->tcps_wroff_xtra);
+ if (connp->conn_ht_iphc_allocated + tcps->tcps_wroff_xtra >
+ connp->conn_wroff) {
+ connp->conn_wroff = connp->conn_ht_iphc_allocated +
+ tcps->tcps_wroff_xtra;
+ (void) proto_set_tx_wroff(connp->conn_rq, connp,
+ connp->conn_wroff);
}
return (0);
}
@@ -11184,36 +8624,6 @@ tcp_param_register(IDP *ndp, tcpparam_t *tcppa, int cnt, tcp_stack_t *tcps)
nd_free(ndp);
return (B_FALSE);
}
- tcps->tcps_mdt_head_param = kmem_zalloc(sizeof (tcpparam_t),
- KM_SLEEP);
- bcopy(&lcl_tcp_mdt_head_param, tcps->tcps_mdt_head_param,
- sizeof (tcpparam_t));
- if (!nd_load(ndp, tcps->tcps_mdt_head_param->tcp_param_name,
- tcp_param_get, tcp_param_set_aligned,
- (caddr_t)tcps->tcps_mdt_head_param)) {
- nd_free(ndp);
- return (B_FALSE);
- }
- tcps->tcps_mdt_tail_param = kmem_zalloc(sizeof (tcpparam_t),
- KM_SLEEP);
- bcopy(&lcl_tcp_mdt_tail_param, tcps->tcps_mdt_tail_param,
- sizeof (tcpparam_t));
- if (!nd_load(ndp, tcps->tcps_mdt_tail_param->tcp_param_name,
- tcp_param_get, tcp_param_set_aligned,
- (caddr_t)tcps->tcps_mdt_tail_param)) {
- nd_free(ndp);
- return (B_FALSE);
- }
- tcps->tcps_mdt_max_pbufs_param = kmem_zalloc(sizeof (tcpparam_t),
- KM_SLEEP);
- bcopy(&lcl_tcp_mdt_max_pbufs_param, tcps->tcps_mdt_max_pbufs_param,
- sizeof (tcpparam_t));
- if (!nd_load(ndp, tcps->tcps_mdt_max_pbufs_param->tcp_param_name,
- tcp_param_get, tcp_param_set_aligned,
- (caddr_t)tcps->tcps_mdt_max_pbufs_param)) {
- nd_free(ndp);
- return (B_FALSE);
- }
if (!nd_load(ndp, "tcp_extra_priv_ports",
tcp_extra_priv_ports_get, NULL, NULL)) {
nd_free(ndp);
@@ -11248,7 +8658,7 @@ tcp_param_register(IDP *ndp, tcpparam_t *tcppa, int cnt, tcp_stack_t *tcps)
return (B_TRUE);
}
-/* ndd set routine for tcp_wroff_xtra, tcp_mdt_hdr_{head,tail}_min. */
+/* ndd set routine for tcp_wroff_xtra. */
/* ARGSUSED */
static int
tcp_param_set_aligned(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
@@ -11307,6 +8717,7 @@ tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start)
uint32_t u1;
tcp_stack_t *tcps = tcp->tcp_tcps;
+
/* Walk through all the new pieces. */
do {
ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <=
@@ -11433,9 +8844,10 @@ tcp_rwnd_reopen(tcp_t *tcp)
{
uint_t ret = 0;
uint_t thwin;
+ conn_t *connp = tcp->tcp_connp;
/* Learn the latest rwnd information that we sent to the other side. */
- thwin = ((uint_t)BE16_TO_U16(tcp->tcp_tcph->th_win))
+ thwin = ((uint_t)ntohs(tcp->tcp_tcpha->tha_win))
<< tcp->tcp_rcv_ws;
/* This is peer's calculated send window (our receive window). */
thwin -= tcp->tcp_rnxt - tcp->tcp_rack;
@@ -11444,7 +8856,7 @@ tcp_rwnd_reopen(tcp_t *tcp)
* SWS avoidance. This means that we need to check the increase of
* of receive window is at least 1 MSS.
*/
- if (tcp->tcp_recv_hiwater - thwin >= tcp->tcp_mss) {
+ if (connp->conn_rcvbuf - thwin >= tcp->tcp_mss) {
/*
* If the window that the other side knows is less than max
* deferred acks segments, send an update immediately.
@@ -11453,7 +8865,7 @@ tcp_rwnd_reopen(tcp_t *tcp)
BUMP_MIB(&tcp->tcp_tcps->tcps_mib, tcpOutWinUpdate);
ret = TH_ACK_NEEDED;
}
- tcp->tcp_rwnd = tcp->tcp_recv_hiwater;
+ tcp->tcp_rwnd = connp->conn_rcvbuf;
}
return (ret);
}
@@ -11469,7 +8881,7 @@ tcp_rcv_drain(tcp_t *tcp)
#ifdef DEBUG
uint_t cnt = 0;
#endif
- queue_t *q = tcp->tcp_rq;
+ queue_t *q = tcp->tcp_connp->conn_rq;
/* Can't drain on an eager connection */
if (tcp->tcp_listener != NULL)
@@ -11511,7 +8923,7 @@ tcp_rcv_drain(tcp_t *tcp)
if ((tcp->tcp_kssl_ctx != NULL) && (DB_TYPE(mp) == M_DATA)) {
DTRACE_PROBE1(kssl_mblk__ksslinput_rcvdrain,
mblk_t *, mp);
- tcp_kssl_input(tcp, mp);
+ tcp_kssl_input(tcp, mp, NULL);
continue;
}
putnext(q, mp);
@@ -11538,11 +8950,22 @@ tcp_rcv_drain(tcp_t *tcp)
* Other messages are added as new (b_next) elements.
*/
void
-tcp_rcv_enqueue(tcp_t *tcp, mblk_t *mp, uint_t seg_len)
+tcp_rcv_enqueue(tcp_t *tcp, mblk_t *mp, uint_t seg_len, cred_t *cr)
{
ASSERT(seg_len == msgdsize(mp));
ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_rcv_last_head != NULL);
+ if (is_system_labeled()) {
+ ASSERT(cr != NULL || msg_getcred(mp, NULL) != NULL);
+ /*
+ * Provide for protocols above TCP such as RPC. NOPID leaves
+ * db_cpid unchanged.
+ * The cred could have already been set.
+ */
+ if (cr != NULL)
+ mblk_setcred(mp, cr, NOPID);
+ }
+
if (tcp->tcp_rcv_list == NULL) {
ASSERT(tcp->tcp_rcv_last_head == NULL);
tcp->tcp_rcv_list = mp;
@@ -11562,176 +8985,6 @@ tcp_rcv_enqueue(tcp_t *tcp, mblk_t *mp, uint_t seg_len)
tcp->tcp_rwnd -= seg_len;
}
-/*
- * DEFAULT TCP ENTRY POINT via squeue on READ side.
- *
- * This is the default entry function into TCP on the read side. TCP is
- * always entered via squeue i.e. using squeue's for mutual exclusion.
- * When classifier does a lookup to find the tcp, it also puts a reference
- * on the conn structure associated so the tcp is guaranteed to exist
- * when we come here. We still need to check the state because it might
- * as well has been closed. The squeue processing function i.e. squeue_enter,
- * is responsible for doing the CONN_DEC_REF.
- *
- * Apart from the default entry point, IP also sends packets directly to
- * tcp_rput_data for AF_INET fast path and tcp_conn_request for incoming
- * connections.
- */
-boolean_t tcp_outbound_squeue_switch = B_FALSE;
-void
-tcp_input(void *arg, mblk_t *mp, void *arg2)
-{
- conn_t *connp = (conn_t *)arg;
- tcp_t *tcp = (tcp_t *)connp->conn_tcp;
-
- /* arg2 is the sqp */
- ASSERT(arg2 != NULL);
- ASSERT(mp != NULL);
-
- /*
- * Don't accept any input on a closed tcp as this TCP logically does
- * not exist on the system. Don't proceed further with this TCP.
- * For eg. this packet could trigger another close of this tcp
- * which would be disastrous for tcp_refcnt. tcp_close_detached /
- * tcp_clean_death / tcp_closei_local must be called at most once
- * on a TCP. In this case we need to refeed the packet into the
- * classifier and figure out where the packet should go. Need to
- * preserve the recv_ill somehow. Until we figure that out, for
- * now just drop the packet if we can't classify the packet.
- */
- if (tcp->tcp_state == TCPS_CLOSED ||
- tcp->tcp_state == TCPS_BOUND) {
- conn_t *new_connp;
- ip_stack_t *ipst = tcp->tcp_tcps->tcps_netstack->netstack_ip;
-
- new_connp = ipcl_classify(mp, connp->conn_zoneid, ipst);
- if (new_connp != NULL) {
- tcp_reinput(new_connp, mp, arg2);
- return;
- }
- /* We failed to classify. For now just drop the packet */
- freemsg(mp);
- return;
- }
-
- if (DB_TYPE(mp) != M_DATA) {
- tcp_rput_common(tcp, mp);
- return;
- }
-
- if (mp->b_datap->db_struioflag & STRUIO_CONNECT) {
- squeue_t *final_sqp;
-
- mp->b_datap->db_struioflag &= ~STRUIO_CONNECT;
- final_sqp = (squeue_t *)DB_CKSUMSTART(mp);
- DB_CKSUMSTART(mp) = 0;
- if (tcp->tcp_state == TCPS_SYN_SENT &&
- connp->conn_final_sqp == NULL &&
- tcp_outbound_squeue_switch) {
- ASSERT(connp->conn_initial_sqp == connp->conn_sqp);
- connp->conn_final_sqp = final_sqp;
- if (connp->conn_final_sqp != connp->conn_sqp) {
- CONN_INC_REF(connp);
- SQUEUE_SWITCH(connp, connp->conn_final_sqp);
- SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
- tcp_rput_data, connp, ip_squeue_flag,
- SQTAG_CONNECT_FINISH);
- return;
- }
- }
- }
- tcp_rput_data(connp, mp, arg2);
-}
-
-/*
- * The read side put procedure.
- * The packets passed up by ip are assume to be aligned according to
- * OK_32PTR and the IP+TCP headers fitting in the first mblk.
- */
-static void
-tcp_rput_common(tcp_t *tcp, mblk_t *mp)
-{
- /*
- * tcp_rput_data() does not expect M_CTL except for the case
- * where tcp_ipv6_recvancillary is set and we get a IN_PKTINFO
- * type. Need to make sure that any other M_CTLs don't make
- * it to tcp_rput_data since it is not expecting any and doesn't
- * check for it.
- */
- if (DB_TYPE(mp) == M_CTL) {
- switch (*(uint32_t *)(mp->b_rptr)) {
- case TCP_IOC_ABORT_CONN:
- /*
- * Handle connection abort request.
- */
- tcp_ioctl_abort_handler(tcp, mp);
- return;
- case IPSEC_IN:
- /*
- * Only secure icmp arrive in TCP and they
- * don't go through data path.
- */
- tcp_icmp_error(tcp, mp);
- return;
- case IN_PKTINFO:
- /*
- * Handle IPV6_RECVPKTINFO socket option on AF_INET6
- * sockets that are receiving IPv4 traffic. tcp
- */
- ASSERT(tcp->tcp_family == AF_INET6);
- ASSERT(tcp->tcp_ipv6_recvancillary &
- TCP_IPV6_RECVPKTINFO);
- tcp_rput_data(tcp->tcp_connp, mp,
- tcp->tcp_connp->conn_sqp);
- return;
- case MDT_IOC_INFO_UPDATE:
- /*
- * Handle Multidata information update; the
- * following routine will free the message.
- */
- if (tcp->tcp_connp->conn_mdt_ok) {
- tcp_mdt_update(tcp,
- &((ip_mdt_info_t *)mp->b_rptr)->mdt_capab,
- B_FALSE);
- }
- freemsg(mp);
- return;
- case LSO_IOC_INFO_UPDATE:
- /*
- * Handle LSO information update; the following
- * routine will free the message.
- */
- if (tcp->tcp_connp->conn_lso_ok) {
- tcp_lso_update(tcp,
- &((ip_lso_info_t *)mp->b_rptr)->lso_capab);
- }
- freemsg(mp);
- return;
- default:
- /*
- * tcp_icmp_err() will process the M_CTL packets.
- * Non-ICMP packets, if any, will be discarded in
- * tcp_icmp_err(). We will process the ICMP packet
- * even if we are TCP_IS_DETACHED_NONEAGER as the
- * incoming ICMP packet may result in changing
- * the tcp_mss, which we would need if we have
- * packets to retransmit.
- */
- tcp_icmp_error(tcp, mp);
- return;
- }
- }
-
- /* No point processing the message if tcp is already closed */
- if (TCP_IS_DETACHED_NONEAGER(tcp)) {
- freemsg(mp);
- return;
- }
-
- tcp_rput_other(tcp, mp);
-}
-
-
/* The minimum of smoothed mean deviation in RTO calculation. */
#define TCP_SD_MIN 400
@@ -11885,12 +9138,12 @@ tcp_get_seg_mp(tcp_t *tcp, uint32_t seq, int32_t *off)
* segments. A segment is eligible if sack_cnt for that segment is greater
* than or equal tcp_dupack_fast_retransmit. After it has retransmitted
* all eligible segments, it checks to see if TCP can send some new segments
- * (fast recovery). If it can, set the appropriate flag for tcp_rput_data().
+ * (fast recovery). If it can, set the appropriate flag for tcp_input_data().
*
* Parameters:
* tcp_t *tcp: the tcp structure of the connection.
* uint_t *flags: in return, appropriate value will be set for
- * tcp_rput_data().
+ * tcp_input_data().
*/
static void
tcp_sack_rxmit(tcp_t *tcp, uint_t *flags)
@@ -11988,7 +9241,7 @@ tcp_sack_rxmit(tcp_t *tcp, uint_t *flags)
tcp->tcp_pipe += seg_len;
tcp->tcp_sack_snxt = begin + seg_len;
- tcp_send_data(tcp, tcp->tcp_wq, xmit_mp);
+ tcp_send_data(tcp, xmit_mp);
/*
* Update the send timestamp to avoid false retransmission.
@@ -12012,96 +9265,8 @@ tcp_sack_rxmit(tcp_t *tcp, uint_t *flags)
}
/*
- * This function handles policy checking at TCP level for non-hard_bound/
- * detached connections.
- */
-static boolean_t
-tcp_check_policy(tcp_t *tcp, mblk_t *first_mp, ipha_t *ipha, ip6_t *ip6h,
- boolean_t secure, boolean_t mctl_present)
-{
- ipsec_latch_t *ipl = NULL;
- ipsec_action_t *act = NULL;
- mblk_t *data_mp;
- ipsec_in_t *ii;
- const char *reason;
- kstat_named_t *counter;
- tcp_stack_t *tcps = tcp->tcp_tcps;
- ipsec_stack_t *ipss;
- ip_stack_t *ipst;
-
- ASSERT(mctl_present || !secure);
-
- ASSERT((ipha == NULL && ip6h != NULL) ||
- (ip6h == NULL && ipha != NULL));
-
- /*
- * We don't necessarily have an ipsec_in_act action to verify
- * policy because of assymetrical policy where we have only
- * outbound policy and no inbound policy (possible with global
- * policy).
- */
- if (!secure) {
- if (act == NULL || act->ipa_act.ipa_type == IPSEC_ACT_BYPASS ||
- act->ipa_act.ipa_type == IPSEC_ACT_CLEAR)
- return (B_TRUE);
- ipsec_log_policy_failure(IPSEC_POLICY_MISMATCH,
- "tcp_check_policy", ipha, ip6h, secure,
- tcps->tcps_netstack);
- ipss = tcps->tcps_netstack->netstack_ipsec;
-
- ip_drop_packet(first_mp, B_TRUE, NULL, NULL,
- DROPPER(ipss, ipds_tcp_clear),
- &tcps->tcps_dropper);
- return (B_FALSE);
- }
-
- /*
- * We have a secure packet.
- */
- if (act == NULL) {
- ipsec_log_policy_failure(IPSEC_POLICY_NOT_NEEDED,
- "tcp_check_policy", ipha, ip6h, secure,
- tcps->tcps_netstack);
- ipss = tcps->tcps_netstack->netstack_ipsec;
-
- ip_drop_packet(first_mp, B_TRUE, NULL, NULL,
- DROPPER(ipss, ipds_tcp_secure),
- &tcps->tcps_dropper);
- return (B_FALSE);
- }
-
- /*
- * XXX This whole routine is currently incorrect. ipl should
- * be set to the latch pointer, but is currently not set, so
- * we initialize it to NULL to avoid picking up random garbage.
- */
- if (ipl == NULL)
- return (B_TRUE);
-
- data_mp = first_mp->b_cont;
-
- ii = (ipsec_in_t *)first_mp->b_rptr;
-
- ipst = tcps->tcps_netstack->netstack_ip;
-
- if (ipsec_check_ipsecin_latch(ii, data_mp, ipl, ipha, ip6h, &reason,
- &counter, tcp->tcp_connp)) {
- BUMP_MIB(&ipst->ips_ip_mib, ipsecInSucceeded);
- return (B_TRUE);
- }
- (void) strlog(TCP_MOD_ID, 0, 0, SL_ERROR|SL_WARN|SL_CONSOLE,
- "tcp inbound policy mismatch: %s, packet dropped\n",
- reason);
- BUMP_MIB(&ipst->ips_ip_mib, ipsecInFailed);
-
- ip_drop_packet(first_mp, B_TRUE, NULL, NULL, counter,
- &tcps->tcps_dropper);
- return (B_FALSE);
-}
-
-/*
- * tcp_ss_rexmit() is called in tcp_rput_data() to do slow start
- * retransmission after a timeout.
+ * tcp_ss_rexmit() is called to do slow start retransmission after a timeout
+ * or ICMP errors.
*
* To limit the number of duplicate segments, we limit the number of segment
* to be sent in one time to tcp_snd_burst, the burst variable.
@@ -12150,7 +9315,7 @@ tcp_ss_rexmit(tcp_t *tcp)
if (xmit_mp == NULL)
return;
- tcp_send_data(tcp, tcp->tcp_wq, xmit_mp);
+ tcp_send_data(tcp, xmit_mp);
snxt += cnt;
win -= cnt;
@@ -12184,7 +9349,7 @@ tcp_ss_rexmit(tcp_t *tcp)
/*
* Process all TCP option in SYN segment. Note that this function should
- * be called after tcp_adapt_ire() is called so that the necessary info
+ * be called after tcp_set_destination() is called so that the necessary info
* from IRE is already set in the tcp structure.
*
* This function sets up the correct tcp_mss value according to the
@@ -12194,16 +9359,17 @@ tcp_ss_rexmit(tcp_t *tcp)
* should do the appropriate change.
*/
void
-tcp_process_options(tcp_t *tcp, tcph_t *tcph)
+tcp_process_options(tcp_t *tcp, tcpha_t *tcpha)
{
int options;
tcp_opt_t tcpopt;
uint32_t mss_max;
char *tmp_tcph;
tcp_stack_t *tcps = tcp->tcp_tcps;
+ conn_t *connp = tcp->tcp_connp;
tcpopt.tcp = NULL;
- options = tcp_parse_options(tcph, &tcpopt);
+ options = tcp_parse_options(tcpha, &tcpopt);
/*
* Process MSS option. Note that MSS option value does not account
@@ -12212,12 +9378,12 @@ tcp_process_options(tcp_t *tcp, tcph_t *tcph)
* IPv6.
*/
if (!(options & TCP_OPT_MSS_PRESENT)) {
- if (tcp->tcp_ipversion == IPV4_VERSION)
+ if (connp->conn_ipversion == IPV4_VERSION)
tcpopt.tcp_opt_mss = tcps->tcps_mss_def_ipv4;
else
tcpopt.tcp_opt_mss = tcps->tcps_mss_def_ipv6;
} else {
- if (tcp->tcp_ipversion == IPV4_VERSION)
+ if (connp->conn_ipversion == IPV4_VERSION)
mss_max = tcps->tcps_mss_max_ipv4;
else
mss_max = tcps->tcps_mss_max_ipv6;
@@ -12240,23 +9406,23 @@ tcp_process_options(tcp_t *tcp, tcph_t *tcph)
/* Process Timestamp option. */
if ((options & TCP_OPT_TSTAMP_PRESENT) &&
(tcp->tcp_snd_ts_ok || TCP_IS_DETACHED(tcp))) {
- tmp_tcph = (char *)tcp->tcp_tcph;
+ tmp_tcph = (char *)tcp->tcp_tcpha;
tcp->tcp_snd_ts_ok = B_TRUE;
tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val;
tcp->tcp_last_rcv_lbolt = lbolt64;
ASSERT(OK_32PTR(tmp_tcph));
- ASSERT(tcp->tcp_tcp_hdr_len == TCP_MIN_HEADER_LENGTH);
+ ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH);
/* Fill in our template header with basic timestamp option. */
- tmp_tcph += tcp->tcp_tcp_hdr_len;
+ tmp_tcph += connp->conn_ht_ulp_len;
tmp_tcph[0] = TCPOPT_NOP;
tmp_tcph[1] = TCPOPT_NOP;
tmp_tcph[2] = TCPOPT_TSTAMP;
tmp_tcph[3] = TCPOPT_TSTAMP_LEN;
- tcp->tcp_hdr_len += TCPOPT_REAL_TS_LEN;
- tcp->tcp_tcp_hdr_len += TCPOPT_REAL_TS_LEN;
- tcp->tcp_tcph->th_offset_and_rsrvd[0] += (3 << 4);
+ connp->conn_ht_iphc_len += TCPOPT_REAL_TS_LEN;
+ connp->conn_ht_ulp_len += TCPOPT_REAL_TS_LEN;
+ tcp->tcp_tcpha->tha_offset_and_reserved += (3 << 4);
} else {
tcp->tcp_snd_ts_ok = B_FALSE;
}
@@ -12266,12 +9432,11 @@ tcp_process_options(tcp_t *tcp, tcph_t *tcph)
* then allocate the SACK info structure. Note the following ways
* when tcp_snd_sack_ok is set to true.
*
- * For active connection: in tcp_adapt_ire() called in
- * tcp_rput_other(), or in tcp_rput_other() when tcp_sack_permitted
- * is checked.
+ * For active connection: in tcp_set_destination() called in
+ * tcp_connect().
*
- * For passive connection: in tcp_adapt_ire() called in
- * tcp_accept_comm().
+ * For passive connection: in tcp_set_destination() called in
+ * tcp_input_listener().
*
* That's the reason why the extra TCP_IS_DETACHED() check is there.
* That check makes sure that if we did not send a SACK OK option,
@@ -12320,7 +9485,8 @@ tcp_process_options(tcp_t *tcp, tcph_t *tcph)
* Now we know the exact TCP/IP header length, subtract
* that from tcp_mss to get our side's MSS.
*/
- tcp->tcp_mss -= tcp->tcp_hdr_len;
+ tcp->tcp_mss -= connp->conn_ht_iphc_len;
+
/*
* Here we assume that the other side's header size will be equal to
* our header size. We calculate the real MSS accordingly. Need to
@@ -12328,22 +9494,29 @@ tcp_process_options(tcp_t *tcp, tcph_t *tcph)
*
* Real MSS = Opt.MSS - (our TCP/IP header - min TCP/IP header)
*/
- tcpopt.tcp_opt_mss -= tcp->tcp_hdr_len + tcp->tcp_ipsec_overhead -
- ((tcp->tcp_ipversion == IPV4_VERSION ?
+ tcpopt.tcp_opt_mss -= connp->conn_ht_iphc_len +
+ tcp->tcp_ipsec_overhead -
+ ((connp->conn_ipversion == IPV4_VERSION ?
IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN) + TCP_MIN_HEADER_LENGTH);
/*
* Set MSS to the smaller one of both ends of the connection.
* We should not have called tcp_mss_set() before, but our
* side of the MSS should have been set to a proper value
- * by tcp_adapt_ire(). tcp_mss_set() will also set up the
+ * by tcp_set_destination(). tcp_mss_set() will also set up the
* STREAM head parameters properly.
*
* If we have a larger-than-16-bit window but the other side
* didn't want to do window scale, tcp_rwnd_set() will take
* care of that.
*/
- tcp_mss_set(tcp, MIN(tcpopt.tcp_opt_mss, tcp->tcp_mss), B_TRUE);
+ tcp_mss_set(tcp, MIN(tcpopt.tcp_opt_mss, tcp->tcp_mss));
+
+ /*
+ * Initialize tcp_cwnd value. After tcp_mss_set(), tcp_mss has been
+ * updated properly.
+ */
+ SET_TCP_INIT_CWND(tcp, tcp->tcp_mss, tcps->tcps_slow_start_initial);
}
/*
@@ -12410,7 +9583,7 @@ tcp_send_conn_ind(void *arg, mblk_t *mp, void *arg2)
tcp_t *tail;
/*
- * The eager already has an extra ref put in tcp_rput_data
+ * The eager already has an extra ref put in tcp_input_data
* so that it stays till accept comes back even though it
* might get into TCPS_CLOSED as a result of a TH_RST etc.
*/
@@ -12496,8 +9669,8 @@ tcp_send_conn_ind(void *arg, mblk_t *mp, void *arg2)
* remote host. This proves the IP addr is good.
* Cache it!
*/
- addr_cache[IP_ADDR_CACHE_HASH(
- tcp->tcp_remote)] = tcp->tcp_remote;
+ addr_cache[IP_ADDR_CACHE_HASH(tcp->tcp_connp->conn_faddr_v4)] =
+ tcp->tcp_connp->conn_faddr_v4;
}
mutex_exit(&listener->tcp_eager_lock);
if (need_send_conn_ind)
@@ -12513,17 +9686,16 @@ tcp_ulp_newconn(conn_t *lconnp, conn_t *econnp, mblk_t *mp)
{
if (IPCL_IS_NONSTR(lconnp)) {
cred_t *cr;
- pid_t cpid;
-
- cr = msg_getcred(mp, &cpid);
+ pid_t cpid = NOPID;
ASSERT(econnp->conn_tcp->tcp_listener == lconnp->conn_tcp);
ASSERT(econnp->conn_tcp->tcp_saved_listener ==
lconnp->conn_tcp);
+ cr = msg_getcred(mp, &cpid);
+
/* Keep the message around in case of a fallback to TPI */
econnp->conn_tcp->tcp_conn.tcp_eager_conn_ind = mp;
-
/*
* Notify the ULP about the newconn. It is guaranteed that no
* tcp_accept() call will be made for the eager if the
@@ -12545,177 +9717,83 @@ tcp_ulp_newconn(conn_t *lconnp, conn_t *econnp, mblk_t *mp)
econnp->conn_tcp->tcp_conn_req_seqnum);
}
} else {
- putnext(lconnp->conn_tcp->tcp_rq, mp);
+ putnext(lconnp->conn_rq, mp);
}
}
-mblk_t *
-tcp_find_pktinfo(tcp_t *tcp, mblk_t *mp, uint_t *ipversp, uint_t *ip_hdr_lenp,
- uint_t *ifindexp, ip6_pkt_t *ippp)
+/*
+ * Handle a packet that has been reclassified by TCP.
+ * This function drops the ref on connp that the caller had.
+ */
+static void
+tcp_reinput(conn_t *connp, mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst)
{
- ip_pktinfo_t *pinfo;
- ip6_t *ip6h;
- uchar_t *rptr;
- mblk_t *first_mp = mp;
- boolean_t mctl_present = B_FALSE;
- uint_t ifindex = 0;
- ip6_pkt_t ipp;
- uint_t ipvers;
- uint_t ip_hdr_len;
- tcp_stack_t *tcps = tcp->tcp_tcps;
+ ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec;
- rptr = mp->b_rptr;
- ASSERT(OK_32PTR(rptr));
- ASSERT(tcp != NULL);
- ipp.ipp_fields = 0;
+ if (connp->conn_incoming_ifindex != 0 &&
+ connp->conn_incoming_ifindex != ira->ira_ruifindex) {
+ freemsg(mp);
+ CONN_DEC_REF(connp);
+ return;
+ }
- switch DB_TYPE(mp) {
- case M_CTL:
- mp = mp->b_cont;
- if (mp == NULL) {
- freemsg(first_mp);
- return (NULL);
+ if (CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss) ||
+ (ira->ira_flags & IRAF_IPSEC_SECURE)) {
+ ip6_t *ip6h;
+ ipha_t *ipha;
+
+ if (ira->ira_flags & IRAF_IS_IPV4) {
+ ipha = (ipha_t *)mp->b_rptr;
+ ip6h = NULL;
+ } else {
+ ipha = NULL;
+ ip6h = (ip6_t *)mp->b_rptr;
}
- if (DB_TYPE(mp) != M_DATA) {
- freemsg(first_mp);
- return (NULL);
+ mp = ipsec_check_inbound_policy(mp, connp, ipha, ip6h, ira);
+ if (mp == NULL) {
+ BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards);
+ /* Note that mp is NULL */
+ ip_drop_input("ipIfStatsInDiscards", mp, NULL);
+ CONN_DEC_REF(connp);
+ return;
}
- mctl_present = B_TRUE;
- break;
- case M_DATA:
- break;
- default:
- cmn_err(CE_NOTE, "tcp_find_pktinfo: unknown db_type");
- freemsg(mp);
- return (NULL);
}
- ipvers = IPH_HDR_VERSION(rptr);
- if (ipvers == IPV4_VERSION) {
- if (tcp == NULL) {
- ip_hdr_len = IPH_HDR_LENGTH(rptr);
- goto done;
- }
-
- ipp.ipp_fields |= IPPF_HOPLIMIT;
- ipp.ipp_hoplimit = ((ipha_t *)rptr)->ipha_ttl;
+ if (IPCL_IS_TCP(connp)) {
/*
- * If we have IN_PKTINFO in an M_CTL and tcp_ipv6_recvancillary
- * has TCP_IPV6_RECVPKTINFO set, pass I/F index along in ipp.
+ * do not drain, certain use cases can blow
+ * the stack
*/
- if ((tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO) &&
- mctl_present) {
- pinfo = (ip_pktinfo_t *)first_mp->b_rptr;
- if ((MBLKL(first_mp) == sizeof (ip_pktinfo_t)) &&
- (pinfo->ip_pkt_ulp_type == IN_PKTINFO) &&
- (pinfo->ip_pkt_flags & IPF_RECVIF)) {
- ipp.ipp_fields |= IPPF_IFINDEX;
- ipp.ipp_ifindex = pinfo->ip_pkt_ifindex;
- ifindex = pinfo->ip_pkt_ifindex;
- }
- freeb(first_mp);
- mctl_present = B_FALSE;
- }
- ip_hdr_len = IPH_HDR_LENGTH(rptr);
+ SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
+ connp->conn_recv, connp, ira,
+ SQ_NODRAIN, SQTAG_IP_TCP_INPUT);
} else {
- ip6h = (ip6_t *)rptr;
-
- ASSERT(ipvers == IPV6_VERSION);
- ipp.ipp_fields = IPPF_HOPLIMIT | IPPF_TCLASS;
- ipp.ipp_tclass = (ip6h->ip6_flow & 0x0FF00000) >> 20;
- ipp.ipp_hoplimit = ip6h->ip6_hops;
-
- if (ip6h->ip6_nxt != IPPROTO_TCP) {
- uint8_t nexthdrp;
- ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;
-
- /* Look for ifindex information */
- if (ip6h->ip6_nxt == IPPROTO_RAW) {
- ip6i_t *ip6i = (ip6i_t *)ip6h;
- if ((uchar_t *)&ip6i[1] > mp->b_wptr) {
- BUMP_MIB(&ipst->ips_ip_mib, tcpInErrs);
- freemsg(first_mp);
- return (NULL);
- }
-
- if (ip6i->ip6i_flags & IP6I_IFINDEX) {
- ASSERT(ip6i->ip6i_ifindex != 0);
- ipp.ipp_fields |= IPPF_IFINDEX;
- ipp.ipp_ifindex = ip6i->ip6i_ifindex;
- ifindex = ip6i->ip6i_ifindex;
- }
- rptr = (uchar_t *)&ip6i[1];
- mp->b_rptr = rptr;
- if (rptr == mp->b_wptr) {
- mblk_t *mp1;
- mp1 = mp->b_cont;
- freeb(mp);
- mp = mp1;
- rptr = mp->b_rptr;
- }
- if (MBLKL(mp) < IPV6_HDR_LEN +
- sizeof (tcph_t)) {
- BUMP_MIB(&ipst->ips_ip_mib, tcpInErrs);
- freemsg(first_mp);
- return (NULL);
- }
- ip6h = (ip6_t *)rptr;
- }
-
- /*
- * Find any potentially interesting extension headers
- * as well as the length of the IPv6 + extension
- * headers.
- */
- ip_hdr_len = ip_find_hdr_v6(mp, ip6h, &ipp, &nexthdrp);
- /* Verify if this is a TCP packet */
- if (nexthdrp != IPPROTO_TCP) {
- BUMP_MIB(&ipst->ips_ip_mib, tcpInErrs);
- freemsg(first_mp);
- return (NULL);
- }
- } else {
- ip_hdr_len = IPV6_HDR_LEN;
- }
+ /* Not TCP; must be SOCK_RAW, IPPROTO_TCP */
+ (connp->conn_recv)(connp, mp, NULL,
+ ira);
+ CONN_DEC_REF(connp);
}
-done:
- if (ipversp != NULL)
- *ipversp = ipvers;
- if (ip_hdr_lenp != NULL)
- *ip_hdr_lenp = ip_hdr_len;
- if (ippp != NULL)
- *ippp = ipp;
- if (ifindexp != NULL)
- *ifindexp = ifindex;
- if (mctl_present) {
- freeb(first_mp);
- }
- return (mp);
}
+boolean_t tcp_outbound_squeue_switch = B_FALSE;
+
/*
* Handle M_DATA messages from IP. Its called directly from IP via
- * squeue for AF_INET type sockets fast path. No M_CTL are expected
- * in this path.
- *
- * For everything else (including AF_INET6 sockets with 'tcp_ipversion'
- * v4 and v6), we are called through tcp_input() and a M_CTL can
- * be present for options but tcp_find_pktinfo() deals with it. We
- * only expect M_DATA packets after tcp_find_pktinfo() is done.
+ * squeue for received IP packets.
*
* The first argument is always the connp/tcp to which the mp belongs.
* There are no exceptions to this rule. The caller has already put
- * a reference on this connp/tcp and once tcp_rput_data() returns,
+ * a reference on this connp/tcp and once tcp_input_data() returns,
* the squeue will do the refrele.
*
- * The TH_SYN for the listener directly go to tcp_conn_request via
- * squeue.
+ * The TH_SYN for the listener directly go to tcp_input_listener via
+ * squeue. ICMP errors go directly to tcp_icmp_input().
*
* sqp: NULL = recursive, sqp != NULL means called from squeue
*/
void
-tcp_rput_data(void *arg, mblk_t *mp, void *arg2)
+tcp_input_data(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
{
int32_t bytes_acked;
int32_t gap;
@@ -12729,11 +9807,10 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2)
int seg_len;
uint_t ip_hdr_len;
uint32_t seg_seq;
- tcph_t *tcph;
+ tcpha_t *tcpha;
int urp;
tcp_opt_t tcpopt;
- uint_t ipvers;
- ip6_pkt_t ipp;
+ ip_pkt_t ipp;
boolean_t ofo_seg = B_FALSE; /* Out of order segment */
uint32_t cwnd;
uint32_t add;
@@ -12756,33 +9833,43 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2)
rptr = mp->b_rptr;
ASSERT(OK_32PTR(rptr));
- /*
- * An AF_INET socket is not capable of receiving any pktinfo. Do inline
- * processing here. For rest call tcp_find_pktinfo to fill up the
- * necessary information.
- */
- if (IPCL_IS_TCP4(connp)) {
- ipvers = IPV4_VERSION;
- ip_hdr_len = IPH_HDR_LENGTH(rptr);
- } else {
- mp = tcp_find_pktinfo(tcp, mp, &ipvers, &ip_hdr_len,
- NULL, &ipp);
- if (mp == NULL) {
- TCP_STAT(tcps, tcp_rput_v6_error);
- return;
+ ip_hdr_len = ira->ira_ip_hdr_length;
+ if (connp->conn_recv_ancillary.crb_all != 0) {
+ /*
+ * Record packet information in the ip_pkt_t
+ */
+ ipp.ipp_fields = 0;
+ if (ira->ira_flags & IRAF_IS_IPV4) {
+ (void) ip_find_hdr_v4((ipha_t *)rptr, &ipp,
+ B_FALSE);
+ } else {
+ uint8_t nexthdrp;
+
+ /*
+ * IPv6 packets can only be received by applications
+ * that are prepared to receive IPv6 addresses.
+ * The IP fanout must ensure this.
+ */
+ ASSERT(connp->conn_family == AF_INET6);
+
+ (void) ip_find_hdr_v6(mp, (ip6_t *)rptr, B_TRUE, &ipp,
+ &nexthdrp);
+ ASSERT(nexthdrp == IPPROTO_TCP);
+
+ /* Could have caused a pullup? */
+ iphdr = mp->b_rptr;
+ rptr = mp->b_rptr;
}
- iphdr = mp->b_rptr;
- rptr = mp->b_rptr;
}
ASSERT(DB_TYPE(mp) == M_DATA);
ASSERT(mp->b_next == NULL);
- tcph = (tcph_t *)&rptr[ip_hdr_len];
- seg_seq = ABE32_TO_U32(tcph->th_seq);
- seg_ack = ABE32_TO_U32(tcph->th_ack);
+ tcpha = (tcpha_t *)&rptr[ip_hdr_len];
+ seg_seq = ntohl(tcpha->tha_seq);
+ seg_ack = ntohl(tcpha->tha_ack);
ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX);
seg_len = (int)(mp->b_wptr - rptr) -
- (ip_hdr_len + TCP_HDR_LENGTH(tcph));
+ (ip_hdr_len + TCP_HDR_LENGTH(tcpha));
if ((mp1 = mp->b_cont) != NULL && mp1->b_datap->db_type == M_DATA) {
do {
ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <=
@@ -12794,7 +9881,7 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2)
if (tcp->tcp_state == TCPS_TIME_WAIT) {
tcp_time_wait_processing(tcp, mp, seg_seq, seg_ack,
- seg_len, tcph);
+ seg_len, tcpha, ira);
return;
}
@@ -12809,7 +9896,7 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2)
tcp->tcp_last_recv_time = lbolt;
}
- flags = (unsigned int)tcph->th_flags[0] & 0xFF;
+ flags = (unsigned int)tcpha->tha_flags & 0xFF;
BUMP_LOCAL(tcp->tcp_ibsegs);
DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp);
@@ -12840,7 +9927,7 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2)
}
/* Update pointers into message */
iphdr = rptr = mp->b_rptr;
- tcph = (tcph_t *)&rptr[ip_hdr_len];
+ tcpha = (tcpha_t *)&rptr[ip_hdr_len];
if (SEQ_GT(seg_seq, tcp->tcp_rnxt)) {
/*
* Since we can't handle any data with this urgent
@@ -12849,13 +9936,29 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2)
* the urgent mark and generate the M_PCSIG,
* which we can do.
*/
- mp->b_wptr = (uchar_t *)tcph + TCP_HDR_LENGTH(tcph);
+ mp->b_wptr = (uchar_t *)tcpha + TCP_HDR_LENGTH(tcpha);
seg_len = 0;
}
}
switch (tcp->tcp_state) {
case TCPS_SYN_SENT:
+ if (connp->conn_final_sqp == NULL &&
+ tcp_outbound_squeue_switch && sqp != NULL) {
+ ASSERT(connp->conn_initial_sqp == connp->conn_sqp);
+ connp->conn_final_sqp = sqp;
+ if (connp->conn_final_sqp != connp->conn_sqp) {
+ DTRACE_PROBE1(conn__final__sqp__switch,
+ conn_t *, connp);
+ CONN_INC_REF(connp);
+ SQUEUE_SWITCH(connp, connp->conn_final_sqp);
+ SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
+ tcp_input_data, connp, ira, ip_squeue_flag,
+ SQTAG_CONNECT_FINISH);
+ return;
+ }
+ DTRACE_PROBE1(conn__final__sqp__same, conn_t *, connp);
+ }
if (flags & TH_ACK) {
/*
* Note that our stack cannot send data before a
@@ -12887,13 +9990,13 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2)
}
/* Process all TCP options. */
- tcp_process_options(tcp, tcph);
+ tcp_process_options(tcp, tcpha);
/*
* The following changes our rwnd to be a multiple of the
* MIN(peer MSS, our MSS) for performance reason.
*/
- (void) tcp_rwnd_set(tcp,
- MSS_ROUNDUP(tcp->tcp_recv_hiwater, tcp->tcp_mss));
+ (void) tcp_rwnd_set(tcp, MSS_ROUNDUP(connp->conn_rcvbuf,
+ tcp->tcp_mss));
/* Is the other end ECN capable? */
if (tcp->tcp_ecn_ok) {
@@ -12910,21 +10013,17 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2)
tcp->tcp_irs = seg_seq;
tcp->tcp_rack = seg_seq;
tcp->tcp_rnxt = seg_seq + 1;
- U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack);
+ tcp->tcp_tcpha->tha_ack = htonl(tcp->tcp_rnxt);
if (!TCP_IS_DETACHED(tcp)) {
/* Allocate room for SACK options if needed. */
- if (tcp->tcp_snd_sack_ok) {
- (void) proto_set_tx_wroff(tcp->tcp_rq, connp,
- tcp->tcp_hdr_len +
- TCPOPT_MAX_SACK_LEN +
- (tcp->tcp_loopback ? 0 :
- tcps->tcps_wroff_xtra));
- } else {
- (void) proto_set_tx_wroff(tcp->tcp_rq, connp,
- tcp->tcp_hdr_len +
- (tcp->tcp_loopback ? 0 :
- tcps->tcps_wroff_xtra));
- }
+ connp->conn_wroff = connp->conn_ht_iphc_len;
+ if (tcp->tcp_snd_sack_ok)
+ connp->conn_wroff += TCPOPT_MAX_SACK_LEN;
+ if (!tcp->tcp_loopback)
+ connp->conn_wroff += tcps->tcps_wroff_xtra;
+
+ (void) proto_set_tx_wroff(connp->conn_rq, connp,
+ connp->conn_wroff);
}
if (flags & TH_ACK) {
/*
@@ -12944,15 +10043,14 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2)
* sending up connection confirmation
*/
tcp->tcp_state = TCPS_ESTABLISHED;
- if (!tcp_conn_con(tcp, iphdr, tcph, mp,
- tcp->tcp_loopback ? &mp1 : NULL)) {
+ if (!tcp_conn_con(tcp, iphdr, mp,
+ tcp->tcp_loopback ? &mp1 : NULL, ira)) {
tcp->tcp_state = TCPS_SYN_SENT;
freemsg(mp);
return;
}
/* SYN was acked - making progress */
- if (tcp->tcp_ipversion == IPV6_VERSION)
- tcp->tcp_ip_forward_progress = B_TRUE;
+ tcp->tcp_ip_forward_progress = B_TRUE;
/* One for the SYN */
tcp->tcp_suna = tcp->tcp_iss + 1;
@@ -12983,7 +10081,7 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2)
tcp->tcp_swl1 = seg_seq;
tcp->tcp_swl2 = seg_ack;
- new_swnd = BE16_TO_U16(tcph->th_win);
+ new_swnd = ntohs(tcpha->tha_win);
tcp->tcp_swnd = new_swnd;
if (new_swnd > tcp->tcp_max_swnd)
tcp->tcp_max_swnd = new_swnd;
@@ -13022,22 +10120,25 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2)
tcp->tcp_ack_tid);
tcp->tcp_ack_tid = 0;
}
- tcp_send_data(tcp, tcp->tcp_wq, ack_mp);
+ tcp_send_data(tcp, ack_mp);
BUMP_LOCAL(tcp->tcp_obsegs);
BUMP_MIB(&tcps->tcps_mib, tcpOutAck);
if (!IPCL_IS_NONSTR(connp)) {
/* Send up T_CONN_CON */
- putnext(tcp->tcp_rq, mp1);
+ if (ira->ira_cred != NULL) {
+ mblk_setcred(mp1,
+ ira->ira_cred,
+ ira->ira_cpid);
+ }
+ putnext(connp->conn_rq, mp1);
} else {
- cred_t *cr;
- pid_t cpid;
-
- cr = msg_getcred(mp1, &cpid);
(*connp->conn_upcalls->
su_connected)
(connp->conn_upper_handle,
- tcp->tcp_connid, cr, cpid);
+ tcp->tcp_connid,
+ ira->ira_cred,
+ ira->ira_cpid);
freemsg(mp1);
}
@@ -13054,15 +10155,16 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2)
TCP_STAT(tcps, tcp_fusion_unfusable);
tcp->tcp_unfusable = B_TRUE;
if (!IPCL_IS_NONSTR(connp)) {
- putnext(tcp->tcp_rq, mp1);
+ if (ira->ira_cred != NULL) {
+ mblk_setcred(mp1, ira->ira_cred,
+ ira->ira_cpid);
+ }
+ putnext(connp->conn_rq, mp1);
} else {
- cred_t *cr;
- pid_t cpid;
-
- cr = msg_getcred(mp1, &cpid);
(*connp->conn_upcalls->su_connected)
(connp->conn_upper_handle,
- tcp->tcp_connid, cr, cpid);
+ tcp->tcp_connid, ira->ira_cred,
+ ira->ira_cpid);
freemsg(mp1);
}
}
@@ -13089,13 +10191,8 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2)
tcp->tcp_state = TCPS_SYN_RCVD;
mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, tcp->tcp_mss,
NULL, NULL, tcp->tcp_iss, B_FALSE, NULL, B_FALSE);
- if (mp1) {
- /*
- * See comment in tcp_conn_request() for why we use
- * the open() time pid here.
- */
- DB_CPID(mp1) = tcp->tcp_cpid;
- tcp_send_data(tcp, tcp->tcp_wq, mp1);
+ if (mp1 != NULL) {
+ tcp_send_data(tcp, mp1);
TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
}
freemsg(mp);
@@ -13146,9 +10243,20 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2)
conn_t *new_connp;
ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;
- new_connp = ipcl_classify(mp, connp->conn_zoneid, ipst);
+ /*
+ * Don't accept any input on a closed tcp as this TCP logically
+ * does not exist on the system. Don't proceed further with
+ * this TCP. For instance, this packet could trigger another
+ * close of this tcp which would be disastrous for tcp_refcnt.
+ * tcp_close_detached / tcp_clean_death / tcp_closei_local must
+ * be called at most once on a TCP. In this case we need to
+ * refeed the packet into the classifier and figure out where
+ * the packet should go.
+ */
+ new_connp = ipcl_classify(mp, ira, ipst);
if (new_connp != NULL) {
- tcp_reinput(new_connp, mp, connp->conn_sqp);
+ /* Drops ref on new_connp */
+ tcp_reinput(new_connp, mp, ira, ipst);
return;
}
/* We failed to classify. For now just drop the packet */
@@ -13194,7 +10302,7 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2)
tcp->tcp_kssl_ctx = NULL;
tcp->tcp_rnxt += seg_len;
- U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack);
+ tcp->tcp_tcpha->tha_ack = htonl(tcp->tcp_rnxt);
flags |= TH_ACK_NEEDED;
goto ack_check;
}
@@ -13205,13 +10313,13 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2)
return;
}
- mp->b_rptr = (uchar_t *)tcph + TCP_HDR_LENGTH(tcph);
- urp = BE16_TO_U16(tcph->th_urp) - TCP_OLD_URP_INTERPRETATION;
- new_swnd = BE16_TO_U16(tcph->th_win) <<
- ((tcph->th_flags[0] & TH_SYN) ? 0 : tcp->tcp_snd_ws);
+ mp->b_rptr = (uchar_t *)tcpha + TCP_HDR_LENGTH(tcpha);
+ urp = ntohs(tcpha->tha_urp) - TCP_OLD_URP_INTERPRETATION;
+ new_swnd = ntohs(tcpha->tha_win) <<
+ ((tcpha->tha_flags & TH_SYN) ? 0 : tcp->tcp_snd_ws);
if (tcp->tcp_snd_ts_ok) {
- if (!tcp_paws_check(tcp, tcph, &tcpopt)) {
+ if (!tcp_paws_check(tcp, tcpha, &tcpopt)) {
/*
* This segment is not acceptable.
* Drop it and send back an ACK.
@@ -13227,7 +10335,7 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2)
* SACK info in already updated in tcp_parse_options. Ignore
* all other TCP options...
*/
- (void) tcp_parse_options(tcph, &tcpopt);
+ (void) tcp_parse_options(tcpha, &tcpopt);
}
try_again:;
mss = tcp->tcp_mss;
@@ -13289,7 +10397,7 @@ try_again:;
* Adjust seg_len to the original value for tracing.
*/
seg_len -= gap;
- if (tcp->tcp_debug) {
+ if (connp->conn_debug) {
(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
"tcp_rput: unacceptable, gap %d, rgap %d, "
"flags 0x%x, seg_seq %u, seg_ack %u, "
@@ -13436,7 +10544,7 @@ try_again:;
return;
}
if (!TCP_IS_DETACHED(tcp) &&
- !putnextctl1(tcp->tcp_rq,
+ !putnextctl1(connp->conn_rq,
M_PCSIG, SIGURG)) {
/* Try again on the rexmit. */
freemsg(mp1);
@@ -13505,7 +10613,7 @@ ok:;
* same segment. In this case, we once again turn
* on ECN_ECHO.
*/
- if (tcp->tcp_ipversion == IPV4_VERSION) {
+ if (connp->conn_ipversion == IPV4_VERSION) {
uchar_t tos = ((ipha_t *)rptr)->ipha_type_of_service;
if ((tos & IPH_ECN_CE) == IPH_ECN_CE) {
@@ -13705,7 +10813,7 @@ ok:;
return;
}
if (!TCP_IS_DETACHED(tcp) &&
- !putnextctl1(tcp->tcp_rq, M_PCSIG,
+ !putnextctl1(connp->conn_rq, M_PCSIG,
SIGURG)) {
/* Try again on the rexmit. */
freemsg(mp1);
@@ -13739,7 +10847,7 @@ ok:;
} else if (tcp->tcp_urp_mark_mp != NULL) {
/*
* An allocation failure prevented the previous
- * tcp_rput_data from sending up the allocated
+ * tcp_input_data from sending up the allocated
* MSG*MARKNEXT message - send it up this time
* around.
*/
@@ -13775,14 +10883,14 @@ ok:;
*/
(void) adjmsg(mp,
urp - seg_len);
- tcp_rput_data(connp,
- mp, NULL);
+ tcp_input_data(connp,
+ mp, NULL, ira);
return;
}
(void) adjmsg(mp1, urp - seg_len);
/* Feed this piece back in. */
tmp_rnxt = tcp->tcp_rnxt;
- tcp_rput_data(connp, mp1, NULL);
+ tcp_input_data(connp, mp1, NULL, ira);
/*
* If the data passed back in was not
* processed (ie: bad ACK) sending
@@ -13811,13 +10919,13 @@ ok:;
*/
(void) adjmsg(mp,
urp + 1 - seg_len);
- tcp_rput_data(connp,
- mp, NULL);
+ tcp_input_data(connp,
+ mp, NULL, ira);
return;
}
(void) adjmsg(mp1, urp + 1 - seg_len);
tmp_rnxt = tcp->tcp_rnxt;
- tcp_rput_data(connp, mp1, NULL);
+ tcp_input_data(connp, mp1, NULL, ira);
/*
* If the data passed back in was not
* processed (ie: bad ACK) sending
@@ -13831,7 +10939,7 @@ ok:;
return;
}
}
- tcp_rput_data(connp, mp, NULL);
+ tcp_input_data(connp, mp, NULL, ira);
return;
}
/*
@@ -13960,7 +11068,7 @@ process_ack:
}
bytes_acked = (int)(seg_ack - tcp->tcp_suna);
- if (tcp->tcp_ipversion == IPV6_VERSION && bytes_acked > 0)
+ if (bytes_acked > 0)
tcp->tcp_ip_forward_progress = B_TRUE;
if (tcp->tcp_state == TCPS_SYN_RCVD) {
if ((tcp->tcp_conn.tcp_eager_conn_ind != NULL) &&
@@ -13983,7 +11091,7 @@ process_ack:
/*
* The listener also exists because of the refhold
- * done in tcp_conn_request. Its possible that it
+ * done in tcp_input_listener. Its possible that it
* might have closed. We will check that once we
* get inside listeners context.
*/
@@ -14005,12 +11113,12 @@ process_ack:
} else if (!tcp->tcp_loopback) {
SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp,
mp, tcp_send_conn_ind,
- listener->tcp_connp, SQ_FILL,
+ listener->tcp_connp, NULL, SQ_FILL,
SQTAG_TCP_CONN_IND);
} else {
SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp,
mp, tcp_send_conn_ind,
- listener->tcp_connp, SQ_PROCESS,
+ listener->tcp_connp, NULL, SQ_PROCESS,
SQTAG_TCP_CONN_IND);
}
}
@@ -14026,7 +11134,7 @@ process_ack:
*/
tcp->tcp_state = TCPS_ESTABLISHED;
if (tcp->tcp_active_open) {
- if (!tcp_conn_con(tcp, iphdr, tcph, mp, NULL)) {
+ if (!tcp_conn_con(tcp, iphdr, mp, NULL, ira)) {
freemsg(mp);
tcp->tcp_state = TCPS_SYN_RCVD;
return;
@@ -14044,8 +11152,7 @@ process_ack:
tcp->tcp_suna = tcp->tcp_iss + 1; /* One for the SYN */
bytes_acked--;
/* SYN was acked - making progress */
- if (tcp->tcp_ipversion == IPV6_VERSION)
- tcp->tcp_ip_forward_progress = B_TRUE;
+ tcp->tcp_ip_forward_progress = B_TRUE;
/*
* If SYN was retransmitted, need to reset all
@@ -14083,7 +11190,7 @@ process_ack:
/* Fuse when both sides are in ESTABLISHED state */
if (tcp->tcp_loopback && do_tcp_fusion)
- tcp_fuse(tcp, iphdr, tcph);
+ tcp_fuse(tcp, iphdr, tcpha);
}
/* This code follows 4.4BSD-Lite2 mostly. */
@@ -14388,7 +11495,7 @@ process_ack:
if (mp != NULL) {
BUMP_LOCAL(tcp->tcp_obsegs);
BUMP_MIB(&tcps->tcps_mib, tcpOutAck);
- tcp_send_data(tcp, tcp->tcp_wq, mp);
+ tcp_send_data(tcp, mp);
}
return;
}
@@ -14487,7 +11594,6 @@ process_ack:
}
} else {
tcp->tcp_rexmit = B_FALSE;
- tcp->tcp_xmit_zc_clean = B_FALSE;
tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
tcp->tcp_snd_burst = tcp->tcp_localnet ?
TCP_CWND_INFINITE : TCP_CWND_NORMAL;
@@ -14662,8 +11768,7 @@ fin_acked:
tcp->tcp_xmit_tail = NULL;
if (tcp->tcp_fin_sent) {
/* FIN was acked - making progress */
- if (tcp->tcp_ipversion == IPV6_VERSION &&
- !tcp->tcp_fin_acked)
+ if (!tcp->tcp_fin_acked)
tcp->tcp_ip_forward_progress = B_TRUE;
tcp->tcp_fin_acked = B_TRUE;
if (tcp->tcp_linger_tid != 0 &&
@@ -14781,7 +11886,7 @@ est:
* bit so this TIME-WAIT connection won't
* interfere with new ones.
*/
- tcp->tcp_exclbind = 0;
+ connp->conn_exclbind = 0;
if (!TCP_IS_DETACHED(tcp)) {
TCP_TIMER_RESTART(tcp,
tcps->tcps_time_wait_interval);
@@ -14805,8 +11910,8 @@ est:
if (!tcp->tcp_fin_rcvd) {
tcp->tcp_fin_rcvd = B_TRUE;
tcp->tcp_rnxt++;
- tcph = tcp->tcp_tcph;
- U32_TO_ABE32(tcp->tcp_rnxt, tcph->th_ack);
+ tcpha = tcp->tcp_tcpha;
+ tcpha->tha_ack = htonl(tcp->tcp_rnxt);
/*
* Generate the ordrel_ind at the end unless we
@@ -14815,7 +11920,7 @@ est:
* after tcp_accept is done.
*/
if (tcp->tcp_listener == NULL &&
- !TCP_IS_DETACHED(tcp) && (!tcp->tcp_hard_binding))
+ !TCP_IS_DETACHED(tcp) && !tcp->tcp_hard_binding)
flags |= TH_ORDREL_NEEDED;
switch (tcp->tcp_state) {
case TCPS_SYN_RCVD:
@@ -14836,7 +11941,7 @@ est:
* bit so this TIME-WAIT connection won't
* interfere with new ones.
*/
- tcp->tcp_exclbind = 0;
+ connp->conn_exclbind = 0;
if (!TCP_IS_DETACHED(tcp)) {
TCP_TIMER_RESTART(tcp,
tcps->tcps_time_wait_interval);
@@ -14872,7 +11977,7 @@ est:
freeb(mp1);
}
update_ack:
- tcph = tcp->tcp_tcph;
+ tcpha = tcp->tcp_tcpha;
tcp->tcp_rack_cnt++;
{
uint32_t cur_max;
@@ -14915,7 +12020,7 @@ update_ack:
}
}
tcp->tcp_rnxt += seg_len;
- U32_TO_ABE32(tcp->tcp_rnxt, tcph->th_ack);
+ tcpha->tha_ack = htonl(tcp->tcp_rnxt);
if (mp == NULL)
goto xmit_check;
@@ -14942,12 +12047,13 @@ update_ack:
/*
* Check for ancillary data changes compared to last segment.
*/
- if (tcp->tcp_ipv6_recvancillary != 0) {
- mp = tcp_rput_add_ancillary(tcp, mp, &ipp);
- ASSERT(mp != NULL);
+ if (connp->conn_recv_ancillary.crb_all != 0) {
+ mp = tcp_input_add_ancillary(tcp, mp, &ipp, ira);
+ if (mp == NULL)
+ return;
}
- if (tcp->tcp_listener || tcp->tcp_hard_binding) {
+ if (tcp->tcp_listener != NULL || tcp->tcp_hard_binding) {
/*
* Side queue inbound data until the accept happens.
* tcp_accept/tcp_rput drains this when the accept happens.
@@ -14961,9 +12067,9 @@ update_ack:
if (tcp->tcp_kssl_pending) {
DTRACE_PROBE1(kssl_mblk__ksslinput_pending,
mblk_t *, mp);
- tcp_kssl_input(tcp, mp);
+ tcp_kssl_input(tcp, mp, ira->ira_cred);
} else {
- tcp_rcv_enqueue(tcp, mp, seg_len);
+ tcp_rcv_enqueue(tcp, mp, seg_len, ira->ira_cred);
}
} else if (IPCL_IS_NONSTR(connp)) {
/*
@@ -15015,19 +12121,22 @@ update_ack:
(DB_TYPE(mp) == M_DATA)) {
DTRACE_PROBE1(kssl_mblk__ksslinput_data1,
mblk_t *, mp);
- tcp_kssl_input(tcp, mp);
+ tcp_kssl_input(tcp, mp, ira->ira_cred);
} else {
- putnext(tcp->tcp_rq, mp);
- if (!canputnext(tcp->tcp_rq))
+ if (is_system_labeled())
+ tcp_setcred_data(mp, ira);
+
+ putnext(connp->conn_rq, mp);
+ if (!canputnext(connp->conn_rq))
tcp->tcp_rwnd -= seg_len;
}
} else if ((tcp->tcp_kssl_ctx != NULL) &&
(DB_TYPE(mp) == M_DATA)) {
/* Does this need SSL processing first? */
DTRACE_PROBE1(kssl_mblk__ksslinput_data2, mblk_t *, mp);
- tcp_kssl_input(tcp, mp);
+ tcp_kssl_input(tcp, mp, ira->ira_cred);
} else if ((flags & (TH_PUSH|TH_FIN)) ||
- tcp->tcp_rcv_cnt + seg_len >= tcp->tcp_recv_hiwater >> 3) {
+ tcp->tcp_rcv_cnt + seg_len >= connp->conn_rcvbuf >> 3) {
if (tcp->tcp_rcv_list != NULL) {
/*
* Enqueue the new segment first and then
@@ -15042,11 +12151,15 @@ update_ack:
* canputnext() as tcp_rcv_drain() needs to
* call canputnext().
*/
- tcp_rcv_enqueue(tcp, mp, seg_len);
+ tcp_rcv_enqueue(tcp, mp, seg_len,
+ ira->ira_cred);
flags |= tcp_rcv_drain(tcp);
} else {
- putnext(tcp->tcp_rq, mp);
- if (!canputnext(tcp->tcp_rq))
+ if (is_system_labeled())
+ tcp_setcred_data(mp, ira);
+
+ putnext(connp->conn_rq, mp);
+ if (!canputnext(connp->conn_rq))
tcp->tcp_rwnd -= seg_len;
}
} else {
@@ -15054,7 +12167,7 @@ update_ack:
* Enqueue all packets when processing an mblk
* from the co queue and also enqueue normal packets.
*/
- tcp_rcv_enqueue(tcp, mp, seg_len);
+ tcp_rcv_enqueue(tcp, mp, seg_len, ira->ira_cred);
}
/*
* Make sure the timer is running if we have data waiting
@@ -15103,7 +12216,7 @@ xmit_check:
BUMP_MIB(&tcps->tcps_mib, tcpRetransSegs);
UPDATE_MIB(&tcps->tcps_mib,
tcpRetransBytes, snd_size);
- tcp_send_data(tcp, tcp->tcp_wq, mp1);
+ tcp_send_data(tcp, mp1);
}
}
if (flags & TH_NEED_SACK_REXMIT) {
@@ -15155,7 +12268,10 @@ ack_check:
ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg);
mp1 = tcp->tcp_urp_mark_mp;
tcp->tcp_urp_mark_mp = NULL;
- putnext(tcp->tcp_rq, mp1);
+ if (is_system_labeled())
+ tcp_setcred_data(mp1, ira);
+
+ putnext(connp->conn_rq, mp1);
#ifdef DEBUG
(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
"tcp_rput: sending zero-length %s %s",
@@ -15172,7 +12288,7 @@ ack_check:
mp1 = tcp_ack_mp(tcp);
if (mp1 != NULL) {
- tcp_send_data(tcp, tcp->tcp_wq, mp1);
+ tcp_send_data(tcp, mp1);
BUMP_LOCAL(tcp->tcp_obsegs);
BUMP_MIB(&tcps->tcps_mib, tcpOutAck);
}
@@ -15200,6 +12316,7 @@ ack_check:
* after tcp_accept is done.
*/
ASSERT(tcp->tcp_listener == NULL);
+ ASSERT(!tcp->tcp_detached);
if (IPCL_IS_NONSTR(connp)) {
ASSERT(tcp->tcp_ordrel_mp == NULL);
@@ -15220,7 +12337,7 @@ ack_check:
mp1 = tcp->tcp_ordrel_mp;
tcp->tcp_ordrel_mp = NULL;
tcp->tcp_ordrel_done = B_TRUE;
- putnext(tcp->tcp_rq, mp1);
+ putnext(connp->conn_rq, mp1);
}
done:
ASSERT(!(flags & TH_MARKNEXT_NEEDED));
@@ -15251,21 +12368,22 @@ tcp_update_xmit_tail(tcp_t *tcp, uint32_t snxt)
* segment passes the PAWS test, else returns B_FALSE.
*/
boolean_t
-tcp_paws_check(tcp_t *tcp, tcph_t *tcph, tcp_opt_t *tcpoptp)
+tcp_paws_check(tcp_t *tcp, tcpha_t *tcpha, tcp_opt_t *tcpoptp)
{
uint8_t flags;
int options;
uint8_t *up;
+ conn_t *connp = tcp->tcp_connp;
- flags = (unsigned int)tcph->th_flags[0] & 0xFF;
+ flags = (unsigned int)tcpha->tha_flags & 0xFF;
/*
* If timestamp option is aligned nicely, get values inline,
* otherwise call general routine to parse. Only do that
* if timestamp is the only option.
*/
- if (TCP_HDR_LENGTH(tcph) == (uint32_t)TCP_MIN_HEADER_LENGTH +
+ if (TCP_HDR_LENGTH(tcpha) == (uint32_t)TCP_MIN_HEADER_LENGTH +
TCPOPT_REAL_TS_LEN &&
- OK_32PTR((up = ((uint8_t *)tcph) +
+ OK_32PTR((up = ((uint8_t *)tcpha) +
TCP_MIN_HEADER_LENGTH)) &&
*(uint32_t *)up == TCPOPT_NOP_NOP_TSTAMP) {
tcpoptp->tcp_opt_ts_val = ABE32_TO_U32((up+4));
@@ -15278,7 +12396,7 @@ tcp_paws_check(tcp_t *tcp, tcph_t *tcph, tcp_opt_t *tcpoptp)
} else {
tcpoptp->tcp = NULL;
}
- options = tcp_parse_options(tcph, tcpoptp);
+ options = tcp_parse_options(tcpha, tcpoptp);
}
if (options & TCP_OPT_TSTAMP_PRESENT) {
@@ -15311,16 +12429,15 @@ tcp_paws_check(tcp_t *tcp, tcph_t *tcph, tcp_opt_t *tcpoptp)
*/
tcp->tcp_snd_ts_ok = B_FALSE;
- tcp->tcp_hdr_len -= TCPOPT_REAL_TS_LEN;
- tcp->tcp_tcp_hdr_len -= TCPOPT_REAL_TS_LEN;
- tcp->tcp_tcph->th_offset_and_rsrvd[0] -= (3 << 4);
+ connp->conn_ht_iphc_len -= TCPOPT_REAL_TS_LEN;
+ connp->conn_ht_ulp_len -= TCPOPT_REAL_TS_LEN;
+ tcp->tcp_tcpha->tha_offset_and_reserved -= (3 << 4);
/*
- * Adjust the tcp_mss accordingly. We also need to
- * adjust tcp_cwnd here in accordance with the new mss.
- * But we avoid doing a slow start here so as to not
- * to lose on the transfer rate built up so far.
+ * Adjust the tcp_mss and tcp_cwnd accordingly. We avoid
+ * doing a slow start here so as to not to lose on the
+ * transfer rate built up so far.
*/
- tcp_mss_set(tcp, tcp->tcp_mss + TCPOPT_REAL_TS_LEN, B_FALSE);
+ tcp_mss_set(tcp, tcp->tcp_mss + TCPOPT_REAL_TS_LEN);
if (tcp->tcp_snd_sack_ok) {
ASSERT(tcp->tcp_sack_info != NULL);
tcp->tcp_max_sack_blk = 4;
@@ -15338,38 +12455,37 @@ tcp_paws_check(tcp_t *tcp, tcph_t *tcph, tcp_opt_t *tcpoptp)
* when memory allocation fails we can just wait for the next data segment.
*/
static mblk_t *
-tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp)
+tcp_input_add_ancillary(tcp_t *tcp, mblk_t *mp, ip_pkt_t *ipp,
+ ip_recv_attr_t *ira)
{
struct T_optdata_ind *todi;
int optlen;
uchar_t *optptr;
struct T_opthdr *toh;
- uint_t addflag; /* Which pieces to add */
+ crb_t addflag; /* Which pieces to add */
mblk_t *mp1;
+ conn_t *connp = tcp->tcp_connp;
optlen = 0;
- addflag = 0;
+ addflag.crb_all = 0;
/* If app asked for pktinfo and the index has changed ... */
- if ((ipp->ipp_fields & IPPF_IFINDEX) &&
- ipp->ipp_ifindex != tcp->tcp_recvifindex &&
- (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO)) {
+ if (connp->conn_recv_ancillary.crb_ip_recvpktinfo &&
+ ira->ira_ruifindex != tcp->tcp_recvifindex) {
optlen += sizeof (struct T_opthdr) +
sizeof (struct in6_pktinfo);
- addflag |= TCP_IPV6_RECVPKTINFO;
+ addflag.crb_ip_recvpktinfo = 1;
}
/* If app asked for hoplimit and it has changed ... */
- if ((ipp->ipp_fields & IPPF_HOPLIMIT) &&
- ipp->ipp_hoplimit != tcp->tcp_recvhops &&
- (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVHOPLIMIT)) {
+ if (connp->conn_recv_ancillary.crb_ipv6_recvhoplimit &&
+ ipp->ipp_hoplimit != tcp->tcp_recvhops) {
optlen += sizeof (struct T_opthdr) + sizeof (uint_t);
- addflag |= TCP_IPV6_RECVHOPLIMIT;
+ addflag.crb_ipv6_recvhoplimit = 1;
}
/* If app asked for tclass and it has changed ... */
- if ((ipp->ipp_fields & IPPF_TCLASS) &&
- ipp->ipp_tclass != tcp->tcp_recvtclass &&
- (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVTCLASS)) {
+ if (connp->conn_recv_ancillary.crb_ipv6_recvtclass &&
+ ipp->ipp_tclass != tcp->tcp_recvtclass) {
optlen += sizeof (struct T_opthdr) + sizeof (uint_t);
- addflag |= TCP_IPV6_RECVTCLASS;
+ addflag.crb_ipv6_recvtclass = 1;
}
/*
* If app asked for hopbyhop headers and it has changed ...
@@ -15377,51 +12493,51 @@ tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp)
* a connected socket at all, (2) we're connected to at most one peer,
* (3) if anything changes, then it must be some other extra option.
*/
- if ((tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVHOPOPTS) &&
+ if (connp->conn_recv_ancillary.crb_ipv6_recvhopopts &&
ip_cmpbuf(tcp->tcp_hopopts, tcp->tcp_hopoptslen,
(ipp->ipp_fields & IPPF_HOPOPTS),
ipp->ipp_hopopts, ipp->ipp_hopoptslen)) {
- optlen += sizeof (struct T_opthdr) + ipp->ipp_hopoptslen -
- tcp->tcp_label_len;
- addflag |= TCP_IPV6_RECVHOPOPTS;
+ optlen += sizeof (struct T_opthdr) + ipp->ipp_hopoptslen;
+ addflag.crb_ipv6_recvhopopts = 1;
if (!ip_allocbuf((void **)&tcp->tcp_hopopts,
&tcp->tcp_hopoptslen, (ipp->ipp_fields & IPPF_HOPOPTS),
ipp->ipp_hopopts, ipp->ipp_hopoptslen))
return (mp);
}
/* If app asked for dst headers before routing headers ... */
- if ((tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVRTDSTOPTS) &&
- ip_cmpbuf(tcp->tcp_rtdstopts, tcp->tcp_rtdstoptslen,
- (ipp->ipp_fields & IPPF_RTDSTOPTS),
- ipp->ipp_rtdstopts, ipp->ipp_rtdstoptslen)) {
+ if (connp->conn_recv_ancillary.crb_ipv6_recvrthdrdstopts &&
+ ip_cmpbuf(tcp->tcp_rthdrdstopts, tcp->tcp_rthdrdstoptslen,
+ (ipp->ipp_fields & IPPF_RTHDRDSTOPTS),
+ ipp->ipp_rthdrdstopts, ipp->ipp_rthdrdstoptslen)) {
optlen += sizeof (struct T_opthdr) +
- ipp->ipp_rtdstoptslen;
- addflag |= TCP_IPV6_RECVRTDSTOPTS;
- if (!ip_allocbuf((void **)&tcp->tcp_rtdstopts,
- &tcp->tcp_rtdstoptslen, (ipp->ipp_fields & IPPF_RTDSTOPTS),
- ipp->ipp_rtdstopts, ipp->ipp_rtdstoptslen))
+ ipp->ipp_rthdrdstoptslen;
+ addflag.crb_ipv6_recvrthdrdstopts = 1;
+ if (!ip_allocbuf((void **)&tcp->tcp_rthdrdstopts,
+ &tcp->tcp_rthdrdstoptslen,
+ (ipp->ipp_fields & IPPF_RTHDRDSTOPTS),
+ ipp->ipp_rthdrdstopts, ipp->ipp_rthdrdstoptslen))
return (mp);
}
/* If app asked for routing headers and it has changed ... */
- if ((tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVRTHDR) &&
+ if (connp->conn_recv_ancillary.crb_ipv6_recvrthdr &&
ip_cmpbuf(tcp->tcp_rthdr, tcp->tcp_rthdrlen,
(ipp->ipp_fields & IPPF_RTHDR),
ipp->ipp_rthdr, ipp->ipp_rthdrlen)) {
optlen += sizeof (struct T_opthdr) + ipp->ipp_rthdrlen;
- addflag |= TCP_IPV6_RECVRTHDR;
+ addflag.crb_ipv6_recvrthdr = 1;
if (!ip_allocbuf((void **)&tcp->tcp_rthdr,
&tcp->tcp_rthdrlen, (ipp->ipp_fields & IPPF_RTHDR),
ipp->ipp_rthdr, ipp->ipp_rthdrlen))
return (mp);
}
/* If app asked for dest headers and it has changed ... */
- if ((tcp->tcp_ipv6_recvancillary &
- (TCP_IPV6_RECVDSTOPTS | TCP_OLD_IPV6_RECVDSTOPTS)) &&
+ if ((connp->conn_recv_ancillary.crb_ipv6_recvdstopts ||
+ connp->conn_recv_ancillary.crb_old_ipv6_recvdstopts) &&
ip_cmpbuf(tcp->tcp_dstopts, tcp->tcp_dstoptslen,
(ipp->ipp_fields & IPPF_DSTOPTS),
ipp->ipp_dstopts, ipp->ipp_dstoptslen)) {
optlen += sizeof (struct T_opthdr) + ipp->ipp_dstoptslen;
- addflag |= TCP_IPV6_RECVDSTOPTS;
+ addflag.crb_ipv6_recvdstopts = 1;
if (!ip_allocbuf((void **)&tcp->tcp_dstopts,
&tcp->tcp_dstoptslen, (ipp->ipp_fields & IPPF_DSTOPTS),
ipp->ipp_dstopts, ipp->ipp_dstoptslen))
@@ -15454,9 +12570,11 @@ tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp)
* If app asked for pktinfo and the index has changed ...
* Note that the local address never changes for the connection.
*/
- if (addflag & TCP_IPV6_RECVPKTINFO) {
+ if (addflag.crb_ip_recvpktinfo) {
struct in6_pktinfo *pkti;
+ uint_t ifindex;
+ ifindex = ira->ira_ruifindex;
toh = (struct T_opthdr *)optptr;
toh->level = IPPROTO_IPV6;
toh->name = IPV6_PKTINFO;
@@ -15464,19 +12582,15 @@ tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp)
toh->status = 0;
optptr += sizeof (*toh);
pkti = (struct in6_pktinfo *)optptr;
- if (tcp->tcp_ipversion == IPV6_VERSION)
- pkti->ipi6_addr = tcp->tcp_ip6h->ip6_src;
- else
- IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src,
- &pkti->ipi6_addr);
- pkti->ipi6_ifindex = ipp->ipp_ifindex;
+ pkti->ipi6_addr = connp->conn_laddr_v6;
+ pkti->ipi6_ifindex = ifindex;
optptr += sizeof (*pkti);
ASSERT(OK_32PTR(optptr));
/* Save as "last" value */
- tcp->tcp_recvifindex = ipp->ipp_ifindex;
+ tcp->tcp_recvifindex = ifindex;
}
/* If app asked for hoplimit and it has changed ... */
- if (addflag & TCP_IPV6_RECVHOPLIMIT) {
+ if (addflag.crb_ipv6_recvhoplimit) {
toh = (struct T_opthdr *)optptr;
toh->level = IPPROTO_IPV6;
toh->name = IPV6_HOPLIMIT;
@@ -15490,7 +12604,7 @@ tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp)
tcp->tcp_recvhops = ipp->ipp_hoplimit;
}
/* If app asked for tclass and it has changed ... */
- if (addflag & TCP_IPV6_RECVTCLASS) {
+ if (addflag.crb_ipv6_recvtclass) {
toh = (struct T_opthdr *)optptr;
toh->level = IPPROTO_IPV6;
toh->name = IPV6_TCLASS;
@@ -15503,40 +12617,38 @@ tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp)
/* Save as "last" value */
tcp->tcp_recvtclass = ipp->ipp_tclass;
}
- if (addflag & TCP_IPV6_RECVHOPOPTS) {
+ if (addflag.crb_ipv6_recvhopopts) {
toh = (struct T_opthdr *)optptr;
toh->level = IPPROTO_IPV6;
toh->name = IPV6_HOPOPTS;
- toh->len = sizeof (*toh) + ipp->ipp_hopoptslen -
- tcp->tcp_label_len;
+ toh->len = sizeof (*toh) + ipp->ipp_hopoptslen;
toh->status = 0;
optptr += sizeof (*toh);
- bcopy((uchar_t *)ipp->ipp_hopopts + tcp->tcp_label_len, optptr,
- ipp->ipp_hopoptslen - tcp->tcp_label_len);
- optptr += ipp->ipp_hopoptslen - tcp->tcp_label_len;
+ bcopy((uchar_t *)ipp->ipp_hopopts, optptr, ipp->ipp_hopoptslen);
+ optptr += ipp->ipp_hopoptslen;
ASSERT(OK_32PTR(optptr));
/* Save as last value */
ip_savebuf((void **)&tcp->tcp_hopopts, &tcp->tcp_hopoptslen,
(ipp->ipp_fields & IPPF_HOPOPTS),
ipp->ipp_hopopts, ipp->ipp_hopoptslen);
}
- if (addflag & TCP_IPV6_RECVRTDSTOPTS) {
+ if (addflag.crb_ipv6_recvrthdrdstopts) {
toh = (struct T_opthdr *)optptr;
toh->level = IPPROTO_IPV6;
toh->name = IPV6_RTHDRDSTOPTS;
- toh->len = sizeof (*toh) + ipp->ipp_rtdstoptslen;
+ toh->len = sizeof (*toh) + ipp->ipp_rthdrdstoptslen;
toh->status = 0;
optptr += sizeof (*toh);
- bcopy(ipp->ipp_rtdstopts, optptr, ipp->ipp_rtdstoptslen);
- optptr += ipp->ipp_rtdstoptslen;
+ bcopy(ipp->ipp_rthdrdstopts, optptr, ipp->ipp_rthdrdstoptslen);
+ optptr += ipp->ipp_rthdrdstoptslen;
ASSERT(OK_32PTR(optptr));
/* Save as last value */
- ip_savebuf((void **)&tcp->tcp_rtdstopts,
- &tcp->tcp_rtdstoptslen,
- (ipp->ipp_fields & IPPF_RTDSTOPTS),
- ipp->ipp_rtdstopts, ipp->ipp_rtdstoptslen);
+ ip_savebuf((void **)&tcp->tcp_rthdrdstopts,
+ &tcp->tcp_rthdrdstoptslen,
+ (ipp->ipp_fields & IPPF_RTHDRDSTOPTS),
+ ipp->ipp_rthdrdstopts, ipp->ipp_rthdrdstoptslen);
}
- if (addflag & TCP_IPV6_RECVRTHDR) {
+ if (addflag.crb_ipv6_recvrthdr) {
toh = (struct T_opthdr *)optptr;
toh->level = IPPROTO_IPV6;
toh->name = IPV6_RTHDR;
@@ -15551,7 +12663,7 @@ tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp)
(ipp->ipp_fields & IPPF_RTHDR),
ipp->ipp_rthdr, ipp->ipp_rthdrlen);
}
- if (addflag & (TCP_IPV6_RECVDSTOPTS | TCP_OLD_IPV6_RECVDSTOPTS)) {
+ if (addflag.crb_ipv6_recvdstopts) {
toh = (struct T_opthdr *)optptr;
toh->level = IPPROTO_IPV6;
toh->name = IPV6_DSTOPTS;
@@ -15570,99 +12682,13 @@ tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp)
return (mp);
}
-/*
- * tcp_rput_other is called by tcp_rput to handle everything other than M_DATA
- * messages.
- */
-void
-tcp_rput_other(tcp_t *tcp, mblk_t *mp)
-{
- uchar_t *rptr = mp->b_rptr;
- queue_t *q = tcp->tcp_rq;
- struct T_error_ack *tea;
-
- switch (mp->b_datap->db_type) {
- case M_PROTO:
- case M_PCPROTO:
- ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX);
- if ((mp->b_wptr - rptr) < sizeof (t_scalar_t))
- break;
- tea = (struct T_error_ack *)rptr;
- ASSERT(tea->PRIM_type != T_BIND_ACK);
- ASSERT(tea->ERROR_prim != O_T_BIND_REQ &&
- tea->ERROR_prim != T_BIND_REQ);
- switch (tea->PRIM_type) {
- case T_ERROR_ACK:
- if (tcp->tcp_debug) {
- (void) strlog(TCP_MOD_ID, 0, 1,
- SL_TRACE|SL_ERROR,
- "tcp_rput_other: case T_ERROR_ACK, "
- "ERROR_prim == %d",
- tea->ERROR_prim);
- }
- switch (tea->ERROR_prim) {
- case T_SVR4_OPTMGMT_REQ:
- if (tcp->tcp_drop_opt_ack_cnt > 0) {
- /* T_OPTMGMT_REQ generated by TCP */
- printf("T_SVR4_OPTMGMT_REQ failed "
- "%d/%d - dropped (cnt %d)\n",
- tea->TLI_error, tea->UNIX_error,
- tcp->tcp_drop_opt_ack_cnt);
- freemsg(mp);
- tcp->tcp_drop_opt_ack_cnt--;
- return;
- }
- break;
- }
- if (tea->ERROR_prim == T_SVR4_OPTMGMT_REQ &&
- tcp->tcp_drop_opt_ack_cnt > 0) {
- printf("T_SVR4_OPTMGMT_REQ failed %d/%d "
- "- dropped (cnt %d)\n",
- tea->TLI_error, tea->UNIX_error,
- tcp->tcp_drop_opt_ack_cnt);
- freemsg(mp);
- tcp->tcp_drop_opt_ack_cnt--;
- return;
- }
- break;
- case T_OPTMGMT_ACK:
- if (tcp->tcp_drop_opt_ack_cnt > 0) {
- /* T_OPTMGMT_REQ generated by TCP */
- freemsg(mp);
- tcp->tcp_drop_opt_ack_cnt--;
- return;
- }
- break;
- default:
- ASSERT(tea->ERROR_prim != T_UNBIND_REQ);
- break;
- }
- break;
- case M_FLUSH:
- if (*rptr & FLUSHR)
- flushq(q, FLUSHDATA);
- break;
- default:
- /* M_CTL will be directly sent to tcp_icmp_error() */
- ASSERT(DB_TYPE(mp) != M_CTL);
- break;
- }
- /*
- * Make sure we set this bit before sending the ACK for
- * bind. Otherwise accept could possibly run and free
- * this tcp struct.
- */
- ASSERT(q != NULL);
- putnext(q, mp);
-}
-
/* ARGSUSED */
static void
-tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2)
+tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
{
conn_t *connp = (conn_t *)arg;
tcp_t *tcp = connp->conn_tcp;
- queue_t *q = tcp->tcp_rq;
+ queue_t *q = connp->conn_rq;
tcp_stack_t *tcps = tcp->tcp_tcps;
ASSERT(!IPCL_IS_NONSTR(connp));
@@ -15683,7 +12709,7 @@ tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2)
if (canputnext(q)) {
/* Not flow-controlled, open rwnd */
- tcp->tcp_rwnd = tcp->tcp_recv_hiwater;
+ tcp->tcp_rwnd = connp->conn_rcvbuf;
/*
* Send back a window update immediately if TCP is above
@@ -15712,16 +12738,10 @@ tcp_rsrv(queue_t *q)
conn_t *connp = Q_TO_CONN(q);
tcp_t *tcp = connp->conn_tcp;
mblk_t *mp;
- tcp_stack_t *tcps = tcp->tcp_tcps;
/* No code does a putq on the read side */
ASSERT(q->q_first == NULL);
- /* Nothing to do for the default queue */
- if (q == tcps->tcps_g_q) {
- return;
- }
-
/*
* If tcp->tcp_rsrv_mp == NULL, it means that tcp_rsrv() has already
* been run. So just return.
@@ -15736,7 +12756,7 @@ tcp_rsrv(queue_t *q)
CONN_INC_REF(connp);
SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_rsrv_input, connp,
- SQ_PROCESS, SQTAG_TCP_RSRV);
+ NULL, SQ_PROCESS, SQTAG_TCP_RSRV);
}
/*
@@ -15746,8 +12766,8 @@ tcp_rsrv(queue_t *q)
*
* This function is called in 2 cases:
*
- * 1) Before data transfer begins, in tcp_accept_comm() for accepting a
- * connection (passive open) and in tcp_rput_data() for active connect.
+ * 1) Before data transfer begins, in tcp_input_listener() for accepting a
+ * connection (passive open) and in tcp_input_data() for active connect.
* This is called after tcp_mss_set() when the desired MSS value is known.
* This makes sure that our window size is a mutiple of the other side's
* MSS.
@@ -15766,6 +12786,7 @@ tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd)
uint32_t max_transmittable_rwnd;
boolean_t tcp_detached = TCP_IS_DETACHED(tcp);
tcp_stack_t *tcps = tcp->tcp_tcps;
+ conn_t *connp = tcp->tcp_connp;
/*
* Insist on a receive window that is at least
@@ -15782,7 +12803,7 @@ tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd)
ASSERT(peer_tcp != NULL);
sth_hiwat = tcp_fuse_set_rcv_hiwat(tcp, rwnd);
if (!tcp_detached) {
- (void) proto_set_rx_hiwat(tcp->tcp_rq, tcp->tcp_connp,
+ (void) proto_set_rx_hiwat(connp->conn_rq, connp,
sth_hiwat);
tcp_set_recv_threshold(tcp, sth_hiwat >> 3);
}
@@ -15797,11 +12818,10 @@ tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd)
return (sth_hiwat);
}
- if (tcp_detached) {
+ if (tcp_detached)
old_max_rwnd = tcp->tcp_rwnd;
- } else {
- old_max_rwnd = tcp->tcp_recv_hiwater;
- }
+ else
+ old_max_rwnd = connp->conn_rcvbuf;
/*
@@ -15854,9 +12874,14 @@ tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd)
* connection.)
*/
tcp->tcp_rwnd += rwnd - old_max_rwnd;
- tcp->tcp_recv_hiwater = rwnd;
+ connp->conn_rcvbuf = rwnd;
+
+ /* Are we already connected? */
+ if (tcp->tcp_tcpha != NULL) {
+ tcp->tcp_tcpha->tha_win =
+ htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws);
+ }
- U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, tcp->tcp_tcph->th_win);
if ((tcp->tcp_rcv_ws > 0) && rwnd > tcp->tcp_cwnd_max)
tcp->tcp_cwnd_max = rwnd;
@@ -15865,7 +12890,7 @@ tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd)
tcp_set_recv_threshold(tcp, rwnd >> 3);
- (void) proto_set_rx_hiwat(tcp->tcp_rq, tcp->tcp_connp, rwnd);
+ (void) proto_set_rx_hiwat(connp->conn_rq, connp, rwnd);
return (rwnd);
}
@@ -15944,7 +12969,7 @@ tcp_snmp_get(queue_t *q, mblk_t *mpctl)
connp = NULL;
while ((connp =
- ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) {
+ ipcl_get_next_conn(connfp, connp, IPCL_TCPCONN)) != NULL) {
tcp_t *tcp;
boolean_t needattr;
@@ -15992,11 +13017,10 @@ tcp_snmp_get(queue_t *q, mblk_t *mpctl)
needattr = B_TRUE;
break;
}
- if (connp->conn_fully_bound &&
- connp->conn_effective_cred != NULL) {
+ if (connp->conn_ixa->ixa_tsl != NULL) {
ts_label_t *tsl;
- tsl = crgetlabel(connp->conn_effective_cred);
+ tsl = connp->conn_ixa->ixa_tsl;
mlp.tme_flags |= MIB2_TMEF_IS_LABELED;
mlp.tme_doi = label2doi(tsl);
mlp.tme_label = *label2bslabel(tsl);
@@ -16004,12 +13028,17 @@ tcp_snmp_get(queue_t *q, mblk_t *mpctl)
}
/* Create a message to report on IPv6 entries */
- if (tcp->tcp_ipversion == IPV6_VERSION) {
- tce6.tcp6ConnLocalAddress = tcp->tcp_ip_src_v6;
- tce6.tcp6ConnRemAddress = tcp->tcp_remote_v6;
- tce6.tcp6ConnLocalPort = ntohs(tcp->tcp_lport);
- tce6.tcp6ConnRemPort = ntohs(tcp->tcp_fport);
- tce6.tcp6ConnIfIndex = tcp->tcp_bound_if;
+ if (connp->conn_ipversion == IPV6_VERSION) {
+ tce6.tcp6ConnLocalAddress = connp->conn_laddr_v6;
+ tce6.tcp6ConnRemAddress = connp->conn_faddr_v6;
+ tce6.tcp6ConnLocalPort = ntohs(connp->conn_lport);
+ tce6.tcp6ConnRemPort = ntohs(connp->conn_fport);
+ if (connp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET) {
+ tce6.tcp6ConnIfIndex =
+ connp->conn_ixa->ixa_scopeid;
+ } else {
+ tce6.tcp6ConnIfIndex = connp->conn_bound_if;
+ }
/* Don't want just anybody seeing these... */
if (ispriv) {
tce6.tcp6ConnEntryInfo.ce_snxt =
@@ -16041,9 +13070,9 @@ tcp_snmp_get(queue_t *q, mblk_t *mpctl)
tce6.tcp6ConnEntryInfo.ce_state = tcp->tcp_state;
tce6.tcp6ConnCreationProcess =
- (tcp->tcp_cpid < 0) ? MIB2_UNKNOWN_PROCESS :
- tcp->tcp_cpid;
- tce6.tcp6ConnCreationTime = tcp->tcp_open_time;
+ (connp->conn_cpid < 0) ? MIB2_UNKNOWN_PROCESS :
+ connp->conn_cpid;
+ tce6.tcp6ConnCreationTime = connp->conn_open_time;
(void) snmp_append_data2(mp6_conn_ctl->b_cont,
&mp6_conn_tail, (char *)&tce6, sizeof (tce6));
@@ -16059,21 +13088,21 @@ tcp_snmp_get(queue_t *q, mblk_t *mpctl)
* but don't have IPV6_V6ONLY set.
* (i.e. anything an IPv4 peer could connect to)
*/
- if (tcp->tcp_ipversion == IPV4_VERSION ||
+ if (connp->conn_ipversion == IPV4_VERSION ||
(tcp->tcp_state <= TCPS_LISTEN &&
- !tcp->tcp_connp->conn_ipv6_v6only &&
- IN6_IS_ADDR_UNSPECIFIED(&tcp->tcp_ip_src_v6))) {
- if (tcp->tcp_ipversion == IPV6_VERSION) {
+ !connp->conn_ipv6_v6only &&
+ IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6))) {
+ if (connp->conn_ipversion == IPV6_VERSION) {
tce.tcpConnRemAddress = INADDR_ANY;
tce.tcpConnLocalAddress = INADDR_ANY;
} else {
tce.tcpConnRemAddress =
- tcp->tcp_remote;
+ connp->conn_faddr_v4;
tce.tcpConnLocalAddress =
- tcp->tcp_ip_src;
+ connp->conn_laddr_v4;
}
- tce.tcpConnLocalPort = ntohs(tcp->tcp_lport);
- tce.tcpConnRemPort = ntohs(tcp->tcp_fport);
+ tce.tcpConnLocalPort = ntohs(connp->conn_lport);
+ tce.tcpConnRemPort = ntohs(connp->conn_fport);
/* Don't want just anybody seeing these... */
if (ispriv) {
tce.tcpConnEntryInfo.ce_snxt =
@@ -16107,9 +13136,10 @@ tcp_snmp_get(queue_t *q, mblk_t *mpctl)
tcp->tcp_state;
tce.tcpConnCreationProcess =
- (tcp->tcp_cpid < 0) ? MIB2_UNKNOWN_PROCESS :
- tcp->tcp_cpid;
- tce.tcpConnCreationTime = tcp->tcp_open_time;
+ (connp->conn_cpid < 0) ?
+ MIB2_UNKNOWN_PROCESS :
+ connp->conn_cpid;
+ tce.tcpConnCreationTime = connp->conn_open_time;
(void) snmp_append_data2(mp_conn_ctl->b_cont,
&mp_conn_tail, (char *)&tce, sizeof (tce));
@@ -16273,7 +13303,6 @@ tcp_timer(void *arg)
tcp_t *listener = tcp->tcp_listener;
if (tcp->tcp_syn_rcvd_timeout == 0 && (listener != NULL)) {
- ASSERT(tcp->tcp_rq == listener->tcp_rq);
/* it's our first timeout */
tcp->tcp_syn_rcvd_timeout = 1;
mutex_enter(&listener->tcp_eager_lock);
@@ -16295,7 +13324,7 @@ tcp_timer(void *arg)
cmn_err(CE_WARN, "High TCP connect timeout "
"rate! System (port %d) may be under a "
"SYN flood attack!",
- BE16_TO_U16(listener->tcp_tcph->th_lport));
+ ntohs(listener->tcp_connp->conn_lport));
listener->tcp_ip_addr_cache = kmem_zalloc(
IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t),
@@ -16363,7 +13392,7 @@ tcp_timer(void *arg)
* backoff.
*/
if (tcp->tcp_swnd == 0 || tcp->tcp_zero_win_probe) {
- if (tcp->tcp_debug) {
+ if (connp->conn_debug) {
(void) strlog(TCP_MOD_ID, 0, 1,
SL_TRACE, "tcp_timer: zero win");
}
@@ -16415,6 +13444,13 @@ tcp_timer(void *arg)
* 3. But 1 and 3 are exclusive.
*/
if (tcp->tcp_unsent != 0) {
+ /*
+ * Should not hold the zero-copy messages for too long.
+ */
+ if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean)
+ tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp,
+ tcp->tcp_xmit_head, B_TRUE);
+
if (tcp->tcp_cwnd == 0) {
/*
* Set tcp_cwnd to 1 MSS so that a
@@ -16477,7 +13513,7 @@ tcp_timer(void *arg)
(void) tcp_clean_death(tcp, 0, 24);
return;
default:
- if (tcp->tcp_debug) {
+ if (connp->conn_debug) {
(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR,
"tcp_timer: strange state (%d) %s",
tcp->tcp_state, tcp_display(tcp, NULL,
@@ -16485,8 +13521,16 @@ tcp_timer(void *arg)
}
return;
}
+
if ((ms = tcp->tcp_ms_we_have_waited) > second_threshold) {
/*
+ * Should not hold the zero-copy messages for too long.
+ */
+ if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean)
+ tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp,
+ tcp->tcp_xmit_head, B_TRUE);
+
+ /*
* For zero window probe, we need to send indefinitely,
* unless we have not heard from the other side for some
* time...
@@ -16529,11 +13573,13 @@ tcp_timer(void *arg)
tcp->tcp_ms_we_have_waited = second_threshold;
}
} else if (ms > first_threshold) {
- if (tcp->tcp_snd_zcopy_aware && (!tcp->tcp_xmit_zc_clean) &&
- tcp->tcp_xmit_head != NULL) {
- tcp->tcp_xmit_head =
- tcp_zcopy_backoff(tcp, tcp->tcp_xmit_head, 1);
- }
+ /*
+ * Should not hold the zero-copy messages for too long.
+ */
+ if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean)
+ tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp,
+ tcp->tcp_xmit_head, B_TRUE);
+
/*
* We have been retransmitting for too long... The RTT
* we calculated is probably incorrect. Reinitialize it.
@@ -16618,20 +13664,11 @@ tcp_timer(void *arg)
if (mp == NULL) {
return;
}
- /*
- * Attach credentials to retransmitted initial SYNs.
- * In theory we should use the credentials from the connect()
- * call to ensure that getpeerucred() on the peer will be correct.
- * But we assume that SYN's are not dropped for loopback connections.
- */
- if (tcp->tcp_state == TCPS_SYN_SENT) {
- mblk_setcred(mp, CONN_CRED(tcp->tcp_connp), tcp->tcp_cpid);
- }
tcp->tcp_csuna = tcp->tcp_snxt;
BUMP_MIB(&tcps->tcps_mib, tcpRetransSegs);
UPDATE_MIB(&tcps->tcps_mib, tcpRetransBytes, mss);
- tcp_send_data(tcp, tcp->tcp_wq, mp);
+ tcp_send_data(tcp, mp);
}
@@ -16639,7 +13676,6 @@ static int
tcp_do_unbind(conn_t *connp)
{
tcp_t *tcp = connp->conn_tcp;
- int error = 0;
switch (tcp->tcp_state) {
case TCPS_BOUND:
@@ -16659,41 +13695,36 @@ tcp_do_unbind(conn_t *connp)
}
mutex_exit(&tcp->tcp_eager_lock);
- if (tcp->tcp_ipversion == IPV4_VERSION) {
- tcp->tcp_ipha->ipha_src = 0;
- } else {
- V6_SET_ZERO(tcp->tcp_ip6h->ip6_src);
- }
- V6_SET_ZERO(tcp->tcp_ip_src_v6);
- bzero(tcp->tcp_tcph->th_lport, sizeof (tcp->tcp_tcph->th_lport));
+ connp->conn_laddr_v6 = ipv6_all_zeros;
+ connp->conn_saddr_v6 = ipv6_all_zeros;
tcp_bind_hash_remove(tcp);
tcp->tcp_state = TCPS_IDLE;
- tcp->tcp_mdt = B_FALSE;
- connp = tcp->tcp_connp;
- connp->conn_mdt_ok = B_FALSE;
- ipcl_hash_remove(connp);
+ ip_unbind(connp);
bzero(&connp->conn_ports, sizeof (connp->conn_ports));
- return (error);
+ return (0);
}
/* tcp_unbind is called by tcp_wput_proto to handle T_UNBIND_REQ messages. */
static void
tcp_tpi_unbind(tcp_t *tcp, mblk_t *mp)
{
- int error = tcp_do_unbind(tcp->tcp_connp);
+ conn_t *connp = tcp->tcp_connp;
+ int error;
+ error = tcp_do_unbind(connp);
if (error > 0) {
tcp_err_ack(tcp, mp, TSYSERR, error);
} else if (error < 0) {
tcp_err_ack(tcp, mp, -error, 0);
} else {
/* Send M_FLUSH according to TPI */
- (void) putnextctl1(tcp->tcp_rq, M_FLUSH, FLUSHRW);
+ (void) putnextctl1(connp->conn_rq, M_FLUSH, FLUSHRW);
mp = mi_tpi_ok_ack_alloc(mp);
- putnext(tcp->tcp_rq, mp);
+ if (mp != NULL)
+ putnext(connp->conn_rq, mp);
}
}
@@ -16764,7 +13795,7 @@ retry:
}
}
if (is_system_labeled() &&
- (i = tsol_next_port(crgetzone(tcp->tcp_cred), port,
+ (i = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred), port,
IPPROTO_TCP, B_TRUE)) != 0) {
port = i;
goto retry;
@@ -16796,7 +13827,7 @@ retry:
restart = B_TRUE;
}
if (is_system_labeled() &&
- (nextport = tsol_next_port(crgetzone(tcp->tcp_cred),
+ (nextport = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred),
next_priv_port, IPPROTO_TCP, B_FALSE)) != 0) {
next_priv_port = nextport;
goto retry;
@@ -16820,11 +13851,10 @@ struct {
*/
/* ARGSUSED */
static void
-tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2)
+tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
{
conn_t *connp = (conn_t *)arg;
tcp_t *tcp = connp->conn_tcp;
- queue_t *q = tcp->tcp_wq;
ASSERT(DB_TYPE(mp) != M_IOCTL);
/*
@@ -16851,7 +13881,7 @@ tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2)
tcp_wput_flush(tcp, mp);
break;
default:
- CALL_IP_WPUT(connp, q, mp);
+ ip_wput_nondata(connp->conn_wq, mp);
break;
}
}
@@ -16862,7 +13892,7 @@ tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2)
*/
/* ARGSUSED */
void
-tcp_output(void *arg, mblk_t *mp, void *arg2)
+tcp_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
{
int len;
int hdrlen;
@@ -16870,7 +13900,7 @@ tcp_output(void *arg, mblk_t *mp, void *arg2)
mblk_t *mp1;
uchar_t *rptr;
uint32_t snxt;
- tcph_t *tcph;
+ tcpha_t *tcpha;
struct datab *db;
uint32_t suna;
uint32_t mss;
@@ -16882,7 +13912,7 @@ tcp_output(void *arg, mblk_t *mp, void *arg2)
tcp_t *tcp = connp->conn_tcp;
uint32_t msize;
tcp_stack_t *tcps = tcp->tcp_tcps;
- ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;
+ ip_xmit_attr_t *ixa;
/*
* Try and ASSERT the minimum possible references on the
@@ -16903,25 +13933,18 @@ tcp_output(void *arg, mblk_t *mp, void *arg2)
tcp->tcp_squeue_bytes -= msize;
mutex_exit(&tcp->tcp_non_sq_lock);
- /* Check to see if this connection wants to be re-fused. */
- if (tcp->tcp_refuse) {
- if (tcp->tcp_ipversion == IPV4_VERSION &&
- !ipst->ips_ip4_observe.he_interested) {
- tcp_fuse(tcp, (uchar_t *)&tcp->tcp_saved_ipha,
- &tcp->tcp_saved_tcph);
- } else if (tcp->tcp_ipversion == IPV6_VERSION &&
- !ipst->ips_ip6_observe.he_interested) {
- tcp_fuse(tcp, (uchar_t *)&tcp->tcp_saved_ip6h,
- &tcp->tcp_saved_tcph);
- }
- }
/* Bypass tcp protocol for fused tcp loopback */
if (tcp->tcp_fused && tcp_fuse_output(tcp, mp, msize))
return;
mss = tcp->tcp_mss;
- if (tcp->tcp_xmit_zc_clean)
- mp = tcp_zcopy_backoff(tcp, mp, 0);
+ /*
+ * If ZEROCOPY has turned off, try not to send any zero-copy message
+ * down. Do backoff, now.
+ */
+ if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_snd_zcopy_on)
+ mp = tcp_zcopy_backoff(tcp, mp, B_FALSE);
+
ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
len = (int)(mp->b_wptr - mp->b_rptr);
@@ -16977,8 +14000,7 @@ tcp_output(void *arg, mblk_t *mp, void *arg2)
* start again to get back the connection's "self-clock" as
* described in VJ's paper.
*
- * Refer to the comment in tcp_mss_set() for the calculation
- * of tcp_cwnd after idle.
+ * Reinitialize tcp_cwnd after idle.
*/
if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet &&
(TICK_TO_MSEC(lbolt - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) {
@@ -16999,7 +14021,7 @@ tcp_output(void *arg, mblk_t *mp, void *arg2)
mutex_enter(&tcp->tcp_non_sq_lock);
if (tcp->tcp_flow_stopped &&
- TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) {
+ TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) {
tcp_clrqfull(tcp);
}
mutex_exit(&tcp->tcp_non_sq_lock);
@@ -17046,43 +14068,43 @@ tcp_output(void *arg, mblk_t *mp, void *arg2)
mp->b_next = (mblk_t *)(uintptr_t)snxt;
/* adjust tcp header information */
- tcph = tcp->tcp_tcph;
- tcph->th_flags[0] = (TH_ACK|TH_PUSH);
+ tcpha = tcp->tcp_tcpha;
+ tcpha->tha_flags = (TH_ACK|TH_PUSH);
- sum = len + tcp->tcp_tcp_hdr_len + tcp->tcp_sum;
+ sum = len + connp->conn_ht_ulp_len + connp->conn_sum;
sum = (sum >> 16) + (sum & 0xFFFF);
- U16_TO_ABE16(sum, tcph->th_sum);
+ tcpha->tha_sum = htons(sum);
- U32_TO_ABE32(snxt, tcph->th_seq);
+ tcpha->tha_seq = htonl(snxt);
BUMP_MIB(&tcps->tcps_mib, tcpOutDataSegs);
UPDATE_MIB(&tcps->tcps_mib, tcpOutDataBytes, len);
BUMP_LOCAL(tcp->tcp_obsegs);
/* Update the latest receive window size in TCP header. */
- U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws,
- tcph->th_win);
+ tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws);
tcp->tcp_last_sent_len = (ushort_t)len;
- plen = len + tcp->tcp_hdr_len;
+ plen = len + connp->conn_ht_iphc_len;
- if (tcp->tcp_ipversion == IPV4_VERSION) {
+ ixa = connp->conn_ixa;
+ ixa->ixa_pktlen = plen;
+
+ if (ixa->ixa_flags & IXAF_IS_IPV4) {
tcp->tcp_ipha->ipha_length = htons(plen);
} else {
- tcp->tcp_ip6h->ip6_plen = htons(plen -
- ((char *)&tcp->tcp_ip6h[1] - tcp->tcp_iphc));
+ tcp->tcp_ip6h->ip6_plen = htons(plen - IPV6_HDR_LEN);
}
/* see if we need to allocate a mblk for the headers */
- hdrlen = tcp->tcp_hdr_len;
+ hdrlen = connp->conn_ht_iphc_len;
rptr = mp1->b_rptr - hdrlen;
db = mp1->b_datap;
if ((db->db_ref != 2) || rptr < db->db_base ||
(!OK_32PTR(rptr))) {
/* NOTE: we assume allocb returns an OK_32PTR */
- mp = allocb(tcp->tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH +
- tcps->tcps_wroff_xtra, BPRI_MED);
+ mp = allocb(hdrlen + tcps->tcps_wroff_xtra, BPRI_MED);
if (!mp) {
freemsg(mp1);
goto no_memory;
@@ -17090,7 +14112,6 @@ tcp_output(void *arg, mblk_t *mp, void *arg2)
mp->b_cont = mp1;
mp1 = mp;
/* Leave room for Link Level header */
- /* hdrlen = tcp->tcp_hdr_len; */
rptr = &mp1->b_rptr[tcps->tcps_wroff_xtra];
mp1->b_wptr = &rptr[hdrlen];
}
@@ -17099,16 +14120,16 @@ tcp_output(void *arg, mblk_t *mp, void *arg2)
/* Fill in the timestamp option. */
if (tcp->tcp_snd_ts_ok) {
U32_TO_BE32((uint32_t)lbolt,
- (char *)tcph+TCP_MIN_HEADER_LENGTH+4);
+ (char *)tcpha + TCP_MIN_HEADER_LENGTH+4);
U32_TO_BE32(tcp->tcp_ts_recent,
- (char *)tcph+TCP_MIN_HEADER_LENGTH+8);
+ (char *)tcpha + TCP_MIN_HEADER_LENGTH+8);
} else {
- ASSERT(tcp->tcp_tcp_hdr_len == TCP_MIN_HEADER_LENGTH);
+ ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH);
}
/* copy header into outgoing packet */
dst = (ipaddr_t *)rptr;
- src = (ipaddr_t *)tcp->tcp_iphc;
+ src = (ipaddr_t *)connp->conn_ht_iphc;
dst[0] = src[0];
dst[1] = src[1];
dst[2] = src[2];
@@ -17135,21 +14156,22 @@ tcp_output(void *arg, mblk_t *mp, void *arg2)
if (tcp->tcp_ecn_ok) {
SET_ECT(tcp, rptr);
- tcph = (tcph_t *)(rptr + tcp->tcp_ip_hdr_len);
+ tcpha = (tcpha_t *)(rptr + ixa->ixa_ip_hdr_length);
if (tcp->tcp_ecn_echo_on)
- tcph->th_flags[0] |= TH_ECE;
+ tcpha->tha_flags |= TH_ECE;
if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) {
- tcph->th_flags[0] |= TH_CWR;
+ tcpha->tha_flags |= TH_CWR;
tcp->tcp_ecn_cwr_sent = B_TRUE;
}
}
if (tcp->tcp_ip_forward_progress) {
- ASSERT(tcp->tcp_ipversion == IPV6_VERSION);
- *(uint32_t *)mp1->b_rptr |= IP_FORWARD_PROG;
tcp->tcp_ip_forward_progress = B_FALSE;
+ connp->conn_ixa->ixa_flags |= IXAF_REACH_CONF;
+ } else {
+ connp->conn_ixa->ixa_flags &= ~IXAF_REACH_CONF;
}
- tcp_send_data(tcp, tcp->tcp_wq, mp1);
+ tcp_send_data(tcp, mp1);
return;
/*
@@ -17166,29 +14188,27 @@ slow:
tcp_wput_data(tcp, NULL, B_FALSE);
}
+/*
+ * This runs at the tail end of accept processing on the squeue of the
+ * new connection.
+ */
/* ARGSUSED */
void
-tcp_accept_finish(void *arg, mblk_t *mp, void *arg2)
+tcp_accept_finish(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
{
conn_t *connp = (conn_t *)arg;
tcp_t *tcp = connp->conn_tcp;
- queue_t *q = tcp->tcp_rq;
- struct tcp_options *tcpopt;
+ queue_t *q = connp->conn_rq;
tcp_stack_t *tcps = tcp->tcp_tcps;
-
/* socket options */
- uint_t sopp_flags;
- ssize_t sopp_rxhiwat;
- ssize_t sopp_maxblk;
- ushort_t sopp_wroff;
- ushort_t sopp_tail;
- ushort_t sopp_copyopt;
+ struct sock_proto_props sopp;
- tcpopt = (struct tcp_options *)mp->b_rptr;
+ /* We should just receive a single mblk that fits a T_discon_ind */
+ ASSERT(mp->b_cont == NULL);
/*
* Drop the eager's ref on the listener, that was placed when
- * this eager began life in tcp_conn_request.
+ * this eager began life in tcp_input_listener.
*/
CONN_DEC_REF(tcp->tcp_saved_listener->tcp_connp);
if (IPCL_IS_NONSTR(connp)) {
@@ -17227,15 +14247,12 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2)
* memory allocation failure problems. We know
* that the size of the incoming mblk i.e.
* stroptions is greater than sizeof
- * T_discon_ind. So the reallocb below can't
- * fail.
+ * T_discon_ind.
*/
- freemsg(mp->b_cont);
- mp->b_cont = NULL;
ASSERT(DB_REF(mp) == 1);
- mp = reallocb(mp, sizeof (struct T_discon_ind),
- B_FALSE);
- ASSERT(mp != NULL);
+ ASSERT(MBLKSIZE(mp) >=
+ sizeof (struct T_discon_ind));
+
DB_TYPE(mp) = M_PROTO;
((union T_primitives *)mp->b_rptr)->type =
T_DISCON_IND;
@@ -17251,41 +14268,21 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2)
mp->b_wptr = mp->b_rptr +
sizeof (struct T_discon_ind);
putnext(q, mp);
- return;
}
}
- if (tcp->tcp_hard_binding) {
- tcp->tcp_hard_binding = B_FALSE;
- tcp->tcp_hard_bound = B_TRUE;
- }
+ tcp->tcp_hard_binding = B_FALSE;
return;
}
- if (tcpopt->to_flags & TCPOPT_BOUNDIF) {
- int boundif = tcpopt->to_boundif;
- uint_t len = sizeof (int);
-
- (void) tcp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, IPPROTO_IPV6,
- IPV6_BOUND_IF, len, (uchar_t *)&boundif, &len,
- (uchar_t *)&boundif, NULL, tcp->tcp_cred, NULL);
- }
- if (tcpopt->to_flags & TCPOPT_RECVPKTINFO) {
- uint_t on = 1;
- uint_t len = sizeof (uint_t);
- (void) tcp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, IPPROTO_IPV6,
- IPV6_RECVPKTINFO, len, (uchar_t *)&on, &len,
- (uchar_t *)&on, NULL, tcp->tcp_cred, NULL);
- }
-
/*
- * Set max window size (tcp_recv_hiwater) of the acceptor.
+ * Set max window size (conn_rcvbuf) of the acceptor.
*/
if (tcp->tcp_rcv_list == NULL) {
/*
* Recv queue is empty, tcp_rwnd should not have changed.
* That means it should be equal to the listener's tcp_rwnd.
*/
- tcp->tcp_recv_hiwater = tcp->tcp_rwnd;
+ connp->conn_rcvbuf = tcp->tcp_rwnd;
} else {
#ifdef DEBUG
mblk_t *tmp;
@@ -17300,19 +14297,19 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2)
ASSERT(cnt != 0 && tcp->tcp_rcv_cnt == cnt);
#endif
/* There is some data, add them back to get the max. */
- tcp->tcp_recv_hiwater = tcp->tcp_rwnd + tcp->tcp_rcv_cnt;
+ connp->conn_rcvbuf = tcp->tcp_rwnd + tcp->tcp_rcv_cnt;
}
/*
* This is the first time we run on the correct
* queue after tcp_accept. So fix all the q parameters
* here.
*/
- sopp_flags = SOCKOPT_RCVHIWAT | SOCKOPT_MAXBLK | SOCKOPT_WROFF;
- sopp_maxblk = tcp_maxpsz_set(tcp, B_FALSE);
+ sopp.sopp_flags = SOCKOPT_RCVHIWAT | SOCKOPT_MAXBLK | SOCKOPT_WROFF;
+ sopp.sopp_maxblk = tcp_maxpsz_set(tcp, B_FALSE);
- sopp_rxhiwat = tcp->tcp_fused ?
- tcp_fuse_set_rcv_hiwat(tcp, tcp->tcp_recv_hiwater) :
- tcp->tcp_recv_hiwater;
+ sopp.sopp_rxhiwat = tcp->tcp_fused ?
+ tcp_fuse_set_rcv_hiwat(tcp, connp->conn_rcvbuf) :
+ connp->conn_rcvbuf;
/*
* Determine what write offset value to use depending on SACK and
@@ -17328,18 +14325,18 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2)
* since it would reduce the amount of work done by kmem.
* Non-fused tcp loopback case is handled separately below.
*/
- sopp_wroff = 0;
+ sopp.sopp_wroff = 0;
/*
* Update the peer's transmit parameters according to
* our recently calculated high water mark value.
*/
(void) tcp_maxpsz_set(tcp->tcp_loopback_peer, B_TRUE);
} else if (tcp->tcp_snd_sack_ok) {
- sopp_wroff = tcp->tcp_hdr_len + TCPOPT_MAX_SACK_LEN +
+ sopp.sopp_wroff = connp->conn_ht_iphc_allocated +
(tcp->tcp_loopback ? 0 : tcps->tcps_wroff_xtra);
} else {
- sopp_wroff = tcp->tcp_hdr_len + (tcp->tcp_loopback ? 0 :
- tcps->tcps_wroff_xtra);
+ sopp.sopp_wroff = connp->conn_ht_iphc_len +
+ (tcp->tcp_loopback ? 0 : tcps->tcps_wroff_xtra);
}
/*
@@ -17354,30 +14351,22 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2)
* costs.
*/
if (tcp->tcp_kssl_ctx != NULL) {
- sopp_wroff += SSL3_WROFFSET;
+ sopp.sopp_wroff += SSL3_WROFFSET;
- sopp_flags |= SOCKOPT_TAIL;
- sopp_tail = SSL3_MAX_TAIL_LEN;
+ sopp.sopp_flags |= SOCKOPT_TAIL;
+ sopp.sopp_tail = SSL3_MAX_TAIL_LEN;
- sopp_flags |= SOCKOPT_ZCOPY;
- sopp_copyopt = ZCVMUNSAFE;
+ sopp.sopp_flags |= SOCKOPT_ZCOPY;
+ sopp.sopp_zcopyflag = ZCVMUNSAFE;
- sopp_maxblk = SSL3_MAX_RECORD_LEN;
+ sopp.sopp_maxblk = SSL3_MAX_RECORD_LEN;
}
/* Send the options up */
if (IPCL_IS_NONSTR(connp)) {
- struct sock_proto_props sopp;
-
- sopp.sopp_flags = sopp_flags;
- sopp.sopp_wroff = sopp_wroff;
- sopp.sopp_maxblk = sopp_maxblk;
- sopp.sopp_rxhiwat = sopp_rxhiwat;
- if (sopp_flags & SOCKOPT_TAIL) {
+ if (sopp.sopp_flags & SOCKOPT_TAIL) {
ASSERT(tcp->tcp_kssl_ctx != NULL);
- ASSERT(sopp_flags & SOCKOPT_ZCOPY);
- sopp.sopp_tail = sopp_tail;
- sopp.sopp_zcopyflag = sopp_copyopt;
+ ASSERT(sopp.sopp_flags & SOCKOPT_ZCOPY);
}
if (tcp->tcp_loopback) {
sopp.sopp_flags |= SOCKOPT_LOOPBACK;
@@ -17385,34 +14374,40 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2)
}
(*connp->conn_upcalls->su_set_proto_props)
(connp->conn_upper_handle, &sopp);
+ freemsg(mp);
} else {
+ /*
+ * Let us reuse the incoming mblk to avoid
+ * memory allocation failure problems. We know
+ * that the size of the incoming mblk is at least
+ * stroptions
+ */
struct stroptions *stropt;
- mblk_t *stropt_mp = allocb(sizeof (struct stroptions), BPRI_HI);
- if (stropt_mp == NULL) {
- tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
- return;
- }
- DB_TYPE(stropt_mp) = M_SETOPTS;
- stropt = (struct stroptions *)stropt_mp->b_rptr;
- stropt_mp->b_wptr += sizeof (struct stroptions);
+
+ ASSERT(DB_REF(mp) == 1);
+ ASSERT(MBLKSIZE(mp) >= sizeof (struct stroptions));
+
+ DB_TYPE(mp) = M_SETOPTS;
+ stropt = (struct stroptions *)mp->b_rptr;
+ mp->b_wptr = mp->b_rptr + sizeof (struct stroptions);
+ stropt = (struct stroptions *)mp->b_rptr;
stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK;
- stropt->so_hiwat = sopp_rxhiwat;
- stropt->so_wroff = sopp_wroff;
- stropt->so_maxblk = sopp_maxblk;
+ stropt->so_hiwat = sopp.sopp_rxhiwat;
+ stropt->so_wroff = sopp.sopp_wroff;
+ stropt->so_maxblk = sopp.sopp_maxblk;
- if (sopp_flags & SOCKOPT_TAIL) {
+ if (sopp.sopp_flags & SOCKOPT_TAIL) {
ASSERT(tcp->tcp_kssl_ctx != NULL);
stropt->so_flags |= SO_TAIL | SO_COPYOPT;
- stropt->so_tail = sopp_tail;
- stropt->so_copyopt = sopp_copyopt;
+ stropt->so_tail = sopp.sopp_tail;
+ stropt->so_copyopt = sopp.sopp_zcopyflag;
}
/* Send the options up */
- putnext(q, stropt_mp);
+ putnext(q, mp);
}
- freemsg(mp);
/*
* Pass up any data and/or a fin that has been received.
*
@@ -17432,7 +14427,7 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2)
if (!tcp->tcp_fused && (*connp->conn_upcalls->su_recv)
(connp->conn_upper_handle, NULL, 0, 0, &error,
&push) >= 0) {
- tcp->tcp_rwnd = tcp->tcp_recv_hiwater;
+ tcp->tcp_rwnd = connp->conn_rcvbuf;
if (tcp->tcp_state >= TCPS_ESTABLISHED &&
tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) {
tcp_xmit_ctl(NULL,
@@ -17463,7 +14458,7 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2)
/* We drain directly in case of fused tcp loopback */
if (!tcp->tcp_fused && canputnext(q)) {
- tcp->tcp_rwnd = tcp->tcp_recv_hiwater;
+ tcp->tcp_rwnd = connp->conn_rcvbuf;
if (tcp->tcp_state >= TCPS_ESTABLISHED &&
tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) {
tcp_xmit_ctl(NULL,
@@ -17508,12 +14503,9 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2)
putnext(q, mp);
}
}
- if (tcp->tcp_hard_binding) {
- tcp->tcp_hard_binding = B_FALSE;
- tcp->tcp_hard_bound = B_TRUE;
- }
+ tcp->tcp_hard_binding = B_FALSE;
- if (tcp->tcp_ka_enabled) {
+ if (connp->conn_keepalive) {
tcp->tcp_ka_last_intrvl = 0;
tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_killer,
MSEC_TO_TICK(tcp->tcp_ka_interval));
@@ -17535,14 +14527,14 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2)
/*
* The function called through squeue to get behind listener's perimeter to
- * send a deffered conn_ind.
+ * send a deferred conn_ind.
*/
/* ARGSUSED */
void
-tcp_send_pending(void *arg, mblk_t *mp, void *arg2)
+tcp_send_pending(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
{
- conn_t *connp = (conn_t *)arg;
- tcp_t *listener = connp->conn_tcp;
+ conn_t *lconnp = (conn_t *)arg;
+ tcp_t *listener = lconnp->conn_tcp;
struct T_conn_ind *conn_ind;
tcp_t *tcp;
@@ -17560,29 +14552,34 @@ tcp_send_pending(void *arg, mblk_t *mp, void *arg2)
return;
}
- tcp_ulp_newconn(connp, tcp->tcp_connp, mp);
+ tcp_ulp_newconn(lconnp, tcp->tcp_connp, mp);
}
-/* ARGSUSED */
+/*
+ * Common to TPI and sockfs accept code.
+ */
+/* ARGSUSED2 */
static int
tcp_accept_common(conn_t *lconnp, conn_t *econnp, cred_t *cr)
{
tcp_t *listener, *eager;
- mblk_t *opt_mp;
- struct tcp_options *tcpopt;
+ mblk_t *discon_mp;
listener = lconnp->conn_tcp;
ASSERT(listener->tcp_state == TCPS_LISTEN);
eager = econnp->conn_tcp;
ASSERT(eager->tcp_listener != NULL);
- ASSERT(eager->tcp_rq != NULL);
+ /*
+ * Pre allocate the discon_ind mblk also. tcp_accept_finish will
+ * use it if something failed.
+ */
+ discon_mp = allocb(MAX(sizeof (struct T_discon_ind),
+ sizeof (struct stroptions)), BPRI_HI);
- opt_mp = allocb(sizeof (struct tcp_options), BPRI_HI);
- if (opt_mp == NULL) {
+ if (discon_mp == NULL) {
return (-TPROTO);
}
- bzero((char *)opt_mp->b_rptr, sizeof (struct tcp_options));
eager->tcp_issocket = B_TRUE;
econnp->conn_zoneid = listener->tcp_connp->conn_zoneid;
@@ -17607,24 +14604,6 @@ tcp_accept_common(conn_t *lconnp, conn_t *econnp, cred_t *cr)
*/
ASSERT(econnp->conn_ref >= 3);
- opt_mp->b_datap->db_type = M_SETOPTS;
- opt_mp->b_wptr += sizeof (struct tcp_options);
-
- /*
- * Prepare for inheriting IPV6_BOUND_IF and IPV6_RECVPKTINFO
- * from listener to acceptor.
- */
- tcpopt = (struct tcp_options *)opt_mp->b_rptr;
- tcpopt->to_flags = 0;
-
- if (listener->tcp_bound_if != 0) {
- tcpopt->to_flags |= TCPOPT_BOUNDIF;
- tcpopt->to_boundif = listener->tcp_bound_if;
- }
- if (listener->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO) {
- tcpopt->to_flags |= TCPOPT_RECVPKTINFO;
- }
-
mutex_enter(&listener->tcp_eager_lock);
if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) {
@@ -17686,7 +14665,7 @@ tcp_accept_common(conn_t *lconnp, conn_t *econnp, cred_t *cr)
/* Need to get inside the listener perimeter */
CONN_INC_REF(listener->tcp_connp);
SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp, mp1,
- tcp_send_pending, listener->tcp_connp, SQ_FILL,
+ tcp_send_pending, listener->tcp_connp, NULL, SQ_FILL,
SQTAG_TCP_SEND_PENDING);
}
no_more_eagers:
@@ -17700,8 +14679,8 @@ no_more_eagers:
* before sending the conn_ind in tcp_send_conn_ind.
* The ref will be dropped in tcp_accept_finish().
*/
- SQUEUE_ENTER_ONE(econnp->conn_sqp, opt_mp, tcp_accept_finish,
- econnp, SQ_NODRAIN, SQTAG_TCP_ACCEPT_FINISH_Q0);
+ SQUEUE_ENTER_ONE(econnp->conn_sqp, discon_mp, tcp_accept_finish,
+ econnp, NULL, SQ_NODRAIN, SQTAG_TCP_ACCEPT_FINISH_Q0);
return (0);
}
@@ -17712,7 +14691,6 @@ tcp_accept(sock_lower_handle_t lproto_handle,
{
conn_t *lconnp, *econnp;
tcp_t *listener, *eager;
- tcp_stack_t *tcps;
lconnp = (conn_t *)lproto_handle;
listener = lconnp->conn_tcp;
@@ -17720,7 +14698,6 @@ tcp_accept(sock_lower_handle_t lproto_handle,
econnp = (conn_t *)eproto_handle;
eager = econnp->conn_tcp;
ASSERT(eager->tcp_listener != NULL);
- tcps = eager->tcp_tcps;
/*
* It is OK to manipulate these fields outside the eager's squeue
@@ -17732,19 +14709,6 @@ tcp_accept(sock_lower_handle_t lproto_handle,
econnp->conn_upper_handle = sock_handle;
econnp->conn_upcalls = lconnp->conn_upcalls;
ASSERT(IPCL_IS_NONSTR(econnp));
- /*
- * Create helper stream if it is a non-TPI TCP connection.
- */
- if (ip_create_helper_stream(econnp, tcps->tcps_ldi_ident)) {
- ip1dbg(("tcp_accept: create of IP helper stream"
- " failed\n"));
- return (EPROTO);
- }
- eager->tcp_rq = econnp->conn_rq;
- eager->tcp_wq = econnp->conn_wq;
-
- ASSERT(eager->tcp_rq != NULL);
-
return (tcp_accept_common(lconnp, econnp, cr));
}
@@ -17752,7 +14716,7 @@ tcp_accept(sock_lower_handle_t lproto_handle,
/*
* This is the STREAMS entry point for T_CONN_RES coming down on
* Acceptor STREAM when sockfs listener does accept processing.
- * Read the block comment on top of tcp_conn_request().
+ * Read the block comment on top of tcp_input_listener().
*/
void
tcp_tpi_accept(queue_t *q, mblk_t *mp)
@@ -17815,8 +14779,8 @@ tcp_tpi_accept(queue_t *q, mblk_t *mp)
econnp = eager->tcp_connp;
econnp->conn_dev = (dev_t)RD(q)->q_ptr;
econnp->conn_minor_arena = (vmem_t *)(WR(q)->q_ptr);
- eager->tcp_rq = rq;
- eager->tcp_wq = q;
+ econnp->conn_rq = rq;
+ econnp->conn_wq = q;
rq->q_ptr = econnp;
rq->q_qinfo = &tcp_rinitv4; /* No open - same as rinitv6 */
q->q_ptr = econnp;
@@ -17836,7 +14800,7 @@ tcp_tpi_accept(queue_t *q, mblk_t *mp)
* should already be enough space in the mp that came
* down from soaccept().
*/
- if (eager->tcp_family == AF_INET) {
+ if (econnp->conn_family == AF_INET) {
sin_t *sin;
ASSERT((mp->b_datap->db_lim - mp->b_datap->db_base) >=
@@ -17844,8 +14808,8 @@ tcp_tpi_accept(queue_t *q, mblk_t *mp)
sin = (sin_t *)mp->b_wptr;
mp->b_wptr += sizeof (sin_t);
sin->sin_family = AF_INET;
- sin->sin_port = eager->tcp_lport;
- sin->sin_addr.s_addr = eager->tcp_ipha->ipha_src;
+ sin->sin_port = econnp->conn_lport;
+ sin->sin_addr.s_addr = econnp->conn_laddr_v4;
} else {
sin6_t *sin6;
@@ -17854,20 +14818,23 @@ tcp_tpi_accept(queue_t *q, mblk_t *mp)
sin6 = (sin6_t *)mp->b_wptr;
mp->b_wptr += sizeof (sin6_t);
sin6->sin6_family = AF_INET6;
- sin6->sin6_port = eager->tcp_lport;
- if (eager->tcp_ipversion == IPV4_VERSION) {
+ sin6->sin6_port = econnp->conn_lport;
+ sin6->sin6_addr = econnp->conn_laddr_v6;
+ if (econnp->conn_ipversion == IPV4_VERSION) {
sin6->sin6_flowinfo = 0;
- IN6_IPADDR_TO_V4MAPPED(
- eager->tcp_ipha->ipha_src,
- &sin6->sin6_addr);
} else {
ASSERT(eager->tcp_ip6h != NULL);
sin6->sin6_flowinfo =
eager->tcp_ip6h->ip6_vcf &
~IPV6_VERS_AND_FLOW_MASK;
- sin6->sin6_addr = eager->tcp_ip6h->ip6_src;
}
- sin6->sin6_scope_id = 0;
+ if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6) &&
+ (econnp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) {
+ sin6->sin6_scope_id =
+ econnp->conn_ixa->ixa_scopeid;
+ } else {
+ sin6->sin6_scope_id = 0;
+ }
sin6->__sin6_src_id = 0;
}
@@ -17881,97 +14848,6 @@ tcp_tpi_accept(queue_t *q, mblk_t *mp)
}
}
-static int
-tcp_do_getsockname(tcp_t *tcp, struct sockaddr *sa, uint_t *salenp)
-{
- sin_t *sin = (sin_t *)sa;
- sin6_t *sin6 = (sin6_t *)sa;
-
- switch (tcp->tcp_family) {
- case AF_INET:
- ASSERT(tcp->tcp_ipversion == IPV4_VERSION);
-
- if (*salenp < sizeof (sin_t))
- return (EINVAL);
-
- *sin = sin_null;
- sin->sin_family = AF_INET;
- if (tcp->tcp_state >= TCPS_BOUND) {
- sin->sin_port = tcp->tcp_lport;
- sin->sin_addr.s_addr = tcp->tcp_ipha->ipha_src;
- }
- *salenp = sizeof (sin_t);
- break;
-
- case AF_INET6:
- if (*salenp < sizeof (sin6_t))
- return (EINVAL);
-
- *sin6 = sin6_null;
- sin6->sin6_family = AF_INET6;
- if (tcp->tcp_state >= TCPS_BOUND) {
- sin6->sin6_port = tcp->tcp_lport;
- mutex_enter(&tcp->tcp_connp->conn_lock);
- if (tcp->tcp_ipversion == IPV4_VERSION) {
- IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src,
- &sin6->sin6_addr);
- } else {
- sin6->sin6_addr = tcp->tcp_ip6h->ip6_src;
- }
- mutex_exit(&tcp->tcp_connp->conn_lock);
- }
- *salenp = sizeof (sin6_t);
- break;
- }
-
- return (0);
-}
-
-static int
-tcp_do_getpeername(tcp_t *tcp, struct sockaddr *sa, uint_t *salenp)
-{
- sin_t *sin = (sin_t *)sa;
- sin6_t *sin6 = (sin6_t *)sa;
-
- if (tcp->tcp_state < TCPS_SYN_RCVD)
- return (ENOTCONN);
-
- switch (tcp->tcp_family) {
- case AF_INET:
- ASSERT(tcp->tcp_ipversion == IPV4_VERSION);
-
- if (*salenp < sizeof (sin_t))
- return (EINVAL);
-
- *sin = sin_null;
- sin->sin_family = AF_INET;
- sin->sin_port = tcp->tcp_fport;
- IN6_V4MAPPED_TO_IPADDR(&tcp->tcp_remote_v6,
- sin->sin_addr.s_addr);
- *salenp = sizeof (sin_t);
- break;
-
- case AF_INET6:
- if (*salenp < sizeof (sin6_t))
- return (EINVAL);
-
- *sin6 = sin6_null;
- sin6->sin6_family = AF_INET6;
- sin6->sin6_port = tcp->tcp_fport;
- sin6->sin6_addr = tcp->tcp_remote_v6;
- mutex_enter(&tcp->tcp_connp->conn_lock);
- if (tcp->tcp_ipversion == IPV6_VERSION) {
- sin6->sin6_flowinfo = tcp->tcp_ip6h->ip6_vcf &
- ~IPV6_VERS_AND_FLOW_MASK;
- }
- mutex_exit(&tcp->tcp_connp->conn_lock);
- *salenp = sizeof (sin6_t);
- break;
- }
-
- return (0);
-}
-
/*
* Handle special out-of-band ioctl requests (see PSARC/2008/265).
*/
@@ -17980,7 +14856,8 @@ tcp_wput_cmdblk(queue_t *q, mblk_t *mp)
{
void *data;
mblk_t *datamp = mp->b_cont;
- tcp_t *tcp = Q_TO_TCP(q);
+ conn_t *connp = Q_TO_CONN(q);
+ tcp_t *tcp = connp->conn_tcp;
cmdblk_t *cmdp = (cmdblk_t *)mp->b_rptr;
if (datamp == NULL || MBLKL(datamp) < cmdp->cb_len) {
@@ -17993,10 +14870,14 @@ tcp_wput_cmdblk(queue_t *q, mblk_t *mp)
switch (cmdp->cb_cmd) {
case TI_GETPEERNAME:
- cmdp->cb_error = tcp_do_getpeername(tcp, data, &cmdp->cb_len);
+ if (tcp->tcp_state < TCPS_SYN_RCVD)
+ cmdp->cb_error = ENOTCONN;
+ else
+ cmdp->cb_error = conn_getpeername(connp, data,
+ &cmdp->cb_len);
break;
case TI_GETMYNAME:
- cmdp->cb_error = tcp_do_getsockname(tcp, data, &cmdp->cb_len);
+ cmdp->cb_error = conn_getsockname(connp, data, &cmdp->cb_len);
break;
default:
cmdp->cb_error = EINVAL;
@@ -18029,14 +14910,14 @@ tcp_wput(queue_t *q, mblk_t *mp)
mutex_enter(&tcp->tcp_non_sq_lock);
tcp->tcp_squeue_bytes += size;
- if (TCP_UNSENT_BYTES(tcp) > tcp->tcp_xmit_hiwater) {
+ if (TCP_UNSENT_BYTES(tcp) > connp->conn_sndbuf) {
tcp_setqfull(tcp);
}
mutex_exit(&tcp->tcp_non_sq_lock);
CONN_INC_REF(connp);
SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output, connp,
- tcp_squeue_flag, SQTAG_TCP_OUTPUT);
+ NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT);
return;
case M_CMD:
@@ -18053,7 +14934,7 @@ tcp_wput(queue_t *q, mblk_t *mp)
if ((mp->b_wptr - rptr) >= sizeof (t_scalar_t)) {
type = ((union T_primitives *)rptr)->type;
} else {
- if (tcp->tcp_debug) {
+ if (connp->conn_debug) {
(void) strlog(TCP_MOD_ID, 0, 1,
SL_ERROR|SL_TRACE,
"tcp_wput_proto, dropping one...");
@@ -18093,7 +14974,7 @@ tcp_wput(queue_t *q, mblk_t *mp)
/*
* Most ioctls can be processed right away without going via
* squeues - process them right here. Those that do require
- * squeue (currently TCP_IOC_DEFAULT_Q and _SIOCSOCKFALLBACK)
+ * squeue (currently _SIOCSOCKFALLBACK)
* are processed by tcp_wput_ioctl().
*/
iocp = (struct iocblk *)mp->b_rptr;
@@ -18111,26 +14992,13 @@ tcp_wput(queue_t *q, mblk_t *mp)
case ND_SET:
/* nd_getset does the necessary checks */
case ND_GET:
- if (!nd_getset(q, tcps->tcps_g_nd, mp)) {
- CALL_IP_WPUT(connp, q, mp);
- return;
- }
- qreply(q, mp);
- return;
- case TCP_IOC_DEFAULT_Q:
- /*
- * Wants to be the default wq. Check the credentials
- * first, the rest is executed via squeue.
- */
- if (secpolicy_ip_config(iocp->ioc_cr, B_FALSE) != 0) {
- iocp->ioc_error = EPERM;
- iocp->ioc_count = 0;
- mp->b_datap->db_type = M_IOCACK;
+ if (nd_getset(q, tcps->tcps_g_nd, mp)) {
qreply(q, mp);
return;
}
- output_proc = tcp_wput_ioctl;
- break;
+ ip_wput_nondata(q, mp);
+ return;
+
default:
output_proc = tcp_wput_ioctl;
break;
@@ -18143,7 +15011,7 @@ tcp_wput(queue_t *q, mblk_t *mp)
CONN_INC_REF(connp);
SQUEUE_ENTER_ONE(connp->conn_sqp, mp, output_proc, connp,
- tcp_squeue_flag, SQTAG_TCP_WPUT_OTHER);
+ NULL, tcp_squeue_flag, SQTAG_TCP_WPUT_OTHER);
}
/*
@@ -18188,52 +15056,32 @@ tcp_wput_fallback(queue_t *wq, mblk_t *mp)
freemsg(mp);
}
+/*
+ * Check the usability of ZEROCOPY. It's instead checking the flag set by IP.
+ */
static boolean_t
tcp_zcopy_check(tcp_t *tcp)
{
- conn_t *connp = tcp->tcp_connp;
- ire_t *ire;
+ conn_t *connp = tcp->tcp_connp;
+ ip_xmit_attr_t *ixa = connp->conn_ixa;
boolean_t zc_enabled = B_FALSE;
tcp_stack_t *tcps = tcp->tcp_tcps;
if (do_tcpzcopy == 2)
zc_enabled = B_TRUE;
- else if (tcp->tcp_ipversion == IPV4_VERSION &&
- IPCL_IS_CONNECTED(connp) &&
- (connp->conn_flags & IPCL_CHECK_POLICY) == 0 &&
- connp->conn_dontroute == 0 &&
- !connp->conn_nexthop_set &&
- connp->conn_outgoing_ill == NULL &&
- do_tcpzcopy == 1) {
- /*
- * the checks above closely resemble the fast path checks
- * in tcp_send_data().
- */
- mutex_enter(&connp->conn_lock);
- ire = connp->conn_ire_cache;
- ASSERT(!(connp->conn_state_flags & CONN_INCIPIENT));
- if (ire != NULL && !(ire->ire_marks & IRE_MARK_CONDEMNED)) {
- IRE_REFHOLD(ire);
- if (ire->ire_stq != NULL) {
- ill_t *ill = (ill_t *)ire->ire_stq->q_ptr;
-
- zc_enabled = ill && (ill->ill_capabilities &
- ILL_CAPAB_ZEROCOPY) &&
- (ill->ill_zerocopy_capab->
- ill_zerocopy_flags != 0);
- }
- IRE_REFRELE(ire);
- }
- mutex_exit(&connp->conn_lock);
- }
+ else if ((do_tcpzcopy == 1) && (ixa->ixa_flags & IXAF_ZCOPY_CAPAB))
+ zc_enabled = B_TRUE;
+
tcp->tcp_snd_zcopy_on = zc_enabled;
if (!TCP_IS_DETACHED(tcp)) {
if (zc_enabled) {
- (void) proto_set_tx_copyopt(tcp->tcp_rq, connp,
+ ixa->ixa_flags |= IXAF_VERIFY_ZCOPY;
+ (void) proto_set_tx_copyopt(connp->conn_rq, connp,
ZCVMSAFE);
TCP_STAT(tcps, tcp_zcopy_on);
} else {
- (void) proto_set_tx_copyopt(tcp->tcp_rq, connp,
+ ixa->ixa_flags &= ~IXAF_VERIFY_ZCOPY;
+ (void) proto_set_tx_copyopt(connp->conn_rq, connp,
ZCVMUNSAFE);
TCP_STAT(tcps, tcp_zcopy_off);
}
@@ -18241,99 +15089,84 @@ tcp_zcopy_check(tcp_t *tcp)
return (zc_enabled);
}
-static mblk_t *
-tcp_zcopy_disable(tcp_t *tcp, mblk_t *bp)
-{
- tcp_stack_t *tcps = tcp->tcp_tcps;
-
- if (do_tcpzcopy == 2)
- return (bp);
- else if (tcp->tcp_snd_zcopy_on) {
- tcp->tcp_snd_zcopy_on = B_FALSE;
- if (!TCP_IS_DETACHED(tcp)) {
- (void) proto_set_tx_copyopt(tcp->tcp_rq, tcp->tcp_connp,
- ZCVMUNSAFE);
- TCP_STAT(tcps, tcp_zcopy_disable);
- }
- }
- return (tcp_zcopy_backoff(tcp, bp, 0));
-}
-
/*
- * Backoff from a zero-copy mblk by copying data to a new mblk and freeing
- * the original desballoca'ed segmapped mblk.
+ * Backoff from a zero-copy message by copying data to a new allocated
+ * message and freeing the original desballoca'ed segmapped message.
+ *
+ * This function is called by following two callers:
+ * 1. tcp_timer: fix_xmitlist is set to B_TRUE, because it's safe to free
+ * the origial desballoca'ed message and notify sockfs. This is in re-
+ * transmit state.
+ * 2. tcp_output: fix_xmitlist is set to B_FALSE. Flag STRUIO_ZCNOTIFY need
+ * to be copied to new message.
*/
static mblk_t *
-tcp_zcopy_backoff(tcp_t *tcp, mblk_t *bp, int fix_xmitlist)
+tcp_zcopy_backoff(tcp_t *tcp, mblk_t *bp, boolean_t fix_xmitlist)
{
- mblk_t *head, *tail, *nbp;
+ mblk_t *nbp;
+ mblk_t *head = NULL;
+ mblk_t *tail = NULL;
tcp_stack_t *tcps = tcp->tcp_tcps;
- if (IS_VMLOANED_MBLK(bp)) {
- TCP_STAT(tcps, tcp_zcopy_backoff);
- if ((head = copyb(bp)) == NULL) {
- /* fail to backoff; leave it for the next backoff */
- tcp->tcp_xmit_zc_clean = B_FALSE;
- return (bp);
- }
- if (bp->b_datap->db_struioflag & STRUIO_ZCNOTIFY) {
- if (fix_xmitlist)
- tcp_zcopy_notify(tcp);
- else
- head->b_datap->db_struioflag |= STRUIO_ZCNOTIFY;
- }
- nbp = bp->b_cont;
- if (fix_xmitlist) {
- head->b_prev = bp->b_prev;
- head->b_next = bp->b_next;
- if (tcp->tcp_xmit_tail == bp)
- tcp->tcp_xmit_tail = head;
- }
- bp->b_next = NULL;
- bp->b_prev = NULL;
- freeb(bp);
- } else {
- head = bp;
- nbp = bp->b_cont;
- }
- tail = head;
- while (nbp) {
- if (IS_VMLOANED_MBLK(nbp)) {
+ ASSERT(bp != NULL);
+ while (bp != NULL) {
+ if (IS_VMLOANED_MBLK(bp)) {
TCP_STAT(tcps, tcp_zcopy_backoff);
- if ((tail->b_cont = copyb(nbp)) == NULL) {
+ if ((nbp = copyb(bp)) == NULL) {
tcp->tcp_xmit_zc_clean = B_FALSE;
- tail->b_cont = nbp;
- return (head);
+ if (tail != NULL)
+ tail->b_cont = bp;
+ return ((head == NULL) ? bp : head);
}
- tail = tail->b_cont;
- if (nbp->b_datap->db_struioflag & STRUIO_ZCNOTIFY) {
+
+ if (bp->b_datap->db_struioflag & STRUIO_ZCNOTIFY) {
if (fix_xmitlist)
tcp_zcopy_notify(tcp);
else
- tail->b_datap->db_struioflag |=
+ nbp->b_datap->db_struioflag |=
STRUIO_ZCNOTIFY;
}
- bp = nbp;
- nbp = nbp->b_cont;
+ nbp->b_cont = bp->b_cont;
+
+ /*
+ * Copy saved information and adjust tcp_xmit_tail
+ * if needed.
+ */
if (fix_xmitlist) {
- tail->b_prev = bp->b_prev;
- tail->b_next = bp->b_next;
+ nbp->b_prev = bp->b_prev;
+ nbp->b_next = bp->b_next;
+
if (tcp->tcp_xmit_tail == bp)
- tcp->tcp_xmit_tail = tail;
+ tcp->tcp_xmit_tail = nbp;
}
- bp->b_next = NULL;
+
+ /* Free the original message. */
bp->b_prev = NULL;
+ bp->b_next = NULL;
freeb(bp);
+
+ bp = nbp;
+ }
+
+ if (head == NULL) {
+ head = bp;
+ }
+ if (tail == NULL) {
+ tail = bp;
} else {
- tail->b_cont = nbp;
- tail = nbp;
- nbp = nbp->b_cont;
+ tail->b_cont = bp;
+ tail = bp;
}
+
+ /* Move forward. */
+ bp = bp->b_cont;
}
+
if (fix_xmitlist) {
tcp->tcp_xmit_last = tail;
tcp->tcp_xmit_zc_clean = B_TRUE;
}
+
return (head);
}
@@ -18341,7 +15174,7 @@ static void
tcp_zcopy_notify(tcp_t *tcp)
{
struct stdata *stp;
- conn_t *connp;
+ conn_t *connp;
if (tcp->tcp_detached)
return;
@@ -18351,323 +15184,149 @@ tcp_zcopy_notify(tcp_t *tcp)
(connp->conn_upper_handle);
return;
}
- stp = STREAM(tcp->tcp_rq);
+ stp = STREAM(connp->conn_rq);
mutex_enter(&stp->sd_lock);
stp->sd_flag |= STZCNOTIFY;
cv_broadcast(&stp->sd_zcopy_wait);
mutex_exit(&stp->sd_lock);
}
-static boolean_t
-tcp_send_find_ire(tcp_t *tcp, ipaddr_t *dst, ire_t **irep)
+/*
+ * Update the TCP connection according to change of LSO capability.
+ */
+static void
+tcp_update_lso(tcp_t *tcp, ip_xmit_attr_t *ixa)
{
- ire_t *ire;
- conn_t *connp = tcp->tcp_connp;
- tcp_stack_t *tcps = tcp->tcp_tcps;
- ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;
-
- mutex_enter(&connp->conn_lock);
- ire = connp->conn_ire_cache;
- ASSERT(!(connp->conn_state_flags & CONN_INCIPIENT));
-
- if ((ire != NULL) &&
- (((dst != NULL) && (ire->ire_addr == *dst)) || ((dst == NULL) &&
- IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, &tcp->tcp_ip6h->ip6_dst))) &&
- !(ire->ire_marks & IRE_MARK_CONDEMNED)) {
- IRE_REFHOLD(ire);
- mutex_exit(&connp->conn_lock);
- } else {
- boolean_t cached = B_FALSE;
- ts_label_t *tsl;
-
- /* force a recheck later on */
- tcp->tcp_ire_ill_check_done = B_FALSE;
-
- TCP_DBGSTAT(tcps, tcp_ire_null1);
- connp->conn_ire_cache = NULL;
- mutex_exit(&connp->conn_lock);
-
- if (ire != NULL)
- IRE_REFRELE_NOTR(ire);
-
- tsl = crgetlabel(CONN_CRED(connp));
- ire = (dst ?
- ire_cache_lookup(*dst, connp->conn_zoneid, tsl, ipst) :
- ire_cache_lookup_v6(&tcp->tcp_ip6h->ip6_dst,
- connp->conn_zoneid, tsl, ipst));
+ /*
+ * We check against IPv4 header length to preserve the old behavior
+ * of only enabling LSO when there are no IP options.
+ * But this restriction might not be necessary at all. Before removing
+ * it, need to verify how LSO is handled for source routing case, with
+ * which IP does software checksum.
+ *
+ * For IPv6, whenever any extension header is needed, LSO is supressed.
+ */
+ if (ixa->ixa_ip_hdr_length != ((ixa->ixa_flags & IXAF_IS_IPV4) ?
+ IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN))
+ return;
- if (ire == NULL) {
- TCP_STAT(tcps, tcp_ire_null);
- return (B_FALSE);
- }
+ /*
+ * Either the LSO capability newly became usable, or it has changed.
+ */
+ if (ixa->ixa_flags & IXAF_LSO_CAPAB) {
+ ill_lso_capab_t *lsoc = &ixa->ixa_lso_capab;
- IRE_REFHOLD_NOTR(ire);
+ ASSERT(lsoc->ill_lso_max > 0);
+ tcp->tcp_lso_max = MIN(TCP_MAX_LSO_LENGTH, lsoc->ill_lso_max);
- mutex_enter(&connp->conn_lock);
- if (CONN_CACHE_IRE(connp)) {
- rw_enter(&ire->ire_bucket->irb_lock, RW_READER);
- if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) {
- TCP_CHECK_IREINFO(tcp, ire);
- connp->conn_ire_cache = ire;
- cached = B_TRUE;
- }
- rw_exit(&ire->ire_bucket->irb_lock);
- }
- mutex_exit(&connp->conn_lock);
+ DTRACE_PROBE3(tcp_update_lso, boolean_t, tcp->tcp_lso,
+ boolean_t, B_TRUE, uint32_t, tcp->tcp_lso_max);
/*
- * We can continue to use the ire but since it was
- * not cached, we should drop the extra reference.
+ * If LSO to be enabled, notify the STREAM header with larger
+ * data block.
*/
- if (!cached)
- IRE_REFRELE_NOTR(ire);
+ if (!tcp->tcp_lso)
+ tcp->tcp_maxpsz_multiplier = 0;
+
+ tcp->tcp_lso = B_TRUE;
+ TCP_STAT(tcp->tcp_tcps, tcp_lso_enabled);
+ } else { /* LSO capability is not usable any more. */
+ DTRACE_PROBE3(tcp_update_lso, boolean_t, tcp->tcp_lso,
+ boolean_t, B_FALSE, uint32_t, tcp->tcp_lso_max);
/*
- * Rampart note: no need to select a new label here, since
- * labels are not allowed to change during the life of a TCP
- * connection.
+ * If LSO to be disabled, notify the STREAM header with smaller
+ * data block. And need to restore fragsize to PMTU.
*/
+ if (tcp->tcp_lso) {
+ tcp->tcp_maxpsz_multiplier =
+ tcp->tcp_tcps->tcps_maxpsz_multiplier;
+ ixa->ixa_fragsize = ixa->ixa_pmtu;
+ tcp->tcp_lso = B_FALSE;
+ TCP_STAT(tcp->tcp_tcps, tcp_lso_disabled);
+ }
}
- *irep = ire;
-
- return (B_TRUE);
+ (void) tcp_maxpsz_set(tcp, B_TRUE);
}
/*
- * Called from tcp_send() or tcp_send_data() to find workable IRE.
- *
- * 0 = success;
- * 1 = failed to find ire and ill.
+ * Update the TCP connection according to change of ZEROCOPY capability.
*/
-static boolean_t
-tcp_send_find_ire_ill(tcp_t *tcp, mblk_t *mp, ire_t **irep, ill_t **illp)
+static void
+tcp_update_zcopy(tcp_t *tcp)
{
- ipha_t *ipha;
- ipaddr_t dst;
- ire_t *ire;
- ill_t *ill;
- mblk_t *ire_fp_mp;
+ conn_t *connp = tcp->tcp_connp;
tcp_stack_t *tcps = tcp->tcp_tcps;
- if (mp != NULL)
- ipha = (ipha_t *)mp->b_rptr;
- else
- ipha = tcp->tcp_ipha;
- dst = ipha->ipha_dst;
-
- if (!tcp_send_find_ire(tcp, &dst, &ire))
- return (B_FALSE);
-
- if ((ire->ire_flags & RTF_MULTIRT) ||
- (ire->ire_stq == NULL) ||
- (ire->ire_nce == NULL) ||
- ((ire_fp_mp = ire->ire_nce->nce_fp_mp) == NULL) ||
- ((mp != NULL) && (ire->ire_max_frag < ntohs(ipha->ipha_length) ||
- MBLKL(ire_fp_mp) > MBLKHEAD(mp)))) {
- TCP_STAT(tcps, tcp_ip_ire_send);
- IRE_REFRELE(ire);
- return (B_FALSE);
+ if (tcp->tcp_snd_zcopy_on) {
+ tcp->tcp_snd_zcopy_on = B_FALSE;
+ if (!TCP_IS_DETACHED(tcp)) {
+ (void) proto_set_tx_copyopt(connp->conn_rq, connp,
+ ZCVMUNSAFE);
+ TCP_STAT(tcps, tcp_zcopy_off);
+ }
+ } else {
+ tcp->tcp_snd_zcopy_on = B_TRUE;
+ if (!TCP_IS_DETACHED(tcp)) {
+ (void) proto_set_tx_copyopt(connp->conn_rq, connp,
+ ZCVMSAFE);
+ TCP_STAT(tcps, tcp_zcopy_on);
+ }
}
+}
- ill = ire_to_ill(ire);
- ASSERT(ill != NULL);
+/*
+ * Notify function registered with ip_xmit_attr_t. It's called in the squeue
+ * so it's safe to update the TCP connection.
+ */
+/* ARGSUSED1 */
+static void
+tcp_notify(void *arg, ip_xmit_attr_t *ixa, ixa_notify_type_t ntype,
+ ixa_notify_arg_t narg)
+{
+ tcp_t *tcp = (tcp_t *)arg;
+ conn_t *connp = tcp->tcp_connp;
- if (!tcp->tcp_ire_ill_check_done) {
- tcp_ire_ill_check(tcp, ire, ill, B_TRUE);
- tcp->tcp_ire_ill_check_done = B_TRUE;
+ switch (ntype) {
+ case IXAN_LSO:
+ tcp_update_lso(tcp, connp->conn_ixa);
+ break;
+ case IXAN_PMTU:
+ tcp_update_pmtu(tcp, B_FALSE);
+ break;
+ case IXAN_ZCOPY:
+ tcp_update_zcopy(tcp);
+ break;
+ default:
+ break;
}
-
- *irep = ire;
- *illp = ill;
-
- return (B_TRUE);
}
static void
-tcp_send_data(tcp_t *tcp, queue_t *q, mblk_t *mp)
+tcp_send_data(tcp_t *tcp, mblk_t *mp)
{
- ipha_t *ipha;
- ipaddr_t src;
- ipaddr_t dst;
- uint32_t cksum;
- ire_t *ire;
- uint16_t *up;
- ill_t *ill;
conn_t *connp = tcp->tcp_connp;
- uint32_t hcksum_txflags = 0;
- mblk_t *ire_fp_mp;
- uint_t ire_fp_mp_len;
- tcp_stack_t *tcps = tcp->tcp_tcps;
- ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;
- cred_t *cr;
- pid_t cpid;
-
- ASSERT(DB_TYPE(mp) == M_DATA);
/*
- * Here we need to handle the overloading of the cred_t for
- * both getpeerucred and TX.
- * If this is a SYN then the caller already set db_credp so
- * that getpeerucred will work. But if TX is in use we might have
- * a conn_effective_cred which is different, and we need to use that
- * cred to make TX use the correct label and label dependent route.
+ * Check here to avoid sending zero-copy message down to IP when
+ * ZEROCOPY capability has turned off. We only need to deal with
+ * the race condition between sockfs and the notification here.
+ * Since we have tried to backoff the tcp_xmit_head when turning
+ * zero-copy off and new messages in tcp_output(), we simply drop
+ * the dup'ed packet here and let tcp retransmit, if tcp_xmit_zc_clean
+ * is not true.
*/
- if (is_system_labeled()) {
- cr = msg_getcred(mp, &cpid);
- if (cr == NULL || connp->conn_effective_cred != NULL)
- mblk_setcred(mp, CONN_CRED(connp), cpid);
- }
-
- ipha = (ipha_t *)mp->b_rptr;
- src = ipha->ipha_src;
- dst = ipha->ipha_dst;
-
- ASSERT(q != NULL);
- DTRACE_PROBE2(tcp__trace__send, mblk_t *, mp, tcp_t *, tcp);
-
- /*
- * Drop off fast path for IPv6 and also if options are present or
- * we need to resolve a TS label.
- */
- if (tcp->tcp_ipversion != IPV4_VERSION ||
- !IPCL_IS_CONNECTED(connp) ||
- !CONN_IS_LSO_MD_FASTPATH(connp) ||
- (connp->conn_flags & IPCL_CHECK_POLICY) != 0 ||
- !connp->conn_ulp_labeled ||
- ipha->ipha_ident == IP_HDR_INCLUDED ||
- ipha->ipha_version_and_hdr_length != IP_SIMPLE_HDR_VERSION ||
- IPP_ENABLED(IPP_LOCAL_OUT, ipst)) {
- if (tcp->tcp_snd_zcopy_aware)
- mp = tcp_zcopy_disable(tcp, mp);
- TCP_STAT(tcps, tcp_ip_send);
- CALL_IP_WPUT(connp, q, mp);
- return;
- }
-
- if (!tcp_send_find_ire_ill(tcp, mp, &ire, &ill)) {
- if (tcp->tcp_snd_zcopy_aware)
- mp = tcp_zcopy_backoff(tcp, mp, 0);
- CALL_IP_WPUT(connp, q, mp);
+ if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_snd_zcopy_on &&
+ !tcp->tcp_xmit_zc_clean) {
+ ip_drop_output("TCP ZC was disabled but not clean", mp, NULL);
+ freemsg(mp);
return;
}
- ire_fp_mp = ire->ire_nce->nce_fp_mp;
- ire_fp_mp_len = MBLKL(ire_fp_mp);
-
- ASSERT(ipha->ipha_ident == 0 || ipha->ipha_ident == IP_HDR_INCLUDED);
- ipha->ipha_ident = (uint16_t)atomic_add_32_nv(&ire->ire_ident, 1);
-#ifndef _BIG_ENDIAN
- ipha->ipha_ident = (ipha->ipha_ident << 8) | (ipha->ipha_ident >> 8);
-#endif
-
- /*
- * Check to see if we need to re-enable LSO/MDT for this connection
- * because it was previously disabled due to changes in the ill;
- * note that by doing it here, this re-enabling only applies when
- * the packet is not dispatched through CALL_IP_WPUT().
- *
- * That means for IPv4, it is worth re-enabling LSO/MDT for the fastpath
- * case, since that's how we ended up here. For IPv6, we do the
- * re-enabling work in ip_xmit_v6(), albeit indirectly via squeue.
- */
- if (connp->conn_lso_ok && !tcp->tcp_lso && ILL_LSO_TCP_USABLE(ill)) {
- /*
- * Restore LSO for this connection, so that next time around
- * it is eligible to go through tcp_lsosend() path again.
- */
- TCP_STAT(tcps, tcp_lso_enabled);
- tcp->tcp_lso = B_TRUE;
- ip1dbg(("tcp_send_data: reenabling LSO for connp %p on "
- "interface %s\n", (void *)connp, ill->ill_name));
- } else if (connp->conn_mdt_ok && !tcp->tcp_mdt && ILL_MDT_USABLE(ill)) {
- /*
- * Restore MDT for this connection, so that next time around
- * it is eligible to go through tcp_multisend() path again.
- */
- TCP_STAT(tcps, tcp_mdt_conn_resumed1);
- tcp->tcp_mdt = B_TRUE;
- ip1dbg(("tcp_send_data: reenabling MDT for connp %p on "
- "interface %s\n", (void *)connp, ill->ill_name));
- }
-
- if (tcp->tcp_snd_zcopy_aware) {
- if ((ill->ill_capabilities & ILL_CAPAB_ZEROCOPY) == 0 ||
- (ill->ill_zerocopy_capab->ill_zerocopy_flags == 0))
- mp = tcp_zcopy_disable(tcp, mp);
- /*
- * we shouldn't need to reset ipha as the mp containing
- * ipha should never be a zero-copy mp.
- */
- }
-
- if (ILL_HCKSUM_CAPABLE(ill) && dohwcksum) {
- ASSERT(ill->ill_hcksum_capab != NULL);
- hcksum_txflags = ill->ill_hcksum_capab->ill_hcksum_txflags;
- }
-
- /* pseudo-header checksum (do it in parts for IP header checksum) */
- cksum = (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF);
-
- ASSERT(ipha->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION);
- up = IPH_TCPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH);
-
- IP_CKSUM_XMIT_FAST(ire->ire_ipversion, hcksum_txflags, mp, ipha, up,
- IPPROTO_TCP, IP_SIMPLE_HDR_LENGTH, ntohs(ipha->ipha_length), cksum);
-
- /* Software checksum? */
- if (DB_CKSUMFLAGS(mp) == 0) {
- TCP_STAT(tcps, tcp_out_sw_cksum);
- TCP_STAT_UPDATE(tcps, tcp_out_sw_cksum_bytes,
- ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH);
- }
-
- /* Calculate IP header checksum if hardware isn't capable */
- if (!(DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM)) {
- IP_HDR_CKSUM(ipha, cksum, ((uint32_t *)ipha)[0],
- ((uint16_t *)ipha)[4]);
- }
- ASSERT(DB_TYPE(ire_fp_mp) == M_DATA);
- mp->b_rptr = (uchar_t *)ipha - ire_fp_mp_len;
- bcopy(ire_fp_mp->b_rptr, mp->b_rptr, ire_fp_mp_len);
-
- UPDATE_OB_PKT_COUNT(ire);
- ire->ire_last_used_time = lbolt;
-
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests);
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutTransmits);
- UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets,
- ntohs(ipha->ipha_length));
-
- DTRACE_PROBE4(ip4__physical__out__start,
- ill_t *, NULL, ill_t *, ill, ipha_t *, ipha, mblk_t *, mp);
- FW_HOOKS(ipst->ips_ip4_physical_out_event,
- ipst->ips_ipv4firewall_physical_out,
- NULL, ill, ipha, mp, mp, 0, ipst);
- DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp);
- DTRACE_IP_FASTPATH(mp, ipha, ill, ipha, NULL);
-
- if (mp != NULL) {
- if (ipst->ips_ip4_observe.he_interested) {
- zoneid_t szone;
-
- /*
- * Both of these functions expect b_rptr to be
- * where the IP header starts, so advance past the
- * link layer header if present.
- */
- mp->b_rptr += ire_fp_mp_len;
- szone = ip_get_zoneid_v4(ipha->ipha_src, mp,
- ipst, ALL_ZONES);
- ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone,
- ALL_ZONES, ill, ipst);
- mp->b_rptr -= ire_fp_mp_len;
- }
-
- ILL_SEND_TX(ill, ire, connp, mp, 0, NULL);
- }
-
- IRE_REFRELE(ire);
+ ASSERT(connp->conn_ixa->ixa_notify_cookie == connp->conn_tcp);
+ (void) conn_ip_output(mp, connp->conn_ixa);
}
/*
@@ -18731,15 +15390,13 @@ tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent)
int tcpstate;
int usable = 0;
mblk_t *xmit_tail;
- queue_t *q = tcp->tcp_wq;
int32_t mss;
int32_t num_sack_blk = 0;
+ int32_t total_hdr_len;
int32_t tcp_hdr_len;
- int32_t tcp_tcp_hdr_len;
- int mdt_thres;
int rc;
tcp_stack_t *tcps = tcp->tcp_tcps;
- ip_stack_t *ipst;
+ conn_t *connp = tcp->tcp_connp;
tcpstate = tcp->tcp_state;
if (mp == NULL) {
@@ -18771,7 +15428,7 @@ tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent)
tcp_display(tcp, NULL,
DISP_ADDR_AND_PORT));
#else
- if (tcp->tcp_debug) {
+ if (connp->conn_debug) {
(void) strlog(TCP_MOD_ID, 0, 1,
SL_TRACE|SL_ERROR,
"tcp_wput_data: data after ordrel, %s\n",
@@ -18781,12 +15438,12 @@ tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent)
#endif /* DEBUG */
}
if (tcp->tcp_snd_zcopy_aware &&
- (mp->b_datap->db_struioflag & STRUIO_ZCNOTIFY) != 0)
+ (mp->b_datap->db_struioflag & STRUIO_ZCNOTIFY))
tcp_zcopy_notify(tcp);
freemsg(mp);
mutex_enter(&tcp->tcp_non_sq_lock);
if (tcp->tcp_flow_stopped &&
- TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) {
+ TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) {
tcp_clrqfull(tcp);
}
mutex_exit(&tcp->tcp_non_sq_lock);
@@ -18886,12 +15543,12 @@ data_null:
opt_len = num_sack_blk * sizeof (sack_blk_t) + TCPOPT_NOP_LEN *
2 + TCPOPT_HEADER_LEN;
mss = tcp->tcp_mss - opt_len;
- tcp_hdr_len = tcp->tcp_hdr_len + opt_len;
- tcp_tcp_hdr_len = tcp->tcp_tcp_hdr_len + opt_len;
+ total_hdr_len = connp->conn_ht_iphc_len + opt_len;
+ tcp_hdr_len = connp->conn_ht_ulp_len + opt_len;
} else {
mss = tcp->tcp_mss;
- tcp_hdr_len = tcp->tcp_hdr_len;
- tcp_tcp_hdr_len = tcp->tcp_tcp_hdr_len;
+ total_hdr_len = connp->conn_ht_iphc_len;
+ tcp_hdr_len = connp->conn_ht_ulp_len;
}
if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet &&
@@ -18913,7 +15570,7 @@ data_null:
* In the special case when cwnd is zero, which can only
* happen if the connection is ECN capable, return now.
* New segments is sent using tcp_timer(). The timer
- * is set in tcp_rput_data().
+ * is set in tcp_input_data().
*/
if (tcp->tcp_cwnd == 0) {
/*
@@ -19023,66 +15680,12 @@ data_null:
}
/* Update the latest receive window size in TCP header. */
- U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws,
- tcp->tcp_tcph->th_win);
+ tcp->tcp_tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws);
- /*
- * Determine if it's worthwhile to attempt LSO or MDT, based on:
- *
- * 1. Simple TCP/IP{v4,v6} (no options).
- * 2. IPSEC/IPQoS processing is not needed for the TCP connection.
- * 3. If the TCP connection is in ESTABLISHED state.
- * 4. The TCP is not detached.
- *
- * If any of the above conditions have changed during the
- * connection, stop using LSO/MDT and restore the stream head
- * parameters accordingly.
- */
- ipst = tcps->tcps_netstack->netstack_ip;
-
- if ((tcp->tcp_lso || tcp->tcp_mdt) &&
- ((tcp->tcp_ipversion == IPV4_VERSION &&
- tcp->tcp_ip_hdr_len != IP_SIMPLE_HDR_LENGTH) ||
- (tcp->tcp_ipversion == IPV6_VERSION &&
- tcp->tcp_ip_hdr_len != IPV6_HDR_LEN) ||
- tcp->tcp_state != TCPS_ESTABLISHED ||
- TCP_IS_DETACHED(tcp) || !CONN_IS_LSO_MD_FASTPATH(tcp->tcp_connp) ||
- CONN_IPSEC_OUT_ENCAPSULATED(tcp->tcp_connp) ||
- IPP_ENABLED(IPP_LOCAL_OUT, ipst))) {
- if (tcp->tcp_lso) {
- tcp->tcp_connp->conn_lso_ok = B_FALSE;
- tcp->tcp_lso = B_FALSE;
- } else {
- tcp->tcp_connp->conn_mdt_ok = B_FALSE;
- tcp->tcp_mdt = B_FALSE;
- }
-
- /* Anything other than detached is considered pathological */
- if (!TCP_IS_DETACHED(tcp)) {
- if (tcp->tcp_lso)
- TCP_STAT(tcps, tcp_lso_disabled);
- else
- TCP_STAT(tcps, tcp_mdt_conn_halted1);
- (void) tcp_maxpsz_set(tcp, B_TRUE);
- }
- }
-
- /* Use MDT if sendable amount is greater than the threshold */
- if (tcp->tcp_mdt &&
- (mdt_thres = mss << tcp_mdt_smss_threshold, usable > mdt_thres) &&
- (tail_unsent > mdt_thres || (xmit_tail->b_cont != NULL &&
- MBLKL(xmit_tail->b_cont) > mdt_thres)) &&
- (tcp->tcp_valid_bits == 0 ||
- tcp->tcp_valid_bits == TCP_FSS_VALID)) {
- ASSERT(tcp->tcp_connp->conn_mdt_ok);
- rc = tcp_multisend(q, tcp, mss, tcp_hdr_len, tcp_tcp_hdr_len,
- num_sack_blk, &usable, &snxt, &tail_unsent, &xmit_tail,
- local_time, mdt_thres);
- } else {
- rc = tcp_send(q, tcp, mss, tcp_hdr_len, tcp_tcp_hdr_len,
- num_sack_blk, &usable, &snxt, &tail_unsent, &xmit_tail,
- local_time, INT_MAX);
- }
+ /* Send the packet. */
+ rc = tcp_send(tcp, mss, total_hdr_len, tcp_hdr_len,
+ num_sack_blk, &usable, &snxt, &tail_unsent, &xmit_tail,
+ local_time);
/* Pretend that all we were trying to send really got sent */
if (rc < 0 && tail_unsent < 0) {
@@ -19131,39 +15734,41 @@ done:;
tcp->tcp_unsent += len;
mutex_enter(&tcp->tcp_non_sq_lock);
if (tcp->tcp_flow_stopped) {
- if (TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) {
+ if (TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) {
tcp_clrqfull(tcp);
}
- } else if (TCP_UNSENT_BYTES(tcp) >= tcp->tcp_xmit_hiwater) {
- tcp_setqfull(tcp);
+ } else if (TCP_UNSENT_BYTES(tcp) >= connp->conn_sndbuf) {
+ if (!(tcp->tcp_detached))
+ tcp_setqfull(tcp);
}
mutex_exit(&tcp->tcp_non_sq_lock);
}
/*
- * tcp_fill_header is called by tcp_send() and tcp_multisend() to fill the
- * outgoing TCP header with the template header, as well as other
- * options such as time-stamp, ECN and/or SACK.
+ * tcp_fill_header is called by tcp_send() to fill the outgoing TCP header
+ * with the template header, as well as other options such as time-stamp,
+ * ECN and/or SACK.
*/
static void
tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, int num_sack_blk)
{
- tcph_t *tcp_tmpl, *tcp_h;
+ tcpha_t *tcp_tmpl, *tcpha;
uint32_t *dst, *src;
int hdrlen;
+ conn_t *connp = tcp->tcp_connp;
ASSERT(OK_32PTR(rptr));
/* Template header */
- tcp_tmpl = tcp->tcp_tcph;
+ tcp_tmpl = tcp->tcp_tcpha;
/* Header of outgoing packet */
- tcp_h = (tcph_t *)(rptr + tcp->tcp_ip_hdr_len);
+ tcpha = (tcpha_t *)(rptr + connp->conn_ixa->ixa_ip_hdr_length);
/* dst and src are opaque 32-bit fields, used for copying */
dst = (uint32_t *)rptr;
- src = (uint32_t *)tcp->tcp_iphc;
- hdrlen = tcp->tcp_hdr_len;
+ src = (uint32_t *)connp->conn_ht_iphc;
+ hdrlen = connp->conn_ht_iphc_len;
/* Fill time-stamp option if needed */
if (tcp->tcp_snd_ts_ok) {
@@ -19172,7 +15777,7 @@ tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, int num_sack_blk)
U32_TO_BE32(tcp->tcp_ts_recent,
(char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 8);
} else {
- ASSERT(tcp->tcp_tcp_hdr_len == TCP_MIN_HEADER_LENGTH);
+ ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH);
}
/*
@@ -19208,16 +15813,16 @@ tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, int num_sack_blk)
SET_ECT(tcp, rptr);
if (tcp->tcp_ecn_echo_on)
- tcp_h->th_flags[0] |= TH_ECE;
+ tcpha->tha_flags |= TH_ECE;
if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) {
- tcp_h->th_flags[0] |= TH_CWR;
+ tcpha->tha_flags |= TH_CWR;
tcp->tcp_ecn_cwr_sent = B_TRUE;
}
}
/* Fill in SACK options */
if (num_sack_blk > 0) {
- uchar_t *wptr = rptr + tcp->tcp_hdr_len;
+ uchar_t *wptr = rptr + connp->conn_ht_iphc_len;
sack_blk_t *tmp;
int32_t i;
@@ -19235,1536 +15840,62 @@ tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, int num_sack_blk)
U32_TO_BE32(tmp[i].end, wptr);
wptr += sizeof (tcp_seq);
}
- tcp_h->th_offset_and_rsrvd[0] +=
+ tcpha->tha_offset_and_reserved +=
((num_sack_blk * 2 + 1) << 4);
}
}
/*
- * tcp_mdt_add_attrs() is called by tcp_multisend() in order to attach
- * the destination address and SAP attribute, and if necessary, the
- * hardware checksum offload attribute to a Multidata message.
- */
-static int
-tcp_mdt_add_attrs(multidata_t *mmd, const mblk_t *dlmp, const boolean_t hwcksum,
- const uint32_t start, const uint32_t stuff, const uint32_t end,
- const uint32_t flags, tcp_stack_t *tcps)
-{
- /* Add global destination address & SAP attribute */
- if (dlmp == NULL || !ip_md_addr_attr(mmd, NULL, dlmp)) {
- ip1dbg(("tcp_mdt_add_attrs: can't add global physical "
- "destination address+SAP\n"));
-
- if (dlmp != NULL)
- TCP_STAT(tcps, tcp_mdt_allocfail);
- return (-1);
- }
-
- /* Add global hwcksum attribute */
- if (hwcksum &&
- !ip_md_hcksum_attr(mmd, NULL, start, stuff, end, flags)) {
- ip1dbg(("tcp_mdt_add_attrs: can't add global hardware "
- "checksum attribute\n"));
-
- TCP_STAT(tcps, tcp_mdt_allocfail);
- return (-1);
- }
-
- return (0);
-}
-
-/*
- * Smaller and private version of pdescinfo_t used specifically for TCP,
- * which allows for only two payload spans per packet.
- */
-typedef struct tcp_pdescinfo_s PDESCINFO_STRUCT(2) tcp_pdescinfo_t;
-
-/*
- * tcp_multisend() is called by tcp_wput_data() for Multidata Transmit
- * scheme, and returns one the following:
+ * tcp_send() is called by tcp_wput_data() and returns one of the following:
*
* -1 = failed allocation.
* 0 = success; burst count reached, or usable send window is too small,
* and that we'd rather wait until later before sending again.
*/
static int
-tcp_multisend(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len,
- const int tcp_tcp_hdr_len, const int num_sack_blk, int *usable,
- uint_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time,
- const int mdt_thres)
-{
- mblk_t *md_mp_head, *md_mp, *md_pbuf, *md_pbuf_nxt, *md_hbuf;
- multidata_t *mmd;
- uint_t obsegs, obbytes, hdr_frag_sz;
- uint_t cur_hdr_off, cur_pld_off, base_pld_off, first_snxt;
- int num_burst_seg, max_pld;
- pdesc_t *pkt;
- tcp_pdescinfo_t tcp_pkt_info;
- pdescinfo_t *pkt_info;
- int pbuf_idx, pbuf_idx_nxt;
- int seg_len, len, spill, af;
- boolean_t add_buffer, zcopy, clusterwide;
- boolean_t rconfirm = B_FALSE;
- boolean_t done = B_FALSE;
- uint32_t cksum;
- uint32_t hwcksum_flags;
- ire_t *ire = NULL;
- ill_t *ill;
- ipha_t *ipha;
- ip6_t *ip6h;
- ipaddr_t src, dst;
- ill_zerocopy_capab_t *zc_cap = NULL;
- uint16_t *up;
- int err;
- conn_t *connp;
- tcp_stack_t *tcps = tcp->tcp_tcps;
- ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;
- int usable_mmd, tail_unsent_mmd;
- uint_t snxt_mmd, obsegs_mmd, obbytes_mmd;
- mblk_t *xmit_tail_mmd;
- netstackid_t stack_id;
-
-#ifdef _BIG_ENDIAN
-#define IPVER(ip6h) ((((uint32_t *)ip6h)[0] >> 28) & 0x7)
-#else
-#define IPVER(ip6h) ((((uint32_t *)ip6h)[0] >> 4) & 0x7)
-#endif
-
-#define PREP_NEW_MULTIDATA() { \
- mmd = NULL; \
- md_mp = md_hbuf = NULL; \
- cur_hdr_off = 0; \
- max_pld = tcp->tcp_mdt_max_pld; \
- pbuf_idx = pbuf_idx_nxt = -1; \
- add_buffer = B_TRUE; \
- zcopy = B_FALSE; \
-}
-
-#define PREP_NEW_PBUF() { \
- md_pbuf = md_pbuf_nxt = NULL; \
- pbuf_idx = pbuf_idx_nxt = -1; \
- cur_pld_off = 0; \
- first_snxt = *snxt; \
- ASSERT(*tail_unsent > 0); \
- base_pld_off = MBLKL(*xmit_tail) - *tail_unsent; \
-}
-
- ASSERT(mdt_thres >= mss);
- ASSERT(*usable > 0 && *usable > mdt_thres);
- ASSERT(tcp->tcp_state == TCPS_ESTABLISHED);
- ASSERT(!TCP_IS_DETACHED(tcp));
- ASSERT(tcp->tcp_valid_bits == 0 ||
- tcp->tcp_valid_bits == TCP_FSS_VALID);
- ASSERT((tcp->tcp_ipversion == IPV4_VERSION &&
- tcp->tcp_ip_hdr_len == IP_SIMPLE_HDR_LENGTH) ||
- (tcp->tcp_ipversion == IPV6_VERSION &&
- tcp->tcp_ip_hdr_len == IPV6_HDR_LEN));
-
- connp = tcp->tcp_connp;
- ASSERT(connp != NULL);
- ASSERT(CONN_IS_LSO_MD_FASTPATH(connp));
- ASSERT(!CONN_IPSEC_OUT_ENCAPSULATED(connp));
-
- stack_id = connp->conn_netstack->netstack_stackid;
-
- usable_mmd = tail_unsent_mmd = 0;
- snxt_mmd = obsegs_mmd = obbytes_mmd = 0;
- xmit_tail_mmd = NULL;
- /*
- * Note that tcp will only declare at most 2 payload spans per
- * packet, which is much lower than the maximum allowable number
- * of packet spans per Multidata. For this reason, we use the
- * privately declared and smaller descriptor info structure, in
- * order to save some stack space.
- */
- pkt_info = (pdescinfo_t *)&tcp_pkt_info;
-
- af = (tcp->tcp_ipversion == IPV4_VERSION) ? AF_INET : AF_INET6;
- if (af == AF_INET) {
- dst = tcp->tcp_ipha->ipha_dst;
- src = tcp->tcp_ipha->ipha_src;
- ASSERT(!CLASSD(dst));
- }
- ASSERT(af == AF_INET ||
- !IN6_IS_ADDR_MULTICAST(&tcp->tcp_ip6h->ip6_dst));
-
- obsegs = obbytes = 0;
- num_burst_seg = tcp->tcp_snd_burst;
- md_mp_head = NULL;
- PREP_NEW_MULTIDATA();
-
- /*
- * Before we go on further, make sure there is an IRE that we can
- * use, and that the ILL supports MDT. Otherwise, there's no point
- * in proceeding any further, and we should just hand everything
- * off to the legacy path.
- */
- if (!tcp_send_find_ire(tcp, (af == AF_INET) ? &dst : NULL, &ire))
- goto legacy_send_no_md;
-
- ASSERT(ire != NULL);
- ASSERT(af != AF_INET || ire->ire_ipversion == IPV4_VERSION);
- ASSERT(af == AF_INET || !IN6_IS_ADDR_V4MAPPED(&(ire->ire_addr_v6)));
- ASSERT(af == AF_INET || ire->ire_nce != NULL);
- ASSERT(!(ire->ire_type & IRE_BROADCAST));
- /*
- * If we do support loopback for MDT (which requires modifications
- * to the receiving paths), the following assertions should go away,
- * and we would be sending the Multidata to loopback conn later on.
- */
- ASSERT(!IRE_IS_LOCAL(ire));
- ASSERT(ire->ire_stq != NULL);
-
- ill = ire_to_ill(ire);
- ASSERT(ill != NULL);
- ASSERT(!ILL_MDT_CAPABLE(ill) || ill->ill_mdt_capab != NULL);
-
- if (!tcp->tcp_ire_ill_check_done) {
- tcp_ire_ill_check(tcp, ire, ill, B_TRUE);
- tcp->tcp_ire_ill_check_done = B_TRUE;
- }
-
- /*
- * If the underlying interface conditions have changed, or if the
- * new interface does not support MDT, go back to legacy path.
- */
- if (!ILL_MDT_USABLE(ill) || (ire->ire_flags & RTF_MULTIRT) != 0) {
- /* don't go through this path anymore for this connection */
- TCP_STAT(tcps, tcp_mdt_conn_halted2);
- tcp->tcp_mdt = B_FALSE;
- ip1dbg(("tcp_multisend: disabling MDT for connp %p on "
- "interface %s\n", (void *)connp, ill->ill_name));
- /* IRE will be released prior to returning */
- goto legacy_send_no_md;
- }
-
- if (ill->ill_capabilities & ILL_CAPAB_ZEROCOPY)
- zc_cap = ill->ill_zerocopy_capab;
-
- /*
- * Check if we can take tcp fast-path. Note that "incomplete"
- * ire's (where the link-layer for next hop is not resolved
- * or where the fast-path header in nce_fp_mp is not available
- * yet) are sent down the legacy (slow) path.
- * NOTE: We should fix ip_xmit_v4 to handle M_MULTIDATA
- */
- if (ire->ire_nce && ire->ire_nce->nce_state != ND_REACHABLE) {
- /* IRE will be released prior to returning */
- goto legacy_send_no_md;
- }
-
- /* go to legacy path if interface doesn't support zerocopy */
- if (tcp->tcp_snd_zcopy_aware && do_tcpzcopy != 2 &&
- (zc_cap == NULL || zc_cap->ill_zerocopy_flags == 0)) {
- /* IRE will be released prior to returning */
- goto legacy_send_no_md;
- }
-
- /* does the interface support hardware checksum offload? */
- hwcksum_flags = 0;
- if (ILL_HCKSUM_CAPABLE(ill) &&
- (ill->ill_hcksum_capab->ill_hcksum_txflags &
- (HCKSUM_INET_FULL_V4 | HCKSUM_INET_FULL_V6 | HCKSUM_INET_PARTIAL |
- HCKSUM_IPHDRCKSUM)) && dohwcksum) {
- if (ill->ill_hcksum_capab->ill_hcksum_txflags &
- HCKSUM_IPHDRCKSUM)
- hwcksum_flags = HCK_IPV4_HDRCKSUM;
-
- if (ill->ill_hcksum_capab->ill_hcksum_txflags &
- (HCKSUM_INET_FULL_V4 | HCKSUM_INET_FULL_V6))
- hwcksum_flags |= HCK_FULLCKSUM;
- else if (ill->ill_hcksum_capab->ill_hcksum_txflags &
- HCKSUM_INET_PARTIAL)
- hwcksum_flags |= HCK_PARTIALCKSUM;
- }
-
- /*
- * Each header fragment consists of the leading extra space,
- * followed by the TCP/IP header, and the trailing extra space.
- * We make sure that each header fragment begins on a 32-bit
- * aligned memory address (tcp_mdt_hdr_head is already 32-bit
- * aligned in tcp_mdt_update).
- */
- hdr_frag_sz = roundup((tcp->tcp_mdt_hdr_head + tcp_hdr_len +
- tcp->tcp_mdt_hdr_tail), 4);
-
- /* are we starting from the beginning of data block? */
- if (*tail_unsent == 0) {
- *xmit_tail = (*xmit_tail)->b_cont;
- ASSERT((uintptr_t)MBLKL(*xmit_tail) <= (uintptr_t)INT_MAX);
- *tail_unsent = (int)MBLKL(*xmit_tail);
- }
-
- /*
- * Here we create one or more Multidata messages, each made up of
- * one header buffer and up to N payload buffers. This entire
- * operation is done within two loops:
- *
- * The outer loop mostly deals with creating the Multidata message,
- * as well as the header buffer that gets added to it. It also
- * links the Multidata messages together such that all of them can
- * be sent down to the lower layer in a single putnext call; this
- * linking behavior depends on the tcp_mdt_chain tunable.
- *
- * The inner loop takes an existing Multidata message, and adds
- * one or more (up to tcp_mdt_max_pld) payload buffers to it. It
- * packetizes those buffers by filling up the corresponding header
- * buffer fragments with the proper IP and TCP headers, and by
- * describing the layout of each packet in the packet descriptors
- * that get added to the Multidata.
- */
- do {
- /*
- * If usable send window is too small, or data blocks in
- * transmit list are smaller than our threshold (i.e. app
- * performs large writes followed by small ones), we hand
- * off the control over to the legacy path. Note that we'll
- * get back the control once it encounters a large block.
- */
- if (*usable < mss || (*tail_unsent <= mdt_thres &&
- (*xmit_tail)->b_cont != NULL &&
- MBLKL((*xmit_tail)->b_cont) <= mdt_thres)) {
- /* send down what we've got so far */
- if (md_mp_head != NULL) {
- tcp_multisend_data(tcp, ire, ill, md_mp_head,
- obsegs, obbytes, &rconfirm);
- }
- /*
- * Pass control over to tcp_send(), but tell it to
- * return to us once a large-size transmission is
- * possible.
- */
- TCP_STAT(tcps, tcp_mdt_legacy_small);
- if ((err = tcp_send(q, tcp, mss, tcp_hdr_len,
- tcp_tcp_hdr_len, num_sack_blk, usable, snxt,
- tail_unsent, xmit_tail, local_time,
- mdt_thres)) <= 0) {
- /* burst count reached, or alloc failed */
- IRE_REFRELE(ire);
- return (err);
- }
-
- /* tcp_send() may have sent everything, so check */
- if (*usable <= 0) {
- IRE_REFRELE(ire);
- return (0);
- }
-
- TCP_STAT(tcps, tcp_mdt_legacy_ret);
- /*
- * We may have delivered the Multidata, so make sure
- * to re-initialize before the next round.
- */
- md_mp_head = NULL;
- obsegs = obbytes = 0;
- num_burst_seg = tcp->tcp_snd_burst;
- PREP_NEW_MULTIDATA();
-
- /* are we starting from the beginning of data block? */
- if (*tail_unsent == 0) {
- *xmit_tail = (*xmit_tail)->b_cont;
- ASSERT((uintptr_t)MBLKL(*xmit_tail) <=
- (uintptr_t)INT_MAX);
- *tail_unsent = (int)MBLKL(*xmit_tail);
- }
- }
- /*
- * Record current values for parameters we may need to pass
- * to tcp_send() or tcp_multisend_data(). We checkpoint at
- * each iteration of the outer loop (each multidata message
- * creation). If we have a failure in the inner loop, we send
- * any complete multidata messages we have before reverting
- * to using the traditional non-md path.
- */
- snxt_mmd = *snxt;
- usable_mmd = *usable;
- xmit_tail_mmd = *xmit_tail;
- tail_unsent_mmd = *tail_unsent;
- obsegs_mmd = obsegs;
- obbytes_mmd = obbytes;
-
- /*
- * max_pld limits the number of mblks in tcp's transmit
- * queue that can be added to a Multidata message. Once
- * this counter reaches zero, no more additional mblks
- * can be added to it. What happens afterwards depends
- * on whether or not we are set to chain the Multidata
- * messages. If we are to link them together, reset
- * max_pld to its original value (tcp_mdt_max_pld) and
- * prepare to create a new Multidata message which will
- * get linked to md_mp_head. Else, leave it alone and
- * let the inner loop break on its own.
- */
- if (tcp_mdt_chain && max_pld == 0)
- PREP_NEW_MULTIDATA();
-
- /* adding a payload buffer; re-initialize values */
- if (add_buffer)
- PREP_NEW_PBUF();
-
- /*
- * If we don't have a Multidata, either because we just
- * (re)entered this outer loop, or after we branched off
- * to tcp_send above, setup the Multidata and header
- * buffer to be used.
- */
- if (md_mp == NULL) {
- int md_hbuflen;
- uint32_t start, stuff;
-
- /*
- * Calculate Multidata header buffer size large enough
- * to hold all of the headers that can possibly be
- * sent at this moment. We'd rather over-estimate
- * the size than running out of space; this is okay
- * since this buffer is small anyway.
- */
- md_hbuflen = (howmany(*usable, mss) + 1) * hdr_frag_sz;
-
- /*
- * Start and stuff offset for partial hardware
- * checksum offload; these are currently for IPv4.
- * For full checksum offload, they are set to zero.
- */
- if ((hwcksum_flags & HCK_PARTIALCKSUM)) {
- if (af == AF_INET) {
- start = IP_SIMPLE_HDR_LENGTH;
- stuff = IP_SIMPLE_HDR_LENGTH +
- TCP_CHECKSUM_OFFSET;
- } else {
- start = IPV6_HDR_LEN;
- stuff = IPV6_HDR_LEN +
- TCP_CHECKSUM_OFFSET;
- }
- } else {
- start = stuff = 0;
- }
-
- /*
- * Create the header buffer, Multidata, as well as
- * any necessary attributes (destination address,
- * SAP and hardware checksum offload) that should
- * be associated with the Multidata message.
- */
- ASSERT(cur_hdr_off == 0);
- if ((md_hbuf = allocb(md_hbuflen, BPRI_HI)) == NULL ||
- ((md_hbuf->b_wptr += md_hbuflen),
- (mmd = mmd_alloc(md_hbuf, &md_mp,
- KM_NOSLEEP)) == NULL) || (tcp_mdt_add_attrs(mmd,
- /* fastpath mblk */
- ire->ire_nce->nce_res_mp,
- /* hardware checksum enabled */
- (hwcksum_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)),
- /* hardware checksum offsets */
- start, stuff, 0,
- /* hardware checksum flag */
- hwcksum_flags, tcps) != 0)) {
-legacy_send:
- /*
- * We arrive here from a failure within the
- * inner (packetizer) loop or we fail one of
- * the conditionals above. We restore the
- * previously checkpointed values for:
- * xmit_tail
- * usable
- * tail_unsent
- * snxt
- * obbytes
- * obsegs
- * We should then be able to dispatch any
- * complete multidata before reverting to the
- * traditional path with consistent parameters
- * (the inner loop updates these as it
- * iterates).
- */
- *xmit_tail = xmit_tail_mmd;
- *usable = usable_mmd;
- *tail_unsent = tail_unsent_mmd;
- *snxt = snxt_mmd;
- obbytes = obbytes_mmd;
- obsegs = obsegs_mmd;
- if (md_mp != NULL) {
- /* Unlink message from the chain */
- if (md_mp_head != NULL) {
- err = (intptr_t)rmvb(md_mp_head,
- md_mp);
- /*
- * We can't assert that rmvb
- * did not return -1, since we
- * may get here before linkb
- * happens. We do, however,
- * check if we just removed the
- * only element in the list.
- */
- if (err == 0)
- md_mp_head = NULL;
- }
- /* md_hbuf gets freed automatically */
- TCP_STAT(tcps, tcp_mdt_discarded);
- freeb(md_mp);
- } else {
- /* Either allocb or mmd_alloc failed */
- TCP_STAT(tcps, tcp_mdt_allocfail);
- if (md_hbuf != NULL)
- freeb(md_hbuf);
- }
-
- /* send down what we've got so far */
- if (md_mp_head != NULL) {
- tcp_multisend_data(tcp, ire, ill,
- md_mp_head, obsegs, obbytes,
- &rconfirm);
- }
-legacy_send_no_md:
- if (ire != NULL)
- IRE_REFRELE(ire);
- /*
- * Too bad; let the legacy path handle this.
- * We specify INT_MAX for the threshold, since
- * we gave up with the Multidata processings
- * and let the old path have it all.
- */
- TCP_STAT(tcps, tcp_mdt_legacy_all);
- return (tcp_send(q, tcp, mss, tcp_hdr_len,
- tcp_tcp_hdr_len, num_sack_blk, usable,
- snxt, tail_unsent, xmit_tail, local_time,
- INT_MAX));
- }
-
- /* link to any existing ones, if applicable */
- TCP_STAT(tcps, tcp_mdt_allocd);
- if (md_mp_head == NULL) {
- md_mp_head = md_mp;
- } else if (tcp_mdt_chain) {
- TCP_STAT(tcps, tcp_mdt_linked);
- linkb(md_mp_head, md_mp);
- }
- }
-
- ASSERT(md_mp_head != NULL);
- ASSERT(tcp_mdt_chain || md_mp_head->b_cont == NULL);
- ASSERT(md_mp != NULL && mmd != NULL);
- ASSERT(md_hbuf != NULL);
-
- /*
- * Packetize the transmittable portion of the data block;
- * each data block is essentially added to the Multidata
- * as a payload buffer. We also deal with adding more
- * than one payload buffers, which happens when the remaining
- * packetized portion of the current payload buffer is less
- * than MSS, while the next data block in transmit queue
- * has enough data to make up for one. This "spillover"
- * case essentially creates a split-packet, where portions
- * of the packet's payload fragments may span across two
- * virtually discontiguous address blocks.
- */
- seg_len = mss;
- do {
- len = seg_len;
-
- /* one must remain NULL for DTRACE_IP_FASTPATH */
- ipha = NULL;
- ip6h = NULL;
-
- ASSERT(len > 0);
- ASSERT(max_pld >= 0);
- ASSERT(!add_buffer || cur_pld_off == 0);
-
- /*
- * First time around for this payload buffer; note
- * in the case of a spillover, the following has
- * been done prior to adding the split-packet
- * descriptor to Multidata, and we don't want to
- * repeat the process.
- */
- if (add_buffer) {
- ASSERT(mmd != NULL);
- ASSERT(md_pbuf == NULL);
- ASSERT(md_pbuf_nxt == NULL);
- ASSERT(pbuf_idx == -1 && pbuf_idx_nxt == -1);
-
- /*
- * Have we reached the limit? We'd get to
- * this case when we're not chaining the
- * Multidata messages together, and since
- * we're done, terminate this loop.
- */
- if (max_pld == 0)
- break; /* done */
-
- if ((md_pbuf = dupb(*xmit_tail)) == NULL) {
- TCP_STAT(tcps, tcp_mdt_allocfail);
- goto legacy_send; /* out_of_mem */
- }
-
- if (IS_VMLOANED_MBLK(md_pbuf) && !zcopy &&
- zc_cap != NULL) {
- if (!ip_md_zcopy_attr(mmd, NULL,
- zc_cap->ill_zerocopy_flags)) {
- freeb(md_pbuf);
- TCP_STAT(tcps,
- tcp_mdt_allocfail);
- /* out_of_mem */
- goto legacy_send;
- }
- zcopy = B_TRUE;
- }
-
- md_pbuf->b_rptr += base_pld_off;
-
- /*
- * Add a payload buffer to the Multidata; this
- * operation must not fail, or otherwise our
- * logic in this routine is broken. There
- * is no memory allocation done by the
- * routine, so any returned failure simply
- * tells us that we've done something wrong.
- *
- * A failure tells us that either we're adding
- * the same payload buffer more than once, or
- * we're trying to add more buffers than
- * allowed (max_pld calculation is wrong).
- * None of the above cases should happen, and
- * we panic because either there's horrible
- * heap corruption, and/or programming mistake.
- */
- pbuf_idx = mmd_addpldbuf(mmd, md_pbuf);
- if (pbuf_idx < 0) {
- cmn_err(CE_PANIC, "tcp_multisend: "
- "payload buffer logic error "
- "detected for tcp %p mmd %p "
- "pbuf %p (%d)\n",
- (void *)tcp, (void *)mmd,
- (void *)md_pbuf, pbuf_idx);
- }
-
- ASSERT(max_pld > 0);
- --max_pld;
- add_buffer = B_FALSE;
- }
-
- ASSERT(md_mp_head != NULL);
- ASSERT(md_pbuf != NULL);
- ASSERT(md_pbuf_nxt == NULL);
- ASSERT(pbuf_idx != -1);
- ASSERT(pbuf_idx_nxt == -1);
- ASSERT(*usable > 0);
-
- /*
- * We spillover to the next payload buffer only
- * if all of the following is true:
- *
- * 1. There is not enough data on the current
- * payload buffer to make up `len',
- * 2. We are allowed to send `len',
- * 3. The next payload buffer length is large
- * enough to accomodate `spill'.
- */
- if ((spill = len - *tail_unsent) > 0 &&
- *usable >= len &&
- MBLKL((*xmit_tail)->b_cont) >= spill &&
- max_pld > 0) {
- md_pbuf_nxt = dupb((*xmit_tail)->b_cont);
- if (md_pbuf_nxt == NULL) {
- TCP_STAT(tcps, tcp_mdt_allocfail);
- goto legacy_send; /* out_of_mem */
- }
-
- if (IS_VMLOANED_MBLK(md_pbuf_nxt) && !zcopy &&
- zc_cap != NULL) {
- if (!ip_md_zcopy_attr(mmd, NULL,
- zc_cap->ill_zerocopy_flags)) {
- freeb(md_pbuf_nxt);
- TCP_STAT(tcps,
- tcp_mdt_allocfail);
- /* out_of_mem */
- goto legacy_send;
- }
- zcopy = B_TRUE;
- }
-
- /*
- * See comments above on the first call to
- * mmd_addpldbuf for explanation on the panic.
- */
- pbuf_idx_nxt = mmd_addpldbuf(mmd, md_pbuf_nxt);
- if (pbuf_idx_nxt < 0) {
- panic("tcp_multisend: "
- "next payload buffer logic error "
- "detected for tcp %p mmd %p "
- "pbuf %p (%d)\n",
- (void *)tcp, (void *)mmd,
- (void *)md_pbuf_nxt, pbuf_idx_nxt);
- }
-
- ASSERT(max_pld > 0);
- --max_pld;
- } else if (spill > 0) {
- /*
- * If there's a spillover, but the following
- * xmit_tail couldn't give us enough octets
- * to reach "len", then stop the current
- * Multidata creation and let the legacy
- * tcp_send() path take over. We don't want
- * to send the tiny segment as part of this
- * Multidata for performance reasons; instead,
- * we let the legacy path deal with grouping
- * it with the subsequent small mblks.
- */
- if (*usable >= len &&
- MBLKL((*xmit_tail)->b_cont) < spill) {
- max_pld = 0;
- break; /* done */
- }
-
- /*
- * We can't spillover, and we are near
- * the end of the current payload buffer,
- * so send what's left.
- */
- ASSERT(*tail_unsent > 0);
- len = *tail_unsent;
- }
-
- /* tail_unsent is negated if there is a spillover */
- *tail_unsent -= len;
- *usable -= len;
- ASSERT(*usable >= 0);
-
- if (*usable < mss)
- seg_len = *usable;
- /*
- * Sender SWS avoidance; see comments in tcp_send();
- * everything else is the same, except that we only
- * do this here if there is no more data to be sent
- * following the current xmit_tail. We don't check
- * for 1-byte urgent data because we shouldn't get
- * here if TCP_URG_VALID is set.
- */
- if (*usable > 0 && *usable < mss &&
- ((md_pbuf_nxt == NULL &&
- (*xmit_tail)->b_cont == NULL) ||
- (md_pbuf_nxt != NULL &&
- (*xmit_tail)->b_cont->b_cont == NULL)) &&
- seg_len < (tcp->tcp_max_swnd >> 1) &&
- (tcp->tcp_unsent -
- ((*snxt + len) - tcp->tcp_snxt)) > seg_len &&
- !tcp->tcp_zero_win_probe) {
- if ((*snxt + len) == tcp->tcp_snxt &&
- (*snxt + len) == tcp->tcp_suna) {
- TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
- }
- done = B_TRUE;
- }
-
- /*
- * Prime pump for IP's checksumming on our behalf;
- * include the adjustment for a source route if any.
- * Do this only for software/partial hardware checksum
- * offload, as this field gets zeroed out later for
- * the full hardware checksum offload case.
- */
- if (!(hwcksum_flags & HCK_FULLCKSUM)) {
- cksum = len + tcp_tcp_hdr_len + tcp->tcp_sum;
- cksum = (cksum >> 16) + (cksum & 0xFFFF);
- U16_TO_ABE16(cksum, tcp->tcp_tcph->th_sum);
- }
-
- U32_TO_ABE32(*snxt, tcp->tcp_tcph->th_seq);
- *snxt += len;
-
- tcp->tcp_tcph->th_flags[0] = TH_ACK;
- /*
- * We set the PUSH bit only if TCP has no more buffered
- * data to be transmitted (or if sender SWS avoidance
- * takes place), as opposed to setting it for every
- * last packet in the burst.
- */
- if (done ||
- (tcp->tcp_unsent - (*snxt - tcp->tcp_snxt)) == 0)
- tcp->tcp_tcph->th_flags[0] |= TH_PUSH;
-
- /*
- * Set FIN bit if this is our last segment; snxt
- * already includes its length, and it will not
- * be adjusted after this point.
- */
- if (tcp->tcp_valid_bits == TCP_FSS_VALID &&
- *snxt == tcp->tcp_fss) {
- if (!tcp->tcp_fin_acked) {
- tcp->tcp_tcph->th_flags[0] |= TH_FIN;
- BUMP_MIB(&tcps->tcps_mib,
- tcpOutControl);
- }
- if (!tcp->tcp_fin_sent) {
- tcp->tcp_fin_sent = B_TRUE;
- /*
- * tcp state must be ESTABLISHED
- * in order for us to get here in
- * the first place.
- */
- tcp->tcp_state = TCPS_FIN_WAIT_1;
-
- /*
- * Upon returning from this routine,
- * tcp_wput_data() will set tcp_snxt
- * to be equal to snxt + tcp_fin_sent.
- * This is essentially the same as
- * setting it to tcp_fss + 1.
- */
- }
- }
-
- tcp->tcp_last_sent_len = (ushort_t)len;
-
- len += tcp_hdr_len;
- if (tcp->tcp_ipversion == IPV4_VERSION)
- tcp->tcp_ipha->ipha_length = htons(len);
- else
- tcp->tcp_ip6h->ip6_plen = htons(len -
- ((char *)&tcp->tcp_ip6h[1] -
- tcp->tcp_iphc));
-
- pkt_info->flags = (PDESC_HBUF_REF | PDESC_PBUF_REF);
-
- /* setup header fragment */
- PDESC_HDR_ADD(pkt_info,
- md_hbuf->b_rptr + cur_hdr_off, /* base */
- tcp->tcp_mdt_hdr_head, /* head room */
- tcp_hdr_len, /* len */
- tcp->tcp_mdt_hdr_tail); /* tail room */
-
- ASSERT(pkt_info->hdr_lim - pkt_info->hdr_base ==
- hdr_frag_sz);
- ASSERT(MBLKIN(md_hbuf,
- (pkt_info->hdr_base - md_hbuf->b_rptr),
- PDESC_HDRSIZE(pkt_info)));
-
- /* setup first payload fragment */
- PDESC_PLD_INIT(pkt_info);
- PDESC_PLD_SPAN_ADD(pkt_info,
- pbuf_idx, /* index */
- md_pbuf->b_rptr + cur_pld_off, /* start */
- tcp->tcp_last_sent_len); /* len */
-
- /* create a split-packet in case of a spillover */
- if (md_pbuf_nxt != NULL) {
- ASSERT(spill > 0);
- ASSERT(pbuf_idx_nxt > pbuf_idx);
- ASSERT(!add_buffer);
-
- md_pbuf = md_pbuf_nxt;
- md_pbuf_nxt = NULL;
- pbuf_idx = pbuf_idx_nxt;
- pbuf_idx_nxt = -1;
- cur_pld_off = spill;
-
- /* trim out first payload fragment */
- PDESC_PLD_SPAN_TRIM(pkt_info, 0, spill);
-
- /* setup second payload fragment */
- PDESC_PLD_SPAN_ADD(pkt_info,
- pbuf_idx, /* index */
- md_pbuf->b_rptr, /* start */
- spill); /* len */
-
- if ((*xmit_tail)->b_next == NULL) {
- /*
- * Store the lbolt used for RTT
- * estimation. We can only record one
- * timestamp per mblk so we do it when
- * we reach the end of the payload
- * buffer. Also we only take a new
- * timestamp sample when the previous
- * timed data from the same mblk has
- * been ack'ed.
- */
- (*xmit_tail)->b_prev = local_time;
- (*xmit_tail)->b_next =
- (mblk_t *)(uintptr_t)first_snxt;
- }
-
- first_snxt = *snxt - spill;
-
- /*
- * Advance xmit_tail; usable could be 0 by
- * the time we got here, but we made sure
- * above that we would only spillover to
- * the next data block if usable includes
- * the spilled-over amount prior to the
- * subtraction. Therefore, we are sure
- * that xmit_tail->b_cont can't be NULL.
- */
- ASSERT((*xmit_tail)->b_cont != NULL);
- *xmit_tail = (*xmit_tail)->b_cont;
- ASSERT((uintptr_t)MBLKL(*xmit_tail) <=
- (uintptr_t)INT_MAX);
- *tail_unsent = (int)MBLKL(*xmit_tail) - spill;
- } else {
- cur_pld_off += tcp->tcp_last_sent_len;
- }
-
- /*
- * Fill in the header using the template header, and
- * add options such as time-stamp, ECN and/or SACK,
- * as needed.
- */
- tcp_fill_header(tcp, pkt_info->hdr_rptr,
- (clock_t)local_time, num_sack_blk);
-
- /* take care of some IP header businesses */
- if (af == AF_INET) {
- ipha = (ipha_t *)pkt_info->hdr_rptr;
-
- ASSERT(OK_32PTR((uchar_t *)ipha));
- ASSERT(PDESC_HDRL(pkt_info) >=
- IP_SIMPLE_HDR_LENGTH);
- ASSERT(ipha->ipha_version_and_hdr_length ==
- IP_SIMPLE_HDR_VERSION);
-
- /*
- * Assign ident value for current packet; see
- * related comments in ip_wput_ire() about the
- * contract private interface with clustering
- * group.
- */
- clusterwide = B_FALSE;
- if (cl_inet_ipident != NULL) {
- ASSERT(cl_inet_isclusterwide != NULL);
- if ((*cl_inet_isclusterwide)(stack_id,
- IPPROTO_IP, AF_INET,
- (uint8_t *)(uintptr_t)src, NULL)) {
- ipha->ipha_ident =
- (*cl_inet_ipident)(stack_id,
- IPPROTO_IP, AF_INET,
- (uint8_t *)(uintptr_t)src,
- (uint8_t *)(uintptr_t)dst,
- NULL);
- clusterwide = B_TRUE;
- }
- }
-
- if (!clusterwide) {
- ipha->ipha_ident = (uint16_t)
- atomic_add_32_nv(
- &ire->ire_ident, 1);
- }
-#ifndef _BIG_ENDIAN
- ipha->ipha_ident = (ipha->ipha_ident << 8) |
- (ipha->ipha_ident >> 8);
-#endif
- } else {
- ip6h = (ip6_t *)pkt_info->hdr_rptr;
-
- ASSERT(OK_32PTR((uchar_t *)ip6h));
- ASSERT(IPVER(ip6h) == IPV6_VERSION);
- ASSERT(ip6h->ip6_nxt == IPPROTO_TCP);
- ASSERT(PDESC_HDRL(pkt_info) >=
- (IPV6_HDR_LEN + TCP_CHECKSUM_OFFSET +
- TCP_CHECKSUM_SIZE));
- ASSERT(tcp->tcp_ipversion == IPV6_VERSION);
-
- if (tcp->tcp_ip_forward_progress) {
- rconfirm = B_TRUE;
- tcp->tcp_ip_forward_progress = B_FALSE;
- }
- }
-
- /* at least one payload span, and at most two */
- ASSERT(pkt_info->pld_cnt > 0 && pkt_info->pld_cnt < 3);
-
- /* add the packet descriptor to Multidata */
- if ((pkt = mmd_addpdesc(mmd, pkt_info, &err,
- KM_NOSLEEP)) == NULL) {
- /*
- * Any failure other than ENOMEM indicates
- * that we have passed in invalid pkt_info
- * or parameters to mmd_addpdesc, which must
- * not happen.
- *
- * EINVAL is a result of failure on boundary
- * checks against the pkt_info contents. It
- * should not happen, and we panic because
- * either there's horrible heap corruption,
- * and/or programming mistake.
- */
- if (err != ENOMEM) {
- cmn_err(CE_PANIC, "tcp_multisend: "
- "pdesc logic error detected for "
- "tcp %p mmd %p pinfo %p (%d)\n",
- (void *)tcp, (void *)mmd,
- (void *)pkt_info, err);
- }
- TCP_STAT(tcps, tcp_mdt_addpdescfail);
- goto legacy_send; /* out_of_mem */
- }
- ASSERT(pkt != NULL);
-
- /* calculate IP header and TCP checksums */
- if (af == AF_INET) {
- /* calculate pseudo-header checksum */
- cksum = (dst >> 16) + (dst & 0xFFFF) +
- (src >> 16) + (src & 0xFFFF);
-
- /* offset for TCP header checksum */
- up = IPH_TCPH_CHECKSUMP(ipha,
- IP_SIMPLE_HDR_LENGTH);
- } else {
- up = (uint16_t *)&ip6h->ip6_src;
-
- /* calculate pseudo-header checksum */
- cksum = up[0] + up[1] + up[2] + up[3] +
- up[4] + up[5] + up[6] + up[7] +
- up[8] + up[9] + up[10] + up[11] +
- up[12] + up[13] + up[14] + up[15];
-
- /* Fold the initial sum */
- cksum = (cksum & 0xffff) + (cksum >> 16);
-
- up = (uint16_t *)(((uchar_t *)ip6h) +
- IPV6_HDR_LEN + TCP_CHECKSUM_OFFSET);
- }
-
- if (hwcksum_flags & HCK_FULLCKSUM) {
- /* clear checksum field for hardware */
- *up = 0;
- } else if (hwcksum_flags & HCK_PARTIALCKSUM) {
- uint32_t sum;
-
- /* pseudo-header checksumming */
- sum = *up + cksum + IP_TCP_CSUM_COMP;
- sum = (sum & 0xFFFF) + (sum >> 16);
- *up = (sum & 0xFFFF) + (sum >> 16);
- } else {
- /* software checksumming */
- TCP_STAT(tcps, tcp_out_sw_cksum);
- TCP_STAT_UPDATE(tcps, tcp_out_sw_cksum_bytes,
- tcp->tcp_hdr_len + tcp->tcp_last_sent_len);
- *up = IP_MD_CSUM(pkt, tcp->tcp_ip_hdr_len,
- cksum + IP_TCP_CSUM_COMP);
- if (*up == 0)
- *up = 0xFFFF;
- }
-
- /* IPv4 header checksum */
- if (af == AF_INET) {
- if (hwcksum_flags & HCK_IPV4_HDRCKSUM) {
- ipha->ipha_hdr_checksum = 0;
- } else {
- IP_HDR_CKSUM(ipha, cksum,
- ((uint32_t *)ipha)[0],
- ((uint16_t *)ipha)[4]);
- }
- }
-
- if (af == AF_INET &&
- HOOKS4_INTERESTED_PHYSICAL_OUT(ipst) ||
- af == AF_INET6 &&
- HOOKS6_INTERESTED_PHYSICAL_OUT(ipst)) {
- mblk_t *mp, *mp1;
- uchar_t *hdr_rptr, *hdr_wptr;
- uchar_t *pld_rptr, *pld_wptr;
-
- /*
- * We reconstruct a pseudo packet for the hooks
- * framework using mmd_transform_link().
- * If it is a split packet we pullup the
- * payload. FW_HOOKS expects a pkt comprising
- * of two mblks: a header and the payload.
- */
- if ((mp = mmd_transform_link(pkt)) == NULL) {
- TCP_STAT(tcps, tcp_mdt_allocfail);
- goto legacy_send;
- }
-
- if (pkt_info->pld_cnt > 1) {
- /* split payload, more than one pld */
- if ((mp1 = msgpullup(mp->b_cont, -1)) ==
- NULL) {
- freemsg(mp);
- TCP_STAT(tcps,
- tcp_mdt_allocfail);
- goto legacy_send;
- }
- freemsg(mp->b_cont);
- mp->b_cont = mp1;
- } else {
- mp1 = mp->b_cont;
- }
- ASSERT(mp1 != NULL && mp1->b_cont == NULL);
-
- /*
- * Remember the message offsets. This is so we
- * can detect changes when we return from the
- * FW_HOOKS callbacks.
- */
- hdr_rptr = mp->b_rptr;
- hdr_wptr = mp->b_wptr;
- pld_rptr = mp->b_cont->b_rptr;
- pld_wptr = mp->b_cont->b_wptr;
-
- if (af == AF_INET) {
- DTRACE_PROBE4(
- ip4__physical__out__start,
- ill_t *, NULL,
- ill_t *, ill,
- ipha_t *, ipha,
- mblk_t *, mp);
- FW_HOOKS(
- ipst->ips_ip4_physical_out_event,
- ipst->ips_ipv4firewall_physical_out,
- NULL, ill, ipha, mp, mp, 0, ipst);
- DTRACE_PROBE1(
- ip4__physical__out__end,
- mblk_t *, mp);
- } else {
- DTRACE_PROBE4(
- ip6__physical__out_start,
- ill_t *, NULL,
- ill_t *, ill,
- ip6_t *, ip6h,
- mblk_t *, mp);
- FW_HOOKS6(
- ipst->ips_ip6_physical_out_event,
- ipst->ips_ipv6firewall_physical_out,
- NULL, ill, ip6h, mp, mp, 0, ipst);
- DTRACE_PROBE1(
- ip6__physical__out__end,
- mblk_t *, mp);
- }
-
- if (mp == NULL ||
- (mp1 = mp->b_cont) == NULL ||
- mp->b_rptr != hdr_rptr ||
- mp->b_wptr != hdr_wptr ||
- mp1->b_rptr != pld_rptr ||
- mp1->b_wptr != pld_wptr ||
- mp1->b_cont != NULL) {
- /*
- * We abandon multidata processing and
- * return to the normal path, either
- * when a packet is blocked, or when
- * the boundaries of header buffer or
- * payload buffer have been changed by
- * FW_HOOKS[6].
- */
- if (mp != NULL)
- freemsg(mp);
- goto legacy_send;
- }
- /* Finished with the pseudo packet */
- freemsg(mp);
- }
- DTRACE_IP_FASTPATH(md_hbuf, pkt_info->hdr_rptr,
- ill, ipha, ip6h);
- /* advance header offset */
- cur_hdr_off += hdr_frag_sz;
-
- obbytes += tcp->tcp_last_sent_len;
- ++obsegs;
- } while (!done && *usable > 0 && --num_burst_seg > 0 &&
- *tail_unsent > 0);
-
- if ((*xmit_tail)->b_next == NULL) {
- /*
- * Store the lbolt used for RTT estimation. We can only
- * record one timestamp per mblk so we do it when we
- * reach the end of the payload buffer. Also we only
- * take a new timestamp sample when the previous timed
- * data from the same mblk has been ack'ed.
- */
- (*xmit_tail)->b_prev = local_time;
- (*xmit_tail)->b_next = (mblk_t *)(uintptr_t)first_snxt;
- }
-
- ASSERT(*tail_unsent >= 0);
- if (*tail_unsent > 0) {
- /*
- * We got here because we broke out of the above
- * loop due to of one of the following cases:
- *
- * 1. len < adjusted MSS (i.e. small),
- * 2. Sender SWS avoidance,
- * 3. max_pld is zero.
- *
- * We are done for this Multidata, so trim our
- * last payload buffer (if any) accordingly.
- */
- if (md_pbuf != NULL)
- md_pbuf->b_wptr -= *tail_unsent;
- } else if (*usable > 0) {
- *xmit_tail = (*xmit_tail)->b_cont;
- ASSERT((uintptr_t)MBLKL(*xmit_tail) <=
- (uintptr_t)INT_MAX);
- *tail_unsent = (int)MBLKL(*xmit_tail);
- add_buffer = B_TRUE;
- }
- } while (!done && *usable > 0 && num_burst_seg > 0 &&
- (tcp_mdt_chain || max_pld > 0));
-
- if (md_mp_head != NULL) {
- /* send everything down */
- tcp_multisend_data(tcp, ire, ill, md_mp_head, obsegs, obbytes,
- &rconfirm);
- }
-
-#undef PREP_NEW_MULTIDATA
-#undef PREP_NEW_PBUF
-#undef IPVER
-
- IRE_REFRELE(ire);
- return (0);
-}
-
-/*
- * A wrapper function for sending one or more Multidata messages down to
- * the module below ip; this routine does not release the reference of the
- * IRE (caller does that). This routine is analogous to tcp_send_data().
- */
-static void
-tcp_multisend_data(tcp_t *tcp, ire_t *ire, const ill_t *ill, mblk_t *md_mp_head,
- const uint_t obsegs, const uint_t obbytes, boolean_t *rconfirm)
+tcp_send(tcp_t *tcp, const int mss, const int total_hdr_len,
+ const int tcp_hdr_len, const int num_sack_blk, int *usable,
+ uint_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time)
{
- uint64_t delta;
- nce_t *nce;
- tcp_stack_t *tcps = tcp->tcp_tcps;
- ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;
-
- ASSERT(ire != NULL && ill != NULL);
- ASSERT(ire->ire_stq != NULL);
- ASSERT(md_mp_head != NULL);
- ASSERT(rconfirm != NULL);
-
- /* adjust MIBs and IRE timestamp */
- DTRACE_PROBE2(tcp__trace__send, mblk_t *, md_mp_head, tcp_t *, tcp);
- tcp->tcp_obsegs += obsegs;
- UPDATE_MIB(&tcps->tcps_mib, tcpOutDataSegs, obsegs);
- UPDATE_MIB(&tcps->tcps_mib, tcpOutDataBytes, obbytes);
- TCP_STAT_UPDATE(tcps, tcp_mdt_pkt_out, obsegs);
-
- if (tcp->tcp_ipversion == IPV4_VERSION) {
- TCP_STAT_UPDATE(tcps, tcp_mdt_pkt_out_v4, obsegs);
- } else {
- TCP_STAT_UPDATE(tcps, tcp_mdt_pkt_out_v6, obsegs);
- }
- UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests, obsegs);
- UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutTransmits, obsegs);
- UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets, obbytes);
-
- ire->ire_ob_pkt_count += obsegs;
- if (ire->ire_ipif != NULL)
- atomic_add_32(&ire->ire_ipif->ipif_ob_pkt_count, obsegs);
- ire->ire_last_used_time = lbolt;
-
- if ((tcp->tcp_ipversion == IPV4_VERSION &&
- ipst->ips_ip4_observe.he_interested) ||
- (tcp->tcp_ipversion == IPV6_VERSION &&
- ipst->ips_ip6_observe.he_interested)) {
- multidata_t *dlmdp = mmd_getmultidata(md_mp_head);
- pdesc_t *dl_pkt;
- pdescinfo_t pinfo;
- mblk_t *nmp;
- zoneid_t szone = tcp->tcp_connp->conn_zoneid;
-
- for (dl_pkt = mmd_getfirstpdesc(dlmdp, &pinfo);
- (dl_pkt != NULL);
- dl_pkt = mmd_getnextpdesc(dl_pkt, &pinfo)) {
- if ((nmp = mmd_transform_link(dl_pkt)) == NULL)
- continue;
- ipobs_hook(nmp, IPOBS_HOOK_OUTBOUND, szone,
- ALL_ZONES, ill, ipst);
- freemsg(nmp);
- }
- }
-
- /* send it down */
- putnext(ire->ire_stq, md_mp_head);
-
- /* we're done for TCP/IPv4 */
- if (tcp->tcp_ipversion == IPV4_VERSION)
- return;
-
- nce = ire->ire_nce;
-
- ASSERT(nce != NULL);
- ASSERT(!(nce->nce_flags & (NCE_F_NONUD|NCE_F_PERMANENT)));
- ASSERT(nce->nce_state != ND_INCOMPLETE);
-
- /* reachability confirmation? */
- if (*rconfirm) {
- nce->nce_last = TICK_TO_MSEC(lbolt64);
- if (nce->nce_state != ND_REACHABLE) {
- mutex_enter(&nce->nce_lock);
- nce->nce_state = ND_REACHABLE;
- nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT;
- mutex_exit(&nce->nce_lock);
- (void) untimeout(nce->nce_timeout_id);
- if (ip_debug > 2) {
- /* ip1dbg */
- pr_addr_dbg("tcp_multisend_data: state "
- "for %s changed to REACHABLE\n",
- AF_INET6, &ire->ire_addr_v6);
- }
- }
- /* reset transport reachability confirmation */
- *rconfirm = B_FALSE;
- }
-
- delta = TICK_TO_MSEC(lbolt64) - nce->nce_last;
- ip1dbg(("tcp_multisend_data: delta = %" PRId64
- " ill_reachable_time = %d \n", delta, ill->ill_reachable_time));
-
- if (delta > (uint64_t)ill->ill_reachable_time) {
- mutex_enter(&nce->nce_lock);
- switch (nce->nce_state) {
- case ND_REACHABLE:
- case ND_STALE:
- /*
- * ND_REACHABLE is identical to ND_STALE in this
- * specific case. If reachable time has expired for
- * this neighbor (delta is greater than reachable
- * time), conceptually, the neighbor cache is no
- * longer in REACHABLE state, but already in STALE
- * state. So the correct transition here is to
- * ND_DELAY.
- */
- nce->nce_state = ND_DELAY;
- mutex_exit(&nce->nce_lock);
- NDP_RESTART_TIMER(nce,
- ipst->ips_delay_first_probe_time);
- if (ip_debug > 3) {
- /* ip2dbg */
- pr_addr_dbg("tcp_multisend_data: state "
- "for %s changed to DELAY\n",
- AF_INET6, &ire->ire_addr_v6);
- }
- break;
- case ND_DELAY:
- case ND_PROBE:
- mutex_exit(&nce->nce_lock);
- /* Timers have already started */
- break;
- case ND_UNREACHABLE:
- /*
- * ndp timer has detected that this nce is
- * unreachable and initiated deleting this nce
- * and all its associated IREs. This is a race
- * where we found the ire before it was deleted
- * and have just sent out a packet using this
- * unreachable nce.
- */
- mutex_exit(&nce->nce_lock);
- break;
- default:
- ASSERT(0);
- }
- }
-}
-
-/*
- * Derived from tcp_send_data().
- */
-static void
-tcp_lsosend_data(tcp_t *tcp, mblk_t *mp, ire_t *ire, ill_t *ill, const int mss,
- int num_lso_seg)
-{
- ipha_t *ipha;
- mblk_t *ire_fp_mp;
- uint_t ire_fp_mp_len;
- uint32_t hcksum_txflags = 0;
- ipaddr_t src;
- ipaddr_t dst;
- uint32_t cksum;
- uint16_t *up;
- tcp_stack_t *tcps = tcp->tcp_tcps;
- ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;
-
- ASSERT(DB_TYPE(mp) == M_DATA);
- ASSERT(tcp->tcp_state == TCPS_ESTABLISHED);
- ASSERT(tcp->tcp_ipversion == IPV4_VERSION);
- ASSERT(tcp->tcp_connp != NULL);
- ASSERT(CONN_IS_LSO_MD_FASTPATH(tcp->tcp_connp));
-
- ipha = (ipha_t *)mp->b_rptr;
- src = ipha->ipha_src;
- dst = ipha->ipha_dst;
-
- DTRACE_PROBE2(tcp__trace__send, mblk_t *, mp, tcp_t *, tcp);
-
- ASSERT(ipha->ipha_ident == 0 || ipha->ipha_ident == IP_HDR_INCLUDED);
- ipha->ipha_ident = (uint16_t)atomic_add_32_nv(&ire->ire_ident,
- num_lso_seg);
-#ifndef _BIG_ENDIAN
- ipha->ipha_ident = (ipha->ipha_ident << 8) | (ipha->ipha_ident >> 8);
-#endif
- if (tcp->tcp_snd_zcopy_aware) {
- if ((ill->ill_capabilities & ILL_CAPAB_ZEROCOPY) == 0 ||
- (ill->ill_zerocopy_capab->ill_zerocopy_flags == 0))
- mp = tcp_zcopy_disable(tcp, mp);
- }
-
- if (ILL_HCKSUM_CAPABLE(ill) && dohwcksum) {
- ASSERT(ill->ill_hcksum_capab != NULL);
- hcksum_txflags = ill->ill_hcksum_capab->ill_hcksum_txflags;
- }
-
- /*
- * Since the TCP checksum should be recalculated by h/w, we can just
- * zero the checksum field for HCK_FULLCKSUM, or calculate partial
- * pseudo-header checksum for HCK_PARTIALCKSUM.
- * The partial pseudo-header excludes TCP length, that was calculated
- * in tcp_send(), so to zero *up before further processing.
- */
- cksum = (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF);
-
- up = IPH_TCPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH);
- *up = 0;
-
- IP_CKSUM_XMIT_FAST(ire->ire_ipversion, hcksum_txflags, mp, ipha, up,
- IPPROTO_TCP, IP_SIMPLE_HDR_LENGTH, ntohs(ipha->ipha_length), cksum);
-
- /*
- * Append LSO flags and mss to the mp.
- */
- lso_info_set(mp, mss, HW_LSO);
-
- ipha->ipha_fragment_offset_and_flags |=
- (uint32_t)htons(ire->ire_frag_flag);
-
- ire_fp_mp = ire->ire_nce->nce_fp_mp;
- ire_fp_mp_len = MBLKL(ire_fp_mp);
- ASSERT(DB_TYPE(ire_fp_mp) == M_DATA);
- mp->b_rptr = (uchar_t *)ipha - ire_fp_mp_len;
- bcopy(ire_fp_mp->b_rptr, mp->b_rptr, ire_fp_mp_len);
-
- UPDATE_OB_PKT_COUNT(ire);
- ire->ire_last_used_time = lbolt;
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests);
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutTransmits);
- UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets,
- ntohs(ipha->ipha_length));
-
- DTRACE_PROBE4(ip4__physical__out__start,
- ill_t *, NULL, ill_t *, ill, ipha_t *, ipha, mblk_t *, mp);
- FW_HOOKS(ipst->ips_ip4_physical_out_event,
- ipst->ips_ipv4firewall_physical_out, NULL,
- ill, ipha, mp, mp, 0, ipst);
- DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp);
- DTRACE_IP_FASTPATH(mp, ipha, ill, ipha, NULL);
-
- if (mp != NULL) {
- if (ipst->ips_ip4_observe.he_interested) {
- zoneid_t szone;
-
- if (ire_fp_mp_len != 0)
- mp->b_rptr += ire_fp_mp_len;
- szone = ip_get_zoneid_v4(ipha->ipha_src, mp,
- ipst, ALL_ZONES);
- ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone,
- ALL_ZONES, ill, ipst);
- if (ire_fp_mp_len != 0)
- mp->b_rptr -= ire_fp_mp_len;
- }
-
- ILL_SEND_TX(ill, ire, tcp->tcp_connp, mp, 0, NULL);
- }
-}
-
-/*
- * tcp_send() is called by tcp_wput_data() for non-Multidata transmission
- * scheme, and returns one of the following:
- *
- * -1 = failed allocation.
- * 0 = success; burst count reached, or usable send window is too small,
- * and that we'd rather wait until later before sending again.
- * 1 = success; we are called from tcp_multisend(), and both usable send
- * window and tail_unsent are greater than the MDT threshold, and thus
- * Multidata Transmit should be used instead.
- */
-static int
-tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len,
- const int tcp_tcp_hdr_len, const int num_sack_blk, int *usable,
- uint_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time,
- const int mdt_thres)
-{
- int num_burst_seg = tcp->tcp_snd_burst;
- ire_t *ire = NULL;
- ill_t *ill = NULL;
- mblk_t *ire_fp_mp = NULL;
- uint_t ire_fp_mp_len = 0;
+ int num_burst_seg = tcp->tcp_snd_burst;
int num_lso_seg = 1;
uint_t lso_usable;
boolean_t do_lso_send = B_FALSE;
tcp_stack_t *tcps = tcp->tcp_tcps;
+ conn_t *connp = tcp->tcp_connp;
+ ip_xmit_attr_t *ixa = connp->conn_ixa;
/*
- * Check LSO capability before any further work. And the similar check
- * need to be done in for(;;) loop.
- * LSO will be deployed when therer is more than one mss of available
- * data and a burst transmission is allowed.
+	 * Check LSO possibility. The value of tcp->tcp_lso indicates whether
+	 * the underlying connection is LSO capable. Whether there is enough
+	 * available data to initiate an LSO transmission is checked inside
+	 * the for(){} loops below.
*/
- if (tcp->tcp_lso &&
- (tcp->tcp_valid_bits == 0 ||
- tcp->tcp_valid_bits == TCP_FSS_VALID) &&
- num_burst_seg >= 2 && (*usable - 1) / mss >= 1) {
- /*
- * Try to find usable IRE/ILL and do basic check to the ILL.
- * Double check LSO usability before going further, since the
- * underlying interface could have been changed. In case of any
- * change of LSO capability, set tcp_ire_ill_check_done to
- * B_FALSE to force to check the ILL with the next send.
- */
- if (tcp_send_find_ire_ill(tcp, NULL, &ire, &ill) &&
- tcp->tcp_lso && ILL_LSO_TCP_USABLE(ill)) {
- /*
- * Enable LSO with this transmission.
- * Since IRE has been hold in tcp_send_find_ire_ill(),
- * IRE_REFRELE(ire) should be called before return.
- */
+ if (tcp->tcp_lso && (tcp->tcp_valid_bits & ~TCP_FSS_VALID) == 0)
do_lso_send = B_TRUE;
- ire_fp_mp = ire->ire_nce->nce_fp_mp;
- ire_fp_mp_len = MBLKL(ire_fp_mp);
- /* Round up to multiple of 4 */
- ire_fp_mp_len = ((ire_fp_mp_len + 3) / 4) * 4;
- } else {
- tcp->tcp_lso = B_FALSE;
- tcp->tcp_ire_ill_check_done = B_FALSE;
- do_lso_send = B_FALSE;
- ill = NULL;
- }
- }
for (;;) {
struct datab *db;
- tcph_t *tcph;
+ tcpha_t *tcpha;
uint32_t sum;
mblk_t *mp, *mp1;
uchar_t *rptr;
int len;
/*
- * If we're called by tcp_multisend(), and the amount of
- * sendable data as well as the size of current xmit_tail
- * is beyond the MDT threshold, return to the caller and
- * let the large data transmit be done using MDT.
+ * Burst count reached, return successfully.
*/
- if (*usable > 0 && *usable > mdt_thres &&
- (*tail_unsent > mdt_thres || (*tail_unsent == 0 &&
- MBLKL((*xmit_tail)->b_cont) > mdt_thres))) {
- ASSERT(tcp->tcp_mdt);
- return (1); /* success; do large send */
- }
-
if (num_burst_seg == 0)
- break; /* success; burst count reached */
+ break;
/*
- * Calculate the maximum payload length we can send in *one*
+ * Calculate the maximum payload length we can send at one
* time.
*/
if (do_lso_send) {
/*
- * Check whether need to do LSO any more.
+			 * Check whether we are able to do LSO for the
+			 * currently available data.
*/
if (num_burst_seg >= 2 && (*usable - 1) / mss >= 1) {
lso_usable = MIN(tcp->tcp_lso_max, *usable);
@@ -20787,7 +15918,10 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len,
}
ASSERT(num_lso_seg <= IP_MAXPACKET / mss + 1);
-
+#ifdef DEBUG
+ DTRACE_PROBE2(tcp_send_lso, int, num_lso_seg, boolean_t,
+ do_lso_send);
+#endif
/*
* Adjust num_burst_seg here.
*/
@@ -20817,7 +15951,7 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len,
/*
* If the retransmit timer is not running
* we start it so that we will retransmit
- * in the case when the the receiver has
+ * in the case when the receiver has
* decremented the window.
*/
if (*snxt == tcp->tcp_snxt &&
@@ -20838,7 +15972,7 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len,
}
}
- tcph = tcp->tcp_tcph;
+ tcpha = tcp->tcp_tcpha;
/*
* The reason to adjust len here is that we need to set flags
@@ -20849,19 +15983,25 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len,
*usable -= len; /* Approximate - can be adjusted later */
if (*usable > 0)
- tcph->th_flags[0] = TH_ACK;
+ tcpha->tha_flags = TH_ACK;
else
- tcph->th_flags[0] = (TH_ACK | TH_PUSH);
+ tcpha->tha_flags = (TH_ACK | TH_PUSH);
/*
- * Prime pump for IP's checksumming on our behalf
+ * Prime pump for IP's checksumming on our behalf.
* Include the adjustment for a source route if any.
+		 * In case of LSO, the partial pseudo-header checksum must
+		 * exclude the TCP length, so zero tha_sum before IP computes
+		 * the pseudo-header checksum for partial checksum offload.
*/
- sum = len + tcp_tcp_hdr_len + tcp->tcp_sum;
- sum = (sum >> 16) + (sum & 0xFFFF);
- U16_TO_ABE16(sum, tcph->th_sum);
-
- U32_TO_ABE32(*snxt, tcph->th_seq);
+ if (do_lso_send) {
+ sum = 0;
+ } else {
+ sum = len + tcp_hdr_len + connp->conn_sum;
+ sum = (sum >> 16) + (sum & 0xFFFF);
+ }
+ tcpha->tha_sum = htons(sum);
+ tcpha->tha_seq = htonl(*snxt);
/*
* Branch off to tcp_xmit_mp() if any of the VALID bits is
@@ -20907,8 +16047,6 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len,
(*xmit_tail)->b_rptr = prev_rptr;
if (mp == NULL) {
- if (ire != NULL)
- IRE_REFRELE(ire);
return (-1);
}
mp1 = mp->b_cont;
@@ -20927,7 +16065,7 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len,
BUMP_LOCAL(tcp->tcp_obsegs);
BUMP_MIB(&tcps->tcps_mib, tcpOutDataSegs);
UPDATE_MIB(&tcps->tcps_mib, tcpOutDataBytes, len);
- tcp_send_data(tcp, q, mp);
+ tcp_send_data(tcp, mp);
continue;
}
@@ -20942,18 +16080,18 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len,
*tail_unsent -= len;
if (len <= mss) /* LSO is unusable */
tcp->tcp_last_sent_len = (ushort_t)len;
- len += tcp_hdr_len;
- if (tcp->tcp_ipversion == IPV4_VERSION)
+ len += total_hdr_len;
+ ixa->ixa_pktlen = len;
+
+ if (ixa->ixa_flags & IXAF_IS_IPV4) {
tcp->tcp_ipha->ipha_length = htons(len);
- else
+ } else {
tcp->tcp_ip6h->ip6_plen =
- htons(len -
- ((char *)&tcp->tcp_ip6h[1] -
- tcp->tcp_iphc));
+ htons(len - IPV6_HDR_LEN);
+ }
+
mp = dupb(*xmit_tail);
if (mp == NULL) {
- if (ire != NULL)
- IRE_REFRELE(ire);
return (-1); /* out_of_mem */
}
mp->b_rptr = rptr;
@@ -20983,21 +16121,21 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len,
if (len <= mss) /* LSO is unusable (!do_lso_send) */
tcp->tcp_last_sent_len = (ushort_t)len;
- len += tcp_hdr_len;
- if (tcp->tcp_ipversion == IPV4_VERSION)
+ len += total_hdr_len;
+ ixa->ixa_pktlen = len;
+
+ if (ixa->ixa_flags & IXAF_IS_IPV4) {
tcp->tcp_ipha->ipha_length = htons(len);
- else
- tcp->tcp_ip6h->ip6_plen = htons(len -
- ((char *)&tcp->tcp_ip6h[1] - tcp->tcp_iphc));
+ } else {
+ tcp->tcp_ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
+ }
mp = dupb(*xmit_tail);
if (mp == NULL) {
- if (ire != NULL)
- IRE_REFRELE(ire);
return (-1); /* out_of_mem */
}
- len = tcp_hdr_len;
+ len = total_hdr_len;
/*
* There are four reasons to allocate a new hdr mblk:
* 1) The bytes above us are in use by another packet
@@ -21008,24 +16146,21 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len,
rptr = mp->b_rptr - len;
if (!OK_32PTR(rptr) ||
((db = mp->b_datap), db->db_ref != 2) ||
- rptr < db->db_base + ire_fp_mp_len) {
+ rptr < db->db_base) {
/* NOTE: we assume allocb returns an OK_32PTR */
must_alloc:;
- mp1 = allocb(tcp->tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH +
- tcps->tcps_wroff_xtra + ire_fp_mp_len, BPRI_MED);
+ mp1 = allocb(connp->conn_ht_iphc_allocated +
+ tcps->tcps_wroff_xtra, BPRI_MED);
if (mp1 == NULL) {
freemsg(mp);
- if (ire != NULL)
- IRE_REFRELE(ire);
return (-1); /* out_of_mem */
}
mp1->b_cont = mp;
mp = mp1;
/* Leave room for Link Level header */
- len = tcp_hdr_len;
- rptr =
- &mp->b_rptr[tcps->tcps_wroff_xtra + ire_fp_mp_len];
+ len = total_hdr_len;
+ rptr = &mp->b_rptr[tcps->tcps_wroff_xtra];
mp->b_wptr = &rptr[len];
}
@@ -21057,18 +16192,17 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len,
/*
* Excess data in mblk; can we split it?
- * If MDT is enabled for the connection,
+ * If LSO is enabled for the connection,
* keep on splitting as this is a transient
* send path.
*/
- if (!do_lso_send && !tcp->tcp_mdt &&
- (spill + nmpsz > 0)) {
+ if (!do_lso_send && (spill + nmpsz > 0)) {
/*
* Don't split if stream head was
* told to break up larger writes
* into smaller ones.
*/
- if (tcp->tcp_maxpsz > 0)
+ if (tcp->tcp_maxpsz_multiplier > 0)
break;
/*
@@ -21096,8 +16230,6 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len,
if (mp1 == NULL) {
*tail_unsent = spill;
freemsg(mp);
- if (ire != NULL)
- IRE_REFRELE(ire);
return (-1); /* out_of_mem */
}
}
@@ -21119,11 +16251,12 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len,
/*
* Adjust the checksum
*/
- tcph = (tcph_t *)(rptr + tcp->tcp_ip_hdr_len);
+ tcpha = (tcpha_t *)(rptr +
+ ixa->ixa_ip_hdr_length);
sum += spill;
sum = (sum >> 16) + (sum & 0xFFFF);
- U16_TO_ABE16(sum, tcph->th_sum);
- if (tcp->tcp_ipversion == IPV4_VERSION) {
+ tcpha->tha_sum = htons(sum);
+ if (connp->conn_ipversion == IPV4_VERSION) {
sum = ntohs(
((ipha_t *)rptr)->ipha_length) +
spill;
@@ -21136,311 +16269,55 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len,
((ip6_t *)rptr)->ip6_plen =
htons(sum);
}
+ ixa->ixa_pktlen += spill;
*tail_unsent = 0;
}
}
if (tcp->tcp_ip_forward_progress) {
- ASSERT(tcp->tcp_ipversion == IPV6_VERSION);
- *(uint32_t *)mp->b_rptr |= IP_FORWARD_PROG;
tcp->tcp_ip_forward_progress = B_FALSE;
+ ixa->ixa_flags |= IXAF_REACH_CONF;
+ } else {
+ ixa->ixa_flags &= ~IXAF_REACH_CONF;
}
+ /*
+ * Append LSO information, both flags and mss, to the mp.
+ */
if (do_lso_send) {
- tcp_lsosend_data(tcp, mp, ire, ill, mss,
- num_lso_seg);
- tcp->tcp_obsegs += num_lso_seg;
+ lso_info_set(mp, mss, HW_LSO);
+ ixa->ixa_fragsize = IP_MAXPACKET;
+ ixa->ixa_extra_ident = num_lso_seg - 1;
+ DTRACE_PROBE2(tcp_send_lso, int, num_lso_seg,
+ boolean_t, B_TRUE);
+
+ tcp_send_data(tcp, mp);
+
+ /*
+ * Restore values of ixa_fragsize and ixa_extra_ident.
+ */
+ ixa->ixa_fragsize = ixa->ixa_pmtu;
+ ixa->ixa_extra_ident = 0;
+ tcp->tcp_obsegs += num_lso_seg;
TCP_STAT(tcps, tcp_lso_times);
TCP_STAT_UPDATE(tcps, tcp_lso_pkt_out, num_lso_seg);
} else {
- tcp_send_data(tcp, q, mp);
+ tcp_send_data(tcp, mp);
BUMP_LOCAL(tcp->tcp_obsegs);
}
}
- if (ire != NULL)
- IRE_REFRELE(ire);
return (0);
}
-/* Unlink and return any mblk that looks like it contains a MDT info */
-static mblk_t *
-tcp_mdt_info_mp(mblk_t *mp)
-{
- mblk_t *prev_mp;
-
- for (;;) {
- prev_mp = mp;
- /* no more to process? */
- if ((mp = mp->b_cont) == NULL)
- break;
-
- switch (DB_TYPE(mp)) {
- case M_CTL:
- if (*(uint32_t *)mp->b_rptr != MDT_IOC_INFO_UPDATE)
- continue;
- ASSERT(prev_mp != NULL);
- prev_mp->b_cont = mp->b_cont;
- mp->b_cont = NULL;
- return (mp);
- default:
- break;
- }
- }
- return (mp);
-}
-
-/* MDT info update routine, called when IP notifies us about MDT */
-static void
-tcp_mdt_update(tcp_t *tcp, ill_mdt_capab_t *mdt_capab, boolean_t first)
-{
- boolean_t prev_state;
- tcp_stack_t *tcps = tcp->tcp_tcps;
-
- /*
- * IP is telling us to abort MDT on this connection? We know
- * this because the capability is only turned off when IP
- * encounters some pathological cases, e.g. link-layer change
- * where the new driver doesn't support MDT, or in situation
- * where MDT usage on the link-layer has been switched off.
- * IP would not have sent us the initial MDT_IOC_INFO_UPDATE
- * if the link-layer doesn't support MDT, and if it does, it
- * will indicate that the feature is to be turned on.
- */
- prev_state = tcp->tcp_mdt;
- tcp->tcp_mdt = (mdt_capab->ill_mdt_on != 0);
- if (!tcp->tcp_mdt && !first) {
- TCP_STAT(tcps, tcp_mdt_conn_halted3);
- ip1dbg(("tcp_mdt_update: disabling MDT for connp %p\n",
- (void *)tcp->tcp_connp));
- }
-
- /*
- * We currently only support MDT on simple TCP/{IPv4,IPv6},
- * so disable MDT otherwise. The checks are done here
- * and in tcp_wput_data().
- */
- if (tcp->tcp_mdt &&
- (tcp->tcp_ipversion == IPV4_VERSION &&
- tcp->tcp_ip_hdr_len != IP_SIMPLE_HDR_LENGTH) ||
- (tcp->tcp_ipversion == IPV6_VERSION &&
- tcp->tcp_ip_hdr_len != IPV6_HDR_LEN))
- tcp->tcp_mdt = B_FALSE;
-
- if (tcp->tcp_mdt) {
- if (mdt_capab->ill_mdt_version != MDT_VERSION_2) {
- cmn_err(CE_NOTE, "tcp_mdt_update: unknown MDT "
- "version (%d), expected version is %d",
- mdt_capab->ill_mdt_version, MDT_VERSION_2);
- tcp->tcp_mdt = B_FALSE;
- return;
- }
-
- /*
- * We need the driver to be able to handle at least three
- * spans per packet in order for tcp MDT to be utilized.
- * The first is for the header portion, while the rest are
- * needed to handle a packet that straddles across two
- * virtually non-contiguous buffers; a typical tcp packet
- * therefore consists of only two spans. Note that we take
- * a zero as "don't care".
- */
- if (mdt_capab->ill_mdt_span_limit > 0 &&
- mdt_capab->ill_mdt_span_limit < 3) {
- tcp->tcp_mdt = B_FALSE;
- return;
- }
-
- /* a zero means driver wants default value */
- tcp->tcp_mdt_max_pld = MIN(mdt_capab->ill_mdt_max_pld,
- tcps->tcps_mdt_max_pbufs);
- if (tcp->tcp_mdt_max_pld == 0)
- tcp->tcp_mdt_max_pld = tcps->tcps_mdt_max_pbufs;
-
- /* ensure 32-bit alignment */
- tcp->tcp_mdt_hdr_head = roundup(MAX(tcps->tcps_mdt_hdr_head_min,
- mdt_capab->ill_mdt_hdr_head), 4);
- tcp->tcp_mdt_hdr_tail = roundup(MAX(tcps->tcps_mdt_hdr_tail_min,
- mdt_capab->ill_mdt_hdr_tail), 4);
-
- if (!first && !prev_state) {
- TCP_STAT(tcps, tcp_mdt_conn_resumed2);
- ip1dbg(("tcp_mdt_update: reenabling MDT for connp %p\n",
- (void *)tcp->tcp_connp));
- }
- }
-}
-
-/* Unlink and return any mblk that looks like it contains a LSO info */
-static mblk_t *
-tcp_lso_info_mp(mblk_t *mp)
-{
- mblk_t *prev_mp;
-
- for (;;) {
- prev_mp = mp;
- /* no more to process? */
- if ((mp = mp->b_cont) == NULL)
- break;
-
- switch (DB_TYPE(mp)) {
- case M_CTL:
- if (*(uint32_t *)mp->b_rptr != LSO_IOC_INFO_UPDATE)
- continue;
- ASSERT(prev_mp != NULL);
- prev_mp->b_cont = mp->b_cont;
- mp->b_cont = NULL;
- return (mp);
- default:
- break;
- }
- }
-
- return (mp);
-}
-
-/* LSO info update routine, called when IP notifies us about LSO */
-static void
-tcp_lso_update(tcp_t *tcp, ill_lso_capab_t *lso_capab)
-{
- tcp_stack_t *tcps = tcp->tcp_tcps;
-
- /*
- * IP is telling us to abort LSO on this connection? We know
- * this because the capability is only turned off when IP
- * encounters some pathological cases, e.g. link-layer change
- * where the new NIC/driver doesn't support LSO, or in situation
- * where LSO usage on the link-layer has been switched off.
- * IP would not have sent us the initial LSO_IOC_INFO_UPDATE
- * if the link-layer doesn't support LSO, and if it does, it
- * will indicate that the feature is to be turned on.
- */
- tcp->tcp_lso = (lso_capab->ill_lso_on != 0);
- TCP_STAT(tcps, tcp_lso_enabled);
-
- /*
- * We currently only support LSO on simple TCP/IPv4,
- * so disable LSO otherwise. The checks are done here
- * and in tcp_wput_data().
- */
- if (tcp->tcp_lso &&
- (tcp->tcp_ipversion == IPV4_VERSION &&
- tcp->tcp_ip_hdr_len != IP_SIMPLE_HDR_LENGTH) ||
- (tcp->tcp_ipversion == IPV6_VERSION)) {
- tcp->tcp_lso = B_FALSE;
- TCP_STAT(tcps, tcp_lso_disabled);
- } else {
- tcp->tcp_lso_max = MIN(TCP_MAX_LSO_LENGTH,
- lso_capab->ill_lso_max);
- }
-}
-
-static void
-tcp_ire_ill_check(tcp_t *tcp, ire_t *ire, ill_t *ill, boolean_t check_lso_mdt)
-{
- conn_t *connp = tcp->tcp_connp;
- tcp_stack_t *tcps = tcp->tcp_tcps;
- ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;
-
- ASSERT(ire != NULL);
-
- /*
- * We may be in the fastpath here, and although we essentially do
- * similar checks as in ip_bind_connected{_v6}/ip_xxinfo_return,
- * we try to keep things as brief as possible. After all, these
- * are only best-effort checks, and we do more thorough ones prior
- * to calling tcp_send()/tcp_multisend().
- */
- if ((ipst->ips_ip_lso_outbound || ipst->ips_ip_multidata_outbound) &&
- check_lso_mdt && !(ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) &&
- ill != NULL && !CONN_IPSEC_OUT_ENCAPSULATED(connp) &&
- !(ire->ire_flags & RTF_MULTIRT) &&
- !IPP_ENABLED(IPP_LOCAL_OUT, ipst) &&
- CONN_IS_LSO_MD_FASTPATH(connp)) {
- if (ipst->ips_ip_lso_outbound && ILL_LSO_CAPABLE(ill)) {
- /* Cache the result */
- connp->conn_lso_ok = B_TRUE;
-
- ASSERT(ill->ill_lso_capab != NULL);
- if (!ill->ill_lso_capab->ill_lso_on) {
- ill->ill_lso_capab->ill_lso_on = 1;
- ip1dbg(("tcp_ire_ill_check: connp %p enables "
- "LSO for interface %s\n", (void *)connp,
- ill->ill_name));
- }
- tcp_lso_update(tcp, ill->ill_lso_capab);
- } else if (ipst->ips_ip_multidata_outbound &&
- ILL_MDT_CAPABLE(ill)) {
- /* Cache the result */
- connp->conn_mdt_ok = B_TRUE;
-
- ASSERT(ill->ill_mdt_capab != NULL);
- if (!ill->ill_mdt_capab->ill_mdt_on) {
- ill->ill_mdt_capab->ill_mdt_on = 1;
- ip1dbg(("tcp_ire_ill_check: connp %p enables "
- "MDT for interface %s\n", (void *)connp,
- ill->ill_name));
- }
- tcp_mdt_update(tcp, ill->ill_mdt_capab, B_TRUE);
- }
- }
-
- /*
- * The goal is to reduce the number of generated tcp segments by
- * setting the maxpsz multiplier to 0; this will have an affect on
- * tcp_maxpsz_set(). With this behavior, tcp will pack more data
- * into each packet, up to SMSS bytes. Doing this reduces the number
- * of outbound segments and incoming ACKs, thus allowing for better
- * network and system performance. In contrast the legacy behavior
- * may result in sending less than SMSS size, because the last mblk
- * for some packets may have more data than needed to make up SMSS,
- * and the legacy code refused to "split" it.
- *
- * We apply the new behavior on following situations:
- *
- * 1) Loopback connections,
- * 2) Connections in which the remote peer is not on local subnet,
- * 3) Local subnet connections over the bge interface (see below).
- *
- * Ideally, we would like this behavior to apply for interfaces other
- * than bge. However, doing so would negatively impact drivers which
- * perform dynamic mapping and unmapping of DMA resources, which are
- * increased by setting the maxpsz multiplier to 0 (more mblks per
- * packet will be generated by tcp). The bge driver does not suffer
- * from this, as it copies the mblks into pre-mapped buffers, and
- * therefore does not require more I/O resources than before.
- *
- * Otherwise, this behavior is present on all network interfaces when
- * the destination endpoint is non-local, since reducing the number
- * of packets in general is good for the network.
- *
- * TODO We need to remove this hard-coded conditional for bge once
- * a better "self-tuning" mechanism, or a way to comprehend
- * the driver transmit strategy is devised. Until the solution
- * is found and well understood, we live with this hack.
- */
- if (!tcp_static_maxpsz &&
- (tcp->tcp_loopback || !tcp->tcp_localnet ||
- (ill->ill_name_length > 3 && bcmp(ill->ill_name, "bge", 3) == 0))) {
- /* override the default value */
- tcp->tcp_maxpsz = 0;
-
- ip3dbg(("tcp_ire_ill_check: connp %p tcp_maxpsz %d on "
- "interface %s\n", (void *)connp, tcp->tcp_maxpsz,
- ill != NULL ? ill->ill_name : ipif_loopback_name));
- }
-
- /* set the stream head parameters accordingly */
- (void) tcp_maxpsz_set(tcp, B_TRUE);
-}
-
/* tcp_wput_flush is called by tcp_wput_nondata to handle M_FLUSH messages. */
static void
tcp_wput_flush(tcp_t *tcp, mblk_t *mp)
{
uchar_t fval = *mp->b_rptr;
mblk_t *tail;
- queue_t *q = tcp->tcp_wq;
+ conn_t *connp = tcp->tcp_connp;
+ queue_t *q = connp->conn_wq;
/* TODO: How should flush interact with urgent data? */
if ((fval & FLUSHW) && tcp->tcp_xmit_head &&
@@ -21473,7 +16350,7 @@ tcp_wput_flush(tcp_t *tcp, mblk_t *mp)
}
/*
* We have no unsent data, so unsent must be less than
- * tcp_xmit_lowater, so re-enable flow.
+ * conn_sndlowat, so re-enable flow.
*/
mutex_enter(&tcp->tcp_non_sq_lock);
if (tcp->tcp_flow_stopped) {
@@ -21501,12 +16378,12 @@ tcp_wput_flush(tcp_t *tcp, mblk_t *mp)
static void
tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp)
{
- mblk_t *mp1;
- struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
+ mblk_t *mp1;
+ struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
STRUCT_HANDLE(strbuf, sb);
- queue_t *q = tcp->tcp_wq;
- int error;
- uint_t addrlen;
+ uint_t addrlen;
+ conn_t *connp = tcp->tcp_connp;
+ queue_t *q = connp->conn_wq;
/* Make sure it is one of ours. */
switch (iocp->ioc_cmd) {
@@ -21514,7 +16391,7 @@ tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp)
case TI_GETPEERNAME:
break;
default:
- CALL_IP_WPUT(tcp->tcp_connp, q, mp);
+ ip_wput_nondata(q, mp);
return;
}
switch (mi_copy_state(q, mp, &mp1)) {
@@ -21541,43 +16418,56 @@ tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp)
}
STRUCT_SET_HANDLE(sb, iocp->ioc_flag, (void *)mp1->b_rptr);
- addrlen = tcp->tcp_family == AF_INET ? sizeof (sin_t) : sizeof (sin6_t);
+
+ if (connp->conn_family == AF_INET)
+ addrlen = sizeof (sin_t);
+ else
+ addrlen = sizeof (sin6_t);
+
if (STRUCT_FGET(sb, maxlen) < addrlen) {
mi_copy_done(q, mp, EINVAL);
return;
}
- mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE);
- if (mp1 == NULL)
- return;
-
switch (iocp->ioc_cmd) {
case TI_GETMYNAME:
- error = tcp_do_getsockname(tcp, (void *)mp1->b_rptr, &addrlen);
break;
case TI_GETPEERNAME:
- error = tcp_do_getpeername(tcp, (void *)mp1->b_rptr, &addrlen);
+ if (tcp->tcp_state < TCPS_SYN_RCVD) {
+ mi_copy_done(q, mp, ENOTCONN);
+ return;
+ }
break;
}
+ mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE);
+ if (!mp1)
+ return;
- if (error != 0) {
- mi_copy_done(q, mp, error);
- } else {
- mp1->b_wptr += addrlen;
- STRUCT_FSET(sb, len, addrlen);
-
- /* Copy out the address */
- mi_copyout(q, mp);
+ STRUCT_FSET(sb, len, addrlen);
+ switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
+ case TI_GETMYNAME:
+ (void) conn_getsockname(connp, (struct sockaddr *)mp1->b_wptr,
+ &addrlen);
+ break;
+ case TI_GETPEERNAME:
+ (void) conn_getpeername(connp, (struct sockaddr *)mp1->b_wptr,
+ &addrlen);
+ break;
}
+ mp1->b_wptr += addrlen;
+ /* Copy out the address */
+ mi_copyout(q, mp);
}
static void
tcp_use_pure_tpi(tcp_t *tcp)
{
+ conn_t *connp = tcp->tcp_connp;
+
#ifdef _ILP32
- tcp->tcp_acceptor_id = (t_uscalar_t)tcp->tcp_rq;
+ tcp->tcp_acceptor_id = (t_uscalar_t)connp->conn_rq;
#else
- tcp->tcp_acceptor_id = tcp->tcp_connp->conn_dev;
+ tcp->tcp_acceptor_id = connp->conn_dev;
#endif
/*
* Insert this socket into the acceptor hash.
@@ -21595,11 +16485,11 @@ tcp_use_pure_tpi(tcp_t *tcp)
*/
/* ARGSUSED */
static void
-tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2)
+tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
{
- conn_t *connp = (conn_t *)arg;
- tcp_t *tcp = connp->conn_tcp;
- queue_t *q = tcp->tcp_wq;
+ conn_t *connp = (conn_t *)arg;
+ tcp_t *tcp = connp->conn_tcp;
+ queue_t *q = connp->conn_wq;
struct iocblk *iocp;
ASSERT(DB_TYPE(mp) == M_IOCTL);
@@ -21617,17 +16507,6 @@ tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2)
iocp = (struct iocblk *)mp->b_rptr;
switch (iocp->ioc_cmd) {
- case TCP_IOC_DEFAULT_Q:
- /* Wants to be the default wq. */
- if (secpolicy_ip_config(iocp->ioc_cr, B_FALSE) != 0) {
- iocp->ioc_error = EPERM;
- iocp->ioc_count = 0;
- mp->b_datap->db_type = M_IOCACK;
- qreply(q, mp);
- return;
- }
- tcp_def_q_set(tcp, mp);
- return;
case _SIOCSOCKFALLBACK:
/*
* Either sockmod is about to be popped and the socket
@@ -21650,7 +16529,7 @@ tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2)
qreply(q, mp);
return;
}
- CALL_IP_WPUT(connp, q, mp);
+ ip_wput_nondata(q, mp);
}
/*
@@ -21658,14 +16537,14 @@ tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2)
*/
/* ARGSUSED */
static void
-tcp_wput_proto(void *arg, mblk_t *mp, void *arg2)
+tcp_wput_proto(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
{
- conn_t *connp = (conn_t *)arg;
- tcp_t *tcp = connp->conn_tcp;
+ conn_t *connp = (conn_t *)arg;
+ tcp_t *tcp = connp->conn_tcp;
union T_primitives *tprim = (union T_primitives *)mp->b_rptr;
- uchar_t *rptr;
- t_scalar_t type;
- cred_t *cr;
+ uchar_t *rptr;
+ t_scalar_t type;
+ cred_t *cr;
/*
* Try and ASSERT the minimum possible references on the
@@ -21684,7 +16563,7 @@ tcp_wput_proto(void *arg, mblk_t *mp, void *arg2)
if ((mp->b_wptr - rptr) >= sizeof (t_scalar_t)) {
type = ((union T_primitives *)rptr)->type;
if (type == T_EXDATA_REQ) {
- tcp_output_urgent(connp, mp, arg2);
+ tcp_output_urgent(connp, mp, arg2, NULL);
} else if (type != T_DATA_REQ) {
goto non_urgent_data;
} else {
@@ -21695,7 +16574,7 @@ tcp_wput_proto(void *arg, mblk_t *mp, void *arg2)
}
return;
} else {
- if (tcp->tcp_debug) {
+ if (connp->conn_debug) {
(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
"tcp_wput_proto, dropping one...");
}
@@ -21776,17 +16655,10 @@ non_urgent_data:
* for subsequent processing by ip_restart_optmgmt(), which
* will do the CONN_DEC_REF().
*/
- CONN_INC_REF(connp);
if ((int)tprim->type == T_SVR4_OPTMGMT_REQ) {
- if (svr4_optcom_req(tcp->tcp_wq, mp, cr, &tcp_opt_obj,
- B_TRUE) != EINPROGRESS) {
- CONN_DEC_REF(connp);
- }
+ svr4_optcom_req(connp->conn_wq, mp, cr, &tcp_opt_obj);
} else {
- if (tpi_optcom_req(tcp->tcp_wq, mp, cr, &tcp_opt_obj,
- B_TRUE) != EINPROGRESS) {
- CONN_DEC_REF(connp);
- }
+ tpi_optcom_req(connp->conn_wq, mp, cr, &tcp_opt_obj);
}
break;
@@ -21804,7 +16676,7 @@ non_urgent_data:
* We were crossing FINs and got a reset from
* the other side. Just ignore it.
*/
- if (tcp->tcp_debug) {
+ if (connp->conn_debug) {
(void) strlog(TCP_MOD_ID, 0, 1,
SL_ERROR|SL_TRACE,
"tcp_wput_proto, T_ORDREL_REQ out of "
@@ -21818,7 +16690,7 @@ non_urgent_data:
tcp_addr_req(tcp, mp);
break;
default:
- if (tcp->tcp_debug) {
+ if (connp->conn_debug) {
(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
"tcp_wput_proto, bogus TPI msg, type %d",
tprim->type);
@@ -21844,19 +16716,6 @@ tcp_wsrv(queue_t *q)
TCP_STAT(tcps, tcp_wsrv_called);
}
-/* Non overlapping byte exchanger */
-static void
-tcp_xchg(uchar_t *a, uchar_t *b, int len)
-{
- uchar_t uch;
-
- while (len-- > 0) {
- uch = a[len];
- a[len] = b[len];
- b[len] = uch;
- }
-}
-
/*
* Send out a control packet on the tcp connection specified. This routine
* is typically called where we need a simple ACK or RST generated.
@@ -21865,50 +16724,51 @@ static void
tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq, uint32_t ack, int ctl)
{
uchar_t *rptr;
- tcph_t *tcph;
+ tcpha_t *tcpha;
ipha_t *ipha = NULL;
ip6_t *ip6h = NULL;
uint32_t sum;
- int tcp_hdr_len;
- int tcp_ip_hdr_len;
+ int total_hdr_len;
+ int ip_hdr_len;
mblk_t *mp;
tcp_stack_t *tcps = tcp->tcp_tcps;
+ conn_t *connp = tcp->tcp_connp;
+ ip_xmit_attr_t *ixa = connp->conn_ixa;
/*
* Save sum for use in source route later.
*/
- ASSERT(tcp != NULL);
- sum = tcp->tcp_tcp_hdr_len + tcp->tcp_sum;
- tcp_hdr_len = tcp->tcp_hdr_len;
- tcp_ip_hdr_len = tcp->tcp_ip_hdr_len;
+ sum = connp->conn_ht_ulp_len + connp->conn_sum;
+ total_hdr_len = connp->conn_ht_iphc_len;
+ ip_hdr_len = ixa->ixa_ip_hdr_length;
/* If a text string is passed in with the request, pass it to strlog. */
- if (str != NULL && tcp->tcp_debug) {
+ if (str != NULL && connp->conn_debug) {
(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
"tcp_xmit_ctl: '%s', seq 0x%x, ack 0x%x, ctl 0x%x",
str, seq, ack, ctl);
}
- mp = allocb(tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + tcps->tcps_wroff_xtra,
+ mp = allocb(connp->conn_ht_iphc_allocated + tcps->tcps_wroff_xtra,
BPRI_MED);
if (mp == NULL) {
return;
}
rptr = &mp->b_rptr[tcps->tcps_wroff_xtra];
mp->b_rptr = rptr;
- mp->b_wptr = &rptr[tcp_hdr_len];
- bcopy(tcp->tcp_iphc, rptr, tcp_hdr_len);
+ mp->b_wptr = &rptr[total_hdr_len];
+ bcopy(connp->conn_ht_iphc, rptr, total_hdr_len);
+
+ ixa->ixa_pktlen = total_hdr_len;
- if (tcp->tcp_ipversion == IPV4_VERSION) {
+ if (ixa->ixa_flags & IXAF_IS_IPV4) {
ipha = (ipha_t *)rptr;
- ipha->ipha_length = htons(tcp_hdr_len);
+ ipha->ipha_length = htons(total_hdr_len);
} else {
ip6h = (ip6_t *)rptr;
- ASSERT(tcp != NULL);
- ip6h->ip6_plen = htons(tcp->tcp_hdr_len -
- ((char *)&tcp->tcp_ip6h[1] - tcp->tcp_iphc));
+ ip6h->ip6_plen = htons(total_hdr_len - IPV6_HDR_LEN);
}
- tcph = (tcph_t *)&rptr[tcp_ip_hdr_len];
- tcph->th_flags[0] = (uint8_t)ctl;
+ tcpha = (tcpha_t *)&rptr[ip_hdr_len];
+ tcpha->tha_flags = (uint8_t)ctl;
if (ctl & TH_RST) {
BUMP_MIB(&tcps->tcps_mib, tcpOutRsts);
BUMP_MIB(&tcps->tcps_mib, tcpOutControl);
@@ -21917,43 +16777,45 @@ tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq, uint32_t ack, int ctl)
*/
if (tcp->tcp_snd_ts_ok &&
tcp->tcp_state > TCPS_SYN_SENT) {
- mp->b_wptr = &rptr[tcp_hdr_len - TCPOPT_REAL_TS_LEN];
+ mp->b_wptr = &rptr[total_hdr_len - TCPOPT_REAL_TS_LEN];
*(mp->b_wptr) = TCPOPT_EOL;
- if (tcp->tcp_ipversion == IPV4_VERSION) {
- ipha->ipha_length = htons(tcp_hdr_len -
+
+ ixa->ixa_pktlen = total_hdr_len - TCPOPT_REAL_TS_LEN;
+
+ if (connp->conn_ipversion == IPV4_VERSION) {
+ ipha->ipha_length = htons(total_hdr_len -
TCPOPT_REAL_TS_LEN);
} else {
- ip6h->ip6_plen = htons(ntohs(ip6h->ip6_plen) -
- TCPOPT_REAL_TS_LEN);
+ ip6h->ip6_plen = htons(total_hdr_len -
+ IPV6_HDR_LEN - TCPOPT_REAL_TS_LEN);
}
- tcph->th_offset_and_rsrvd[0] -= (3 << 4);
+ tcpha->tha_offset_and_reserved -= (3 << 4);
sum -= TCPOPT_REAL_TS_LEN;
}
}
if (ctl & TH_ACK) {
if (tcp->tcp_snd_ts_ok) {
U32_TO_BE32(lbolt,
- (char *)tcph+TCP_MIN_HEADER_LENGTH+4);
+ (char *)tcpha + TCP_MIN_HEADER_LENGTH+4);
U32_TO_BE32(tcp->tcp_ts_recent,
- (char *)tcph+TCP_MIN_HEADER_LENGTH+8);
+ (char *)tcpha + TCP_MIN_HEADER_LENGTH+8);
}
/* Update the latest receive window size in TCP header. */
- U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws,
- tcph->th_win);
+ tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws);
tcp->tcp_rack = ack;
tcp->tcp_rack_cnt = 0;
BUMP_MIB(&tcps->tcps_mib, tcpOutAck);
}
BUMP_LOCAL(tcp->tcp_obsegs);
- U32_TO_BE32(seq, tcph->th_seq);
- U32_TO_BE32(ack, tcph->th_ack);
+ tcpha->tha_seq = htonl(seq);
+ tcpha->tha_ack = htonl(ack);
/*
* Include the adjustment for a source route if any.
*/
sum = (sum >> 16) + (sum & 0xFFFF);
- U16_TO_BE16(sum, tcph->th_sum);
- tcp_send_data(tcp, tcp->tcp_wq, mp);
+ tcpha->tha_sum = htons(sum);
+ tcp_send_data(tcp, mp);
}
/*
@@ -21991,115 +16853,32 @@ tcp_send_rst_chk(tcp_stack_t *tcps)
}
/*
- * Send down the advice IP ioctl to tell IP to mark an IRE temporary.
- */
-static void
-tcp_ip_ire_mark_advice(tcp_t *tcp)
-{
- mblk_t *mp;
- ipic_t *ipic;
-
- if (tcp->tcp_ipversion == IPV4_VERSION) {
- mp = tcp_ip_advise_mblk(&tcp->tcp_ipha->ipha_dst, IP_ADDR_LEN,
- &ipic);
- } else {
- mp = tcp_ip_advise_mblk(&tcp->tcp_ip6h->ip6_dst, IPV6_ADDR_LEN,
- &ipic);
- }
- if (mp == NULL)
- return;
- ipic->ipic_ire_marks |= IRE_MARK_TEMPORARY;
- CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, mp);
-}
-
-/*
- * Return an IP advice ioctl mblk and set ipic to be the pointer
- * to the advice structure.
- */
-static mblk_t *
-tcp_ip_advise_mblk(void *addr, int addr_len, ipic_t **ipic)
-{
- struct iocblk *ioc;
- mblk_t *mp, *mp1;
-
- mp = allocb(sizeof (ipic_t) + addr_len, BPRI_HI);
- if (mp == NULL)
- return (NULL);
- bzero(mp->b_rptr, sizeof (ipic_t) + addr_len);
- *ipic = (ipic_t *)mp->b_rptr;
- (*ipic)->ipic_cmd = IP_IOC_IRE_ADVISE_NO_REPLY;
- (*ipic)->ipic_addr_offset = sizeof (ipic_t);
-
- bcopy(addr, *ipic + 1, addr_len);
-
- (*ipic)->ipic_addr_length = addr_len;
- mp->b_wptr = &mp->b_rptr[sizeof (ipic_t) + addr_len];
-
- mp1 = mkiocb(IP_IOCTL);
- if (mp1 == NULL) {
- freemsg(mp);
- return (NULL);
- }
- mp1->b_cont = mp;
- ioc = (struct iocblk *)mp1->b_rptr;
- ioc->ioc_count = sizeof (ipic_t) + addr_len;
-
- return (mp1);
-}
-
-/*
* Generate a reset based on an inbound packet, connp is set by caller
* when RST is in response to an unexpected inbound packet for which
* there is active tcp state in the system.
*
* IPSEC NOTE : Try to send the reply with the same protection as it came
- * in. We still have the ipsec_mp that the packet was attached to. Thus
- * the packet will go out at the same level of protection as it came in by
- * converting the IPSEC_IN to IPSEC_OUT.
+ * in. We have the ip_recv_attr_t which is reversed to form the ip_xmit_attr_t.
+ * That way the packet will go out at the same level of protection as it
+ * came in with.
*/
static void
-tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq,
- uint32_t ack, int ctl, uint_t ip_hdr_len, zoneid_t zoneid,
- tcp_stack_t *tcps, conn_t *connp)
+tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, uint32_t ack, int ctl,
+ ip_recv_attr_t *ira, ip_stack_t *ipst, conn_t *connp)
{
ipha_t *ipha = NULL;
ip6_t *ip6h = NULL;
ushort_t len;
- tcph_t *tcph;
+ tcpha_t *tcpha;
int i;
- mblk_t *ipsec_mp;
- boolean_t mctl_present;
- ipic_t *ipic;
ipaddr_t v4addr;
in6_addr_t v6addr;
- int addr_len;
- void *addr;
- queue_t *q = tcps->tcps_g_q;
- tcp_t *tcp;
- cred_t *cr;
- pid_t pid;
- mblk_t *nmp;
- ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;
-
- if (tcps->tcps_g_q == NULL) {
- /*
- * For non-zero stackids the default queue isn't created
- * until the first open, thus there can be a need to send
- * a reset before then. But we can't do that, hence we just
- * drop the packet. Later during boot, when the default queue
- * has been setup, a retransmitted packet from the peer
- * will result in a reset.
- */
- ASSERT(tcps->tcps_netstack->netstack_stackid !=
- GLOBAL_NETSTACKID);
- freemsg(mp);
- return;
- }
-
- if (connp != NULL)
- tcp = connp->conn_tcp;
- else
- tcp = Q_TO_TCP(q);
+ netstack_t *ns = ipst->ips_netstack;
+ tcp_stack_t *tcps = ns->netstack_tcp;
+ ip_xmit_attr_t ixas, *ixa;
+ uint_t ip_hdr_len = ira->ira_ip_hdr_length;
+ boolean_t need_refrele = B_FALSE; /* ixa_refrele(ixa) */
+ ushort_t port;
if (!tcp_send_rst_chk(tcps)) {
tcps->tcps_rst_unsent++;
@@ -22107,16 +16886,41 @@ tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq,
return;
}
- if (mp->b_datap->db_type == M_CTL) {
- ipsec_mp = mp;
- mp = mp->b_cont;
- mctl_present = B_TRUE;
+ /*
+ * If connp != NULL we use conn_ixa to keep IP_NEXTHOP and other
+ * options from the listener. In that case the caller must ensure that
+ * we are running on the listener = connp squeue.
+ *
+ * We get a safe copy of conn_ixa so we don't need to restore anything
+ * we or ip_output_simple might change in the ixa.
+ */
+ if (connp != NULL) {
+ ASSERT(connp->conn_on_sqp);
+
+ ixa = conn_get_ixa_exclusive(connp);
+ if (ixa == NULL) {
+ tcps->tcps_rst_unsent++;
+ freemsg(mp);
+ return;
+ }
+ need_refrele = B_TRUE;
} else {
- ipsec_mp = mp;
- mctl_present = B_FALSE;
+ bzero(&ixas, sizeof (ixas));
+ ixa = &ixas;
+ /*
+ * IXAF_VERIFY_SOURCE is overkill since we know the
+ * packet was for us.
+ */
+ ixa->ixa_flags |= IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE;
+ ixa->ixa_protocol = IPPROTO_TCP;
+ ixa->ixa_zoneid = ira->ira_zoneid;
+ ixa->ixa_ifindex = 0;
+ ixa->ixa_ipst = ipst;
+ ixa->ixa_cred = kcred;
+ ixa->ixa_cpid = NOPID;
}
- if (str && q && tcps->tcps_dbg) {
+ if (str && tcps->tcps_dbg) {
(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
"tcp_xmit_early_reset: '%s', seq 0x%x, ack 0x%x, "
"flags 0x%x",
@@ -22126,20 +16930,12 @@ tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq,
mblk_t *mp1 = copyb(mp);
freemsg(mp);
mp = mp1;
- if (!mp) {
- if (mctl_present)
- freeb(ipsec_mp);
- return;
- } else {
- if (mctl_present) {
- ipsec_mp->b_cont = mp;
- } else {
- ipsec_mp = mp;
- }
- }
+ if (mp == NULL)
+ goto done;
} else if (mp->b_cont) {
freemsg(mp->b_cont);
mp->b_cont = NULL;
+ DB_CKSUMFLAGS(mp) = 0;
}
/*
* We skip reversing source route here.
@@ -22159,18 +16955,20 @@ tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq,
*/
if (ipha->ipha_src == 0 || ipha->ipha_src == INADDR_BROADCAST ||
CLASSD(ipha->ipha_src)) {
- freemsg(ipsec_mp);
BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards);
- return;
+ ip_drop_input("ipIfStatsInDiscards", mp, NULL);
+ freemsg(mp);
+ goto done;
}
} else {
ip6h = (ip6_t *)mp->b_rptr;
if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) ||
IN6_IS_ADDR_MULTICAST(&ip6h->ip6_src)) {
- freemsg(ipsec_mp);
BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsInDiscards);
- return;
+ ip_drop_input("ipIfStatsInDiscards", mp, NULL);
+ freemsg(mp);
+ goto done;
}
/* Remove any extension headers assuming partial overlay */
@@ -22185,13 +16983,13 @@ tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq,
ip6h->ip6_nxt = IPPROTO_TCP;
}
}
- tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len];
- if (tcph->th_flags[0] & TH_RST) {
- freemsg(ipsec_mp);
- return;
+ tcpha = (tcpha_t *)&mp->b_rptr[ip_hdr_len];
+ if (tcpha->tha_flags & TH_RST) {
+ freemsg(mp);
+ goto done;
}
- tcph->th_offset_and_rsrvd[0] = (5 << 4);
- len = ip_hdr_len + sizeof (tcph_t);
+ tcpha->tha_offset_and_reserved = (5 << 4);
+ len = ip_hdr_len + sizeof (tcpha_t);
mp->b_wptr = &mp->b_rptr[len];
if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) {
ipha->ipha_length = htons(len);
@@ -22201,108 +16999,79 @@ tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq,
ipha->ipha_dst = v4addr;
ipha->ipha_ident = 0;
ipha->ipha_ttl = (uchar_t)tcps->tcps_ipv4_ttl;
- addr_len = IP_ADDR_LEN;
- addr = &v4addr;
+ ixa->ixa_flags |= IXAF_IS_IPV4;
+ ixa->ixa_ip_hdr_length = ip_hdr_len;
} else {
- /* No ip6i_t in this case */
ip6h->ip6_plen = htons(len - IPV6_HDR_LEN);
/* Swap addresses */
v6addr = ip6h->ip6_src;
ip6h->ip6_src = ip6h->ip6_dst;
ip6h->ip6_dst = v6addr;
ip6h->ip6_hops = (uchar_t)tcps->tcps_ipv6_hoplimit;
- addr_len = IPV6_ADDR_LEN;
- addr = &v6addr;
- }
- tcp_xchg(tcph->th_fport, tcph->th_lport, 2);
- U32_TO_BE32(ack, tcph->th_ack);
- U32_TO_BE32(seq, tcph->th_seq);
- U16_TO_BE16(0, tcph->th_win);
- U16_TO_BE16(sizeof (tcph_t), tcph->th_sum);
- tcph->th_flags[0] = (uint8_t)ctl;
+ ixa->ixa_flags &= ~IXAF_IS_IPV4;
+
+ if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_dst)) {
+ ixa->ixa_flags |= IXAF_SCOPEID_SET;
+ ixa->ixa_scopeid = ira->ira_ruifindex;
+ }
+ ixa->ixa_ip_hdr_length = IPV6_HDR_LEN;
+ }
+ ixa->ixa_pktlen = len;
+
+ /* Swap the ports */
+ port = tcpha->tha_fport;
+ tcpha->tha_fport = tcpha->tha_lport;
+ tcpha->tha_lport = port;
+
+ tcpha->tha_ack = htonl(ack);
+ tcpha->tha_seq = htonl(seq);
+ tcpha->tha_win = 0;
+ tcpha->tha_sum = htons(sizeof (tcpha_t));
+ tcpha->tha_flags = (uint8_t)ctl;
if (ctl & TH_RST) {
BUMP_MIB(&tcps->tcps_mib, tcpOutRsts);
BUMP_MIB(&tcps->tcps_mib, tcpOutControl);
}
- /* IP trusts us to set up labels when required. */
- if (is_system_labeled() && (cr = msg_getcred(mp, &pid)) != NULL &&
- crgetlabel(cr) != NULL) {
- int err;
-
- if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION)
- err = tsol_check_label(cr, &mp,
- tcp->tcp_connp->conn_mac_mode,
- tcps->tcps_netstack->netstack_ip, pid);
- else
- err = tsol_check_label_v6(cr, &mp,
- tcp->tcp_connp->conn_mac_mode,
- tcps->tcps_netstack->netstack_ip, pid);
- if (mctl_present)
- ipsec_mp->b_cont = mp;
- else
- ipsec_mp = mp;
- if (err != 0) {
- freemsg(ipsec_mp);
- return;
- }
- if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) {
- ipha = (ipha_t *)mp->b_rptr;
- } else {
- ip6h = (ip6_t *)mp->b_rptr;
- }
+ /* Discard any old label */
+ if (ixa->ixa_free_flags & IXA_FREE_TSL) {
+ ASSERT(ixa->ixa_tsl != NULL);
+ label_rele(ixa->ixa_tsl);
+ ixa->ixa_free_flags &= ~IXA_FREE_TSL;
}
+ ixa->ixa_tsl = ira->ira_tsl; /* Behave as a multi-level responder */
- if (mctl_present) {
- ipsec_in_t *ii = (ipsec_in_t *)ipsec_mp->b_rptr;
-
- ASSERT(ii->ipsec_in_type == IPSEC_IN);
- if (!ipsec_in_to_out(ipsec_mp, ipha, ip6h, zoneid)) {
- return;
+ if (ira->ira_flags & IRAF_IPSEC_SECURE) {
+ /*
+ * Apply IPsec based on how IPsec was applied to
+ * the packet that caused the RST.
+ */
+ if (!ipsec_in_to_out(ira, ixa, mp, ipha, ip6h)) {
+ BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
+ /* Note: mp already consumed and ip_drop_packet done */
+ goto done;
}
+ } else {
+ /*
+ * This is in clear. The RST message we are building
+ * here should go out in clear, independent of our policy.
+ */
+ ixa->ixa_flags |= IXAF_NO_IPSEC;
}
- if (zoneid == ALL_ZONES)
- zoneid = GLOBAL_ZONEID;
-
- /* Add the zoneid so ip_output routes it properly */
- if ((nmp = ip_prepend_zoneid(ipsec_mp, zoneid, ipst)) == NULL) {
- freemsg(ipsec_mp);
- return;
- }
- ipsec_mp = nmp;
/*
* NOTE: one might consider tracing a TCP packet here, but
* this function has no active TCP state and no tcp structure
* that has a trace buffer. If we traced here, we would have
* to keep a local trace buffer in tcp_record_trace().
- *
- * TSol note: The mblk that contains the incoming packet was
- * reused by tcp_xmit_listener_reset, so it already contains
- * the right credentials and we don't need to call mblk_setcred.
- * Also the conn's cred is not right since it is associated
- * with tcps_g_q.
*/
- CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, ipsec_mp);
- /*
- * Tell IP to mark the IRE used for this destination temporary.
- * This way, we can limit our exposure to DoS attack because IP
- * creates an IRE for each destination. If there are too many,
- * the time to do any routing lookup will be extremely long. And
- * the lookup can be in interrupt context.
- *
- * Note that in normal circumstances, this marking should not
- * affect anything. It would be nice if only 1 message is
- * needed to inform IP that the IRE created for this RST should
- * not be added to the cache table. But there is currently
- * not such communication mechanism between TCP and IP. So
- * the best we can do now is to send the advice ioctl to IP
- * to mark the IRE temporary.
- */
- if ((mp = tcp_ip_advise_mblk(addr, addr_len, &ipic)) != NULL) {
- ipic->ipic_ire_marks |= IRE_MARK_TEMPORARY;
- CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, mp);
+ (void) ip_output_simple(mp, ixa);
+done:
+ ixa_cleanup(ixa);
+ if (need_refrele) {
+ ASSERT(ixa != &ixas);
+ ixa_refrele(ixa);
}
}
@@ -22313,9 +17082,11 @@ tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq,
static int
tcp_xmit_end(tcp_t *tcp)
{
- ipic_t *ipic;
- mblk_t *mp;
+ mblk_t *mp;
tcp_stack_t *tcps = tcp->tcp_tcps;
+ iulp_t uinfo;
+ ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;
+ conn_t *connp = tcp->tcp_connp;
if (tcp->tcp_state < TCPS_SYN_RCVD ||
tcp->tcp_state > TCPS_CLOSE_WAIT) {
@@ -22337,7 +17108,7 @@ tcp_xmit_end(tcp_t *tcp)
tcp->tcp_fss, B_FALSE, NULL, B_FALSE);
if (mp) {
- tcp_send_data(tcp, tcp->tcp_wq, mp);
+ tcp_send_data(tcp, mp);
} else {
/*
* Couldn't allocate msg. Pretend we got it out.
@@ -22373,66 +17144,49 @@ tcp_xmit_end(tcp_t *tcp)
return (0);
/*
- * NOTE: should not update if source routes i.e. if tcp_remote if
- * different from the destination.
+ * We do not have a good algorithm to update ssthresh at this time.
+ * So don't do any update.
+ */
+ bzero(&uinfo, sizeof (uinfo));
+ uinfo.iulp_rtt = tcp->tcp_rtt_sa;
+ uinfo.iulp_rtt_sd = tcp->tcp_rtt_sd;
+
+ /*
+ * Note that uinfo is kept for conn_faddr in the DCE. Could update even
+ * if source routed but we don't.
*/
- if (tcp->tcp_ipversion == IPV4_VERSION) {
- if (tcp->tcp_remote != tcp->tcp_ipha->ipha_dst) {
+ if (connp->conn_ipversion == IPV4_VERSION) {
+ if (connp->conn_faddr_v4 != tcp->tcp_ipha->ipha_dst) {
return (0);
}
- mp = tcp_ip_advise_mblk(&tcp->tcp_ipha->ipha_dst, IP_ADDR_LEN,
- &ipic);
+ (void) dce_update_uinfo_v4(connp->conn_faddr_v4, &uinfo, ipst);
} else {
- if (!(IN6_ARE_ADDR_EQUAL(&tcp->tcp_remote_v6,
+ uint_t ifindex;
+
+ if (!(IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6,
&tcp->tcp_ip6h->ip6_dst))) {
return (0);
}
- mp = tcp_ip_advise_mblk(&tcp->tcp_ip6h->ip6_dst, IPV6_ADDR_LEN,
- &ipic);
- }
-
- /* Record route attributes in the IRE for use by future connections. */
- if (mp == NULL)
- return (0);
-
- /*
- * We do not have a good algorithm to update ssthresh at this time.
- * So don't do any update.
- */
- ipic->ipic_rtt = tcp->tcp_rtt_sa;
- ipic->ipic_rtt_sd = tcp->tcp_rtt_sd;
-
- CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, mp);
-
- return (0);
-}
+ ifindex = 0;
+ if (IN6_IS_ADDR_LINKSCOPE(&connp->conn_faddr_v6)) {
+ ip_xmit_attr_t *ixa = connp->conn_ixa;
-/* ARGSUSED */
-void
-tcp_xmit_reset(void *arg, mblk_t *mp, void *arg2)
-{
- conn_t *connp = (conn_t *)arg;
- mblk_t *mp1;
- tcp_t *tcp = connp->conn_tcp;
- tcp_xmit_reset_event_t *eventp;
-
- ASSERT(mp->b_datap->db_type == M_PROTO &&
- MBLKL(mp) == sizeof (tcp_xmit_reset_event_t));
+ /*
+ * If we are going to create a DCE we'd better have
+ * an ifindex
+ */
+ if (ixa->ixa_nce != NULL) {
+ ifindex = ixa->ixa_nce->nce_common->ncec_ill->
+ ill_phyint->phyint_ifindex;
+ } else {
+ return (0);
+ }
+ }
- if (tcp->tcp_state != TCPS_LISTEN) {
- freemsg(mp);
- return;
+ (void) dce_update_uinfo(&connp->conn_faddr_v6, ifindex, &uinfo,
+ ipst);
}
-
- mp1 = mp->b_cont;
- mp->b_cont = NULL;
- eventp = (tcp_xmit_reset_event_t *)mp->b_rptr;
- ASSERT(eventp->tcp_xre_tcps->tcps_netstack ==
- connp->conn_netstack);
-
- tcp_xmit_listeners_reset(mp1, eventp->tcp_xre_iphdrlen,
- eventp->tcp_xre_zoneid, eventp->tcp_xre_tcps, connp);
- freemsg(mp);
+ return (0);
}
/*
@@ -22442,45 +17196,25 @@ tcp_xmit_reset(void *arg, mblk_t *mp, void *arg2)
* Note that we are reusing the incoming mp to construct the outgoing RST.
*/
void
-tcp_xmit_listeners_reset(mblk_t *mp, uint_t ip_hdr_len, zoneid_t zoneid,
- tcp_stack_t *tcps, conn_t *connp)
+tcp_xmit_listeners_reset(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst,
+ conn_t *connp)
{
uchar_t *rptr;
uint32_t seg_len;
- tcph_t *tcph;
+ tcpha_t *tcpha;
uint32_t seg_seq;
uint32_t seg_ack;
uint_t flags;
- mblk_t *ipsec_mp;
ipha_t *ipha;
ip6_t *ip6h;
- boolean_t mctl_present = B_FALSE;
- boolean_t check = B_TRUE;
boolean_t policy_present;
+ netstack_t *ns = ipst->ips_netstack;
+ tcp_stack_t *tcps = ns->netstack_tcp;
ipsec_stack_t *ipss = tcps->tcps_netstack->netstack_ipsec;
+ uint_t ip_hdr_len = ira->ira_ip_hdr_length;
TCP_STAT(tcps, tcp_no_listener);
- ipsec_mp = mp;
-
- if (mp->b_datap->db_type == M_CTL) {
- ipsec_in_t *ii;
-
- mctl_present = B_TRUE;
- mp = mp->b_cont;
-
- ii = (ipsec_in_t *)ipsec_mp->b_rptr;
- ASSERT(ii->ipsec_in_type == IPSEC_IN);
- if (ii->ipsec_in_dont_check) {
- check = B_FALSE;
- if (!ii->ipsec_in_secure) {
- freeb(ipsec_mp);
- mctl_present = B_FALSE;
- ipsec_mp = mp;
- }
- }
- }
-
if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) {
policy_present = ipss->ipsec_inbound_v4_policy_present;
ipha = (ipha_t *)mp->b_rptr;
@@ -22491,41 +17225,39 @@ tcp_xmit_listeners_reset(mblk_t *mp, uint_t ip_hdr_len, zoneid_t zoneid,
ip6h = (ip6_t *)mp->b_rptr;
}
- if (check && policy_present) {
+ if (policy_present) {
/*
* The conn_t parameter is NULL because we already know
* nobody's home.
*/
- ipsec_mp = ipsec_check_global_policy(
- ipsec_mp, (conn_t *)NULL, ipha, ip6h, mctl_present,
- tcps->tcps_netstack);
- if (ipsec_mp == NULL)
+ mp = ipsec_check_global_policy(mp, (conn_t *)NULL, ipha, ip6h,
+ ira, ns);
+ if (mp == NULL)
return;
}
- if (is_system_labeled() && !tsol_can_reply_error(mp)) {
+ if (is_system_labeled() && !tsol_can_reply_error(mp, ira)) {
DTRACE_PROBE2(
tx__ip__log__error__nolistener__tcp,
char *, "Could not reply with RST to mp(1)",
mblk_t *, mp);
ip2dbg(("tcp_xmit_listeners_reset: not permitted to reply\n"));
- freemsg(ipsec_mp);
+ freemsg(mp);
return;
}
rptr = mp->b_rptr;
- tcph = (tcph_t *)&rptr[ip_hdr_len];
- seg_seq = BE32_TO_U32(tcph->th_seq);
- seg_ack = BE32_TO_U32(tcph->th_ack);
- flags = tcph->th_flags[0];
+ tcpha = (tcpha_t *)&rptr[ip_hdr_len];
+ seg_seq = ntohl(tcpha->tha_seq);
+ seg_ack = ntohl(tcpha->tha_ack);
+ flags = tcpha->tha_flags;
- seg_len = msgdsize(mp) - (TCP_HDR_LENGTH(tcph) + ip_hdr_len);
+ seg_len = msgdsize(mp) - (TCP_HDR_LENGTH(tcpha) + ip_hdr_len);
if (flags & TH_RST) {
- freemsg(ipsec_mp);
+ freemsg(mp);
} else if (flags & TH_ACK) {
- tcp_xmit_early_reset("no tcp, reset",
- ipsec_mp, seg_ack, 0, TH_RST, ip_hdr_len, zoneid, tcps,
- connp);
+ tcp_xmit_early_reset("no tcp, reset", mp, seg_ack, 0, TH_RST,
+ ira, ipst, connp);
} else {
if (flags & TH_SYN) {
seg_len++;
@@ -22537,14 +17269,13 @@ tcp_xmit_listeners_reset(mblk_t *mp, uint_t ip_hdr_len, zoneid_t zoneid,
* segment is neither. Just drop it on the
* floor.
*/
- freemsg(ipsec_mp);
+ freemsg(mp);
tcps->tcps_rst_unsent++;
return;
}
- tcp_xmit_early_reset("no tcp, reset/ack",
- ipsec_mp, 0, seg_seq + seg_len,
- TH_RST | TH_ACK, ip_hdr_len, zoneid, tcps, connp);
+ tcp_xmit_early_reset("no tcp, reset/ack", mp, 0,
+ seg_seq + seg_len, TH_RST | TH_ACK, ira, ipst, connp);
}
}
@@ -22573,14 +17304,16 @@ tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset,
mblk_t *mp1;
mblk_t *mp2;
uchar_t *rptr;
- tcph_t *tcph;
+ tcpha_t *tcpha;
int32_t num_sack_blk = 0;
int32_t sack_opt_len = 0;
tcp_stack_t *tcps = tcp->tcp_tcps;
+ conn_t *connp = tcp->tcp_connp;
+ ip_xmit_attr_t *ixa = connp->conn_ixa;
/* Allocate for our maximum TCP header + link-level */
- mp1 = allocb(tcp->tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH +
- tcps->tcps_wroff_xtra, BPRI_MED);
+ mp1 = allocb(connp->conn_ht_iphc_allocated + tcps->tcps_wroff_xtra,
+ BPRI_MED);
if (!mp1)
return (NULL);
data_length = 0;
@@ -22646,15 +17379,14 @@ tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset,
}
/* Update the latest receive window size in TCP header. */
- U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws,
- tcp->tcp_tcph->th_win);
+ tcp->tcp_tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws);
rptr = mp1->b_rptr + tcps->tcps_wroff_xtra;
mp1->b_rptr = rptr;
- mp1->b_wptr = rptr + tcp->tcp_hdr_len + sack_opt_len;
- bcopy(tcp->tcp_iphc, rptr, tcp->tcp_hdr_len);
- tcph = (tcph_t *)&rptr[tcp->tcp_ip_hdr_len];
- U32_TO_ABE32(seq, tcph->th_seq);
+ mp1->b_wptr = rptr + connp->conn_ht_iphc_len + sack_opt_len;
+ bcopy(connp->conn_ht_iphc, rptr, connp->conn_ht_iphc_len);
+ tcpha = (tcpha_t *)&rptr[ixa->ixa_ip_hdr_length];
+ tcpha->tha_seq = htonl(seq);
/*
* Use tcp_unsent to determine if the PUSH bit should be used assumes
@@ -22729,14 +17461,14 @@ tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset,
wptr[0] = TCPOPT_MAXSEG;
wptr[1] = TCPOPT_MAXSEG_LEN;
wptr += 2;
- u1 = tcp->tcp_if_mtu -
- (tcp->tcp_ipversion == IPV4_VERSION ?
+ u1 = tcp->tcp_initial_pmtu -
+ (connp->conn_ipversion == IPV4_VERSION ?
IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN) -
TCP_MIN_HEADER_LENGTH;
U16_TO_BE16(u1, wptr);
mp1->b_wptr = wptr + 2;
/* Update the offset to cover the additional word */
- tcph->th_offset_and_rsrvd[0] += (1 << 4);
+ tcpha->tha_offset_and_reserved += (1 << 4);
/*
* Note that the following way of filling in
@@ -22763,7 +17495,7 @@ tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset,
ASSERT(tcp->tcp_ts_recent == 0);
U32_TO_BE32(0L, wptr);
mp1->b_wptr += TCPOPT_REAL_TS_LEN;
- tcph->th_offset_and_rsrvd[0] +=
+ tcpha->tha_offset_and_reserved +=
(3 << 4);
}
@@ -22819,7 +17551,7 @@ tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset,
wptr[2] = TCPOPT_WS_LEN;
wptr[3] = (uchar_t)tcp->tcp_rcv_ws;
mp1->b_wptr += TCPOPT_REAL_WS_LEN;
- tcph->th_offset_and_rsrvd[0] += (1 << 4);
+ tcpha->tha_offset_and_reserved += (1 << 4);
}
if (tcp->tcp_snd_sack_ok) {
@@ -22829,7 +17561,7 @@ tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset,
wptr[2] = TCPOPT_SACK_PERMITTED;
wptr[3] = TCPOPT_SACK_OK_LEN;
mp1->b_wptr += TCPOPT_REAL_SACK_OK_LEN;
- tcph->th_offset_and_rsrvd[0] += (1 << 4);
+ tcpha->tha_offset_and_reserved += (1 << 4);
}
/* allocb() of adequate mblk assures space */
@@ -22840,9 +17572,9 @@ tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset,
* Get IP set to checksum on our behalf
* Include the adjustment for a source route if any.
*/
- u1 += tcp->tcp_sum;
+ u1 += connp->conn_sum;
u1 = (u1 >> 16) + (u1 & 0xFFFF);
- U16_TO_BE16(u1, tcph->th_sum);
+ tcpha->tha_sum = htons(u1);
BUMP_MIB(&tcps->tcps_mib, tcpOutControl);
}
if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
@@ -22878,10 +17610,10 @@ tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset,
u1 < (uint32_t)(64 * 1024)) {
flags |= TH_URG;
BUMP_MIB(&tcps->tcps_mib, tcpOutUrg);
- U32_TO_ABE16(u1, tcph->th_urp);
+ tcpha->tha_urp = htons(u1);
}
}
- tcph->th_flags[0] = (uchar_t)flags;
+ tcpha->tha_flags = (uchar_t)flags;
tcp->tcp_rack = tcp->tcp_rnxt;
tcp->tcp_rack_cnt = 0;
@@ -22890,14 +17622,14 @@ tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset,
uint32_t llbolt = (uint32_t)lbolt;
U32_TO_BE32(llbolt,
- (char *)tcph+TCP_MIN_HEADER_LENGTH+4);
+ (char *)tcpha + TCP_MIN_HEADER_LENGTH+4);
U32_TO_BE32(tcp->tcp_ts_recent,
- (char *)tcph+TCP_MIN_HEADER_LENGTH+8);
+ (char *)tcpha + TCP_MIN_HEADER_LENGTH+8);
}
}
if (num_sack_blk > 0) {
- uchar_t *wptr = (uchar_t *)tcph + tcp->tcp_tcp_hdr_len;
+ uchar_t *wptr = (uchar_t *)tcpha + connp->conn_ht_ulp_len;
sack_blk_t *tmp;
int32_t i;
@@ -22915,33 +17647,34 @@ tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset,
U32_TO_BE32(tmp[i].end, wptr);
wptr += sizeof (tcp_seq);
}
- tcph->th_offset_and_rsrvd[0] += ((num_sack_blk * 2 + 1) << 4);
+ tcpha->tha_offset_and_reserved += ((num_sack_blk * 2 + 1) << 4);
}
ASSERT((uintptr_t)(mp1->b_wptr - rptr) <= (uintptr_t)INT_MAX);
data_length += (int)(mp1->b_wptr - rptr);
- if (tcp->tcp_ipversion == IPV4_VERSION) {
+
+ ixa->ixa_pktlen = data_length;
+
+ if (ixa->ixa_flags & IXAF_IS_IPV4) {
((ipha_t *)rptr)->ipha_length = htons(data_length);
} else {
- ip6_t *ip6 = (ip6_t *)(rptr +
- (((ip6_t *)rptr)->ip6_nxt == IPPROTO_RAW ?
- sizeof (ip6i_t) : 0));
+ ip6_t *ip6 = (ip6_t *)rptr;
- ip6->ip6_plen = htons(data_length -
- ((char *)&tcp->tcp_ip6h[1] - tcp->tcp_iphc));
+ ip6->ip6_plen = htons(data_length - IPV6_HDR_LEN);
}
/*
* Prime pump for IP
* Include the adjustment for a source route if any.
*/
- data_length -= tcp->tcp_ip_hdr_len;
- data_length += tcp->tcp_sum;
+ data_length -= ixa->ixa_ip_hdr_length;
+ data_length += connp->conn_sum;
data_length = (data_length >> 16) + (data_length & 0xFFFF);
- U16_TO_ABE16(data_length, tcph->th_sum);
+ tcpha->tha_sum = htons(data_length);
if (tcp->tcp_ip_forward_progress) {
- ASSERT(tcp->tcp_ipversion == IPV6_VERSION);
- *(uint32_t *)mp1->b_rptr |= IP_FORWARD_PROG;
tcp->tcp_ip_forward_progress = B_FALSE;
+ connp->conn_ixa->ixa_flags |= IXAF_REACH_CONF;
+ } else {
+ connp->conn_ixa->ixa_flags &= ~IXAF_REACH_CONF;
}
return (mp1);
}
@@ -23012,7 +17745,7 @@ tcp_ack_timer(void *arg)
BUMP_LOCAL(tcp->tcp_obsegs);
BUMP_MIB(&tcps->tcps_mib, tcpOutAck);
BUMP_MIB(&tcps->tcps_mib, tcpOutAckDelayed);
- tcp_send_data(tcp, tcp->tcp_wq, mp);
+ tcp_send_data(tcp, mp);
}
}
@@ -23023,6 +17756,7 @@ tcp_ack_mp(tcp_t *tcp)
{
uint32_t seq_no;
tcp_stack_t *tcps = tcp->tcp_tcps;
+ conn_t *connp = tcp->tcp_connp;
/*
* There are a few cases to be considered while setting the sequence no.
@@ -23058,12 +17792,13 @@ tcp_ack_mp(tcp_t *tcp)
/* Generate a simple ACK */
int data_length;
uchar_t *rptr;
- tcph_t *tcph;
+ tcpha_t *tcpha;
mblk_t *mp1;
+ int32_t total_hdr_len;
int32_t tcp_hdr_len;
- int32_t tcp_tcp_hdr_len;
int32_t num_sack_blk = 0;
int32_t sack_opt_len;
+ ip_xmit_attr_t *ixa = connp->conn_ixa;
/*
* Allocate space for TCP + IP headers
@@ -23074,34 +17809,34 @@ tcp_ack_mp(tcp_t *tcp)
tcp->tcp_num_sack_blk);
sack_opt_len = num_sack_blk * sizeof (sack_blk_t) +
TCPOPT_NOP_LEN * 2 + TCPOPT_HEADER_LEN;
- tcp_hdr_len = tcp->tcp_hdr_len + sack_opt_len;
- tcp_tcp_hdr_len = tcp->tcp_tcp_hdr_len + sack_opt_len;
+ total_hdr_len = connp->conn_ht_iphc_len + sack_opt_len;
+ tcp_hdr_len = connp->conn_ht_ulp_len + sack_opt_len;
} else {
- tcp_hdr_len = tcp->tcp_hdr_len;
- tcp_tcp_hdr_len = tcp->tcp_tcp_hdr_len;
+ total_hdr_len = connp->conn_ht_iphc_len;
+ tcp_hdr_len = connp->conn_ht_ulp_len;
}
- mp1 = allocb(tcp_hdr_len + tcps->tcps_wroff_xtra, BPRI_MED);
+ mp1 = allocb(total_hdr_len + tcps->tcps_wroff_xtra, BPRI_MED);
if (!mp1)
return (NULL);
/* Update the latest receive window size in TCP header. */
- U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws,
- tcp->tcp_tcph->th_win);
+ tcp->tcp_tcpha->tha_win =
+ htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws);
/* copy in prototype TCP + IP header */
rptr = mp1->b_rptr + tcps->tcps_wroff_xtra;
mp1->b_rptr = rptr;
- mp1->b_wptr = rptr + tcp_hdr_len;
- bcopy(tcp->tcp_iphc, rptr, tcp->tcp_hdr_len);
+ mp1->b_wptr = rptr + total_hdr_len;
+ bcopy(connp->conn_ht_iphc, rptr, connp->conn_ht_iphc_len);
- tcph = (tcph_t *)&rptr[tcp->tcp_ip_hdr_len];
+ tcpha = (tcpha_t *)&rptr[ixa->ixa_ip_hdr_length];
/* Set the TCP sequence number. */
- U32_TO_ABE32(seq_no, tcph->th_seq);
+ tcpha->tha_seq = htonl(seq_no);
/* Set up the TCP flag field. */
- tcph->th_flags[0] = (uchar_t)TH_ACK;
+ tcpha->tha_flags = (uchar_t)TH_ACK;
if (tcp->tcp_ecn_echo_on)
- tcph->th_flags[0] |= TH_ECE;
+ tcpha->tha_flags |= TH_ECE;
tcp->tcp_rack = tcp->tcp_rnxt;
tcp->tcp_rack_cnt = 0;
@@ -23111,14 +17846,15 @@ tcp_ack_mp(tcp_t *tcp)
uint32_t llbolt = (uint32_t)lbolt;
U32_TO_BE32(llbolt,
- (char *)tcph+TCP_MIN_HEADER_LENGTH+4);
+ (char *)tcpha + TCP_MIN_HEADER_LENGTH+4);
U32_TO_BE32(tcp->tcp_ts_recent,
- (char *)tcph+TCP_MIN_HEADER_LENGTH+8);
+ (char *)tcpha + TCP_MIN_HEADER_LENGTH+8);
}
/* Fill in SACK options */
if (num_sack_blk > 0) {
- uchar_t *wptr = (uchar_t *)tcph + tcp->tcp_tcp_hdr_len;
+ uchar_t *wptr = (uchar_t *)tcpha +
+ connp->conn_ht_ulp_len;
sack_blk_t *tmp;
int32_t i;
@@ -23136,34 +17872,33 @@ tcp_ack_mp(tcp_t *tcp)
U32_TO_BE32(tmp[i].end, wptr);
wptr += sizeof (tcp_seq);
}
- tcph->th_offset_and_rsrvd[0] += ((num_sack_blk * 2 + 1)
- << 4);
+ tcpha->tha_offset_and_reserved +=
+ ((num_sack_blk * 2 + 1) << 4);
}
- if (tcp->tcp_ipversion == IPV4_VERSION) {
- ((ipha_t *)rptr)->ipha_length = htons(tcp_hdr_len);
+ ixa->ixa_pktlen = total_hdr_len;
+
+ if (ixa->ixa_flags & IXAF_IS_IPV4) {
+ ((ipha_t *)rptr)->ipha_length = htons(total_hdr_len);
} else {
- /* Check for ip6i_t header in sticky hdrs */
- ip6_t *ip6 = (ip6_t *)(rptr +
- (((ip6_t *)rptr)->ip6_nxt == IPPROTO_RAW ?
- sizeof (ip6i_t) : 0));
+ ip6_t *ip6 = (ip6_t *)rptr;
- ip6->ip6_plen = htons(tcp_hdr_len -
- ((char *)&tcp->tcp_ip6h[1] - tcp->tcp_iphc));
+ ip6->ip6_plen = htons(total_hdr_len - IPV6_HDR_LEN);
}
/*
* Prime pump for checksum calculation in IP. Include the
* adjustment for a source route if any.
*/
- data_length = tcp_tcp_hdr_len + tcp->tcp_sum;
+ data_length = tcp_hdr_len + connp->conn_sum;
data_length = (data_length >> 16) + (data_length & 0xFFFF);
- U16_TO_ABE16(data_length, tcph->th_sum);
+ tcpha->tha_sum = htons(data_length);
if (tcp->tcp_ip_forward_progress) {
- ASSERT(tcp->tcp_ipversion == IPV6_VERSION);
- *(uint32_t *)mp1->b_rptr |= IP_FORWARD_PROG;
tcp->tcp_ip_forward_progress = B_FALSE;
+ connp->conn_ixa->ixa_flags |= IXAF_REACH_CONF;
+ } else {
+ connp->conn_ixa->ixa_flags &= ~IXAF_REACH_CONF;
}
return (mp1);
}
@@ -23183,6 +17918,8 @@ tcp_bind_hash_insert(tf_t *tbf, tcp_t *tcp, int caller_holds_lock)
tcp_t **tcpp;
tcp_t *tcpnext;
tcp_t *tcphash;
+ conn_t *connp = tcp->tcp_connp;
+ conn_t *connext;
if (tcp->tcp_ptpbhn != NULL) {
ASSERT(!caller_holds_lock);
@@ -23199,7 +17936,7 @@ tcp_bind_hash_insert(tf_t *tbf, tcp_t *tcp, int caller_holds_lock)
if (tcphash != NULL) {
/* Look for an entry using the same port */
while ((tcphash = tcpp[0]) != NULL &&
- tcp->tcp_lport != tcphash->tcp_lport)
+ connp->conn_lport != tcphash->tcp_connp->conn_lport)
tcpp = &(tcphash->tcp_bind_hash);
/* The port was not found, just add to the end */
@@ -23219,14 +17956,19 @@ tcp_bind_hash_insert(tf_t *tbf, tcp_t *tcp, int caller_holds_lock)
* INADDR_ANY.
*/
tcpnext = tcphash;
+ connext = tcpnext->tcp_connp;
tcphash = NULL;
- if (V6_OR_V4_INADDR_ANY(tcp->tcp_bound_source_v6) &&
- !V6_OR_V4_INADDR_ANY(tcpnext->tcp_bound_source_v6)) {
- while ((tcpnext = tcpp[0]) != NULL &&
- !V6_OR_V4_INADDR_ANY(tcpnext->tcp_bound_source_v6))
- tcpp = &(tcpnext->tcp_bind_hash_port);
-
- if (tcpnext) {
+ if (V6_OR_V4_INADDR_ANY(connp->conn_bound_addr_v6) &&
+ !V6_OR_V4_INADDR_ANY(connext->conn_bound_addr_v6)) {
+ while ((tcpnext = tcpp[0]) != NULL) {
+ connext = tcpnext->tcp_connp;
+ if (!V6_OR_V4_INADDR_ANY(
+ connext->conn_bound_addr_v6))
+ tcpp = &(tcpnext->tcp_bind_hash_port);
+ else
+ break;
+ }
+ if (tcpnext != NULL) {
tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port;
tcphash = tcpnext->tcp_bind_hash;
if (tcphash != NULL) {
@@ -23263,6 +18005,7 @@ tcp_bind_hash_remove(tcp_t *tcp)
tcp_t *tcpnext;
kmutex_t *lockp;
tcp_stack_t *tcps = tcp->tcp_tcps;
+ conn_t *connp = tcp->tcp_connp;
if (tcp->tcp_ptpbhn == NULL)
return;
@@ -23271,8 +18014,9 @@ tcp_bind_hash_remove(tcp_t *tcp)
* Extract the lock pointer in case there are concurrent
* hash_remove's for this instance.
*/
- ASSERT(tcp->tcp_lport != 0);
- lockp = &tcps->tcps_bind_fanout[TCP_BIND_HASH(tcp->tcp_lport)].tf_lock;
+ ASSERT(connp->conn_lport != 0);
+ lockp = &tcps->tcps_bind_fanout[TCP_BIND_HASH(
+ connp->conn_lport)].tf_lock;
ASSERT(lockp != NULL);
mutex_enter(lockp);
@@ -23548,7 +18292,7 @@ tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp, int *do_disconnectp,
*sys_errorp = 0;
*do_disconnectp = 0;
- error = tpi_optcom_buf(tcp->tcp_wq, mp, opt_lenp,
+ error = tpi_optcom_buf(tcp->tcp_connp->conn_wq, mp, opt_lenp,
opt_offset, cr, &tcp_opt_obj,
NULL, &is_absreq_failure);
@@ -23663,238 +18407,6 @@ tcp_sack_info_constructor(void *buf, void *cdrarg, int kmflags)
return (0);
}
-/* ARGSUSED */
-static int
-tcp_iphc_constructor(void *buf, void *cdrarg, int kmflags)
-{
- bzero(buf, TCP_MAX_COMBINED_HEADER_LENGTH);
- return (0);
-}
-
-/*
- * Make sure we wait until the default queue is setup, yet allow
- * tcp_g_q_create() to open a TCP stream.
- * We need to allow tcp_g_q_create() do do an open
- * of tcp, hence we compare curhread.
- * All others have to wait until the tcps_g_q has been
- * setup.
- */
-void
-tcp_g_q_setup(tcp_stack_t *tcps)
-{
- mutex_enter(&tcps->tcps_g_q_lock);
- if (tcps->tcps_g_q != NULL) {
- mutex_exit(&tcps->tcps_g_q_lock);
- return;
- }
- if (tcps->tcps_g_q_creator == NULL) {
- /* This thread will set it up */
- tcps->tcps_g_q_creator = curthread;
- mutex_exit(&tcps->tcps_g_q_lock);
- tcp_g_q_create(tcps);
- mutex_enter(&tcps->tcps_g_q_lock);
- ASSERT(tcps->tcps_g_q_creator == curthread);
- tcps->tcps_g_q_creator = NULL;
- cv_signal(&tcps->tcps_g_q_cv);
- ASSERT(tcps->tcps_g_q != NULL);
- mutex_exit(&tcps->tcps_g_q_lock);
- return;
- }
- /* Everybody but the creator has to wait */
- if (tcps->tcps_g_q_creator != curthread) {
- while (tcps->tcps_g_q == NULL)
- cv_wait(&tcps->tcps_g_q_cv, &tcps->tcps_g_q_lock);
- }
- mutex_exit(&tcps->tcps_g_q_lock);
-}
-
-#define IP "ip"
-
-#define TCP6DEV "/devices/pseudo/tcp6@0:tcp6"
-
-/*
- * Create a default tcp queue here instead of in strplumb
- */
-void
-tcp_g_q_create(tcp_stack_t *tcps)
-{
- int error;
- ldi_handle_t lh = NULL;
- ldi_ident_t li = NULL;
- int rval;
- cred_t *cr;
- major_t IP_MAJ;
-
-#ifdef NS_DEBUG
- (void) printf("tcp_g_q_create()\n");
-#endif
-
- IP_MAJ = ddi_name_to_major(IP);
-
- ASSERT(tcps->tcps_g_q_creator == curthread);
-
- error = ldi_ident_from_major(IP_MAJ, &li);
- if (error) {
-#ifdef DEBUG
- printf("tcp_g_q_create: lyr ident get failed error %d\n",
- error);
-#endif
- return;
- }
-
- cr = zone_get_kcred(netstackid_to_zoneid(
- tcps->tcps_netstack->netstack_stackid));
- ASSERT(cr != NULL);
- /*
- * We set the tcp default queue to IPv6 because IPv4 falls
- * back to IPv6 when it can't find a client, but
- * IPv6 does not fall back to IPv4.
- */
- error = ldi_open_by_name(TCP6DEV, FREAD|FWRITE, cr, &lh, li);
- if (error) {
-#ifdef DEBUG
- printf("tcp_g_q_create: open of TCP6DEV failed error %d\n",
- error);
-#endif
- goto out;
- }
-
- /*
- * This ioctl causes the tcp framework to cache a pointer to
- * this stream, so we don't want to close the stream after
- * this operation.
- * Use the kernel credentials that are for the zone we're in.
- */
- error = ldi_ioctl(lh, TCP_IOC_DEFAULT_Q,
- (intptr_t)0, FKIOCTL, cr, &rval);
- if (error) {
-#ifdef DEBUG
- printf("tcp_g_q_create: ioctl TCP_IOC_DEFAULT_Q failed "
- "error %d\n", error);
-#endif
- goto out;
- }
- tcps->tcps_g_q_lh = lh; /* For tcp_g_q_close */
- lh = NULL;
-out:
- /* Close layered handles */
- if (li)
- ldi_ident_release(li);
- /* Keep cred around until _inactive needs it */
- tcps->tcps_g_q_cr = cr;
-}
-
-/*
- * We keep tcp_g_q set until all other tcp_t's in the zone
- * has gone away, and then when tcp_g_q_inactive() is called
- * we clear it.
- */
-void
-tcp_g_q_destroy(tcp_stack_t *tcps)
-{
-#ifdef NS_DEBUG
- (void) printf("tcp_g_q_destroy()for stack %d\n",
- tcps->tcps_netstack->netstack_stackid);
-#endif
-
- if (tcps->tcps_g_q == NULL) {
- return; /* Nothing to cleanup */
- }
- /*
- * Drop reference corresponding to the default queue.
- * This reference was added from tcp_open when the default queue
- * was created, hence we compensate for this extra drop in
- * tcp_g_q_close. If the refcnt drops to zero here it means
- * the default queue was the last one to be open, in which
- * case, then tcp_g_q_inactive will be
- * called as a result of the refrele.
- */
- TCPS_REFRELE(tcps);
-}
-
-/*
- * Called when last tcp_t drops reference count using TCPS_REFRELE.
- * Run by tcp_q_q_inactive using a taskq.
- */
-static void
-tcp_g_q_close(void *arg)
-{
- tcp_stack_t *tcps = arg;
- int error;
- ldi_handle_t lh = NULL;
- ldi_ident_t li = NULL;
- cred_t *cr;
- major_t IP_MAJ;
-
- IP_MAJ = ddi_name_to_major(IP);
-
-#ifdef NS_DEBUG
- (void) printf("tcp_g_q_inactive() for stack %d refcnt %d\n",
- tcps->tcps_netstack->netstack_stackid,
- tcps->tcps_netstack->netstack_refcnt);
-#endif
- lh = tcps->tcps_g_q_lh;
- if (lh == NULL)
- return; /* Nothing to cleanup */
-
- ASSERT(tcps->tcps_refcnt == 1);
- ASSERT(tcps->tcps_g_q != NULL);
-
- error = ldi_ident_from_major(IP_MAJ, &li);
- if (error) {
-#ifdef DEBUG
- printf("tcp_g_q_inactive: lyr ident get failed error %d\n",
- error);
-#endif
- return;
- }
-
- cr = tcps->tcps_g_q_cr;
- tcps->tcps_g_q_cr = NULL;
- ASSERT(cr != NULL);
-
- /*
- * Make sure we can break the recursion when tcp_close decrements
- * the reference count causing g_q_inactive to be called again.
- */
- tcps->tcps_g_q_lh = NULL;
-
- /* close the default queue */
- (void) ldi_close(lh, FREAD|FWRITE, cr);
- /*
- * At this point in time tcps and the rest of netstack_t might
- * have been deleted.
- */
- tcps = NULL;
-
- /* Close layered handles */
- ldi_ident_release(li);
- crfree(cr);
-}
-
-/*
- * Called when last tcp_t drops reference count using TCPS_REFRELE.
- *
- * Have to ensure that the ldi routines are not used by an
- * interrupt thread by using a taskq.
- */
-void
-tcp_g_q_inactive(tcp_stack_t *tcps)
-{
- if (tcps->tcps_g_q_lh == NULL)
- return; /* Nothing to cleanup */
-
- ASSERT(tcps->tcps_refcnt == 0);
- TCPS_REFHOLD(tcps); /* Compensate for what g_q_destroy did */
-
- if (servicing_interrupt()) {
- (void) taskq_dispatch(tcp_taskq, tcp_g_q_close,
- (void *) tcps, TQ_SLEEP);
- } else {
- tcp_g_q_close(tcps);
- }
-}
-
/*
* Called by IP when IP is loaded into the kernel
*/
@@ -23909,10 +18421,6 @@ tcp_ddi_g_init(void)
sizeof (tcp_sack_info_t), 0,
tcp_sack_info_constructor, NULL, NULL, NULL, NULL, 0);
- tcp_iphc_cache = kmem_cache_create("tcp_iphc_cache",
- TCP_MAX_COMBINED_HEADER_LENGTH, 0,
- tcp_iphc_constructor, NULL, NULL, NULL, NULL, 0);
-
mutex_init(&tcp_random_lock, NULL, MUTEX_DEFAULT, NULL);
/* Initialize the random number generator */
@@ -23923,9 +18431,6 @@ tcp_ddi_g_init(void)
tcp_g_kstat = tcp_g_kstat_init(&tcp_g_statistics);
- tcp_taskq = taskq_create("tcp_taskq", 1, minclsyspri, 1, 1,
- TASKQ_PREPOPULATE);
-
tcp_squeue_flag = tcp_squeue_switch(tcp_squeue_wput);
/*
@@ -23933,8 +18438,7 @@ tcp_ddi_g_init(void)
* destroyed in the kernel, so we can maintain the
* set of tcp_stack_t's.
*/
- netstack_register(NS_TCP, tcp_stack_init, tcp_stack_shutdown,
- tcp_stack_fini);
+ netstack_register(NS_TCP, tcp_stack_init, NULL, tcp_stack_fini);
}
@@ -23956,8 +18460,6 @@ tcp_stack_init(netstackid_t stackid, netstack_t *ns)
tcps->tcps_netstack = ns;
/* Initialize locks */
- mutex_init(&tcps->tcps_g_q_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&tcps->tcps_g_q_cv, NULL, CV_DEFAULT, NULL);
mutex_init(&tcps->tcps_iss_key_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&tcps->tcps_epriv_port_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -24018,6 +18520,11 @@ tcp_stack_init(netstackid_t stackid, netstack_t *ns)
major = mod_name_to_major(INET_NAME);
error = ldi_ident_from_major(major, &tcps->tcps_ldi_ident);
ASSERT(error == 0);
+ tcps->tcps_ixa_cleanup_mp = allocb_wait(0, BPRI_MED, STR_NOSIG, NULL);
+ ASSERT(tcps->tcps_ixa_cleanup_mp != NULL);
+ cv_init(&tcps->tcps_ixa_cleanup_cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&tcps->tcps_ixa_cleanup_lock, NULL, MUTEX_DEFAULT, NULL);
+
return (tcps);
}
@@ -24035,22 +18542,8 @@ tcp_ddi_g_destroy(void)
kmem_cache_destroy(tcp_timercache);
kmem_cache_destroy(tcp_sack_info_cache);
- kmem_cache_destroy(tcp_iphc_cache);
netstack_unregister(NS_TCP);
- taskq_destroy(tcp_taskq);
-}
-
-/*
- * Shut down the TCP stack instance.
- */
-/* ARGSUSED */
-static void
-tcp_stack_shutdown(netstackid_t stackid, void *arg)
-{
- tcp_stack_t *tcps = (tcp_stack_t *)arg;
-
- tcp_g_q_destroy(tcps);
}
/*
@@ -24062,17 +18555,16 @@ tcp_stack_fini(netstackid_t stackid, void *arg)
tcp_stack_t *tcps = (tcp_stack_t *)arg;
int i;
+ freeb(tcps->tcps_ixa_cleanup_mp);
+ tcps->tcps_ixa_cleanup_mp = NULL;
+ cv_destroy(&tcps->tcps_ixa_cleanup_cv);
+ mutex_destroy(&tcps->tcps_ixa_cleanup_lock);
+
nd_free(&tcps->tcps_g_nd);
kmem_free(tcps->tcps_params, sizeof (lcl_tcp_param_arr));
tcps->tcps_params = NULL;
kmem_free(tcps->tcps_wroff_xtra_param, sizeof (tcpparam_t));
tcps->tcps_wroff_xtra_param = NULL;
- kmem_free(tcps->tcps_mdt_head_param, sizeof (tcpparam_t));
- tcps->tcps_mdt_head_param = NULL;
- kmem_free(tcps->tcps_mdt_tail_param, sizeof (tcpparam_t));
- tcps->tcps_mdt_tail_param = NULL;
- kmem_free(tcps->tcps_mdt_max_pbufs_param, sizeof (tcpparam_t));
- tcps->tcps_mdt_max_pbufs_param = NULL;
for (i = 0; i < TCP_BIND_FANOUT_SIZE; i++) {
ASSERT(tcps->tcps_bind_fanout[i].tf_tcp == NULL);
@@ -24091,8 +18583,6 @@ tcp_stack_fini(netstackid_t stackid, void *arg)
tcps->tcps_acceptor_fanout = NULL;
mutex_destroy(&tcps->tcps_iss_key_lock);
- mutex_destroy(&tcps->tcps_g_q_lock);
- cv_destroy(&tcps->tcps_g_q_cv);
mutex_destroy(&tcps->tcps_epriv_port_lock);
ip_drop_unregister(&tcps->tcps_dropper);
@@ -24120,6 +18610,7 @@ tcp_iss_init(tcp_t *tcp)
struct { uint32_t ports; in6_addr_t src; in6_addr_t dst; } arg;
uint32_t answer[4];
tcp_stack_t *tcps = tcp->tcp_tcps;
+ conn_t *connp = tcp->tcp_connp;
tcps->tcps_iss_incr_extra += (ISS_INCR >> 1);
tcp->tcp_iss = tcps->tcps_iss_incr_extra;
@@ -24128,16 +18619,9 @@ tcp_iss_init(tcp_t *tcp)
mutex_enter(&tcps->tcps_iss_key_lock);
context = tcps->tcps_iss_key;
mutex_exit(&tcps->tcps_iss_key_lock);
- arg.ports = tcp->tcp_ports;
- if (tcp->tcp_ipversion == IPV4_VERSION) {
- IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src,
- &arg.src);
- IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_dst,
- &arg.dst);
- } else {
- arg.src = tcp->tcp_ip6h->ip6_src;
- arg.dst = tcp->tcp_ip6h->ip6_dst;
- }
+ arg.ports = connp->conn_ports;
+ arg.src = connp->conn_laddr_v6;
+ arg.dst = connp->conn_faddr_v6;
MD5Update(&context, (uchar_t *)&arg, sizeof (arg));
MD5Final((uchar_t *)answer, &context);
tcp->tcp_iss += answer[0] ^ answer[1] ^ answer[2] ^ answer[3];
@@ -24220,27 +18704,16 @@ cl_tcp_walk_list_stack(int (*callback)(cl_tcp_info_t *, void *), void *arg,
connp = NULL;
while ((connp =
- ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) {
+ ipcl_get_next_conn(connfp, connp, IPCL_TCPCONN)) != NULL) {
tcp = connp->conn_tcp;
cl_tcpi.cl_tcpi_version = CL_TCPI_V1;
- cl_tcpi.cl_tcpi_ipversion = tcp->tcp_ipversion;
+ cl_tcpi.cl_tcpi_ipversion = connp->conn_ipversion;
cl_tcpi.cl_tcpi_state = tcp->tcp_state;
- cl_tcpi.cl_tcpi_lport = tcp->tcp_lport;
- cl_tcpi.cl_tcpi_fport = tcp->tcp_fport;
- /*
- * The macros tcp_laddr and tcp_faddr give the IPv4
- * addresses. They are copied implicitly below as
- * mapped addresses.
- */
- cl_tcpi.cl_tcpi_laddr_v6 = tcp->tcp_ip_src_v6;
- if (tcp->tcp_ipversion == IPV4_VERSION) {
- cl_tcpi.cl_tcpi_faddr =
- tcp->tcp_ipha->ipha_dst;
- } else {
- cl_tcpi.cl_tcpi_faddr_v6 =
- tcp->tcp_ip6h->ip6_dst;
- }
+ cl_tcpi.cl_tcpi_lport = connp->conn_lport;
+ cl_tcpi.cl_tcpi_fport = connp->conn_fport;
+ cl_tcpi.cl_tcpi_laddr_v6 = connp->conn_laddr_v6;
+ cl_tcpi.cl_tcpi_faddr_v6 = connp->conn_faddr_v6;
/*
* If the callback returns non-zero
@@ -24302,35 +18775,35 @@ cl_tcp_walk_list_stack(int (*callback)(cl_tcp_info_t *, void *), void *arg,
/*
* Check if a tcp structure matches the info in acp.
*/
-#define TCP_AC_ADDR_MATCH(acp, tcp) \
+#define TCP_AC_ADDR_MATCH(acp, connp, tcp) \
(((acp)->ac_local.ss_family == AF_INET) ? \
((TCP_AC_V4LOCAL((acp)) == INADDR_ANY || \
- TCP_AC_V4LOCAL((acp)) == (tcp)->tcp_ip_src) && \
+ TCP_AC_V4LOCAL((acp)) == (connp)->conn_laddr_v4) && \
(TCP_AC_V4REMOTE((acp)) == INADDR_ANY || \
- TCP_AC_V4REMOTE((acp)) == (tcp)->tcp_remote) && \
+ TCP_AC_V4REMOTE((acp)) == (connp)->conn_faddr_v4) && \
(TCP_AC_V4LPORT((acp)) == 0 || \
- TCP_AC_V4LPORT((acp)) == (tcp)->tcp_lport) && \
+ TCP_AC_V4LPORT((acp)) == (connp)->conn_lport) && \
(TCP_AC_V4RPORT((acp)) == 0 || \
- TCP_AC_V4RPORT((acp)) == (tcp)->tcp_fport) && \
- (acp)->ac_start <= (tcp)->tcp_state && \
- (acp)->ac_end >= (tcp)->tcp_state) : \
+ TCP_AC_V4RPORT((acp)) == (connp)->conn_fport) && \
+ (acp)->ac_start <= (tcp)->tcp_state && \
+ (acp)->ac_end >= (tcp)->tcp_state) : \
((IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6LOCAL((acp))) || \
IN6_ARE_ADDR_EQUAL(&TCP_AC_V6LOCAL((acp)), \
- &(tcp)->tcp_ip_src_v6)) && \
+ &(connp)->conn_laddr_v6)) && \
(IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6REMOTE((acp))) || \
IN6_ARE_ADDR_EQUAL(&TCP_AC_V6REMOTE((acp)), \
- &(tcp)->tcp_remote_v6)) && \
+ &(connp)->conn_faddr_v6)) && \
(TCP_AC_V6LPORT((acp)) == 0 || \
- TCP_AC_V6LPORT((acp)) == (tcp)->tcp_lport) && \
+ TCP_AC_V6LPORT((acp)) == (connp)->conn_lport) && \
(TCP_AC_V6RPORT((acp)) == 0 || \
- TCP_AC_V6RPORT((acp)) == (tcp)->tcp_fport) && \
- (acp)->ac_start <= (tcp)->tcp_state && \
+ TCP_AC_V6RPORT((acp)) == (connp)->conn_fport) && \
+ (acp)->ac_start <= (tcp)->tcp_state && \
(acp)->ac_end >= (tcp)->tcp_state))
-#define TCP_AC_MATCH(acp, tcp) \
+#define TCP_AC_MATCH(acp, connp, tcp) \
(((acp)->ac_zoneid == ALL_ZONES || \
- (acp)->ac_zoneid == tcp->tcp_connp->conn_zoneid) ? \
- TCP_AC_ADDR_MATCH(acp, tcp) : 0)
+ (acp)->ac_zoneid == (connp)->conn_zoneid) ? \
+ TCP_AC_ADDR_MATCH(acp, connp, tcp) : 0)
/*
* Build a message containing a tcp_ioc_abort_conn_t structure
@@ -24346,8 +18819,6 @@ tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *acp, tcp_t *tp)
if (mp == NULL)
return (NULL);
- mp->b_datap->db_type = M_CTL;
-
*((uint32_t *)mp->b_rptr) = TCP_IOC_ABORT_CONN;
tacp = (tcp_ioc_abort_conn_t *)((uchar_t *)mp->b_rptr +
sizeof (uint32_t));
@@ -24359,17 +18830,17 @@ tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *acp, tcp_t *tp)
if (acp->ac_local.ss_family == AF_INET) {
tacp->ac_local.ss_family = AF_INET;
tacp->ac_remote.ss_family = AF_INET;
- TCP_AC_V4LOCAL(tacp) = tp->tcp_ip_src;
- TCP_AC_V4REMOTE(tacp) = tp->tcp_remote;
- TCP_AC_V4LPORT(tacp) = tp->tcp_lport;
- TCP_AC_V4RPORT(tacp) = tp->tcp_fport;
+ TCP_AC_V4LOCAL(tacp) = tp->tcp_connp->conn_laddr_v4;
+ TCP_AC_V4REMOTE(tacp) = tp->tcp_connp->conn_faddr_v4;
+ TCP_AC_V4LPORT(tacp) = tp->tcp_connp->conn_lport;
+ TCP_AC_V4RPORT(tacp) = tp->tcp_connp->conn_fport;
} else {
tacp->ac_local.ss_family = AF_INET6;
tacp->ac_remote.ss_family = AF_INET6;
- TCP_AC_V6LOCAL(tacp) = tp->tcp_ip_src_v6;
- TCP_AC_V6REMOTE(tacp) = tp->tcp_remote_v6;
- TCP_AC_V6LPORT(tacp) = tp->tcp_lport;
- TCP_AC_V6RPORT(tacp) = tp->tcp_fport;
+ TCP_AC_V6LOCAL(tacp) = tp->tcp_connp->conn_laddr_v6;
+ TCP_AC_V6REMOTE(tacp) = tp->tcp_connp->conn_faddr_v6;
+ TCP_AC_V6LPORT(tacp) = tp->tcp_connp->conn_lport;
+ TCP_AC_V6RPORT(tacp) = tp->tcp_connp->conn_fport;
}
mp->b_wptr = (uchar_t *)mp->b_rptr + sizeof (uint32_t) + sizeof (*acp);
return (mp);
@@ -24419,14 +18890,32 @@ tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *acp)
}
/*
- * Called inside tcp_rput when a message built using
+ * Called using SQ_FILL when a message built using
* tcp_ioctl_abort_build_msg is put into a queue.
* Note that when we get here there is no wildcard in acp any more.
*/
+/* ARGSUSED2 */
static void
-tcp_ioctl_abort_handler(tcp_t *tcp, mblk_t *mp)
+tcp_ioctl_abort_handler(void *arg, mblk_t *mp, void *arg2,
+ ip_recv_attr_t *dummy)
{
- tcp_ioc_abort_conn_t *acp;
+ conn_t *connp = (conn_t *)arg;
+ tcp_t *tcp = connp->conn_tcp;
+ tcp_ioc_abort_conn_t *acp;
+
+ /*
+ * Don't accept any input on a closed tcp as this TCP logically does
+ * not exist on the system. Don't proceed further with this TCP.
+ * For eg. this packet could trigger another close of this tcp
+ * which would be disastrous for tcp_refcnt. tcp_close_detached /
+ * tcp_clean_death / tcp_closei_local must be called at most once
+ * on a TCP.
+ */
+ if (tcp->tcp_state == TCPS_CLOSED ||
+ tcp->tcp_state == TCPS_BOUND) {
+ freemsg(mp);
+ return;
+ }
acp = (tcp_ioc_abort_conn_t *)(mp->b_rptr + sizeof (uint32_t));
if (tcp->tcp_state <= acp->ac_end) {
@@ -24468,12 +18957,17 @@ startover:
for (tconnp = connfp->connf_head; tconnp != NULL;
tconnp = tconnp->conn_next) {
tcp = tconnp->conn_tcp;
- if (TCP_AC_MATCH(acp, tcp)) {
- CONN_INC_REF(tcp->tcp_connp);
+ /*
+ * We are missing a check on sin6_scope_id for linklocals here,
+ * but current usage is just for aborting based on zoneid
+ * for shared-IP zones.
+ */
+ if (TCP_AC_MATCH(acp, tconnp, tcp)) {
+ CONN_INC_REF(tconnp);
mp = tcp_ioctl_abort_build_msg(acp, tcp);
if (mp == NULL) {
err = ENOMEM;
- CONN_DEC_REF(tcp->tcp_connp);
+ CONN_DEC_REF(tconnp);
break;
}
mp->b_prev = (mblk_t *)tcp;
@@ -24501,8 +18995,9 @@ startover:
listhead = listhead->b_next;
tcp = (tcp_t *)mp->b_prev;
mp->b_next = mp->b_prev = NULL;
- SQUEUE_ENTER_ONE(tcp->tcp_connp->conn_sqp, mp, tcp_input,
- tcp->tcp_connp, SQ_FILL, SQTAG_TCP_ABORT_BUCKET);
+ SQUEUE_ENTER_ONE(tcp->tcp_connp->conn_sqp, mp,
+ tcp_ioctl_abort_handler, tcp->tcp_connp, NULL,
+ SQ_FILL, SQTAG_TCP_ABORT_BUCKET);
}
*count += nmatch;
@@ -24669,7 +19164,7 @@ out:
*/
void
tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq,
- uint32_t seg_ack, int seg_len, tcph_t *tcph)
+ uint32_t seg_ack, int seg_len, tcpha_t *tcpha, ip_recv_attr_t *ira)
{
int32_t bytes_acked;
int32_t gap;
@@ -24677,17 +19172,18 @@ tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq,
tcp_opt_t tcpopt;
uint_t flags;
uint32_t new_swnd = 0;
- conn_t *connp;
+ conn_t *nconnp;
+ conn_t *connp = tcp->tcp_connp;
tcp_stack_t *tcps = tcp->tcp_tcps;
BUMP_LOCAL(tcp->tcp_ibsegs);
DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp);
- flags = (unsigned int)tcph->th_flags[0] & 0xFF;
- new_swnd = BE16_TO_U16(tcph->th_win) <<
- ((tcph->th_flags[0] & TH_SYN) ? 0 : tcp->tcp_snd_ws);
+ flags = (unsigned int)tcpha->tha_flags & 0xFF;
+ new_swnd = ntohs(tcpha->tha_win) <<
+ ((tcpha->tha_flags & TH_SYN) ? 0 : tcp->tcp_snd_ws);
if (tcp->tcp_snd_ts_ok) {
- if (!tcp_paws_check(tcp, tcph, &tcpopt)) {
+ if (!tcp_paws_check(tcp, tcpha, &tcpopt)) {
tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt,
tcp->tcp_rnxt, TH_ACK);
goto done;
@@ -24770,17 +19266,10 @@ tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq,
mutex_enter(&tcps->tcps_iss_key_lock);
context = tcps->tcps_iss_key;
mutex_exit(&tcps->tcps_iss_key_lock);
- arg.ports = tcp->tcp_ports;
+ arg.ports = connp->conn_ports;
/* We use MAPPED addresses in tcp_iss_init */
- arg.src = tcp->tcp_ip_src_v6;
- if (tcp->tcp_ipversion == IPV4_VERSION) {
- IN6_IPADDR_TO_V4MAPPED(
- tcp->tcp_ipha->ipha_dst,
- &arg.dst);
- } else {
- arg.dst =
- tcp->tcp_ip6h->ip6_dst;
- }
+ arg.src = connp->conn_laddr_v6;
+ arg.dst = connp->conn_faddr_v6;
MD5Update(&context, (uchar_t *)&arg,
sizeof (arg));
MD5Final((uchar_t *)answer, &context);
@@ -24813,21 +19302,11 @@ tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq,
*/
if (tcp_clean_death(tcp, 0, 27) == -1)
goto done;
- /*
- * We will come back to tcp_rput_data
- * on the global queue. Packets destined
- * for the global queue will be checked
- * with global policy. But the policy for
- * this packet has already been checked as
- * this was destined for the detached
- * connection. We need to bypass policy
- * check this time by attaching a dummy
- * ipsec_in with ipsec_in_dont_check set.
- */
- connp = ipcl_classify(mp, tcp->tcp_connp->conn_zoneid, ipst);
- if (connp != NULL) {
+ nconnp = ipcl_classify(mp, ira, ipst);
+ if (nconnp != NULL) {
TCP_STAT(tcps, tcp_time_wait_syn_success);
- tcp_reinput(connp, mp, tcp->tcp_connp->conn_sqp);
+ /* Drops ref on nconnp */
+ tcp_reinput(nconnp, mp, ira, ipst);
return;
}
goto done;
@@ -24905,11 +19384,6 @@ process_ack:
tcp->tcp_rnxt, TH_ACK);
}
done:
- if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) {
- DB_CKSUMSTART(mp) = 0;
- mp->b_datap->db_struioflag &= ~STRUIO_EAGER;
- TCP_STAT(tcps, tcp_time_wait_syn_fail);
- }
freemsg(mp);
}
@@ -24965,11 +19439,12 @@ tcp_timer_callback(void *arg)
tcpt = (tcp_timer_t *)mp->b_rptr;
connp = tcpt->connp;
SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_timer_handler, connp,
- SQ_FILL, SQTAG_TCP_TIMER);
+ NULL, SQ_FILL, SQTAG_TCP_TIMER);
}
+/* ARGSUSED */
static void
-tcp_timer_handler(void *arg, mblk_t *mp, void *arg2)
+tcp_timer_handler(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
{
tcp_timer_t *tcpt;
conn_t *connp = (conn_t *)arg;
@@ -24983,7 +19458,7 @@ tcp_timer_handler(void *arg, mblk_t *mp, void *arg2)
* If the TCP has reached the closed state, don't proceed any
* further. This TCP logically does not exist on the system.
* tcpt_proc could for example access queues, that have already
- * been qprocoff'ed off. Also see comments at the start of tcp_input
+ * been qprocoff'ed off.
*/
if (tcp->tcp_state != TCPS_CLOSED) {
(*tcpt->tcpt_proc)(connp);
@@ -25148,26 +19623,9 @@ tcp_setqfull(tcp_t *tcp)
if (tcp->tcp_closed)
return;
- if (IPCL_IS_NONSTR(connp)) {
- (*connp->conn_upcalls->su_txq_full)
- (tcp->tcp_connp->conn_upper_handle, B_TRUE);
- tcp->tcp_flow_stopped = B_TRUE;
- } else {
- queue_t *q = tcp->tcp_wq;
-
- if (!(q->q_flag & QFULL)) {
- mutex_enter(QLOCK(q));
- if (!(q->q_flag & QFULL)) {
- /* still need to set QFULL */
- q->q_flag |= QFULL;
- tcp->tcp_flow_stopped = B_TRUE;
- mutex_exit(QLOCK(q));
- TCP_STAT(tcps, tcp_flwctl_on);
- } else {
- mutex_exit(QLOCK(q));
- }
- }
- }
+ conn_setqfull(connp, &tcp->tcp_flow_stopped);
+ if (tcp->tcp_flow_stopped)
+ TCP_STAT(tcps, tcp_flwctl_on);
}
void
@@ -25177,27 +19635,7 @@ tcp_clrqfull(tcp_t *tcp)
if (tcp->tcp_closed)
return;
-
- if (IPCL_IS_NONSTR(connp)) {
- (*connp->conn_upcalls->su_txq_full)
- (tcp->tcp_connp->conn_upper_handle, B_FALSE);
- tcp->tcp_flow_stopped = B_FALSE;
- } else {
- queue_t *q = tcp->tcp_wq;
-
- if (q->q_flag & QFULL) {
- mutex_enter(QLOCK(q));
- if (q->q_flag & QFULL) {
- q->q_flag &= ~QFULL;
- tcp->tcp_flow_stopped = B_FALSE;
- mutex_exit(QLOCK(q));
- if (q->q_flag & QWANTW)
- qbackenable(q, 0);
- } else {
- mutex_exit(QLOCK(q));
- }
- }
- }
+ conn_clrqfull(connp, &tcp->tcp_flow_stopped);
}
/*
@@ -25246,10 +19684,7 @@ tcp_kstat2_init(netstackid_t stackid, tcp_stat_t *tcps_statisticsp)
tcp_stat_t template = {
{ "tcp_time_wait", KSTAT_DATA_UINT64 },
{ "tcp_time_wait_syn", KSTAT_DATA_UINT64 },
- { "tcp_time_wait_success", KSTAT_DATA_UINT64 },
- { "tcp_time_wait_fail", KSTAT_DATA_UINT64 },
- { "tcp_reinput_syn", KSTAT_DATA_UINT64 },
- { "tcp_ip_output", KSTAT_DATA_UINT64 },
+ { "tcp_time_wait_syn_success", KSTAT_DATA_UINT64 },
{ "tcp_detach_non_time_wait", KSTAT_DATA_UINT64 },
{ "tcp_detach_time_wait", KSTAT_DATA_UINT64 },
{ "tcp_time_wait_reap", KSTAT_DATA_UINT64 },
@@ -25287,37 +19722,14 @@ tcp_kstat2_init(netstackid_t stackid, tcp_stat_t *tcps_statisticsp)
{ "tcp_timermp_freed", KSTAT_DATA_UINT64 },
{ "tcp_push_timer_cnt", KSTAT_DATA_UINT64 },
{ "tcp_ack_timer_cnt", KSTAT_DATA_UINT64 },
- { "tcp_ire_null1", KSTAT_DATA_UINT64 },
- { "tcp_ire_null", KSTAT_DATA_UINT64 },
- { "tcp_ip_send", KSTAT_DATA_UINT64 },
- { "tcp_ip_ire_send", KSTAT_DATA_UINT64 },
{ "tcp_wsrv_called", KSTAT_DATA_UINT64 },
{ "tcp_flwctl_on", KSTAT_DATA_UINT64 },
{ "tcp_timer_fire_early", KSTAT_DATA_UINT64 },
{ "tcp_timer_fire_miss", KSTAT_DATA_UINT64 },
{ "tcp_rput_v6_error", KSTAT_DATA_UINT64 },
- { "tcp_out_sw_cksum", KSTAT_DATA_UINT64 },
- { "tcp_out_sw_cksum_bytes", KSTAT_DATA_UINT64 },
{ "tcp_zcopy_on", KSTAT_DATA_UINT64 },
{ "tcp_zcopy_off", KSTAT_DATA_UINT64 },
{ "tcp_zcopy_backoff", KSTAT_DATA_UINT64 },
- { "tcp_zcopy_disable", KSTAT_DATA_UINT64 },
- { "tcp_mdt_pkt_out", KSTAT_DATA_UINT64 },
- { "tcp_mdt_pkt_out_v4", KSTAT_DATA_UINT64 },
- { "tcp_mdt_pkt_out_v6", KSTAT_DATA_UINT64 },
- { "tcp_mdt_discarded", KSTAT_DATA_UINT64 },
- { "tcp_mdt_conn_halted1", KSTAT_DATA_UINT64 },
- { "tcp_mdt_conn_halted2", KSTAT_DATA_UINT64 },
- { "tcp_mdt_conn_halted3", KSTAT_DATA_UINT64 },
- { "tcp_mdt_conn_resumed1", KSTAT_DATA_UINT64 },
- { "tcp_mdt_conn_resumed2", KSTAT_DATA_UINT64 },
- { "tcp_mdt_legacy_small", KSTAT_DATA_UINT64 },
- { "tcp_mdt_legacy_all", KSTAT_DATA_UINT64 },
- { "tcp_mdt_legacy_ret", KSTAT_DATA_UINT64 },
- { "tcp_mdt_allocfail", KSTAT_DATA_UINT64 },
- { "tcp_mdt_addpdescfail", KSTAT_DATA_UINT64 },
- { "tcp_mdt_allocd", KSTAT_DATA_UINT64 },
- { "tcp_mdt_linked", KSTAT_DATA_UINT64 },
{ "tcp_fusion_flowctl", KSTAT_DATA_UINT64 },
{ "tcp_fusion_backenabled", KSTAT_DATA_UINT64 },
{ "tcp_fusion_urg", KSTAT_DATA_UINT64 },
@@ -25490,7 +19902,7 @@ tcp_kstat_update(kstat_t *kp, int rw)
connfp = &ipst->ips_ipcl_globalhash_fanout[i];
connp = NULL;
while ((connp =
- ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) {
+ ipcl_get_next_conn(connfp, connp, IPCL_TCPCONN)) != NULL) {
tcp = connp->conn_tcp;
switch (tcp_snmp_state(tcp)) {
case MIB2_TCP_established:
@@ -25565,48 +19977,6 @@ tcp_kstat_update(kstat_t *kp, int rw)
return (0);
}
-void
-tcp_reinput(conn_t *connp, mblk_t *mp, squeue_t *sqp)
-{
- uint16_t hdr_len;
- ipha_t *ipha;
- uint8_t *nexthdrp;
- tcph_t *tcph;
- tcp_stack_t *tcps = connp->conn_tcp->tcp_tcps;
-
- /* Already has an eager */
- if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) {
- TCP_STAT(tcps, tcp_reinput_syn);
- SQUEUE_ENTER_ONE(connp->conn_sqp, mp, connp->conn_recv, connp,
- SQ_PROCESS, SQTAG_TCP_REINPUT_EAGER);
- return;
- }
-
- switch (IPH_HDR_VERSION(mp->b_rptr)) {
- case IPV4_VERSION:
- ipha = (ipha_t *)mp->b_rptr;
- hdr_len = IPH_HDR_LENGTH(ipha);
- break;
- case IPV6_VERSION:
- if (!ip_hdr_length_nexthdr_v6(mp, (ip6_t *)mp->b_rptr,
- &hdr_len, &nexthdrp)) {
- CONN_DEC_REF(connp);
- freemsg(mp);
- return;
- }
- break;
- }
-
- tcph = (tcph_t *)&mp->b_rptr[hdr_len];
- if ((tcph->th_flags[0] & (TH_SYN|TH_ACK|TH_RST|TH_URG)) == TH_SYN) {
- mp->b_datap->db_struioflag |= STRUIO_EAGER;
- DB_CKSUMSTART(mp) = (intptr_t)sqp;
- }
-
- SQUEUE_ENTER_ONE(connp->conn_sqp, mp, connp->conn_recv, connp,
- SQ_FILL, SQTAG_TCP_REINPUT);
-}
-
static int
tcp_squeue_switch(int val)
{
@@ -25653,278 +20023,20 @@ tcp_squeue_add(squeue_t *sqp)
tcp_time_wait->tcp_free_list_cnt = 0;
}
-static int
-tcp_post_ip_bind(tcp_t *tcp, mblk_t *mp, int error, cred_t *cr, pid_t pid)
+/*
+ * On a labeled system we have some protocols above TCP, such as RPC, which
+ * appear to assume that every mblk in a chain has a db_credp.
+ */
+static void
+tcp_setcred_data(mblk_t *mp, ip_recv_attr_t *ira)
{
- mblk_t *ire_mp = NULL;
- mblk_t *syn_mp;
- mblk_t *mdti;
- mblk_t *lsoi;
- int retval;
- tcph_t *tcph;
- cred_t *ecr;
- ts_label_t *tsl;
- uint32_t mss;
- conn_t *connp = tcp->tcp_connp;
- tcp_stack_t *tcps = tcp->tcp_tcps;
-
- if (error == 0) {
- /*
- * Adapt Multidata information, if any. The
- * following tcp_mdt_update routine will free
- * the message.
- */
- if (mp != NULL && ((mdti = tcp_mdt_info_mp(mp)) != NULL)) {
- tcp_mdt_update(tcp, &((ip_mdt_info_t *)mdti->
- b_rptr)->mdt_capab, B_TRUE);
- freemsg(mdti);
- }
-
- /*
- * Check to update LSO information with tcp, and
- * tcp_lso_update routine will free the message.
- */
- if (mp != NULL && ((lsoi = tcp_lso_info_mp(mp)) != NULL)) {
- tcp_lso_update(tcp, &((ip_lso_info_t *)lsoi->
- b_rptr)->lso_capab);
- freemsg(lsoi);
- }
-
- /* Get the IRE, if we had requested for it */
- if (mp != NULL)
- ire_mp = tcp_ire_mp(&mp);
-
- if (tcp->tcp_hard_binding) {
- tcp->tcp_hard_binding = B_FALSE;
- tcp->tcp_hard_bound = B_TRUE;
- CL_INET_CONNECT(tcp->tcp_connp, tcp, B_TRUE, retval);
- if (retval != 0) {
- error = EADDRINUSE;
- goto bind_failed;
- }
- } else {
- if (ire_mp != NULL)
- freeb(ire_mp);
- goto after_syn_sent;
- }
-
- retval = tcp_adapt_ire(tcp, ire_mp);
- if (ire_mp != NULL)
- freeb(ire_mp);
- if (retval == 0) {
- error = (int)((tcp->tcp_state >= TCPS_SYN_SENT) ?
- ENETUNREACH : EADDRNOTAVAIL);
- goto ipcl_rm;
- }
- /*
- * Don't let an endpoint connect to itself.
- * Also checked in tcp_connect() but that
- * check can't handle the case when the
- * local IP address is INADDR_ANY.
- */
- if (tcp->tcp_ipversion == IPV4_VERSION) {
- if ((tcp->tcp_ipha->ipha_dst ==
- tcp->tcp_ipha->ipha_src) &&
- (BE16_EQL(tcp->tcp_tcph->th_lport,
- tcp->tcp_tcph->th_fport))) {
- error = EADDRNOTAVAIL;
- goto ipcl_rm;
- }
- } else {
- if (IN6_ARE_ADDR_EQUAL(
- &tcp->tcp_ip6h->ip6_dst,
- &tcp->tcp_ip6h->ip6_src) &&
- (BE16_EQL(tcp->tcp_tcph->th_lport,
- tcp->tcp_tcph->th_fport))) {
- error = EADDRNOTAVAIL;
- goto ipcl_rm;
- }
- }
- ASSERT(tcp->tcp_state == TCPS_SYN_SENT);
- /*
- * This should not be possible! Just for
- * defensive coding...
- */
- if (tcp->tcp_state != TCPS_SYN_SENT)
- goto after_syn_sent;
-
- if (is_system_labeled() &&
- !tcp_update_label(tcp, CONN_CRED(tcp->tcp_connp))) {
- error = EHOSTUNREACH;
- goto ipcl_rm;
- }
-
- /*
- * tcp_adapt_ire() does not adjust
- * for TCP/IP header length.
- */
- mss = tcp->tcp_mss - tcp->tcp_hdr_len;
-
- /*
- * Just make sure our rwnd is at
- * least tcp_recv_hiwat_mss * MSS
- * large, and round up to the nearest
- * MSS.
- *
- * We do the round up here because
- * we need to get the interface
- * MTU first before we can do the
- * round up.
- */
- tcp->tcp_rwnd = MAX(MSS_ROUNDUP(tcp->tcp_rwnd, mss),
- tcps->tcps_recv_hiwat_minmss * mss);
- tcp->tcp_recv_hiwater = tcp->tcp_rwnd;
- tcp_set_ws_value(tcp);
- U32_TO_ABE16((tcp->tcp_rwnd >> tcp->tcp_rcv_ws),
- tcp->tcp_tcph->th_win);
- if (tcp->tcp_rcv_ws > 0 || tcps->tcps_wscale_always)
- tcp->tcp_snd_ws_ok = B_TRUE;
-
- /*
- * Set tcp_snd_ts_ok to true
- * so that tcp_xmit_mp will
- * include the timestamp
- * option in the SYN segment.
- */
- if (tcps->tcps_tstamp_always ||
- (tcp->tcp_rcv_ws && tcps->tcps_tstamp_if_wscale)) {
- tcp->tcp_snd_ts_ok = B_TRUE;
- }
-
- /*
- * tcp_snd_sack_ok can be set in
- * tcp_adapt_ire() if the sack metric
- * is set. So check it here also.
- */
- if (tcps->tcps_sack_permitted == 2 ||
- tcp->tcp_snd_sack_ok) {
- if (tcp->tcp_sack_info == NULL) {
- tcp->tcp_sack_info =
- kmem_cache_alloc(tcp_sack_info_cache,
- KM_SLEEP);
- }
- tcp->tcp_snd_sack_ok = B_TRUE;
- }
+ ASSERT(is_system_labeled());
+ ASSERT(ira->ira_cred != NULL);
- /*
- * Should we use ECN? Note that the current
- * default value (SunOS 5.9) of tcp_ecn_permitted
- * is 1. The reason for doing this is that there
- * are equipments out there that will drop ECN
- * enabled IP packets. Setting it to 1 avoids
- * compatibility problems.
- */
- if (tcps->tcps_ecn_permitted == 2)
- tcp->tcp_ecn_ok = B_TRUE;
-
- TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
- syn_mp = tcp_xmit_mp(tcp, NULL, 0, NULL, NULL,
- tcp->tcp_iss, B_FALSE, NULL, B_FALSE);
- if (syn_mp) {
- /*
- * cr contains the cred from the thread calling
- * connect().
- *
- * If no thread cred is available, use the
- * socket creator's cred instead. If still no
- * cred, drop the request rather than risk a
- * panic on production systems.
- */
- if (cr == NULL) {
- cr = CONN_CRED(connp);
- pid = tcp->tcp_cpid;
- ASSERT(cr != NULL);
- if (cr != NULL) {
- mblk_setcred(syn_mp, cr, pid);
- } else {
- error = ECONNABORTED;
- goto ipcl_rm;
- }
-
- /*
- * If an effective security label exists for
- * the connection, create a copy of the thread's
- * cred but with the effective label attached.
- */
- } else if (is_system_labeled() &&
- connp->conn_effective_cred != NULL &&
- (tsl = crgetlabel(connp->
- conn_effective_cred)) != NULL) {
- if ((ecr = copycred_from_tslabel(cr,
- tsl, KM_NOSLEEP)) == NULL) {
- error = ENOMEM;
- goto ipcl_rm;
- }
- mblk_setcred(syn_mp, ecr, pid);
- crfree(ecr);
-
- /*
- * Default to using the thread's cred unchanged.
- */
- } else {
- mblk_setcred(syn_mp, cr, pid);
- }
-
- /*
- * We must bump the generation before sending the syn
- * to ensure that we use the right generation in case
- * this thread issues a "connected" up call.
- */
- SOCK_CONNID_BUMP(tcp->tcp_connid);
-
- tcp_send_data(tcp, tcp->tcp_wq, syn_mp);
- }
- after_syn_sent:
- if (mp != NULL) {
- ASSERT(mp->b_cont == NULL);
- freeb(mp);
- }
- return (error);
- } else {
- /* error */
- if (tcp->tcp_debug) {
- (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR,
- "tcp_post_ip_bind: error == %d", error);
- }
- if (mp != NULL) {
- freeb(mp);
- }
+ while (mp != NULL) {
+ mblk_setcred(mp, ira->ira_cred, NOPID);
+ mp = mp->b_cont;
}
-
-ipcl_rm:
- /*
- * Need to unbind with classifier since we were just
- * told that our bind succeeded. a.k.a error == 0 at the entry.
- */
- tcp->tcp_hard_bound = B_FALSE;
- tcp->tcp_hard_binding = B_FALSE;
-
- ipcl_hash_remove(connp);
-
-bind_failed:
- tcp->tcp_state = TCPS_IDLE;
- if (tcp->tcp_ipversion == IPV4_VERSION)
- tcp->tcp_ipha->ipha_src = 0;
- else
- V6_SET_ZERO(tcp->tcp_ip6h->ip6_src);
- /*
- * Copy of the src addr. in tcp_t is needed since
- * the lookup funcs. can only look at tcp_t
- */
- V6_SET_ZERO(tcp->tcp_ip_src_v6);
-
- tcph = tcp->tcp_tcph;
- tcph->th_lport[0] = 0;
- tcph->th_lport[1] = 0;
- tcp_bind_hash_remove(tcp);
- bzero(&connp->u_port, sizeof (connp->u_port));
- /* blow away saved option results if any */
- if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
- tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
-
- conn_delete_ire(tcp->tcp_connp, NULL);
-
- return (error);
}
static int
@@ -25936,16 +20048,16 @@ tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr,
boolean_t user_specified;
in_port_t allocated_port;
in_port_t requested_port = *requested_port_ptr;
- conn_t *connp;
+ conn_t *connp = tcp->tcp_connp;
zone_t *zone;
tcp_stack_t *tcps = tcp->tcp_tcps;
- in6_addr_t v6addr = tcp->tcp_ip_src_v6;
+ in6_addr_t v6addr = connp->conn_laddr_v6;
/*
* XXX It's up to the caller to specify bind_to_req_port_only or not.
*/
- if (cr == NULL)
- cr = tcp->tcp_cred;
+ ASSERT(cr != NULL);
+
/*
* Get a valid port (within the anonymous range and should not
* be a privileged one) to use if the user has not given a port.
@@ -25961,7 +20073,7 @@ tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr,
mlptype = mlptSingle;
mlp_port = requested_port;
if (requested_port == 0) {
- requested_port = tcp->tcp_anon_priv_bind ?
+ requested_port = connp->conn_anon_priv_bind ?
tcp_get_next_priv_port(tcp) :
tcp_update_next_port(tcps->tcps_next_port_to_try,
tcp, B_TRUE);
@@ -25975,7 +20087,6 @@ tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr,
* this socket and RPC is MLP in this zone, then give him an
* anonymous MLP.
*/
- connp = tcp->tcp_connp;
if (connp->conn_anon_mlp && is_system_labeled()) {
zone = crgetzone(cr);
addrtype = tsol_mlp_addr_type(
@@ -26016,7 +20127,7 @@ tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr,
if (priv) {
if (secpolicy_net_privaddr(cr, requested_port,
IPPROTO_TCP) != 0) {
- if (tcp->tcp_debug) {
+ if (connp->conn_debug) {
(void) strlog(TCP_MOD_ID, 0, 1,
SL_ERROR|SL_TRACE,
"tcp_bind: no priv for port %d",
@@ -26044,7 +20155,7 @@ tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr,
if (mlptype != mlptSingle) {
if (secpolicy_net_bindmlp(cr) != 0) {
- if (tcp->tcp_debug) {
+ if (connp->conn_debug) {
(void) strlog(TCP_MOD_ID, 0, 1,
SL_ERROR|SL_TRACE,
"tcp_bind: no priv for multilevel port %d",
@@ -26068,7 +20179,7 @@ tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr,
mlpzone = tsol_mlp_findzone(IPPROTO_TCP,
htons(mlp_port));
if (connp->conn_zoneid != mlpzone) {
- if (tcp->tcp_debug) {
+ if (connp->conn_debug) {
(void) strlog(TCP_MOD_ID, 0, 1,
SL_ERROR|SL_TRACE,
"tcp_bind: attempt to bind port "
@@ -26083,10 +20194,10 @@ tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr,
if (!user_specified) {
int err;
- err = tsol_mlp_anon(zone, mlptype, connp->conn_ulp,
+ err = tsol_mlp_anon(zone, mlptype, connp->conn_proto,
requested_port, B_TRUE);
if (err != 0) {
- if (tcp->tcp_debug) {
+ if (connp->conn_debug) {
(void) strlog(TCP_MOD_ID, 0, 1,
SL_ERROR|SL_TRACE,
"tcp_bind: cannot establish anon "
@@ -26101,17 +20212,18 @@ tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr,
}
allocated_port = tcp_bindi(tcp, requested_port, &v6addr,
- tcp->tcp_reuseaddr, B_FALSE, bind_to_req_port_only, user_specified);
+ connp->conn_reuseaddr, B_FALSE, bind_to_req_port_only,
+ user_specified);
if (allocated_port == 0) {
connp->conn_mlp_type = mlptSingle;
if (connp->conn_anon_port) {
connp->conn_anon_port = B_FALSE;
- (void) tsol_mlp_anon(zone, mlptype, connp->conn_ulp,
+ (void) tsol_mlp_anon(zone, mlptype, connp->conn_proto,
requested_port, B_FALSE);
}
if (bind_to_req_port_only) {
- if (tcp->tcp_debug) {
+ if (connp->conn_debug) {
(void) strlog(TCP_MOD_ID, 0, 1,
SL_ERROR|SL_TRACE,
"tcp_bind: requested addr busy");
@@ -26119,7 +20231,7 @@ tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr,
return (-TADDRBUSY);
} else {
/* If we are out of ports, fail the bind. */
- if (tcp->tcp_debug) {
+ if (connp->conn_debug) {
(void) strlog(TCP_MOD_ID, 0, 1,
SL_ERROR|SL_TRACE,
"tcp_bind: out of ports?");
@@ -26133,6 +20245,9 @@ tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr,
return (0);
}
+/*
+ * Check the address and check/pick a local port number.
+ */
static int
tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
boolean_t bind_to_req_port_only)
@@ -26140,18 +20255,22 @@ tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
tcp_t *tcp = connp->conn_tcp;
sin_t *sin;
sin6_t *sin6;
- in_port_t requested_port;
+ in_port_t requested_port;
ipaddr_t v4addr;
in6_addr_t v6addr;
- uint_t ipversion;
- int error = 0;
+ ip_laddr_t laddr_type = IPVL_UNICAST_UP; /* INADDR_ANY */
+ zoneid_t zoneid = IPCL_ZONEID(connp);
+ ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
+ uint_t scopeid = 0;
+ int error = 0;
+ ip_xmit_attr_t *ixa = connp->conn_ixa;
ASSERT((uintptr_t)len <= (uintptr_t)INT_MAX);
if (tcp->tcp_state == TCPS_BOUND) {
return (0);
} else if (tcp->tcp_state > TCPS_BOUND) {
- if (tcp->tcp_debug) {
+ if (connp->conn_debug) {
(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
"tcp_bind: bad state, %d", tcp->tcp_state);
}
@@ -26161,7 +20280,7 @@ tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
ASSERT(sa != NULL && len != 0);
if (!OK_32PTR((char *)sa)) {
- if (tcp->tcp_debug) {
+ if (connp->conn_debug) {
(void) strlog(TCP_MOD_ID, 0, 1,
SL_ERROR|SL_TRACE,
"tcp_bind: bad address parameter, "
@@ -26171,38 +20290,48 @@ tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
return (-TPROTO);
}
+ error = proto_verify_ip_addr(connp->conn_family, sa, len);
+ if (error != 0) {
+ return (error);
+ }
+
switch (len) {
case sizeof (sin_t): /* Complete IPv4 address */
sin = (sin_t *)sa;
- /*
- * With sockets sockfs will accept bogus sin_family in
- * bind() and replace it with the family used in the socket
- * call.
- */
- if (sin->sin_family != AF_INET ||
- tcp->tcp_family != AF_INET) {
- return (EAFNOSUPPORT);
- }
requested_port = ntohs(sin->sin_port);
- ipversion = IPV4_VERSION;
v4addr = sin->sin_addr.s_addr;
IN6_IPADDR_TO_V4MAPPED(v4addr, &v6addr);
+ if (v4addr != INADDR_ANY) {
+ laddr_type = ip_laddr_verify_v4(v4addr, zoneid, ipst,
+ B_FALSE);
+ }
break;
case sizeof (sin6_t): /* Complete IPv6 address */
sin6 = (sin6_t *)sa;
- if (sin6->sin6_family != AF_INET6 ||
- tcp->tcp_family != AF_INET6) {
- return (EAFNOSUPPORT);
- }
- requested_port = ntohs(sin6->sin6_port);
- ipversion = IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr) ?
- IPV4_VERSION : IPV6_VERSION;
v6addr = sin6->sin6_addr;
+ requested_port = ntohs(sin6->sin6_port);
+ if (IN6_IS_ADDR_V4MAPPED(&v6addr)) {
+ if (connp->conn_ipv6_v6only)
+ return (EADDRNOTAVAIL);
+
+ IN6_V4MAPPED_TO_IPADDR(&v6addr, v4addr);
+ if (v4addr != INADDR_ANY) {
+ laddr_type = ip_laddr_verify_v4(v4addr,
+ zoneid, ipst, B_FALSE);
+ }
+ } else {
+ if (!IN6_IS_ADDR_UNSPECIFIED(&v6addr)) {
+ if (IN6_IS_ADDR_LINKSCOPE(&v6addr))
+ scopeid = sin6->sin6_scope_id;
+ laddr_type = ip_laddr_verify_v6(&v6addr,
+ zoneid, ipst, B_FALSE, scopeid);
+ }
+ }
break;
default:
- if (tcp->tcp_debug) {
+ if (connp->conn_debug) {
(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
"tcp_bind: bad address length, %d", len);
}
@@ -26210,34 +20339,32 @@ tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
/* return (-TBADADDR); */
}
- tcp->tcp_bound_source_v6 = v6addr;
+ /* Is the local address a valid unicast address? */
+ if (laddr_type == IPVL_BAD)
+ return (EADDRNOTAVAIL);
- /* Check for change in ipversion */
- if (tcp->tcp_ipversion != ipversion) {
- ASSERT(tcp->tcp_family == AF_INET6);
- error = (ipversion == IPV6_VERSION) ?
- tcp_header_init_ipv6(tcp) : tcp_header_init_ipv4(tcp);
- if (error) {
- return (ENOMEM);
- }
- }
-
- /*
- * Initialize family specific fields. Copy of the src addr.
- * in tcp_t is needed for the lookup funcs.
- */
- if (tcp->tcp_ipversion == IPV6_VERSION) {
- tcp->tcp_ip6h->ip6_src = v6addr;
+ connp->conn_bound_addr_v6 = v6addr;
+ if (scopeid != 0) {
+ ixa->ixa_flags |= IXAF_SCOPEID_SET;
+ ixa->ixa_scopeid = scopeid;
+ connp->conn_incoming_ifindex = scopeid;
} else {
- IN6_V4MAPPED_TO_IPADDR(&v6addr, tcp->tcp_ipha->ipha_src);
+ ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
+ connp->conn_incoming_ifindex = connp->conn_bound_if;
}
- tcp->tcp_ip_src_v6 = v6addr;
+
+ connp->conn_laddr_v6 = v6addr;
+ connp->conn_saddr_v6 = v6addr;
bind_to_req_port_only = requested_port != 0 && bind_to_req_port_only;
error = tcp_bind_select_lport(tcp, &requested_port,
bind_to_req_port_only, cr);
-
+ if (error != 0) {
+ connp->conn_laddr_v6 = ipv6_all_zeros;
+ connp->conn_saddr_v6 = ipv6_all_zeros;
+ connp->conn_bound_addr_v6 = ipv6_all_zeros;
+ }
return (error);
}
@@ -26253,7 +20380,7 @@ tcp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
tcp_t *tcp = connp->conn_tcp;
if (tcp->tcp_state >= TCPS_BOUND) {
- if (tcp->tcp_debug) {
+ if (connp->conn_debug) {
(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
"tcp_bind: bad state, %d", tcp->tcp_state);
}
@@ -26265,19 +20392,8 @@ tcp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
return (error);
ASSERT(tcp->tcp_state == TCPS_BOUND);
-
tcp->tcp_conn_req_max = 0;
-
- if (tcp->tcp_family == AF_INET6) {
- ASSERT(tcp->tcp_connp->conn_af_isv6);
- error = ip_proto_bind_laddr_v6(connp, NULL, IPPROTO_TCP,
- &tcp->tcp_bound_source_v6, 0, B_FALSE);
- } else {
- ASSERT(!tcp->tcp_connp->conn_af_isv6);
- error = ip_proto_bind_laddr_v4(connp, NULL, IPPROTO_TCP,
- tcp->tcp_ipha->ipha_src, 0, B_FALSE);
- }
- return (tcp_post_ip_bind(tcp, NULL, error, NULL, 0));
+ return (0);
}
int
@@ -26337,7 +20453,14 @@ tcp_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len,
ipaddr_t *dstaddrp;
in_port_t dstport;
uint_t srcid;
- int error = 0;
+ int error;
+ uint32_t mss;
+ mblk_t *syn_mp;
+ tcp_stack_t *tcps = tcp->tcp_tcps;
+ int32_t oldstate;
+ ip_xmit_attr_t *ixa = connp->conn_ixa;
+
+ oldstate = tcp->tcp_state;
switch (len) {
default:
@@ -26351,7 +20474,7 @@ tcp_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len,
if (sin->sin_port == 0) {
return (-TBADADDR);
}
- if (tcp->tcp_connp && tcp->tcp_connp->conn_ipv6_v6only) {
+ if (connp->conn_ipv6_v6only) {
return (EAFNOSUPPORT);
}
break;
@@ -26365,23 +20488,18 @@ tcp_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len,
}
/*
* If we're connecting to an IPv4-mapped IPv6 address, we need to
- * make sure that the template IP header in the tcp structure is an
- * IPv4 header, and that the tcp_ipversion is IPV4_VERSION. We
+ * make sure that the conn_ipversion is IPV4_VERSION. We
* need to this before we call tcp_bindi() so that the port lookup
* code will look for ports in the correct port space (IPv4 and
* IPv6 have separate port spaces).
*/
- if (tcp->tcp_family == AF_INET6 && tcp->tcp_ipversion == IPV6_VERSION &&
+ if (connp->conn_family == AF_INET6 &&
+ connp->conn_ipversion == IPV6_VERSION &&
IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
- int err = 0;
+ if (connp->conn_ipv6_v6only)
+ return (EADDRNOTAVAIL);
- err = tcp_header_init_ipv4(tcp);
- if (err != 0) {
- error = ENOMEM;
- goto connect_failed;
- }
- if (tcp->tcp_lport != 0)
- *(uint16_t *)tcp->tcp_tcph->th_lport = tcp->tcp_lport;
+ connp->conn_ipversion = IPV4_VERSION;
}
switch (tcp->tcp_state) {
@@ -26399,43 +20517,147 @@ tcp_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len,
*/
/* FALLTHRU */
case TCPS_BOUND:
- if (tcp->tcp_family == AF_INET6) {
- if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
- return (tcp_connect_ipv6(tcp,
- &sin6->sin6_addr,
- sin6->sin6_port, sin6->sin6_flowinfo,
- sin6->__sin6_src_id, sin6->sin6_scope_id,
- cr, pid));
- }
+ break;
+ default:
+ return (-TOUTSTATE);
+ }
+
+ /*
+ * We update our cred/cpid based on the caller of connect
+ */
+ if (connp->conn_cred != cr) {
+ crhold(cr);
+ crfree(connp->conn_cred);
+ connp->conn_cred = cr;
+ }
+ connp->conn_cpid = pid;
+
+ /* Cache things in the ixa without any refhold */
+ ixa->ixa_cred = cr;
+ ixa->ixa_cpid = pid;
+ if (is_system_labeled()) {
+ /* We need to restart with a label based on the cred */
+ ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
+ }
+
+ if (connp->conn_family == AF_INET6) {
+ if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
+ error = tcp_connect_ipv6(tcp, &sin6->sin6_addr,
+ sin6->sin6_port, sin6->sin6_flowinfo,
+ sin6->__sin6_src_id, sin6->sin6_scope_id);
+ } else {
/*
* Destination adress is mapped IPv6 address.
* Source bound address should be unspecified or
* IPv6 mapped address as well.
*/
if (!IN6_IS_ADDR_UNSPECIFIED(
- &tcp->tcp_bound_source_v6) &&
- !IN6_IS_ADDR_V4MAPPED(&tcp->tcp_bound_source_v6)) {
+ &connp->conn_bound_addr_v6) &&
+ !IN6_IS_ADDR_V4MAPPED(&connp->conn_bound_addr_v6)) {
return (EADDRNOTAVAIL);
}
dstaddrp = &V4_PART_OF_V6((sin6->sin6_addr));
dstport = sin6->sin6_port;
srcid = sin6->__sin6_src_id;
- } else {
- dstaddrp = &sin->sin_addr.s_addr;
- dstport = sin->sin_port;
- srcid = 0;
+ error = tcp_connect_ipv4(tcp, dstaddrp, dstport,
+ srcid);
}
+ } else {
+ dstaddrp = &sin->sin_addr.s_addr;
+ dstport = sin->sin_port;
+ srcid = 0;
+ error = tcp_connect_ipv4(tcp, dstaddrp, dstport, srcid);
+ }
- error = tcp_connect_ipv4(tcp, dstaddrp, dstport, srcid, cr,
- pid);
- break;
- default:
- return (-TOUTSTATE);
+ if (error != 0)
+ goto connect_failed;
+
+ CL_INET_CONNECT(connp, B_TRUE, error);
+ if (error != 0)
+ goto connect_failed;
+
+ /* connect succeeded */
+ BUMP_MIB(&tcps->tcps_mib, tcpActiveOpens);
+ tcp->tcp_active_open = 1;
+
+ /*
+ * tcp_set_destination() does not adjust for TCP/IP header length.
+ */
+ mss = tcp->tcp_mss - connp->conn_ht_iphc_len;
+
+ /*
+ * Just make sure our rwnd is at least rcvbuf * MSS large, and round up
+ * to the nearest MSS.
+ *
+ * We do the round up here because we need to get the interface MTU
+ * first before we can do the round up.
+ */
+ tcp->tcp_rwnd = connp->conn_rcvbuf;
+ tcp->tcp_rwnd = MAX(MSS_ROUNDUP(tcp->tcp_rwnd, mss),
+ tcps->tcps_recv_hiwat_minmss * mss);
+ connp->conn_rcvbuf = tcp->tcp_rwnd;
+ tcp_set_ws_value(tcp);
+ tcp->tcp_tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws);
+ if (tcp->tcp_rcv_ws > 0 || tcps->tcps_wscale_always)
+ tcp->tcp_snd_ws_ok = B_TRUE;
+
+ /*
+ * Set tcp_snd_ts_ok to true
+ * so that tcp_xmit_mp will
+ * include the timestamp
+ * option in the SYN segment.
+ */
+ if (tcps->tcps_tstamp_always ||
+ (tcp->tcp_rcv_ws && tcps->tcps_tstamp_if_wscale)) {
+ tcp->tcp_snd_ts_ok = B_TRUE;
}
+
/*
- * Note: Code below is the "failure" case
+ * tcp_snd_sack_ok can be set in
+ * tcp_set_destination() if the sack metric
+ * is set. So check it here also.
+ */
+ if (tcps->tcps_sack_permitted == 2 ||
+ tcp->tcp_snd_sack_ok) {
+ if (tcp->tcp_sack_info == NULL) {
+ tcp->tcp_sack_info = kmem_cache_alloc(
+ tcp_sack_info_cache, KM_SLEEP);
+ }
+ tcp->tcp_snd_sack_ok = B_TRUE;
+ }
+
+ /*
+ * Should we use ECN? Note that the current
+ * default value (SunOS 5.9) of tcp_ecn_permitted
+ * is 1. The reason for doing this is that there
+ * are equipments out there that will drop ECN
+ * enabled IP packets. Setting it to 1 avoids
+ * compatibility problems.
*/
+ if (tcps->tcps_ecn_permitted == 2)
+ tcp->tcp_ecn_ok = B_TRUE;
+
+ TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
+ syn_mp = tcp_xmit_mp(tcp, NULL, 0, NULL, NULL,
+ tcp->tcp_iss, B_FALSE, NULL, B_FALSE);
+ if (syn_mp != NULL) {
+ /*
+ * We must bump the generation before sending the syn
+ * to ensure that we use the right generation in case
+ * this thread issues a "connected" up call.
+ */
+ SOCK_CONNID_BUMP(tcp->tcp_connid);
+ tcp_send_data(tcp, syn_mp);
+ }
+
+ if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
+ tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
+ return (0);
+
connect_failed:
+ connp->conn_faddr_v6 = ipv6_all_zeros;
+ connp->conn_fport = 0;
+ tcp->tcp_state = oldstate;
if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
return (error);
@@ -26446,7 +20668,6 @@ tcp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
socklen_t len, sock_connid_t *id, cred_t *cr)
{
conn_t *connp = (conn_t *)proto_handle;
- tcp_t *tcp = connp->conn_tcp;
squeue_t *sqp = connp->conn_sqp;
int error;
@@ -26455,7 +20676,7 @@ tcp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
/* All Solaris components should pass a cred for this operation. */
ASSERT(cr != NULL);
- error = proto_verify_ip_addr(tcp->tcp_family, sa, len);
+ error = proto_verify_ip_addr(connp->conn_family, sa, len);
if (error != 0) {
return (error);
}
@@ -26493,7 +20714,7 @@ tcp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
}
}
- if (tcp->tcp_loopback) {
+ if (connp->conn_tcp->tcp_loopback) {
struct sock_proto_props sopp;
sopp.sopp_flags = SOCKOPT_LOOPBACK;
@@ -26521,7 +20742,7 @@ tcp_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
return (NULL);
}
- connp = tcp_create_common(NULL, credp, isv6, B_TRUE, errorp);
+ connp = tcp_create_common(credp, isv6, B_TRUE, errorp);
if (connp == NULL) {
return (NULL);
}
@@ -26578,8 +20799,8 @@ tcp_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle,
connp->conn_upcalls = sock_upcalls;
connp->conn_upper_handle = sock_handle;
- ASSERT(connp->conn_tcp->tcp_recv_hiwater != 0 &&
- connp->conn_tcp->tcp_recv_hiwater == connp->conn_tcp->tcp_rwnd);
+ ASSERT(connp->conn_rcvbuf != 0 &&
+ connp->conn_rcvbuf == connp->conn_tcp->tcp_rwnd);
(*sock_upcalls->su_set_proto_props)(sock_handle, &sopp);
}
@@ -26663,7 +20884,7 @@ tcp_sendmsg(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
/*
* Squeue Flow Control
*/
- if (TCP_UNSENT_BYTES(tcp) > tcp->tcp_xmit_hiwater) {
+ if (TCP_UNSENT_BYTES(tcp) > connp->conn_sndbuf) {
tcp_setqfull(tcp);
}
mutex_exit(&tcp->tcp_non_sq_lock);
@@ -26680,12 +20901,11 @@ tcp_sendmsg(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
CONN_INC_REF(connp);
if (msg->msg_flags & MSG_OOB) {
- SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
- tcp_output_urgent, connp, tcp_squeue_flag,
- SQTAG_TCP_OUTPUT);
+ SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output_urgent,
+ connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT);
} else {
SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output,
- connp, tcp_squeue_flag, SQTAG_TCP_OUTPUT);
+ connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT);
}
return (0);
@@ -26698,9 +20918,9 @@ tcp_sendmsg(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
return (0);
}
-/* ARGSUSED */
+/* ARGSUSED2 */
void
-tcp_output_urgent(void *arg, mblk_t *mp, void *arg2)
+tcp_output_urgent(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
{
int len;
uint32_t msize;
@@ -26739,7 +20959,7 @@ tcp_output_urgent(void *arg, mblk_t *mp, void *arg2)
tcp_wput_data(tcp, mp, B_TRUE);
}
-/* ARGSUSED */
+/* ARGSUSED3 */
int
tcp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr,
socklen_t *addrlenp, cred_t *cr)
@@ -26752,24 +20972,24 @@ tcp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr,
ASSERT(cr != NULL);
ASSERT(tcp != NULL);
+ if (tcp->tcp_state < TCPS_SYN_RCVD)
+ return (ENOTCONN);
- return (tcp_do_getpeername(tcp, addr, addrlenp));
+ return (conn_getpeername(connp, addr, addrlenp));
}
-/* ARGSUSED */
+/* ARGSUSED3 */
int
tcp_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *addr,
socklen_t *addrlenp, cred_t *cr)
{
conn_t *connp = (conn_t *)proto_handle;
- tcp_t *tcp = connp->conn_tcp;
/* All Solaris components should pass a cred for this operation. */
ASSERT(cr != NULL);
ASSERT(connp->conn_upper_handle != NULL);
-
- return (tcp_do_getsockname(tcp, addr, addrlenp));
+ return (conn_getsockname(connp, addr, addrlenp));
}
/*
@@ -26809,8 +21029,8 @@ tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q,
RD(q)->q_ptr = WR(q)->q_ptr = connp;
- connp->conn_tcp->tcp_rq = connp->conn_rq = RD(q);
- connp->conn_tcp->tcp_wq = connp->conn_wq = WR(q);
+ connp->conn_rq = RD(q);
+ connp->conn_wq = WR(q);
WR(q)->q_qinfo = &tcp_sock_winit;
@@ -26830,11 +21050,11 @@ tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q,
stropt_mp->b_wptr += sizeof (struct stroptions);
stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK;
- stropt->so_wroff = tcp->tcp_hdr_len + (tcp->tcp_loopback ? 0 :
+ stropt->so_wroff = connp->conn_ht_iphc_len + (tcp->tcp_loopback ? 0 :
tcp->tcp_tcps->tcps_wroff_xtra);
if (tcp->tcp_snd_sack_ok)
stropt->so_wroff += TCPOPT_MAX_SACK_LEN;
- stropt->so_hiwat = tcp->tcp_recv_hiwater;
+ stropt->so_hiwat = connp->conn_rcvbuf;
stropt->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE);
putnext(RD(q), stropt_mp);
@@ -26845,15 +21065,17 @@ tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q,
tcp_do_capability_ack(tcp, &tca, TC1_INFO|TC1_ACCEPTOR_ID);
laddrlen = faddrlen = sizeof (sin6_t);
- (void) tcp_do_getsockname(tcp, (struct sockaddr *)&laddr, &laddrlen);
- error = tcp_do_getpeername(tcp, (struct sockaddr *)&faddr, &faddrlen);
+ (void) tcp_getsockname((sock_lower_handle_t)connp,
+ (struct sockaddr *)&laddr, &laddrlen, CRED());
+ error = tcp_getpeername((sock_lower_handle_t)connp,
+ (struct sockaddr *)&faddr, &faddrlen, CRED());
if (error != 0)
faddrlen = 0;
opts = 0;
- if (tcp->tcp_oobinline)
+ if (connp->conn_oobinline)
opts |= SO_OOBINLINE;
- if (tcp->tcp_dontroute)
+ if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE)
opts |= SO_DONTROUTE;
/*
@@ -26868,6 +21090,7 @@ tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q,
while ((mp = tcp->tcp_rcv_list) != NULL) {
tcp->tcp_rcv_list = mp->b_next;
mp->b_next = NULL;
+ /* We never do fallback for kernel RPC */
putnext(q, mp);
}
tcp->tcp_rcv_last_head = NULL;
@@ -26908,7 +21131,7 @@ tcp_fallback_eager(tcp_t *eager, boolean_t direct_sockfs)
* Sockfs guarantees that the listener will not be closed
* during fallback. So we can safely use the listener's queue.
*/
- putnext(listener->tcp_rq, mp);
+ putnext(listener->tcp_connp->conn_rq, mp);
}
int
@@ -26987,7 +21210,7 @@ tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q,
/* ARGSUSED */
static void
-tcp_shutdown_output(void *arg, mblk_t *mp, void *arg2)
+tcp_shutdown_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
{
conn_t *connp = (conn_t *)arg;
tcp_t *tcp = connp->conn_tcp;
@@ -27002,7 +21225,7 @@ tcp_shutdown_output(void *arg, mblk_t *mp, void *arg2)
* We were crossing FINs and got a reset from
* the other side. Just ignore it.
*/
- if (tcp->tcp_debug) {
+ if (connp->conn_debug) {
(void) strlog(TCP_MOD_ID, 0, 1,
SL_ERROR|SL_TRACE,
"tcp_shutdown_output() out of state %s",
@@ -27036,7 +21259,7 @@ tcp_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr)
bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL);
CONN_INC_REF(connp);
SQUEUE_ENTER_ONE(connp->conn_sqp, bp, tcp_shutdown_output,
- connp, SQ_NODRAIN, SQTAG_TCP_SHUTDOWN_OUTPUT);
+ connp, NULL, SQ_NODRAIN, SQTAG_TCP_SHUTDOWN_OUTPUT);
(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
SOCK_OPCTL_SHUT_SEND, 0);
@@ -27109,7 +21332,7 @@ tcp_do_listen(conn_t *connp, struct sockaddr *sa, socklen_t len,
*/
goto do_listen;
}
- if (tcp->tcp_debug) {
+ if (connp->conn_debug) {
(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
"tcp_listen: bad state, %d", tcp->tcp_state);
}
@@ -27121,15 +21344,14 @@ tcp_do_listen(conn_t *connp, struct sockaddr *sa, socklen_t len,
sin6_t *sin6;
ASSERT(IPCL_IS_NONSTR(connp));
-
/* Do an implicit bind: Request for a generic port. */
- if (tcp->tcp_family == AF_INET) {
+ if (connp->conn_family == AF_INET) {
len = sizeof (sin_t);
sin = (sin_t *)&addr;
*sin = sin_null;
sin->sin_family = AF_INET;
} else {
- ASSERT(tcp->tcp_family == AF_INET6);
+ ASSERT(connp->conn_family == AF_INET6);
len = sizeof (sin6_t);
sin6 = (sin6_t *)&addr;
*sin6 = sin6_null;
@@ -27171,23 +21393,42 @@ do_listen:
}
/*
- * We can call ip_bind directly, the processing continues
- * in tcp_post_ip_bind().
- *
* We need to make sure that the conn_recv is set to a non-null
* value before we insert the conn into the classifier table.
* This is to avoid a race with an incoming packet which does an
* ipcl_classify().
+ * We initially set it to tcp_input_listener_unbound to try to
+ * pick a good squeue for the listener when the first SYN arrives.
+ * tcp_input_listener_unbound sets it to tcp_input_listener on that
+ * first SYN.
*/
- connp->conn_recv = tcp_conn_request;
- if (tcp->tcp_family == AF_INET) {
- error = ip_proto_bind_laddr_v4(connp, NULL,
- IPPROTO_TCP, tcp->tcp_bound_source, tcp->tcp_lport, B_TRUE);
- } else {
- error = ip_proto_bind_laddr_v6(connp, NULL, IPPROTO_TCP,
- &tcp->tcp_bound_source_v6, tcp->tcp_lport, B_TRUE);
+ connp->conn_recv = tcp_input_listener_unbound;
+
+ /* Insert the listener in the classifier table */
+ error = ip_laddr_fanout_insert(connp);
+ if (error != 0) {
+ /* Undo the bind - release the port number */
+ tcp->tcp_state = TCPS_IDLE;
+ connp->conn_bound_addr_v6 = ipv6_all_zeros;
+
+ connp->conn_laddr_v6 = ipv6_all_zeros;
+ connp->conn_saddr_v6 = ipv6_all_zeros;
+ connp->conn_ports = 0;
+
+ if (connp->conn_anon_port) {
+ zone_t *zone;
+
+ zone = crgetzone(cr);
+ connp->conn_anon_port = B_FALSE;
+ (void) tsol_mlp_anon(zone, connp->conn_mlp_type,
+ connp->conn_proto, connp->conn_lport, B_FALSE);
+ }
+ connp->conn_mlp_type = mlptSingle;
+
+ tcp_bind_hash_remove(tcp);
+ return (error);
}
- return (tcp_post_ip_bind(tcp, NULL, error, NULL, 0));
+ return (error);
}
void
@@ -27222,7 +21463,7 @@ tcp_clr_flowctrl(sock_lower_handle_t proto_handle)
if (tcp->tcp_fused) {
tcp_fuse_backenable(tcp);
} else {
- tcp->tcp_rwnd = tcp->tcp_recv_hiwater;
+ tcp->tcp_rwnd = connp->conn_rcvbuf;
/*
* Send back a window update immediately if TCP is above
* ESTABLISHED state and the increase of the rcv window
@@ -27253,10 +21494,28 @@ tcp_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
/* All Solaris components should pass a cred for this operation. */
ASSERT(cr != NULL);
+ /*
+ * If we don't have a helper stream then create one.
+ * ip_create_helper_stream takes care of locking the conn_t,
+ * so this check for NULL is just a performance optimization.
+ */
+ if (connp->conn_helper_info == NULL) {
+ tcp_stack_t *tcps = connp->conn_tcp->tcp_tcps;
+
+ /*
+ * Create a helper stream for non-STREAMS socket.
+ */
+ error = ip_create_helper_stream(connp, tcps->tcps_ldi_ident);
+ if (error != 0) {
+ ip0dbg(("tcp_ioctl: create of IP helper stream "
+ "failed %d\n", error));
+ return (error);
+ }
+ }
+
switch (cmd) {
case ND_SET:
case ND_GET:
- case TCP_IOC_DEFAULT_Q:
case _SIOCSOCKFALLBACK:
case TCP_IOC_ABORT_CONN:
case TI_GETPEERNAME:
diff --git a/usr/src/uts/common/inet/tcp/tcp_fusion.c b/usr/src/uts/common/inet/tcp/tcp_fusion.c
index 3ee909cc4d..313b024943 100644
--- a/usr/src/uts/common/inet/tcp/tcp_fusion.c
+++ b/usr/src/uts/common/inet/tcp/tcp_fusion.c
@@ -69,50 +69,6 @@
boolean_t do_tcp_fusion = B_TRUE;
/*
- * Return true if this connection needs some IP functionality
- */
-static boolean_t
-tcp_loopback_needs_ip(tcp_t *tcp, netstack_t *ns)
-{
- ipsec_stack_t *ipss = ns->netstack_ipsec;
-
- /*
- * If ire is not cached, do not use fusion
- */
- if (tcp->tcp_connp->conn_ire_cache == NULL) {
- /*
- * There is no need to hold conn_lock here because when called
- * from tcp_fuse() there can be no window where conn_ire_cache
- * can change. This is not true when called from
- * tcp_fuse_output() as conn_ire_cache can become null just
- * after the check. It will be necessary to recheck for a NULL
- * conn_ire_cache in tcp_fuse_output() to avoid passing a
- * stale ill pointer to FW_HOOKS.
- */
- return (B_TRUE);
- }
- if (tcp->tcp_ipversion == IPV4_VERSION) {
- if (tcp->tcp_ip_hdr_len != IP_SIMPLE_HDR_LENGTH)
- return (B_TRUE);
- if (CONN_OUTBOUND_POLICY_PRESENT(tcp->tcp_connp, ipss))
- return (B_TRUE);
- if (CONN_INBOUND_POLICY_PRESENT(tcp->tcp_connp, ipss))
- return (B_TRUE);
- } else {
- if (tcp->tcp_ip_hdr_len != IPV6_HDR_LEN)
- return (B_TRUE);
- if (CONN_OUTBOUND_POLICY_PRESENT_V6(tcp->tcp_connp, ipss))
- return (B_TRUE);
- if (CONN_INBOUND_POLICY_PRESENT_V6(tcp->tcp_connp, ipss))
- return (B_TRUE);
- }
- if (!CONN_IS_LSO_MD_FASTPATH(tcp->tcp_connp))
- return (B_TRUE);
- return (B_FALSE);
-}
-
-
-/*
* This routine gets called by the eager tcp upon changing state from
* SYN_RCVD to ESTABLISHED. It fuses a direct path between itself
* and the active connect tcp such that the regular tcp processings
@@ -124,10 +80,10 @@ tcp_loopback_needs_ip(tcp_t *tcp, netstack_t *ns)
* same squeue as the one given to the active connect tcp during open.
*/
void
-tcp_fuse(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph)
+tcp_fuse(tcp_t *tcp, uchar_t *iphdr, tcpha_t *tcpha)
{
- conn_t *peer_connp, *connp = tcp->tcp_connp;
- tcp_t *peer_tcp;
+ conn_t *peer_connp, *connp = tcp->tcp_connp;
+ tcp_t *peer_tcp;
tcp_stack_t *tcps = tcp->tcp_tcps;
netstack_t *ns;
ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;
@@ -136,20 +92,16 @@ tcp_fuse(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph)
ASSERT(tcp->tcp_loopback);
ASSERT(tcp->tcp_loopback_peer == NULL);
/*
- * We need to inherit tcp_recv_hiwater of the listener tcp,
+ * We need to inherit conn_rcvbuf of the listener tcp,
* but we can't really use tcp_listener since we get here after
- * sending up T_CONN_IND and tcp_wput_accept() may be called
+ * sending up T_CONN_IND and tcp_tli_accept() may be called
* independently, at which point tcp_listener is cleared;
* this is why we use tcp_saved_listener. The listener itself
* is guaranteed to be around until tcp_accept_finish() is called
* on this eager -- this won't happen until we're done since we're
* inside the eager's perimeter now.
- *
- * We can also get called in the case were a connection needs
- * to be re-fused. In this case tcp_saved_listener will be
- * NULL but tcp_refuse will be true.
*/
- ASSERT(tcp->tcp_saved_listener != NULL || tcp->tcp_refuse);
+ ASSERT(tcp->tcp_saved_listener != NULL);
/*
* Lookup peer endpoint; search for the remote endpoint having
* the reversed address-port quadruplet in ESTABLISHED state,
@@ -157,12 +109,12 @@ tcp_fuse(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph)
* is applied accordingly for loopback address, but not for
* local address since we want fusion to happen across Zones.
*/
- if (tcp->tcp_ipversion == IPV4_VERSION) {
+ if (connp->conn_ipversion == IPV4_VERSION) {
peer_connp = ipcl_conn_tcp_lookup_reversed_ipv4(connp,
- (ipha_t *)iphdr, tcph, ipst);
+ (ipha_t *)iphdr, tcpha, ipst);
} else {
peer_connp = ipcl_conn_tcp_lookup_reversed_ipv6(connp,
- (ip6_t *)iphdr, tcph, ipst);
+ (ip6_t *)iphdr, tcpha, ipst);
}
/*
@@ -202,28 +154,20 @@ tcp_fuse(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph)
/*
* Fuse the endpoints; we perform further checks against both
* tcp endpoints to ensure that a fusion is allowed to happen.
- * In particular we bail out for non-simple TCP/IP or if IPsec/
- * IPQoS policy/kernel SSL exists. We also need to check if
- * the connection is quiescent to cover the case when we are
- * trying to re-enable fusion after IPobservability is turned off.
+ * In particular we bail out if kernel SSL exists.
*/
ns = tcps->tcps_netstack;
ipst = ns->netstack_ip;
if (!tcp->tcp_unfusable && !peer_tcp->tcp_unfusable &&
- !tcp_loopback_needs_ip(tcp, ns) &&
- !tcp_loopback_needs_ip(peer_tcp, ns) &&
- tcp->tcp_kssl_ent == NULL &&
- tcp->tcp_xmit_head == NULL && peer_tcp->tcp_xmit_head == NULL &&
- !IPP_ENABLED(IPP_LOCAL_OUT|IPP_LOCAL_IN, ipst)) {
+ (tcp->tcp_kssl_ent == NULL) && (tcp->tcp_xmit_head == NULL) &&
+ (peer_tcp->tcp_xmit_head == NULL)) {
mblk_t *mp;
- queue_t *peer_rq = peer_tcp->tcp_rq;
+ queue_t *peer_rq = peer_connp->conn_rq;
ASSERT(!TCP_IS_DETACHED(peer_tcp));
- ASSERT(tcp->tcp_fused_sigurg_mp == NULL ||
- (!IPCL_IS_NONSTR(connp) && tcp->tcp_refuse));
- ASSERT(peer_tcp->tcp_fused_sigurg_mp == NULL ||
- (!IPCL_IS_NONSTR(peer_connp) && peer_tcp->tcp_refuse));
+ ASSERT(tcp->tcp_fused_sigurg_mp == NULL);
+ ASSERT(peer_tcp->tcp_fused_sigurg_mp == NULL);
ASSERT(tcp->tcp_kssl_ctx == NULL);
/*
@@ -272,54 +216,40 @@ tcp_fuse(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph)
tcp_timers_stop(tcp);
tcp_timers_stop(peer_tcp);
- if (!tcp->tcp_refuse) {
- /*
- * Set receive buffer and max packet size for the
- * active open tcp.
- * eager's values will be set in tcp_accept_finish.
- */
-
- (void) tcp_rwnd_set(peer_tcp,
- peer_tcp->tcp_recv_hiwater);
+ /*
+ * Set receive buffer and max packet size for the
+ * active open tcp.
+ * eager's values will be set in tcp_accept_finish.
+ */
+ (void) tcp_rwnd_set(peer_tcp, peer_tcp->tcp_connp->conn_rcvbuf);
- /*
- * Set the write offset value to zero since we won't
- * be needing any room for TCP/IP headers.
- */
- if (!IPCL_IS_NONSTR(peer_tcp->tcp_connp)) {
- struct stroptions *stropt;
+ /*
+ * Set the write offset value to zero since we won't
+ * be needing any room for TCP/IP headers.
+ */
+ if (!IPCL_IS_NONSTR(peer_tcp->tcp_connp)) {
+ struct stroptions *stropt;
- DB_TYPE(mp) = M_SETOPTS;
- mp->b_wptr += sizeof (*stropt);
+ DB_TYPE(mp) = M_SETOPTS;
+ mp->b_wptr += sizeof (*stropt);
- stropt = (struct stroptions *)mp->b_rptr;
- stropt->so_flags = SO_WROFF;
- stropt->so_wroff = 0;
+ stropt = (struct stroptions *)mp->b_rptr;
+ stropt->so_flags = SO_WROFF;
+ stropt->so_wroff = 0;
- /* Send the options up */
- putnext(peer_rq, mp);
- } else {
- struct sock_proto_props sopp;
+ /* Send the options up */
+ putnext(peer_rq, mp);
+ } else {
+ struct sock_proto_props sopp;
- /* The peer is a non-STREAMS end point */
- ASSERT(IPCL_IS_TCP(peer_connp));
+ /* The peer is a non-STREAMS end point */
+ ASSERT(IPCL_IS_TCP(peer_connp));
- sopp.sopp_flags = SOCKOPT_WROFF;
- sopp.sopp_wroff = 0;
- (*peer_connp->conn_upcalls->su_set_proto_props)
- (peer_connp->conn_upper_handle, &sopp);
- }
- } else {
- /*
- * Endpoints are being re-fused, so options will not
- * be sent up. In case of STREAMS, free the stroptions
- * mblk.
- */
- if (!IPCL_IS_NONSTR(connp))
- freemsg(mp);
+ sopp.sopp_flags = SOCKOPT_WROFF;
+ sopp.sopp_wroff = 0;
+ (*peer_connp->conn_upcalls->su_set_proto_props)
+ (peer_connp->conn_upper_handle, &sopp);
}
- tcp->tcp_refuse = B_FALSE;
- peer_tcp->tcp_refuse = B_FALSE;
} else {
TCP_STAT(tcps, tcp_fusion_unqualified);
}
@@ -374,12 +304,12 @@ tcp_unfuse(tcp_t *tcp)
* when called from tcp_rcv_drain().
*/
if (!TCP_IS_DETACHED(tcp)) {
- (void) tcp_fuse_rcv_drain(tcp->tcp_rq, tcp,
+ (void) tcp_fuse_rcv_drain(tcp->tcp_connp->conn_rq, tcp,
&tcp->tcp_fused_sigurg_mp);
}
if (!TCP_IS_DETACHED(peer_tcp)) {
- (void) tcp_fuse_rcv_drain(peer_tcp->tcp_rq, peer_tcp,
- &peer_tcp->tcp_fused_sigurg_mp);
+ (void) tcp_fuse_rcv_drain(peer_tcp->tcp_connp->conn_rq,
+ peer_tcp, &peer_tcp->tcp_fused_sigurg_mp);
}
/* Lift up any flow-control conditions */
@@ -398,12 +328,12 @@ tcp_unfuse(tcp_t *tcp)
mutex_exit(&peer_tcp->tcp_non_sq_lock);
/*
- * Update th_seq and th_ack in the header template
+ * Update tha_seq and tha_ack in the header template
*/
- U32_TO_ABE32(tcp->tcp_snxt, tcp->tcp_tcph->th_seq);
- U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack);
- U32_TO_ABE32(peer_tcp->tcp_snxt, peer_tcp->tcp_tcph->th_seq);
- U32_TO_ABE32(peer_tcp->tcp_rnxt, peer_tcp->tcp_tcph->th_ack);
+ tcp->tcp_tcpha->tha_seq = htonl(tcp->tcp_snxt);
+ tcp->tcp_tcpha->tha_ack = htonl(tcp->tcp_rnxt);
+ peer_tcp->tcp_tcpha->tha_seq = htonl(peer_tcp->tcp_snxt);
+ peer_tcp->tcp_tcpha->tha_ack = htonl(peer_tcp->tcp_rnxt);
/* Unfuse the endpoints */
peer_tcp->tcp_fused = tcp->tcp_fused = B_FALSE;
@@ -509,59 +439,28 @@ tcp_fuse_output_urg(tcp_t *tcp, mblk_t *mp)
boolean_t
tcp_fuse_output(tcp_t *tcp, mblk_t *mp, uint32_t send_size)
{
- tcp_t *peer_tcp = tcp->tcp_loopback_peer;
- boolean_t flow_stopped, peer_data_queued = B_FALSE;
- boolean_t urgent = (DB_TYPE(mp) != M_DATA);
- boolean_t push = B_TRUE;
- mblk_t *mp1 = mp;
- ill_t *ilp, *olp;
- ipif_t *iifp, *oifp;
- ipha_t *ipha;
- ip6_t *ip6h;
- tcph_t *tcph;
- uint_t ip_hdr_len;
- uint32_t seq;
- uint32_t recv_size = send_size;
+ conn_t *connp = tcp->tcp_connp;
+ tcp_t *peer_tcp = tcp->tcp_loopback_peer;
+ conn_t *peer_connp = peer_tcp->tcp_connp;
+ boolean_t flow_stopped, peer_data_queued = B_FALSE;
+ boolean_t urgent = (DB_TYPE(mp) != M_DATA);
+ boolean_t push = B_TRUE;
+ mblk_t *mp1 = mp;
+ uint_t ip_hdr_len;
+ uint32_t recv_size = send_size;
tcp_stack_t *tcps = tcp->tcp_tcps;
netstack_t *ns = tcps->tcps_netstack;
ip_stack_t *ipst = ns->netstack_ip;
+ ipsec_stack_t *ipss = ns->netstack_ipsec;
+ iaflags_t ixaflags = connp->conn_ixa->ixa_flags;
+ boolean_t do_ipsec, hooks_out, hooks_in, ipobs_enabled;
ASSERT(tcp->tcp_fused);
ASSERT(peer_tcp != NULL && peer_tcp->tcp_loopback_peer == tcp);
- ASSERT(tcp->tcp_connp->conn_sqp == peer_tcp->tcp_connp->conn_sqp);
+ ASSERT(connp->conn_sqp == peer_connp->conn_sqp);
ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_PROTO ||
DB_TYPE(mp) == M_PCPROTO);
- /* If this connection requires IP, unfuse and use regular path */
- if (tcp_loopback_needs_ip(tcp, ns) ||
- tcp_loopback_needs_ip(peer_tcp, ns) ||
- IPP_ENABLED(IPP_LOCAL_OUT|IPP_LOCAL_IN, ipst) ||
- (tcp->tcp_ipversion == IPV4_VERSION &&
- ipst->ips_ip4_observe.he_interested) ||
- (tcp->tcp_ipversion == IPV6_VERSION &&
- ipst->ips_ip6_observe.he_interested)) {
- TCP_STAT(tcps, tcp_fusion_aborted);
- tcp->tcp_refuse = B_TRUE;
- peer_tcp->tcp_refuse = B_TRUE;
-
- bcopy(peer_tcp->tcp_tcph, &tcp->tcp_saved_tcph,
- sizeof (tcph_t));
- bcopy(tcp->tcp_tcph, &peer_tcp->tcp_saved_tcph,
- sizeof (tcph_t));
- if (tcp->tcp_ipversion == IPV4_VERSION) {
- bcopy(peer_tcp->tcp_ipha, &tcp->tcp_saved_ipha,
- sizeof (ipha_t));
- bcopy(tcp->tcp_ipha, &peer_tcp->tcp_saved_ipha,
- sizeof (ipha_t));
- } else {
- bcopy(peer_tcp->tcp_ip6h, &tcp->tcp_saved_ip6h,
- sizeof (ip6_t));
- bcopy(tcp->tcp_ip6h, &peer_tcp->tcp_saved_ip6h,
- sizeof (ip6_t));
- }
- goto unfuse;
- }
-
if (send_size == 0) {
freemsg(mp);
return (B_TRUE);
@@ -578,123 +477,74 @@ tcp_fuse_output(tcp_t *tcp, mblk_t *mp, uint32_t send_size)
mp1 = mp->b_cont;
}
- if (tcp->tcp_ipversion == IPV4_VERSION &&
- (HOOKS4_INTERESTED_LOOPBACK_IN(ipst) ||
- HOOKS4_INTERESTED_LOOPBACK_OUT(ipst)) ||
- tcp->tcp_ipversion == IPV6_VERSION &&
- (HOOKS6_INTERESTED_LOOPBACK_IN(ipst) ||
- HOOKS6_INTERESTED_LOOPBACK_OUT(ipst))) {
- /*
- * Build ip and tcp header to satisfy FW_HOOKS.
- * We only build it when any hook is present.
- */
+ /*
+ * Check that we are still using an IRE_LOCAL or IRE_LOOPBACK before
+ * further processes.
+ */
+ if (!ip_output_verify_local(connp->conn_ixa))
+ goto unfuse;
+
+ /*
+ * Build IP and TCP header in case we have something that needs the
+ * headers. Those cases are:
+ * 1. IPsec
+ * 2. IPobs
+ * 3. FW_HOOKS
+ *
+ * If tcp_xmit_mp() fails to dupb() the message, unfuse the connection
+ * and back to regular path.
+ */
+ if (ixaflags & IXAF_IS_IPV4) {
+ do_ipsec = (ixaflags & IXAF_IPSEC_SECURE) ||
+ CONN_INBOUND_POLICY_PRESENT(peer_connp, ipss);
+
+ hooks_out = HOOKS4_INTERESTED_LOOPBACK_OUT(ipst);
+ hooks_in = HOOKS4_INTERESTED_LOOPBACK_IN(ipst);
+ ipobs_enabled = (ipst->ips_ip4_observe.he_interested != 0);
+ } else {
+ do_ipsec = (ixaflags & IXAF_IPSEC_SECURE) ||
+ CONN_INBOUND_POLICY_PRESENT_V6(peer_connp, ipss);
+
+ hooks_out = HOOKS6_INTERESTED_LOOPBACK_OUT(ipst);
+ hooks_in = HOOKS6_INTERESTED_LOOPBACK_IN(ipst);
+ ipobs_enabled = (ipst->ips_ip6_observe.he_interested != 0);
+ }
+
+ /* We do logical 'or' for efficiency */
+ if (ipobs_enabled | do_ipsec | hooks_in | hooks_out) {
if ((mp1 = tcp_xmit_mp(tcp, mp1, tcp->tcp_mss, NULL, NULL,
tcp->tcp_snxt, B_TRUE, NULL, B_FALSE)) == NULL)
/* If tcp_xmit_mp fails, use regular path */
goto unfuse;
/*
- * The ipif and ill can be safely referenced under the
- * protection of conn_lock - see head of function comment for
- * conn_get_held_ipif(). It is necessary to check that both
- * the ipif and ill can be looked up (i.e. not condemned). If
- * not, bail out and unfuse this connection.
+ * Leave all IP relevant processes to ip_output_process_local(),
+ * which handles IPsec, IPobs, and FW_HOOKS.
*/
- mutex_enter(&peer_tcp->tcp_connp->conn_lock);
- if ((peer_tcp->tcp_connp->conn_ire_cache == NULL) ||
- (peer_tcp->tcp_connp->conn_ire_cache->ire_marks &
- IRE_MARK_CONDEMNED) ||
- ((oifp = peer_tcp->tcp_connp->conn_ire_cache->ire_ipif)
- == NULL) ||
- (!IPIF_CAN_LOOKUP(oifp)) ||
- ((olp = oifp->ipif_ill) == NULL) ||
- (ill_check_and_refhold(olp) != 0)) {
- mutex_exit(&peer_tcp->tcp_connp->conn_lock);
- goto unfuse;
- }
- mutex_exit(&peer_tcp->tcp_connp->conn_lock);
-
- /* PFHooks: LOOPBACK_OUT */
- if (tcp->tcp_ipversion == IPV4_VERSION) {
- ipha = (ipha_t *)mp1->b_rptr;
-
- DTRACE_PROBE4(ip4__loopback__out__start,
- ill_t *, NULL, ill_t *, olp,
- ipha_t *, ipha, mblk_t *, mp1);
- FW_HOOKS(ipst->ips_ip4_loopback_out_event,
- ipst->ips_ipv4firewall_loopback_out,
- NULL, olp, ipha, mp1, mp1, 0, ipst);
- DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, mp1);
- } else {
- ip6h = (ip6_t *)mp1->b_rptr;
-
- DTRACE_PROBE4(ip6__loopback__out__start,
- ill_t *, NULL, ill_t *, olp,
- ip6_t *, ip6h, mblk_t *, mp1);
- FW_HOOKS6(ipst->ips_ip6_loopback_out_event,
- ipst->ips_ipv6firewall_loopback_out,
- NULL, olp, ip6h, mp1, mp1, 0, ipst);
- DTRACE_PROBE1(ip6__loopback__out__end, mblk_t *, mp1);
- }
- ill_refrele(olp);
+ mp1 = ip_output_process_local(mp1, connp->conn_ixa, hooks_out,
+ hooks_in, do_ipsec ? peer_connp : NULL);
+ /* If the message is dropped for any reason. */
if (mp1 == NULL)
goto unfuse;
/*
- * The ipif and ill can be safely referenced under the
- * protection of conn_lock - see head of function comment for
- * conn_get_held_ipif(). It is necessary to check that both
- * the ipif and ill can be looked up (i.e. not condemned). If
- * not, bail out and unfuse this connection.
+ * Data length might have been changed by FW_HOOKS.
+ * We assume that the first mblk contains the TCP/IP headers.
*/
- mutex_enter(&tcp->tcp_connp->conn_lock);
- if ((tcp->tcp_connp->conn_ire_cache == NULL) ||
- (tcp->tcp_connp->conn_ire_cache->ire_marks &
- IRE_MARK_CONDEMNED) ||
- ((iifp = tcp->tcp_connp->conn_ire_cache->ire_ipif)
- == NULL) ||
- (!IPIF_CAN_LOOKUP(iifp)) ||
- ((ilp = iifp->ipif_ill) == NULL) ||
- (ill_check_and_refhold(ilp) != 0)) {
- mutex_exit(&tcp->tcp_connp->conn_lock);
- goto unfuse;
- }
- mutex_exit(&tcp->tcp_connp->conn_lock);
-
- /* PFHooks: LOOPBACK_IN */
- if (tcp->tcp_ipversion == IPV4_VERSION) {
- DTRACE_PROBE4(ip4__loopback__in__start,
- ill_t *, ilp, ill_t *, NULL,
- ipha_t *, ipha, mblk_t *, mp1);
- FW_HOOKS(ipst->ips_ip4_loopback_in_event,
- ipst->ips_ipv4firewall_loopback_in,
- ilp, NULL, ipha, mp1, mp1, 0, ipst);
- DTRACE_PROBE1(ip4__loopback__in__end, mblk_t *, mp1);
- ill_refrele(ilp);
- if (mp1 == NULL)
- goto unfuse;
-
- ip_hdr_len = IPH_HDR_LENGTH(ipha);
- } else {
- DTRACE_PROBE4(ip6__loopback__in__start,
- ill_t *, ilp, ill_t *, NULL,
- ip6_t *, ip6h, mblk_t *, mp1);
- FW_HOOKS6(ipst->ips_ip6_loopback_in_event,
- ipst->ips_ipv6firewall_loopback_in,
- ilp, NULL, ip6h, mp1, mp1, 0, ipst);
- DTRACE_PROBE1(ip6__loopback__in__end, mblk_t *, mp1);
- ill_refrele(ilp);
- if (mp1 == NULL)
- goto unfuse;
-
- ip_hdr_len = ip_hdr_length_v6(mp1, ip6h);
- }
+ if (hooks_in || hooks_out) {
+ tcpha_t *tcpha;
+
+ ip_hdr_len = (ixaflags & IXAF_IS_IPV4) ?
+ IPH_HDR_LENGTH((ipha_t *)mp1->b_rptr) :
+ ip_hdr_length_v6(mp1, (ip6_t *)mp1->b_rptr);
- /* Data length might be changed by FW_HOOKS */
- tcph = (tcph_t *)&mp1->b_rptr[ip_hdr_len];
- seq = ABE32_TO_U32(tcph->th_seq);
- recv_size += seq - tcp->tcp_snxt;
+ tcpha = (tcpha_t *)&mp1->b_rptr[ip_hdr_len];
+ ASSERT((uchar_t *)tcpha + sizeof (tcpha_t) <=
+ mp1->b_wptr);
+ recv_size += htonl(tcpha->tha_seq) - tcp->tcp_snxt;
+
+ }
/*
* The message duplicated by tcp_xmit_mp is freed.
@@ -712,7 +562,7 @@ tcp_fuse_output(tcp_t *tcp, mblk_t *mp, uint32_t send_size)
* detached we use tcp_rcv_enqueue() instead. Queued data will be
* drained when the accept completes (in tcp_accept_finish()).
*/
- if (IPCL_IS_NONSTR(peer_tcp->tcp_connp) &&
+ if (IPCL_IS_NONSTR(peer_connp) &&
!TCP_IS_DETACHED(peer_tcp)) {
int error;
int flags = 0;
@@ -720,18 +570,18 @@ tcp_fuse_output(tcp_t *tcp, mblk_t *mp, uint32_t send_size)
if ((tcp->tcp_valid_bits & TCP_URG_VALID) &&
(tcp->tcp_urg == tcp->tcp_snxt)) {
flags = MSG_OOB;
- (*peer_tcp->tcp_connp->conn_upcalls->su_signal_oob)
- (peer_tcp->tcp_connp->conn_upper_handle, 0);
+ (*peer_connp->conn_upcalls->su_signal_oob)
+ (peer_connp->conn_upper_handle, 0);
tcp->tcp_valid_bits &= ~TCP_URG_VALID;
}
- if ((*peer_tcp->tcp_connp->conn_upcalls->su_recv)(
- peer_tcp->tcp_connp->conn_upper_handle, mp, recv_size,
+ if ((*peer_connp->conn_upcalls->su_recv)(
+ peer_connp->conn_upper_handle, mp, recv_size,
flags, &error, &push) < 0) {
ASSERT(error != EOPNOTSUPP);
peer_data_queued = B_TRUE;
}
} else {
- if (IPCL_IS_NONSTR(peer_tcp->tcp_connp) &&
+ if (IPCL_IS_NONSTR(peer_connp) &&
(tcp->tcp_valid_bits & TCP_URG_VALID) &&
(tcp->tcp_urg == tcp->tcp_snxt)) {
/*
@@ -744,7 +594,8 @@ tcp_fuse_output(tcp_t *tcp, mblk_t *mp, uint32_t send_size)
return (B_TRUE);
}
- tcp_rcv_enqueue(peer_tcp, mp, recv_size);
+ tcp_rcv_enqueue(peer_tcp, mp, recv_size,
+ tcp->tcp_connp->conn_cred);
/* In case it wrapped around and also to keep it constant */
peer_tcp->tcp_rwnd += recv_size;
@@ -764,22 +615,21 @@ tcp_fuse_output(tcp_t *tcp, mblk_t *mp, uint32_t send_size)
mutex_enter(&tcp->tcp_non_sq_lock);
flow_stopped = tcp->tcp_flow_stopped;
if ((TCP_IS_DETACHED(peer_tcp) &&
- (peer_tcp->tcp_rcv_cnt >= peer_tcp->tcp_recv_hiwater)) ||
+ (peer_tcp->tcp_rcv_cnt >= peer_connp->conn_rcvbuf)) ||
(!TCP_IS_DETACHED(peer_tcp) &&
- !IPCL_IS_NONSTR(peer_tcp->tcp_connp) &&
- !canputnext(peer_tcp->tcp_rq))) {
+ !IPCL_IS_NONSTR(peer_connp) && !canputnext(peer_connp->conn_rq))) {
peer_data_queued = B_TRUE;
}
if (!flow_stopped && (peer_data_queued ||
- (TCP_UNSENT_BYTES(tcp) >= tcp->tcp_xmit_hiwater))) {
+ (TCP_UNSENT_BYTES(tcp) >= connp->conn_sndbuf))) {
tcp_setqfull(tcp);
flow_stopped = B_TRUE;
TCP_STAT(tcps, tcp_fusion_flowctl);
DTRACE_PROBE3(tcp__fuse__output__flowctl, tcp_t *, tcp,
uint_t, send_size, uint_t, peer_tcp->tcp_rcv_cnt);
} else if (flow_stopped && !peer_data_queued &&
- (TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater)) {
+ (TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat)) {
tcp_clrqfull(tcp);
TCP_STAT(tcps, tcp_fusion_backenabled);
flow_stopped = B_FALSE;
@@ -818,13 +668,14 @@ tcp_fuse_output(tcp_t *tcp, mblk_t *mp, uint32_t send_size)
/*
* For TLI-based streams, a thread in tcp_accept_swap()
* can race with us. That thread will ensure that the
- * correct peer_tcp->tcp_rq is globally visible before
- * peer_tcp->tcp_detached is visible as clear, but we
- * must also ensure that the load of tcp_rq cannot be
- * reordered to be before the tcp_detached check.
+ * correct peer_connp->conn_rq is globally visible
+ * before peer_tcp->tcp_detached is visible as clear,
+ * but we must also ensure that the load of conn_rq
+ * cannot be reordered to be before the tcp_detached
+ * check.
*/
membar_consumer();
- (void) tcp_fuse_rcv_drain(peer_tcp->tcp_rq, peer_tcp,
+ (void) tcp_fuse_rcv_drain(peer_connp->conn_rq, peer_tcp,
NULL);
}
}
@@ -928,11 +779,11 @@ tcp_fuse_rcv_drain(queue_t *q, tcp_t *tcp, mblk_t **sigurg_mpp)
tcp->tcp_rcv_last_head = NULL;
tcp->tcp_rcv_last_tail = NULL;
tcp->tcp_rcv_cnt = 0;
- tcp->tcp_rwnd = tcp->tcp_recv_hiwater;
+ tcp->tcp_rwnd = tcp->tcp_connp->conn_rcvbuf;
mutex_enter(&peer_tcp->tcp_non_sq_lock);
if (peer_tcp->tcp_flow_stopped && (TCP_UNSENT_BYTES(peer_tcp) <=
- peer_tcp->tcp_xmit_lowater)) {
+ peer_tcp->tcp_connp->conn_sndlowat)) {
tcp_clrqfull(peer_tcp);
TCP_STAT(tcps, tcp_fusion_backenabled);
}
@@ -964,8 +815,8 @@ tcp_fuse_set_rcv_hiwat(tcp_t *tcp, size_t rwnd)
* Record high water mark, this is used for flow-control
* purposes in tcp_fuse_output().
*/
- tcp->tcp_recv_hiwater = rwnd;
- tcp->tcp_rwnd = tcp->tcp_recv_hiwater;
+ tcp->tcp_connp->conn_rcvbuf = rwnd;
+ tcp->tcp_rwnd = rwnd;
return (rwnd);
}
@@ -976,12 +827,13 @@ int
tcp_fuse_maxpsz(tcp_t *tcp)
{
tcp_t *peer_tcp = tcp->tcp_loopback_peer;
- uint_t sndbuf = tcp->tcp_xmit_hiwater;
+ conn_t *connp = tcp->tcp_connp;
+ uint_t sndbuf = connp->conn_sndbuf;
uint_t maxpsz = sndbuf;
ASSERT(tcp->tcp_fused);
ASSERT(peer_tcp != NULL);
- ASSERT(peer_tcp->tcp_recv_hiwater != 0);
+ ASSERT(peer_tcp->tcp_connp->conn_rcvbuf != 0);
/*
* In the fused loopback case, we want the stream head to split
* up larger writes into smaller chunks for a more accurate flow-
@@ -990,8 +842,8 @@ tcp_fuse_maxpsz(tcp_t *tcp)
* We round up the buffer to system page size due to the lack of
* TCP MSS concept in Fusion.
*/
- if (maxpsz > peer_tcp->tcp_recv_hiwater)
- maxpsz = peer_tcp->tcp_recv_hiwater;
+ if (maxpsz > peer_tcp->tcp_connp->conn_rcvbuf)
+ maxpsz = peer_tcp->tcp_connp->conn_rcvbuf;
maxpsz = P2ROUNDUP_TYPED(maxpsz, PAGESIZE, uint_t) >> 1;
return (maxpsz);
@@ -1013,12 +865,12 @@ tcp_fuse_backenable(tcp_t *tcp)
peer_tcp->tcp_connp->conn_sqp);
if (tcp->tcp_rcv_list != NULL)
- (void) tcp_fuse_rcv_drain(tcp->tcp_rq, tcp, NULL);
+ (void) tcp_fuse_rcv_drain(tcp->tcp_connp->conn_rq, tcp, NULL);
mutex_enter(&peer_tcp->tcp_non_sq_lock);
if (peer_tcp->tcp_flow_stopped &&
(TCP_UNSENT_BYTES(peer_tcp) <=
- peer_tcp->tcp_xmit_lowater)) {
+ peer_tcp->tcp_connp->conn_sndlowat)) {
tcp_clrqfull(peer_tcp);
}
mutex_exit(&peer_tcp->tcp_non_sq_lock);
diff --git a/usr/src/uts/common/inet/tcp/tcp_kssl.c b/usr/src/uts/common/inet/tcp/tcp_kssl.c
index 75fa36196a..5d9051aed1 100644
--- a/usr/src/uts/common/inet/tcp/tcp_kssl.c
+++ b/usr/src/uts/common/inet/tcp/tcp_kssl.c
@@ -56,20 +56,21 @@
* For the Kernel SSL proxy
*
* Routines in this file are called on tcp's incoming path,
- * tcp_rput_data() mainly, and right before the message is
+ * tcp_input_data() mainly, and right before the message is
* to be putnext()'ed upstreams.
*/
static void tcp_kssl_input_callback(void *, mblk_t *, kssl_cmd_t);
-static void tcp_kssl_input_asynch(void *, mblk_t *, void *);
+static void tcp_kssl_input_asynch(void *, mblk_t *, void *,
+ ip_recv_attr_t *);
-extern void tcp_output(void *, mblk_t *, void *);
+extern void tcp_output(void *, mblk_t *, void *, ip_recv_attr_t *);
extern void tcp_send_conn_ind(void *, mblk_t *, void *);
extern int tcp_squeue_flag;
/*
- * tcp_rput_data() calls this routine for all packet destined to a
+ * tcp_input_data() calls this routine for all packet destined to a
* connection to the SSL port, when the SSL kernel proxy is configured
* to intercept and process those packets.
* A packet may carry multiple SSL records, so the function
@@ -84,7 +85,7 @@ extern int tcp_squeue_flag;
* which could decrement the conn/tcp reference before we get to increment it.
*/
void
-tcp_kssl_input(tcp_t *tcp, mblk_t *mp)
+tcp_kssl_input(tcp_t *tcp, mblk_t *mp, cred_t *cr)
{
struct conn_s *connp = tcp->tcp_connp;
tcp_t *listener;
@@ -97,15 +98,26 @@ tcp_kssl_input(tcp_t *tcp, mblk_t *mp)
boolean_t is_v4;
void *addr;
+ if (is_system_labeled() && mp != NULL) {
+ ASSERT(cr != NULL || msg_getcred(mp, NULL) != NULL);
+ /*
+ * Provide for protocols above TCP such as RPC. NOPID leaves
+ * db_cpid unchanged.
+ * The cred could have already been set.
+ */
+ if (cr != NULL)
+ mblk_setcred(mp, cr, NOPID);
+ }
+
/* First time here, allocate the SSL context */
if (tcp->tcp_kssl_ctx == NULL) {
ASSERT(tcp->tcp_kssl_pending);
- is_v4 = (tcp->tcp_ipversion == IPV4_VERSION);
+ is_v4 = (connp->conn_ipversion == IPV4_VERSION);
if (is_v4) {
- addr = &tcp->tcp_ipha->ipha_dst;
+ addr = &connp->conn_faddr_v4;
} else {
- addr = &tcp->tcp_ip6h->ip6_dst;
+ addr = &connp->conn_faddr_v6;
}
if (kssl_init_context(tcp->tcp_kssl_ent,
@@ -146,7 +158,7 @@ tcp_kssl_input(tcp_t *tcp, mblk_t *mp)
mutex_enter(&tcp->tcp_non_sq_lock);
tcp->tcp_squeue_bytes += msgdsize(outmp);
mutex_exit(&tcp->tcp_non_sq_lock);
- tcp_output(connp, outmp, NULL);
+ tcp_output(connp, outmp, NULL, NULL);
/* FALLTHROUGH */
case KSSL_CMD_NONE:
@@ -194,7 +206,7 @@ tcp_kssl_input(tcp_t *tcp, mblk_t *mp)
tci->PRIM_type = T_SSL_PROXY_CONN_IND;
/*
- * The code below is copied from tcp_rput_data()
+ * The code below is copied from tcp_input_data
* delivering the T_CONN_IND on a TCPS_SYN_RCVD,
* and all conn ref cnt comments apply.
*/
@@ -214,7 +226,7 @@ tcp_kssl_input(tcp_t *tcp, mblk_t *mp)
SQUEUE_ENTER_ONE(
listener->tcp_connp->conn_sqp,
ind_mp, tcp_send_conn_ind,
- listener->tcp_connp, SQ_FILL,
+ listener->tcp_connp, NULL, SQ_FILL,
SQTAG_TCP_CONN_IND);
}
}
@@ -240,11 +252,12 @@ tcp_kssl_input(tcp_t *tcp, mblk_t *mp)
if (tcp->tcp_listener != NULL) {
DTRACE_PROBE1(kssl_mblk__input_rcv_enqueue,
mblk_t *, outmp);
- tcp_rcv_enqueue(tcp, outmp, msgdsize(outmp));
+ tcp_rcv_enqueue(tcp, outmp, msgdsize(outmp),
+ NULL);
} else {
DTRACE_PROBE1(kssl_mblk__input_putnext,
mblk_t *, outmp);
- putnext(tcp->tcp_rq, outmp);
+ putnext(connp->conn_rq, outmp);
}
/*
* We're at a phase where records are sent upstreams,
@@ -283,7 +296,7 @@ no_can_do:
tci->PRIM_type = T_SSL_PROXY_CONN_IND;
/*
- * The code below is copied from tcp_rput_data()
+ * The code below is copied from tcp_input_data
* delivering the T_CONN_IND on a TCPS_SYN_RCVD,
* and all conn ref cnt comments apply.
*/
@@ -303,12 +316,12 @@ no_can_do:
SQUEUE_ENTER_ONE(
listener->tcp_connp->conn_sqp,
ind_mp, tcp_send_conn_ind,
- listener->tcp_connp,
+ listener->tcp_connp, NULL,
SQ_FILL, SQTAG_TCP_CONN_IND);
}
}
if (mp != NULL)
- tcp_rcv_enqueue(tcp, mp, msgdsize(mp));
+ tcp_rcv_enqueue(tcp, mp, msgdsize(mp), NULL);
break;
}
mp = NULL;
@@ -351,7 +364,7 @@ tcp_kssl_input_callback(void *arg, mblk_t *mp, kssl_cmd_t kssl_cmd)
}
CONN_INC_REF(connp);
SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output, connp,
- tcp_squeue_flag, SQTAG_TCP_OUTPUT);
+ NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT);
/* FALLTHROUGH */
case KSSL_CMD_NONE:
@@ -363,9 +376,9 @@ tcp_kssl_input_callback(void *arg, mblk_t *mp, kssl_cmd_t kssl_cmd)
* Keep accumulating if not yet accepted.
*/
if (tcp->tcp_listener != NULL) {
- tcp_rcv_enqueue(tcp, mp, msgdsize(mp));
+ tcp_rcv_enqueue(tcp, mp, msgdsize(mp), NULL);
} else {
- putnext(tcp->tcp_rq, mp);
+ putnext(connp->conn_rq, mp);
}
break;
@@ -383,7 +396,7 @@ tcp_kssl_input_callback(void *arg, mblk_t *mp, kssl_cmd_t kssl_cmd)
if ((sqmp = allocb(1, BPRI_MED)) != NULL) {
CONN_INC_REF(connp);
SQUEUE_ENTER_ONE(connp->conn_sqp, sqmp, tcp_kssl_input_asynch,
- connp, SQ_FILL, SQTAG_TCP_KSSL_INPUT);
+ connp, NULL, SQ_FILL, SQTAG_TCP_KSSL_INPUT);
} else {
DTRACE_PROBE(kssl_err__allocb_failed);
}
@@ -396,7 +409,7 @@ tcp_kssl_input_callback(void *arg, mblk_t *mp, kssl_cmd_t kssl_cmd)
*/
/* ARGSUSED */
void
-tcp_kssl_input_asynch(void *arg, mblk_t *mp, void *arg2)
+tcp_kssl_input_asynch(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
{
conn_t *connp = (conn_t *)arg;
tcp_t *tcp = connp->conn_tcp;
@@ -409,6 +422,6 @@ tcp_kssl_input_asynch(void *arg, mblk_t *mp, void *arg2)
* while we're away
*/
if (tcp->tcp_kssl_ctx != NULL) {
- tcp_kssl_input(tcp, NULL);
+ tcp_kssl_input(tcp, NULL, NULL);
}
}
diff --git a/usr/src/uts/common/inet/tcp/tcp_opt_data.c b/usr/src/uts/common/inet/tcp/tcp_opt_data.c
index fa2529a5ac..d15ff4ffcd 100644
--- a/usr/src/uts/common/inet/tcp/tcp_opt_data.c
+++ b/usr/src/uts/common/inet/tcp/tcp_opt_data.c
@@ -39,12 +39,7 @@
#include <netinet/tcp.h>
#include <inet/optcom.h>
-
-extern int tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr);
-extern int tcp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr);
-extern int tcp_tpi_opt_set(queue_t *q, uint_t optset_context, int level,
- int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
- void *thisdg_attrs, cred_t *cr, mblk_t *mblk);
+#include <inet/tcp_impl.h>
/*
* Table of all known options handled on a TCP protocol stack.
@@ -55,161 +50,165 @@ extern int tcp_tpi_opt_set(queue_t *q, uint_t optset_context, int level,
*/
opdes_t tcp_opt_arr[] = {
-{ SO_LINGER, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ SO_LINGER, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
sizeof (struct linger), 0 },
-{ SO_DEBUG, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
-{ SO_KEEPALIVE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
-{ SO_DONTROUTE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
-{ SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0
+{ SO_DEBUG, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SO_KEEPALIVE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SO_DONTROUTE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
},
-{ SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
-{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
-{ SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
-{ SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
-{ SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
-{ SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
-{ SO_SNDTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
+{ SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SO_SNDTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
sizeof (struct timeval), 0 },
-{ SO_RCVTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ SO_RCVTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
sizeof (struct timeval), 0 },
-{ SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0
+{ SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
},
{ SO_SND_COPYAVOID, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
-{ SO_ANON_MLP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int),
+{ SO_ANON_MLP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
0 },
-{ SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int),
+{ SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
0 },
-{ SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int),
+{ SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
0 },
-{ SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, OP_PASSNEXT, sizeof (int),
+{ SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int),
0 },
-{ SO_EXCLBIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
+{ SO_EXCLBIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
-{ SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
+{ SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
-{ SO_PROTOTYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
+{ SO_PROTOTYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
-{ TCP_NODELAY, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0
+{ TCP_NODELAY, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
},
-{ TCP_MAXSEG, IPPROTO_TCP, OA_R, OA_R, OP_NP, OP_PASSNEXT, sizeof (uint_t),
+{ TCP_MAXSEG, IPPROTO_TCP, OA_R, OA_R, OP_NP, 0, sizeof (uint_t),
536 },
{ TCP_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
- (OP_PASSNEXT|OP_DEF_FN), sizeof (int), -1 /* not initialized */ },
+ OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
{ TCP_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
- (OP_PASSNEXT|OP_DEF_FN), sizeof (int), -1 /* not initialized */ },
+ OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
{ TCP_CONN_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
- (OP_PASSNEXT|OP_DEF_FN), sizeof (int), -1 /* not initialized */ },
+ OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
{ TCP_CONN_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
- (OP_PASSNEXT|OP_DEF_FN), sizeof (int), -1 /* not initialized */ },
+ OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
-{ TCP_RECVDSTADDR, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int),
+{ TCP_RECVDSTADDR, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
0 },
-{ TCP_ANONPRIVBIND, IPPROTO_TCP, OA_R, OA_RW, OP_PRIVPORT, OP_PASSNEXT,
+{ TCP_ANONPRIVBIND, IPPROTO_TCP, OA_R, OA_RW, OP_PRIVPORT, 0,
sizeof (int), 0 },
-{ TCP_EXCLBIND, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0
+{ TCP_EXCLBIND, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
},
-{ TCP_INIT_CWND, IPPROTO_TCP, OA_RW, OA_RW, OP_CONFIG, OP_PASSNEXT,
+{ TCP_INIT_CWND, IPPROTO_TCP, OA_RW, OA_RW, OP_CONFIG, 0,
sizeof (int), 0 },
-{ TCP_KEEPALIVE_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ TCP_KEEPALIVE_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
sizeof (int), 0 },
-{ TCP_KEEPALIVE_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ TCP_KEEPALIVE_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
sizeof (int), 0 },
-{ TCP_CORK, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
+{ TCP_CORK, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP,
- (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT),
+ (OP_VARLEN|OP_NODEFAULT),
IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
{ T_IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP,
- (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT),
+ (OP_VARLEN|OP_NODEFAULT),
IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
-{ IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
-{ T_IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
-{ IP_TTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, (OP_PASSNEXT|OP_DEF_FN),
+{ IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ T_IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ IP_TTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
sizeof (int), -1 /* not initialized */ },
-{ IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, (OP_PASSNEXT|OP_NODEFAULT),
+{ IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
sizeof (ipsec_req_t), -1 /* not initialized */ },
-{ IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0,
sizeof (int), 0 /* no ifindex */ },
-{ IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, OP_PASSNEXT,
+{ IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0,
sizeof (int), 0 },
-{ IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, (OP_PASSNEXT|OP_DEF_FN),
+{ IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
sizeof (int), -1 /* not initialized */ },
-{ IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
sizeof (int), 0 /* no ifindex */ },
-{ IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, OP_PASSNEXT,
+{ IP_DONTFRAG, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+
+{ IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0,
sizeof (in_addr_t), -1 /* not initialized */ },
-{ IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, OP_PASSNEXT,
+{ IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0,
sizeof (int), 0 },
{ IPV6_PKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
- (OP_PASSNEXT|OP_NODEFAULT|OP_VARLEN),
+ (OP_NODEFAULT|OP_VARLEN),
sizeof (struct in6_pktinfo), -1 /* not initialized */ },
{ IPV6_NEXTHOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
- (OP_PASSNEXT|OP_NODEFAULT),
+ OP_NODEFAULT,
sizeof (sin6_t), -1 /* not initialized */ },
{ IPV6_HOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
- (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT), 255*8,
+ (OP_VARLEN|OP_NODEFAULT), 255*8,
-1 /* not initialized */ },
{ IPV6_DSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
- (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT), 255*8,
+ (OP_VARLEN|OP_NODEFAULT), 255*8,
-1 /* not initialized */ },
{ IPV6_RTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
- (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT), 255*8,
+ (OP_VARLEN|OP_NODEFAULT), 255*8,
-1 /* not initialized */ },
{ IPV6_RTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
- (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT), 255*8,
+ (OP_VARLEN|OP_NODEFAULT), 255*8,
-1 /* not initialized */ },
{ IPV6_TCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
- (OP_PASSNEXT|OP_NODEFAULT),
+ OP_NODEFAULT,
sizeof (int), -1 /* not initialized */ },
{ IPV6_PATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
- (OP_PASSNEXT|OP_NODEFAULT),
+ OP_NODEFAULT,
sizeof (struct ip6_mtuinfo), -1 /* not initialized */ },
-{ IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
+ sizeof (int), 0 },
+{ IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
sizeof (int), 0 },
-{ IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
sizeof (int), 0 },
/* Enable receipt of ancillary data */
-{ IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
sizeof (int), 0 },
-{ IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
sizeof (int), 0 },
-{ IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
sizeof (int), 0 },
-{ _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
sizeof (int), 0 },
-{ IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
sizeof (int), 0 },
-{ IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
sizeof (int), 0 },
-{ IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
sizeof (int), 0 },
-{ IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
sizeof (int), 0 },
-{ IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, (OP_PASSNEXT|OP_NODEFAULT),
+{ IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
sizeof (ipsec_req_t), -1 /* not initialized */ },
-{ IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
sizeof (uint32_t), IPV6_PREFER_SRC_DEFAULT },
};
@@ -247,7 +246,6 @@ optdb_obj_t tcp_opt_obj = {
tcp_opt_default, /* TCP default value function pointer */
tcp_tpi_opt_get, /* TCP get function pointer */
tcp_tpi_opt_set, /* TCP set function pointer */
- B_TRUE, /* TCP is tpi provider */
TCP_OPT_ARR_CNT, /* TCP option database count of entries */
tcp_opt_arr, /* TCP option database */
TCP_VALID_LEVELS_CNT, /* TCP valid level count of entries */
diff --git a/usr/src/uts/common/inet/tcp_impl.h b/usr/src/uts/common/inet/tcp_impl.h
index bec2b3256f..1b7c87736a 100644
--- a/usr/src/uts/common/inet/tcp_impl.h
+++ b/usr/src/uts/common/inet/tcp_impl.h
@@ -70,41 +70,6 @@ extern "C" {
}
/*
- * Before caching the conn IRE, we need to make sure certain TCP
- * states are in sync with the ire. The mismatch could occur if the
- * TCP state has been set in tcp_adapt_ire() using a different IRE,
- * e.g if an address was not present during an initial connect(),
- * tcp_adapt_ire() will set the state using the interface route.
- * Subsequently, if the address is added to the local machine, the
- * retransmitted SYN will get the correct (loopback) IRE, but the TCP
- * state (tcp_loopback and tcp_localnet) will remain out of sync.
- * This is especially an issue with TCP fusion which relies on the
- * TCP state to be accurate.
- *
- * This check/change should be made only if the TCP is not yet in
- * the established state, else it would lead to inconsistencies.
- */
-#define TCP_CHECK_IREINFO(tcp, ire) { \
- if ((tcp)->tcp_state < TCPS_ESTABLISHED) { \
- if (((ire)->ire_type & (IRE_LOOPBACK | \
- IRE_LOCAL)) && !(tcp)->tcp_loopback) { \
- (tcp)->tcp_loopback = B_TRUE; \
- } else if ((tcp)->tcp_loopback && \
- !((ire)->ire_type & (IRE_LOOPBACK | IRE_LOCAL))) { \
- (tcp)->tcp_loopback = B_FALSE; \
- } \
- if ((tcp)->tcp_ipversion == IPV4_VERSION) { \
- (tcp)->tcp_localnet = \
- ((ire)->ire_gateway_addr == 0); \
- } else { \
- (tcp)->tcp_localnet = \
- IN6_IS_ADDR_UNSPECIFIED( \
- &(ire)->ire_gateway_addr_v6); \
- } \
- } \
-}
-
-/*
* Write-side flow-control is implemented via the per instance STREAMS
* write-side Q by explicitly setting QFULL to stop the flow of mblk_t(s)
* and clearing QFULL and calling qbackenable() to restart the flow based
@@ -205,18 +170,19 @@ typedef struct tcpparam_s {
#define tcps_keepalive_abort_interval_high tcps_params[59].tcp_param_max
#define tcps_keepalive_abort_interval tcps_params[59].tcp_param_val
#define tcps_keepalive_abort_interval_low tcps_params[59].tcp_param_min
+#define tcps_dev_flow_ctl tcps_params[60].tcp_param_val
extern struct qinit tcp_rinitv4, tcp_rinitv6;
extern boolean_t do_tcp_fusion;
extern int tcp_maxpsz_set(tcp_t *, boolean_t);
extern void tcp_timers_stop(tcp_t *);
-extern void tcp_rcv_enqueue(tcp_t *, mblk_t *, uint_t);
+extern void tcp_rcv_enqueue(tcp_t *, mblk_t *, uint_t, cred_t *);
extern void tcp_push_timer(void *);
extern timeout_id_t tcp_timeout(conn_t *, void (*)(void *), clock_t);
extern clock_t tcp_timeout_cancel(conn_t *, timeout_id_t);
-extern void tcp_fuse(tcp_t *, uchar_t *, tcph_t *);
+extern void tcp_fuse(tcp_t *, uchar_t *, tcpha_t *);
extern void tcp_unfuse(tcp_t *);
extern boolean_t tcp_fuse_output(tcp_t *, mblk_t *, uint32_t);
extern void tcp_fuse_output_urg(tcp_t *, mblk_t *);
@@ -242,6 +208,11 @@ extern int tcp_fallback(sock_lower_handle_t, queue_t *, boolean_t,
extern sock_downcalls_t sock_tcp_downcalls;
+extern int tcp_opt_default(queue_t *, t_scalar_t, t_scalar_t, uchar_t *);
+extern int tcp_tpi_opt_get(queue_t *, t_scalar_t, t_scalar_t, uchar_t *);
+extern int tcp_tpi_opt_set(queue_t *, uint_t, int, int, uint_t, uchar_t *,
+ uint_t *, uchar_t *, void *, cred_t *);
+
#endif /* _KERNEL */
#ifdef __cplusplus
diff --git a/usr/src/uts/common/inet/tcp_stack.h b/usr/src/uts/common/inet/tcp_stack.h
index 2c151894eb..a254da4b43 100644
--- a/usr/src/uts/common/inet/tcp_stack.h
+++ b/usr/src/uts/common/inet/tcp_stack.h
@@ -42,9 +42,6 @@ typedef struct tcp_stat {
kstat_named_t tcp_time_wait;
kstat_named_t tcp_time_wait_syn;
kstat_named_t tcp_time_wait_syn_success;
- kstat_named_t tcp_time_wait_syn_fail;
- kstat_named_t tcp_reinput_syn;
- kstat_named_t tcp_ip_output;
kstat_named_t tcp_detach_non_time_wait;
kstat_named_t tcp_detach_time_wait;
kstat_named_t tcp_time_wait_reap;
@@ -82,37 +79,14 @@ typedef struct tcp_stat {
kstat_named_t tcp_timermp_freed;
kstat_named_t tcp_push_timer_cnt;
kstat_named_t tcp_ack_timer_cnt;
- kstat_named_t tcp_ire_null1;
- kstat_named_t tcp_ire_null;
- kstat_named_t tcp_ip_send;
- kstat_named_t tcp_ip_ire_send;
kstat_named_t tcp_wsrv_called;
kstat_named_t tcp_flwctl_on;
kstat_named_t tcp_timer_fire_early;
kstat_named_t tcp_timer_fire_miss;
kstat_named_t tcp_rput_v6_error;
- kstat_named_t tcp_out_sw_cksum;
- kstat_named_t tcp_out_sw_cksum_bytes;
kstat_named_t tcp_zcopy_on;
kstat_named_t tcp_zcopy_off;
kstat_named_t tcp_zcopy_backoff;
- kstat_named_t tcp_zcopy_disable;
- kstat_named_t tcp_mdt_pkt_out;
- kstat_named_t tcp_mdt_pkt_out_v4;
- kstat_named_t tcp_mdt_pkt_out_v6;
- kstat_named_t tcp_mdt_discarded;
- kstat_named_t tcp_mdt_conn_halted1;
- kstat_named_t tcp_mdt_conn_halted2;
- kstat_named_t tcp_mdt_conn_halted3;
- kstat_named_t tcp_mdt_conn_resumed1;
- kstat_named_t tcp_mdt_conn_resumed2;
- kstat_named_t tcp_mdt_legacy_small;
- kstat_named_t tcp_mdt_legacy_all;
- kstat_named_t tcp_mdt_legacy_ret;
- kstat_named_t tcp_mdt_allocfail;
- kstat_named_t tcp_mdt_addpdescfail;
- kstat_named_t tcp_mdt_allocd;
- kstat_named_t tcp_mdt_linked;
kstat_named_t tcp_fusion_flowctl;
kstat_named_t tcp_fusion_backenabled;
kstat_named_t tcp_fusion_urg;
@@ -154,15 +128,6 @@ struct tcp_stack {
mib2_tcp_t tcps_mib;
- /* Protected by tcps_g_q_lock */
- queue_t *tcps_g_q; /* Default queue */
- uint_t tcps_refcnt; /* Total number of tcp_t's */
- kmutex_t tcps_g_q_lock;
- kcondvar_t tcps_g_q_cv;
- kthread_t *tcps_g_q_creator;
- struct __ldi_handle *tcps_g_q_lh;
- cred_t *tcps_g_q_cr; /* For _inactive close call */
-
/*
* Extra privileged ports. In host byte order.
* Protected by tcp_epriv_port_lock.
@@ -182,9 +147,6 @@ struct tcp_stack {
caddr_t tcps_g_nd;
struct tcpparam_s *tcps_params; /* ndd parameters */
struct tcpparam_s *tcps_wroff_xtra_param;
- struct tcpparam_s *tcps_mdt_head_param;
- struct tcpparam_s *tcps_mdt_tail_param;
- struct tcpparam_s *tcps_mdt_max_pbufs_param;
/* Hint not protected by any lock */
uint_t tcps_next_port_to_try;
@@ -222,6 +184,11 @@ struct tcp_stack {
/* The number of RST not sent because of the rate limit. */
uint32_t tcps_rst_unsent;
ldi_ident_t tcps_ldi_ident;
+
+ /* Used to synchronize access when reclaiming memory */
+ mblk_t *tcps_ixa_cleanup_mp;
+ kmutex_t tcps_ixa_cleanup_lock;
+ kcondvar_t tcps_ixa_cleanup_cv;
};
typedef struct tcp_stack tcp_stack_t;
diff --git a/usr/src/uts/common/inet/udp/udp.c b/usr/src/uts/common/inet/udp/udp.c
index d0bab511b0..e18fc57f40 100644
--- a/usr/src/uts/common/inet/udp/udp.c
+++ b/usr/src/uts/common/inet/udp/udp.c
@@ -26,12 +26,9 @@
#include <sys/types.h>
#include <sys/stream.h>
-#include <sys/dlpi.h>
-#include <sys/pattr.h>
#include <sys/stropts.h>
#include <sys/strlog.h>
#include <sys/strsun.h>
-#include <sys/time.h>
#define _SUN_TPI_VERSION 2
#include <sys/tihdr.h>
#include <sys/timod.h>
@@ -41,7 +38,9 @@
#include <sys/suntpi.h>
#include <sys/xti_inet.h>
#include <sys/kmem.h>
+#include <sys/cred_impl.h>
#include <sys/policy.h>
+#include <sys/priv.h>
#include <sys/ucred.h>
#include <sys/zone.h>
@@ -57,12 +56,11 @@
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet/udp.h>
-#include <net/if.h>
-#include <net/route.h>
#include <inet/common.h>
#include <inet/ip.h>
#include <inet/ip_impl.h>
+#include <inet/ipsec_impl.h>
#include <inet/ip6.h>
#include <inet/ip_ire.h>
#include <inet/ip_if.h>
@@ -74,34 +72,25 @@
#include <inet/optcom.h>
#include <inet/snmpcom.h>
#include <inet/kstatcom.h>
-#include <inet/udp_impl.h>
#include <inet/ipclassifier.h>
-#include <inet/ipsec_impl.h>
-#include <inet/ipp_common.h>
#include <sys/squeue_impl.h>
#include <inet/ipnet.h>
#include <sys/ethernet.h>
-/*
- * The ipsec_info.h header file is here since it has the definition for the
- * M_CTL message types used by IP to convey information to the ULP. The
- * ipsec_info.h needs the pfkeyv2.h, hence the latter's presence.
- */
-#include <net/pfkeyv2.h>
-#include <inet/ipsec_info.h>
-
#include <sys/tsol/label.h>
#include <sys/tsol/tnet.h>
#include <rpc/pmap_prot.h>
+#include <inet/udp_impl.h>
+
/*
* Synchronization notes:
*
* UDP is MT and uses the usual kernel synchronization primitives. There are 2
- * locks, the fanout lock (uf_lock) and the udp endpoint lock udp_rwlock.
- * We also use conn_lock when updating things that affect the IP classifier
- * lookup.
- * The lock order is udp_rwlock -> uf_lock and is udp_rwlock -> conn_lock.
+ * locks, the fanout lock (uf_lock) and conn_lock. conn_lock
+ * protects the contents of the udp_t. uf_lock protects the address and the
+ * fanout information.
+ * The lock order is conn_lock -> uf_lock.
*
* The fanout lock uf_lock:
* When a UDP endpoint is bound to a local port, it is inserted into
@@ -114,11 +103,6 @@
* from the bind hash list only when it is being unbound or being closed.
* The per bucket lock also protects a UDP endpoint's state changes.
*
- * The udp_rwlock:
- * This protects most of the other fields in the udp_t. The exact list of
- * fields which are protected by each of the above locks is documented in
- * the udp_t structure definition.
- *
* Plumbing notes:
* UDP is always a device driver. For compatibility with mibopen() code
* it is possible to I_PUSH "udp", but that results in pushing a passthrough
@@ -133,41 +117,32 @@
/* For /etc/system control */
uint_t udp_bind_fanout_size = UDP_BIND_FANOUT_SIZE;
-/* Option processing attrs */
-typedef struct udpattrs_s {
- union {
- ip6_pkt_t *udpattr_ipp6; /* For V6 */
- ip4_pkt_t *udpattr_ipp4; /* For V4 */
- } udpattr_ippu;
-#define udpattr_ipp6 udpattr_ippu.udpattr_ipp6
-#define udpattr_ipp4 udpattr_ippu.udpattr_ipp4
- mblk_t *udpattr_mb;
- boolean_t udpattr_credset;
-} udpattrs_t;
-
static void udp_addr_req(queue_t *q, mblk_t *mp);
static void udp_tpi_bind(queue_t *q, mblk_t *mp);
static void udp_bind_hash_insert(udp_fanout_t *uf, udp_t *udp);
static void udp_bind_hash_remove(udp_t *udp, boolean_t caller_holds_lock);
-static int udp_build_hdrs(udp_t *udp);
+static int udp_build_hdr_template(conn_t *, const in6_addr_t *,
+ const in6_addr_t *, in_port_t, uint32_t);
static void udp_capability_req(queue_t *q, mblk_t *mp);
static int udp_tpi_close(queue_t *q, int flags);
+static void udp_close_free(conn_t *);
static void udp_tpi_connect(queue_t *q, mblk_t *mp);
static void udp_tpi_disconnect(queue_t *q, mblk_t *mp);
static void udp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error,
- int sys_error);
-static void udp_err_ack_prim(queue_t *q, mblk_t *mp, int primitive,
- t_scalar_t tlierr, int unixerr);
+ int sys_error);
+static void udp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive,
+ t_scalar_t tlierr, int sys_error);
static int udp_extra_priv_ports_get(queue_t *q, mblk_t *mp, caddr_t cp,
cred_t *cr);
static int udp_extra_priv_ports_add(queue_t *q, mblk_t *mp,
char *value, caddr_t cp, cred_t *cr);
static int udp_extra_priv_ports_del(queue_t *q, mblk_t *mp,
char *value, caddr_t cp, cred_t *cr);
-static void udp_icmp_error(conn_t *, mblk_t *);
-static void udp_icmp_error_ipv6(conn_t *, mblk_t *);
+static void udp_icmp_input(void *, mblk_t *, void *, ip_recv_attr_t *);
+static void udp_icmp_error_ipv6(conn_t *connp, mblk_t *mp,
+ ip_recv_attr_t *ira);
static void udp_info_req(queue_t *q, mblk_t *mp);
-static void udp_input(void *, mblk_t *, void *);
+static void udp_input(void *, mblk_t *, void *, ip_recv_attr_t *);
static void udp_lrput(queue_t *, mblk_t *);
static void udp_lwput(queue_t *, mblk_t *);
static int udp_open(queue_t *q, dev_t *devp, int flag, int sflag,
@@ -176,24 +151,34 @@ static int udp_openv4(queue_t *q, dev_t *devp, int flag, int sflag,
cred_t *credp);
static int udp_openv6(queue_t *q, dev_t *devp, int flag, int sflag,
cred_t *credp);
-static int udp_unitdata_opt_process(queue_t *q, mblk_t *mp,
- int *errorp, udpattrs_t *udpattrs);
static boolean_t udp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name);
+int udp_opt_set(conn_t *connp, uint_t optset_context,
+ int level, int name, uint_t inlen,
+ uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
+ void *thisdg_attrs, cred_t *cr);
+int udp_opt_get(conn_t *connp, int level, int name,
+ uchar_t *ptr);
+static int udp_output_connected(conn_t *connp, mblk_t *mp, cred_t *cr,
+ pid_t pid);
+static int udp_output_lastdst(conn_t *connp, mblk_t *mp, cred_t *cr,
+ pid_t pid, ip_xmit_attr_t *ixa);
+static int udp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin,
+ sin6_t *sin6, ushort_t ipversion, cred_t *cr, pid_t,
+ ip_xmit_attr_t *ixa);
static int udp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr);
static boolean_t udp_param_register(IDP *ndp, udpparam_t *udppa, int cnt);
static int udp_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
cred_t *cr);
-static void udp_send_data(udp_t *udp, queue_t *q, mblk_t *mp,
- ipha_t *ipha);
-static void udp_ud_err(queue_t *q, mblk_t *mp, uchar_t *destaddr,
- t_scalar_t destlen, t_scalar_t err);
+static mblk_t *udp_prepend_hdr(conn_t *, ip_xmit_attr_t *, const ip_pkt_t *,
+ const in6_addr_t *, const in6_addr_t *, in_port_t, uint32_t, mblk_t *,
+ int *);
+static mblk_t *udp_prepend_header_template(conn_t *, ip_xmit_attr_t *,
+ mblk_t *, const in6_addr_t *, in_port_t, uint32_t, int *);
+static void udp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err);
+static void udp_ud_err_connected(conn_t *, t_scalar_t);
static void udp_tpi_unbind(queue_t *q, mblk_t *mp);
static in_port_t udp_update_next_port(udp_t *udp, in_port_t port,
boolean_t random);
-static mblk_t *udp_output_v4(conn_t *, mblk_t *, ipaddr_t, uint16_t, uint_t,
- int *, boolean_t, struct nmsghdr *, cred_t *, pid_t);
-static mblk_t *udp_output_v6(conn_t *connp, mblk_t *mp, sin6_t *sin6,
- int *error, struct nmsghdr *msg, cred_t *cr, pid_t pid);
static void udp_wput_other(queue_t *q, mblk_t *mp);
static void udp_wput_iocdata(queue_t *q, mblk_t *mp);
static void udp_wput_fallback(queue_t *q, mblk_t *mp);
@@ -208,11 +193,9 @@ static void *udp_kstat2_init(netstackid_t, udp_stat_t *);
static void udp_kstat2_fini(netstackid_t, kstat_t *);
static int udp_kstat_update(kstat_t *kp, int rw);
-static void udp_xmit(queue_t *, mblk_t *, ire_t *ire, conn_t *, zoneid_t);
-static int udp_send_connected(conn_t *, mblk_t *, struct nmsghdr *,
- cred_t *, pid_t);
-static void udp_ulp_recv(conn_t *, mblk_t *);
+/* Common routines for TPI and socket module */
+static void udp_ulp_recv(conn_t *, mblk_t *, uint_t, ip_recv_attr_t *);
/* Common routine for TPI and socket module */
static conn_t *udp_do_open(cred_t *, boolean_t, int);
@@ -220,30 +203,20 @@ static void udp_do_close(conn_t *);
static int udp_do_bind(conn_t *, struct sockaddr *, socklen_t, cred_t *,
boolean_t);
static int udp_do_unbind(conn_t *);
-static int udp_do_getsockname(udp_t *, struct sockaddr *, uint_t *);
-static int udp_do_getpeername(udp_t *, struct sockaddr *, uint_t *);
int udp_getsockname(sock_lower_handle_t,
struct sockaddr *, socklen_t *, cred_t *);
int udp_getpeername(sock_lower_handle_t,
struct sockaddr *, socklen_t *, cred_t *);
static int udp_do_connect(conn_t *, const struct sockaddr *, socklen_t,
- cred_t *cr);
-static int udp_post_ip_bind_connect(udp_t *, mblk_t *, int);
+ cred_t *, pid_t);
#define UDP_RECV_HIWATER (56 * 1024)
#define UDP_RECV_LOWATER 128
#define UDP_XMIT_HIWATER (56 * 1024)
#define UDP_XMIT_LOWATER 1024
-/*
- * The following is defined in tcp.c
- */
-extern int (*cl_inet_connect2)(netstackid_t stack_id,
- uint8_t protocol, boolean_t is_outgoing,
- sa_family_t addr_family,
- uint8_t *laddrp, in_port_t lport,
- uint8_t *faddrp, in_port_t fport, void *args);
+#pragma inline(udp_output_connected, udp_output_newdst, udp_output_lastdst)
/*
* Checks if the given destination addr/port is allowed out.
@@ -251,7 +224,7 @@ extern int (*cl_inet_connect2)(netstackid_t stack_id,
* Called for each connect() and for sendto()/sendmsg() to a different
* destination.
* For connect(), called in udp_connect().
- * For sendto()/sendmsg(), called in udp_output_v{4,6}().
+ * For sendto()/sendmsg(), called in udp_output_newdst().
*
* This macro assumes that the cl_inet_connect2 hook is not NULL.
* Please check this before calling this macro.
@@ -260,25 +233,26 @@ extern int (*cl_inet_connect2)(netstackid_t stack_id,
* CL_INET_UDP_CONNECT(conn_t cp, udp_t *udp, boolean_t is_outgoing,
* in6_addr_t *faddrp, in_port_t (or uint16_t) fport, int err);
*/
-#define CL_INET_UDP_CONNECT(cp, udp, is_outgoing, faddrp, fport, err) { \
+#define CL_INET_UDP_CONNECT(cp, is_outgoing, faddrp, fport, err) { \
(err) = 0; \
/* \
* Running in cluster mode - check and register active \
* "connection" information \
*/ \
- if ((udp)->udp_ipversion == IPV4_VERSION) \
+ if ((cp)->conn_ipversion == IPV4_VERSION) \
(err) = (*cl_inet_connect2)( \
(cp)->conn_netstack->netstack_stackid, \
IPPROTO_UDP, is_outgoing, AF_INET, \
- (uint8_t *)&((udp)->udp_v6src._S6_un._S6_u32[3]), \
- (udp)->udp_port, \
- (uint8_t *)&((faddrp)->_S6_un._S6_u32[3]), \
+ (uint8_t *)&((cp)->conn_laddr_v4), \
+ (cp)->conn_lport, \
+ (uint8_t *)&(V4_PART_OF_V6(*faddrp)), \
(in_port_t)(fport), NULL); \
else \
(err) = (*cl_inet_connect2)( \
(cp)->conn_netstack->netstack_stackid, \
IPPROTO_UDP, is_outgoing, AF_INET6, \
- (uint8_t *)&((udp)->udp_v6src), (udp)->udp_port, \
+ (uint8_t *)&((cp)->conn_laddr_v6), \
+ (cp)->conn_lport, \
(uint8_t *)(faddrp), (in_port_t)(fport), NULL); \
}
@@ -387,6 +361,8 @@ udpparam_t udp_param_arr[] = {
{ 0, (1<<30), UDP_XMIT_LOWATER, "udp_xmit_lowat"},
{ UDP_RECV_LOWATER, (1<<30), UDP_RECV_HIWATER, "udp_recv_hiwat"},
{ 65536, (1<<30), 2*1024*1024, "udp_max_buf"},
+ { 0, 1, 0, "udp_pmtu_discovery" },
+ { 0, 1, 0, "udp_sendto_ignerr" },
};
/* END CSTYLED */
@@ -451,9 +427,10 @@ retry:
static void
udp_bind_hash_remove(udp_t *udp, boolean_t caller_holds_lock)
{
- udp_t *udpnext;
- kmutex_t *lockp;
- udp_stack_t *us = udp->udp_us;
+ udp_t *udpnext;
+ kmutex_t *lockp;
+ udp_stack_t *us = udp->udp_us;
+ conn_t *connp = udp->udp_connp;
if (udp->udp_ptpbhn == NULL)
return;
@@ -462,9 +439,9 @@ udp_bind_hash_remove(udp_t *udp, boolean_t caller_holds_lock)
* Extract the lock pointer in case there are concurrent
* hash_remove's for this instance.
*/
- ASSERT(udp->udp_port != 0);
+ ASSERT(connp->conn_lport != 0);
if (!caller_holds_lock) {
- lockp = &us->us_bind_fanout[UDP_BIND_HASH(udp->udp_port,
+ lockp = &us->us_bind_fanout[UDP_BIND_HASH(connp->conn_lport,
us->us_bind_fanout_size)].uf_lock;
ASSERT(lockp != NULL);
mutex_enter(lockp);
@@ -486,8 +463,10 @@ udp_bind_hash_remove(udp_t *udp, boolean_t caller_holds_lock)
static void
udp_bind_hash_insert(udp_fanout_t *uf, udp_t *udp)
{
+ conn_t *connp = udp->udp_connp;
udp_t **udpp;
udp_t *udpnext;
+ conn_t *connext;
ASSERT(MUTEX_HELD(&uf->uf_lock));
ASSERT(udp->udp_ptpbhn == NULL);
@@ -503,11 +482,11 @@ udp_bind_hash_insert(udp_fanout_t *uf, udp_t *udp)
* specific address get preference over those binding to
* INADDR_ANY.
*/
- if (V6_OR_V4_INADDR_ANY(udp->udp_bound_v6src) &&
- !V6_OR_V4_INADDR_ANY(udpnext->udp_bound_v6src)) {
+ connext = udpnext->udp_connp;
+ if (V6_OR_V4_INADDR_ANY(connp->conn_bound_addr_v6) &&
+ !V6_OR_V4_INADDR_ANY(connext->conn_bound_addr_v6)) {
while ((udpnext = udpp[0]) != NULL &&
- !V6_OR_V4_INADDR_ANY(
- udpnext->udp_bound_v6src)) {
+ !V6_OR_V4_INADDR_ANY(connext->conn_bound_addr_v6)) {
udpp = &(udpnext->udp_bind_hash);
}
if (udpnext != NULL)
@@ -525,10 +504,9 @@ udp_bind_hash_insert(udp_fanout_t *uf, udp_t *udp)
* This routine is called to handle each O_T_BIND_REQ/T_BIND_REQ message
* passed to udp_wput.
* It associates a port number and local address with the stream.
- * The O_T_BIND_REQ/T_BIND_REQ is passed downstream to ip with the UDP
- * protocol type (IPPROTO_UDP) placed in the message following the address.
- * A T_BIND_ACK message is passed upstream when ip acknowledges the request.
- * (Called as writer.)
+ * It calls IP to verify the local IP address, and calls IP to insert
+ * the conn_t in the fanout table.
+ * If everything is ok it then sends the T_BIND_ACK back up.
*
* Note that UDP over IPv4 and IPv6 sockets can use the same port number
* without setting SO_REUSEADDR. This is needed so that they
@@ -580,10 +558,10 @@ udp_tpi_bind(queue_t *q, mblk_t *mp)
}
/*
* Reallocate the message to make sure we have enough room for an
- * address and the protocol type.
+ * address.
*/
- mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t) + 1, 1);
- if (!mp1) {
+ mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1);
+ if (mp1 == NULL) {
udp_err_ack(q, mp, TSYSERR, ENOMEM);
return;
}
@@ -597,7 +575,7 @@ udp_tpi_bind(queue_t *q, mblk_t *mp)
switch (tbr->ADDR_length) {
case 0: /* Request for a generic port */
tbr->ADDR_offset = sizeof (struct T_bind_req);
- if (udp->udp_family == AF_INET) {
+ if (connp->conn_family == AF_INET) {
tbr->ADDR_length = sizeof (sin_t);
sin = (sin_t *)&tbr[1];
*sin = sin_null;
@@ -605,7 +583,7 @@ udp_tpi_bind(queue_t *q, mblk_t *mp)
mp->b_wptr = (uchar_t *)&sin[1];
sa = (struct sockaddr *)sin;
} else {
- ASSERT(udp->udp_family == AF_INET6);
+ ASSERT(connp->conn_family == AF_INET6);
tbr->ADDR_length = sizeof (sin6_t);
sin6 = (sin6_t *)&tbr[1];
*sin6 = sin6_null;
@@ -622,7 +600,7 @@ udp_tpi_bind(queue_t *q, mblk_t *mp)
udp_err_ack(q, mp, TSYSERR, EINVAL);
return;
}
- if (udp->udp_family != AF_INET ||
+ if (connp->conn_family != AF_INET ||
sa->sa_family != AF_INET) {
udp_err_ack(q, mp, TSYSERR, EAFNOSUPPORT);
return;
@@ -636,7 +614,7 @@ udp_tpi_bind(queue_t *q, mblk_t *mp)
udp_err_ack(q, mp, TSYSERR, EINVAL);
return;
}
- if (udp->udp_family != AF_INET6 ||
+ if (connp->conn_family != AF_INET6 ||
sa->sa_family != AF_INET6) {
udp_err_ack(q, mp, TSYSERR, EAFNOSUPPORT);
return;
@@ -669,29 +647,21 @@ udp_tpi_bind(queue_t *q, mblk_t *mp)
* This routine handles each T_CONN_REQ message passed to udp. It
* associates a default destination address with the stream.
*
- * This routine sends down a T_BIND_REQ to IP with the following mblks:
- * T_BIND_REQ - specifying local and remote address/port
- * IRE_DB_REQ_TYPE - to get an IRE back containing ire_type and src
- * T_OK_ACK - for the T_CONN_REQ
- * T_CONN_CON - to keep the TPI user happy
- *
- * The connect completes in udp_do_connect.
- * When a T_BIND_ACK is received information is extracted from the IRE
- * and the two appended messages are sent to the TPI user.
- * Should udp_bind_result receive T_ERROR_ACK for the T_BIND_REQ it will
- * convert it to an error ack for the appropriate primitive.
+ * After various error checks are completed, udp_connect() lays
+ * the target address and port into the composite header template.
+ * Then we ask IP for information, including a source address if we didn't
+ * already have one. Finally we send up the T_OK_ACK reply message.
*/
static void
udp_tpi_connect(queue_t *q, mblk_t *mp)
{
- udp_t *udp;
conn_t *connp = Q_TO_CONN(q);
int error;
socklen_t len;
struct sockaddr *sa;
struct T_conn_req *tcr;
cred_t *cr;
-
+ pid_t pid;
/*
* All Solaris components should pass a db_credp
* for this TPI message, hence we ASSERT.
@@ -699,14 +669,13 @@ udp_tpi_connect(queue_t *q, mblk_t *mp)
* like a TPI message sent by some other kernel
* component, we check and return an error.
*/
- cr = msg_getcred(mp, NULL);
+ cr = msg_getcred(mp, &pid);
ASSERT(cr != NULL);
if (cr == NULL) {
udp_err_ack(q, mp, TSYSERR, EINVAL);
return;
}
- udp = connp->conn_udp;
tcr = (struct T_conn_req *)mp->b_rptr;
/* A bit of sanity checking */
@@ -724,7 +693,7 @@ udp_tpi_connect(queue_t *q, mblk_t *mp)
* Determine packet type based on type of address passed in
* the request should contain an IPv4 or IPv6 address.
* Make sure that address family matches the type of
- * family of the the address passed down
+ * family of the address passed down.
*/
len = tcr->DEST_length;
switch (tcr->DEST_length) {
@@ -743,13 +712,13 @@ udp_tpi_connect(queue_t *q, mblk_t *mp)
break;
}
- error = proto_verify_ip_addr(udp->udp_family, sa, len);
+ error = proto_verify_ip_addr(connp->conn_family, sa, len);
if (error != 0) {
udp_err_ack(q, mp, TSYSERR, error);
return;
}
- error = udp_do_connect(connp, sa, len, cr);
+ error = udp_do_connect(connp, sa, len, cr, pid);
if (error != 0) {
if (error < 0)
udp_err_ack(q, mp, -error, 0);
@@ -761,7 +730,7 @@ udp_tpi_connect(queue_t *q, mblk_t *mp)
* We have to send a connection confirmation to
* keep TLI happy.
*/
- if (udp->udp_family == AF_INET) {
+ if (connp->conn_family == AF_INET) {
mp1 = mi_tpi_conn_con(NULL, (char *)sa,
sizeof (sin_t), NULL, 0);
} else {
@@ -810,72 +779,14 @@ done:
return (0);
}
-/*
- * Called in the close path to quiesce the conn
- */
-void
-udp_quiesce_conn(conn_t *connp)
-{
- udp_t *udp = connp->conn_udp;
-
- if (cl_inet_unbind != NULL && udp->udp_state == TS_IDLE) {
- /*
- * Running in cluster mode - register unbind information
- */
- if (udp->udp_ipversion == IPV4_VERSION) {
- (*cl_inet_unbind)(
- connp->conn_netstack->netstack_stackid,
- IPPROTO_UDP, AF_INET,
- (uint8_t *)(&(V4_PART_OF_V6(udp->udp_v6src))),
- (in_port_t)udp->udp_port, NULL);
- } else {
- (*cl_inet_unbind)(
- connp->conn_netstack->netstack_stackid,
- IPPROTO_UDP, AF_INET6,
- (uint8_t *)(&(udp->udp_v6src)),
- (in_port_t)udp->udp_port, NULL);
- }
- }
-
- udp_bind_hash_remove(udp, B_FALSE);
-
-}
-
-void
+static void
udp_close_free(conn_t *connp)
{
udp_t *udp = connp->conn_udp;
/* If there are any options associated with the stream, free them. */
- if (udp->udp_ip_snd_options != NULL) {
- mi_free((char *)udp->udp_ip_snd_options);
- udp->udp_ip_snd_options = NULL;
- udp->udp_ip_snd_options_len = 0;
- }
-
- if (udp->udp_ip_rcv_options != NULL) {
- mi_free((char *)udp->udp_ip_rcv_options);
- udp->udp_ip_rcv_options = NULL;
- udp->udp_ip_rcv_options_len = 0;
- }
-
- /* Free memory associated with sticky options */
- if (udp->udp_sticky_hdrs_len != 0) {
- kmem_free(udp->udp_sticky_hdrs,
- udp->udp_sticky_hdrs_len);
- udp->udp_sticky_hdrs = NULL;
- udp->udp_sticky_hdrs_len = 0;
- }
- if (udp->udp_last_cred != NULL) {
- crfree(udp->udp_last_cred);
- udp->udp_last_cred = NULL;
- }
- if (udp->udp_effective_cred != NULL) {
- crfree(udp->udp_effective_cred);
- udp->udp_effective_cred = NULL;
- }
-
- ip6_pkt_free(&udp->udp_sticky_ipp);
+ if (udp->udp_recv_ipp.ipp_fields != 0)
+ ip_pkt_free(&udp->udp_recv_ipp);
/*
* Clear any fields which the kmem_cache constructor clears.
@@ -892,59 +803,48 @@ static int
udp_do_disconnect(conn_t *connp)
{
udp_t *udp;
- mblk_t *ire_mp;
udp_fanout_t *udpf;
udp_stack_t *us;
int error;
udp = connp->conn_udp;
us = udp->udp_us;
- rw_enter(&udp->udp_rwlock, RW_WRITER);
- if (udp->udp_state != TS_DATA_XFER || udp->udp_pending_op != -1) {
- rw_exit(&udp->udp_rwlock);
+ mutex_enter(&connp->conn_lock);
+ if (udp->udp_state != TS_DATA_XFER) {
+ mutex_exit(&connp->conn_lock);
return (-TOUTSTATE);
}
- udp->udp_pending_op = T_DISCON_REQ;
- udpf = &us->us_bind_fanout[UDP_BIND_HASH(udp->udp_port,
+ udpf = &us->us_bind_fanout[UDP_BIND_HASH(connp->conn_lport,
us->us_bind_fanout_size)];
mutex_enter(&udpf->uf_lock);
- udp->udp_v6src = udp->udp_bound_v6src;
+ if (connp->conn_mcbc_bind)
+ connp->conn_saddr_v6 = ipv6_all_zeros;
+ else
+ connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
+ connp->conn_laddr_v6 = connp->conn_bound_addr_v6;
+ connp->conn_faddr_v6 = ipv6_all_zeros;
+ connp->conn_fport = 0;
udp->udp_state = TS_IDLE;
mutex_exit(&udpf->uf_lock);
- if (udp->udp_family == AF_INET6) {
- /* Rebuild the header template */
- error = udp_build_hdrs(udp);
- if (error != 0) {
- udp->udp_pending_op = -1;
- rw_exit(&udp->udp_rwlock);
- return (error);
- }
- }
+ /* Remove any remnants of mapped address binding */
+ if (connp->conn_family == AF_INET6)
+ connp->conn_ipversion = IPV6_VERSION;
- ire_mp = allocb(sizeof (ire_t), BPRI_HI);
- if (ire_mp == NULL) {
- mutex_enter(&udpf->uf_lock);
- udp->udp_pending_op = -1;
- mutex_exit(&udpf->uf_lock);
- rw_exit(&udp->udp_rwlock);
- return (ENOMEM);
- }
-
- rw_exit(&udp->udp_rwlock);
-
- if (udp->udp_family == AF_INET6) {
- error = ip_proto_bind_laddr_v6(connp, &ire_mp, IPPROTO_UDP,
- &udp->udp_bound_v6src, udp->udp_port, B_TRUE);
- } else {
- error = ip_proto_bind_laddr_v4(connp, &ire_mp, IPPROTO_UDP,
- V4_PART_OF_V6(udp->udp_bound_v6src), udp->udp_port, B_TRUE);
- }
+ connp->conn_v6lastdst = ipv6_all_zeros;
+ error = udp_build_hdr_template(connp, &connp->conn_saddr_v6,
+ &connp->conn_faddr_v6, connp->conn_fport, connp->conn_flowinfo);
+ mutex_exit(&connp->conn_lock);
+ if (error != 0)
+ return (error);
- return (udp_post_ip_bind_connect(udp, ire_mp, error));
+ /*
+ * Tell IP to remove the full binding and revert
+ * to the local address binding.
+ */
+ return (ip_laddr_fanout_insert(connp));
}
-
static void
udp_tpi_disconnect(queue_t *q, mblk_t *mp)
{
@@ -981,12 +881,9 @@ int
udp_disconnect(conn_t *connp)
{
int error;
- udp_t *udp = connp->conn_udp;
-
- udp->udp_dgram_errind = B_FALSE;
+ connp->conn_dgram_errind = B_FALSE;
error = udp_do_disconnect(connp);
-
if (error < 0)
error = proto_tlitosyserr(-error);
@@ -1003,8 +900,8 @@ udp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error)
/* Shorthand to generate and send TPI error acks to our client */
static void
-udp_err_ack_prim(queue_t *q, mblk_t *mp, int primitive, t_scalar_t t_error,
- int sys_error)
+udp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive,
+ t_scalar_t t_error, int sys_error)
{
struct T_error_ack *teackp;
@@ -1018,7 +915,7 @@ udp_err_ack_prim(queue_t *q, mblk_t *mp, int primitive, t_scalar_t t_error,
}
}
-/*ARGSUSED*/
+/*ARGSUSED2*/
static int
udp_extra_priv_ports_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
{
@@ -1033,7 +930,7 @@ udp_extra_priv_ports_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
return (0);
}
-/* ARGSUSED */
+/* ARGSUSED1 */
static int
udp_extra_priv_ports_add(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
cred_t *cr)
@@ -1072,7 +969,7 @@ udp_extra_priv_ports_add(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
return (0);
}
-/* ARGSUSED */
+/* ARGSUSED1 */
static int
udp_extra_priv_ports_del(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
cred_t *cr)
@@ -1109,39 +1006,41 @@ udp_extra_priv_ports_del(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
#define ICMP_MIN_UDP_HDR 4
/*
- * udp_icmp_error is called by udp_input to process ICMP msgs. passed up by IP.
+ * udp_icmp_input is called as conn_recvicmp to process ICMP messages.
* Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors.
* Assumes that IP has pulled up everything up to and including the ICMP header.
*/
+/* ARGSUSED2 */
static void
-udp_icmp_error(conn_t *connp, mblk_t *mp)
+udp_icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
{
- icmph_t *icmph;
- ipha_t *ipha;
- int iph_hdr_length;
- udpha_t *udpha;
- sin_t sin;
- sin6_t sin6;
- mblk_t *mp1;
- int error = 0;
- udp_t *udp = connp->conn_udp;
+ conn_t *connp = (conn_t *)arg1;
+ icmph_t *icmph;
+ ipha_t *ipha;
+ int iph_hdr_length;
+ udpha_t *udpha;
+ sin_t sin;
+ sin6_t sin6;
+ mblk_t *mp1;
+ int error = 0;
+ udp_t *udp = connp->conn_udp;
- mp1 = NULL;
ipha = (ipha_t *)mp->b_rptr;
ASSERT(OK_32PTR(mp->b_rptr));
if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) {
ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION);
- udp_icmp_error_ipv6(connp, mp);
+ udp_icmp_error_ipv6(connp, mp, ira);
return;
}
ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);
/* Skip past the outer IP and ICMP headers */
- iph_hdr_length = IPH_HDR_LENGTH(ipha);
+ ASSERT(IPH_HDR_LENGTH(ipha) == ira->ira_ip_hdr_length);
+ iph_hdr_length = ira->ira_ip_hdr_length;
icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
- ipha = (ipha_t *)&icmph[1];
+ ipha = (ipha_t *)&icmph[1]; /* Inner IP header */
/* Skip past the inner IP and find the ULP header */
iph_hdr_length = IPH_HDR_LENGTH(ipha);
@@ -1150,11 +1049,41 @@ udp_icmp_error(conn_t *connp, mblk_t *mp)
switch (icmph->icmph_type) {
case ICMP_DEST_UNREACHABLE:
switch (icmph->icmph_code) {
- case ICMP_FRAGMENTATION_NEEDED:
+ case ICMP_FRAGMENTATION_NEEDED: {
+ ipha_t *ipha;
+ ip_xmit_attr_t *ixa;
/*
* IP has already adjusted the path MTU.
+ * But we need to adjust DF for IPv4.
*/
+ if (connp->conn_ipversion != IPV4_VERSION)
+ break;
+
+ ixa = conn_get_ixa(connp, B_FALSE);
+ if (ixa == NULL || ixa->ixa_ire == NULL) {
+ /*
+ * Some other thread holds conn_ixa. We will
+ * redo this on the next ICMP too big.
+ */
+ if (ixa != NULL)
+ ixa_refrele(ixa);
+ break;
+ }
+ (void) ip_get_pmtu(ixa);
+
+ mutex_enter(&connp->conn_lock);
+ ipha = (ipha_t *)connp->conn_ht_iphc;
+ if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) {
+ ipha->ipha_fragment_offset_and_flags |=
+ IPH_DF_HTONS;
+ } else {
+ ipha->ipha_fragment_offset_and_flags &=
+ ~IPH_DF_HTONS;
+ }
+ mutex_exit(&connp->conn_lock);
+ ixa_refrele(ixa);
break;
+ }
case ICMP_PORT_UNREACHABLE:
case ICMP_PROTOCOL_UNREACHABLE:
error = ECONNREFUSED;
@@ -1177,25 +1106,24 @@ udp_icmp_error(conn_t *connp, mblk_t *mp)
* Deliver T_UDERROR_IND when the application has asked for it.
* The socket layer enables this automatically when connected.
*/
- if (!udp->udp_dgram_errind) {
+ if (!connp->conn_dgram_errind) {
freemsg(mp);
return;
}
-
- switch (udp->udp_family) {
+ switch (connp->conn_family) {
case AF_INET:
sin = sin_null;
sin.sin_family = AF_INET;
sin.sin_addr.s_addr = ipha->ipha_dst;
sin.sin_port = udpha->uha_dst_port;
if (IPCL_IS_NONSTR(connp)) {
- rw_enter(&udp->udp_rwlock, RW_WRITER);
+ mutex_enter(&connp->conn_lock);
if (udp->udp_state == TS_DATA_XFER) {
- if (sin.sin_port == udp->udp_dstport &&
+ if (sin.sin_port == connp->conn_fport &&
sin.sin_addr.s_addr ==
- V4_PART_OF_V6(udp->udp_v6dst)) {
- rw_exit(&udp->udp_rwlock);
+ connp->conn_faddr_v4) {
+ mutex_exit(&connp->conn_lock);
(*connp->conn_upcalls->su_set_error)
(connp->conn_upper_handle, error);
goto done;
@@ -1204,10 +1132,12 @@ udp_icmp_error(conn_t *connp, mblk_t *mp)
udp->udp_delayed_error = error;
*((sin_t *)&udp->udp_delayed_addr) = sin;
}
- rw_exit(&udp->udp_rwlock);
+ mutex_exit(&connp->conn_lock);
} else {
mp1 = mi_tpi_uderror_ind((char *)&sin, sizeof (sin_t),
NULL, 0, error);
+ if (mp1 != NULL)
+ putnext(connp->conn_rq, mp1);
}
break;
case AF_INET6:
@@ -1216,12 +1146,12 @@ udp_icmp_error(conn_t *connp, mblk_t *mp)
IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &sin6.sin6_addr);
sin6.sin6_port = udpha->uha_dst_port;
if (IPCL_IS_NONSTR(connp)) {
- rw_enter(&udp->udp_rwlock, RW_WRITER);
+ mutex_enter(&connp->conn_lock);
if (udp->udp_state == TS_DATA_XFER) {
- if (sin6.sin6_port == udp->udp_dstport &&
+ if (sin6.sin6_port == connp->conn_fport &&
IN6_ARE_ADDR_EQUAL(&sin6.sin6_addr,
- &udp->udp_v6dst)) {
- rw_exit(&udp->udp_rwlock);
+ &connp->conn_faddr_v6)) {
+ mutex_exit(&connp->conn_lock);
(*connp->conn_upcalls->su_set_error)
(connp->conn_upper_handle, error);
goto done;
@@ -1230,17 +1160,16 @@ udp_icmp_error(conn_t *connp, mblk_t *mp)
udp->udp_delayed_error = error;
*((sin6_t *)&udp->udp_delayed_addr) = sin6;
}
- rw_exit(&udp->udp_rwlock);
+ mutex_exit(&connp->conn_lock);
} else {
mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t),
NULL, 0, error);
+ if (mp1 != NULL)
+ putnext(connp->conn_rq, mp1);
}
break;
}
- if (mp1 != NULL)
- putnext(connp->conn_rq, mp1);
done:
- ASSERT(!RW_ISWRITER(&udp->udp_rwlock));
freemsg(mp);
}
@@ -1251,7 +1180,7 @@ done:
* ICMPv6 header.
*/
static void
-udp_icmp_error_ipv6(conn_t *connp, mblk_t *mp)
+udp_icmp_error_ipv6(conn_t *connp, mblk_t *mp, ip_recv_attr_t *ira)
{
icmp6_t *icmp6;
ip6_t *ip6h, *outer_ip6h;
@@ -1265,12 +1194,19 @@ udp_icmp_error_ipv6(conn_t *connp, mblk_t *mp)
udp_stack_t *us = udp->udp_us;
outer_ip6h = (ip6_t *)mp->b_rptr;
+#ifdef DEBUG
if (outer_ip6h->ip6_nxt != IPPROTO_ICMPV6)
iph_hdr_length = ip_hdr_length_v6(mp, outer_ip6h);
else
iph_hdr_length = IPV6_HDR_LEN;
+ ASSERT(iph_hdr_length == ira->ira_ip_hdr_length);
+#endif
+ /* Skip past the outer IP and ICMP headers */
+ iph_hdr_length = ira->ira_ip_hdr_length;
icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length];
- ip6h = (ip6_t *)&icmp6[1];
+
+ /* Skip past the inner IP and find the ULP header */
+ ip6h = (ip6_t *)&icmp6[1]; /* Inner IP header */
if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp)) {
freemsg(mp);
return;
@@ -1308,7 +1244,7 @@ udp_icmp_error_ipv6(conn_t *connp, mblk_t *mp)
* information, send up an empty message containing an
* IPV6_PATHMTU ancillary data item.
*/
- if (!udp->udp_ipv6_recvpathmtu)
+ if (!connp->conn_ipv6_recvpathmtu)
break;
udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t) +
@@ -1334,7 +1270,7 @@ udp_icmp_error_ipv6(conn_t *connp, mblk_t *mp)
sin6 = (sin6_t *)&tudi[1];
bzero(sin6, sizeof (sin6_t));
sin6->sin6_family = AF_INET6;
- sin6->sin6_addr = udp->udp_v6dst;
+ sin6->sin6_addr = connp->conn_faddr_v6;
toh = (struct T_opthdr *)&sin6[1];
toh->level = IPPROTO_IPV6;
@@ -1352,8 +1288,7 @@ udp_icmp_error_ipv6(conn_t *connp, mblk_t *mp)
* message. Free it, then send our empty message.
*/
freemsg(mp);
- udp_ulp_recv(connp, newmp);
-
+ udp_ulp_recv(connp, newmp, msgdsize(newmp), ira);
return;
}
case ICMP6_TIME_EXCEEDED:
@@ -1378,7 +1313,7 @@ udp_icmp_error_ipv6(conn_t *connp, mblk_t *mp)
* Deliver T_UDERROR_IND when the application has asked for it.
* The socket layer enables this automatically when connected.
*/
- if (!udp->udp_dgram_errind) {
+ if (!connp->conn_dgram_errind) {
freemsg(mp);
return;
}
@@ -1390,12 +1325,12 @@ udp_icmp_error_ipv6(conn_t *connp, mblk_t *mp)
sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
if (IPCL_IS_NONSTR(connp)) {
- rw_enter(&udp->udp_rwlock, RW_WRITER);
+ mutex_enter(&connp->conn_lock);
if (udp->udp_state == TS_DATA_XFER) {
- if (sin6.sin6_port == udp->udp_dstport &&
+ if (sin6.sin6_port == connp->conn_fport &&
IN6_ARE_ADDR_EQUAL(&sin6.sin6_addr,
- &udp->udp_v6dst)) {
- rw_exit(&udp->udp_rwlock);
+ &connp->conn_faddr_v6)) {
+ mutex_exit(&connp->conn_lock);
(*connp->conn_upcalls->su_set_error)
(connp->conn_upper_handle, error);
goto done;
@@ -1404,7 +1339,7 @@ udp_icmp_error_ipv6(conn_t *connp, mblk_t *mp)
udp->udp_delayed_error = error;
*((sin6_t *)&udp->udp_delayed_addr) = sin6;
}
- rw_exit(&udp->udp_rwlock);
+ mutex_exit(&connp->conn_lock);
} else {
mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t),
NULL, 0, error);
@@ -1412,7 +1347,6 @@ udp_icmp_error_ipv6(conn_t *connp, mblk_t *mp)
putnext(connp->conn_rq, mp1);
}
done:
- ASSERT(!RW_ISWRITER(&udp->udp_rwlock));
freemsg(mp);
}
@@ -1426,11 +1360,12 @@ done:
static void
udp_addr_req(queue_t *q, mblk_t *mp)
{
- sin_t *sin;
- sin6_t *sin6;
+ struct sockaddr *sa;
mblk_t *ackmp;
struct T_addr_ack *taa;
udp_t *udp = Q_TO_UDP(q);
+ conn_t *connp = udp->udp_connp;
+ uint_t addrlen;
/* Make it large enough for worst case */
ackmp = reallocb(mp, sizeof (struct T_addr_ack) +
@@ -1446,7 +1381,13 @@ udp_addr_req(queue_t *q, mblk_t *mp)
taa->PRIM_type = T_ADDR_ACK;
ackmp->b_datap->db_type = M_PCPROTO;
- rw_enter(&udp->udp_rwlock, RW_READER);
+
+ if (connp->conn_family == AF_INET)
+ addrlen = sizeof (sin_t);
+ else
+ addrlen = sizeof (sin6_t);
+
+ mutex_enter(&connp->conn_lock);
/*
* Note: Following code assumes 32 bit alignment of basic
* data structures like sin_t and struct T_addr_ack.
@@ -1456,91 +1397,23 @@ udp_addr_req(queue_t *q, mblk_t *mp)
* Fill in local address first
*/
taa->LOCADDR_offset = sizeof (*taa);
- if (udp->udp_family == AF_INET) {
- taa->LOCADDR_length = sizeof (sin_t);
- sin = (sin_t *)&taa[1];
- /* Fill zeroes and then initialize non-zero fields */
- *sin = sin_null;
- sin->sin_family = AF_INET;
- if (!IN6_IS_ADDR_V4MAPPED_ANY(&udp->udp_v6src) &&
- !IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src)) {
- IN6_V4MAPPED_TO_IPADDR(&udp->udp_v6src,
- sin->sin_addr.s_addr);
- } else {
- /*
- * INADDR_ANY
- * udp_v6src is not set, we might be bound to
- * broadcast/multicast. Use udp_bound_v6src as
- * local address instead (that could
- * also still be INADDR_ANY)
- */
- IN6_V4MAPPED_TO_IPADDR(&udp->udp_bound_v6src,
- sin->sin_addr.s_addr);
- }
- sin->sin_port = udp->udp_port;
- ackmp->b_wptr = (uchar_t *)&sin[1];
- if (udp->udp_state == TS_DATA_XFER) {
- /*
- * connected, fill remote address too
- */
- taa->REMADDR_length = sizeof (sin_t);
- /* assumed 32-bit alignment */
- taa->REMADDR_offset = taa->LOCADDR_offset +
- taa->LOCADDR_length;
-
- sin = (sin_t *)(ackmp->b_rptr +
- taa->REMADDR_offset);
- /* initialize */
- *sin = sin_null;
- sin->sin_family = AF_INET;
- sin->sin_addr.s_addr =
- V4_PART_OF_V6(udp->udp_v6dst);
- sin->sin_port = udp->udp_dstport;
- ackmp->b_wptr = (uchar_t *)&sin[1];
- }
- } else {
- taa->LOCADDR_length = sizeof (sin6_t);
- sin6 = (sin6_t *)&taa[1];
- /* Fill zeroes and then initialize non-zero fields */
- *sin6 = sin6_null;
- sin6->sin6_family = AF_INET6;
- if (!IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src)) {
- sin6->sin6_addr = udp->udp_v6src;
- } else {
- /*
- * UNSPECIFIED
- * udp_v6src is not set, we might be bound to
- * broadcast/multicast. Use udp_bound_v6src as
- * local address instead (that could
- * also still be UNSPECIFIED)
- */
- sin6->sin6_addr =
- udp->udp_bound_v6src;
- }
- sin6->sin6_port = udp->udp_port;
- ackmp->b_wptr = (uchar_t *)&sin6[1];
- if (udp->udp_state == TS_DATA_XFER) {
- /*
- * connected, fill remote address too
- */
- taa->REMADDR_length = sizeof (sin6_t);
- /* assumed 32-bit alignment */
- taa->REMADDR_offset = taa->LOCADDR_offset +
- taa->LOCADDR_length;
-
- sin6 = (sin6_t *)(ackmp->b_rptr +
- taa->REMADDR_offset);
- /* initialize */
- *sin6 = sin6_null;
- sin6->sin6_family = AF_INET6;
- sin6->sin6_addr = udp->udp_v6dst;
- sin6->sin6_port = udp->udp_dstport;
- ackmp->b_wptr = (uchar_t *)&sin6[1];
- }
- ackmp->b_wptr = (uchar_t *)&sin6[1];
- }
+ taa->LOCADDR_length = addrlen;
+ sa = (struct sockaddr *)&taa[1];
+ (void) conn_getsockname(connp, sa, &addrlen);
+ ackmp->b_wptr += addrlen;
}
- rw_exit(&udp->udp_rwlock);
+ if (udp->udp_state == TS_DATA_XFER) {
+ /*
+ * connected, fill remote address too
+ */
+ taa->REMADDR_length = addrlen;
+ /* assumed 32-bit alignment */
+ taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length;
+ sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset);
+ (void) conn_getpeername(connp, sa, &addrlen);
+ ackmp->b_wptr += addrlen;
+ }
+ mutex_exit(&connp->conn_lock);
ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim);
qreply(q, ackmp);
}
@@ -1548,7 +1421,9 @@ udp_addr_req(queue_t *q, mblk_t *mp)
static void
udp_copy_info(struct T_info_ack *tap, udp_t *udp)
{
- if (udp->udp_family == AF_INET) {
+ conn_t *connp = udp->udp_connp;
+
+ if (connp->conn_family == AF_INET) {
*tap = udp_g_t_info_ack_ipv4;
} else {
*tap = udp_g_t_info_ack_ipv6;
@@ -1632,20 +1507,15 @@ udp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
* This is the open routine for udp. It allocates a udp_t structure for
* the stream and, on the first open of the module, creates an ND table.
*/
-/*ARGSUSED2*/
static int
udp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
boolean_t isv6)
{
- int error;
udp_t *udp;
conn_t *connp;
dev_t conn_dev;
- udp_stack_t *us;
vmem_t *minor_arena;
- TRACE_1(TR_FAC_UDP, TR_UDP_OPEN, "udp_open: q %p", q);
-
/* If the stream is already open, return immediately. */
if (q->q_ptr != NULL)
return (0);
@@ -1685,7 +1555,6 @@ udp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
return (ENOMEM);
}
udp = connp->conn_udp;
- us = udp->udp_us;
*devp = makedevice(getemajor(*devp), (minor_t)conn_dev);
connp->conn_dev = conn_dev;
@@ -1699,39 +1568,27 @@ udp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
connp->conn_rq = q;
connp->conn_wq = WR(q);
- rw_enter(&udp->udp_rwlock, RW_WRITER);
- ASSERT(connp->conn_ulp == IPPROTO_UDP);
+ /*
+ * Since this conn_t/udp_t is not yet visible to anybody else we don't
+ * need to lock anything.
+ */
+ ASSERT(connp->conn_proto == IPPROTO_UDP);
ASSERT(connp->conn_udp == udp);
ASSERT(udp->udp_connp == connp);
if (flag & SO_SOCKSTR) {
- connp->conn_flags |= IPCL_SOCKET;
udp->udp_issocket = B_TRUE;
}
- q->q_hiwat = us->us_recv_hiwat;
- WR(q)->q_hiwat = us->us_xmit_hiwat;
- WR(q)->q_lowat = us->us_xmit_lowat;
+ WR(q)->q_hiwat = connp->conn_sndbuf;
+ WR(q)->q_lowat = connp->conn_sndlowat;
qprocson(q);
- if (udp->udp_family == AF_INET6) {
- /* Build initial header template for transmit */
- if ((error = udp_build_hdrs(udp)) != 0) {
- rw_exit(&udp->udp_rwlock);
- qprocsoff(q);
- inet_minor_free(minor_arena, conn_dev);
- ipcl_conn_destroy(connp);
- return (error);
- }
- }
- rw_exit(&udp->udp_rwlock);
-
/* Set the Stream head write offset and high watermark. */
- (void) proto_set_tx_wroff(q, connp,
- udp->udp_max_hdr_len + us->us_wroff_extra);
- /* XXX udp_set_rcv_hiwat() doesn't hold the lock, is it a bug??? */
- (void) proto_set_rx_hiwat(q, connp, udp_set_rcv_hiwat(udp, q->q_hiwat));
+ (void) proto_set_tx_wroff(q, connp, connp->conn_wroff);
+ (void) proto_set_rx_hiwat(q, connp,
+ udp_set_rcv_hiwat(udp, connp->conn_rcvbuf));
mutex_enter(&connp->conn_lock);
connp->conn_state_flags &= ~CONN_INCIPIENT;
@@ -1753,7 +1610,6 @@ udp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name)
* This routine gets default values of certain options whose default
* values are maintained by protcol specific code
*/
-/* ARGSUSED */
int
udp_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
{
@@ -1791,456 +1647,127 @@ udp_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
/*
* This routine retrieves the current status of socket options.
- * It returns the size of the option retrieved.
+ * It returns the size of the option retrieved, or -1.
*/
-static int
-udp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
+int
+udp_opt_get(conn_t *connp, t_scalar_t level, t_scalar_t name,
+ uchar_t *ptr)
{
- udp_t *udp = connp->conn_udp;
- udp_stack_t *us = udp->udp_us;
int *i1 = (int *)ptr;
- ip6_pkt_t *ipp = &udp->udp_sticky_ipp;
+ udp_t *udp = connp->conn_udp;
int len;
+ conn_opt_arg_t coas;
+ int retval;
- ASSERT(RW_READ_HELD(&udp->udp_rwlock));
- switch (level) {
- case SOL_SOCKET:
- switch (name) {
- case SO_DEBUG:
- *i1 = udp->udp_debug;
- break; /* goto sizeof (int) option return */
- case SO_REUSEADDR:
- *i1 = udp->udp_reuseaddr;
- break; /* goto sizeof (int) option return */
- case SO_TYPE:
- *i1 = SOCK_DGRAM;
- break; /* goto sizeof (int) option return */
+ coas.coa_connp = connp;
+ coas.coa_ixa = connp->conn_ixa;
+ coas.coa_ipp = &connp->conn_xmit_ipp;
+ coas.coa_ancillary = B_FALSE;
+ coas.coa_changed = 0;
+ /*
+ * We assume that the optcom framework has checked for the set
+ * of levels and names that are supported, hence we don't worry
+ * about rejecting based on that.
+ * First check for UDP specific handling, then pass to common routine.
+ */
+ switch (level) {
+ case IPPROTO_IP:
/*
- * The following three items are available here,
- * but are only meaningful to IP.
+ * Only allow IPv4 option processing on IPv4 sockets.
*/
- case SO_DONTROUTE:
- *i1 = udp->udp_dontroute;
- break; /* goto sizeof (int) option return */
- case SO_USELOOPBACK:
- *i1 = udp->udp_useloopback;
- break; /* goto sizeof (int) option return */
- case SO_BROADCAST:
- *i1 = udp->udp_broadcast;
- break; /* goto sizeof (int) option return */
-
- case SO_SNDBUF:
- *i1 = udp->udp_xmit_hiwat;
- break; /* goto sizeof (int) option return */
- case SO_RCVBUF:
- *i1 = udp->udp_rcv_disply_hiwat;
- break; /* goto sizeof (int) option return */
- case SO_DGRAM_ERRIND:
- *i1 = udp->udp_dgram_errind;
- break; /* goto sizeof (int) option return */
- case SO_RECVUCRED:
- *i1 = udp->udp_recvucred;
- break; /* goto sizeof (int) option return */
- case SO_TIMESTAMP:
- *i1 = udp->udp_timestamp;
- break; /* goto sizeof (int) option return */
- case SO_ANON_MLP:
- *i1 = connp->conn_anon_mlp;
- break; /* goto sizeof (int) option return */
- case SO_MAC_EXEMPT:
- *i1 = (connp->conn_mac_mode == CONN_MAC_AWARE);
- break;
- case SO_MAC_IMPLICIT:
- *i1 = (connp->conn_mac_mode == CONN_MAC_IMPLICIT);
- break;
- case SO_ALLZONES:
- *i1 = connp->conn_allzones;
- break; /* goto sizeof (int) option return */
- case SO_EXCLBIND:
- *i1 = udp->udp_exclbind ? SO_EXCLBIND : 0;
- break;
- case SO_PROTOTYPE:
- *i1 = IPPROTO_UDP;
- break;
- case SO_DOMAIN:
- *i1 = udp->udp_family;
- break;
- default:
- return (-1);
- }
- break;
- case IPPROTO_IP:
- if (udp->udp_family != AF_INET)
+ if (connp->conn_family != AF_INET)
return (-1);
+
switch (name) {
case IP_OPTIONS:
case T_IP_OPTIONS:
- len = udp->udp_ip_rcv_options_len - udp->udp_label_len;
- if (len > 0) {
- bcopy(udp->udp_ip_rcv_options +
- udp->udp_label_len, ptr, len);
- }
- return (len);
- case IP_TOS:
- case T_IP_TOS:
- *i1 = (int)udp->udp_type_of_service;
- break; /* goto sizeof (int) option return */
- case IP_TTL:
- *i1 = (int)udp->udp_ttl;
- break; /* goto sizeof (int) option return */
- case IP_DHCPINIT_IF:
- return (-EINVAL);
- case IP_NEXTHOP:
- case IP_RECVPKTINFO:
- /*
- * This also handles IP_PKTINFO.
- * IP_PKTINFO and IP_RECVPKTINFO have the same value.
- * Differentiation is based on the size of the argument
- * passed in.
- * This option is handled in IP which will return an
- * error for IP_PKTINFO as it's not supported as a
- * sticky option.
- */
- return (-EINVAL);
- case IP_MULTICAST_IF:
- /* 0 address if not set */
- *(ipaddr_t *)ptr = udp->udp_multicast_if_addr;
- return (sizeof (ipaddr_t));
- case IP_MULTICAST_TTL:
- *(uchar_t *)ptr = udp->udp_multicast_ttl;
- return (sizeof (uchar_t));
- case IP_MULTICAST_LOOP:
- *ptr = connp->conn_multicast_loop;
- return (sizeof (uint8_t));
- case IP_RECVOPTS:
- *i1 = udp->udp_recvopts;
- break; /* goto sizeof (int) option return */
- case IP_RECVDSTADDR:
- *i1 = udp->udp_recvdstaddr;
- break; /* goto sizeof (int) option return */
- case IP_RECVIF:
- *i1 = udp->udp_recvif;
- break; /* goto sizeof (int) option return */
- case IP_RECVSLLA:
- *i1 = udp->udp_recvslla;
- break; /* goto sizeof (int) option return */
- case IP_RECVTTL:
- *i1 = udp->udp_recvttl;
- break; /* goto sizeof (int) option return */
- case IP_ADD_MEMBERSHIP:
- case IP_DROP_MEMBERSHIP:
- case IP_BLOCK_SOURCE:
- case IP_UNBLOCK_SOURCE:
- case IP_ADD_SOURCE_MEMBERSHIP:
- case IP_DROP_SOURCE_MEMBERSHIP:
- case MCAST_JOIN_GROUP:
- case MCAST_LEAVE_GROUP:
- case MCAST_BLOCK_SOURCE:
- case MCAST_UNBLOCK_SOURCE:
- case MCAST_JOIN_SOURCE_GROUP:
- case MCAST_LEAVE_SOURCE_GROUP:
- /* cannot "get" the value for these */
- return (-1);
- case IP_BOUND_IF:
- /* Zero if not set */
- *i1 = udp->udp_bound_if;
- break; /* goto sizeof (int) option return */
- case IP_UNSPEC_SRC:
- *i1 = udp->udp_unspec_source;
- break; /* goto sizeof (int) option return */
- case IP_BROADCAST_TTL:
- *(uchar_t *)ptr = connp->conn_broadcast_ttl;
- return (sizeof (uchar_t));
- default:
- return (-1);
- }
- break;
- case IPPROTO_IPV6:
- if (udp->udp_family != AF_INET6)
- return (-1);
- switch (name) {
- case IPV6_UNICAST_HOPS:
- *i1 = (unsigned int)udp->udp_ttl;
- break; /* goto sizeof (int) option return */
- case IPV6_MULTICAST_IF:
- /* 0 index if not set */
- *i1 = udp->udp_multicast_if_index;
- break; /* goto sizeof (int) option return */
- case IPV6_MULTICAST_HOPS:
- *i1 = udp->udp_multicast_ttl;
- break; /* goto sizeof (int) option return */
- case IPV6_MULTICAST_LOOP:
- *i1 = connp->conn_multicast_loop;
- break; /* goto sizeof (int) option return */
- case IPV6_JOIN_GROUP:
- case IPV6_LEAVE_GROUP:
- case MCAST_JOIN_GROUP:
- case MCAST_LEAVE_GROUP:
- case MCAST_BLOCK_SOURCE:
- case MCAST_UNBLOCK_SOURCE:
- case MCAST_JOIN_SOURCE_GROUP:
- case MCAST_LEAVE_SOURCE_GROUP:
- /* cannot "get" the value for these */
- return (-1);
- case IPV6_BOUND_IF:
- /* Zero if not set */
- *i1 = udp->udp_bound_if;
- break; /* goto sizeof (int) option return */
- case IPV6_UNSPEC_SRC:
- *i1 = udp->udp_unspec_source;
- break; /* goto sizeof (int) option return */
- case IPV6_RECVPKTINFO:
- *i1 = udp->udp_ip_recvpktinfo;
- break; /* goto sizeof (int) option return */
- case IPV6_RECVTCLASS:
- *i1 = udp->udp_ipv6_recvtclass;
- break; /* goto sizeof (int) option return */
- case IPV6_RECVPATHMTU:
- *i1 = udp->udp_ipv6_recvpathmtu;
- break; /* goto sizeof (int) option return */
- case IPV6_RECVHOPLIMIT:
- *i1 = udp->udp_ipv6_recvhoplimit;
- break; /* goto sizeof (int) option return */
- case IPV6_RECVHOPOPTS:
- *i1 = udp->udp_ipv6_recvhopopts;
- break; /* goto sizeof (int) option return */
- case IPV6_RECVDSTOPTS:
- *i1 = udp->udp_ipv6_recvdstopts;
- break; /* goto sizeof (int) option return */
- case _OLD_IPV6_RECVDSTOPTS:
- *i1 = udp->udp_old_ipv6_recvdstopts;
- break; /* goto sizeof (int) option return */
- case IPV6_RECVRTHDRDSTOPTS:
- *i1 = udp->udp_ipv6_recvrthdrdstopts;
- break; /* goto sizeof (int) option return */
- case IPV6_RECVRTHDR:
- *i1 = udp->udp_ipv6_recvrthdr;
- break; /* goto sizeof (int) option return */
- case IPV6_PKTINFO: {
- /* XXX assumes that caller has room for max size! */
- struct in6_pktinfo *pkti;
-
- pkti = (struct in6_pktinfo *)ptr;
- if (ipp->ipp_fields & IPPF_IFINDEX)
- pkti->ipi6_ifindex = ipp->ipp_ifindex;
- else
- pkti->ipi6_ifindex = 0;
- if (ipp->ipp_fields & IPPF_ADDR)
- pkti->ipi6_addr = ipp->ipp_addr;
- else
- pkti->ipi6_addr = ipv6_all_zeros;
- return (sizeof (struct in6_pktinfo));
- }
- case IPV6_TCLASS:
- if (ipp->ipp_fields & IPPF_TCLASS)
- *i1 = ipp->ipp_tclass;
- else
- *i1 = IPV6_FLOW_TCLASS(
- IPV6_DEFAULT_VERS_AND_FLOW);
- break; /* goto sizeof (int) option return */
- case IPV6_NEXTHOP: {
- sin6_t *sin6 = (sin6_t *)ptr;
-
- if (!(ipp->ipp_fields & IPPF_NEXTHOP))
- return (0);
- *sin6 = sin6_null;
- sin6->sin6_family = AF_INET6;
- sin6->sin6_addr = ipp->ipp_nexthop;
- return (sizeof (sin6_t));
- }
- case IPV6_HOPOPTS:
- if (!(ipp->ipp_fields & IPPF_HOPOPTS))
- return (0);
- if (ipp->ipp_hopoptslen <= udp->udp_label_len_v6)
+ mutex_enter(&connp->conn_lock);
+ if (!(udp->udp_recv_ipp.ipp_fields &
+ IPPF_IPV4_OPTIONS)) {
+ mutex_exit(&connp->conn_lock);
return (0);
- /*
- * The cipso/label option is added by kernel.
- * User is not usually aware of this option.
- * We copy out the hbh opt after the label option.
- */
- bcopy((char *)ipp->ipp_hopopts + udp->udp_label_len_v6,
- ptr, ipp->ipp_hopoptslen - udp->udp_label_len_v6);
- if (udp->udp_label_len_v6 > 0) {
- ptr[0] = ((char *)ipp->ipp_hopopts)[0];
- ptr[1] = (ipp->ipp_hopoptslen -
- udp->udp_label_len_v6 + 7) / 8 - 1;
}
- return (ipp->ipp_hopoptslen - udp->udp_label_len_v6);
- case IPV6_RTHDRDSTOPTS:
- if (!(ipp->ipp_fields & IPPF_RTDSTOPTS))
- return (0);
- bcopy(ipp->ipp_rtdstopts, ptr, ipp->ipp_rtdstoptslen);
- return (ipp->ipp_rtdstoptslen);
- case IPV6_RTHDR:
- if (!(ipp->ipp_fields & IPPF_RTHDR))
- return (0);
- bcopy(ipp->ipp_rthdr, ptr, ipp->ipp_rthdrlen);
- return (ipp->ipp_rthdrlen);
- case IPV6_DSTOPTS:
- if (!(ipp->ipp_fields & IPPF_DSTOPTS))
- return (0);
- bcopy(ipp->ipp_dstopts, ptr, ipp->ipp_dstoptslen);
- return (ipp->ipp_dstoptslen);
- case IPV6_PATHMTU:
- return (ip_fill_mtuinfo(&udp->udp_v6dst,
- udp->udp_dstport, (struct ip6_mtuinfo *)ptr,
- us->us_netstack));
- default:
- return (-1);
+
+ len = udp->udp_recv_ipp.ipp_ipv4_options_len;
+ ASSERT(len != 0);
+ bcopy(udp->udp_recv_ipp.ipp_ipv4_options, ptr, len);
+ mutex_exit(&connp->conn_lock);
+ return (len);
}
break;
case IPPROTO_UDP:
switch (name) {
- case UDP_ANONPRIVBIND:
- *i1 = udp->udp_anon_priv_bind;
- break;
- case UDP_EXCLBIND:
- *i1 = udp->udp_exclbind ? UDP_EXCLBIND : 0;
- break;
- case UDP_RCVHDR:
- *i1 = udp->udp_rcvhdr ? 1 : 0;
- break;
case UDP_NAT_T_ENDPOINT:
+ mutex_enter(&connp->conn_lock);
*i1 = udp->udp_nat_t_endpoint;
- break;
- default:
- return (-1);
+ mutex_exit(&connp->conn_lock);
+ return (sizeof (int));
+ case UDP_RCVHDR:
+ mutex_enter(&connp->conn_lock);
+ *i1 = udp->udp_rcvhdr ? 1 : 0;
+ mutex_exit(&connp->conn_lock);
+ return (sizeof (int));
}
- break;
- default:
- return (-1);
}
- return (sizeof (int));
+ mutex_enter(&connp->conn_lock);
+ retval = conn_opt_get(&coas, level, name, ptr);
+ mutex_exit(&connp->conn_lock);
+ return (retval);
}
+/*
+ * This routine retrieves the current status of socket options.
+ * It returns the size of the option retrieved, or -1.
+ */
int
udp_tpi_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
{
- udp_t *udp;
- int err;
-
- udp = Q_TO_UDP(q);
+ conn_t *connp = Q_TO_CONN(q);
+ int err;
- rw_enter(&udp->udp_rwlock, RW_READER);
- err = udp_opt_get(Q_TO_CONN(q), level, name, ptr);
- rw_exit(&udp->udp_rwlock);
+ err = udp_opt_get(connp, level, name, ptr);
return (err);
}
/*
* This routine sets socket options.
*/
-/* ARGSUSED */
-static int
-udp_do_opt_set(conn_t *connp, int level, int name, uint_t inlen,
- uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, cred_t *cr,
- void *thisdg_attrs, boolean_t checkonly)
+int
+udp_do_opt_set(conn_opt_arg_t *coa, int level, int name,
+ uint_t inlen, uchar_t *invalp, cred_t *cr, boolean_t checkonly)
{
- udpattrs_t *attrs = thisdg_attrs;
- int *i1 = (int *)invalp;
- boolean_t onoff = (*i1 == 0) ? 0 : 1;
- udp_t *udp = connp->conn_udp;
+ conn_t *connp = coa->coa_connp;
+ ip_xmit_attr_t *ixa = coa->coa_ixa;
+ udp_t *udp = connp->conn_udp;
udp_stack_t *us = udp->udp_us;
- int error;
- uint_t newlen;
- size_t sth_wroff;
+ int *i1 = (int *)invalp;
+ boolean_t onoff = (*i1 == 0) ? 0 : 1;
+ int error;
- ASSERT(RW_WRITE_HELD(&udp->udp_rwlock));
+ ASSERT(MUTEX_NOT_HELD(&coa->coa_connp->conn_lock));
/*
- * For fixed length options, no sanity check
- * of passed in length is done. It is assumed *_optcom_req()
- * routines do the right thing.
+ * First do UDP specific sanity checks and handle UDP specific
+ * options. Note that some IPPROTO_UDP options are handled
+ * by conn_opt_set.
*/
switch (level) {
case SOL_SOCKET:
switch (name) {
- case SO_REUSEADDR:
- if (!checkonly) {
- udp->udp_reuseaddr = onoff;
- PASS_OPT_TO_IP(connp);
- }
- break;
- case SO_DEBUG:
- if (!checkonly)
- udp->udp_debug = onoff;
- break;
- /*
- * The following three items are available here,
- * but are only meaningful to IP.
- */
- case SO_DONTROUTE:
- if (!checkonly) {
- udp->udp_dontroute = onoff;
- PASS_OPT_TO_IP(connp);
- }
- break;
- case SO_USELOOPBACK:
- if (!checkonly) {
- udp->udp_useloopback = onoff;
- PASS_OPT_TO_IP(connp);
- }
- break;
- case SO_BROADCAST:
- if (!checkonly) {
- udp->udp_broadcast = onoff;
- PASS_OPT_TO_IP(connp);
- }
- break;
-
case SO_SNDBUF:
if (*i1 > us->us_max_buf) {
- *outlenp = 0;
return (ENOBUFS);
}
- if (!checkonly) {
- udp->udp_xmit_hiwat = *i1;
- connp->conn_wq->q_hiwat = *i1;
- }
break;
case SO_RCVBUF:
if (*i1 > us->us_max_buf) {
- *outlenp = 0;
return (ENOBUFS);
}
- if (!checkonly) {
- int size;
-
- udp->udp_rcv_disply_hiwat = *i1;
- size = udp_set_rcv_hiwat(udp, *i1);
- rw_exit(&udp->udp_rwlock);
- (void) proto_set_rx_hiwat(connp->conn_rq, connp,
- size);
- rw_enter(&udp->udp_rwlock, RW_WRITER);
- }
- break;
- case SO_DGRAM_ERRIND:
- if (!checkonly)
- udp->udp_dgram_errind = onoff;
- break;
- case SO_RECVUCRED:
- if (!checkonly)
- udp->udp_recvucred = onoff;
- break;
- case SO_ALLZONES:
- /*
- * "soft" error (negative)
- * option not handled at this level
- * Do not modify *outlenp.
- */
- return (-EINVAL);
- case SO_TIMESTAMP:
- if (!checkonly)
- udp->udp_timestamp = onoff;
- break;
- case SO_ANON_MLP:
- case SO_MAC_EXEMPT:
- case SO_MAC_IMPLICIT:
- PASS_OPT_TO_IP(connp);
break;
+
case SCM_UCRED: {
struct ucred_s *ucr;
- cred_t *cr, *newcr;
+ cred_t *newcr;
ts_label_t *tsl;
/*
@@ -2250,20 +1777,18 @@ udp_do_opt_set(conn_t *connp, int level, int name, uint_t inlen,
*/
if (connp->conn_mlp_type == mlptSingle)
break;
+
ucr = (struct ucred_s *)invalp;
if (inlen != ucredsize ||
ucr->uc_labeloff < sizeof (*ucr) ||
ucr->uc_labeloff + sizeof (bslabel_t) > inlen)
return (EINVAL);
if (!checkonly) {
- mblk_t *mb;
- pid_t cpid;
-
- if (attrs == NULL ||
- (mb = attrs->udpattr_mb) == NULL)
- return (EINVAL);
- if ((cr = msg_getcred(mb, &cpid)) == NULL)
- cr = udp->udp_connp->conn_cred;
+ /*
+ * Set ixa_tsl to the new label.
+ * We assume that crgetzoneid doesn't change
+ * as part of the SCM_UCRED.
+ */
ASSERT(cr != NULL);
if ((tsl = crgetlabel(cr)) == NULL)
return (EINVAL);
@@ -2271,778 +1796,75 @@ udp_do_opt_set(conn_t *connp, int level, int name, uint_t inlen,
tsl->tsl_doi, KM_NOSLEEP);
if (newcr == NULL)
return (ENOSR);
- mblk_setcred(mb, newcr, cpid);
- attrs->udpattr_credset = B_TRUE;
- crfree(newcr);
- }
- break;
- }
- case SO_EXCLBIND:
- if (!checkonly)
- udp->udp_exclbind = onoff;
- break;
- case SO_RCVTIMEO:
- case SO_SNDTIMEO:
- /*
- * Pass these two options in order for third part
- * protocol usage. Here just return directly.
- */
- return (0);
- default:
- *outlenp = 0;
- return (EINVAL);
- }
- break;
- case IPPROTO_IP:
- if (udp->udp_family != AF_INET) {
- *outlenp = 0;
- return (ENOPROTOOPT);
- }
- switch (name) {
- case IP_OPTIONS:
- case T_IP_OPTIONS:
- /* Save options for use by IP. */
- newlen = inlen + udp->udp_label_len;
- if ((inlen & 0x3) || newlen > IP_MAX_OPT_LENGTH) {
- *outlenp = 0;
- return (EINVAL);
- }
- if (checkonly)
- break;
-
- /*
- * Update the stored options taking into account
- * any CIPSO option which we should not overwrite.
- */
- if (!tsol_option_set(&udp->udp_ip_snd_options,
- &udp->udp_ip_snd_options_len,
- udp->udp_label_len, invalp, inlen)) {
- *outlenp = 0;
- return (ENOMEM);
- }
-
- udp->udp_max_hdr_len = IP_SIMPLE_HDR_LENGTH +
- UDPH_SIZE + udp->udp_ip_snd_options_len;
- sth_wroff = udp->udp_max_hdr_len + us->us_wroff_extra;
- rw_exit(&udp->udp_rwlock);
- (void) proto_set_tx_wroff(connp->conn_rq, connp,
- sth_wroff);
- rw_enter(&udp->udp_rwlock, RW_WRITER);
- break;
-
- case IP_TTL:
- if (!checkonly) {
- udp->udp_ttl = (uchar_t)*i1;
- }
- break;
- case IP_TOS:
- case T_IP_TOS:
- if (!checkonly) {
- udp->udp_type_of_service = (uchar_t)*i1;
- }
- break;
- case IP_MULTICAST_IF: {
- /*
- * TODO should check OPTMGMT reply and undo this if
- * there is an error.
- */
- struct in_addr *inap = (struct in_addr *)invalp;
- if (!checkonly) {
- udp->udp_multicast_if_addr =
- inap->s_addr;
- PASS_OPT_TO_IP(connp);
- }
- break;
- }
- case IP_MULTICAST_TTL:
- if (!checkonly)
- udp->udp_multicast_ttl = *invalp;
- break;
- case IP_MULTICAST_LOOP:
- if (!checkonly) {
- connp->conn_multicast_loop = *invalp;
- PASS_OPT_TO_IP(connp);
- }
- break;
- case IP_RECVOPTS:
- if (!checkonly)
- udp->udp_recvopts = onoff;
- break;
- case IP_RECVDSTADDR:
- if (!checkonly)
- udp->udp_recvdstaddr = onoff;
- break;
- case IP_RECVIF:
- if (!checkonly) {
- udp->udp_recvif = onoff;
- PASS_OPT_TO_IP(connp);
- }
- break;
- case IP_RECVSLLA:
- if (!checkonly) {
- udp->udp_recvslla = onoff;
- PASS_OPT_TO_IP(connp);
- }
- break;
- case IP_RECVTTL:
- if (!checkonly)
- udp->udp_recvttl = onoff;
- break;
- case IP_PKTINFO: {
- /*
- * This also handles IP_RECVPKTINFO.
- * IP_PKTINFO and IP_RECVPKTINFO have same value.
- * Differentiation is based on the size of the
- * argument passed in.
- */
- struct in_pktinfo *pktinfop;
- ip4_pkt_t *attr_pktinfop;
-
- if (checkonly)
- break;
-
- if (inlen == sizeof (int)) {
- /*
- * This is IP_RECVPKTINFO option.
- * Keep a local copy of whether this option is
- * set or not and pass it down to IP for
- * processing.
- */
-
- udp->udp_ip_recvpktinfo = onoff;
- return (-EINVAL);
- }
-
- if (attrs == NULL ||
- (attr_pktinfop = attrs->udpattr_ipp4) == NULL) {
+ ASSERT(newcr->cr_label != NULL);
/*
- * sticky option or no buffer to return
- * the results.
+ * Move the hold on the cr_label to ixa_tsl by
+ * setting cr_label to NULL. Then release newcr.
*/
- return (EINVAL);
- }
-
- if (inlen != sizeof (struct in_pktinfo))
- return (EINVAL);
-
- pktinfop = (struct in_pktinfo *)invalp;
-
- /*
- * At least one of the values should be specified
- */
- if (pktinfop->ipi_ifindex == 0 &&
- pktinfop->ipi_spec_dst.s_addr == INADDR_ANY) {
- return (EINVAL);
- }
-
- attr_pktinfop->ip4_addr = pktinfop->ipi_spec_dst.s_addr;
- attr_pktinfop->ip4_ill_index = pktinfop->ipi_ifindex;
-
- break;
- }
- case IP_ADD_MEMBERSHIP:
- case IP_DROP_MEMBERSHIP:
- case IP_BLOCK_SOURCE:
- case IP_UNBLOCK_SOURCE:
- case IP_ADD_SOURCE_MEMBERSHIP:
- case IP_DROP_SOURCE_MEMBERSHIP:
- case MCAST_JOIN_GROUP:
- case MCAST_LEAVE_GROUP:
- case MCAST_BLOCK_SOURCE:
- case MCAST_UNBLOCK_SOURCE:
- case MCAST_JOIN_SOURCE_GROUP:
- case MCAST_LEAVE_SOURCE_GROUP:
- case IP_SEC_OPT:
- case IP_NEXTHOP:
- case IP_DHCPINIT_IF:
- /*
- * "soft" error (negative)
- * option not handled at this level
- * Do not modify *outlenp.
- */
- return (-EINVAL);
- case IP_BOUND_IF:
- if (!checkonly) {
- udp->udp_bound_if = *i1;
- PASS_OPT_TO_IP(connp);
- }
- break;
- case IP_UNSPEC_SRC:
- if (!checkonly) {
- udp->udp_unspec_source = onoff;
- PASS_OPT_TO_IP(connp);
- }
- break;
- case IP_BROADCAST_TTL:
- if (!checkonly)
- connp->conn_broadcast_ttl = *invalp;
- break;
- default:
- *outlenp = 0;
- return (EINVAL);
- }
- break;
- case IPPROTO_IPV6: {
- ip6_pkt_t *ipp;
- boolean_t sticky;
-
- if (udp->udp_family != AF_INET6) {
- *outlenp = 0;
- return (ENOPROTOOPT);
- }
- /*
- * Deal with both sticky options and ancillary data
- */
- sticky = B_FALSE;
- if (attrs == NULL || (ipp = attrs->udpattr_ipp6) ==
- NULL) {
- /* sticky options, or none */
- ipp = &udp->udp_sticky_ipp;
- sticky = B_TRUE;
- }
-
- switch (name) {
- case IPV6_MULTICAST_IF:
- if (!checkonly) {
- udp->udp_multicast_if_index = *i1;
- PASS_OPT_TO_IP(connp);
- }
- break;
- case IPV6_UNICAST_HOPS:
- /* -1 means use default */
- if (*i1 < -1 || *i1 > IPV6_MAX_HOPS) {
- *outlenp = 0;
- return (EINVAL);
- }
- if (!checkonly) {
- if (*i1 == -1) {
- udp->udp_ttl = ipp->ipp_unicast_hops =
- us->us_ipv6_hoplimit;
- ipp->ipp_fields &= ~IPPF_UNICAST_HOPS;
- /* Pass modified value to IP. */
- *i1 = udp->udp_ttl;
- } else {
- udp->udp_ttl = ipp->ipp_unicast_hops =
- (uint8_t)*i1;
- ipp->ipp_fields |= IPPF_UNICAST_HOPS;
- }
- /* Rebuild the header template */
- error = udp_build_hdrs(udp);
- if (error != 0) {
- *outlenp = 0;
- return (error);
- }
- }
- break;
- case IPV6_MULTICAST_HOPS:
- /* -1 means use default */
- if (*i1 < -1 || *i1 > IPV6_MAX_HOPS) {
- *outlenp = 0;
- return (EINVAL);
- }
- if (!checkonly) {
- if (*i1 == -1) {
- udp->udp_multicast_ttl =
- ipp->ipp_multicast_hops =
- IP_DEFAULT_MULTICAST_TTL;
- ipp->ipp_fields &= ~IPPF_MULTICAST_HOPS;
- /* Pass modified value to IP. */
- *i1 = udp->udp_multicast_ttl;
- } else {
- udp->udp_multicast_ttl =
- ipp->ipp_multicast_hops =
- (uint8_t)*i1;
- ipp->ipp_fields |= IPPF_MULTICAST_HOPS;
- }
- }
- break;
- case IPV6_MULTICAST_LOOP:
- if (*i1 != 0 && *i1 != 1) {
- *outlenp = 0;
- return (EINVAL);
- }
- if (!checkonly) {
- connp->conn_multicast_loop = *i1;
- PASS_OPT_TO_IP(connp);
- }
- break;
- case IPV6_JOIN_GROUP:
- case IPV6_LEAVE_GROUP:
- case MCAST_JOIN_GROUP:
- case MCAST_LEAVE_GROUP:
- case MCAST_BLOCK_SOURCE:
- case MCAST_UNBLOCK_SOURCE:
- case MCAST_JOIN_SOURCE_GROUP:
- case MCAST_LEAVE_SOURCE_GROUP:
- /*
- * "soft" error (negative)
- * option not handled at this level
- * Note: Do not modify *outlenp
- */
- return (-EINVAL);
- case IPV6_BOUND_IF:
- if (!checkonly) {
- udp->udp_bound_if = *i1;
- PASS_OPT_TO_IP(connp);
- }
- break;
- case IPV6_UNSPEC_SRC:
- if (!checkonly) {
- udp->udp_unspec_source = onoff;
- PASS_OPT_TO_IP(connp);
- }
- break;
- /*
- * Set boolean switches for ancillary data delivery
- */
- case IPV6_RECVPKTINFO:
- if (!checkonly) {
- udp->udp_ip_recvpktinfo = onoff;
- PASS_OPT_TO_IP(connp);
- }
- break;
- case IPV6_RECVTCLASS:
- if (!checkonly) {
- udp->udp_ipv6_recvtclass = onoff;
- PASS_OPT_TO_IP(connp);
- }
- break;
- case IPV6_RECVPATHMTU:
- if (!checkonly) {
- udp->udp_ipv6_recvpathmtu = onoff;
- PASS_OPT_TO_IP(connp);
- }
- break;
- case IPV6_RECVHOPLIMIT:
- if (!checkonly) {
- udp->udp_ipv6_recvhoplimit = onoff;
- PASS_OPT_TO_IP(connp);
- }
- break;
- case IPV6_RECVHOPOPTS:
- if (!checkonly) {
- udp->udp_ipv6_recvhopopts = onoff;
- PASS_OPT_TO_IP(connp);
- }
- break;
- case IPV6_RECVDSTOPTS:
- if (!checkonly) {
- udp->udp_ipv6_recvdstopts = onoff;
- PASS_OPT_TO_IP(connp);
- }
- break;
- case _OLD_IPV6_RECVDSTOPTS:
- if (!checkonly)
- udp->udp_old_ipv6_recvdstopts = onoff;
- break;
- case IPV6_RECVRTHDRDSTOPTS:
- if (!checkonly) {
- udp->udp_ipv6_recvrthdrdstopts = onoff;
- PASS_OPT_TO_IP(connp);
- }
- break;
- case IPV6_RECVRTHDR:
- if (!checkonly) {
- udp->udp_ipv6_recvrthdr = onoff;
- PASS_OPT_TO_IP(connp);
- }
- break;
- /*
- * Set sticky options or ancillary data.
- * If sticky options, (re)build any extension headers
- * that might be needed as a result.
- */
- case IPV6_PKTINFO:
- /*
- * The source address and ifindex are verified
- * in ip_opt_set(). For ancillary data the
- * source address is checked in ip_wput_v6.
- */
- if (inlen != 0 && inlen != sizeof (struct in6_pktinfo))
- return (EINVAL);
- if (checkonly)
- break;
-
- if (inlen == 0) {
- ipp->ipp_fields &= ~(IPPF_IFINDEX|IPPF_ADDR);
- ipp->ipp_sticky_ignored |=
- (IPPF_IFINDEX|IPPF_ADDR);
- } else {
- struct in6_pktinfo *pkti;
-
- pkti = (struct in6_pktinfo *)invalp;
- ipp->ipp_ifindex = pkti->ipi6_ifindex;
- ipp->ipp_addr = pkti->ipi6_addr;
- if (ipp->ipp_ifindex != 0)
- ipp->ipp_fields |= IPPF_IFINDEX;
- else
- ipp->ipp_fields &= ~IPPF_IFINDEX;
- if (!IN6_IS_ADDR_UNSPECIFIED(
- &ipp->ipp_addr))
- ipp->ipp_fields |= IPPF_ADDR;
- else
- ipp->ipp_fields &= ~IPPF_ADDR;
- }
- if (sticky) {
- error = udp_build_hdrs(udp);
- if (error != 0)
- return (error);
- PASS_OPT_TO_IP(connp);
- }
- break;
- case IPV6_HOPLIMIT:
- if (sticky)
- return (EINVAL);
- if (inlen != 0 && inlen != sizeof (int))
- return (EINVAL);
- if (checkonly)
- break;
-
- if (inlen == 0) {
- ipp->ipp_fields &= ~IPPF_HOPLIMIT;
- ipp->ipp_sticky_ignored |= IPPF_HOPLIMIT;
- } else {
- if (*i1 > 255 || *i1 < -1)
- return (EINVAL);
- if (*i1 == -1)
- ipp->ipp_hoplimit =
- us->us_ipv6_hoplimit;
- else
- ipp->ipp_hoplimit = *i1;
- ipp->ipp_fields |= IPPF_HOPLIMIT;
- }
- break;
- case IPV6_TCLASS:
- if (inlen != 0 && inlen != sizeof (int))
- return (EINVAL);
- if (checkonly)
- break;
-
- if (inlen == 0) {
- ipp->ipp_fields &= ~IPPF_TCLASS;
- ipp->ipp_sticky_ignored |= IPPF_TCLASS;
- } else {
- if (*i1 > 255 || *i1 < -1)
- return (EINVAL);
- if (*i1 == -1)
- ipp->ipp_tclass = 0;
- else
- ipp->ipp_tclass = *i1;
- ipp->ipp_fields |= IPPF_TCLASS;
- }
- if (sticky) {
- error = udp_build_hdrs(udp);
- if (error != 0)
- return (error);
- }
- break;
- case IPV6_NEXTHOP:
- /*
- * IP will verify that the nexthop is reachable
- * and fail for sticky options.
- */
- if (inlen != 0 && inlen != sizeof (sin6_t))
- return (EINVAL);
- if (checkonly)
- break;
-
- if (inlen == 0) {
- ipp->ipp_fields &= ~IPPF_NEXTHOP;
- ipp->ipp_sticky_ignored |= IPPF_NEXTHOP;
- } else {
- sin6_t *sin6 = (sin6_t *)invalp;
-
- if (sin6->sin6_family != AF_INET6) {
- return (EAFNOSUPPORT);
- }
- if (IN6_IS_ADDR_V4MAPPED(
- &sin6->sin6_addr))
- return (EADDRNOTAVAIL);
- ipp->ipp_nexthop = sin6->sin6_addr;
- if (!IN6_IS_ADDR_UNSPECIFIED(
- &ipp->ipp_nexthop))
- ipp->ipp_fields |= IPPF_NEXTHOP;
- else
- ipp->ipp_fields &= ~IPPF_NEXTHOP;
- }
- if (sticky) {
- error = udp_build_hdrs(udp);
- if (error != 0)
- return (error);
- PASS_OPT_TO_IP(connp);
- }
- break;
- case IPV6_HOPOPTS: {
- ip6_hbh_t *hopts = (ip6_hbh_t *)invalp;
- /*
- * Sanity checks - minimum size, size a multiple of
- * eight bytes, and matching size passed in.
- */
- if (inlen != 0 &&
- inlen != (8 * (hopts->ip6h_len + 1)))
- return (EINVAL);
-
- if (checkonly)
- break;
-
- error = optcom_pkt_set(invalp, inlen, sticky,
- (uchar_t **)&ipp->ipp_hopopts,
- &ipp->ipp_hopoptslen,
- sticky ? udp->udp_label_len_v6 : 0);
- if (error != 0)
- return (error);
- if (ipp->ipp_hopoptslen == 0) {
- ipp->ipp_fields &= ~IPPF_HOPOPTS;
- ipp->ipp_sticky_ignored |= IPPF_HOPOPTS;
- } else {
- ipp->ipp_fields |= IPPF_HOPOPTS;
- }
- if (sticky) {
- error = udp_build_hdrs(udp);
- if (error != 0)
- return (error);
- }
- break;
- }
- case IPV6_RTHDRDSTOPTS: {
- ip6_dest_t *dopts = (ip6_dest_t *)invalp;
-
- /*
- * Sanity checks - minimum size, size a multiple of
- * eight bytes, and matching size passed in.
- */
- if (inlen != 0 &&
- inlen != (8 * (dopts->ip6d_len + 1)))
- return (EINVAL);
-
- if (checkonly)
- break;
-
- if (inlen == 0) {
- if (sticky &&
- (ipp->ipp_fields & IPPF_RTDSTOPTS) != 0) {
- kmem_free(ipp->ipp_rtdstopts,
- ipp->ipp_rtdstoptslen);
- ipp->ipp_rtdstopts = NULL;
- ipp->ipp_rtdstoptslen = 0;
- }
-
- ipp->ipp_fields &= ~IPPF_RTDSTOPTS;
- ipp->ipp_sticky_ignored |= IPPF_RTDSTOPTS;
- } else {
- error = optcom_pkt_set(invalp, inlen, sticky,
- (uchar_t **)&ipp->ipp_rtdstopts,
- &ipp->ipp_rtdstoptslen, 0);
- if (error != 0)
- return (error);
- ipp->ipp_fields |= IPPF_RTDSTOPTS;
- }
- if (sticky) {
- error = udp_build_hdrs(udp);
- if (error != 0)
- return (error);
- }
- break;
- }
- case IPV6_DSTOPTS: {
- ip6_dest_t *dopts = (ip6_dest_t *)invalp;
-
- /*
- * Sanity checks - minimum size, size a multiple of
- * eight bytes, and matching size passed in.
- */
- if (inlen != 0 &&
- inlen != (8 * (dopts->ip6d_len + 1)))
- return (EINVAL);
-
- if (checkonly)
- break;
-
- if (inlen == 0) {
- if (sticky &&
- (ipp->ipp_fields & IPPF_DSTOPTS) != 0) {
- kmem_free(ipp->ipp_dstopts,
- ipp->ipp_dstoptslen);
- ipp->ipp_dstopts = NULL;
- ipp->ipp_dstoptslen = 0;
- }
- ipp->ipp_fields &= ~IPPF_DSTOPTS;
- ipp->ipp_sticky_ignored |= IPPF_DSTOPTS;
- } else {
- error = optcom_pkt_set(invalp, inlen, sticky,
- (uchar_t **)&ipp->ipp_dstopts,
- &ipp->ipp_dstoptslen, 0);
- if (error != 0)
- return (error);
- ipp->ipp_fields |= IPPF_DSTOPTS;
- }
- if (sticky) {
- error = udp_build_hdrs(udp);
- if (error != 0)
- return (error);
- }
- break;
- }
- case IPV6_RTHDR: {
- ip6_rthdr_t *rt = (ip6_rthdr_t *)invalp;
-
- /*
- * Sanity checks - minimum size, size a multiple of
- * eight bytes, and matching size passed in.
- */
- if (inlen != 0 &&
- inlen != (8 * (rt->ip6r_len + 1)))
- return (EINVAL);
-
- if (checkonly)
- break;
-
- if (inlen == 0) {
- if (sticky &&
- (ipp->ipp_fields & IPPF_RTHDR) != 0) {
- kmem_free(ipp->ipp_rthdr,
- ipp->ipp_rthdrlen);
- ipp->ipp_rthdr = NULL;
- ipp->ipp_rthdrlen = 0;
- }
- ipp->ipp_fields &= ~IPPF_RTHDR;
- ipp->ipp_sticky_ignored |= IPPF_RTHDR;
- } else {
- error = optcom_pkt_set(invalp, inlen, sticky,
- (uchar_t **)&ipp->ipp_rthdr,
- &ipp->ipp_rthdrlen, 0);
- if (error != 0)
- return (error);
- ipp->ipp_fields |= IPPF_RTHDR;
- }
- if (sticky) {
- error = udp_build_hdrs(udp);
- if (error != 0)
- return (error);
+ ip_xmit_attr_replace_tsl(ixa, newcr->cr_label);
+ ixa->ixa_flags |= IXAF_UCRED_TSL;
+ newcr->cr_label = NULL;
+ crfree(newcr);
+ coa->coa_changed |= COA_HEADER_CHANGED;
+ coa->coa_changed |= COA_WROFF_CHANGED;
}
- break;
+ /* Fully handled this option. */
+ return (0);
}
-
- case IPV6_DONTFRAG:
- if (checkonly)
- break;
-
- if (onoff) {
- ipp->ipp_fields |= IPPF_DONTFRAG;
- } else {
- ipp->ipp_fields &= ~IPPF_DONTFRAG;
- }
- break;
-
- case IPV6_USE_MIN_MTU:
- if (inlen != sizeof (int))
- return (EINVAL);
-
- if (*i1 < -1 || *i1 > 1)
- return (EINVAL);
-
- if (checkonly)
- break;
-
- ipp->ipp_fields |= IPPF_USE_MIN_MTU;
- ipp->ipp_use_min_mtu = *i1;
- break;
-
- case IPV6_SEC_OPT:
- case IPV6_SRC_PREFERENCES:
- case IPV6_V6ONLY:
- /* Handled at the IP level */
- return (-EINVAL);
- default:
- *outlenp = 0;
- return (EINVAL);
}
break;
- } /* end IPPROTO_IPV6 */
case IPPROTO_UDP:
switch (name) {
- case UDP_ANONPRIVBIND:
- if ((error = secpolicy_net_privaddr(cr, 0,
- IPPROTO_UDP)) != 0) {
- *outlenp = 0;
- return (error);
- }
- if (!checkonly) {
- udp->udp_anon_priv_bind = onoff;
- }
- break;
- case UDP_EXCLBIND:
- if (!checkonly)
- udp->udp_exclbind = onoff;
- break;
- case UDP_RCVHDR:
- if (!checkonly)
- udp->udp_rcvhdr = onoff;
- break;
case UDP_NAT_T_ENDPOINT:
if ((error = secpolicy_ip_config(cr, B_FALSE)) != 0) {
- *outlenp = 0;
return (error);
}
/*
- * Use udp_family instead so we can avoid ambiguitites
+	 * Use conn_family instead so we can avoid ambiguities
* with AF_INET6 sockets that may switch from IPv4
* to IPv6.
*/
- if (udp->udp_family != AF_INET) {
- *outlenp = 0;
+ if (connp->conn_family != AF_INET) {
return (EAFNOSUPPORT);
}
if (!checkonly) {
- int size;
-
+ mutex_enter(&connp->conn_lock);
udp->udp_nat_t_endpoint = onoff;
-
- udp->udp_max_hdr_len = IP_SIMPLE_HDR_LENGTH +
- UDPH_SIZE + udp->udp_ip_snd_options_len;
-
- /* Also, adjust wroff */
- if (onoff) {
- udp->udp_max_hdr_len +=
- sizeof (uint32_t);
- }
- size = udp->udp_max_hdr_len +
- us->us_wroff_extra;
- (void) proto_set_tx_wroff(connp->conn_rq, connp,
- size);
+ mutex_exit(&connp->conn_lock);
+ coa->coa_changed |= COA_HEADER_CHANGED;
+ coa->coa_changed |= COA_WROFF_CHANGED;
}
- break;
- default:
- *outlenp = 0;
- return (EINVAL);
+ /* Fully handled this option. */
+ return (0);
+ case UDP_RCVHDR:
+ mutex_enter(&connp->conn_lock);
+ udp->udp_rcvhdr = onoff;
+ mutex_exit(&connp->conn_lock);
+ return (0);
}
break;
- default:
- *outlenp = 0;
- return (EINVAL);
- }
- /*
- * Common case of OK return with outval same as inval.
- */
- if (invalp != outvalp) {
- /* don't trust bcopy for identical src/dst */
- (void) bcopy(invalp, outvalp, inlen);
}
- *outlenp = inlen;
- return (0);
+ error = conn_opt_set(coa, level, name, inlen, invalp,
+ checkonly, cr);
+ return (error);
}
+/*
+ * This routine sets socket options.
+ */
int
-udp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
- uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
- void *thisdg_attrs, cred_t *cr)
+udp_opt_set(conn_t *connp, uint_t optset_context, int level,
+ int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp,
+ uchar_t *outvalp, void *thisdg_attrs, cred_t *cr)
{
- int error;
+ udp_t *udp = connp->conn_udp;
+ int err;
+ conn_opt_arg_t coas, *coa;
boolean_t checkonly;
+ udp_stack_t *us = udp->udp_us;
- error = 0;
switch (optset_context) {
case SETFN_OPTCOM_CHECKONLY:
checkonly = B_TRUE;
@@ -3056,7 +1878,7 @@ udp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
*/
if (inlen == 0) {
*outlenp = 0;
- goto done;
+ return (0);
}
break;
case SETFN_OPTCOM_NEGOTIATE:
@@ -3074,8 +1896,7 @@ udp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
*/
if (!udp_opt_allow_udr_set(level, name)) {
*outlenp = 0;
- error = EINVAL;
- goto done;
+ return (EINVAL);
}
break;
default:
@@ -3083,99 +1904,326 @@ udp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
* We should never get here
*/
*outlenp = 0;
- error = EINVAL;
- goto done;
+ return (EINVAL);
}
ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
(optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
- error = udp_do_opt_set(connp, level, name, inlen, invalp, outlenp,
- outvalp, cr, thisdg_attrs, checkonly);
-done:
- return (error);
+ if (thisdg_attrs != NULL) {
+ /* Options from T_UNITDATA_REQ */
+ coa = (conn_opt_arg_t *)thisdg_attrs;
+ ASSERT(coa->coa_connp == connp);
+ ASSERT(coa->coa_ixa != NULL);
+ ASSERT(coa->coa_ipp != NULL);
+ ASSERT(coa->coa_ancillary);
+ } else {
+ coa = &coas;
+ coas.coa_connp = connp;
+ /* Get a reference on conn_ixa to prevent concurrent mods */
+ coas.coa_ixa = conn_get_ixa(connp, B_TRUE);
+ if (coas.coa_ixa == NULL) {
+ *outlenp = 0;
+ return (ENOMEM);
+ }
+ coas.coa_ipp = &connp->conn_xmit_ipp;
+ coas.coa_ancillary = B_FALSE;
+ coas.coa_changed = 0;
+ }
+
+ err = udp_do_opt_set(coa, level, name, inlen, invalp,
+ cr, checkonly);
+ if (err != 0) {
+errout:
+ if (!coa->coa_ancillary)
+ ixa_refrele(coa->coa_ixa);
+ *outlenp = 0;
+ return (err);
+ }
+ /* Handle DHCPINIT here outside of lock */
+ if (level == IPPROTO_IP && name == IP_DHCPINIT_IF) {
+ uint_t ifindex;
+ ill_t *ill;
+
+ ifindex = *(uint_t *)invalp;
+ if (ifindex == 0) {
+ ill = NULL;
+ } else {
+ ill = ill_lookup_on_ifindex(ifindex, B_FALSE,
+ coa->coa_ixa->ixa_ipst);
+ if (ill == NULL) {
+ err = ENXIO;
+ goto errout;
+ }
+
+ mutex_enter(&ill->ill_lock);
+ if (ill->ill_state_flags & ILL_CONDEMNED) {
+ mutex_exit(&ill->ill_lock);
+ ill_refrele(ill);
+ err = ENXIO;
+ goto errout;
+ }
+ if (IS_VNI(ill)) {
+ mutex_exit(&ill->ill_lock);
+ ill_refrele(ill);
+ err = EINVAL;
+ goto errout;
+ }
+ }
+ mutex_enter(&connp->conn_lock);
+
+ if (connp->conn_dhcpinit_ill != NULL) {
+ /*
+ * We've locked the conn so conn_cleanup_ill()
+ * cannot clear conn_dhcpinit_ill -- so it's
+ * safe to access the ill.
+ */
+ ill_t *oill = connp->conn_dhcpinit_ill;
+
+ ASSERT(oill->ill_dhcpinit != 0);
+ atomic_dec_32(&oill->ill_dhcpinit);
+ ill_set_inputfn(connp->conn_dhcpinit_ill);
+ connp->conn_dhcpinit_ill = NULL;
+ }
+
+ if (ill != NULL) {
+ connp->conn_dhcpinit_ill = ill;
+ atomic_inc_32(&ill->ill_dhcpinit);
+ ill_set_inputfn(ill);
+ mutex_exit(&connp->conn_lock);
+ mutex_exit(&ill->ill_lock);
+ ill_refrele(ill);
+ } else {
+ mutex_exit(&connp->conn_lock);
+ }
+ }
+
+ /*
+ * Common case of OK return with outval same as inval.
+ */
+ if (invalp != outvalp) {
+ /* don't trust bcopy for identical src/dst */
+ (void) bcopy(invalp, outvalp, inlen);
+ }
+ *outlenp = inlen;
+
+ /*
+ * If this was not ancillary data, then we rebuild the headers,
+ * update the IRE/NCE, and IPsec as needed.
+ * Since the label depends on the destination we go through
+ * ip_set_destination first.
+ */
+ if (coa->coa_ancillary) {
+ return (0);
+ }
+
+ if (coa->coa_changed & COA_ROUTE_CHANGED) {
+ in6_addr_t saddr, faddr, nexthop;
+ in_port_t fport;
+
+ /*
+ * We clear lastdst to make sure we pick up the change
+ * next time sending.
+ * If we are connected we re-cache the information.
+ * We ignore errors to preserve BSD behavior.
+ * Note that we don't redo IPsec policy lookup here
+ * since the final destination (or source) didn't change.
+ */
+ mutex_enter(&connp->conn_lock);
+ connp->conn_v6lastdst = ipv6_all_zeros;
+
+ ip_attr_nexthop(coa->coa_ipp, coa->coa_ixa,
+ &connp->conn_faddr_v6, &nexthop);
+ saddr = connp->conn_saddr_v6;
+ faddr = connp->conn_faddr_v6;
+ fport = connp->conn_fport;
+ mutex_exit(&connp->conn_lock);
+
+ if (!IN6_IS_ADDR_UNSPECIFIED(&faddr) &&
+ !IN6_IS_ADDR_V4MAPPED_ANY(&faddr)) {
+ (void) ip_attr_connect(connp, coa->coa_ixa,
+ &saddr, &faddr, &nexthop, fport, NULL, NULL,
+ IPDF_ALLOW_MCBC | IPDF_VERIFY_DST);
+ }
+ }
+
+ ixa_refrele(coa->coa_ixa);
+
+ if (coa->coa_changed & COA_HEADER_CHANGED) {
+ /*
+ * Rebuild the header template if we are connected.
+ * Otherwise clear conn_v6lastdst so we rebuild the header
+ * in the data path.
+ */
+ mutex_enter(&connp->conn_lock);
+ if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
+ !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
+ err = udp_build_hdr_template(connp,
+ &connp->conn_saddr_v6, &connp->conn_faddr_v6,
+ connp->conn_fport, connp->conn_flowinfo);
+ if (err != 0) {
+ mutex_exit(&connp->conn_lock);
+ return (err);
+ }
+ } else {
+ connp->conn_v6lastdst = ipv6_all_zeros;
+ }
+ mutex_exit(&connp->conn_lock);
+ }
+ if (coa->coa_changed & COA_RCVBUF_CHANGED) {
+ (void) proto_set_rx_hiwat(connp->conn_rq, connp,
+ connp->conn_rcvbuf);
+ }
+ if ((coa->coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) {
+ connp->conn_wq->q_hiwat = connp->conn_sndbuf;
+ }
+ if (coa->coa_changed & COA_WROFF_CHANGED) {
+ /* Increase wroff if needed */
+ uint_t wroff;
+
+ mutex_enter(&connp->conn_lock);
+ wroff = connp->conn_ht_iphc_allocated + us->us_wroff_extra;
+ if (udp->udp_nat_t_endpoint)
+ wroff += sizeof (uint32_t);
+ if (wroff > connp->conn_wroff) {
+ connp->conn_wroff = wroff;
+ mutex_exit(&connp->conn_lock);
+ (void) proto_set_tx_wroff(connp->conn_rq, connp, wroff);
+ } else {
+ mutex_exit(&connp->conn_lock);
+ }
+ }
+ return (err);
}
-/* ARGSUSED */
+/* This routine sets socket options. */
int
udp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name,
uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
- void *thisdg_attrs, cred_t *cr, mblk_t *mblk)
+ void *thisdg_attrs, cred_t *cr)
{
- conn_t *connp = Q_TO_CONN(q);
+ conn_t *connp = Q_TO_CONN(q);
int error;
- udp_t *udp = connp->conn_udp;
- rw_enter(&udp->udp_rwlock, RW_WRITER);
error = udp_opt_set(connp, optset_context, level, name, inlen, invalp,
outlenp, outvalp, thisdg_attrs, cr);
- rw_exit(&udp->udp_rwlock);
return (error);
}
/*
- * Update udp_sticky_hdrs based on udp_sticky_ipp, udp_v6src, and udp_ttl.
- * The headers include ip6i_t (if needed), ip6_t, any sticky extension
- * headers, and the udp header.
- * Returns failure if can't allocate memory.
+ * Setup IP and UDP headers.
+ * Returns NULL on allocation failure, in which case data_mp is freed.
*/
-static int
-udp_build_hdrs(udp_t *udp)
+mblk_t *
+udp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp,
+ const in6_addr_t *v6src, const in6_addr_t *v6dst, in_port_t dstport,
+ uint32_t flowinfo, mblk_t *data_mp, int *errorp)
{
- udp_stack_t *us = udp->udp_us;
- uchar_t *hdrs;
- uint_t hdrs_len;
- ip6_t *ip6h;
- ip6i_t *ip6i;
- udpha_t *udpha;
- ip6_pkt_t *ipp = &udp->udp_sticky_ipp;
- size_t sth_wroff;
- conn_t *connp = udp->udp_connp;
-
- ASSERT(RW_WRITE_HELD(&udp->udp_rwlock));
- ASSERT(connp != NULL);
+ mblk_t *mp;
+ udpha_t *udpha;
+ udp_stack_t *us = connp->conn_netstack->netstack_udp;
+ uint_t data_len;
+ uint32_t cksum;
+ udp_t *udp = connp->conn_udp;
+ boolean_t insert_spi = udp->udp_nat_t_endpoint;
+ uint_t ulp_hdr_len;
- hdrs_len = ip_total_hdrs_len_v6(ipp) + UDPH_SIZE;
- ASSERT(hdrs_len != 0);
- if (hdrs_len != udp->udp_sticky_hdrs_len) {
- /* Need to reallocate */
- hdrs = kmem_alloc(hdrs_len, KM_NOSLEEP);
- if (hdrs == NULL)
- return (ENOMEM);
+ data_len = msgdsize(data_mp);
+ ulp_hdr_len = UDPH_SIZE;
+ if (insert_spi)
+ ulp_hdr_len += sizeof (uint32_t);
- if (udp->udp_sticky_hdrs_len != 0) {
- kmem_free(udp->udp_sticky_hdrs,
- udp->udp_sticky_hdrs_len);
- }
- udp->udp_sticky_hdrs = hdrs;
- udp->udp_sticky_hdrs_len = hdrs_len;
+ mp = conn_prepend_hdr(ixa, ipp, v6src, v6dst, IPPROTO_UDP, flowinfo,
+ ulp_hdr_len, data_mp, data_len, us->us_wroff_extra, &cksum, errorp);
+ if (mp == NULL) {
+ ASSERT(*errorp != 0);
+ return (NULL);
}
- ip_build_hdrs_v6(udp->udp_sticky_hdrs,
- udp->udp_sticky_hdrs_len - UDPH_SIZE, ipp, IPPROTO_UDP);
- /* Set header fields not in ipp */
- if (ipp->ipp_fields & IPPF_HAS_IP6I) {
- ip6i = (ip6i_t *)udp->udp_sticky_hdrs;
- ip6h = (ip6_t *)&ip6i[1];
+ data_len += ulp_hdr_len;
+ ixa->ixa_pktlen = data_len + ixa->ixa_ip_hdr_length;
+
+ udpha = (udpha_t *)(mp->b_rptr + ixa->ixa_ip_hdr_length);
+ udpha->uha_src_port = connp->conn_lport;
+ udpha->uha_dst_port = dstport;
+ udpha->uha_checksum = 0;
+ udpha->uha_length = htons(data_len);
+
+ /*
+ * If there was a routing option/header then conn_prepend_hdr
+ * has massaged it and placed the pseudo-header checksum difference
+ * in the cksum argument.
+ *
+ * Setup header length and prepare for ULP checksum done in IP.
+ *
+ * We make it easy for IP to include our pseudo header
+ * by putting our length in uha_checksum.
+ * The IP source, destination, and length have already been set by
+ * conn_prepend_hdr.
+ */
+ cksum += data_len;
+ cksum = (cksum >> 16) + (cksum & 0xFFFF);
+ ASSERT(cksum < 0x10000);
+
+ if (ixa->ixa_flags & IXAF_IS_IPV4) {
+ ipha_t *ipha = (ipha_t *)mp->b_rptr;
+
+ ASSERT(ntohs(ipha->ipha_length) == ixa->ixa_pktlen);
+
+ /* IP does the checksum if uha_checksum is non-zero */
+ if (us->us_do_checksum) {
+ if (cksum == 0)
+ udpha->uha_checksum = 0xffff;
+ else
+ udpha->uha_checksum = htons(cksum);
+ } else {
+ udpha->uha_checksum = 0;
+ }
} else {
- ip6h = (ip6_t *)udp->udp_sticky_hdrs;
+ ip6_t *ip6h = (ip6_t *)mp->b_rptr;
+
+ ASSERT(ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN == ixa->ixa_pktlen);
+ if (cksum == 0)
+ udpha->uha_checksum = 0xffff;
+ else
+ udpha->uha_checksum = htons(cksum);
}
- if (!(ipp->ipp_fields & IPPF_ADDR))
- ip6h->ip6_src = udp->udp_v6src;
+ /* Insert all-0s SPI now. */
+ if (insert_spi)
+ *((uint32_t *)(udpha + 1)) = 0;
- udpha = (udpha_t *)(udp->udp_sticky_hdrs + hdrs_len - UDPH_SIZE);
- udpha->uha_src_port = udp->udp_port;
+ return (mp);
+}
- /* Try to get everything in a single mblk */
- if (hdrs_len > udp->udp_max_hdr_len) {
- udp->udp_max_hdr_len = hdrs_len;
- sth_wroff = udp->udp_max_hdr_len + us->us_wroff_extra;
- rw_exit(&udp->udp_rwlock);
- (void) proto_set_tx_wroff(udp->udp_connp->conn_rq,
- udp->udp_connp, sth_wroff);
- rw_enter(&udp->udp_rwlock, RW_WRITER);
- }
+static int
+udp_build_hdr_template(conn_t *connp, const in6_addr_t *v6src,
+ const in6_addr_t *v6dst, in_port_t dstport, uint32_t flowinfo)
+{
+ udpha_t *udpha;
+ int error;
+
+ ASSERT(MUTEX_HELD(&connp->conn_lock));
+ /*
+ * We clear lastdst to make sure we don't use the lastdst path
+ * next time sending since we might not have set v6dst yet.
+ */
+ connp->conn_v6lastdst = ipv6_all_zeros;
+
+ error = conn_build_hdr_template(connp, UDPH_SIZE, 0, v6src, v6dst,
+ flowinfo);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Any routing header/option has been massaged. The checksum difference
+ * is stored in conn_sum.
+ */
+ udpha = (udpha_t *)connp->conn_ht_ulp;
+ udpha->uha_src_port = connp->conn_lport;
+ udpha->uha_dst_port = dstport;
+ udpha->uha_checksum = 0;
+ udpha->uha_length = htons(UDPH_SIZE); /* Filled in later */
return (0);
}
@@ -3252,189 +2300,6 @@ udp_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr)
return (0);
}
-/*
- * Copy hop-by-hop option from ipp->ipp_hopopts to the buffer provided (with
- * T_opthdr) and return the number of bytes copied. 'dbuf' may be NULL to
- * just count the length needed for allocation. If 'dbuf' is non-NULL,
- * then it's assumed to be allocated to be large enough.
- *
- * Returns zero if trimming of the security option causes all options to go
- * away.
- */
-static size_t
-copy_hop_opts(const ip6_pkt_t *ipp, uchar_t *dbuf)
-{
- struct T_opthdr *toh;
- size_t hol = ipp->ipp_hopoptslen;
- ip6_hbh_t *dstopt = NULL;
- const ip6_hbh_t *srcopt = ipp->ipp_hopopts;
- size_t tlen, olen, plen;
- boolean_t deleting;
- const struct ip6_opt *sopt, *lastpad;
- struct ip6_opt *dopt;
-
- if ((toh = (struct T_opthdr *)dbuf) != NULL) {
- toh->level = IPPROTO_IPV6;
- toh->name = IPV6_HOPOPTS;
- toh->status = 0;
- dstopt = (ip6_hbh_t *)(toh + 1);
- }
-
- /*
- * If labeling is enabled, then skip the label option
- * but get other options if there are any.
- */
- if (is_system_labeled()) {
- dopt = NULL;
- if (dstopt != NULL) {
- /* will fill in ip6h_len later */
- dstopt->ip6h_nxt = srcopt->ip6h_nxt;
- dopt = (struct ip6_opt *)(dstopt + 1);
- }
- sopt = (const struct ip6_opt *)(srcopt + 1);
- hol -= sizeof (*srcopt);
- tlen = sizeof (*dstopt);
- lastpad = NULL;
- deleting = B_FALSE;
- /*
- * This loop finds the first (lastpad pointer) of any number of
- * pads that preceeds the security option, then treats the
- * security option as though it were a pad, and then finds the
- * next non-pad option (or end of list).
- *
- * It then treats the entire block as one big pad. To preserve
- * alignment of any options that follow, or just the end of the
- * list, it computes a minimal new padding size that keeps the
- * same alignment for the next option.
- *
- * If it encounters just a sequence of pads with no security
- * option, those are copied as-is rather than collapsed.
- *
- * Note that to handle the end of list case, the code makes one
- * loop with 'hol' set to zero.
- */
- for (;;) {
- if (hol > 0) {
- if (sopt->ip6o_type == IP6OPT_PAD1) {
- if (lastpad == NULL)
- lastpad = sopt;
- sopt = (const struct ip6_opt *)
- &sopt->ip6o_len;
- hol--;
- continue;
- }
- olen = sopt->ip6o_len + sizeof (*sopt);
- if (olen > hol)
- olen = hol;
- if (sopt->ip6o_type == IP6OPT_PADN ||
- sopt->ip6o_type == ip6opt_ls) {
- if (sopt->ip6o_type == ip6opt_ls)
- deleting = B_TRUE;
- if (lastpad == NULL)
- lastpad = sopt;
- sopt = (const struct ip6_opt *)
- ((const char *)sopt + olen);
- hol -= olen;
- continue;
- }
- } else {
- /* if nothing was copied at all, then delete */
- if (tlen == sizeof (*dstopt))
- return (0);
- /* last pass; pick up any trailing padding */
- olen = 0;
- }
- if (deleting) {
- /*
- * compute aligning effect of deleted material
- * to reproduce with pad.
- */
- plen = ((const char *)sopt -
- (const char *)lastpad) & 7;
- tlen += plen;
- if (dopt != NULL) {
- if (plen == 1) {
- dopt->ip6o_type = IP6OPT_PAD1;
- } else if (plen > 1) {
- plen -= sizeof (*dopt);
- dopt->ip6o_type = IP6OPT_PADN;
- dopt->ip6o_len = plen;
- if (plen > 0)
- bzero(dopt + 1, plen);
- }
- dopt = (struct ip6_opt *)
- ((char *)dopt + plen);
- }
- deleting = B_FALSE;
- lastpad = NULL;
- }
- /* if there's uncopied padding, then copy that now */
- if (lastpad != NULL) {
- olen += (const char *)sopt -
- (const char *)lastpad;
- sopt = lastpad;
- lastpad = NULL;
- }
- if (dopt != NULL && olen > 0) {
- bcopy(sopt, dopt, olen);
- dopt = (struct ip6_opt *)((char *)dopt + olen);
- }
- if (hol == 0)
- break;
- tlen += olen;
- sopt = (const struct ip6_opt *)
- ((const char *)sopt + olen);
- hol -= olen;
- }
- /* go back and patch up the length value, rounded upward */
- if (dstopt != NULL)
- dstopt->ip6h_len = (tlen - 1) >> 3;
- } else {
- tlen = hol;
- if (dstopt != NULL)
- bcopy(srcopt, dstopt, hol);
- }
-
- tlen += sizeof (*toh);
- if (toh != NULL)
- toh->len = tlen;
-
- return (tlen);
-}
-
-/*
- * Update udp_rcv_opt_len from the packet.
- * Called when options received, and when no options received but
- * udp_ip_recv_opt_len has previously recorded options.
- */
-static void
-udp_save_ip_rcv_opt(udp_t *udp, void *opt, int opt_len)
-{
- /* Save the options if any */
- if (opt_len > 0) {
- if (opt_len > udp->udp_ip_rcv_options_len) {
- /* Need to allocate larger buffer */
- if (udp->udp_ip_rcv_options_len != 0)
- mi_free((char *)udp->udp_ip_rcv_options);
- udp->udp_ip_rcv_options_len = 0;
- udp->udp_ip_rcv_options =
- (uchar_t *)mi_alloc(opt_len, BPRI_HI);
- if (udp->udp_ip_rcv_options != NULL)
- udp->udp_ip_rcv_options_len = opt_len;
- }
- if (udp->udp_ip_rcv_options_len != 0) {
- bcopy(opt, udp->udp_ip_rcv_options, opt_len);
- /* Adjust length if we are resusing the space */
- udp->udp_ip_rcv_options_len = opt_len;
- }
- } else if (udp->udp_ip_rcv_options_len != 0) {
- /* Clear out previously recorded options */
- mi_free((char *)udp->udp_ip_rcv_options);
- udp->udp_ip_rcv_options = NULL;
- udp->udp_ip_rcv_options_len = 0;
- }
-}
-
static mblk_t *
udp_queue_fallback(udp_t *udp, mblk_t *mp)
{
@@ -3466,15 +2331,15 @@ udp_queue_fallback(udp_t *udp, mblk_t *mp)
* TPI, then we'll queue the mp for later processing.
*/
static void
-udp_ulp_recv(conn_t *connp, mblk_t *mp)
+udp_ulp_recv(conn_t *connp, mblk_t *mp, uint_t len, ip_recv_attr_t *ira)
{
if (IPCL_IS_NONSTR(connp)) {
udp_t *udp = connp->conn_udp;
int error;
+ ASSERT(len == msgdsize(mp));
if ((*connp->conn_upcalls->su_recv)
- (connp->conn_upper_handle, mp, msgdsize(mp), 0, &error,
- NULL) < 0) {
+ (connp->conn_upper_handle, mp, len, 0, &error, NULL) < 0) {
mutex_enter(&udp->udp_recv_lock);
if (error == ENOSPC) {
/*
@@ -3500,282 +2365,170 @@ udp_ulp_recv(conn_t *connp, mblk_t *mp)
}
ASSERT(MUTEX_NOT_HELD(&udp->udp_recv_lock));
} else {
+ if (is_system_labeled()) {
+ ASSERT(ira->ira_cred != NULL);
+ /*
+ * Provide for protocols above UDP such as RPC
+ * NOPID leaves db_cpid unchanged.
+ */
+ mblk_setcred(mp, ira->ira_cred, NOPID);
+ }
+
putnext(connp->conn_rq, mp);
}
}
+/*
+ * This is the inbound data path.
+ * IP has already pulled up the IP plus UDP headers and verified alignment
+ * etc.
+ */
/* ARGSUSED2 */
static void
-udp_input(void *arg1, mblk_t *mp, void *arg2)
+udp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
{
- conn_t *connp = (conn_t *)arg1;
+ conn_t *connp = (conn_t *)arg1;
struct T_unitdata_ind *tudi;
uchar_t *rptr; /* Pointer to IP header */
int hdr_length; /* Length of IP+UDP headers */
- int opt_len;
int udi_size; /* Size of T_unitdata_ind */
- int mp_len;
+ int pkt_len;
udp_t *udp;
udpha_t *udpha;
- int ipversion;
- ip6_pkt_t ipp;
+ ip_pkt_t ipps;
ip6_t *ip6h;
- ip6i_t *ip6i;
mblk_t *mp1;
- mblk_t *options_mp = NULL;
- ip_pktinfo_t *pinfo = NULL;
- cred_t *cr = NULL;
- pid_t cpid;
- uint32_t udp_ip_rcv_options_len;
- udp_bits_t udp_bits;
- cred_t *rcr = connp->conn_cred;
- udp_stack_t *us;
+ uint32_t udp_ipv4_options_len;
+ crb_t recv_ancillary;
+ udp_stack_t *us;
ASSERT(connp->conn_flags & IPCL_UDPCONN);
udp = connp->conn_udp;
us = udp->udp_us;
rptr = mp->b_rptr;
- ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_CTL);
+
+ ASSERT(DB_TYPE(mp) == M_DATA);
ASSERT(OK_32PTR(rptr));
+ ASSERT(ira->ira_pktlen == msgdsize(mp));
+ pkt_len = ira->ira_pktlen;
/*
- * IP should have prepended the options data in an M_CTL
- * Check M_CTL "type" to make sure are not here bcos of
- * a valid ICMP message
+ * Get a snapshot of these and allow other threads to change
+ * them after that. We need the same recv_ancillary when determining
+ * the size as when adding the ancillary data items.
*/
- if (DB_TYPE(mp) == M_CTL) {
- if (MBLKL(mp) == sizeof (ip_pktinfo_t) &&
- ((ip_pktinfo_t *)mp->b_rptr)->ip_pkt_ulp_type ==
- IN_PKTINFO) {
- /*
- * IP_RECVIF or IP_RECVSLLA or IPF_RECVADDR information
- * has been prepended to the packet by IP. We need to
- * extract the mblk and adjust the rptr
- */
- pinfo = (ip_pktinfo_t *)mp->b_rptr;
- options_mp = mp;
- mp = mp->b_cont;
- rptr = mp->b_rptr;
- UDP_STAT(us, udp_in_pktinfo);
- } else {
- /*
- * ICMP messages.
- */
- udp_icmp_error(connp, mp);
- return;
- }
- }
+ mutex_enter(&connp->conn_lock);
+ udp_ipv4_options_len = udp->udp_recv_ipp.ipp_ipv4_options_len;
+ recv_ancillary = connp->conn_recv_ancillary;
+ mutex_exit(&connp->conn_lock);
+
+ hdr_length = ira->ira_ip_hdr_length;
- mp_len = msgdsize(mp);
/*
- * This is the inbound data path.
- * First, we check to make sure the IP version number is correct,
- * and then pull the IP and UDP headers into the first mblk.
+ * IP inspected the UDP header thus all of it must be in the mblk.
+ * UDP length check is performed for IPv6 packets and IPv4 packets
+ * to check if the size of the packet as specified
+ * by the UDP header is the same as the length derived from the IP
+ * header.
*/
+ udpha = (udpha_t *)(rptr + hdr_length);
+ if (pkt_len != ntohs(udpha->uha_length) + hdr_length)
+ goto tossit;
- /* Initialize regardless if ipversion is IPv4 or IPv6 */
- ipp.ipp_fields = 0;
+ hdr_length += UDPH_SIZE;
+ ASSERT(MBLKL(mp) >= hdr_length); /* IP did a pullup */
- ipversion = IPH_HDR_VERSION(rptr);
+ /* Initialize regardless of IP version */
+ ipps.ipp_fields = 0;
- rw_enter(&udp->udp_rwlock, RW_READER);
- udp_ip_rcv_options_len = udp->udp_ip_rcv_options_len;
- udp_bits = udp->udp_bits;
- rw_exit(&udp->udp_rwlock);
+ if (((ira->ira_flags & IRAF_IPV4_OPTIONS) ||
+ udp_ipv4_options_len > 0) &&
+ connp->conn_family == AF_INET) {
+ int err;
- switch (ipversion) {
- case IPV4_VERSION:
- ASSERT(MBLKL(mp) >= sizeof (ipha_t));
- ASSERT(((ipha_t *)rptr)->ipha_protocol == IPPROTO_UDP);
- hdr_length = IPH_HDR_LENGTH(rptr) + UDPH_SIZE;
- opt_len = hdr_length - (IP_SIMPLE_HDR_LENGTH + UDPH_SIZE);
- if ((opt_len > 0 || udp_ip_rcv_options_len > 0) &&
- udp->udp_family == AF_INET) {
- /*
- * Record/update udp_ip_rcv_options with the lock
- * held. Not needed for AF_INET6 sockets
- * since they don't support a getsockopt of IP_OPTIONS.
- */
- rw_enter(&udp->udp_rwlock, RW_WRITER);
- udp_save_ip_rcv_opt(udp, rptr + IP_SIMPLE_HDR_LENGTH,
- opt_len);
- rw_exit(&udp->udp_rwlock);
- }
- /* Handle IPV6_RECVPKTINFO even for IPv4 packet. */
- if ((udp->udp_family == AF_INET6) && (pinfo != NULL) &&
- udp->udp_ip_recvpktinfo) {
- if (pinfo->ip_pkt_flags & IPF_RECVIF) {
- ipp.ipp_fields |= IPPF_IFINDEX;
- ipp.ipp_ifindex = pinfo->ip_pkt_ifindex;
- }
- }
- break;
- case IPV6_VERSION:
/*
- * IPv6 packets can only be received by applications
- * that are prepared to receive IPv6 addresses.
- * The IP fanout must ensure this.
+ * Record/update udp_recv_ipp with the lock
+ * held. Not needed for AF_INET6 sockets
+ * since they don't support a getsockopt of IP_OPTIONS.
*/
- ASSERT(udp->udp_family == AF_INET6);
+ mutex_enter(&connp->conn_lock);
+ err = ip_find_hdr_v4((ipha_t *)rptr, &udp->udp_recv_ipp,
+ B_TRUE);
+ if (err != 0) {
+ /* Allocation failed. Drop packet */
+ mutex_exit(&connp->conn_lock);
+ freemsg(mp);
+ BUMP_MIB(&us->us_udp_mib, udpInErrors);
+ return;
+ }
+ mutex_exit(&connp->conn_lock);
+ }
- ip6h = (ip6_t *)rptr;
- ASSERT((uchar_t *)&ip6h[1] <= mp->b_wptr);
+ if (recv_ancillary.crb_all != 0) {
+ /*
+ * Record packet information in the ip_pkt_t
+ */
+ if (ira->ira_flags & IRAF_IS_IPV4) {
+ ASSERT(IPH_HDR_VERSION(rptr) == IPV4_VERSION);
+ ASSERT(MBLKL(mp) >= sizeof (ipha_t));
+ ASSERT(((ipha_t *)rptr)->ipha_protocol == IPPROTO_UDP);
+ ASSERT(ira->ira_ip_hdr_length == IPH_HDR_LENGTH(rptr));
- if (ip6h->ip6_nxt != IPPROTO_UDP) {
+ (void) ip_find_hdr_v4((ipha_t *)rptr, &ipps, B_FALSE);
+ } else {
uint8_t nexthdrp;
- /* Look for ifindex information */
- if (ip6h->ip6_nxt == IPPROTO_RAW) {
- ip6i = (ip6i_t *)ip6h;
- if ((uchar_t *)&ip6i[1] > mp->b_wptr)
- goto tossit;
-
- if (ip6i->ip6i_flags & IP6I_IFINDEX) {
- ASSERT(ip6i->ip6i_ifindex != 0);
- ipp.ipp_fields |= IPPF_IFINDEX;
- ipp.ipp_ifindex = ip6i->ip6i_ifindex;
- }
- rptr = (uchar_t *)&ip6i[1];
- mp->b_rptr = rptr;
- if (rptr == mp->b_wptr) {
- mp1 = mp->b_cont;
- freeb(mp);
- mp = mp1;
- rptr = mp->b_rptr;
- }
- if (MBLKL(mp) < (IPV6_HDR_LEN + UDPH_SIZE))
- goto tossit;
- ip6h = (ip6_t *)rptr;
- mp_len = msgdsize(mp);
- }
+
+ ASSERT(IPH_HDR_VERSION(rptr) == IPV6_VERSION);
/*
- * Find any potentially interesting extension headers
- * as well as the length of the IPv6 + extension
- * headers.
+ * IPv6 packets can only be received by applications
+ * that are prepared to receive IPv6 addresses.
+ * The IP fanout must ensure this.
*/
- hdr_length = ip_find_hdr_v6(mp, ip6h, &ipp, &nexthdrp) +
- UDPH_SIZE;
- ASSERT(nexthdrp == IPPROTO_UDP);
- } else {
- hdr_length = IPV6_HDR_LEN + UDPH_SIZE;
- ip6i = NULL;
- }
- break;
- default:
- ASSERT(0);
- }
+ ASSERT(connp->conn_family == AF_INET6);
- /*
- * IP inspected the UDP header thus all of it must be in the mblk.
- * UDP length check is performed for IPv6 packets and IPv4 packets
- * to check if the size of the packet as specified
- * by the header is the same as the physical size of the packet.
- * FIXME? Didn't IP already check this?
- */
- udpha = (udpha_t *)(rptr + (hdr_length - UDPH_SIZE));
- if ((MBLKL(mp) < hdr_length) ||
- (mp_len != (ntohs(udpha->uha_length) + hdr_length - UDPH_SIZE))) {
- goto tossit;
- }
+ ip6h = (ip6_t *)rptr;
-
- /* Walk past the headers unless UDP_RCVHDR was set. */
- if (!udp_bits.udpb_rcvhdr) {
- mp->b_rptr = rptr + hdr_length;
- mp_len -= hdr_length;
+ /* We don't care about the length, but need the ipp */
+ hdr_length = ip_find_hdr_v6(mp, ip6h, B_TRUE, &ipps,
+ &nexthdrp);
+ ASSERT(hdr_length == ira->ira_ip_hdr_length);
+ /* Restore */
+ hdr_length = ira->ira_ip_hdr_length + UDPH_SIZE;
+ ASSERT(nexthdrp == IPPROTO_UDP);
+ }
}
/*
* This is the inbound data path. Packets are passed upstream as
- * T_UNITDATA_IND messages with full IP headers still attached.
+ * T_UNITDATA_IND messages.
*/
- if (udp->udp_family == AF_INET) {
+ if (connp->conn_family == AF_INET) {
sin_t *sin;
ASSERT(IPH_HDR_VERSION((ipha_t *)rptr) == IPV4_VERSION);
/*
* Normally only send up the source address.
- * If IP_RECVDSTADDR is set we include the destination IP
- * address as an option. With IP_RECVOPTS we include all
- * the IP options.
+ * If any ancillary data items are wanted we add those.
*/
udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin_t);
- if (udp_bits.udpb_recvdstaddr) {
- udi_size += sizeof (struct T_opthdr) +
- sizeof (struct in_addr);
- UDP_STAT(us, udp_in_recvdstaddr);
- }
-
- if (udp_bits.udpb_ip_recvpktinfo && (pinfo != NULL) &&
- (pinfo->ip_pkt_flags & IPF_RECVADDR)) {
- udi_size += sizeof (struct T_opthdr) +
- sizeof (struct in_pktinfo);
- UDP_STAT(us, udp_ip_rcvpktinfo);
- }
-
- if ((udp_bits.udpb_recvopts) && opt_len > 0) {
- udi_size += sizeof (struct T_opthdr) + opt_len;
- UDP_STAT(us, udp_in_recvopts);
- }
-
- /*
- * If the IP_RECVSLLA or the IP_RECVIF is set then allocate
- * space accordingly
- */
- if ((udp_bits.udpb_recvif) && (pinfo != NULL) &&
- (pinfo->ip_pkt_flags & IPF_RECVIF)) {
- udi_size += sizeof (struct T_opthdr) + sizeof (uint_t);
- UDP_STAT(us, udp_in_recvif);
- }
-
- if ((udp_bits.udpb_recvslla) && (pinfo != NULL) &&
- (pinfo->ip_pkt_flags & IPF_RECVSLLA)) {
- udi_size += sizeof (struct T_opthdr) +
- sizeof (struct sockaddr_dl);
- UDP_STAT(us, udp_in_recvslla);
- }
-
- if ((udp_bits.udpb_recvucred) &&
- (cr = msg_getcred(mp, &cpid)) != NULL) {
- udi_size += sizeof (struct T_opthdr) + ucredsize;
- UDP_STAT(us, udp_in_recvucred);
- }
-
- /*
- * If SO_TIMESTAMP is set allocate the appropriate sized
- * buffer. Since gethrestime() expects a pointer aligned
- * argument, we allocate space necessary for extra
- * alignment (even though it might not be used).
- */
- if (udp_bits.udpb_timestamp) {
- udi_size += sizeof (struct T_opthdr) +
- sizeof (timestruc_t) + _POINTER_ALIGNMENT;
- UDP_STAT(us, udp_in_timestamp);
- }
-
- /*
- * If IP_RECVTTL is set allocate the appropriate sized buffer
- */
- if (udp_bits.udpb_recvttl) {
- udi_size += sizeof (struct T_opthdr) + sizeof (uint8_t);
- UDP_STAT(us, udp_in_recvttl);
+ if (recv_ancillary.crb_all != 0) {
+ udi_size += conn_recvancillary_size(connp,
+ recv_ancillary, ira, mp, &ipps);
}
/* Allocate a message block for the T_UNITDATA_IND structure. */
mp1 = allocb(udi_size, BPRI_MED);
if (mp1 == NULL) {
freemsg(mp);
- if (options_mp != NULL)
- freeb(options_mp);
BUMP_MIB(&us->us_udp_mib, udpInErrors);
return;
}
mp1->b_cont = mp;
- mp = mp1;
- mp->b_datap->db_type = M_PROTO;
- tudi = (struct T_unitdata_ind *)mp->b_rptr;
- mp->b_wptr = (uchar_t *)tudi + udi_size;
+ mp1->b_datap->db_type = M_PROTO;
+ tudi = (struct T_unitdata_ind *)mp1->b_rptr;
+ mp1->b_wptr = (uchar_t *)tudi + udi_size;
tudi->PRIM_type = T_UNITDATA_IND;
tudi->SRC_length = sizeof (sin_t);
tudi->SRC_offset = sizeof (struct T_unitdata_ind);
@@ -3786,7 +2539,7 @@ udp_input(void *arg1, mblk_t *mp, void *arg2)
sin = (sin_t *)&tudi[1];
sin->sin_addr.s_addr = ((ipha_t *)rptr)->ipha_src;
sin->sin_port = udpha->uha_src_port;
- sin->sin_family = udp->udp_family;
+ sin->sin_family = connp->conn_family;
*(uint32_t *)&sin->sin_zero[0] = 0;
*(uint32_t *)&sin->sin_zero[4] = 0;
@@ -3795,166 +2548,8 @@ udp_input(void *arg1, mblk_t *mp, void *arg2)
* IP_RECVTTL has been set.
*/
if (udi_size != 0) {
- /*
- * Copy in destination address before options to avoid
- * any padding issues.
- */
- char *dstopt;
-
- dstopt = (char *)&sin[1];
- if (udp_bits.udpb_recvdstaddr) {
- struct T_opthdr *toh;
- ipaddr_t *dstptr;
-
- toh = (struct T_opthdr *)dstopt;
- toh->level = IPPROTO_IP;
- toh->name = IP_RECVDSTADDR;
- toh->len = sizeof (struct T_opthdr) +
- sizeof (ipaddr_t);
- toh->status = 0;
- dstopt += sizeof (struct T_opthdr);
- dstptr = (ipaddr_t *)dstopt;
- *dstptr = ((ipha_t *)rptr)->ipha_dst;
- dstopt += sizeof (ipaddr_t);
- udi_size -= toh->len;
- }
-
- if (udp_bits.udpb_recvopts && opt_len > 0) {
- struct T_opthdr *toh;
-
- toh = (struct T_opthdr *)dstopt;
- toh->level = IPPROTO_IP;
- toh->name = IP_RECVOPTS;
- toh->len = sizeof (struct T_opthdr) + opt_len;
- toh->status = 0;
- dstopt += sizeof (struct T_opthdr);
- bcopy(rptr + IP_SIMPLE_HDR_LENGTH, dstopt,
- opt_len);
- dstopt += opt_len;
- udi_size -= toh->len;
- }
-
- if ((udp_bits.udpb_ip_recvpktinfo) && (pinfo != NULL) &&
- (pinfo->ip_pkt_flags & IPF_RECVADDR)) {
- struct T_opthdr *toh;
- struct in_pktinfo *pktinfop;
-
- toh = (struct T_opthdr *)dstopt;
- toh->level = IPPROTO_IP;
- toh->name = IP_PKTINFO;
- toh->len = sizeof (struct T_opthdr) +
- sizeof (*pktinfop);
- toh->status = 0;
- dstopt += sizeof (struct T_opthdr);
- pktinfop = (struct in_pktinfo *)dstopt;
- pktinfop->ipi_ifindex = pinfo->ip_pkt_ifindex;
- pktinfop->ipi_spec_dst =
- pinfo->ip_pkt_match_addr;
- pktinfop->ipi_addr.s_addr =
- ((ipha_t *)rptr)->ipha_dst;
-
- dstopt += sizeof (struct in_pktinfo);
- udi_size -= toh->len;
- }
-
- if ((udp_bits.udpb_recvslla) && (pinfo != NULL) &&
- (pinfo->ip_pkt_flags & IPF_RECVSLLA)) {
-
- struct T_opthdr *toh;
- struct sockaddr_dl *dstptr;
-
- toh = (struct T_opthdr *)dstopt;
- toh->level = IPPROTO_IP;
- toh->name = IP_RECVSLLA;
- toh->len = sizeof (struct T_opthdr) +
- sizeof (struct sockaddr_dl);
- toh->status = 0;
- dstopt += sizeof (struct T_opthdr);
- dstptr = (struct sockaddr_dl *)dstopt;
- bcopy(&pinfo->ip_pkt_slla, dstptr,
- sizeof (struct sockaddr_dl));
- dstopt += sizeof (struct sockaddr_dl);
- udi_size -= toh->len;
- }
-
- if ((udp_bits.udpb_recvif) && (pinfo != NULL) &&
- (pinfo->ip_pkt_flags & IPF_RECVIF)) {
-
- struct T_opthdr *toh;
- uint_t *dstptr;
-
- toh = (struct T_opthdr *)dstopt;
- toh->level = IPPROTO_IP;
- toh->name = IP_RECVIF;
- toh->len = sizeof (struct T_opthdr) +
- sizeof (uint_t);
- toh->status = 0;
- dstopt += sizeof (struct T_opthdr);
- dstptr = (uint_t *)dstopt;
- *dstptr = pinfo->ip_pkt_ifindex;
- dstopt += sizeof (uint_t);
- udi_size -= toh->len;
- }
-
- if (cr != NULL) {
- struct T_opthdr *toh;
-
- toh = (struct T_opthdr *)dstopt;
- toh->level = SOL_SOCKET;
- toh->name = SCM_UCRED;
- toh->len = sizeof (struct T_opthdr) + ucredsize;
- toh->status = 0;
- dstopt += sizeof (struct T_opthdr);
- (void) cred2ucred(cr, cpid, dstopt, rcr);
- dstopt += ucredsize;
- udi_size -= toh->len;
- }
-
- if (udp_bits.udpb_timestamp) {
- struct T_opthdr *toh;
-
- toh = (struct T_opthdr *)dstopt;
- toh->level = SOL_SOCKET;
- toh->name = SCM_TIMESTAMP;
- toh->len = sizeof (struct T_opthdr) +
- sizeof (timestruc_t) + _POINTER_ALIGNMENT;
- toh->status = 0;
- dstopt += sizeof (struct T_opthdr);
- /* Align for gethrestime() */
- dstopt = (char *)P2ROUNDUP((intptr_t)dstopt,
- sizeof (intptr_t));
- gethrestime((timestruc_t *)dstopt);
- dstopt = (char *)toh + toh->len;
- udi_size -= toh->len;
- }
-
- /*
- * CAUTION:
- * Due to aligment issues
- * Processing of IP_RECVTTL option
- * should always be the last. Adding
- * any option processing after this will
- * cause alignment panic.
- */
- if (udp_bits.udpb_recvttl) {
- struct T_opthdr *toh;
- uint8_t *dstptr;
-
- toh = (struct T_opthdr *)dstopt;
- toh->level = IPPROTO_IP;
- toh->name = IP_RECVTTL;
- toh->len = sizeof (struct T_opthdr) +
- sizeof (uint8_t);
- toh->status = 0;
- dstopt += sizeof (struct T_opthdr);
- dstptr = (uint8_t *)dstopt;
- *dstptr = ((ipha_t *)rptr)->ipha_ttl;
- dstopt += sizeof (uint8_t);
- udi_size -= toh->len;
- }
-
- /* Consumed all of allocated space */
- ASSERT(udi_size == 0);
+ conn_recvancillary_add(connp, recv_ancillary, ira,
+ &ipps, (uchar_t *)&sin[1], udi_size);
}
} else {
sin6_t *sin6;
@@ -3968,89 +2563,21 @@ udp_input(void *arg1, mblk_t *mp, void *arg2)
*/
udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t);
- if (ipp.ipp_fields & (IPPF_HOPOPTS|IPPF_DSTOPTS|IPPF_RTDSTOPTS|
- IPPF_RTHDR|IPPF_IFINDEX)) {
- if ((udp_bits.udpb_ipv6_recvhopopts) &&
- (ipp.ipp_fields & IPPF_HOPOPTS)) {
- size_t hlen;
-
- UDP_STAT(us, udp_in_recvhopopts);
- hlen = copy_hop_opts(&ipp, NULL);
- if (hlen == 0)
- ipp.ipp_fields &= ~IPPF_HOPOPTS;
- udi_size += hlen;
- }
- if (((udp_bits.udpb_ipv6_recvdstopts) ||
- udp_bits.udpb_old_ipv6_recvdstopts) &&
- (ipp.ipp_fields & IPPF_DSTOPTS)) {
- udi_size += sizeof (struct T_opthdr) +
- ipp.ipp_dstoptslen;
- UDP_STAT(us, udp_in_recvdstopts);
- }
- if ((((udp_bits.udpb_ipv6_recvdstopts) &&
- udp_bits.udpb_ipv6_recvrthdr &&
- (ipp.ipp_fields & IPPF_RTHDR)) ||
- (udp_bits.udpb_ipv6_recvrthdrdstopts)) &&
- (ipp.ipp_fields & IPPF_RTDSTOPTS)) {
- udi_size += sizeof (struct T_opthdr) +
- ipp.ipp_rtdstoptslen;
- UDP_STAT(us, udp_in_recvrtdstopts);
- }
- if ((udp_bits.udpb_ipv6_recvrthdr) &&
- (ipp.ipp_fields & IPPF_RTHDR)) {
- udi_size += sizeof (struct T_opthdr) +
- ipp.ipp_rthdrlen;
- UDP_STAT(us, udp_in_recvrthdr);
- }
- if ((udp_bits.udpb_ip_recvpktinfo) &&
- (ipp.ipp_fields & IPPF_IFINDEX)) {
- udi_size += sizeof (struct T_opthdr) +
- sizeof (struct in6_pktinfo);
- UDP_STAT(us, udp_in_recvpktinfo);
- }
-
- }
- if ((udp_bits.udpb_recvucred) &&
- (cr = msg_getcred(mp, &cpid)) != NULL) {
- udi_size += sizeof (struct T_opthdr) + ucredsize;
- UDP_STAT(us, udp_in_recvucred);
- }
-
- /*
- * If SO_TIMESTAMP is set allocate the appropriate sized
- * buffer. Since gethrestime() expects a pointer aligned
- * argument, we allocate space necessary for extra
- * alignment (even though it might not be used).
- */
- if (udp_bits.udpb_timestamp) {
- udi_size += sizeof (struct T_opthdr) +
- sizeof (timestruc_t) + _POINTER_ALIGNMENT;
- UDP_STAT(us, udp_in_timestamp);
- }
-
- if (udp_bits.udpb_ipv6_recvhoplimit) {
- udi_size += sizeof (struct T_opthdr) + sizeof (int);
- UDP_STAT(us, udp_in_recvhoplimit);
- }
-
- if (udp_bits.udpb_ipv6_recvtclass) {
- udi_size += sizeof (struct T_opthdr) + sizeof (int);
- UDP_STAT(us, udp_in_recvtclass);
+ if (recv_ancillary.crb_all != 0) {
+ udi_size += conn_recvancillary_size(connp,
+ recv_ancillary, ira, mp, &ipps);
}
mp1 = allocb(udi_size, BPRI_MED);
if (mp1 == NULL) {
freemsg(mp);
- if (options_mp != NULL)
- freeb(options_mp);
BUMP_MIB(&us->us_udp_mib, udpInErrors);
return;
}
mp1->b_cont = mp;
- mp = mp1;
- mp->b_datap->db_type = M_PROTO;
- tudi = (struct T_unitdata_ind *)mp->b_rptr;
- mp->b_wptr = (uchar_t *)tudi + udi_size;
+ mp1->b_datap->db_type = M_PROTO;
+ tudi = (struct T_unitdata_ind *)mp1->b_rptr;
+ mp1->b_wptr = (uchar_t *)tudi + udi_size;
tudi->PRIM_type = T_UNITDATA_IND;
tudi->SRC_length = sizeof (sin6_t);
tudi->SRC_offset = sizeof (struct T_unitdata_ind);
@@ -4059,7 +2586,7 @@ udp_input(void *arg1, mblk_t *mp, void *arg2)
udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin6_t));
tudi->OPT_length = udi_size;
sin6 = (sin6_t *)&tudi[1];
- if (ipversion == IPV4_VERSION) {
+ if (ira->ira_flags & IRAF_IS_IPV4) {
in6_addr_t v6dst;
IN6_IPADDR_TO_V4MAPPED(((ipha_t *)rptr)->ipha_src,
@@ -4069,196 +2596,43 @@ udp_input(void *arg1, mblk_t *mp, void *arg2)
sin6->sin6_flowinfo = 0;
sin6->sin6_scope_id = 0;
sin6->__sin6_src_id = ip_srcid_find_addr(&v6dst,
- connp->conn_zoneid, us->us_netstack);
+ IPCL_ZONEID(connp), us->us_netstack);
} else {
+ ip6h = (ip6_t *)rptr;
+
sin6->sin6_addr = ip6h->ip6_src;
/* No sin6_flowinfo per API */
sin6->sin6_flowinfo = 0;
- /* For link-scope source pass up scope id */
- if ((ipp.ipp_fields & IPPF_IFINDEX) &&
- IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src))
- sin6->sin6_scope_id = ipp.ipp_ifindex;
+ /* For link-scope pass up scope id */
+ if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src))
+ sin6->sin6_scope_id = ira->ira_ruifindex;
else
sin6->sin6_scope_id = 0;
sin6->__sin6_src_id = ip_srcid_find_addr(
- &ip6h->ip6_dst, connp->conn_zoneid,
+ &ip6h->ip6_dst, IPCL_ZONEID(connp),
us->us_netstack);
}
sin6->sin6_port = udpha->uha_src_port;
- sin6->sin6_family = udp->udp_family;
+ sin6->sin6_family = connp->conn_family;
if (udi_size != 0) {
- uchar_t *dstopt;
-
- dstopt = (uchar_t *)&sin6[1];
- if ((udp_bits.udpb_ip_recvpktinfo) &&
- (ipp.ipp_fields & IPPF_IFINDEX)) {
- struct T_opthdr *toh;
- struct in6_pktinfo *pkti;
-
- toh = (struct T_opthdr *)dstopt;
- toh->level = IPPROTO_IPV6;
- toh->name = IPV6_PKTINFO;
- toh->len = sizeof (struct T_opthdr) +
- sizeof (*pkti);
- toh->status = 0;
- dstopt += sizeof (struct T_opthdr);
- pkti = (struct in6_pktinfo *)dstopt;
- if (ipversion == IPV6_VERSION)
- pkti->ipi6_addr = ip6h->ip6_dst;
- else
- IN6_IPADDR_TO_V4MAPPED(
- ((ipha_t *)rptr)->ipha_dst,
- &pkti->ipi6_addr);
- pkti->ipi6_ifindex = ipp.ipp_ifindex;
- dstopt += sizeof (*pkti);
- udi_size -= toh->len;
- }
- if (udp_bits.udpb_ipv6_recvhoplimit) {
- struct T_opthdr *toh;
-
- toh = (struct T_opthdr *)dstopt;
- toh->level = IPPROTO_IPV6;
- toh->name = IPV6_HOPLIMIT;
- toh->len = sizeof (struct T_opthdr) +
- sizeof (uint_t);
- toh->status = 0;
- dstopt += sizeof (struct T_opthdr);
- if (ipversion == IPV6_VERSION)
- *(uint_t *)dstopt = ip6h->ip6_hops;
- else
- *(uint_t *)dstopt =
- ((ipha_t *)rptr)->ipha_ttl;
- dstopt += sizeof (uint_t);
- udi_size -= toh->len;
- }
- if (udp_bits.udpb_ipv6_recvtclass) {
- struct T_opthdr *toh;
-
- toh = (struct T_opthdr *)dstopt;
- toh->level = IPPROTO_IPV6;
- toh->name = IPV6_TCLASS;
- toh->len = sizeof (struct T_opthdr) +
- sizeof (uint_t);
- toh->status = 0;
- dstopt += sizeof (struct T_opthdr);
- if (ipversion == IPV6_VERSION) {
- *(uint_t *)dstopt =
- IPV6_FLOW_TCLASS(ip6h->ip6_flow);
- } else {
- ipha_t *ipha = (ipha_t *)rptr;
- *(uint_t *)dstopt =
- ipha->ipha_type_of_service;
- }
- dstopt += sizeof (uint_t);
- udi_size -= toh->len;
- }
- if ((udp_bits.udpb_ipv6_recvhopopts) &&
- (ipp.ipp_fields & IPPF_HOPOPTS)) {
- size_t hlen;
-
- hlen = copy_hop_opts(&ipp, dstopt);
- dstopt += hlen;
- udi_size -= hlen;
- }
- if ((udp_bits.udpb_ipv6_recvdstopts) &&
- (udp_bits.udpb_ipv6_recvrthdr) &&
- (ipp.ipp_fields & IPPF_RTHDR) &&
- (ipp.ipp_fields & IPPF_RTDSTOPTS)) {
- struct T_opthdr *toh;
-
- toh = (struct T_opthdr *)dstopt;
- toh->level = IPPROTO_IPV6;
- toh->name = IPV6_DSTOPTS;
- toh->len = sizeof (struct T_opthdr) +
- ipp.ipp_rtdstoptslen;
- toh->status = 0;
- dstopt += sizeof (struct T_opthdr);
- bcopy(ipp.ipp_rtdstopts, dstopt,
- ipp.ipp_rtdstoptslen);
- dstopt += ipp.ipp_rtdstoptslen;
- udi_size -= toh->len;
- }
- if ((udp_bits.udpb_ipv6_recvrthdr) &&
- (ipp.ipp_fields & IPPF_RTHDR)) {
- struct T_opthdr *toh;
-
- toh = (struct T_opthdr *)dstopt;
- toh->level = IPPROTO_IPV6;
- toh->name = IPV6_RTHDR;
- toh->len = sizeof (struct T_opthdr) +
- ipp.ipp_rthdrlen;
- toh->status = 0;
- dstopt += sizeof (struct T_opthdr);
- bcopy(ipp.ipp_rthdr, dstopt, ipp.ipp_rthdrlen);
- dstopt += ipp.ipp_rthdrlen;
- udi_size -= toh->len;
- }
- if ((udp_bits.udpb_ipv6_recvdstopts) &&
- (ipp.ipp_fields & IPPF_DSTOPTS)) {
- struct T_opthdr *toh;
-
- toh = (struct T_opthdr *)dstopt;
- toh->level = IPPROTO_IPV6;
- toh->name = IPV6_DSTOPTS;
- toh->len = sizeof (struct T_opthdr) +
- ipp.ipp_dstoptslen;
- toh->status = 0;
- dstopt += sizeof (struct T_opthdr);
- bcopy(ipp.ipp_dstopts, dstopt,
- ipp.ipp_dstoptslen);
- dstopt += ipp.ipp_dstoptslen;
- udi_size -= toh->len;
- }
- if (cr != NULL) {
- struct T_opthdr *toh;
-
- toh = (struct T_opthdr *)dstopt;
- toh->level = SOL_SOCKET;
- toh->name = SCM_UCRED;
- toh->len = sizeof (struct T_opthdr) + ucredsize;
- toh->status = 0;
- (void) cred2ucred(cr, cpid, &toh[1], rcr);
- dstopt += toh->len;
- udi_size -= toh->len;
- }
- if (udp_bits.udpb_timestamp) {
- struct T_opthdr *toh;
-
- toh = (struct T_opthdr *)dstopt;
- toh->level = SOL_SOCKET;
- toh->name = SCM_TIMESTAMP;
- toh->len = sizeof (struct T_opthdr) +
- sizeof (timestruc_t) + _POINTER_ALIGNMENT;
- toh->status = 0;
- dstopt += sizeof (struct T_opthdr);
- /* Align for gethrestime() */
- dstopt = (uchar_t *)P2ROUNDUP((intptr_t)dstopt,
- sizeof (intptr_t));
- gethrestime((timestruc_t *)dstopt);
- dstopt = (uchar_t *)toh + toh->len;
- udi_size -= toh->len;
- }
-
- /* Consumed all of allocated space */
- ASSERT(udi_size == 0);
+ conn_recvancillary_add(connp, recv_ancillary, ira,
+ &ipps, (uchar_t *)&sin6[1], udi_size);
}
-#undef sin6
- /* No IP_RECVDSTADDR for IPv6. */
}
- BUMP_MIB(&us->us_udp_mib, udpHCInDatagrams);
- if (options_mp != NULL)
- freeb(options_mp);
-
- udp_ulp_recv(connp, mp);
+ /* Walk past the headers unless IP_RECVHDR was set. */
+ if (!udp->udp_rcvhdr) {
+ mp->b_rptr = rptr + hdr_length;
+ pkt_len -= hdr_length;
+ }
+ BUMP_MIB(&us->us_udp_mib, udpHCInDatagrams);
+ udp_ulp_recv(connp, mp1, pkt_len, ira);
return;
tossit:
freemsg(mp);
- if (options_mp != NULL)
- freeb(options_mp);
BUMP_MIB(&us->us_udp_mib, udpInErrors);
}
@@ -4386,23 +2760,34 @@ udp_snmp_get(queue_t *q, mblk_t *mpctl)
needattr = B_TRUE;
break;
}
+ mutex_enter(&connp->conn_lock);
+ if (udp->udp_state == TS_DATA_XFER &&
+ connp->conn_ixa->ixa_tsl != NULL) {
+ ts_label_t *tsl;
+
+ tsl = connp->conn_ixa->ixa_tsl;
+ mlp.tme_flags |= MIB2_TMEF_IS_LABELED;
+ mlp.tme_doi = label2doi(tsl);
+ mlp.tme_label = *label2bslabel(tsl);
+ needattr = B_TRUE;
+ }
+ mutex_exit(&connp->conn_lock);
/*
* Create an IPv4 table entry for IPv4 entries and also
* any IPv6 entries which are bound to in6addr_any
* (i.e. anything a IPv4 peer could connect/send to).
*/
- if (udp->udp_ipversion == IPV4_VERSION ||
+ if (connp->conn_ipversion == IPV4_VERSION ||
(udp->udp_state <= TS_IDLE &&
- IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src))) {
+ IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6))) {
ude.udpEntryInfo.ue_state = state;
/*
* If in6addr_any this will set it to
* INADDR_ANY
*/
- ude.udpLocalAddress =
- V4_PART_OF_V6(udp->udp_v6src);
- ude.udpLocalPort = ntohs(udp->udp_port);
+ ude.udpLocalAddress = connp->conn_laddr_v4;
+ ude.udpLocalPort = ntohs(connp->conn_lport);
if (udp->udp_state == TS_DATA_XFER) {
/*
* Can potentially get here for
@@ -4414,9 +2799,9 @@ udp_snmp_get(queue_t *q, mblk_t *mpctl)
* this part of the code.
*/
ude.udpEntryInfo.ue_RemoteAddress =
- V4_PART_OF_V6(udp->udp_v6dst);
+ connp->conn_faddr_v4;
ude.udpEntryInfo.ue_RemotePort =
- ntohs(udp->udp_dstport);
+ ntohs(connp->conn_fport);
} else {
ude.udpEntryInfo.ue_RemoteAddress = 0;
ude.udpEntryInfo.ue_RemotePort = 0;
@@ -4429,10 +2814,10 @@ udp_snmp_get(queue_t *q, mblk_t *mpctl)
*/
ude.udpInstance = (uint32_t)(uintptr_t)udp;
ude.udpCreationProcess =
- (udp->udp_open_pid < 0) ?
+ (connp->conn_cpid < 0) ?
MIB2_UNKNOWN_PROCESS :
- udp->udp_open_pid;
- ude.udpCreationTime = udp->udp_open_time;
+ connp->conn_cpid;
+ ude.udpCreationTime = connp->conn_open_time;
(void) snmp_append_data2(mp_conn_ctl->b_cont,
&mp_conn_tail, (char *)&ude, sizeof (ude));
@@ -4442,16 +2827,24 @@ udp_snmp_get(queue_t *q, mblk_t *mpctl)
mp_attr_ctl->b_cont, &mp_attr_tail,
(char *)&mlp, sizeof (mlp));
}
- if (udp->udp_ipversion == IPV6_VERSION) {
+ if (connp->conn_ipversion == IPV6_VERSION) {
ude6.udp6EntryInfo.ue_state = state;
- ude6.udp6LocalAddress = udp->udp_v6src;
- ude6.udp6LocalPort = ntohs(udp->udp_port);
- ude6.udp6IfIndex = udp->udp_bound_if;
+ ude6.udp6LocalAddress = connp->conn_laddr_v6;
+ ude6.udp6LocalPort = ntohs(connp->conn_lport);
+ mutex_enter(&connp->conn_lock);
+ if (connp->conn_ixa->ixa_flags &
+ IXAF_SCOPEID_SET) {
+ ude6.udp6IfIndex =
+ connp->conn_ixa->ixa_scopeid;
+ } else {
+ ude6.udp6IfIndex = connp->conn_bound_if;
+ }
+ mutex_exit(&connp->conn_lock);
if (udp->udp_state == TS_DATA_XFER) {
ude6.udp6EntryInfo.ue_RemoteAddress =
- udp->udp_v6dst;
+ connp->conn_faddr_v6;
ude6.udp6EntryInfo.ue_RemotePort =
- ntohs(udp->udp_dstport);
+ ntohs(connp->conn_fport);
} else {
ude6.udp6EntryInfo.ue_RemoteAddress =
sin6_null.sin6_addr;
@@ -4464,10 +2857,10 @@ udp_snmp_get(queue_t *q, mblk_t *mpctl)
*/
ude6.udp6Instance = (uint32_t)(uintptr_t)udp;
ude6.udp6CreationProcess =
- (udp->udp_open_pid < 0) ?
+ (connp->conn_cpid < 0) ?
MIB2_UNKNOWN_PROCESS :
- udp->udp_open_pid;
- ude6.udp6CreationTime = udp->udp_open_time;
+ connp->conn_cpid;
+ ude6.udp6CreationTime = connp->conn_open_time;
(void) snmp_append_data2(mp6_conn_ctl->b_cont,
&mp6_conn_tail, (char *)&ude6,
@@ -4548,39 +2941,34 @@ udp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name,
* passed in mp. This message is freed.
*/
static void
-udp_ud_err(queue_t *q, mblk_t *mp, uchar_t *destaddr, t_scalar_t destlen,
- t_scalar_t err)
+udp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err)
{
struct T_unitdata_req *tudr;
mblk_t *mp1;
+ uchar_t *destaddr;
+ t_scalar_t destlen;
uchar_t *optaddr;
t_scalar_t optlen;
- if (DB_TYPE(mp) == M_DATA) {
- ASSERT(destaddr != NULL && destlen != 0);
- optaddr = NULL;
- optlen = 0;
- } else {
- if ((mp->b_wptr < mp->b_rptr) ||
- (MBLKL(mp)) < sizeof (struct T_unitdata_req)) {
- goto done;
- }
- tudr = (struct T_unitdata_req *)mp->b_rptr;
- destaddr = mp->b_rptr + tudr->DEST_offset;
- if (destaddr < mp->b_rptr || destaddr >= mp->b_wptr ||
- destaddr + tudr->DEST_length < mp->b_rptr ||
- destaddr + tudr->DEST_length > mp->b_wptr) {
- goto done;
- }
- optaddr = mp->b_rptr + tudr->OPT_offset;
- if (optaddr < mp->b_rptr || optaddr >= mp->b_wptr ||
- optaddr + tudr->OPT_length < mp->b_rptr ||
- optaddr + tudr->OPT_length > mp->b_wptr) {
- goto done;
- }
- destlen = tudr->DEST_length;
- optlen = tudr->OPT_length;
+ if ((mp->b_wptr < mp->b_rptr) ||
+ (MBLKL(mp)) < sizeof (struct T_unitdata_req)) {
+ goto done;
}
+ tudr = (struct T_unitdata_req *)mp->b_rptr;
+ destaddr = mp->b_rptr + tudr->DEST_offset;
+ if (destaddr < mp->b_rptr || destaddr >= mp->b_wptr ||
+ destaddr + tudr->DEST_length < mp->b_rptr ||
+ destaddr + tudr->DEST_length > mp->b_wptr) {
+ goto done;
+ }
+ optaddr = mp->b_rptr + tudr->OPT_offset;
+ if (optaddr < mp->b_rptr || optaddr >= mp->b_wptr ||
+ optaddr + tudr->OPT_length < mp->b_rptr ||
+ optaddr + tudr->OPT_length > mp->b_wptr) {
+ goto done;
+ }
+ destlen = tudr->DEST_length;
+ optlen = tudr->OPT_length;
mp1 = mi_tpi_uderror_ind((char *)destaddr, destlen,
(char *)optaddr, optlen, err);
@@ -4685,1093 +3073,721 @@ retry:
return (port);
}
+/*
+ * Handle T_UNITDATA_REQ with options. Both IPv4 and IPv6
+ * Either tudr_mp or msg is set. If tudr_mp we take ancillary data from
+ * the TPI options, otherwise we take them from msg_control.
+ * If both sin and sin6 is set it is a connected socket and we use conn_faddr.
+ * Always consumes mp; never consumes tudr_mp.
+ */
static int
-udp_update_label(queue_t *wq, mblk_t *mp, ipaddr_t dst)
+udp_output_ancillary(conn_t *connp, sin_t *sin, sin6_t *sin6, mblk_t *mp,
+ mblk_t *tudr_mp, struct nmsghdr *msg, cred_t *cr, pid_t pid)
{
- int err;
- cred_t *cred;
- cred_t *orig_cred = NULL;
- cred_t *effective_cred = NULL;
- uchar_t opt_storage[IP_MAX_OPT_LENGTH];
- udp_t *udp = Q_TO_UDP(wq);
+ udp_t *udp = connp->conn_udp;
udp_stack_t *us = udp->udp_us;
+ int error;
+ ip_xmit_attr_t *ixa;
+ ip_pkt_t *ipp;
+ in6_addr_t v6src;
+ in6_addr_t v6dst;
+ in6_addr_t v6nexthop;
+ in_port_t dstport;
+ uint32_t flowinfo;
+ uint_t srcid;
+ int is_absreq_failure = 0;
+ conn_opt_arg_t coas, *coa;
- /*
- * All Solaris components should pass a db_credp
- * for this message, hence we ASSERT.
- * On production kernels we return an error to be robust against
- * random streams modules sitting on top of us.
- */
- cred = orig_cred = msg_getcred(mp, NULL);
- ASSERT(cred != NULL);
- if (cred == NULL)
- return (EINVAL);
+ ASSERT(tudr_mp != NULL || msg != NULL);
/*
- * Verify the destination is allowed to receive packets at
- * the security label of the message data. tsol_check_dest()
- * may create a new effective cred for this message with a
- * modified label or label flags. Note that we use the cred/label
- * from the message to handle MLP
+ * Get ixa before checking state to handle a disconnect race.
+ *
+ * We need an exclusive copy of conn_ixa since the ancillary data
+ * options might modify it. That copy has no pointers hence we
+ * need to set them up once we've parsed the ancillary data.
*/
- if ((err = tsol_check_dest(cred, &dst, IPV4_VERSION,
- udp->udp_connp->conn_mac_mode, &effective_cred)) != 0)
- goto done;
- if (effective_cred != NULL)
- cred = effective_cred;
+ ixa = conn_get_ixa_exclusive(connp);
+ if (ixa == NULL) {
+ BUMP_MIB(&us->us_udp_mib, udpOutErrors);
+ freemsg(mp);
+ return (ENOMEM);
+ }
+ ASSERT(cr != NULL);
+ ixa->ixa_cred = cr;
+ ixa->ixa_cpid = pid;
+ if (is_system_labeled()) {
+ /* We need to restart with a label based on the cred */
+ ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
+ }
- /*
- * Calculate the security label to be placed in the text
- * of the message (if any).
- */
- if ((err = tsol_compute_label(cred, dst, opt_storage,
- us->us_netstack->netstack_ip)) != 0)
- goto done;
+ /* In case previous destination was multicast or multirt */
+ ip_attr_newdst(ixa);
- /*
- * Insert the security label in the cached ip options,
- * removing any old label that may exist.
- */
- if ((err = tsol_update_options(&udp->udp_ip_snd_options,
- &udp->udp_ip_snd_options_len, &udp->udp_label_len,
- opt_storage)) != 0)
+ /* Get a copy of conn_xmit_ipp since the options might change it */
+ ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP);
+ if (ipp == NULL) {
+ ixa_refrele(ixa);
+ BUMP_MIB(&us->us_udp_mib, udpOutErrors);
+ freemsg(mp);
+ return (ENOMEM);
+ }
+ mutex_enter(&connp->conn_lock);
+ error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP);
+ mutex_exit(&connp->conn_lock);
+ if (error != 0) {
+ BUMP_MIB(&us->us_udp_mib, udpOutErrors);
+ freemsg(mp);
goto done;
+ }
/*
- * Save the destination address and creds we used to
- * generate the security label text.
+ * Parse the options and update ixa and ipp as a result.
+ * Note that ixa_tsl can be updated if SCM_UCRED.
+ * ixa_refrele/ixa_inactivate will release any reference on ixa_tsl.
*/
- if (cred != udp->udp_effective_cred) {
- if (udp->udp_effective_cred != NULL)
- crfree(udp->udp_effective_cred);
- crhold(cred);
- udp->udp_effective_cred = cred;
- }
- if (orig_cred != udp->udp_last_cred) {
- if (udp->udp_last_cred != NULL)
- crfree(udp->udp_last_cred);
- crhold(orig_cred);
- udp->udp_last_cred = orig_cred;
- }
-done:
- if (effective_cred != NULL)
- crfree(effective_cred);
- if (err != 0) {
- DTRACE_PROBE4(
- tx__ip__log__info__updatelabel__udp,
- char *, "queue(1) failed to update options(2) on mp(3)",
- queue_t *, wq, char *, opt_storage, mblk_t *, mp);
- }
- return (err);
-}
+ coa = &coas;
+ coa->coa_connp = connp;
+ coa->coa_ixa = ixa;
+ coa->coa_ipp = ipp;
+ coa->coa_ancillary = B_TRUE;
+ coa->coa_changed = 0;
-static mblk_t *
-udp_output_v4(conn_t *connp, mblk_t *mp, ipaddr_t v4dst, uint16_t port,
- uint_t srcid, int *error, boolean_t insert_spi, struct nmsghdr *msg,
- cred_t *cr, pid_t pid)
-{
- udp_t *udp = connp->conn_udp;
- mblk_t *mp1 = mp;
- mblk_t *mp2;
- ipha_t *ipha;
- int ip_hdr_length;
- uint32_t ip_len;
- udpha_t *udpha;
- boolean_t lock_held = B_FALSE;
- in_port_t uha_src_port;
- udpattrs_t attrs;
- uchar_t ip_snd_opt[IP_MAX_OPT_LENGTH];
- uint32_t ip_snd_opt_len = 0;
- ip4_pkt_t pktinfo;
- ip4_pkt_t *pktinfop = &pktinfo;
- ip_opt_info_t optinfo;
- ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
- udp_stack_t *us = udp->udp_us;
- ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec;
- queue_t *q = connp->conn_wq;
- ire_t *ire;
- in6_addr_t v6dst;
- boolean_t update_lastdst = B_FALSE;
-
- *error = 0;
- pktinfop->ip4_ill_index = 0;
- pktinfop->ip4_addr = INADDR_ANY;
- optinfo.ip_opt_flags = 0;
- optinfo.ip_opt_ill_index = 0;
+ if (msg != NULL) {
+ error = process_auxiliary_options(connp, msg->msg_control,
+ msg->msg_controllen, coa, &udp_opt_obj, udp_opt_set, cr);
+ } else {
+ struct T_unitdata_req *tudr;
- if (v4dst == INADDR_ANY)
- v4dst = htonl(INADDR_LOOPBACK);
+ tudr = (struct T_unitdata_req *)tudr_mp->b_rptr;
+ ASSERT(tudr->PRIM_type == T_UNITDATA_REQ);
+ error = tpi_optcom_buf(connp->conn_wq, tudr_mp,
+ &tudr->OPT_length, tudr->OPT_offset, cr, &udp_opt_obj,
+ coa, &is_absreq_failure);
+ }
+ if (error != 0) {
+ /*
+ * Note: No special action needed in this
+ * module for "is_absreq_failure"
+ */
+ freemsg(mp);
+ BUMP_MIB(&us->us_udp_mib, udpOutErrors);
+ goto done;
+ }
+ ASSERT(is_absreq_failure == 0);
+ mutex_enter(&connp->conn_lock);
/*
- * If options passed in, feed it for verification and handling
+ * If laddr is unspecified then we look at sin6_src_id.
+ * We will give precedence to a source address set with IPV6_PKTINFO
+ * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't
+ * want ip_attr_connect to select a source (since it can fail) when
+ * IPV6_PKTINFO is specified.
+ * If this doesn't result in a source address then we get a source
+ * from ip_attr_connect() below.
*/
- attrs.udpattr_credset = B_FALSE;
- if (IPCL_IS_NONSTR(connp)) {
- if (msg->msg_controllen != 0) {
- attrs.udpattr_ipp4 = pktinfop;
- attrs.udpattr_mb = mp;
-
- rw_enter(&udp->udp_rwlock, RW_WRITER);
- *error = process_auxiliary_options(connp,
- msg->msg_control, msg->msg_controllen,
- &attrs, &udp_opt_obj, udp_opt_set, cr);
- rw_exit(&udp->udp_rwlock);
- if (*error)
- goto done;
+ v6src = connp->conn_saddr_v6;
+ if (sin != NULL) {
+ IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst);
+ dstport = sin->sin_port;
+ flowinfo = 0;
+ ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
+ ixa->ixa_flags |= IXAF_IS_IPV4;
+ } else if (sin6 != NULL) {
+ v6dst = sin6->sin6_addr;
+ dstport = sin6->sin6_port;
+ flowinfo = sin6->sin6_flowinfo;
+ srcid = sin6->__sin6_src_id;
+ if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) {
+ ixa->ixa_scopeid = sin6->sin6_scope_id;
+ ixa->ixa_flags |= IXAF_SCOPEID_SET;
+ } else {
+ ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
}
- } else {
- if (DB_TYPE(mp) != M_DATA) {
- mp1 = mp->b_cont;
- if (((struct T_unitdata_req *)
- mp->b_rptr)->OPT_length != 0) {
- attrs.udpattr_ipp4 = pktinfop;
- attrs.udpattr_mb = mp;
- if (udp_unitdata_opt_process(q, mp, error,
- &attrs) < 0)
- goto done;
- /*
- * Note: success in processing options.
- * mp option buffer represented by
- * OPT_length/offset now potentially modified
- * and contain option setting results
- */
- ASSERT(*error == 0);
- }
+ if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
+ ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
+ connp->conn_netstack);
}
+ if (IN6_IS_ADDR_V4MAPPED(&v6dst))
+ ixa->ixa_flags |= IXAF_IS_IPV4;
+ else
+ ixa->ixa_flags &= ~IXAF_IS_IPV4;
+ } else {
+ /* Connected case */
+ v6dst = connp->conn_faddr_v6;
+ dstport = connp->conn_fport;
+ flowinfo = connp->conn_flowinfo;
}
+ mutex_exit(&connp->conn_lock);
- /* mp1 points to the M_DATA mblk carrying the packet */
- ASSERT(mp1 != NULL && DB_TYPE(mp1) == M_DATA);
-
- /*
- * Determine whether we need to mark the mblk with the user's
- * credentials.
- * If labeled then sockfs would have already done this.
- */
- ASSERT(!is_system_labeled() || msg_getcred(mp, NULL) != NULL);
-
- ire = connp->conn_ire_cache;
- if (CLASSD(v4dst) || (ire == NULL) || (ire->ire_addr != v4dst) ||
- (ire->ire_type & (IRE_BROADCAST | IRE_LOCAL | IRE_LOOPBACK))) {
- if (cr != NULL && msg_getcred(mp, NULL) == NULL)
- mblk_setcred(mp, cr, pid);
+ /* Handle IPV6_PKTINFO setting source address. */
+ if (IN6_IS_ADDR_UNSPECIFIED(&v6src) &&
+ (ipp->ipp_fields & IPPF_ADDR)) {
+ if (ixa->ixa_flags & IXAF_IS_IPV4) {
+ if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
+ v6src = ipp->ipp_addr;
+ } else {
+ if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
+ v6src = ipp->ipp_addr;
+ }
}
- rw_enter(&udp->udp_rwlock, RW_READER);
- lock_held = B_TRUE;
+ ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop);
+ error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport,
+ &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST | IPDF_IPSEC);
- /*
- * Cluster and TSOL note:
- * udp.udp_v6lastdst is shared by Cluster and TSOL
- * udp.udp_lastdstport is used by Cluster
- *
- * Both Cluster and TSOL need to update the dest addr and/or port.
- * Updating is done after both Cluster and TSOL checks, protected
- * by conn_lock.
- */
- mutex_enter(&connp->conn_lock);
-
- if (cl_inet_connect2 != NULL &&
- (!IN6_IS_ADDR_V4MAPPED(&udp->udp_v6lastdst) ||
- V4_PART_OF_V6(udp->udp_v6lastdst) != v4dst ||
- udp->udp_lastdstport != port)) {
- mutex_exit(&connp->conn_lock);
- *error = 0;
- IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst);
- CL_INET_UDP_CONNECT(connp, udp, B_TRUE, &v6dst, port, *error);
- if (*error != 0) {
- *error = EHOSTUNREACH;
- goto done;
+ switch (error) {
+ case 0:
+ break;
+ case EADDRNOTAVAIL:
+ /*
+ * IXAF_VERIFY_SOURCE tells us to pick a better source.
+ * Don't have the application see that errno
+ */
+ error = ENETUNREACH;
+ goto failed;
+ case ENETDOWN:
+ /*
+ * Have !ipif_addr_ready address; drop packet silently
+ * until we can get applications to not send until we
+ * are ready.
+ */
+ error = 0;
+ goto failed;
+ case EHOSTUNREACH:
+ case ENETUNREACH:
+ if (ixa->ixa_ire != NULL) {
+ /*
+ * Let conn_ip_output/ire_send_noroute return
+ * the error and send any local ICMP error.
+ */
+ error = 0;
+ break;
}
- update_lastdst = B_TRUE;
- mutex_enter(&connp->conn_lock);
+ /* FALLTHRU */
+ default:
+ failed:
+ freemsg(mp);
+ BUMP_MIB(&us->us_udp_mib, udpOutErrors);
+ goto done;
}
/*
- * Check if our saved options are valid; update if not.
- * TSOL Note: Since we are not in WRITER mode, UDP packets
- * to different destination may require different labels,
- * or worse, UDP packets to same IP address may require
- * different labels due to use of shared all-zones address.
- * We use conn_lock to ensure that lastdst, ip_snd_options,
- * and ip_snd_options_len are consistent for the current
- * destination and are updated atomically.
+ * We might be going to a different destination than last time,
+ * thus check that TX allows the communication and compute any
+ * needed label.
+ *
+ * TSOL Note: We have an exclusive ipp and ixa for this thread so we
+ * don't have to worry about concurrent threads.
*/
if (is_system_labeled()) {
- cred_t *credp;
- pid_t cpid;
-
/* Using UDP MLP requires SCM_UCRED from user */
if (connp->conn_mlp_type != mlptSingle &&
- !attrs.udpattr_credset) {
- mutex_exit(&connp->conn_lock);
- DTRACE_PROBE4(
- tx__ip__log__info__output__udp,
- char *, "MLP mp(1) lacks SCM_UCRED attr(2) on q(3)",
- mblk_t *, mp, udpattrs_t *, &attrs, queue_t *, q);
- *error = EINVAL;
+ !((ixa->ixa_flags & IXAF_UCRED_TSL))) {
+ BUMP_MIB(&us->us_udp_mib, udpOutErrors);
+ error = ECONNREFUSED;
+ freemsg(mp);
goto done;
}
/*
- * Update label option for this UDP socket if
- * - the destination has changed,
- * - the UDP socket is MLP, or
- * - the cred attached to the mblk changed.
+ * Check whether Trusted Solaris policy allows communication
+ * with this host, and pretend that the destination is
+ * unreachable if not.
+ * Compute any needed label and place it in ipp_label_v4/v6.
+ *
+ * Later conn_build_hdr_template/conn_prepend_hdr takes
+ * ipp_label_v4/v6 to form the packet.
+ *
+ * Tsol note: We have ipp structure local to this thread so
+ * no locking is needed.
*/
- credp = msg_getcred(mp, &cpid);
- if (!IN6_IS_ADDR_V4MAPPED(&udp->udp_v6lastdst) ||
- V4_PART_OF_V6(udp->udp_v6lastdst) != v4dst ||
- connp->conn_mlp_type != mlptSingle ||
- credp != udp->udp_last_cred) {
- if ((*error = udp_update_label(q, mp, v4dst)) != 0) {
- mutex_exit(&connp->conn_lock);
- goto done;
- }
- update_lastdst = B_TRUE;
+ error = conn_update_label(connp, ixa, &v6dst, ipp);
+ if (error != 0) {
+ freemsg(mp);
+ BUMP_MIB(&us->us_udp_mib, udpOutErrors);
+ goto done;
}
-
- /*
- * Attach the effective cred to the mblk to ensure future
- * routing decisions will be based on it's label.
- */
- mblk_setcred(mp, udp->udp_effective_cred, cpid);
}
- if (update_lastdst) {
- IN6_IPADDR_TO_V4MAPPED(v4dst, &udp->udp_v6lastdst);
- udp->udp_lastdstport = port;
+ mp = udp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, dstport,
+ flowinfo, mp, &error);
+ if (mp == NULL) {
+ ASSERT(error != 0);
+ BUMP_MIB(&us->us_udp_mib, udpOutErrors);
+ goto done;
}
- if (udp->udp_ip_snd_options_len > 0) {
- ip_snd_opt_len = udp->udp_ip_snd_options_len;
- bcopy(udp->udp_ip_snd_options, ip_snd_opt, ip_snd_opt_len);
+ if (ixa->ixa_pktlen > IP_MAXPACKET) {
+ error = EMSGSIZE;
+ BUMP_MIB(&us->us_udp_mib, udpOutErrors);
+ freemsg(mp);
+ goto done;
}
- mutex_exit(&connp->conn_lock);
+ /* We're done. Pass the packet to ip. */
+ BUMP_MIB(&us->us_udp_mib, udpHCOutDatagrams);
- /* Add an IP header */
- ip_hdr_length = IP_SIMPLE_HDR_LENGTH + UDPH_SIZE + ip_snd_opt_len +
- (insert_spi ? sizeof (uint32_t) : 0);
- ipha = (ipha_t *)&mp1->b_rptr[-ip_hdr_length];
- if (DB_REF(mp1) != 1 || (uchar_t *)ipha < DB_BASE(mp1) ||
- !OK_32PTR(ipha)) {
- mp2 = allocb(ip_hdr_length + us->us_wroff_extra, BPRI_LO);
- if (mp2 == NULL) {
- TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
- "udp_wput_end: q %p (%S)", q, "allocbfail2");
- *error = ENOMEM;
- goto done;
- }
- mp2->b_wptr = DB_LIM(mp2);
- mp2->b_cont = mp1;
- mp1 = mp2;
- if (DB_TYPE(mp) != M_DATA)
- mp->b_cont = mp1;
- else
- mp = mp1;
- ipha = (ipha_t *)(mp1->b_wptr - ip_hdr_length);
- }
- ip_hdr_length -= (UDPH_SIZE + (insert_spi ? sizeof (uint32_t) : 0));
-#ifdef _BIG_ENDIAN
- /* Set version, header length, and tos */
- *(uint16_t *)&ipha->ipha_version_and_hdr_length =
- ((((IP_VERSION << 4) | (ip_hdr_length>>2)) << 8) |
- udp->udp_type_of_service);
- /* Set ttl and protocol */
- *(uint16_t *)&ipha->ipha_ttl = (udp->udp_ttl << 8) | IPPROTO_UDP;
-#else
- /* Set version, header length, and tos */
- *(uint16_t *)&ipha->ipha_version_and_hdr_length =
- ((udp->udp_type_of_service << 8) |
- ((IP_VERSION << 4) | (ip_hdr_length>>2)));
- /* Set ttl and protocol */
- *(uint16_t *)&ipha->ipha_ttl = (IPPROTO_UDP << 8) | udp->udp_ttl;
-#endif
- if (pktinfop->ip4_addr != INADDR_ANY) {
- ipha->ipha_src = pktinfop->ip4_addr;
- optinfo.ip_opt_flags = IP_VERIFY_SRC;
- } else {
+ error = conn_ip_output(mp, ixa);
+ /* No udpOutErrors if an error since IP increases its error counter */
+ switch (error) {
+ case 0:
+ break;
+ case EWOULDBLOCK:
+ (void) ixa_check_drain_insert(connp, ixa);
+ error = 0;
+ break;
+ case EADDRNOTAVAIL:
/*
- * Copy our address into the packet. If this is zero,
- * first look at __sin6_src_id for a hint. If we leave the
- * source as INADDR_ANY then ip will fill in the real source
- * address.
+ * IXAF_VERIFY_SOURCE tells us to pick a better source.
+ * Don't have the application see that errno
*/
- IN6_V4MAPPED_TO_IPADDR(&udp->udp_v6src, ipha->ipha_src);
- if (srcid != 0 && ipha->ipha_src == INADDR_ANY) {
- in6_addr_t v6src;
-
- ip_srcid_find_id(srcid, &v6src, connp->conn_zoneid,
- us->us_netstack);
- IN6_V4MAPPED_TO_IPADDR(&v6src, ipha->ipha_src);
- }
- }
- uha_src_port = udp->udp_port;
- if (ip_hdr_length == IP_SIMPLE_HDR_LENGTH) {
- rw_exit(&udp->udp_rwlock);
- lock_held = B_FALSE;
- }
-
- if (pktinfop->ip4_ill_index != 0) {
- optinfo.ip_opt_ill_index = pktinfop->ip4_ill_index;
+ error = ENETUNREACH;
+ /* FALLTHRU */
+ default:
+ mutex_enter(&connp->conn_lock);
+ /*
+ * Clear the source and v6lastdst so we call ip_attr_connect
+ * for the next packet and try to pick a better source.
+ */
+ if (connp->conn_mcbc_bind)
+ connp->conn_saddr_v6 = ipv6_all_zeros;
+ else
+ connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
+ connp->conn_v6lastdst = ipv6_all_zeros;
+ mutex_exit(&connp->conn_lock);
+ break;
}
+done:
+ ixa_refrele(ixa);
+ ip_pkt_free(ipp);
+ kmem_free(ipp, sizeof (*ipp));
+ return (error);
+}
- ipha->ipha_fragment_offset_and_flags = 0;
- ipha->ipha_ident = 0;
-
- mp1->b_rptr = (uchar_t *)ipha;
-
- ASSERT((uintptr_t)(mp1->b_wptr - (uchar_t *)ipha) <=
- (uintptr_t)UINT_MAX);
+/*
+ * Handle sending an M_DATA for a connected socket.
+ * Handles both IPv4 and IPv6.
+ */
+static int
+udp_output_connected(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid)
+{
+ udp_t *udp = connp->conn_udp;
+ udp_stack_t *us = udp->udp_us;
+ int error;
+ ip_xmit_attr_t *ixa;
- /* Determine length of packet */
- ip_len = (uint32_t)(mp1->b_wptr - (uchar_t *)ipha);
- if ((mp2 = mp1->b_cont) != NULL) {
- do {
- ASSERT((uintptr_t)MBLKL(mp2) <= (uintptr_t)UINT_MAX);
- ip_len += (uint32_t)MBLKL(mp2);
- } while ((mp2 = mp2->b_cont) != NULL);
- }
/*
- * If the size of the packet is greater than the maximum allowed by
- * ip, return an error. Passing this down could cause panics because
- * the size will have wrapped and be inconsistent with the msg size.
+ * If no other thread is using conn_ixa this just gets a reference to
+ * conn_ixa. Otherwise we get a safe copy of conn_ixa.
*/
- if (ip_len > IP_MAXPACKET) {
- TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
- "udp_wput_end: q %p (%S)", q, "IP length exceeded");
- *error = EMSGSIZE;
- goto done;
+ ixa = conn_get_ixa(connp, B_FALSE);
+ if (ixa == NULL) {
+ BUMP_MIB(&us->us_udp_mib, udpOutErrors);
+ freemsg(mp);
+ return (ENOMEM);
}
- ipha->ipha_length = htons((uint16_t)ip_len);
- ip_len -= ip_hdr_length;
- ip_len = htons((uint16_t)ip_len);
- udpha = (udpha_t *)(((uchar_t *)ipha) + ip_hdr_length);
-
- /* Insert all-0s SPI now. */
- if (insert_spi)
- *((uint32_t *)(udpha + 1)) = 0;
- /*
- * Copy in the destination address
- */
- ipha->ipha_dst = v4dst;
-
- /*
- * Set ttl based on IP_MULTICAST_TTL to match IPv6 logic.
- */
- if (CLASSD(v4dst))
- ipha->ipha_ttl = udp->udp_multicast_ttl;
-
- udpha->uha_dst_port = port;
- udpha->uha_src_port = uha_src_port;
+ ASSERT(cr != NULL);
+ ixa->ixa_cred = cr;
+ ixa->ixa_cpid = pid;
- if (ip_snd_opt_len > 0) {
- uint32_t cksum;
+ mutex_enter(&connp->conn_lock);
+ mp = udp_prepend_header_template(connp, ixa, mp, &connp->conn_saddr_v6,
+ connp->conn_fport, connp->conn_flowinfo, &error);
- bcopy(ip_snd_opt, &ipha[1], ip_snd_opt_len);
- lock_held = B_FALSE;
- rw_exit(&udp->udp_rwlock);
- /*
- * Massage source route putting first source route in ipha_dst.
- * Ignore the destination in T_unitdata_req.
- * Create a checksum adjustment for a source route, if any.
- */
- cksum = ip_massage_options(ipha, us->us_netstack);
- cksum = (cksum & 0xFFFF) + (cksum >> 16);
- cksum -= ((ipha->ipha_dst >> 16) & 0xFFFF) +
- (ipha->ipha_dst & 0xFFFF);
- if ((int)cksum < 0)
- cksum--;
- cksum = (cksum & 0xFFFF) + (cksum >> 16);
- /*
- * IP does the checksum if uha_checksum is non-zero,
- * We make it easy for IP to include our pseudo header
- * by putting our length in uha_checksum.
- */
- cksum += ip_len;
- cksum = (cksum & 0xFFFF) + (cksum >> 16);
- /* There might be a carry. */
- cksum = (cksum & 0xFFFF) + (cksum >> 16);
-#ifdef _LITTLE_ENDIAN
- if (us->us_do_checksum)
- ip_len = (cksum << 16) | ip_len;
-#else
- if (us->us_do_checksum)
- ip_len = (ip_len << 16) | cksum;
- else
- ip_len <<= 16;
-#endif
- } else {
- /*
- * IP does the checksum if uha_checksum is non-zero,
- * We make it easy for IP to include our pseudo header
- * by putting our length in uha_checksum.
- */
- if (us->us_do_checksum)
- ip_len |= (ip_len << 16);
-#ifndef _LITTLE_ENDIAN
- else
- ip_len <<= 16;
-#endif
+ if (mp == NULL) {
+ ASSERT(error != 0);
+ mutex_exit(&connp->conn_lock);
+ ixa_refrele(ixa);
+ BUMP_MIB(&us->us_udp_mib, udpOutErrors);
+ freemsg(mp);
+ return (error);
}
- ASSERT(!lock_held);
- /* Set UDP length and checksum */
- *((uint32_t *)&udpha->uha_length) = ip_len;
- if (DB_TYPE(mp) != M_DATA) {
- cred_t *cr;
- pid_t cpid;
+ /*
+ * In case we got a safe copy of conn_ixa, or if opt_set made us a new
+ * safe copy, then we need to fill in any pointers in it.
+ */
+ if (ixa->ixa_ire == NULL) {
+ in6_addr_t faddr, saddr;
+ in6_addr_t nexthop;
+ in_port_t fport;
+
+ saddr = connp->conn_saddr_v6;
+ faddr = connp->conn_faddr_v6;
+ fport = connp->conn_fport;
+ ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &faddr, &nexthop);
+ mutex_exit(&connp->conn_lock);
- /* Move any cred from the T_UNITDATA_REQ to the packet */
- cr = msg_extractcred(mp, &cpid);
- if (cr != NULL) {
- if (mp1->b_datap->db_credp != NULL)
- crfree(mp1->b_datap->db_credp);
- mp1->b_datap->db_credp = cr;
- mp1->b_datap->db_cpid = cpid;
+ error = ip_attr_connect(connp, ixa, &saddr, &faddr, &nexthop,
+ fport, NULL, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST |
+ IPDF_IPSEC);
+ switch (error) {
+ case 0:
+ break;
+ case EADDRNOTAVAIL:
+ /*
+ * IXAF_VERIFY_SOURCE tells us to pick a better source.
+ * Don't have the application see that errno
+ */
+ error = ENETUNREACH;
+ goto failed;
+ case ENETDOWN:
+ /*
+ * Have !ipif_addr_ready address; drop packet silently
+ * until we can get applications to not send until we
+ * are ready.
+ */
+ error = 0;
+ goto failed;
+ case EHOSTUNREACH:
+ case ENETUNREACH:
+ if (ixa->ixa_ire != NULL) {
+ /*
+ * Let conn_ip_output/ire_send_noroute return
+ * the error and send any local ICMP error.
+ */
+ error = 0;
+ break;
+ }
+ /* FALLTHRU */
+ default:
+ failed:
+ ixa_refrele(ixa);
+ freemsg(mp);
+ BUMP_MIB(&us->us_udp_mib, udpOutErrors);
+ return (error);
}
- ASSERT(mp != mp1);
- freeb(mp);
+ } else {
+ /* Done with conn_t */
+ mutex_exit(&connp->conn_lock);
}
-
- /* mp has been consumed and we'll return success */
- ASSERT(*error == 0);
- mp = NULL;
+ ASSERT(ixa->ixa_ire != NULL);
/* We're done. Pass the packet to ip. */
BUMP_MIB(&us->us_udp_mib, udpHCOutDatagrams);
- TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
- "udp_wput_end: q %p (%S)", q, "end");
-
- if ((connp->conn_flags & IPCL_CHECK_POLICY) != 0 ||
- CONN_OUTBOUND_POLICY_PRESENT(connp, ipss) ||
- connp->conn_dontroute ||
- connp->conn_outgoing_ill != NULL || optinfo.ip_opt_flags != 0 ||
- optinfo.ip_opt_ill_index != 0 ||
- ipha->ipha_version_and_hdr_length != IP_SIMPLE_HDR_VERSION ||
- IPP_ENABLED(IPP_LOCAL_OUT, ipst) ||
- ipst->ips_ip_g_mrouter != NULL) {
- UDP_STAT(us, udp_ip_send);
- ip_output_options(connp, mp1, connp->conn_wq, IP_WPUT,
- &optinfo);
- } else {
- udp_send_data(udp, connp->conn_wq, mp1, ipha);
- }
-done:
- if (lock_held)
- rw_exit(&udp->udp_rwlock);
- if (*error != 0) {
- ASSERT(mp != NULL);
- BUMP_MIB(&us->us_udp_mib, udpOutErrors);
+ error = conn_ip_output(mp, ixa);
+ /* No udpOutErrors if an error since IP increases its error counter */
+ switch (error) {
+ case 0:
+ break;
+ case EWOULDBLOCK:
+ (void) ixa_check_drain_insert(connp, ixa);
+ error = 0;
+ break;
+ case EADDRNOTAVAIL:
+ /*
+ * IXAF_VERIFY_SOURCE tells us to pick a better source.
+ * Don't have the application see that errno
+ */
+ error = ENETUNREACH;
+ break;
}
- return (mp);
+ ixa_refrele(ixa);
+ return (error);
}
-static void
-udp_send_data(udp_t *udp, queue_t *q, mblk_t *mp, ipha_t *ipha)
+/*
+ * Handle sending an M_DATA to the last destination.
+ * Handles both IPv4 and IPv6.
+ *
+ * NOTE: The caller must hold conn_lock and we drop it here.
+ */
+static int
+udp_output_lastdst(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid,
+ ip_xmit_attr_t *ixa)
{
- conn_t *connp = udp->udp_connp;
- ipaddr_t src, dst;
- ire_t *ire;
- ipif_t *ipif = NULL;
- mblk_t *ire_fp_mp;
- boolean_t retry_caching;
- udp_stack_t *us = udp->udp_us;
- ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
-
- dst = ipha->ipha_dst;
- src = ipha->ipha_src;
- ASSERT(ipha->ipha_ident == 0);
-
- if (CLASSD(dst)) {
- int err;
-
- ipif = conn_get_held_ipif(connp,
- &connp->conn_multicast_ipif, &err);
-
- if (ipif == NULL || ipif->ipif_isv6 ||
- (ipif->ipif_ill->ill_phyint->phyint_flags &
- PHYI_LOOPBACK)) {
- if (ipif != NULL)
- ipif_refrele(ipif);
- UDP_STAT(us, udp_ip_send);
- ip_output(connp, mp, q, IP_WPUT);
- return;
- }
- }
+ udp_t *udp = connp->conn_udp;
+ udp_stack_t *us = udp->udp_us;
+ int error;
- retry_caching = B_FALSE;
- mutex_enter(&connp->conn_lock);
- ire = connp->conn_ire_cache;
- ASSERT(!(connp->conn_state_flags & CONN_INCIPIENT));
+ ASSERT(MUTEX_HELD(&connp->conn_lock));
+ ASSERT(ixa != NULL);
- if (ire == NULL || ire->ire_addr != dst ||
- (ire->ire_marks & IRE_MARK_CONDEMNED)) {
- retry_caching = B_TRUE;
- } else if (CLASSD(dst) && (ire->ire_type & IRE_CACHE)) {
- ill_t *stq_ill = (ill_t *)ire->ire_stq->q_ptr;
+ ASSERT(cr != NULL);
+ ixa->ixa_cred = cr;
+ ixa->ixa_cpid = pid;
- ASSERT(ipif != NULL);
- if (!IS_ON_SAME_LAN(stq_ill, ipif->ipif_ill))
- retry_caching = B_TRUE;
- }
+ mp = udp_prepend_header_template(connp, ixa, mp, &connp->conn_v6lastsrc,
+ connp->conn_lastdstport, connp->conn_lastflowinfo, &error);
- if (!retry_caching) {
- ASSERT(ire != NULL);
- IRE_REFHOLD(ire);
+ if (mp == NULL) {
+ ASSERT(error != 0);
mutex_exit(&connp->conn_lock);
- } else {
- boolean_t cached = B_FALSE;
+ ixa_refrele(ixa);
+ BUMP_MIB(&us->us_udp_mib, udpOutErrors);
+ freemsg(mp);
+ return (error);
+ }
- connp->conn_ire_cache = NULL;
+ /*
+ * In case we got a safe copy of conn_ixa, or if opt_set made us a new
+ * safe copy, then we need to fill in any pointers in it.
+ */
+ if (ixa->ixa_ire == NULL) {
+ in6_addr_t lastdst, lastsrc;
+ in6_addr_t nexthop;
+ in_port_t lastport;
+
+ lastsrc = connp->conn_v6lastsrc;
+ lastdst = connp->conn_v6lastdst;
+ lastport = connp->conn_lastdstport;
+ ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &lastdst, &nexthop);
mutex_exit(&connp->conn_lock);
- /* Release the old ire */
- if (ire != NULL) {
- IRE_REFRELE_NOTR(ire);
- ire = NULL;
- }
-
- if (CLASSD(dst)) {
- ASSERT(ipif != NULL);
- ire = ire_ctable_lookup(dst, 0, 0, ipif,
- connp->conn_zoneid, msg_getlabel(mp),
- MATCH_IRE_ILL, ipst);
- } else {
- ASSERT(ipif == NULL);
- ire = ire_cache_lookup(dst, connp->conn_zoneid,
- msg_getlabel(mp), ipst);
- }
-
- if (ire == NULL) {
- if (ipif != NULL)
- ipif_refrele(ipif);
- UDP_STAT(us, udp_ire_null);
- ip_output(connp, mp, q, IP_WPUT);
- return;
- }
- IRE_REFHOLD_NOTR(ire);
-
- mutex_enter(&connp->conn_lock);
- if (CONN_CACHE_IRE(connp) && connp->conn_ire_cache == NULL &&
- !(ire->ire_marks & IRE_MARK_CONDEMNED)) {
- irb_t *irb = ire->ire_bucket;
-
+ error = ip_attr_connect(connp, ixa, &lastsrc, &lastdst,
+ &nexthop, lastport, NULL, NULL, IPDF_ALLOW_MCBC |
+ IPDF_VERIFY_DST | IPDF_IPSEC);
+ switch (error) {
+ case 0:
+ break;
+ case EADDRNOTAVAIL:
/*
- * IRE's created for non-connection oriented transports
- * are normally initialized with IRE_MARK_TEMPORARY set
- * in the ire_marks. These IRE's are preferentially
- * reaped when the hash chain length in the cache
- * bucket exceeds the maximum value specified in
- * ip[6]_ire_max_bucket_cnt. This can severely affect
- * UDP performance if IRE cache entries that we need
- * to reuse are continually removed. To remedy this,
- * when we cache the IRE in the conn_t, we remove the
- * IRE_MARK_TEMPORARY bit from the ire_marks if it was
- * set.
+ * IXAF_VERIFY_SOURCE tells us to pick a better source.
+ * Don't have the application see that errno
*/
- if (ire->ire_marks & IRE_MARK_TEMPORARY) {
- rw_enter(&irb->irb_lock, RW_WRITER);
- if (ire->ire_marks & IRE_MARK_TEMPORARY) {
- ire->ire_marks &= ~IRE_MARK_TEMPORARY;
- irb->irb_tmp_ire_cnt--;
- }
- rw_exit(&irb->irb_lock);
+ error = ENETUNREACH;
+ goto failed;
+ case ENETDOWN:
+ /*
+ * Have !ipif_addr_ready address; drop packet silently
+ * until we can get applications to not send until we
+ * are ready.
+ */
+ error = 0;
+ goto failed;
+ case EHOSTUNREACH:
+ case ENETUNREACH:
+ if (ixa->ixa_ire != NULL) {
+ /*
+ * Let conn_ip_output/ire_send_noroute return
+ * the error and send any local ICMP error.
+ */
+ error = 0;
+ break;
}
- connp->conn_ire_cache = ire;
- cached = B_TRUE;
+ /* FALLTHRU */
+ default:
+ failed:
+ ixa_refrele(ixa);
+ freemsg(mp);
+ BUMP_MIB(&us->us_udp_mib, udpOutErrors);
+ return (error);
}
+ } else {
+ /* Done with conn_t */
mutex_exit(&connp->conn_lock);
-
- /*
- * We can continue to use the ire but since it was not
- * cached, we should drop the extra reference.
- */
- if (!cached)
- IRE_REFRELE_NOTR(ire);
}
- ASSERT(ire != NULL && ire->ire_ipversion == IPV4_VERSION);
- ASSERT(!CLASSD(dst) || ipif != NULL);
- /*
- * Check if we can take the fast-path.
- * Note that "incomplete" ire's (where the link-layer for next hop
- * is not resolved, or where the fast-path header in nce_fp_mp is not
- * available yet) are sent down the legacy (slow) path
- */
- if ((ire->ire_type & (IRE_BROADCAST|IRE_LOCAL|IRE_LOOPBACK)) ||
- (ire->ire_flags & RTF_MULTIRT) || (ire->ire_stq == NULL) ||
- (ire->ire_max_frag < ntohs(ipha->ipha_length)) ||
- ((ire->ire_nce == NULL) ||
- ((ire_fp_mp = ire->ire_nce->nce_fp_mp) == NULL)) ||
- connp->conn_nexthop_set || (MBLKL(ire_fp_mp) > MBLKHEAD(mp))) {
- if (ipif != NULL)
- ipif_refrele(ipif);
- UDP_STAT(us, udp_ip_ire_send);
- IRE_REFRELE(ire);
- ip_output(connp, mp, q, IP_WPUT);
- return;
- }
+ /* We're done. Pass the packet to ip. */
+ BUMP_MIB(&us->us_udp_mib, udpHCOutDatagrams);
- if (src == INADDR_ANY && !connp->conn_unspec_src) {
- if (CLASSD(dst) && !(ire->ire_flags & RTF_SETSRC))
- ipha->ipha_src = ipif->ipif_src_addr;
+ error = conn_ip_output(mp, ixa);
+ /* No udpOutErrors if an error since IP increases its error counter */
+ switch (error) {
+ case 0:
+ break;
+ case EWOULDBLOCK:
+ (void) ixa_check_drain_insert(connp, ixa);
+ error = 0;
+ break;
+ case EADDRNOTAVAIL:
+ /*
+ * IXAF_VERIFY_SOURCE tells us to pick a better source.
+ * Don't have the application see that errno
+ */
+ error = ENETUNREACH;
+ /* FALLTHRU */
+ default:
+ mutex_enter(&connp->conn_lock);
+ /*
+ * Clear the source and v6lastdst so we call ip_attr_connect
+ * for the next packet and try to pick a better source.
+ */
+ if (connp->conn_mcbc_bind)
+ connp->conn_saddr_v6 = ipv6_all_zeros;
else
- ipha->ipha_src = ire->ire_src_addr;
+ connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
+ connp->conn_v6lastdst = ipv6_all_zeros;
+ mutex_exit(&connp->conn_lock);
+ break;
}
-
- if (ipif != NULL)
- ipif_refrele(ipif);
-
- udp_xmit(connp->conn_wq, mp, ire, connp, connp->conn_zoneid);
+ ixa_refrele(ixa);
+ return (error);
}
-static void
-udp_xmit(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, zoneid_t zoneid)
+
+/*
+ * Prepend the header template and then fill in the source and
+ * flowinfo. The caller needs to handle the destination address since
+ * it's setting is different if rthdr or source route.
+ *
+ * Returns NULL if allocation failed or if the packet would exceed IP_MAXPACKET.
+ * When it returns NULL it sets errorp.
+ */
+static mblk_t *
+udp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp,
+ const in6_addr_t *v6src, in_port_t dstport, uint32_t flowinfo, int *errorp)
{
- ipaddr_t src, dst;
- ill_t *ill;
- mblk_t *ire_fp_mp;
- uint_t ire_fp_mp_len;
- uint16_t *up;
- uint32_t cksum, hcksum_txflags;
- queue_t *dev_q;
- udp_t *udp = connp->conn_udp;
- ipha_t *ipha = (ipha_t *)mp->b_rptr;
+ udp_t *udp = connp->conn_udp;
udp_stack_t *us = udp->udp_us;
- ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
- boolean_t ll_multicast = B_FALSE;
- boolean_t direct_send;
-
- dev_q = ire->ire_stq->q_next;
- ASSERT(dev_q != NULL);
+ boolean_t insert_spi = udp->udp_nat_t_endpoint;
+ uint_t pktlen;
+ uint_t alloclen;
+ uint_t copylen;
+ uint8_t *iph;
+ uint_t ip_hdr_length;
+ udpha_t *udpha;
+ uint32_t cksum;
+ ip_pkt_t *ipp;
- ill = ire_to_ill(ire);
- ASSERT(ill != NULL);
+ ASSERT(MUTEX_HELD(&connp->conn_lock));
/*
- * For the direct send case, if resetting of conn_direct_blocked
- * was missed, it is still ok because the putq() would enable
- * the queue and write service will drain it out.
+ * Copy the header template and leave space for an SPI
*/
- direct_send = ILL_DIRECT_CAPABLE(ill);
-
- /* is queue flow controlled? */
- if ((!direct_send) && (q->q_first != NULL || connp->conn_draining ||
- DEV_Q_FLOW_BLOCKED(dev_q))) {
- BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
- BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
- if (ipst->ips_ip_output_queue) {
- DTRACE_PROBE1(udp__xmit__putq, conn_t *, connp);
- (void) putq(connp->conn_wq, mp);
- } else {
- freemsg(mp);
- }
- ire_refrele(ire);
- return;
- }
-
- ire_fp_mp = ire->ire_nce->nce_fp_mp;
- ire_fp_mp_len = MBLKL(ire_fp_mp);
- ASSERT(MBLKHEAD(mp) >= ire_fp_mp_len);
-
- dst = ipha->ipha_dst;
- src = ipha->ipha_src;
-
-
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests);
-
- ipha->ipha_ident = (uint16_t)atomic_add_32_nv(&ire->ire_ident, 1);
-#ifndef _BIG_ENDIAN
- ipha->ipha_ident = (ipha->ipha_ident << 8) | (ipha->ipha_ident >> 8);
-#endif
-
- if (ILL_HCKSUM_CAPABLE(ill) && dohwcksum) {
- ASSERT(ill->ill_hcksum_capab != NULL);
- hcksum_txflags = ill->ill_hcksum_capab->ill_hcksum_txflags;
- } else {
- hcksum_txflags = 0;
- }
-
- /* pseudo-header checksum (do it in parts for IP header checksum) */
- cksum = (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF);
-
- ASSERT(ipha->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION);
- up = IPH_UDPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH);
- if (*up != 0) {
- IP_CKSUM_XMIT_FAST(ire->ire_ipversion, hcksum_txflags,
- mp, ipha, up, IPPROTO_UDP, IP_SIMPLE_HDR_LENGTH,
- ntohs(ipha->ipha_length), cksum);
-
- /* Software checksum? */
- if (DB_CKSUMFLAGS(mp) == 0) {
- UDP_STAT(us, udp_out_sw_cksum);
- UDP_STAT_UPDATE(us, udp_out_sw_cksum_bytes,
- ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH);
- }
- }
-
- if (!CLASSD(dst)) {
- ipha->ipha_fragment_offset_and_flags |=
- (uint32_t)htons(ire->ire_frag_flag);
- }
-
- /* Calculate IP header checksum if hardware isn't capable */
- if (!(DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM)) {
- IP_HDR_CKSUM(ipha, cksum, ((uint32_t *)ipha)[0],
- ((uint16_t *)ipha)[4]);
+ copylen = connp->conn_ht_iphc_len;
+ alloclen = copylen + (insert_spi ? sizeof (uint32_t) : 0);
+ pktlen = alloclen + msgdsize(mp);
+ if (pktlen > IP_MAXPACKET) {
+ freemsg(mp);
+ *errorp = EMSGSIZE;
+ return (NULL);
}
+ ixa->ixa_pktlen = pktlen;
- if (CLASSD(dst)) {
- if (ilm_lookup_ill(ill, dst, ALL_ZONES) != NULL) {
- ip_multicast_loopback(q, ill, mp,
- connp->conn_multicast_loop ? 0 :
- IP_FF_NO_MCAST_LOOP, zoneid);
- }
+ /* check/fix buffer config, setup pointers into it */
+ iph = mp->b_rptr - alloclen;
+ if (DB_REF(mp) != 1 || iph < DB_BASE(mp) || !OK_32PTR(iph)) {
+ mblk_t *mp1;
- /* If multicast TTL is 0 then we are done */
- if (ipha->ipha_ttl == 0) {
+ mp1 = allocb(alloclen + us->us_wroff_extra, BPRI_MED);
+ if (mp1 == NULL) {
freemsg(mp);
- ire_refrele(ire);
- return;
+ *errorp = ENOMEM;
+ return (NULL);
}
- ll_multicast = B_TRUE;
+ mp1->b_wptr = DB_LIM(mp1);
+ mp1->b_cont = mp;
+ mp = mp1;
+ iph = (mp->b_wptr - alloclen);
}
+ mp->b_rptr = iph;
+ bcopy(connp->conn_ht_iphc, iph, copylen);
+ ip_hdr_length = (uint_t)(connp->conn_ht_ulp - connp->conn_ht_iphc);
- ASSERT(DB_TYPE(ire_fp_mp) == M_DATA);
- mp->b_rptr = (uchar_t *)ipha - ire_fp_mp_len;
- bcopy(ire_fp_mp->b_rptr, mp->b_rptr, ire_fp_mp_len);
-
- UPDATE_OB_PKT_COUNT(ire);
- ire->ire_last_used_time = lbolt;
-
- BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutTransmits);
- UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets,
- ntohs(ipha->ipha_length));
+ ixa->ixa_ip_hdr_length = ip_hdr_length;
+ udpha = (udpha_t *)(iph + ip_hdr_length);
- DTRACE_PROBE4(ip4__physical__out__start,
- ill_t *, NULL, ill_t *, ill, ipha_t *, ipha, mblk_t *, mp);
- FW_HOOKS(ipst->ips_ip4_physical_out_event,
- ipst->ips_ipv4firewall_physical_out, NULL, ill, ipha, mp, mp,
- ll_multicast, ipst);
- DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp);
- if (ipst->ips_ip4_observe.he_interested && mp != NULL) {
- zoneid_t szone;
-
- /*
- * Both of these functions expect b_rptr to be
- * where the IP header starts, so advance past the
- * link layer header if present.
- */
- mp->b_rptr += ire_fp_mp_len;
- szone = ip_get_zoneid_v4(ipha->ipha_src, mp,
- ipst, ALL_ZONES);
- ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone,
- ALL_ZONES, ill, ipst);
- mp->b_rptr -= ire_fp_mp_len;
- }
+ /*
+ * Setup header length and prepare for ULP checksum done in IP.
+ * udp_build_hdr_template has already massaged any routing header
+ * and placed the result in conn_sum.
+ *
+ * We make it easy for IP to include our pseudo header
+ * by putting our length in uha_checksum.
+ */
+ cksum = pktlen - ip_hdr_length;
+ udpha->uha_length = htons(cksum);
- if (mp == NULL)
- goto bail;
+ cksum += connp->conn_sum;
+ cksum = (cksum >> 16) + (cksum & 0xFFFF);
+ ASSERT(cksum < 0x10000);
- DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL,
- void_ip_t *, ipha, __dtrace_ipsr_ill_t *, ill,
- ipha_t *, ipha, ip6_t *, NULL, int, 0);
+ ipp = &connp->conn_xmit_ipp;
+ if (ixa->ixa_flags & IXAF_IS_IPV4) {
+ ipha_t *ipha = (ipha_t *)iph;
- if (direct_send) {
- uintptr_t cookie;
- ill_dld_direct_t *idd = &ill->ill_dld_capab->idc_direct;
+ ipha->ipha_length = htons((uint16_t)pktlen);
- cookie = idd->idd_tx_df(idd->idd_tx_dh, mp,
- (uintptr_t)connp, 0);
- if (cookie != NULL) {
- idl_tx_list_t *idl_txl;
+ /* IP does the checksum if uha_checksum is non-zero */
+ if (us->us_do_checksum)
+ udpha->uha_checksum = htons(cksum);
- /*
- * Flow controlled.
- */
- DTRACE_PROBE2(non__null__cookie, uintptr_t,
- cookie, conn_t *, connp);
- idl_txl = &ipst->ips_idl_tx_list[IDLHASHINDEX(cookie)];
- mutex_enter(&idl_txl->txl_lock);
- /*
- * Check again after holding txl_lock to see if Tx
- * ring is still blocked and only then insert the
- * connp into the drain list.
- */
- if (connp->conn_direct_blocked ||
- (idd->idd_tx_fctl_df(idd->idd_tx_fctl_dh,
- cookie) == 0)) {
- mutex_exit(&idl_txl->txl_lock);
- goto bail;
- }
- if (idl_txl->txl_cookie != NULL &&
- idl_txl->txl_cookie != cookie) {
- DTRACE_PROBE2(udp__xmit__collision,
- uintptr_t, cookie,
- uintptr_t, idl_txl->txl_cookie);
- UDP_STAT(us, udp_cookie_coll);
- } else {
- connp->conn_direct_blocked = B_TRUE;
- idl_txl->txl_cookie = cookie;
- conn_drain_insert(connp, idl_txl);
- DTRACE_PROBE1(udp__xmit__insert,
- conn_t *, connp);
- }
- mutex_exit(&idl_txl->txl_lock);
+ /* if IP_PKTINFO specified an address it wins over bind() */
+ if ((ipp->ipp_fields & IPPF_ADDR) &&
+ IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) {
+ ASSERT(ipp->ipp_addr_v4 != INADDR_ANY);
+ ipha->ipha_src = ipp->ipp_addr_v4;
+ } else {
+ IN6_V4MAPPED_TO_IPADDR(v6src, ipha->ipha_src);
}
} else {
- DTRACE_PROBE1(udp__xmit__putnext, mblk_t *, mp);
- putnext(ire->ire_stq, mp);
- }
-bail:
- IRE_REFRELE(ire);
-}
+ ip6_t *ip6h = (ip6_t *)iph;
-static boolean_t
-udp_update_label_v6(queue_t *wq, mblk_t *mp, in6_addr_t *dst)
-{
- udp_t *udp = Q_TO_UDP(wq);
- int err;
- cred_t *cred;
- cred_t *orig_cred;
- cred_t *effective_cred = NULL;
- uchar_t opt_storage[TSOL_MAX_IPV6_OPTION];
- udp_stack_t *us = udp->udp_us;
-
- /*
- * All Solaris components should pass a db_credp
- * for this message, hence we ASSERT.
- * On production kernels we return an error to be robust against
- * random streams modules sitting on top of us.
- */
- cred = orig_cred = msg_getcred(mp, NULL);
- ASSERT(cred != NULL);
- if (cred == NULL)
- return (EINVAL);
-
- /*
- * Verify the destination is allowed to receive packets at
- * the security label of the message data. tsol_check_dest()
- * may create a new effective cred for this message with a
- * modified label or label flags. Note that we use the
- * cred/label from the message to handle MLP.
- */
- if ((err = tsol_check_dest(cred, dst, IPV6_VERSION,
- udp->udp_connp->conn_mac_mode, &effective_cred)) != 0)
- goto done;
- if (effective_cred != NULL)
- cred = effective_cred;
-
- /*
- * Calculate the security label to be placed in the text
- * of the message (if any).
- */
- if ((err = tsol_compute_label_v6(cred, dst, opt_storage,
- us->us_netstack->netstack_ip)) != 0)
- goto done;
-
- /*
- * Insert the security label in the cached ip options,
- * removing any old label that may exist.
- */
- if ((err = tsol_update_sticky(&udp->udp_sticky_ipp,
- &udp->udp_label_len_v6, opt_storage)) != 0)
- goto done;
+ ip6h->ip6_plen = htons((uint16_t)(pktlen - IPV6_HDR_LEN));
+ udpha->uha_checksum = htons(cksum);
- /*
- * Save the destination address and cred we used to
- * generate the security label text.
- */
- if (cred != udp->udp_effective_cred) {
- if (udp->udp_effective_cred != NULL)
- crfree(udp->udp_effective_cred);
- crhold(cred);
- udp->udp_effective_cred = cred;
- }
- if (orig_cred != udp->udp_last_cred) {
- if (udp->udp_last_cred != NULL)
- crfree(udp->udp_last_cred);
- crhold(orig_cred);
- udp->udp_last_cred = orig_cred;
+ /* if IP_PKTINFO specified an address it wins over bind() */
+ if ((ipp->ipp_fields & IPPF_ADDR) &&
+ !IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) {
+ ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&ipp->ipp_addr));
+ ip6h->ip6_src = ipp->ipp_addr;
+ } else {
+ ip6h->ip6_src = *v6src;
+ }
+ ip6h->ip6_vcf =
+ (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) |
+ (flowinfo & ~IPV6_VERS_AND_FLOW_MASK);
+ if (ipp->ipp_fields & IPPF_TCLASS) {
+ /* Overrides the class part of flowinfo */
+ ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf,
+ ipp->ipp_tclass);
+ }
}
-done:
- if (effective_cred != NULL)
- crfree(effective_cred);
+ /* Insert all-0s SPI now. */
+ if (insert_spi)
+ *((uint32_t *)(udpha + 1)) = 0;
- if (err != 0) {
- DTRACE_PROBE4(
- tx__ip__log__drop__updatelabel__udp6,
- char *, "queue(1) failed to update options(2) on mp(3)",
- queue_t *, wq, char *, opt_storage, mblk_t *, mp);
- }
- return (err);
+ udpha->uha_dst_port = dstport;
+ return (mp);
}
-static int
-udp_send_connected(conn_t *connp, mblk_t *mp, struct nmsghdr *msg, cred_t *cr,
- pid_t pid)
+/*
+ * Send a T_UDERR_IND in response to an M_DATA
+ */
+static void
+udp_ud_err_connected(conn_t *connp, t_scalar_t error)
{
- udp_t *udp = connp->conn_udp;
- udp_stack_t *us = udp->udp_us;
- ipaddr_t v4dst;
- in_port_t dstport;
- boolean_t mapped_addr;
struct sockaddr_storage ss;
sin_t *sin;
sin6_t *sin6;
struct sockaddr *addr;
socklen_t addrlen;
- int error;
- boolean_t insert_spi = udp->udp_nat_t_endpoint;
-
- /* M_DATA for connected socket */
-
- ASSERT(udp->udp_issocket);
- UDP_DBGSTAT(us, udp_data_conn);
+ mblk_t *mp1;
mutex_enter(&connp->conn_lock);
- if (udp->udp_state != TS_DATA_XFER) {
- mutex_exit(&connp->conn_lock);
- BUMP_MIB(&us->us_udp_mib, udpOutErrors);
- UDP_STAT(us, udp_out_err_notconn);
- freemsg(mp);
- TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
- "udp_wput_end: connp %p (%S)", connp,
- "not-connected; address required");
- return (EDESTADDRREQ);
- }
-
- mapped_addr = IN6_IS_ADDR_V4MAPPED(&udp->udp_v6dst);
- if (mapped_addr)
- IN6_V4MAPPED_TO_IPADDR(&udp->udp_v6dst, v4dst);
-
/* Initialize addr and addrlen as if they're passed in */
- if (udp->udp_family == AF_INET) {
+ if (connp->conn_family == AF_INET) {
sin = (sin_t *)&ss;
+ *sin = sin_null;
sin->sin_family = AF_INET;
- dstport = sin->sin_port = udp->udp_dstport;
- ASSERT(mapped_addr);
- sin->sin_addr.s_addr = v4dst;
+ sin->sin_port = connp->conn_fport;
+ sin->sin_addr.s_addr = connp->conn_faddr_v4;
addr = (struct sockaddr *)sin;
addrlen = sizeof (*sin);
} else {
sin6 = (sin6_t *)&ss;
+ *sin6 = sin6_null;
sin6->sin6_family = AF_INET6;
- dstport = sin6->sin6_port = udp->udp_dstport;
- sin6->sin6_flowinfo = udp->udp_flowinfo;
- sin6->sin6_addr = udp->udp_v6dst;
- sin6->sin6_scope_id = 0;
+ sin6->sin6_port = connp->conn_fport;
+ sin6->sin6_flowinfo = connp->conn_flowinfo;
+ sin6->sin6_addr = connp->conn_faddr_v6;
+ if (IN6_IS_ADDR_LINKSCOPE(&connp->conn_faddr_v6) &&
+ (connp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) {
+ sin6->sin6_scope_id = connp->conn_ixa->ixa_scopeid;
+ } else {
+ sin6->sin6_scope_id = 0;
+ }
sin6->__sin6_src_id = 0;
addr = (struct sockaddr *)sin6;
addrlen = sizeof (*sin6);
}
mutex_exit(&connp->conn_lock);
- if (mapped_addr) {
- /*
- * Handle both AF_INET and AF_INET6; the latter
- * for IPV4 mapped destination addresses. Note
- * here that both addr and addrlen point to the
- * corresponding struct depending on the address
- * family of the socket.
- */
- mp = udp_output_v4(connp, mp, v4dst, dstport, 0, &error,
- insert_spi, msg, cr, pid);
- } else {
- mp = udp_output_v6(connp, mp, sin6, &error, msg, cr, pid);
- }
- if (error == 0) {
- ASSERT(mp == NULL);
- return (0);
- }
-
- UDP_STAT(us, udp_out_err_output);
- ASSERT(mp != NULL);
- if (IPCL_IS_NONSTR(connp)) {
- freemsg(mp);
- return (error);
- } else {
- /* mp is freed by the following routine */
- udp_ud_err(connp->conn_wq, mp, (uchar_t *)addr,
- (t_scalar_t)addrlen, (t_scalar_t)error);
- return (0);
- }
-}
-
-/* ARGSUSED */
-static int
-udp_send_not_connected(conn_t *connp, mblk_t *mp, struct sockaddr *addr,
- socklen_t addrlen, struct nmsghdr *msg, cred_t *cr, pid_t pid)
-{
-
- udp_t *udp = connp->conn_udp;
- boolean_t insert_spi = udp->udp_nat_t_endpoint;
- int error = 0;
- sin6_t *sin6;
- sin_t *sin;
- uint_t srcid;
- uint16_t port;
- ipaddr_t v4dst;
-
-
- ASSERT(addr != NULL);
-
- switch (udp->udp_family) {
- case AF_INET6:
- sin6 = (sin6_t *)addr;
- if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
- /*
- * Destination is a non-IPv4-compatible IPv6 address.
- * Send out an IPv6 format packet.
- */
- mp = udp_output_v6(connp, mp, sin6, &error, msg, cr,
- pid);
- if (error != 0)
- goto ud_error;
-
- return (0);
- }
- /*
- * If the local address is not zero or a mapped address
- * return an error. It would be possible to send an IPv4
- * packet but the response would never make it back to the
- * application since it is bound to a non-mapped address.
- */
- if (!IN6_IS_ADDR_V4MAPPED(&udp->udp_v6src) &&
- !IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src)) {
- error = EADDRNOTAVAIL;
- goto ud_error;
- }
- /* Send IPv4 packet without modifying udp_ipversion */
- /* Extract port and ipaddr */
- port = sin6->sin6_port;
- IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, v4dst);
- srcid = sin6->__sin6_src_id;
- break;
-
- case AF_INET:
- sin = (sin_t *)addr;
- /* Extract port and ipaddr */
- port = sin->sin_port;
- v4dst = sin->sin_addr.s_addr;
- srcid = 0;
- break;
- }
-
- mp = udp_output_v4(connp, mp, v4dst, port, srcid, &error, insert_spi,
- msg, cr, pid);
-
- if (error == 0) {
- ASSERT(mp == NULL);
- return (0);
- }
-
-ud_error:
- ASSERT(mp != NULL);
-
- return (error);
+ mp1 = mi_tpi_uderror_ind((char *)addr, addrlen, NULL, 0, error);
+ if (mp1 != NULL)
+ putnext(connp->conn_rq, mp1);
}
/*
@@ -5788,15 +3804,20 @@ ud_error:
void
udp_wput(queue_t *q, mblk_t *mp)
{
+ sin6_t *sin6;
+ sin_t *sin = NULL;
+ uint_t srcid;
conn_t *connp = Q_TO_CONN(q);
udp_t *udp = connp->conn_udp;
int error = 0;
- struct sockaddr *addr;
+ struct sockaddr *addr = NULL;
socklen_t addrlen;
udp_stack_t *us = udp->udp_us;
-
- TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_START,
- "udp_wput_start: queue %p mp %p", q, mp);
+ struct T_unitdata_req *tudr;
+ mblk_t *data_mp;
+ ushort_t ipversion;
+ cred_t *cr;
+ pid_t pid;
/*
* We directly handle several cases here: T_UNITDATA_REQ message
@@ -5805,910 +3826,612 @@ udp_wput(queue_t *q, mblk_t *mp)
*/
switch (DB_TYPE(mp)) {
case M_DATA:
- /*
- * Quick check for error cases. Checks will be done again
- * under the lock later on
- */
if (!udp->udp_issocket || udp->udp_state != TS_DATA_XFER) {
/* Not connected; address is required */
BUMP_MIB(&us->us_udp_mib, udpOutErrors);
+ UDP_DBGSTAT(us, udp_data_notconn);
UDP_STAT(us, udp_out_err_notconn);
freemsg(mp);
- TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
- "udp_wput_end: connp %p (%S)", connp,
- "not-connected; address required");
return;
}
- (void) udp_send_connected(connp, mp, NULL, NULL, -1);
+ /*
+ * All Solaris components should pass a db_credp
+ * for this message, hence we ASSERT.
+ * On production kernels we return an error to be robust against
+ * random streams modules sitting on top of us.
+ */
+ cr = msg_getcred(mp, &pid);
+ ASSERT(cr != NULL);
+ if (cr == NULL) {
+ BUMP_MIB(&us->us_udp_mib, udpOutErrors);
+ freemsg(mp);
+ return;
+ }
+ ASSERT(udp->udp_issocket);
+ UDP_DBGSTAT(us, udp_data_conn);
+ error = udp_output_connected(connp, mp, cr, pid);
+ if (error != 0) {
+ UDP_STAT(us, udp_out_err_output);
+ if (connp->conn_rq != NULL)
+ udp_ud_err_connected(connp, (t_scalar_t)error);
+#ifdef DEBUG
+ printf("udp_output_connected returned %d\n", error);
+#endif
+ }
return;
case M_PROTO:
- case M_PCPROTO: {
- struct T_unitdata_req *tudr;
-
- ASSERT((uintptr_t)MBLKL(mp) <= (uintptr_t)INT_MAX);
+ case M_PCPROTO:
tudr = (struct T_unitdata_req *)mp->b_rptr;
-
- /* Handle valid T_UNITDATA_REQ here */
- if (MBLKL(mp) >= sizeof (*tudr) &&
- ((t_primp_t)mp->b_rptr)->type == T_UNITDATA_REQ) {
- if (mp->b_cont == NULL) {
- TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
- "udp_wput_end: q %p (%S)", q, "badaddr");
- error = EPROTO;
- goto ud_error;
- }
-
- if (!MBLKIN(mp, 0, tudr->DEST_offset +
- tudr->DEST_length)) {
- TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
- "udp_wput_end: q %p (%S)", q, "badaddr");
- error = EADDRNOTAVAIL;
- goto ud_error;
- }
- /*
- * If a port has not been bound to the stream, fail.
- * This is not a problem when sockfs is directly
- * above us, because it will ensure that the socket
- * is first bound before allowing data to be sent.
- */
- if (udp->udp_state == TS_UNBND) {
- TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
- "udp_wput_end: q %p (%S)", q, "outstate");
- error = EPROTO;
- goto ud_error;
- }
- addr = (struct sockaddr *)
- &mp->b_rptr[tudr->DEST_offset];
- addrlen = tudr->DEST_length;
- if (tudr->OPT_length != 0)
- UDP_STAT(us, udp_out_opt);
- break;
+ if (MBLKL(mp) < sizeof (*tudr) ||
+ ((t_primp_t)mp->b_rptr)->type != T_UNITDATA_REQ) {
+ udp_wput_other(q, mp);
+ return;
}
- /* FALLTHRU */
- }
+ break;
+
default:
udp_wput_other(q, mp);
return;
}
- ASSERT(addr != NULL);
- error = udp_send_not_connected(connp, mp, addr, addrlen, NULL, NULL,
- -1);
- if (error != 0) {
-ud_error:
- UDP_STAT(us, udp_out_err_output);
- ASSERT(mp != NULL);
- /* mp is freed by the following routine */
- udp_ud_err(q, mp, (uchar_t *)addr, (t_scalar_t)addrlen,
- (t_scalar_t)error);
+ /* Handle valid T_UNITDATA_REQ here */
+ data_mp = mp->b_cont;
+ if (data_mp == NULL) {
+ error = EPROTO;
+ goto ud_error2;
}
-}
+ mp->b_cont = NULL;
-/* ARGSUSED */
-static void
-udp_wput_fallback(queue_t *wq, mblk_t *mp)
-{
-#ifdef DEBUG
- cmn_err(CE_CONT, "udp_wput_fallback: Message in fallback \n");
-#endif
- freemsg(mp);
-}
-
-
-/*
- * udp_output_v6():
- * Assumes that udp_wput did some sanity checking on the destination
- * address.
- */
-static mblk_t *
-udp_output_v6(conn_t *connp, mblk_t *mp, sin6_t *sin6, int *error,
- struct nmsghdr *msg, cred_t *cr, pid_t pid)
-{
- ip6_t *ip6h;
- ip6i_t *ip6i; /* mp1->b_rptr even if no ip6i_t */
- mblk_t *mp1 = mp;
- mblk_t *mp2;
- int udp_ip_hdr_len = IPV6_HDR_LEN + UDPH_SIZE;
- size_t ip_len;
- udpha_t *udph;
- udp_t *udp = connp->conn_udp;
- udp_stack_t *us = udp->udp_us;
- queue_t *q = connp->conn_wq;
- ip6_pkt_t ipp_s; /* For ancillary data options */
- ip6_pkt_t *ipp = &ipp_s;
- ip6_pkt_t *tipp; /* temporary ipp */
- uint32_t csum = 0;
- uint_t ignore = 0;
- uint_t option_exists = 0, is_sticky = 0;
- uint8_t *cp;
- uint8_t *nxthdr_ptr;
- in6_addr_t ip6_dst;
- in_port_t port;
- udpattrs_t attrs;
- boolean_t opt_present;
- ip6_hbh_t *hopoptsptr = NULL;
- uint_t hopoptslen = 0;
- boolean_t is_ancillary = B_FALSE;
- size_t sth_wroff = 0;
- ire_t *ire;
- boolean_t update_lastdst = B_FALSE;
-
- *error = 0;
-
- /*
- * If the local address is a mapped address return
- * an error.
- * It would be possible to send an IPv6 packet but the
- * response would never make it back to the application
- * since it is bound to a mapped address.
- */
- if (IN6_IS_ADDR_V4MAPPED(&udp->udp_v6src)) {
- *error = EADDRNOTAVAIL;
- goto done;
+ if (!MBLKIN(mp, 0, tudr->DEST_offset + tudr->DEST_length)) {
+ error = EADDRNOTAVAIL;
+ goto ud_error2;
}
- ipp->ipp_fields = 0;
- ipp->ipp_sticky_ignored = 0;
-
/*
- * If TPI options passed in, feed it for verification and handling
+ * All Solaris components should pass a db_credp
+ * for this TPI message, hence we should ASSERT.
+ * However, RPC (svc_clts_ksend) does this odd thing where it
+ * passes the options from a T_UNITDATA_IND unchanged in a
+ * T_UNITDATA_REQ. While that is the right thing to do for
+ * some options, SCM_UCRED being the key one, this also makes it
+ * pass down IP_RECVDSTADDR. Hence we can't ASSERT here.
*/
- attrs.udpattr_credset = B_FALSE;
- opt_present = B_FALSE;
- if (IPCL_IS_NONSTR(connp)) {
- if (msg->msg_controllen != 0) {
- attrs.udpattr_ipp6 = ipp;
- attrs.udpattr_mb = mp;
-
- rw_enter(&udp->udp_rwlock, RW_WRITER);
- *error = process_auxiliary_options(connp,
- msg->msg_control, msg->msg_controllen,
- &attrs, &udp_opt_obj, udp_opt_set, cr);
- rw_exit(&udp->udp_rwlock);
- if (*error)
- goto done;
- ASSERT(*error == 0);
- opt_present = B_TRUE;
- }
- } else {
- if (DB_TYPE(mp) != M_DATA) {
- mp1 = mp->b_cont;
- if (((struct T_unitdata_req *)
- mp->b_rptr)->OPT_length != 0) {
- attrs.udpattr_ipp6 = ipp;
- attrs.udpattr_mb = mp;
- if (udp_unitdata_opt_process(q, mp, error,
- &attrs) < 0) {
- goto done;
- }
- ASSERT(*error == 0);
- opt_present = B_TRUE;
- }
- }
+ cr = msg_getcred(mp, &pid);
+ if (cr == NULL) {
+ cr = connp->conn_cred;
+ pid = connp->conn_cpid;
}
/*
- * Determine whether we need to mark the mblk with the user's
- * credentials.
- * If labeled then sockfs would have already done this.
+ * If a port has not been bound to the stream, fail.
+ * This is not a problem when sockfs is directly
+ * above us, because it will ensure that the socket
+ * is first bound before allowing data to be sent.
*/
- ASSERT(!is_system_labeled() || msg_getcred(mp, NULL) != NULL);
- ire = connp->conn_ire_cache;
- if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr) || (ire == NULL) ||
- (!IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, &sin6->sin6_addr)) ||
- (ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK))) {
- if (cr != NULL && msg_getcred(mp, NULL) == NULL)
- mblk_setcred(mp, cr, pid);
- }
-
- rw_enter(&udp->udp_rwlock, RW_READER);
- ignore = ipp->ipp_sticky_ignored;
-
- /* mp1 points to the M_DATA mblk carrying the packet */
- ASSERT(mp1 != NULL && DB_TYPE(mp1) == M_DATA);
-
- if (sin6->sin6_scope_id != 0 &&
- IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) {
- /*
- * IPPF_SCOPE_ID is special. It's neither a sticky
- * option nor ancillary data. It needs to be
- * explicitly set in options_exists.
- */
- option_exists |= IPPF_SCOPE_ID;
+ if (udp->udp_state == TS_UNBND) {
+ error = EPROTO;
+ goto ud_error2;
}
+ addr = (struct sockaddr *)&mp->b_rptr[tudr->DEST_offset];
+ addrlen = tudr->DEST_length;
- /*
- * Compute the destination address
- */
- ip6_dst = sin6->sin6_addr;
- if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
- ip6_dst = ipv6_loopback;
-
- port = sin6->sin6_port;
-
- /*
- * Cluster and TSOL notes, Cluster check:
- * see comments in udp_output_v4().
- */
- mutex_enter(&connp->conn_lock);
-
- if (cl_inet_connect2 != NULL &&
- (!IN6_ARE_ADDR_EQUAL(&ip6_dst, &udp->udp_v6lastdst) ||
- port != udp->udp_lastdstport)) {
- mutex_exit(&connp->conn_lock);
- *error = 0;
- CL_INET_UDP_CONNECT(connp, udp, B_TRUE, &ip6_dst, port, *error);
- if (*error != 0) {
- *error = EHOSTUNREACH;
- rw_exit(&udp->udp_rwlock);
- goto done;
+ switch (connp->conn_family) {
+ case AF_INET6:
+ sin6 = (sin6_t *)addr;
+ if (!OK_32PTR((char *)sin6) || (addrlen != sizeof (sin6_t)) ||
+ (sin6->sin6_family != AF_INET6)) {
+ error = EADDRNOTAVAIL;
+ goto ud_error2;
}
- update_lastdst = B_TRUE;
- mutex_enter(&connp->conn_lock);
- }
- /*
- * If we're not going to the same destination as last time, then
- * recompute the label required. This is done in a separate routine to
- * avoid blowing up our stack here.
- *
- * TSOL Note: Since we are not in WRITER mode, UDP packets
- * to different destination may require different labels,
- * or worse, UDP packets to same IP address may require
- * different labels due to use of shared all-zones address.
- * We use conn_lock to ensure that lastdst, sticky ipp_hopopts,
- * and sticky ipp_hopoptslen are consistent for the current
- * destination and are updated atomically.
- */
- if (is_system_labeled()) {
- cred_t *credp;
- pid_t cpid;
+ srcid = sin6->__sin6_src_id;
+ if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
+ /*
+ * Destination is a non-IPv4-compatible IPv6 address.
+ * Send out an IPv6 format packet.
+ */
- /* Using UDP MLP requires SCM_UCRED from user */
- if (connp->conn_mlp_type != mlptSingle &&
- !attrs.udpattr_credset) {
- DTRACE_PROBE4(
- tx__ip__log__info__output__udp6,
- char *, "MLP mp(1) lacks SCM_UCRED attr(2) on q(3)",
- mblk_t *, mp1, udpattrs_t *, &attrs, queue_t *, q);
- *error = EINVAL;
- rw_exit(&udp->udp_rwlock);
- mutex_exit(&connp->conn_lock);
- goto done;
- }
- /*
- * update label option for this UDP socket if
- * - the destination has changed,
- * - the UDP socket is MLP, or
- * - the cred attached to the mblk changed.
- */
- credp = msg_getcred(mp, &cpid);
- if (opt_present ||
- !IN6_ARE_ADDR_EQUAL(&udp->udp_v6lastdst, &ip6_dst) ||
- connp->conn_mlp_type != mlptSingle ||
- credp != udp->udp_last_cred) {
- if ((*error = udp_update_label_v6(q, mp, &ip6_dst))
- != 0) {
- rw_exit(&udp->udp_rwlock);
- mutex_exit(&connp->conn_lock);
- goto done;
+ /*
+ * If the local address is a mapped address return
+ * an error.
+ * It would be possible to send an IPv6 packet but the
+ * response would never make it back to the application
+ * since it is bound to a mapped address.
+ */
+ if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) {
+ error = EADDRNOTAVAIL;
+ goto ud_error2;
}
- update_lastdst = B_TRUE;
- }
- /*
- * Attach the effective cred to the mblk to ensure future
- * routing decisions will be based on it's label.
- */
- mblk_setcred(mp, udp->udp_effective_cred, cpid);
- }
- if (update_lastdst) {
- udp->udp_v6lastdst = ip6_dst;
- udp->udp_lastdstport = port;
- }
+ UDP_DBGSTAT(us, udp_out_ipv6);
- /*
- * If there's a security label here, then we ignore any options the
- * user may try to set. We keep the peer's label as a hidden sticky
- * option. We make a private copy of this label before releasing the
- * lock so that label is kept consistent with the destination addr.
- */
- if (udp->udp_label_len_v6 > 0) {
- ignore &= ~IPPF_HOPOPTS;
- ipp->ipp_fields &= ~IPPF_HOPOPTS;
- }
+ if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
+ sin6->sin6_addr = ipv6_loopback;
+ ipversion = IPV6_VERSION;
+ } else {
+ if (connp->conn_ipv6_v6only) {
+ error = EADDRNOTAVAIL;
+ goto ud_error2;
+ }
- if ((udp->udp_sticky_ipp.ipp_fields == 0) && (ipp->ipp_fields == 0)) {
- /* No sticky options nor ancillary data. */
- mutex_exit(&connp->conn_lock);
- goto no_options;
- }
+ /*
+ * If the local address is not zero or a mapped address
+ * return an error. It would be possible to send an
+ * IPv4 packet but the response would never make it
+ * back to the application since it is bound to a
+ * non-mapped address.
+ */
+ if (!IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6) &&
+ !IN6_IS_ADDR_UNSPECIFIED(&connp->conn_saddr_v6)) {
+ error = EADDRNOTAVAIL;
+ goto ud_error2;
+ }
+ UDP_DBGSTAT(us, udp_out_mapped);
- /*
- * Go through the options figuring out where each is going to
- * come from and build two masks. The first mask indicates if
- * the option exists at all. The second mask indicates if the
- * option is sticky or ancillary.
- */
- if (!(ignore & IPPF_HOPOPTS)) {
- if (ipp->ipp_fields & IPPF_HOPOPTS) {
- option_exists |= IPPF_HOPOPTS;
- udp_ip_hdr_len += ipp->ipp_hopoptslen;
- } else if (udp->udp_sticky_ipp.ipp_fields & IPPF_HOPOPTS) {
- option_exists |= IPPF_HOPOPTS;
- is_sticky |= IPPF_HOPOPTS;
- ASSERT(udp->udp_sticky_ipp.ipp_hopoptslen != 0);
- hopoptsptr = kmem_alloc(
- udp->udp_sticky_ipp.ipp_hopoptslen, KM_NOSLEEP);
- if (hopoptsptr == NULL) {
- *error = ENOMEM;
- mutex_exit(&connp->conn_lock);
- goto done;
+ if (V4_PART_OF_V6(sin6->sin6_addr) == INADDR_ANY) {
+ V4_PART_OF_V6(sin6->sin6_addr) =
+ htonl(INADDR_LOOPBACK);
}
- hopoptslen = udp->udp_sticky_ipp.ipp_hopoptslen;
- bcopy(udp->udp_sticky_ipp.ipp_hopopts, hopoptsptr,
- hopoptslen);
- udp_ip_hdr_len += hopoptslen;
+ ipversion = IPV4_VERSION;
}
- }
- mutex_exit(&connp->conn_lock);
- if (!(ignore & IPPF_RTHDR)) {
- if (ipp->ipp_fields & IPPF_RTHDR) {
- option_exists |= IPPF_RTHDR;
- udp_ip_hdr_len += ipp->ipp_rthdrlen;
- } else if (udp->udp_sticky_ipp.ipp_fields & IPPF_RTHDR) {
- option_exists |= IPPF_RTHDR;
- is_sticky |= IPPF_RTHDR;
- udp_ip_hdr_len += udp->udp_sticky_ipp.ipp_rthdrlen;
- }
- }
+ if (tudr->OPT_length != 0) {
+ /*
+ * If we are connected then the destination needs to be
+ * the same as the connected one.
+ */
+ if (udp->udp_state == TS_DATA_XFER &&
+ !conn_same_as_last_v6(connp, sin6)) {
+ error = EISCONN;
+ goto ud_error2;
+ }
+ UDP_STAT(us, udp_out_opt);
+ error = udp_output_ancillary(connp, NULL, sin6,
+ data_mp, mp, NULL, cr, pid);
+ } else {
+ ip_xmit_attr_t *ixa;
- if (!(ignore & IPPF_RTDSTOPTS) && (option_exists & IPPF_RTHDR)) {
- if (ipp->ipp_fields & IPPF_RTDSTOPTS) {
- option_exists |= IPPF_RTDSTOPTS;
- udp_ip_hdr_len += ipp->ipp_rtdstoptslen;
- } else if (udp->udp_sticky_ipp.ipp_fields & IPPF_RTDSTOPTS) {
- option_exists |= IPPF_RTDSTOPTS;
- is_sticky |= IPPF_RTDSTOPTS;
- udp_ip_hdr_len += udp->udp_sticky_ipp.ipp_rtdstoptslen;
+ /*
+ * We have to allocate an ip_xmit_attr_t before we grab
+ * conn_lock and we need to hold conn_lock once we've
+ * checked conn_same_as_last_v6 to handle concurrent
+ * send* calls on a socket.
+ */
+ ixa = conn_get_ixa(connp, B_FALSE);
+ if (ixa == NULL) {
+ error = ENOMEM;
+ goto ud_error2;
+ }
+ mutex_enter(&connp->conn_lock);
+
+ if (conn_same_as_last_v6(connp, sin6) &&
+ connp->conn_lastsrcid == srcid &&
+ ipsec_outbound_policy_current(ixa)) {
+ UDP_DBGSTAT(us, udp_out_lastdst);
+ /* udp_output_lastdst drops conn_lock */
+ error = udp_output_lastdst(connp, data_mp, cr,
+ pid, ixa);
+ } else {
+ UDP_DBGSTAT(us, udp_out_diffdst);
+ /* udp_output_newdst drops conn_lock */
+ error = udp_output_newdst(connp, data_mp, NULL,
+ sin6, ipversion, cr, pid, ixa);
+ }
+ ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
}
- }
-
- if (!(ignore & IPPF_DSTOPTS)) {
- if (ipp->ipp_fields & IPPF_DSTOPTS) {
- option_exists |= IPPF_DSTOPTS;
- udp_ip_hdr_len += ipp->ipp_dstoptslen;
- } else if (udp->udp_sticky_ipp.ipp_fields & IPPF_DSTOPTS) {
- option_exists |= IPPF_DSTOPTS;
- is_sticky |= IPPF_DSTOPTS;
- udp_ip_hdr_len += udp->udp_sticky_ipp.ipp_dstoptslen;
+ if (error == 0) {
+ freeb(mp);
+ return;
}
- }
+ break;
- if (!(ignore & IPPF_IFINDEX)) {
- if (ipp->ipp_fields & IPPF_IFINDEX) {
- option_exists |= IPPF_IFINDEX;
- } else if (udp->udp_sticky_ipp.ipp_fields & IPPF_IFINDEX) {
- option_exists |= IPPF_IFINDEX;
- is_sticky |= IPPF_IFINDEX;
+ case AF_INET:
+ sin = (sin_t *)addr;
+ if ((!OK_32PTR((char *)sin) || addrlen != sizeof (sin_t)) ||
+ (sin->sin_family != AF_INET)) {
+ error = EADDRNOTAVAIL;
+ goto ud_error2;
}
- }
+ UDP_DBGSTAT(us, udp_out_ipv4);
+ if (sin->sin_addr.s_addr == INADDR_ANY)
+ sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+ ipversion = IPV4_VERSION;
- if (!(ignore & IPPF_ADDR)) {
- if (ipp->ipp_fields & IPPF_ADDR) {
- option_exists |= IPPF_ADDR;
- } else if (udp->udp_sticky_ipp.ipp_fields & IPPF_ADDR) {
- option_exists |= IPPF_ADDR;
- is_sticky |= IPPF_ADDR;
- }
- }
+ srcid = 0;
+ if (tudr->OPT_length != 0) {
+ /*
+ * If we are connected then the destination needs to be
+ * the same as the connected one.
+ */
+ if (udp->udp_state == TS_DATA_XFER &&
+ !conn_same_as_last_v4(connp, sin)) {
+ error = EISCONN;
+ goto ud_error2;
+ }
+ UDP_STAT(us, udp_out_opt);
+ error = udp_output_ancillary(connp, sin, NULL,
+ data_mp, mp, NULL, cr, pid);
+ } else {
+ ip_xmit_attr_t *ixa;
- if (!(ignore & IPPF_DONTFRAG)) {
- if (ipp->ipp_fields & IPPF_DONTFRAG) {
- option_exists |= IPPF_DONTFRAG;
- } else if (udp->udp_sticky_ipp.ipp_fields & IPPF_DONTFRAG) {
- option_exists |= IPPF_DONTFRAG;
- is_sticky |= IPPF_DONTFRAG;
+ /*
+ * We have to allocate an ip_xmit_attr_t before we grab
+ * conn_lock and we need to hold conn_lock once we've
+ * checked conn_same_as_last_v4 to handle concurrent
+ * send* calls on a socket.
+ */
+ ixa = conn_get_ixa(connp, B_FALSE);
+ if (ixa == NULL) {
+ error = ENOMEM;
+ goto ud_error2;
+ }
+ mutex_enter(&connp->conn_lock);
+
+ if (conn_same_as_last_v4(connp, sin) &&
+ ipsec_outbound_policy_current(ixa)) {
+ UDP_DBGSTAT(us, udp_out_lastdst);
+ /* udp_output_lastdst drops conn_lock */
+ error = udp_output_lastdst(connp, data_mp, cr,
+ pid, ixa);
+ } else {
+ UDP_DBGSTAT(us, udp_out_diffdst);
+ /* udp_output_newdst drops conn_lock */
+ error = udp_output_newdst(connp, data_mp, sin,
+ NULL, ipversion, cr, pid, ixa);
+ }
+ ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
}
- }
-
- if (!(ignore & IPPF_USE_MIN_MTU)) {
- if (ipp->ipp_fields & IPPF_USE_MIN_MTU) {
- option_exists |= IPPF_USE_MIN_MTU;
- } else if (udp->udp_sticky_ipp.ipp_fields &
- IPPF_USE_MIN_MTU) {
- option_exists |= IPPF_USE_MIN_MTU;
- is_sticky |= IPPF_USE_MIN_MTU;
+ if (error == 0) {
+ freeb(mp);
+ return;
}
+ break;
}
+ UDP_STAT(us, udp_out_err_output);
+ ASSERT(mp != NULL);
+ /* mp is freed by the following routine */
+ udp_ud_err(q, mp, (t_scalar_t)error);
+ return;
- if (!(ignore & IPPF_HOPLIMIT) && (ipp->ipp_fields & IPPF_HOPLIMIT))
- option_exists |= IPPF_HOPLIMIT;
- /* IPV6_HOPLIMIT can never be sticky */
- ASSERT(!(udp->udp_sticky_ipp.ipp_fields & IPPF_HOPLIMIT));
+ud_error2:
+ BUMP_MIB(&us->us_udp_mib, udpOutErrors);
+ freemsg(data_mp);
+ UDP_STAT(us, udp_out_err_output);
+ ASSERT(mp != NULL);
+ /* mp is freed by the following routine */
+ udp_ud_err(q, mp, (t_scalar_t)error);
+}
- if (!(ignore & IPPF_UNICAST_HOPS) &&
- (udp->udp_sticky_ipp.ipp_fields & IPPF_UNICAST_HOPS)) {
- option_exists |= IPPF_UNICAST_HOPS;
- is_sticky |= IPPF_UNICAST_HOPS;
- }
+/*
+ * Handle the case of the IP address, port, flow label being different
+ * for both IPv4 and IPv6.
+ *
+ * NOTE: The caller must hold conn_lock and we drop it here.
+ */
+static int
+udp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin, sin6_t *sin6,
+ ushort_t ipversion, cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa)
+{
+ uint_t srcid;
+ uint32_t flowinfo;
+ udp_t *udp = connp->conn_udp;
+ int error = 0;
+ ip_xmit_attr_t *oldixa;
+ udp_stack_t *us = udp->udp_us;
+ in6_addr_t v6src;
+ in6_addr_t v6dst;
+ in6_addr_t v6nexthop;
+ in_port_t dstport;
- if (!(ignore & IPPF_MULTICAST_HOPS) &&
- (udp->udp_sticky_ipp.ipp_fields & IPPF_MULTICAST_HOPS)) {
- option_exists |= IPPF_MULTICAST_HOPS;
- is_sticky |= IPPF_MULTICAST_HOPS;
- }
+ ASSERT(MUTEX_HELD(&connp->conn_lock));
+ ASSERT(ixa != NULL);
+ /*
+ * We hold conn_lock across all the use and modifications of
+ * the conn_lastdst, conn_ixa, and conn_xmit_ipp to ensure that they
+ * stay consistent.
+ */
- if (!(ignore & IPPF_TCLASS)) {
- if (ipp->ipp_fields & IPPF_TCLASS) {
- option_exists |= IPPF_TCLASS;
- } else if (udp->udp_sticky_ipp.ipp_fields & IPPF_TCLASS) {
- option_exists |= IPPF_TCLASS;
- is_sticky |= IPPF_TCLASS;
- }
+ ASSERT(cr != NULL);
+ ixa->ixa_cred = cr;
+ ixa->ixa_cpid = pid;
+ if (is_system_labeled()) {
+ /* We need to restart with a label based on the cred */
+ ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
}
- if (!(ignore & IPPF_NEXTHOP) &&
- (udp->udp_sticky_ipp.ipp_fields & IPPF_NEXTHOP)) {
- option_exists |= IPPF_NEXTHOP;
- is_sticky |= IPPF_NEXTHOP;
+ /*
+ * If we are connected then the destination needs to be the
+ * same as the connected one, which is not the case here since we
+ * checked for that above.
+ */
+ if (udp->udp_state == TS_DATA_XFER) {
+ mutex_exit(&connp->conn_lock);
+ error = EISCONN;
+ goto ud_error;
}
-no_options:
+ /* In case previous destination was multicast or multirt */
+ ip_attr_newdst(ixa);
/*
- * If any options carried in the ip6i_t were specified, we
- * need to account for the ip6i_t in the data we'll be sending
- * down.
+ * If laddr is unspecified then we look at sin6_src_id.
+ * We will give precedence to a source address set with IPV6_PKTINFO
+ * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't
+ * want ip_attr_connect to select a source (since it can fail) when
+ * IPV6_PKTINFO is specified.
+ * If this doesn't result in a source address then we get a source
+ * from ip_attr_connect() below.
*/
- if (option_exists & IPPF_HAS_IP6I)
- udp_ip_hdr_len += sizeof (ip6i_t);
-
- /* check/fix buffer config, setup pointers into it */
- ip6h = (ip6_t *)&mp1->b_rptr[-udp_ip_hdr_len];
- if (DB_REF(mp1) != 1 || ((unsigned char *)ip6h < DB_BASE(mp1)) ||
- !OK_32PTR(ip6h)) {
-
- /* Try to get everything in a single mblk next time */
- if (udp_ip_hdr_len > udp->udp_max_hdr_len) {
- udp->udp_max_hdr_len = udp_ip_hdr_len;
- sth_wroff = udp->udp_max_hdr_len + us->us_wroff_extra;
+ v6src = connp->conn_saddr_v6;
+ if (sin != NULL) {
+ IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst);
+ dstport = sin->sin_port;
+ flowinfo = 0;
+ srcid = 0;
+ ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
+ if (srcid != 0 && V4_PART_OF_V6(&v6src) == INADDR_ANY) {
+ ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
+ connp->conn_netstack);
}
-
- mp2 = allocb(udp_ip_hdr_len + us->us_wroff_extra, BPRI_LO);
- if (mp2 == NULL) {
- *error = ENOMEM;
- rw_exit(&udp->udp_rwlock);
- goto done;
+ ixa->ixa_flags |= IXAF_IS_IPV4;
+ } else {
+ v6dst = sin6->sin6_addr;
+ dstport = sin6->sin6_port;
+ flowinfo = sin6->sin6_flowinfo;
+ srcid = sin6->__sin6_src_id;
+ if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) {
+ ixa->ixa_scopeid = sin6->sin6_scope_id;
+ ixa->ixa_flags |= IXAF_SCOPEID_SET;
+ } else {
+ ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
}
- mp2->b_wptr = DB_LIM(mp2);
- mp2->b_cont = mp1;
- mp1 = mp2;
- if (DB_TYPE(mp) != M_DATA)
- mp->b_cont = mp1;
+ if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
+ ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
+ connp->conn_netstack);
+ }
+ if (IN6_IS_ADDR_V4MAPPED(&v6dst))
+ ixa->ixa_flags |= IXAF_IS_IPV4;
else
- mp = mp1;
-
- ip6h = (ip6_t *)(mp1->b_wptr - udp_ip_hdr_len);
+ ixa->ixa_flags &= ~IXAF_IS_IPV4;
}
- mp1->b_rptr = (unsigned char *)ip6h;
- ip6i = (ip6i_t *)ip6h;
-
-#define ANCIL_OR_STICKY_PTR(f) ((is_sticky & f) ? &udp->udp_sticky_ipp : ipp)
- if (option_exists & IPPF_HAS_IP6I) {
- ip6h = (ip6_t *)&ip6i[1];
- ip6i->ip6i_flags = 0;
- ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
-
- /* sin6_scope_id takes precendence over IPPF_IFINDEX */
- if (option_exists & IPPF_SCOPE_ID) {
- ip6i->ip6i_flags |= IP6I_IFINDEX;
- ip6i->ip6i_ifindex = sin6->sin6_scope_id;
- } else if (option_exists & IPPF_IFINDEX) {
- tipp = ANCIL_OR_STICKY_PTR(IPPF_IFINDEX);
- ASSERT(tipp->ipp_ifindex != 0);
- ip6i->ip6i_flags |= IP6I_IFINDEX;
- ip6i->ip6i_ifindex = tipp->ipp_ifindex;
- }
-
- if (option_exists & IPPF_ADDR) {
- /*
- * Enable per-packet source address verification if
- * IPV6_PKTINFO specified the source address.
- * ip6_src is set in the transport's _wput function.
- */
- ip6i->ip6i_flags |= IP6I_VERIFY_SRC;
- }
-
- if (option_exists & IPPF_DONTFRAG) {
- ip6i->ip6i_flags |= IP6I_DONTFRAG;
- }
+ /* Handle IPV6_PKTINFO setting source address. */
+ if (IN6_IS_ADDR_UNSPECIFIED(&v6src) &&
+ (connp->conn_xmit_ipp.ipp_fields & IPPF_ADDR)) {
+ ip_pkt_t *ipp = &connp->conn_xmit_ipp;
- if (option_exists & IPPF_USE_MIN_MTU) {
- ip6i->ip6i_flags = IP6I_API_USE_MIN_MTU(
- ip6i->ip6i_flags, ipp->ipp_use_min_mtu);
+ if (ixa->ixa_flags & IXAF_IS_IPV4) {
+ if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
+ v6src = ipp->ipp_addr;
+ } else {
+ if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
+ v6src = ipp->ipp_addr;
}
+ }
- if (option_exists & IPPF_NEXTHOP) {
- tipp = ANCIL_OR_STICKY_PTR(IPPF_NEXTHOP);
- ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&tipp->ipp_nexthop));
- ip6i->ip6i_flags |= IP6I_NEXTHOP;
- ip6i->ip6i_nexthop = tipp->ipp_nexthop;
- }
+ ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &v6dst, &v6nexthop);
+ mutex_exit(&connp->conn_lock);
+ error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport,
+ &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST | IPDF_IPSEC);
+ switch (error) {
+ case 0:
+ break;
+ case EADDRNOTAVAIL:
/*
- * tell IP this is an ip6i_t private header
+ * IXAF_VERIFY_SOURCE tells us to pick a better source.
+ * Don't have the application see that errno
*/
- ip6i->ip6i_nxt = IPPROTO_RAW;
- }
-
- /* Initialize IPv6 header */
- ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
- bzero(&ip6h->ip6_src, sizeof (ip6h->ip6_src));
-
- /* Set the hoplimit of the outgoing packet. */
- if (option_exists & IPPF_HOPLIMIT) {
- /* IPV6_HOPLIMIT ancillary data overrides all other settings. */
- ip6h->ip6_hops = ipp->ipp_hoplimit;
- ip6i->ip6i_flags |= IP6I_HOPLIMIT;
- } else if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
- ip6h->ip6_hops = udp->udp_multicast_ttl;
- if (option_exists & IPPF_MULTICAST_HOPS)
- ip6i->ip6i_flags |= IP6I_HOPLIMIT;
- } else {
- ip6h->ip6_hops = udp->udp_ttl;
- if (option_exists & IPPF_UNICAST_HOPS)
- ip6i->ip6i_flags |= IP6I_HOPLIMIT;
- }
-
- if (option_exists & IPPF_ADDR) {
- tipp = ANCIL_OR_STICKY_PTR(IPPF_ADDR);
- ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&tipp->ipp_addr));
- ip6h->ip6_src = tipp->ipp_addr;
- } else {
+ error = ENETUNREACH;
+ goto failed;
+ case ENETDOWN:
/*
- * The source address was not set using IPV6_PKTINFO.
- * First look at the bound source.
- * If unspecified fallback to __sin6_src_id.
+ * Have !ipif_addr_ready address; drop packet silently
+ * until we can get applications to not send until we
+ * are ready.
*/
- ip6h->ip6_src = udp->udp_v6src;
- if (sin6->__sin6_src_id != 0 &&
- IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src)) {
- ip_srcid_find_id(sin6->__sin6_src_id,
- &ip6h->ip6_src, connp->conn_zoneid,
- us->us_netstack);
+ error = 0;
+ goto failed;
+ case EHOSTUNREACH:
+ case ENETUNREACH:
+ if (ixa->ixa_ire != NULL) {
+ /*
+ * Let conn_ip_output/ire_send_noroute return
+ * the error and send any local ICMP error.
+ */
+ error = 0;
+ break;
}
+ /* FALLTHRU */
+ failed:
+ default:
+ goto ud_error;
}
- nxthdr_ptr = (uint8_t *)&ip6h->ip6_nxt;
- cp = (uint8_t *)&ip6h[1];
/*
- * Here's where we have to start stringing together
- * any extension headers in the right order:
- * Hop-by-hop, destination, routing, and final destination opts.
+ * Cluster note: we let the cluster hook know that we are sending to a
+ * new address and/or port.
*/
- if (option_exists & IPPF_HOPOPTS) {
- /* Hop-by-hop options */
- ip6_hbh_t *hbh = (ip6_hbh_t *)cp;
- tipp = ANCIL_OR_STICKY_PTR(IPPF_HOPOPTS);
- if (hopoptslen == 0) {
- hopoptsptr = tipp->ipp_hopopts;
- hopoptslen = tipp->ipp_hopoptslen;
- is_ancillary = B_TRUE;
- }
-
- *nxthdr_ptr = IPPROTO_HOPOPTS;
- nxthdr_ptr = &hbh->ip6h_nxt;
-
- bcopy(hopoptsptr, cp, hopoptslen);
- cp += hopoptslen;
-
- if (hopoptsptr != NULL && !is_ancillary) {
- kmem_free(hopoptsptr, hopoptslen);
- hopoptsptr = NULL;
- hopoptslen = 0;
+ if (cl_inet_connect2 != NULL) {
+ CL_INET_UDP_CONNECT(connp, B_TRUE, &v6dst, dstport, error);
+ if (error != 0) {
+ error = EHOSTUNREACH;
+ goto ud_error;
}
}
- /*
- * En-route destination options
- * Only do them if there's a routing header as well
- */
- if (option_exists & IPPF_RTDSTOPTS) {
- ip6_dest_t *dst = (ip6_dest_t *)cp;
- tipp = ANCIL_OR_STICKY_PTR(IPPF_RTDSTOPTS);
-
- *nxthdr_ptr = IPPROTO_DSTOPTS;
- nxthdr_ptr = &dst->ip6d_nxt;
- bcopy(tipp->ipp_rtdstopts, cp, tipp->ipp_rtdstoptslen);
- cp += tipp->ipp_rtdstoptslen;
- }
- /*
- * Routing header next
- */
- if (option_exists & IPPF_RTHDR) {
- ip6_rthdr_t *rt = (ip6_rthdr_t *)cp;
- tipp = ANCIL_OR_STICKY_PTR(IPPF_RTHDR);
-
- *nxthdr_ptr = IPPROTO_ROUTING;
- nxthdr_ptr = &rt->ip6r_nxt;
-
- bcopy(tipp->ipp_rthdr, cp, tipp->ipp_rthdrlen);
- cp += tipp->ipp_rthdrlen;
- }
+ mutex_enter(&connp->conn_lock);
/*
- * Do ultimate destination options
+ * While we dropped the lock some other thread might have connected
+ * this socket. If so we bail out with EISCONN to ensure that the
+ * connecting thread is the one that updates conn_ixa, conn_ht_*
+ * and conn_*last*.
*/
- if (option_exists & IPPF_DSTOPTS) {
- ip6_dest_t *dest = (ip6_dest_t *)cp;
- tipp = ANCIL_OR_STICKY_PTR(IPPF_DSTOPTS);
-
- *nxthdr_ptr = IPPROTO_DSTOPTS;
- nxthdr_ptr = &dest->ip6d_nxt;
-
- bcopy(tipp->ipp_dstopts, cp, tipp->ipp_dstoptslen);
- cp += tipp->ipp_dstoptslen;
+ if (udp->udp_state == TS_DATA_XFER) {
+ mutex_exit(&connp->conn_lock);
+ error = EISCONN;
+ goto ud_error;
}
- /*
- * Now set the last header pointer to the proto passed in
- */
- ASSERT((int)(cp - (uint8_t *)ip6i) == (udp_ip_hdr_len - UDPH_SIZE));
- *nxthdr_ptr = IPPROTO_UDP;
-
- /* Update UDP header */
- udph = (udpha_t *)((uchar_t *)ip6i + udp_ip_hdr_len - UDPH_SIZE);
- udph->uha_dst_port = sin6->sin6_port;
- udph->uha_src_port = udp->udp_port;
/*
- * Copy in the destination address
+ * We need to rebuild the headers if
+ * - we are labeling packets (could be different for different
+ * destinations)
+ * - we have a source route (or routing header) since we need to
+ * massage that to get the pseudo-header checksum
+ * - the IP version is different than the last time
+ * - a socket option with COA_HEADER_CHANGED has been set which
+ * set conn_v6lastdst to zero.
+ *
+ * Otherwise the prepend function will just update the src, dst,
+ * dstport, and flow label.
*/
- ip6h->ip6_dst = ip6_dst;
-
- ip6h->ip6_vcf =
- (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) |
- (sin6->sin6_flowinfo & ~IPV6_VERS_AND_FLOW_MASK);
-
- if (option_exists & IPPF_TCLASS) {
- tipp = ANCIL_OR_STICKY_PTR(IPPF_TCLASS);
- ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf,
- tipp->ipp_tclass);
- }
- rw_exit(&udp->udp_rwlock);
-
- if (option_exists & IPPF_RTHDR) {
- ip6_rthdr_t *rth;
-
+ if (is_system_labeled()) {
+ /* TX MLP requires SCM_UCRED and don't have that here */
+ if (connp->conn_mlp_type != mlptSingle) {
+ mutex_exit(&connp->conn_lock);
+ error = ECONNREFUSED;
+ goto ud_error;
+ }
/*
- * Perform any processing needed for source routing.
- * We know that all extension headers will be in the same mblk
- * as the IPv6 header.
+ * Check whether Trusted Solaris policy allows communication
+ * with this host, and pretend that the destination is
+ * unreachable if not.
+ * Compute any needed label and place it in ipp_label_v4/v6.
+ *
+ * Later conn_build_hdr_template/conn_prepend_hdr takes
+ * ipp_label_v4/v6 to form the packet.
+ *
+ * Tsol note: Since we hold conn_lock we know no other
+ * thread manipulates conn_xmit_ipp.
*/
- rth = ip_find_rthdr_v6(ip6h, mp1->b_wptr);
- if (rth != NULL && rth->ip6r_segleft != 0) {
- if (rth->ip6r_type != IPV6_RTHDR_TYPE_0) {
- /*
- * Drop packet - only support Type 0 routing.
- * Notify the application as well.
- */
- *error = EPROTO;
- goto done;
- }
-
- /*
- * rth->ip6r_len is twice the number of
- * addresses in the header. Thus it must be even.
- */
- if (rth->ip6r_len & 0x1) {
- *error = EPROTO;
- goto done;
- }
- /*
- * Shuffle the routing header and ip6_dst
- * addresses, and get the checksum difference
- * between the first hop (in ip6_dst) and
- * the destination (in the last routing hdr entry).
- */
- csum = ip_massage_options_v6(ip6h, rth,
- us->us_netstack);
- /*
- * Verify that the first hop isn't a mapped address.
- * Routers along the path need to do this verification
- * for subsequent hops.
- */
- if (IN6_IS_ADDR_V4MAPPED(&ip6h->ip6_dst)) {
- *error = EADDRNOTAVAIL;
- goto done;
+ error = conn_update_label(connp, ixa, &v6dst,
+ &connp->conn_xmit_ipp);
+ if (error != 0) {
+ mutex_exit(&connp->conn_lock);
+ goto ud_error;
+ }
+ /* Rebuild the header template */
+ error = udp_build_hdr_template(connp, &v6src, &v6dst, dstport,
+ flowinfo);
+ if (error != 0) {
+ mutex_exit(&connp->conn_lock);
+ goto ud_error;
+ }
+ } else if ((connp->conn_xmit_ipp.ipp_fields &
+ (IPPF_IPV4_OPTIONS|IPPF_RTHDR)) ||
+ ipversion != connp->conn_lastipversion ||
+ IN6_IS_ADDR_UNSPECIFIED(&connp->conn_v6lastdst)) {
+ /* Rebuild the header template */
+ error = udp_build_hdr_template(connp, &v6src, &v6dst, dstport,
+ flowinfo);
+ if (error != 0) {
+ mutex_exit(&connp->conn_lock);
+ goto ud_error;
+ }
+ } else {
+ /* Simply update the destination address if no source route */
+ if (ixa->ixa_flags & IXAF_IS_IPV4) {
+ ipha_t *ipha = (ipha_t *)connp->conn_ht_iphc;
+
+ IN6_V4MAPPED_TO_IPADDR(&v6dst, ipha->ipha_dst);
+ if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) {
+ ipha->ipha_fragment_offset_and_flags |=
+ IPH_DF_HTONS;
+ } else {
+ ipha->ipha_fragment_offset_and_flags &=
+ ~IPH_DF_HTONS;
}
-
- cp += (rth->ip6r_len + 1)*8;
+ } else {
+ ip6_t *ip6h = (ip6_t *)connp->conn_ht_iphc;
+ ip6h->ip6_dst = v6dst;
}
}
- /* count up length of UDP packet */
- ip_len = (mp1->b_wptr - (unsigned char *)ip6h) - IPV6_HDR_LEN;
- if ((mp2 = mp1->b_cont) != NULL) {
- do {
- ASSERT((uintptr_t)MBLKL(mp2) <= (uintptr_t)UINT_MAX);
- ip_len += (uint32_t)MBLKL(mp2);
- } while ((mp2 = mp2->b_cont) != NULL);
- }
-
/*
- * If the size of the packet is greater than the maximum allowed by
- * ip, return an error. Passing this down could cause panics because
- * the size will have wrapped and be inconsistent with the msg size.
- */
- if (ip_len > IP_MAXPACKET) {
- *error = EMSGSIZE;
- goto done;
- }
-
- /* Store the UDP length. Subtract length of extension hdrs */
- udph->uha_length = htons(ip_len + IPV6_HDR_LEN -
- (int)((uchar_t *)udph - (uchar_t *)ip6h));
-
- /*
- * We make it easy for IP to include our pseudo header
- * by putting our length in uh_checksum, modified (if
- * we have a routing header) by the checksum difference
- * between the ultimate destination and first hop addresses.
- * Note: UDP over IPv6 must always checksum the packet.
+ * Remember the dst/dstport etc which corresponds to the built header
+ * template and conn_ixa.
*/
- csum += udph->uha_length;
- csum = (csum & 0xFFFF) + (csum >> 16);
- udph->uha_checksum = (uint16_t)csum;
-
-#ifdef _LITTLE_ENDIAN
- ip_len = htons(ip_len);
-#endif
- ip6h->ip6_plen = ip_len;
-
- if (DB_TYPE(mp) != M_DATA) {
- cred_t *cr;
- pid_t cpid;
-
- /* Move any cred from the T_UNITDATA_REQ to the packet */
- cr = msg_extractcred(mp, &cpid);
- if (cr != NULL) {
- if (mp1->b_datap->db_credp != NULL)
- crfree(mp1->b_datap->db_credp);
- mp1->b_datap->db_credp = cr;
- mp1->b_datap->db_cpid = cpid;
- }
+ oldixa = conn_replace_ixa(connp, ixa);
+ connp->conn_v6lastdst = v6dst;
+ connp->conn_lastipversion = ipversion;
+ connp->conn_lastdstport = dstport;
+ connp->conn_lastflowinfo = flowinfo;
+ connp->conn_lastscopeid = ixa->ixa_scopeid;
+ connp->conn_lastsrcid = srcid;
+ /* Also remember a source to use together with lastdst */
+ connp->conn_v6lastsrc = v6src;
+
+ data_mp = udp_prepend_header_template(connp, ixa, data_mp, &v6src,
+ dstport, flowinfo, &error);
+
+ /* Done with conn_t */
+ mutex_exit(&connp->conn_lock);
+ ixa_refrele(oldixa);
- ASSERT(mp != mp1);
- freeb(mp);
+ if (data_mp == NULL) {
+ ASSERT(error != 0);
+ goto ud_error;
}
- /* mp has been consumed and we'll return success */
- ASSERT(*error == 0);
- mp = NULL;
-
- /* We're done. Pass the packet to IP */
+ /* We're done. Pass the packet to ip. */
BUMP_MIB(&us->us_udp_mib, udpHCOutDatagrams);
- ip_output_v6(connp, mp1, q, IP_WPUT);
-done:
- if (sth_wroff != 0) {
- (void) proto_set_tx_wroff(RD(q), connp,
- udp->udp_max_hdr_len + us->us_wroff_extra);
- }
- if (hopoptsptr != NULL && !is_ancillary) {
- kmem_free(hopoptsptr, hopoptslen);
- hopoptsptr = NULL;
- }
- if (*error != 0) {
- ASSERT(mp != NULL);
- BUMP_MIB(&us->us_udp_mib, udpOutErrors);
- }
- return (mp);
-}
-
-
-static int
-i_udp_getpeername(udp_t *udp, struct sockaddr *sa, uint_t *salenp)
-{
- sin_t *sin = (sin_t *)sa;
- sin6_t *sin6 = (sin6_t *)sa;
-
- ASSERT(RW_LOCK_HELD(&udp->udp_rwlock));
-
- if (udp->udp_state != TS_DATA_XFER)
- return (ENOTCONN);
-
- switch (udp->udp_family) {
- case AF_INET:
- ASSERT(udp->udp_ipversion == IPV4_VERSION);
-
- if (*salenp < sizeof (sin_t))
- return (EINVAL);
-
- *salenp = sizeof (sin_t);
- *sin = sin_null;
- sin->sin_family = AF_INET;
- sin->sin_port = udp->udp_dstport;
- sin->sin_addr.s_addr = V4_PART_OF_V6(udp->udp_v6dst);
+ error = conn_ip_output(data_mp, ixa);
+ /* No udpOutErrors if an error since IP increases its error counter */
+ switch (error) {
+ case 0:
break;
-
- case AF_INET6:
- if (*salenp < sizeof (sin6_t))
- return (EINVAL);
-
- *salenp = sizeof (sin6_t);
- *sin6 = sin6_null;
- sin6->sin6_family = AF_INET6;
- sin6->sin6_port = udp->udp_dstport;
- sin6->sin6_addr = udp->udp_v6dst;
- sin6->sin6_flowinfo = udp->udp_flowinfo;
+ case EWOULDBLOCK:
+ (void) ixa_check_drain_insert(connp, ixa);
+ error = 0;
break;
- }
-
- return (0);
-}
-
-static int
-udp_getmyname(udp_t *udp, struct sockaddr *sa, uint_t *salenp)
-{
- sin_t *sin = (sin_t *)sa;
- sin6_t *sin6 = (sin6_t *)sa;
-
- ASSERT(RW_LOCK_HELD(&udp->udp_rwlock));
-
- switch (udp->udp_family) {
- case AF_INET:
- ASSERT(udp->udp_ipversion == IPV4_VERSION);
-
- if (*salenp < sizeof (sin_t))
- return (EINVAL);
-
- *salenp = sizeof (sin_t);
- *sin = sin_null;
- sin->sin_family = AF_INET;
- sin->sin_port = udp->udp_port;
-
+ case EADDRNOTAVAIL:
/*
- * If udp_v6src is unspecified, we might be bound to broadcast
- * / multicast. Use udp_bound_v6src as local address instead
- * (that could also still be unspecified).
+ * IXAF_VERIFY_SOURCE tells us to pick a better source.
+ * Don't have the application see that errno
*/
- if (!IN6_IS_ADDR_V4MAPPED_ANY(&udp->udp_v6src) &&
- !IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src)) {
- sin->sin_addr.s_addr = V4_PART_OF_V6(udp->udp_v6src);
- } else {
- sin->sin_addr.s_addr =
- V4_PART_OF_V6(udp->udp_bound_v6src);
- }
- break;
-
- case AF_INET6:
- if (*salenp < sizeof (sin6_t))
- return (EINVAL);
-
- *salenp = sizeof (sin6_t);
- *sin6 = sin6_null;
- sin6->sin6_family = AF_INET6;
- sin6->sin6_port = udp->udp_port;
- sin6->sin6_flowinfo = udp->udp_flowinfo;
-
+ error = ENETUNREACH;
+ /* FALLTHRU */
+ default:
+ mutex_enter(&connp->conn_lock);
/*
- * If udp_v6src is unspecified, we might be bound to broadcast
- * / multicast. Use udp_bound_v6src as local address instead
- * (that could also still be unspecified).
+ * Clear the source and v6lastdst so we call ip_attr_connect
+ * for the next packet and try to pick a better source.
*/
- if (!IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src))
- sin6->sin6_addr = udp->udp_v6src;
+ if (connp->conn_mcbc_bind)
+ connp->conn_saddr_v6 = ipv6_all_zeros;
else
- sin6->sin6_addr = udp->udp_bound_v6src;
+ connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
+ connp->conn_v6lastdst = ipv6_all_zeros;
+ mutex_exit(&connp->conn_lock);
break;
}
+ ixa_refrele(ixa);
+ return (error);
- return (0);
+ud_error:
+ if (ixa != NULL)
+ ixa_refrele(ixa);
+
+ freemsg(data_mp);
+ BUMP_MIB(&us->us_udp_mib, udpOutErrors);
+ UDP_STAT(us, udp_out_err_output);
+ return (error);
+}
+
+/* ARGSUSED */
+static void
+udp_wput_fallback(queue_t *wq, mblk_t *mp)
+{
+#ifdef DEBUG
+ cmn_err(CE_CONT, "udp_wput_fallback: Message in fallback \n");
+#endif
+ freemsg(mp);
}
+
/*
* Handle special out-of-band ioctl requests (see PSARC/2008/265).
*/
@@ -6717,7 +4440,8 @@ udp_wput_cmdblk(queue_t *q, mblk_t *mp)
{
void *data;
mblk_t *datamp = mp->b_cont;
- udp_t *udp = Q_TO_UDP(q);
+ conn_t *connp = Q_TO_CONN(q);
+ udp_t *udp = connp->conn_udp;
cmdblk_t *cmdp = (cmdblk_t *)mp->b_rptr;
if (datamp == NULL || MBLKL(datamp) < cmdp->cb_len) {
@@ -6727,19 +4451,23 @@ udp_wput_cmdblk(queue_t *q, mblk_t *mp)
}
data = datamp->b_rptr;
- rw_enter(&udp->udp_rwlock, RW_READER);
+ mutex_enter(&connp->conn_lock);
switch (cmdp->cb_cmd) {
case TI_GETPEERNAME:
- cmdp->cb_error = i_udp_getpeername(udp, data, &cmdp->cb_len);
+ if (udp->udp_state != TS_DATA_XFER)
+ cmdp->cb_error = ENOTCONN;
+ else
+ cmdp->cb_error = conn_getpeername(connp, data,
+ &cmdp->cb_len);
break;
case TI_GETMYNAME:
- cmdp->cb_error = udp_getmyname(udp, data, &cmdp->cb_len);
+ cmdp->cb_error = conn_getsockname(connp, data, &cmdp->cb_len);
break;
default:
cmdp->cb_error = EINVAL;
break;
}
- rw_exit(&udp->udp_rwlock);
+ mutex_exit(&connp->conn_lock);
qreply(q, mp);
}
@@ -6747,10 +4475,11 @@ udp_wput_cmdblk(queue_t *q, mblk_t *mp)
static void
udp_use_pure_tpi(udp_t *udp)
{
- rw_enter(&udp->udp_rwlock, RW_WRITER);
- udp->udp_issocket = B_FALSE;
- rw_exit(&udp->udp_rwlock);
+ conn_t *connp = udp->udp_connp;
+ mutex_enter(&connp->conn_lock);
+ udp->udp_issocket = B_FALSE;
+ mutex_exit(&connp->conn_lock);
UDP_STAT(udp->udp_us, udp_sock_fallback);
}
@@ -6758,20 +4487,13 @@ static void
udp_wput_other(queue_t *q, mblk_t *mp)
{
uchar_t *rptr = mp->b_rptr;
- struct datab *db;
struct iocblk *iocp;
- cred_t *cr;
conn_t *connp = Q_TO_CONN(q);
udp_t *udp = connp->conn_udp;
- udp_stack_t *us;
-
- TRACE_1(TR_FAC_UDP, TR_UDP_WPUT_OTHER_START,
- "udp_wput_other_start: q %p", q);
-
- us = udp->udp_us;
- db = mp->b_datap;
+ udp_stack_t *us = udp->udp_us;
+ cred_t *cr;
- switch (db->db_type) {
+ switch (mp->b_datap->db_type) {
case M_CMD:
udp_wput_cmdblk(q, mp);
return;
@@ -6779,37 +4501,29 @@ udp_wput_other(queue_t *q, mblk_t *mp)
case M_PROTO:
case M_PCPROTO:
if (mp->b_wptr - rptr < sizeof (t_scalar_t)) {
+ /*
+ * If the message does not contain a PRIM_type,
+ * throw it away.
+ */
freemsg(mp);
- TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END,
- "udp_wput_other_end: q %p (%S)", q, "protoshort");
return;
}
switch (((t_primp_t)rptr)->type) {
case T_ADDR_REQ:
udp_addr_req(q, mp);
- TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END,
- "udp_wput_other_end: q %p (%S)", q, "addrreq");
return;
case O_T_BIND_REQ:
case T_BIND_REQ:
udp_tpi_bind(q, mp);
- TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END,
- "udp_wput_other_end: q %p (%S)", q, "bindreq");
return;
case T_CONN_REQ:
udp_tpi_connect(q, mp);
- TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END,
- "udp_wput_other_end: q %p (%S)", q, "connreq");
return;
case T_CAPABILITY_REQ:
udp_capability_req(q, mp);
- TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END,
- "udp_wput_other_end: q %p (%S)", q, "capabreq");
return;
case T_INFO_REQ:
udp_info_req(q, mp);
- TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END,
- "udp_wput_other_end: q %p (%S)", q, "inforeq");
return;
case T_UNITDATA_REQ:
/*
@@ -6817,14 +4531,10 @@ udp_wput_other(queue_t *q, mblk_t *mp)
* be bad. Valid T_UNITDATA_REQs are handled
* in udp_wput.
*/
- udp_ud_err(q, mp, NULL, 0, EADDRNOTAVAIL);
- TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END,
- "udp_wput_other_end: q %p (%S)", q, "unitdatareq");
+ udp_ud_err(q, mp, EADDRNOTAVAIL);
return;
case T_UNBIND_REQ:
udp_tpi_unbind(q, mp);
- TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END,
- "udp_wput_other_end: q %p (%S)", q, "unbindreq");
return;
case T_SVR4_OPTMGMT_REQ:
/*
@@ -6842,11 +4552,8 @@ udp_wput_other(queue_t *q, mblk_t *mp)
}
if (!snmpcom_req(q, mp, udp_snmp_set, ip_snmp_get,
cr)) {
- (void) svr4_optcom_req(q,
- mp, cr, &udp_opt_obj, B_TRUE);
+ svr4_optcom_req(q, mp, cr, &udp_opt_obj);
}
- TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END,
- "udp_wput_other_end: q %p (%S)", q, "optmgmtreq");
return;
case T_OPTMGMT_REQ:
@@ -6863,34 +4570,24 @@ udp_wput_other(queue_t *q, mblk_t *mp)
udp_err_ack(q, mp, TSYSERR, EINVAL);
return;
}
- (void) tpi_optcom_req(q, mp, cr, &udp_opt_obj, B_TRUE);
- TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END,
- "udp_wput_other_end: q %p (%S)", q, "optmgmtreq");
+ tpi_optcom_req(q, mp, cr, &udp_opt_obj);
return;
case T_DISCON_REQ:
udp_tpi_disconnect(q, mp);
- TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END,
- "udp_wput_other_end: q %p (%S)", q, "disconreq");
return;
/* The following TPI message is not supported by udp. */
case O_T_CONN_RES:
case T_CONN_RES:
udp_err_ack(q, mp, TNOTSUPPORT, 0);
- TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END,
- "udp_wput_other_end: q %p (%S)", q,
- "connres/disconreq");
return;
- /* The following 3 TPI messages are illegal for udp. */
+ /* The following 3 TPI requests are illegal for udp. */
case T_DATA_REQ:
case T_EXDATA_REQ:
case T_ORDREL_REQ:
udp_err_ack(q, mp, TNOTSUPPORT, 0);
- TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END,
- "udp_wput_other_end: q %p (%S)", q,
- "data/exdata/ordrel");
return;
default:
break;
@@ -6914,13 +4611,10 @@ udp_wput_other(queue_t *q, mblk_t *mp)
iocp->ioc_count = 0;
mp->b_datap->db_type = M_IOCACK;
qreply(q, mp);
- TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END,
- "udp_wput_other_end: q %p (%S)", q,
- "getpeername");
return;
}
/* FALLTHRU */
- case TI_GETMYNAME: {
+ case TI_GETMYNAME:
/*
* For TI_GETPEERNAME and TI_GETMYNAME, we first
* need to copyin the user's strbuf structure.
@@ -6929,17 +4623,12 @@ udp_wput_other(queue_t *q, mblk_t *mp)
*/
mi_copyin(q, mp, NULL,
SIZEOF_STRUCT(strbuf, iocp->ioc_flag));
- TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END,
- "udp_wput_other_end: q %p (%S)", q, "getmyname");
return;
- }
case ND_SET:
/* nd_getset performs the necessary checking */
case ND_GET:
if (nd_getset(q, us->us_nd, mp)) {
qreply(q, mp);
- TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END,
- "udp_wput_other_end: q %p (%S)", q, "get");
return;
}
break;
@@ -6969,16 +4658,12 @@ udp_wput_other(queue_t *q, mblk_t *mp)
break;
case M_IOCDATA:
udp_wput_iocdata(q, mp);
- TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END,
- "udp_wput_other_end: q %p (%S)", q, "iocdata");
return;
default:
/* Unrecognized messages are passed through without change. */
break;
}
- TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END,
- "udp_wput_other_end: q %p (%S)", q, "end");
- ip_output(connp, mp, q, IP_WPUT);
+ ip_wput_nondata(q, mp);
}
/*
@@ -6991,9 +4676,9 @@ udp_wput_iocdata(queue_t *q, mblk_t *mp)
mblk_t *mp1;
struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
STRUCT_HANDLE(strbuf, sb);
- udp_t *udp = Q_TO_UDP(q);
- int error;
uint_t addrlen;
+ conn_t *connp = Q_TO_CONN(q);
+ udp_t *udp = connp->conn_udp;
/* Make sure it is one of ours. */
switch (iocp->ioc_cmd) {
@@ -7001,7 +4686,7 @@ udp_wput_iocdata(queue_t *q, mblk_t *mp)
case TI_GETPEERNAME:
break;
default:
- ip_output(udp->udp_connp, mp, q, IP_WPUT);
+ ip_wput_nondata(q, mp);
return;
}
@@ -7040,77 +4725,45 @@ udp_wput_iocdata(queue_t *q, mblk_t *mp)
* address and then we'll copyout the strbuf.
*/
STRUCT_SET_HANDLE(sb, iocp->ioc_flag, (void *)mp1->b_rptr);
- addrlen = udp->udp_family == AF_INET ? sizeof (sin_t) : sizeof (sin6_t);
+
+ if (connp->conn_family == AF_INET)
+ addrlen = sizeof (sin_t);
+ else
+ addrlen = sizeof (sin6_t);
+
if (STRUCT_FGET(sb, maxlen) < addrlen) {
mi_copy_done(q, mp, EINVAL);
return;
}
- mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE);
-
- if (mp1 == NULL)
- return;
-
- rw_enter(&udp->udp_rwlock, RW_READER);
switch (iocp->ioc_cmd) {
case TI_GETMYNAME:
- error = udp_do_getsockname(udp, (void *)mp1->b_rptr, &addrlen);
break;
case TI_GETPEERNAME:
- error = udp_do_getpeername(udp, (void *)mp1->b_rptr, &addrlen);
+ if (udp->udp_state != TS_DATA_XFER) {
+ mi_copy_done(q, mp, ENOTCONN);
+ return;
+ }
break;
}
- rw_exit(&udp->udp_rwlock);
-
- if (error != 0) {
- mi_copy_done(q, mp, error);
- } else {
- mp1->b_wptr += addrlen;
- STRUCT_FSET(sb, len, addrlen);
-
- /* Copy out the address */
- mi_copyout(q, mp);
- }
-}
-
-static int
-udp_unitdata_opt_process(queue_t *q, mblk_t *mp, int *errorp,
- udpattrs_t *udpattrs)
-{
- struct T_unitdata_req *udreqp;
- int is_absreq_failure;
- cred_t *cr;
-
- ASSERT(((t_primp_t)mp->b_rptr)->type);
-
- /*
- * All Solaris components should pass a db_credp
- * for this TPI message, hence we should ASSERT.
- * However, RPC (svc_clts_ksend) does this odd thing where it
- * passes the options from a T_UNITDATA_IND unchanged in a
- * T_UNITDATA_REQ. While that is the right thing to do for
- * some options, SCM_UCRED being the key one, this also makes it
- * pass down IP_RECVDSTADDR. Hence we can't ASSERT here.
- */
- cr = msg_getcred(mp, NULL);
- if (cr == NULL) {
- cr = Q_TO_CONN(q)->conn_cred;
- }
- udreqp = (struct T_unitdata_req *)mp->b_rptr;
-
- *errorp = tpi_optcom_buf(q, mp, &udreqp->OPT_length,
- udreqp->OPT_offset, cr, &udp_opt_obj,
- udpattrs, &is_absreq_failure);
+ mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE);
+ if (!mp1)
+ return;
- if (*errorp != 0) {
- /*
- * Note: No special action needed in this
- * module for "is_absreq_failure"
- */
- return (-1); /* failure */
+ STRUCT_FSET(sb, len, addrlen);
+ switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
+ case TI_GETMYNAME:
+ (void) conn_getsockname(connp, (struct sockaddr *)mp1->b_wptr,
+ &addrlen);
+ break;
+ case TI_GETPEERNAME:
+ (void) conn_getpeername(connp, (struct sockaddr *)mp1->b_wptr,
+ &addrlen);
+ break;
}
- ASSERT(is_absreq_failure == 0);
- return (0); /* success */
+ mp1->b_wptr += addrlen;
+ /* Copy out the address */
+ mi_copyout(q, mp);
}
void
@@ -7234,34 +4887,19 @@ udp_kstat2_init(netstackid_t stackid, udp_stat_t *us_statisticsp)
kstat_t *ksp;
udp_stat_t template = {
- { "udp_ip_send", KSTAT_DATA_UINT64 },
- { "udp_ip_ire_send", KSTAT_DATA_UINT64 },
- { "udp_ire_null", KSTAT_DATA_UINT64 },
{ "udp_sock_fallback", KSTAT_DATA_UINT64 },
- { "udp_out_sw_cksum", KSTAT_DATA_UINT64 },
- { "udp_out_sw_cksum_bytes", KSTAT_DATA_UINT64 },
{ "udp_out_opt", KSTAT_DATA_UINT64 },
{ "udp_out_err_notconn", KSTAT_DATA_UINT64 },
{ "udp_out_err_output", KSTAT_DATA_UINT64 },
{ "udp_out_err_tudr", KSTAT_DATA_UINT64 },
- { "udp_in_pktinfo", KSTAT_DATA_UINT64 },
- { "udp_in_recvdstaddr", KSTAT_DATA_UINT64 },
- { "udp_in_recvopts", KSTAT_DATA_UINT64 },
- { "udp_in_recvif", KSTAT_DATA_UINT64 },
- { "udp_in_recvslla", KSTAT_DATA_UINT64 },
- { "udp_in_recvucred", KSTAT_DATA_UINT64 },
- { "udp_in_recvttl", KSTAT_DATA_UINT64 },
- { "udp_in_recvhopopts", KSTAT_DATA_UINT64 },
- { "udp_in_recvhoplimit", KSTAT_DATA_UINT64 },
- { "udp_in_recvdstopts", KSTAT_DATA_UINT64 },
- { "udp_in_recvrtdstopts", KSTAT_DATA_UINT64 },
- { "udp_in_recvrthdr", KSTAT_DATA_UINT64 },
- { "udp_in_recvpktinfo", KSTAT_DATA_UINT64 },
- { "udp_in_recvtclass", KSTAT_DATA_UINT64 },
- { "udp_in_timestamp", KSTAT_DATA_UINT64 },
#ifdef DEBUG
{ "udp_data_conn", KSTAT_DATA_UINT64 },
{ "udp_data_notconn", KSTAT_DATA_UINT64 },
+ { "udp_out_lastdst", KSTAT_DATA_UINT64 },
+ { "udp_out_diffdst", KSTAT_DATA_UINT64 },
+ { "udp_out_ipv6", KSTAT_DATA_UINT64 },
+ { "udp_out_mapped", KSTAT_DATA_UINT64 },
+ { "udp_out_ipv4", KSTAT_DATA_UINT64 },
#endif
};
@@ -7384,8 +5022,6 @@ udp_set_rcv_hiwat(udp_t *udp, size_t size)
static void
udp_lrput(queue_t *q, mblk_t *mp)
{
- mblk_t *mp1;
-
switch (mp->b_datap->db_type) {
case M_FLUSH:
/* Turn around */
@@ -7396,9 +5032,6 @@ udp_lrput(queue_t *q, mblk_t *mp)
}
break;
}
- /* Could receive messages that passed through ar_rput */
- for (mp1 = mp; mp1; mp1 = mp1->b_cont)
- mp1->b_prev = mp1->b_next = NULL;
freemsg(mp);
}
@@ -7425,6 +5058,7 @@ udp_do_open(cred_t *credp, boolean_t isv6, int flags)
zoneid_t zoneid;
netstack_t *ns;
udp_stack_t *us;
+ int len;
ns = netstack_find_by_cred(credp);
ASSERT(ns != NULL);
@@ -7455,34 +5089,40 @@ udp_do_open(cred_t *credp, boolean_t isv6, int flags)
*/
netstack_rele(ns);
- rw_enter(&udp->udp_rwlock, RW_WRITER);
- ASSERT(connp->conn_ulp == IPPROTO_UDP);
+ /*
+ * Since this conn_t/udp_t is not yet visible to anybody else we don't
+ * need to lock anything.
+ */
+ ASSERT(connp->conn_proto == IPPROTO_UDP);
ASSERT(connp->conn_udp == udp);
ASSERT(udp->udp_connp == connp);
/* Set the initial state of the stream and the privilege status. */
udp->udp_state = TS_UNBND;
+ connp->conn_ixa->ixa_flags |= IXAF_VERIFY_SOURCE;
if (isv6) {
- udp->udp_family = AF_INET6;
- udp->udp_ipversion = IPV6_VERSION;
- udp->udp_max_hdr_len = IPV6_HDR_LEN + UDPH_SIZE;
- udp->udp_ttl = us->us_ipv6_hoplimit;
- connp->conn_af_isv6 = B_TRUE;
+ connp->conn_family = AF_INET6;
+ connp->conn_ipversion = IPV6_VERSION;
+ connp->conn_ixa->ixa_flags &= ~IXAF_IS_IPV4;
+ connp->conn_default_ttl = us->us_ipv6_hoplimit;
+ len = sizeof (ip6_t) + UDPH_SIZE;
} else {
- udp->udp_family = AF_INET;
- udp->udp_ipversion = IPV4_VERSION;
- udp->udp_max_hdr_len = IP_SIMPLE_HDR_LENGTH + UDPH_SIZE;
- udp->udp_ttl = us->us_ipv4_ttl;
- connp->conn_af_isv6 = B_FALSE;
+ connp->conn_family = AF_INET;
+ connp->conn_ipversion = IPV4_VERSION;
+ connp->conn_ixa->ixa_flags |= IXAF_IS_IPV4;
+ connp->conn_default_ttl = us->us_ipv4_ttl;
+ len = sizeof (ipha_t) + UDPH_SIZE;
}
- udp->udp_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
- udp->udp_pending_op = -1;
- connp->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
- connp->conn_zoneid = zoneid;
+ ASSERT(connp->conn_ixa->ixa_protocol == connp->conn_proto);
+ connp->conn_xmit_ipp.ipp_unicast_hops = connp->conn_default_ttl;
+
+ connp->conn_ixa->ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
+ connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_ULP_CKSUM;
+ /* conn_allzones can not be set this early, hence no IPCL_ZONEID */
+ connp->conn_ixa->ixa_zoneid = zoneid;
- udp->udp_open_time = lbolt64;
- udp->udp_open_pid = curproc->p_pid;
+ connp->conn_zoneid = zoneid;
/*
* If the caller has the process-wide flag set, then default to MAC
@@ -7491,22 +5131,38 @@ udp_do_open(cred_t *credp, boolean_t isv6, int flags)
if (getpflags(NET_MAC_AWARE, credp) != 0)
connp->conn_mac_mode = CONN_MAC_AWARE;
- connp->conn_ulp_labeled = is_system_labeled();
+ connp->conn_zone_is_global = (crgetzoneid(credp) == GLOBAL_ZONEID);
udp->udp_us = us;
+ connp->conn_rcvbuf = us->us_recv_hiwat;
+ connp->conn_sndbuf = us->us_xmit_hiwat;
+ connp->conn_sndlowat = us->us_xmit_lowat;
+ connp->conn_rcvlowat = udp_mod_info.mi_lowat;
+
+ connp->conn_wroff = len + us->us_wroff_extra;
+ connp->conn_so_type = SOCK_DGRAM;
+
connp->conn_recv = udp_input;
+ connp->conn_recvicmp = udp_icmp_input;
crhold(credp);
connp->conn_cred = credp;
+ connp->conn_cpid = curproc->p_pid;
+ connp->conn_open_time = lbolt64;
+ /* Cache things in ixa without an extra refhold */
+ connp->conn_ixa->ixa_cred = connp->conn_cred;
+ connp->conn_ixa->ixa_cpid = connp->conn_cpid;
+ if (is_system_labeled())
+ connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred);
*((sin6_t *)&udp->udp_delayed_addr) = sin6_null;
- rw_exit(&udp->udp_rwlock);
+ if (us->us_pmtu_discovery)
+ connp->conn_ixa->ixa_flags |= IXAF_PMTU_DISCOVERY;
return (connp);
}
-/* ARGSUSED */
sock_lower_handle_t
udp_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
uint_t *smodep, int *errorp, int flags, cred_t *credp)
@@ -7539,39 +5195,17 @@ udp_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
ASSERT(us != NULL);
udp->udp_issocket = B_TRUE;
- connp->conn_flags |= IPCL_NONSTR | IPCL_SOCKET;
-
- /* Set flow control */
- rw_enter(&udp->udp_rwlock, RW_WRITER);
- (void) udp_set_rcv_hiwat(udp, us->us_recv_hiwat);
- udp->udp_rcv_disply_hiwat = us->us_recv_hiwat;
- udp->udp_rcv_lowat = udp_mod_info.mi_lowat;
- udp->udp_xmit_hiwat = us->us_xmit_hiwat;
- udp->udp_xmit_lowat = us->us_xmit_lowat;
-
- if (udp->udp_family == AF_INET6) {
- /* Build initial header template for transmit */
- if ((*errorp = udp_build_hdrs(udp)) != 0) {
- rw_exit(&udp->udp_rwlock);
- ipcl_conn_destroy(connp);
- return (NULL);
- }
- }
- rw_exit(&udp->udp_rwlock);
+ connp->conn_flags |= IPCL_NONSTR;
- connp->conn_flow_cntrld = B_FALSE;
-
- ASSERT(us->us_ldi_ident != NULL);
-
- if ((*errorp = ip_create_helper_stream(connp, us->us_ldi_ident)) != 0) {
- ip1dbg(("udp_create: create of IP helper stream failed\n"));
- udp_do_close(connp);
- return (NULL);
- }
+ /*
+ * Set flow control
+ * Since this conn_t/udp_t is not yet visible to anybody else we don't
+ * need to lock anything.
+ */
+ (void) udp_set_rcv_hiwat(udp, connp->conn_rcvbuf);
+ udp->udp_rcv_disply_hiwat = connp->conn_rcvbuf;
- /* Set the send flow control */
- connp->conn_wq->q_hiwat = us->us_xmit_hiwat;
- connp->conn_wq->q_lowat = us->us_xmit_lowat;
+ connp->conn_flow_cntrld = B_FALSE;
mutex_enter(&connp->conn_lock);
connp->conn_state_flags &= ~CONN_INCIPIENT;
@@ -7583,14 +5217,12 @@ udp_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
return ((sock_lower_handle_t)connp);
}
-/* ARGSUSED */
+/* ARGSUSED3 */
void
udp_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle,
sock_upcalls_t *sock_upcalls, int flags, cred_t *cr)
{
conn_t *connp = (conn_t *)proto_handle;
- udp_t *udp = connp->conn_udp;
- udp_stack_t *us = udp->udp_us;
struct sock_proto_props sopp;
/* All Solaris components should pass a cred for this operation. */
@@ -7599,14 +5231,15 @@ udp_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle,
connp->conn_upcalls = sock_upcalls;
connp->conn_upper_handle = sock_handle;
- sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT |
+ sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT |
SOCKOPT_MAXBLK | SOCKOPT_MAXPSZ | SOCKOPT_MINPSZ;
- sopp.sopp_wroff = udp->udp_max_hdr_len + us->us_wroff_extra;
+ sopp.sopp_wroff = connp->conn_wroff;
sopp.sopp_maxblk = INFPSZ;
- sopp.sopp_rxhiwat = udp->udp_rcv_hiwat;
+ sopp.sopp_rxhiwat = connp->conn_rcvbuf;
+ sopp.sopp_rxlowat = connp->conn_rcvlowat;
sopp.sopp_maxaddrlen = sizeof (sin6_t);
sopp.sopp_maxpsz =
- (udp->udp_family == AF_INET) ? UDP_MAXPACKET_IPV4 :
+ (connp->conn_family == AF_INET) ? UDP_MAXPACKET_IPV4 :
UDP_MAXPACKET_IPV6;
sopp.sopp_minpsz = (udp_mod_info.mi_minpsz == 1) ? 0 :
udp_mod_info.mi_minpsz;
@@ -7618,9 +5251,32 @@ udp_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle,
static void
udp_do_close(conn_t *connp)
{
+ udp_t *udp;
+
ASSERT(connp != NULL && IPCL_IS_UDP(connp));
+ udp = connp->conn_udp;
+
+ if (cl_inet_unbind != NULL && udp->udp_state == TS_IDLE) {
+ /*
+ * Running in cluster mode - register unbind information
+ */
+ if (connp->conn_ipversion == IPV4_VERSION) {
+ (*cl_inet_unbind)(
+ connp->conn_netstack->netstack_stackid,
+ IPPROTO_UDP, AF_INET,
+ (uint8_t *)(&V4_PART_OF_V6(connp->conn_laddr_v6)),
+ (in_port_t)connp->conn_lport, NULL);
+ } else {
+ (*cl_inet_unbind)(
+ connp->conn_netstack->netstack_stackid,
+ IPPROTO_UDP, AF_INET6,
+ (uint8_t *)&(connp->conn_laddr_v6),
+ (in_port_t)connp->conn_lport, NULL);
+ }
+ }
+
+ udp_bind_hash_remove(udp, B_FALSE);
- udp_quiesce_conn(connp);
ip_quiesce_conn(connp);
if (!IPCL_IS_NONSTR(connp)) {
@@ -7642,6 +5298,7 @@ udp_do_close(conn_t *connp)
* future.
*/
ASSERT(connp->conn_ref == 1);
+
if (!IPCL_IS_NONSTR(connp)) {
inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
} else {
@@ -7652,7 +5309,7 @@ udp_do_close(conn_t *connp)
ipcl_conn_destroy(connp);
}
-/* ARGSUSED */
+/* ARGSUSED1 */
int
udp_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr)
{
@@ -7671,59 +5328,41 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
{
sin_t *sin;
sin6_t *sin6;
- sin6_t sin6addr;
+ udp_t *udp = connp->conn_udp;
+ int error = 0;
+ ip_laddr_t laddr_type = IPVL_UNICAST_UP; /* INADDR_ANY */
in_port_t port; /* Host byte order */
in_port_t requested_port; /* Host byte order */
int count;
+ ipaddr_t v4src; /* Set if AF_INET */
in6_addr_t v6src;
int loopmax;
udp_fanout_t *udpf;
in_port_t lport; /* Network byte order */
- udp_t *udp;
+ uint_t scopeid = 0;
+ zoneid_t zoneid = IPCL_ZONEID(connp);
+ ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
boolean_t is_inaddr_any;
mlp_type_t addrtype, mlptype;
- udp_stack_t *us;
- int error = 0;
- mblk_t *mp = NULL;
-
- udp = connp->conn_udp;
- us = udp->udp_us;
-
- if (udp->udp_state != TS_UNBND) {
- (void) strlog(UDP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
- "udp_bind: bad state, %u", udp->udp_state);
- return (-TOUTSTATE);
- }
+ udp_stack_t *us = udp->udp_us;
switch (len) {
- case 0:
- if (udp->udp_family == AF_INET) {
- sin = (sin_t *)&sin6addr;
- *sin = sin_null;
- sin->sin_family = AF_INET;
- sin->sin_addr.s_addr = INADDR_ANY;
- udp->udp_ipversion = IPV4_VERSION;
- } else {
- ASSERT(udp->udp_family == AF_INET6);
- sin6 = (sin6_t *)&sin6addr;
- *sin6 = sin6_null;
- sin6->sin6_family = AF_INET6;
- V6_SET_ZERO(sin6->sin6_addr);
- udp->udp_ipversion = IPV6_VERSION;
- }
- port = 0;
- break;
-
case sizeof (sin_t): /* Complete IPv4 address */
sin = (sin_t *)sa;
if (sin == NULL || !OK_32PTR((char *)sin))
return (EINVAL);
- if (udp->udp_family != AF_INET ||
+ if (connp->conn_family != AF_INET ||
sin->sin_family != AF_INET) {
return (EAFNOSUPPORT);
}
+ v4src = sin->sin_addr.s_addr;
+ IN6_IPADDR_TO_V4MAPPED(v4src, &v6src);
+ if (v4src != INADDR_ANY) {
+ laddr_type = ip_laddr_verify_v4(v4src, zoneid, ipst,
+ B_TRUE);
+ }
port = ntohs(sin->sin_port);
break;
@@ -7733,10 +5372,28 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
if (sin6 == NULL || !OK_32PTR((char *)sin6))
return (EINVAL);
- if (udp->udp_family != AF_INET6 ||
+ if (connp->conn_family != AF_INET6 ||
sin6->sin6_family != AF_INET6) {
return (EAFNOSUPPORT);
}
+ v6src = sin6->sin6_addr;
+ if (IN6_IS_ADDR_V4MAPPED(&v6src)) {
+ if (connp->conn_ipv6_v6only)
+ return (EADDRNOTAVAIL);
+
+ IN6_V4MAPPED_TO_IPADDR(&v6src, v4src);
+ if (v4src != INADDR_ANY) {
+ laddr_type = ip_laddr_verify_v4(v4src,
+ zoneid, ipst, B_FALSE);
+ }
+ } else {
+ if (!IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
+ if (IN6_IS_ADDR_LINKSCOPE(&v6src))
+ scopeid = sin6->sin6_scope_id;
+ laddr_type = ip_laddr_verify_v6(&v6src,
+ zoneid, ipst, B_TRUE, scopeid);
+ }
+ }
port = ntohs(sin6->sin6_port);
break;
@@ -7746,6 +5403,10 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
return (-TBADADDR);
}
+ /* Is the local address a valid unicast, multicast, or broadcast? */
+ if (laddr_type == IPVL_BAD)
+ return (EADDRNOTAVAIL);
+
requested_port = port;
if (requested_port == 0 || !bind_to_req_port_only)
@@ -7759,7 +5420,7 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
* doesn't care which port number we bind to. Get one in the
* valid range.
*/
- if (udp->udp_anon_priv_bind) {
+ if (connp->conn_anon_priv_bind) {
port = udp_get_next_priv_port(udp);
} else {
port = udp_update_next_port(udp,
@@ -7798,53 +5459,45 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
* TPI primitives only 1 at a time and wait for the response before
* sending the next primitive.
*/
- rw_enter(&udp->udp_rwlock, RW_WRITER);
- if (udp->udp_state != TS_UNBND || udp->udp_pending_op != -1) {
- rw_exit(&udp->udp_rwlock);
+ mutex_enter(&connp->conn_lock);
+ if (udp->udp_state != TS_UNBND) {
+ mutex_exit(&connp->conn_lock);
(void) strlog(UDP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
"udp_bind: bad state, %u", udp->udp_state);
return (-TOUTSTATE);
}
- /* XXX how to remove the T_BIND_REQ? Should set it before calling */
- udp->udp_pending_op = T_BIND_REQ;
/*
* Copy the source address into our udp structure. This address
* may still be zero; if so, IP will fill in the correct address
* each time an outbound packet is passed to it. Since the udp is
* not yet in the bind hash list, we don't grab the uf_lock to
- * change udp_ipversion
+ * change conn_ipversion
*/
- if (udp->udp_family == AF_INET) {
+ if (connp->conn_family == AF_INET) {
ASSERT(sin != NULL);
- ASSERT(udp->udp_ipversion == IPV4_VERSION);
- udp->udp_max_hdr_len = IP_SIMPLE_HDR_LENGTH + UDPH_SIZE +
- udp->udp_ip_snd_options_len;
- IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6src);
+ ASSERT(connp->conn_ixa->ixa_flags & IXAF_IS_IPV4);
} else {
- ASSERT(sin6 != NULL);
- v6src = sin6->sin6_addr;
if (IN6_IS_ADDR_V4MAPPED(&v6src)) {
/*
- * no need to hold the uf_lock to set the udp_ipversion
+ * no need to hold the uf_lock to set the conn_ipversion
* since we are not yet in the fanout list
*/
- udp->udp_ipversion = IPV4_VERSION;
- udp->udp_max_hdr_len = IP_SIMPLE_HDR_LENGTH +
- UDPH_SIZE + udp->udp_ip_snd_options_len;
+ connp->conn_ipversion = IPV4_VERSION;
+ connp->conn_ixa->ixa_flags |= IXAF_IS_IPV4;
} else {
- udp->udp_ipversion = IPV6_VERSION;
- udp->udp_max_hdr_len = udp->udp_sticky_hdrs_len;
+ connp->conn_ipversion = IPV6_VERSION;
+ connp->conn_ixa->ixa_flags &= ~IXAF_IS_IPV4;
}
}
/*
- * If udp_reuseaddr is not set, then we have to make sure that
+ * If conn_reuseaddr is not set, then we have to make sure that
* the IP address and port number the application requested
* (or we selected for the application) is not being used by
* another stream. If another stream is already using the
* requested IP address and port, the behavior depends on
* "bind_to_req_port_only". If set the bind fails; otherwise we
- * search for any an unused port to bind to the the stream.
+ * search for any an unused port to bind to the stream.
*
* As per the BSD semantics, as modified by the Deering multicast
* changes, if udp_reuseaddr is set, then we allow multiple binds
@@ -7860,7 +5513,7 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
*/
count = 0;
- if (udp->udp_anon_priv_bind) {
+ if (connp->conn_anon_priv_bind) {
/*
* loopmax = (IPPORT_RESERVED-1) -
* us->us_min_anonpriv_port + 1
@@ -7876,6 +5529,7 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
for (;;) {
udp_t *udp1;
boolean_t found_exclbind = B_FALSE;
+ conn_t *connp1;
/*
* Walk through the list of udp streams bound to
@@ -7887,7 +5541,9 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
mutex_enter(&udpf->uf_lock);
for (udp1 = udpf->uf_udp; udp1 != NULL;
udp1 = udp1->udp_bind_hash) {
- if (lport != udp1->udp_port)
+ connp1 = udp1->udp_connp;
+
+ if (lport != connp1->conn_lport)
continue;
/*
@@ -7896,7 +5552,7 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
* privilege as being in all zones, as there's
* otherwise no way to identify the right receiver.
*/
- if (!IPCL_BIND_ZONE_MATCH(udp1->udp_connp, connp))
+ if (!IPCL_BIND_ZONE_MATCH(connp1, connp))
continue;
/*
@@ -7918,12 +5574,13 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
* For labeled systems, SO_MAC_EXEMPT behaves the same
* as UDP_EXCLBIND, except that zoneid is ignored.
*/
- if (udp1->udp_exclbind || udp->udp_exclbind ||
+ if (connp1->conn_exclbind || connp->conn_exclbind ||
IPCL_CONNS_MAC(udp1->udp_connp, connp)) {
if (V6_OR_V4_INADDR_ANY(
- udp1->udp_bound_v6src) ||
+ connp1->conn_bound_addr_v6) ||
is_inaddr_any ||
- IN6_ARE_ADDR_EQUAL(&udp1->udp_bound_v6src,
+ IN6_ARE_ADDR_EQUAL(
+ &connp1->conn_bound_addr_v6,
&v6src)) {
found_exclbind = B_TRUE;
break;
@@ -7935,7 +5592,7 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
* Check ipversion to allow IPv4 and IPv6 sockets to
* have disjoint port number spaces.
*/
- if (udp->udp_ipversion != udp1->udp_ipversion) {
+ if (connp->conn_ipversion != connp1->conn_ipversion) {
/*
* On the first time through the loop, if the
@@ -7963,8 +5620,8 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
* (non-wildcard, also), keep going.
*/
if (!is_inaddr_any &&
- !V6_OR_V4_INADDR_ANY(udp1->udp_bound_v6src) &&
- !IN6_ARE_ADDR_EQUAL(&udp1->udp_bound_v6src,
+ !V6_OR_V4_INADDR_ANY(connp1->conn_bound_addr_v6) &&
+ !IN6_ARE_ADDR_EQUAL(&connp1->conn_laddr_v6,
&v6src)) {
continue;
}
@@ -7972,7 +5629,7 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
}
if (!found_exclbind &&
- (udp->udp_reuseaddr && requested_port != 0)) {
+ (connp->conn_reuseaddr && requested_port != 0)) {
break;
}
@@ -7995,12 +5652,11 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
* the routine (and exit the loop).
*
*/
- udp->udp_pending_op = -1;
- rw_exit(&udp->udp_rwlock);
+ mutex_exit(&connp->conn_lock);
return (-TADDRBUSY);
}
- if (udp->udp_anon_priv_bind) {
+ if (connp->conn_anon_priv_bind) {
port = udp_get_next_priv_port(udp);
} else {
if ((count == 0) && (requested_port != 0)) {
@@ -8025,66 +5681,82 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
* there are none available, so send an error
* to the user.
*/
- udp->udp_pending_op = -1;
- rw_exit(&udp->udp_rwlock);
+ mutex_exit(&connp->conn_lock);
return (-TNOADDR);
}
}
/*
* Copy the source address into our udp structure. This address
- * may still be zero; if so, ip will fill in the correct address
- * each time an outbound packet is passed to it.
+ * may still be zero; if so, ip_attr_connect will fill in the correct
+ * address when a packet is about to be sent.
* If we are binding to a broadcast or multicast address then
- * udp_post_ip_bind_connect will clear the source address
- * when udp_do_bind success.
+ * we just set the conn_bound_addr since we don't want to use
+ * that as the source address when sending.
*/
- udp->udp_v6src = udp->udp_bound_v6src = v6src;
- udp->udp_port = lport;
+ connp->conn_bound_addr_v6 = v6src;
+ connp->conn_laddr_v6 = v6src;
+ if (scopeid != 0) {
+ connp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET;
+ connp->conn_ixa->ixa_scopeid = scopeid;
+ connp->conn_incoming_ifindex = scopeid;
+ } else {
+ connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
+ connp->conn_incoming_ifindex = connp->conn_bound_if;
+ }
+
+ switch (laddr_type) {
+ case IPVL_UNICAST_UP:
+ case IPVL_UNICAST_DOWN:
+ connp->conn_saddr_v6 = v6src;
+ connp->conn_mcbc_bind = B_FALSE;
+ break;
+ case IPVL_MCAST:
+ case IPVL_BCAST:
+ /* ip_set_destination will pick a source address later */
+ connp->conn_saddr_v6 = ipv6_all_zeros;
+ connp->conn_mcbc_bind = B_TRUE;
+ break;
+ }
+
+ /* Any errors after this point should use late_error */
+ connp->conn_lport = lport;
+
/*
- * Now reset the the next anonymous port if the application requested
+ * Now reset the next anonymous port if the application requested
* an anonymous port, or we handed out the next anonymous port.
*/
- if ((requested_port == 0) && (!udp->udp_anon_priv_bind)) {
+ if ((requested_port == 0) && (!connp->conn_anon_priv_bind)) {
us->us_next_port_to_try = port + 1;
}
- /* Initialize the O_T_BIND_REQ/T_BIND_REQ for ip. */
- if (udp->udp_family == AF_INET) {
- sin->sin_port = udp->udp_port;
+ /* Initialize the T_BIND_ACK. */
+ if (connp->conn_family == AF_INET) {
+ sin->sin_port = connp->conn_lport;
} else {
- sin6->sin6_port = udp->udp_port;
- /* Rebuild the header template */
- error = udp_build_hdrs(udp);
- if (error != 0) {
- udp->udp_pending_op = -1;
- rw_exit(&udp->udp_rwlock);
- mutex_exit(&udpf->uf_lock);
- return (error);
- }
+ sin6->sin6_port = connp->conn_lport;
}
udp->udp_state = TS_IDLE;
udp_bind_hash_insert(udpf, udp);
mutex_exit(&udpf->uf_lock);
- rw_exit(&udp->udp_rwlock);
+ mutex_exit(&connp->conn_lock);
if (cl_inet_bind) {
/*
* Running in cluster mode - register bind information
*/
- if (udp->udp_ipversion == IPV4_VERSION) {
+ if (connp->conn_ipversion == IPV4_VERSION) {
(*cl_inet_bind)(connp->conn_netstack->netstack_stackid,
- IPPROTO_UDP, AF_INET,
- (uint8_t *)(&V4_PART_OF_V6(udp->udp_v6src)),
- (in_port_t)udp->udp_port, NULL);
+ IPPROTO_UDP, AF_INET, (uint8_t *)&v4src,
+ (in_port_t)connp->conn_lport, NULL);
} else {
(*cl_inet_bind)(connp->conn_netstack->netstack_stackid,
- IPPROTO_UDP, AF_INET6,
- (uint8_t *)&(udp->udp_v6src),
- (in_port_t)udp->udp_port, NULL);
+ IPPROTO_UDP, AF_INET6, (uint8_t *)&v6src,
+ (in_port_t)connp->conn_lport, NULL);
}
}
+ mutex_enter(&connp->conn_lock);
connp->conn_anon_port = (is_system_labeled() && requested_port == 0);
if (is_system_labeled() && (!connp->conn_anon_port ||
connp->conn_anon_mlp)) {
@@ -8092,18 +5764,16 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
zone_t *zone;
zone = crgetzone(cr);
- connp->conn_mlp_type = udp->udp_recvucred ? mlptBoth :
+ connp->conn_mlp_type =
+ connp->conn_recv_ancillary.crb_recvucred ? mlptBoth :
mlptSingle;
addrtype = tsol_mlp_addr_type(
connp->conn_allzones ? ALL_ZONES : zone->zone_id,
IPV6_VERSION, &v6src, us->us_netstack->netstack_ip);
if (addrtype == mlptSingle) {
- rw_enter(&udp->udp_rwlock, RW_WRITER);
- udp->udp_pending_op = -1;
- rw_exit(&udp->udp_rwlock);
- connp->conn_anon_port = B_FALSE;
- connp->conn_mlp_type = mlptSingle;
- return (-TNOADDR);
+ error = -TNOADDR;
+ mutex_exit(&connp->conn_lock);
+ goto late_error;
}
mlpport = connp->conn_anon_port ? PMAPPORT : port;
mlptype = tsol_mlp_port_type(zone, IPPROTO_UDP, mlpport,
@@ -8115,12 +5785,9 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
*/
if (mlptype != mlptSingle &&
connp->conn_mlp_type == mlptSingle) {
- rw_enter(&udp->udp_rwlock, RW_WRITER);
- udp->udp_pending_op = -1;
- rw_exit(&udp->udp_rwlock);
- connp->conn_anon_port = B_FALSE;
- connp->conn_mlp_type = mlptSingle;
- return (EINVAL);
+ error = EINVAL;
+ mutex_exit(&connp->conn_lock);
+ goto late_error;
}
/*
@@ -8129,18 +5796,15 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
*/
if (mlptype != mlptSingle &&
secpolicy_net_bindmlp(cr) != 0) {
- if (udp->udp_debug) {
+ if (connp->conn_debug) {
(void) strlog(UDP_MOD_ID, 0, 1,
SL_ERROR|SL_TRACE,
"udp_bind: no priv for multilevel port %d",
mlpport);
}
- rw_enter(&udp->udp_rwlock, RW_WRITER);
- udp->udp_pending_op = -1;
- rw_exit(&udp->udp_rwlock);
- connp->conn_anon_port = B_FALSE;
- connp->conn_mlp_type = mlptSingle;
- return (-TACCES);
+ error = -TACCES;
+ mutex_exit(&connp->conn_lock);
+ goto late_error;
}
/*
@@ -8158,7 +5822,7 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
mlpzone = tsol_mlp_findzone(IPPROTO_UDP,
htons(mlpport));
if (connp->conn_zoneid != mlpzone) {
- if (udp->udp_debug) {
+ if (connp->conn_debug) {
(void) strlog(UDP_MOD_ID, 0, 1,
SL_ERROR|SL_TRACE,
"udp_bind: attempt to bind port "
@@ -8167,62 +5831,82 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
mlpport, connp->conn_zoneid,
mlpzone);
}
- rw_enter(&udp->udp_rwlock, RW_WRITER);
- udp->udp_pending_op = -1;
- rw_exit(&udp->udp_rwlock);
- connp->conn_anon_port = B_FALSE;
- connp->conn_mlp_type = mlptSingle;
- return (-TACCES);
+ error = -TACCES;
+ mutex_exit(&connp->conn_lock);
+ goto late_error;
}
}
if (connp->conn_anon_port) {
- error = tsol_mlp_anon(zone, mlptype, connp->conn_ulp,
+ error = tsol_mlp_anon(zone, mlptype, connp->conn_proto,
port, B_TRUE);
if (error != 0) {
- if (udp->udp_debug) {
+ if (connp->conn_debug) {
(void) strlog(UDP_MOD_ID, 0, 1,
SL_ERROR|SL_TRACE,
"udp_bind: cannot establish anon "
"MLP for port %d", port);
}
- rw_enter(&udp->udp_rwlock, RW_WRITER);
- udp->udp_pending_op = -1;
- rw_exit(&udp->udp_rwlock);
- connp->conn_anon_port = B_FALSE;
- connp->conn_mlp_type = mlptSingle;
- return (-TACCES);
+ error = -TACCES;
+ mutex_exit(&connp->conn_lock);
+ goto late_error;
}
}
connp->conn_mlp_type = mlptype;
}
- if (!V6_OR_V4_INADDR_ANY(udp->udp_v6src)) {
- /*
- * Append a request for an IRE if udp_v6src not
- * zero (IPv4 - INADDR_ANY, or IPv6 - all-zeroes address).
- */
- mp = allocb(sizeof (ire_t), BPRI_HI);
- if (!mp) {
- rw_enter(&udp->udp_rwlock, RW_WRITER);
- udp->udp_pending_op = -1;
- rw_exit(&udp->udp_rwlock);
- return (ENOMEM);
- }
- mp->b_wptr += sizeof (ire_t);
- mp->b_datap->db_type = IRE_DB_REQ_TYPE;
+ /*
+ * We create an initial header template here to make a subsequent
+ * sendto have a starting point. Since conn_last_dst is zero the
+ * first sendto will always follow the 'dst changed' code path.
+ * Note that we defer massaging options and the related checksum
+ * adjustment until we have a destination address.
+ */
+ error = udp_build_hdr_template(connp, &connp->conn_saddr_v6,
+ &connp->conn_faddr_v6, connp->conn_fport, connp->conn_flowinfo);
+ if (error != 0) {
+ mutex_exit(&connp->conn_lock);
+ goto late_error;
}
- if (udp->udp_family == AF_INET6) {
- ASSERT(udp->udp_connp->conn_af_isv6);
- error = ip_proto_bind_laddr_v6(connp, &mp, IPPROTO_UDP,
- &udp->udp_bound_v6src, udp->udp_port, B_TRUE);
- } else {
- ASSERT(!udp->udp_connp->conn_af_isv6);
- error = ip_proto_bind_laddr_v4(connp, &mp, IPPROTO_UDP,
- V4_PART_OF_V6(udp->udp_bound_v6src), udp->udp_port,
- B_TRUE);
+ /* Just in case */
+ connp->conn_faddr_v6 = ipv6_all_zeros;
+ connp->conn_fport = 0;
+ connp->conn_v6lastdst = ipv6_all_zeros;
+ mutex_exit(&connp->conn_lock);
+
+ error = ip_laddr_fanout_insert(connp);
+ if (error != 0)
+ goto late_error;
+
+ /* Bind succeeded */
+ return (0);
+
+late_error:
+ /* We had already picked the port number, and then the bind failed */
+ mutex_enter(&connp->conn_lock);
+ udpf = &us->us_bind_fanout[
+ UDP_BIND_HASH(connp->conn_lport,
+ us->us_bind_fanout_size)];
+ mutex_enter(&udpf->uf_lock);
+ connp->conn_saddr_v6 = ipv6_all_zeros;
+ connp->conn_bound_addr_v6 = ipv6_all_zeros;
+ connp->conn_laddr_v6 = ipv6_all_zeros;
+ if (scopeid != 0) {
+ connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
+ connp->conn_incoming_ifindex = connp->conn_bound_if;
}
+ udp->udp_state = TS_UNBND;
+ udp_bind_hash_remove(udp, B_TRUE);
+ connp->conn_lport = 0;
+ mutex_exit(&udpf->uf_lock);
+ connp->conn_anon_port = B_FALSE;
+ connp->conn_mlp_type = mlptSingle;
- (void) udp_post_ip_bind_connect(udp, mp, error);
+ connp->conn_v6lastdst = ipv6_all_zeros;
+
+ /* Restore the header that was built above - different source address */
+ (void) udp_build_hdr_template(connp, &connp->conn_saddr_v6,
+ &connp->conn_faddr_v6, connp->conn_fport, connp->conn_flowinfo);
+ mutex_exit(&connp->conn_lock);
return (error);
}
@@ -8256,12 +5940,32 @@ udp_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
static int
udp_implicit_bind(conn_t *connp, cred_t *cr)
{
+ sin6_t sin6addr;
+ sin_t *sin;
+ sin6_t *sin6;
+ socklen_t len;
int error;
/* All Solaris components should pass a cred for this operation. */
ASSERT(cr != NULL);
- error = udp_do_bind(connp, NULL, 0, cr, B_FALSE);
+ if (connp->conn_family == AF_INET) {
+ len = sizeof (struct sockaddr_in);
+ sin = (sin_t *)&sin6addr;
+ *sin = sin_null;
+ sin->sin_family = AF_INET;
+ sin->sin_addr.s_addr = INADDR_ANY;
+ } else {
+ ASSERT(connp->conn_family == AF_INET6);
+ len = sizeof (sin6_t);
+ sin6 = (sin6_t *)&sin6addr;
+ *sin6 = sin6_null;
+ sin6->sin6_family = AF_INET6;
+ V6_SET_ZERO(sin6->sin6_addr);
+ }
+
+ error = udp_do_bind(connp, (struct sockaddr *)&sin6addr, len,
+ cr, B_FALSE);
return ((error < 0) ? proto_tlitosyserr(-error) : error);
}
@@ -8280,137 +5984,51 @@ udp_do_unbind(conn_t *connp)
/*
* Running in cluster mode - register unbind information
*/
- if (udp->udp_ipversion == IPV4_VERSION) {
+ if (connp->conn_ipversion == IPV4_VERSION) {
(*cl_inet_unbind)(
connp->conn_netstack->netstack_stackid,
IPPROTO_UDP, AF_INET,
- (uint8_t *)(&V4_PART_OF_V6(udp->udp_v6src)),
- (in_port_t)udp->udp_port, NULL);
+ (uint8_t *)(&V4_PART_OF_V6(connp->conn_laddr_v6)),
+ (in_port_t)connp->conn_lport, NULL);
} else {
(*cl_inet_unbind)(
connp->conn_netstack->netstack_stackid,
IPPROTO_UDP, AF_INET6,
- (uint8_t *)&(udp->udp_v6src),
- (in_port_t)udp->udp_port, NULL);
+ (uint8_t *)&(connp->conn_laddr_v6),
+ (in_port_t)connp->conn_lport, NULL);
}
}
- rw_enter(&udp->udp_rwlock, RW_WRITER);
- if (udp->udp_state == TS_UNBND || udp->udp_pending_op != -1) {
- rw_exit(&udp->udp_rwlock);
+ mutex_enter(&connp->conn_lock);
+ /* If a bind has not been done, we can't unbind. */
+ if (udp->udp_state == TS_UNBND) {
+ mutex_exit(&connp->conn_lock);
return (-TOUTSTATE);
}
- udp->udp_pending_op = T_UNBIND_REQ;
- rw_exit(&udp->udp_rwlock);
-
- /*
- * Pass the unbind to IP; T_UNBIND_REQ is larger than T_OK_ACK
- * and therefore ip_unbind must never return NULL.
- */
- ip_unbind(connp);
-
- /*
- * Once we're unbound from IP, the pending operation may be cleared
- * here.
- */
- rw_enter(&udp->udp_rwlock, RW_WRITER);
- udpf = &us->us_bind_fanout[UDP_BIND_HASH(udp->udp_port,
+ udpf = &us->us_bind_fanout[UDP_BIND_HASH(connp->conn_lport,
us->us_bind_fanout_size)];
-
mutex_enter(&udpf->uf_lock);
udp_bind_hash_remove(udp, B_TRUE);
- V6_SET_ZERO(udp->udp_v6src);
- V6_SET_ZERO(udp->udp_bound_v6src);
- udp->udp_port = 0;
+ connp->conn_saddr_v6 = ipv6_all_zeros;
+ connp->conn_bound_addr_v6 = ipv6_all_zeros;
+ connp->conn_laddr_v6 = ipv6_all_zeros;
+ connp->conn_mcbc_bind = B_FALSE;
+ connp->conn_lport = 0;
+ /* In case we were also connected */
+ connp->conn_faddr_v6 = ipv6_all_zeros;
+ connp->conn_fport = 0;
mutex_exit(&udpf->uf_lock);
- udp->udp_pending_op = -1;
+ connp->conn_v6lastdst = ipv6_all_zeros;
udp->udp_state = TS_UNBND;
- if (udp->udp_family == AF_INET6)
- (void) udp_build_hdrs(udp);
- rw_exit(&udp->udp_rwlock);
- return (0);
-}
-
-static int
-udp_post_ip_bind_connect(udp_t *udp, mblk_t *ire_mp, int error)
-{
- ire_t *ire;
- udp_fanout_t *udpf;
- udp_stack_t *us = udp->udp_us;
-
- ASSERT(udp->udp_pending_op != -1);
- rw_enter(&udp->udp_rwlock, RW_WRITER);
- if (error == 0) {
- /* For udp_do_connect() success */
- /* udp_do_bind() success will do nothing in here */
- /*
- * If a broadcast/multicast address was bound, set
- * the source address to 0.
- * This ensures no datagrams with broadcast address
- * as source address are emitted (which would violate
- * RFC1122 - Hosts requirements)
- *
- * Note that when connecting the returned IRE is
- * for the destination address and we only perform
- * the broadcast check for the source address (it
- * is OK to connect to a broadcast/multicast address.)
- */
- if (ire_mp != NULL && ire_mp->b_datap->db_type == IRE_DB_TYPE) {
- ire = (ire_t *)ire_mp->b_rptr;
+ (void) udp_build_hdr_template(connp, &connp->conn_saddr_v6,
+ &connp->conn_faddr_v6, connp->conn_fport, connp->conn_flowinfo);
+ mutex_exit(&connp->conn_lock);
- /*
- * Note: we get IRE_BROADCAST for IPv6 to "mark" a
- * multicast local address.
- */
- udpf = &us->us_bind_fanout[UDP_BIND_HASH(udp->udp_port,
- us->us_bind_fanout_size)];
- if (ire->ire_type == IRE_BROADCAST &&
- udp->udp_state != TS_DATA_XFER) {
- ASSERT(udp->udp_pending_op == T_BIND_REQ ||
- udp->udp_pending_op == O_T_BIND_REQ);
- /*
- * This was just a local bind to a broadcast
- * addr.
- */
- mutex_enter(&udpf->uf_lock);
- V6_SET_ZERO(udp->udp_v6src);
- mutex_exit(&udpf->uf_lock);
- if (udp->udp_family == AF_INET6)
- (void) udp_build_hdrs(udp);
- } else if (V6_OR_V4_INADDR_ANY(udp->udp_v6src)) {
- if (udp->udp_family == AF_INET6)
- (void) udp_build_hdrs(udp);
- }
- }
- } else {
- udpf = &us->us_bind_fanout[UDP_BIND_HASH(udp->udp_port,
- us->us_bind_fanout_size)];
- mutex_enter(&udpf->uf_lock);
+ ip_unbind(connp);
- if (udp->udp_state == TS_DATA_XFER) {
- /* Connect failed */
- /* Revert back to the bound source */
- udp->udp_v6src = udp->udp_bound_v6src;
- udp->udp_state = TS_IDLE;
- } else {
- /* For udp_do_bind() failed */
- V6_SET_ZERO(udp->udp_v6src);
- V6_SET_ZERO(udp->udp_bound_v6src);
- udp->udp_state = TS_UNBND;
- udp_bind_hash_remove(udp, B_TRUE);
- udp->udp_port = 0;
- }
- mutex_exit(&udpf->uf_lock);
- if (udp->udp_family == AF_INET6)
- (void) udp_build_hdrs(udp);
- }
- udp->udp_pending_op = -1;
- rw_exit(&udp->udp_rwlock);
- if (ire_mp != NULL)
- freeb(ire_mp);
- return (error);
+ return (0);
}
/*
@@ -8418,7 +6036,7 @@ udp_post_ip_bind_connect(udp_t *udp, mblk_t *ire_mp, int error)
*/
static int
udp_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len,
- cred_t *cr)
+ cred_t *cr, pid_t pid)
{
sin6_t *sin6;
sin_t *sin;
@@ -8426,12 +6044,16 @@ udp_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len,
ipaddr_t v4dst;
uint16_t dstport;
uint32_t flowinfo;
- mblk_t *ire_mp;
udp_fanout_t *udpf;
udp_t *udp, *udp1;
ushort_t ipversion;
udp_stack_t *us;
int error;
+ conn_t *connp1;
+ ip_xmit_attr_t *ixa;
+ uint_t scopeid = 0;
+ uint_t srcid = 0;
+ in6_addr_t v6src = connp->conn_saddr_v6;
udp = connp->conn_udp;
us = udp->udp_us;
@@ -8451,7 +6073,7 @@ udp_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len,
v4dst = sin->sin_addr.s_addr;
dstport = sin->sin_port;
IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst);
- ASSERT(udp->udp_ipversion == IPV4_VERSION);
+ ASSERT(connp->conn_ipversion == IPV4_VERSION);
ipversion = IPV4_VERSION;
break;
@@ -8459,13 +6081,33 @@ udp_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len,
sin6 = (sin6_t *)sa;
v6dst = sin6->sin6_addr;
dstport = sin6->sin6_port;
+ srcid = sin6->__sin6_src_id;
+ if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
+ ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
+ connp->conn_netstack);
+ }
if (IN6_IS_ADDR_V4MAPPED(&v6dst)) {
+ if (connp->conn_ipv6_v6only)
+ return (EADDRNOTAVAIL);
+
+ /*
+			 * Destination address is a mapped IPv6 address.
+ * Source bound address should be unspecified or
+ * IPv6 mapped address as well.
+ */
+ if (!IN6_IS_ADDR_UNSPECIFIED(
+ &connp->conn_bound_addr_v6) &&
+ !IN6_IS_ADDR_V4MAPPED(&connp->conn_bound_addr_v6)) {
+ return (EADDRNOTAVAIL);
+ }
IN6_V4MAPPED_TO_IPADDR(&v6dst, v4dst);
ipversion = IPV4_VERSION;
flowinfo = 0;
} else {
ipversion = IPV6_VERSION;
flowinfo = sin6->sin6_flowinfo;
+ if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr))
+ scopeid = sin6->sin6_scope_id;
}
break;
}
@@ -8473,44 +6115,53 @@ udp_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len,
if (dstport == 0)
return (-TBADADDR);
- rw_enter(&udp->udp_rwlock, RW_WRITER);
+ /*
+ * If there is a different thread using conn_ixa then we get a new
+ * copy and cut the old one loose from conn_ixa. Otherwise we use
+ * conn_ixa and prevent any other thread from using/changing it.
+ * Once connect() is done other threads can use conn_ixa since the
+ * refcnt will be back at one.
+ */
+ ixa = conn_get_ixa(connp, B_TRUE);
+ if (ixa == NULL)
+ return (ENOMEM);
+ ASSERT(ixa->ixa_refcnt >= 2);
+ ASSERT(ixa == connp->conn_ixa);
+
+ mutex_enter(&connp->conn_lock);
/*
- * This UDP must have bound to a port already before doing a connect.
- * TPI mandates that users must send TPI primitives only 1 at a time
- * and wait for the response before sending the next primitive.
+ * This udp_t must have bound to a port already before doing a connect.
+ * Reject if a connect is in progress (we drop conn_lock during
+ * udp_do_connect).
*/
- if (udp->udp_state == TS_UNBND || udp->udp_pending_op != -1) {
- rw_exit(&udp->udp_rwlock);
+ if (udp->udp_state == TS_UNBND || udp->udp_state == TS_WCON_CREQ) {
+ mutex_exit(&connp->conn_lock);
(void) strlog(UDP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
"udp_connect: bad state, %u", udp->udp_state);
+ ixa_refrele(ixa);
return (-TOUTSTATE);
}
- udp->udp_pending_op = T_CONN_REQ;
- ASSERT(udp->udp_port != 0 && udp->udp_ptpbhn != NULL);
-
- if (ipversion == IPV4_VERSION) {
- udp->udp_max_hdr_len = IP_SIMPLE_HDR_LENGTH + UDPH_SIZE +
- udp->udp_ip_snd_options_len;
- } else {
- udp->udp_max_hdr_len = udp->udp_sticky_hdrs_len;
- }
+ ASSERT(connp->conn_lport != 0 && udp->udp_ptpbhn != NULL);
- udpf = &us->us_bind_fanout[UDP_BIND_HASH(udp->udp_port,
+ udpf = &us->us_bind_fanout[UDP_BIND_HASH(connp->conn_lport,
us->us_bind_fanout_size)];
mutex_enter(&udpf->uf_lock);
if (udp->udp_state == TS_DATA_XFER) {
/* Already connected - clear out state */
- udp->udp_v6src = udp->udp_bound_v6src;
+ if (connp->conn_mcbc_bind)
+ connp->conn_saddr_v6 = ipv6_all_zeros;
+ else
+ connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
+ connp->conn_laddr_v6 = connp->conn_bound_addr_v6;
+ connp->conn_faddr_v6 = ipv6_all_zeros;
+ connp->conn_fport = 0;
udp->udp_state = TS_IDLE;
}
- /*
- * Create a default IP header with no IP options.
- */
- udp->udp_dstport = dstport;
- udp->udp_ipversion = ipversion;
+ connp->conn_fport = dstport;
+ connp->conn_ipversion = ipversion;
if (ipversion == IPV4_VERSION) {
/*
* Interpret a zero destination to mean loopback.
@@ -8520,29 +6171,16 @@ udp_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len,
if (v4dst == INADDR_ANY) {
v4dst = htonl(INADDR_LOOPBACK);
IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst);
- if (udp->udp_family == AF_INET) {
+ if (connp->conn_family == AF_INET) {
sin->sin_addr.s_addr = v4dst;
} else {
sin6->sin6_addr = v6dst;
}
}
- udp->udp_v6dst = v6dst;
- udp->udp_flowinfo = 0;
-
- /*
- * If the destination address is multicast and
- * an outgoing multicast interface has been set,
- * use the address of that interface as our
- * source address if no source address has been set.
- */
- if (V4_PART_OF_V6(udp->udp_v6src) == INADDR_ANY &&
- CLASSD(v4dst) &&
- udp->udp_multicast_if_addr != INADDR_ANY) {
- IN6_IPADDR_TO_V4MAPPED(udp->udp_multicast_if_addr,
- &udp->udp_v6src);
- }
+ connp->conn_faddr_v6 = v6dst;
+ connp->conn_flowinfo = 0;
} else {
- ASSERT(udp->udp_ipversion == IPV6_VERSION);
+ ASSERT(connp->conn_ipversion == IPV6_VERSION);
/*
* Interpret a zero destination to mean loopback.
* Update the T_CONN_REQ (sin/sin6) since it is used to
@@ -8552,82 +6190,133 @@ udp_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len,
v6dst = ipv6_loopback;
sin6->sin6_addr = v6dst;
}
- udp->udp_v6dst = v6dst;
- udp->udp_flowinfo = flowinfo;
- /*
- * If the destination address is multicast and
- * an outgoing multicast interface has been set,
- * then the ip bind logic will pick the correct source
- * address (i.e. matching the outgoing multicast interface).
- */
+ connp->conn_faddr_v6 = v6dst;
+ connp->conn_flowinfo = flowinfo;
+ }
+ mutex_exit(&udpf->uf_lock);
+
+ ixa->ixa_cred = cr;
+ ixa->ixa_cpid = pid;
+ if (is_system_labeled()) {
+ /* We need to restart with a label based on the cred */
+ ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
+ }
+
+ if (scopeid != 0) {
+ ixa->ixa_flags |= IXAF_SCOPEID_SET;
+ ixa->ixa_scopeid = scopeid;
+ connp->conn_incoming_ifindex = scopeid;
+ } else {
+ ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
+ connp->conn_incoming_ifindex = connp->conn_bound_if;
+ }
+ /*
+ * conn_connect will drop conn_lock and reacquire it.
+ * To prevent a send* from messing with this udp_t while the lock
+ * is dropped we set udp_state and clear conn_v6lastdst.
+ * That will make all send* fail with EISCONN.
+ */
+ connp->conn_v6lastdst = ipv6_all_zeros;
+ udp->udp_state = TS_WCON_CREQ;
+
+ error = conn_connect(connp, NULL, IPDF_ALLOW_MCBC);
+ mutex_exit(&connp->conn_lock);
+ if (error != 0)
+ goto connect_failed;
+
+ /*
+ * The addresses have been verified. Time to insert in
+ * the correct fanout list.
+ */
+ error = ipcl_conn_insert(connp);
+ if (error != 0)
+ goto connect_failed;
+
+ mutex_enter(&connp->conn_lock);
+ error = udp_build_hdr_template(connp, &connp->conn_saddr_v6,
+ &connp->conn_faddr_v6, connp->conn_fport, connp->conn_flowinfo);
+ if (error != 0) {
+ mutex_exit(&connp->conn_lock);
+ goto connect_failed;
}
+ udp->udp_state = TS_DATA_XFER;
+ /* Record this as the "last" send even though we haven't sent any */
+ connp->conn_v6lastdst = connp->conn_faddr_v6;
+ connp->conn_lastipversion = connp->conn_ipversion;
+ connp->conn_lastdstport = connp->conn_fport;
+ connp->conn_lastflowinfo = connp->conn_flowinfo;
+ connp->conn_lastscopeid = scopeid;
+ connp->conn_lastsrcid = srcid;
+ /* Also remember a source to use together with lastdst */
+ connp->conn_v6lastsrc = v6src;
+ mutex_exit(&connp->conn_lock);
+
/*
- * Verify that the src/port/dst/port is unique for all
- * connections in TS_DATA_XFER
+ * We've picked a source address above. Now we can
+ * verify that the src/port/dst/port is unique for all
+ * connections in TS_DATA_XFER, skipping ourselves.
*/
+ mutex_enter(&udpf->uf_lock);
for (udp1 = udpf->uf_udp; udp1 != NULL; udp1 = udp1->udp_bind_hash) {
if (udp1->udp_state != TS_DATA_XFER)
continue;
- if (udp->udp_port != udp1->udp_port ||
- udp->udp_ipversion != udp1->udp_ipversion ||
- dstport != udp1->udp_dstport ||
- !IN6_ARE_ADDR_EQUAL(&udp->udp_v6src, &udp1->udp_v6src) ||
- !IN6_ARE_ADDR_EQUAL(&v6dst, &udp1->udp_v6dst) ||
- !(IPCL_ZONE_MATCH(udp->udp_connp,
- udp1->udp_connp->conn_zoneid) ||
- IPCL_ZONE_MATCH(udp1->udp_connp,
- udp->udp_connp->conn_zoneid)))
+
+ if (udp1 == udp)
+ continue;
+
+ connp1 = udp1->udp_connp;
+ if (connp->conn_lport != connp1->conn_lport ||
+ connp->conn_ipversion != connp1->conn_ipversion ||
+ dstport != connp1->conn_fport ||
+ !IN6_ARE_ADDR_EQUAL(&connp->conn_laddr_v6,
+ &connp1->conn_laddr_v6) ||
+ !IN6_ARE_ADDR_EQUAL(&v6dst, &connp1->conn_faddr_v6) ||
+ !(IPCL_ZONE_MATCH(connp, connp1->conn_zoneid) ||
+ IPCL_ZONE_MATCH(connp1, connp->conn_zoneid)))
continue;
mutex_exit(&udpf->uf_lock);
- udp->udp_pending_op = -1;
- rw_exit(&udp->udp_rwlock);
- return (-TBADADDR);
+ error = -TBADADDR;
+ goto connect_failed;
}
-
if (cl_inet_connect2 != NULL) {
- CL_INET_UDP_CONNECT(connp, udp, B_TRUE, &v6dst, dstport, error);
+ CL_INET_UDP_CONNECT(connp, B_TRUE, &v6dst, dstport, error);
if (error != 0) {
mutex_exit(&udpf->uf_lock);
- udp->udp_pending_op = -1;
- rw_exit(&udp->udp_rwlock);
- return (-TBADADDR);
+ error = -TBADADDR;
+ goto connect_failed;
}
}
-
- udp->udp_state = TS_DATA_XFER;
mutex_exit(&udpf->uf_lock);
- ire_mp = allocb(sizeof (ire_t), BPRI_HI);
- if (ire_mp == NULL) {
- mutex_enter(&udpf->uf_lock);
- udp->udp_state = TS_IDLE;
- udp->udp_pending_op = -1;
- mutex_exit(&udpf->uf_lock);
- rw_exit(&udp->udp_rwlock);
- return (ENOMEM);
- }
-
- rw_exit(&udp->udp_rwlock);
+ ixa_refrele(ixa);
+ return (0);
- ire_mp->b_wptr += sizeof (ire_t);
- ire_mp->b_datap->db_type = IRE_DB_REQ_TYPE;
+connect_failed:
+ if (ixa != NULL)
+ ixa_refrele(ixa);
+ mutex_enter(&connp->conn_lock);
+ mutex_enter(&udpf->uf_lock);
+ udp->udp_state = TS_IDLE;
+ connp->conn_faddr_v6 = ipv6_all_zeros;
+ connp->conn_fport = 0;
+ /* In case the source address was set above */
+ if (connp->conn_mcbc_bind)
+ connp->conn_saddr_v6 = ipv6_all_zeros;
+ else
+ connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
+ connp->conn_laddr_v6 = connp->conn_bound_addr_v6;
+ mutex_exit(&udpf->uf_lock);
- if (udp->udp_family == AF_INET) {
- error = ip_proto_bind_connected_v4(connp, &ire_mp, IPPROTO_UDP,
- &V4_PART_OF_V6(udp->udp_v6src), udp->udp_port,
- V4_PART_OF_V6(udp->udp_v6dst), udp->udp_dstport,
- B_TRUE, B_TRUE, cr);
- } else {
- error = ip_proto_bind_connected_v6(connp, &ire_mp, IPPROTO_UDP,
- &udp->udp_v6src, udp->udp_port, &udp->udp_v6dst,
- &udp->udp_sticky_ipp, udp->udp_dstport, B_TRUE, B_TRUE, cr);
- }
+ connp->conn_v6lastdst = ipv6_all_zeros;
+ connp->conn_flowinfo = 0;
- return (udp_post_ip_bind_connect(udp, ire_mp, error));
+ (void) udp_build_hdr_template(connp, &connp->conn_saddr_v6,
+ &connp->conn_faddr_v6, connp->conn_fport, connp->conn_flowinfo);
+ mutex_exit(&connp->conn_lock);
+ return (error);
}
-/* ARGSUSED */
static int
udp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
socklen_t len, sock_connid_t *id, cred_t *cr)
@@ -8636,6 +6325,7 @@ udp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
udp_t *udp = connp->conn_udp;
int error;
boolean_t did_bind = B_FALSE;
+ pid_t pid = curproc->p_pid;
/* All Solaris components should pass a cred for this operation. */
ASSERT(cr != NULL);
@@ -8652,7 +6342,7 @@ udp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
return (error);
}
- error = proto_verify_ip_addr(udp->udp_family, sa, len);
+ error = proto_verify_ip_addr(connp->conn_family, sa, len);
if (error != 0)
goto done;
@@ -8671,9 +6361,9 @@ udp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
/*
* set SO_DGRAM_ERRIND
*/
- udp->udp_dgram_errind = B_TRUE;
+ connp->conn_dgram_errind = B_TRUE;
- error = udp_do_connect(connp, sa, len, cr);
+ error = udp_do_connect(connp, sa, len, cr, pid);
if (error != 0 && did_bind) {
int unbind_err;
@@ -8702,44 +6392,33 @@ done:
return (error);
}
-/* ARGSUSED */
int
udp_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
cred_t *cr)
{
+ sin6_t *sin6;
+ sin_t *sin = NULL;
+ uint_t srcid;
conn_t *connp = (conn_t *)proto_handle;
udp_t *udp = connp->conn_udp;
- udp_stack_t *us = udp->udp_us;
int error = 0;
+ udp_stack_t *us = udp->udp_us;
+ ushort_t ipversion;
+ pid_t pid = curproc->p_pid;
+ ip_xmit_attr_t *ixa;
ASSERT(DB_TYPE(mp) == M_DATA);
/* All Solaris components should pass a cred for this operation. */
ASSERT(cr != NULL);
- /* If labeled then sockfs should have already set db_credp */
- ASSERT(!is_system_labeled() || msg_getcred(mp, NULL) != NULL);
-
- /*
- * If the socket is connected and no change in destination
- */
- if (msg->msg_namelen == 0) {
- error = udp_send_connected(connp, mp, msg, cr, curproc->p_pid);
- if (error == EDESTADDRREQ)
- return (error);
- else
- return (udp->udp_dgram_errind ? error : 0);
- }
-
- /*
- * Do an implicit bind if necessary.
- */
+ /* do an implicit bind if necessary */
if (udp->udp_state == TS_UNBND) {
error = udp_implicit_bind(connp, cr);
/*
* We could be racing with an actual bind, in which case
* we would see EPROTO. We cross our fingers and try
- * to send.
+ * to connect.
*/
if (!(error == 0 || error == EPROTO)) {
freemsg(mp);
@@ -8747,75 +6426,203 @@ udp_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
}
}
- rw_enter(&udp->udp_rwlock, RW_WRITER);
-
- if (msg->msg_name != NULL && udp->udp_state == TS_DATA_XFER) {
- rw_exit(&udp->udp_rwlock);
- freemsg(mp);
+ /* Connected? */
+ if (msg->msg_name == NULL) {
+ if (udp->udp_state != TS_DATA_XFER) {
+ BUMP_MIB(&us->us_udp_mib, udpOutErrors);
+ return (EDESTADDRREQ);
+ }
+ if (msg->msg_controllen != 0) {
+ error = udp_output_ancillary(connp, NULL, NULL, mp,
+ NULL, msg, cr, pid);
+ } else {
+ error = udp_output_connected(connp, mp, cr, pid);
+ }
+ if (us->us_sendto_ignerr)
+ return (0);
+ else
+ return (error);
+ }
+ if (udp->udp_state == TS_DATA_XFER) {
+ BUMP_MIB(&us->us_udp_mib, udpOutErrors);
return (EISCONN);
}
+ error = proto_verify_ip_addr(connp->conn_family,
+ (struct sockaddr *)msg->msg_name, msg->msg_namelen);
+ if (error != 0) {
+ BUMP_MIB(&us->us_udp_mib, udpOutErrors);
+ return (error);
+ }
+ switch (connp->conn_family) {
+ case AF_INET6:
+ sin6 = (sin6_t *)msg->msg_name;
+ srcid = sin6->__sin6_src_id;
- if (udp->udp_delayed_error != 0) {
- boolean_t match;
+ if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
+ /*
+ * Destination is a non-IPv4-compatible IPv6 address.
+ * Send out an IPv6 format packet.
+ */
- error = udp->udp_delayed_error;
- match = B_FALSE;
- udp->udp_delayed_error = 0;
- switch (udp->udp_family) {
- case AF_INET: {
- /* Compare just IP address and port */
- sin_t *sin1 = (sin_t *)msg->msg_name;
- sin_t *sin2 = (sin_t *)&udp->udp_delayed_addr;
+ /*
+ * If the local address is a mapped address return
+ * an error.
+ * It would be possible to send an IPv6 packet but the
+ * response would never make it back to the application
+ * since it is bound to a mapped address.
+ */
+ if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) {
+ BUMP_MIB(&us->us_udp_mib, udpOutErrors);
+ return (EADDRNOTAVAIL);
+ }
+ if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
+ sin6->sin6_addr = ipv6_loopback;
+ ipversion = IPV6_VERSION;
+ } else {
+ if (connp->conn_ipv6_v6only) {
+ BUMP_MIB(&us->us_udp_mib, udpOutErrors);
+ return (EADDRNOTAVAIL);
+ }
- if (msg->msg_namelen == sizeof (sin_t) &&
- sin1->sin_port == sin2->sin_port &&
- sin1->sin_addr.s_addr == sin2->sin_addr.s_addr)
- match = B_TRUE;
+ /*
+ * If the local address is not zero or a mapped address
+ * return an error. It would be possible to send an
+ * IPv4 packet but the response would never make it
+ * back to the application since it is bound to a
+ * non-mapped address.
+ */
+ if (!IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6) &&
+ !IN6_IS_ADDR_UNSPECIFIED(&connp->conn_saddr_v6)) {
+ BUMP_MIB(&us->us_udp_mib, udpOutErrors);
+ return (EADDRNOTAVAIL);
+ }
- break;
+ if (V4_PART_OF_V6(sin6->sin6_addr) == INADDR_ANY) {
+ V4_PART_OF_V6(sin6->sin6_addr) =
+ htonl(INADDR_LOOPBACK);
+ }
+ ipversion = IPV4_VERSION;
}
- case AF_INET6: {
- sin6_t *sin1 = (sin6_t *)msg->msg_name;
- sin6_t *sin2 = (sin6_t *)&udp->udp_delayed_addr;
- if (msg->msg_namelen == sizeof (sin6_t) &&
- sin1->sin6_port == sin2->sin6_port &&
- IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr,
- &sin2->sin6_addr))
- match = B_TRUE;
- break;
- }
- default:
- ASSERT(0);
+ /*
+ * We have to allocate an ip_xmit_attr_t before we grab
+ * conn_lock and we need to hold conn_lock once we've check
+ * conn_same_as_last_v6 to handle concurrent send* calls on a
+ * socket.
+ */
+ if (msg->msg_controllen == 0) {
+ ixa = conn_get_ixa(connp, B_FALSE);
+ if (ixa == NULL) {
+ BUMP_MIB(&us->us_udp_mib, udpOutErrors);
+ return (ENOMEM);
+ }
+ } else {
+ ixa = NULL;
}
+ mutex_enter(&connp->conn_lock);
+ if (udp->udp_delayed_error != 0) {
+ sin6_t *sin2 = (sin6_t *)&udp->udp_delayed_addr;
- *((sin6_t *)&udp->udp_delayed_addr) = sin6_null;
+ error = udp->udp_delayed_error;
+ udp->udp_delayed_error = 0;
- if (match) {
- rw_exit(&udp->udp_rwlock);
- freemsg(mp);
+ /* Compare IP address, port, and family */
+
+ if (sin6->sin6_port == sin2->sin6_port &&
+ IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr,
+ &sin2->sin6_addr) &&
+ sin6->sin6_family == sin2->sin6_family) {
+ mutex_exit(&connp->conn_lock);
+ BUMP_MIB(&us->us_udp_mib, udpOutErrors);
+ if (ixa != NULL)
+ ixa_refrele(ixa);
+ return (error);
+ }
+ }
+
+ if (msg->msg_controllen != 0) {
+ mutex_exit(&connp->conn_lock);
+ ASSERT(ixa == NULL);
+ error = udp_output_ancillary(connp, NULL, sin6, mp,
+ NULL, msg, cr, pid);
+ } else if (conn_same_as_last_v6(connp, sin6) &&
+ connp->conn_lastsrcid == srcid &&
+ ipsec_outbound_policy_current(ixa)) {
+ /* udp_output_lastdst drops conn_lock */
+ error = udp_output_lastdst(connp, mp, cr, pid, ixa);
+ } else {
+ /* udp_output_newdst drops conn_lock */
+ error = udp_output_newdst(connp, mp, NULL, sin6,
+ ipversion, cr, pid, ixa);
+ }
+ ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
+ if (us->us_sendto_ignerr)
+ return (0);
+ else
return (error);
+ case AF_INET:
+ sin = (sin_t *)msg->msg_name;
+
+ ipversion = IPV4_VERSION;
+
+ if (sin->sin_addr.s_addr == INADDR_ANY)
+ sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+
+ /*
+ * We have to allocate an ip_xmit_attr_t before we grab
+ * conn_lock and we need to hold conn_lock once we've check
+ * conn_same_as_last_v6 to handle concurrent send* on a socket.
+ */
+ if (msg->msg_controllen == 0) {
+ ixa = conn_get_ixa(connp, B_FALSE);
+ if (ixa == NULL) {
+ BUMP_MIB(&us->us_udp_mib, udpOutErrors);
+ return (ENOMEM);
+ }
+ } else {
+ ixa = NULL;
}
- }
+ mutex_enter(&connp->conn_lock);
+ if (udp->udp_delayed_error != 0) {
+ sin_t *sin2 = (sin_t *)&udp->udp_delayed_addr;
- error = proto_verify_ip_addr(udp->udp_family,
- (struct sockaddr *)msg->msg_name, msg->msg_namelen);
- rw_exit(&udp->udp_rwlock);
+ error = udp->udp_delayed_error;
+ udp->udp_delayed_error = 0;
- if (error != 0) {
- freemsg(mp);
- return (error);
- }
+ /* Compare IP address and port */
- error = udp_send_not_connected(connp, mp,
- (struct sockaddr *)msg->msg_name, msg->msg_namelen, msg, cr,
- curproc->p_pid);
- if (error != 0) {
- UDP_STAT(us, udp_out_err_output);
- freemsg(mp);
+ if (sin->sin_port == sin2->sin_port &&
+ sin->sin_addr.s_addr == sin2->sin_addr.s_addr) {
+ mutex_exit(&connp->conn_lock);
+ BUMP_MIB(&us->us_udp_mib, udpOutErrors);
+ if (ixa != NULL)
+ ixa_refrele(ixa);
+ return (error);
+ }
+ }
+ if (msg->msg_controllen != 0) {
+ mutex_exit(&connp->conn_lock);
+ ASSERT(ixa == NULL);
+ error = udp_output_ancillary(connp, sin, NULL, mp,
+ NULL, msg, cr, pid);
+ } else if (conn_same_as_last_v4(connp, sin) &&
+ ipsec_outbound_policy_current(ixa)) {
+ /* udp_output_lastdst drops conn_lock */
+ error = udp_output_lastdst(connp, mp, cr, pid, ixa);
+ } else {
+ /* udp_output_newdst drops conn_lock */
+ error = udp_output_newdst(connp, mp, sin, NULL,
+ ipversion, cr, pid, ixa);
+ }
+ ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
+ if (us->us_sendto_ignerr)
+ return (0);
+ else
+ return (error);
+ default:
+ return (EINVAL);
}
- return (udp->udp_dgram_errind ? error : 0);
}
int
@@ -8854,8 +6661,7 @@ udp_fallback(sock_lower_handle_t proto_handle, queue_t *q,
stropt_mp->b_wptr += sizeof (*stropt);
stropt = (struct stroptions *)stropt_mp->b_rptr;
stropt->so_flags = SO_WROFF | SO_HIWAT;
- stropt->so_wroff =
- (ushort_t)(udp->udp_max_hdr_len + udp->udp_us->us_wroff_extra);
+ stropt->so_wroff = connp->conn_wroff;
stropt->so_hiwat = udp->udp_rcv_disply_hiwat;
putnext(RD(q), stropt_mp);
@@ -8881,9 +6687,9 @@ udp_fallback(sock_lower_handle_t proto_handle, queue_t *q,
faddrlen = 0;
opts = 0;
- if (udp->udp_dgram_errind)
+ if (connp->conn_dgram_errind)
opts |= SO_DGRAM_ERRIND;
- if (udp->udp_dontroute)
+ if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE)
opts |= SO_DONTROUTE;
(*quiesced_cb)(connp->conn_upper_handle, q, &tca,
@@ -8908,9 +6714,9 @@ udp_fallback(sock_lower_handle_t proto_handle, queue_t *q,
/*
* No longer a streams less socket
*/
- rw_enter(&udp->udp_rwlock, RW_WRITER);
+ mutex_enter(&connp->conn_lock);
connp->conn_flags &= ~IPCL_NONSTR;
- rw_exit(&udp->udp_rwlock);
+ mutex_exit(&connp->conn_lock);
mutex_exit(&udp->udp_recv_lock);
@@ -8919,48 +6725,7 @@ udp_fallback(sock_lower_handle_t proto_handle, queue_t *q,
return (0);
}
-static int
-udp_do_getpeername(udp_t *udp, struct sockaddr *sa, uint_t *salenp)
-{
- sin_t *sin = (sin_t *)sa;
- sin6_t *sin6 = (sin6_t *)sa;
-
- ASSERT(RW_LOCK_HELD(&udp->udp_rwlock));
- ASSERT(udp != NULL);
-
- if (udp->udp_state != TS_DATA_XFER)
- return (ENOTCONN);
-
- switch (udp->udp_family) {
- case AF_INET:
- ASSERT(udp->udp_ipversion == IPV4_VERSION);
-
- if (*salenp < sizeof (sin_t))
- return (EINVAL);
-
- *salenp = sizeof (sin_t);
- *sin = sin_null;
- sin->sin_family = AF_INET;
- sin->sin_port = udp->udp_dstport;
- sin->sin_addr.s_addr = V4_PART_OF_V6(udp->udp_v6dst);
- break;
- case AF_INET6:
- if (*salenp < sizeof (sin6_t))
- return (EINVAL);
-
- *salenp = sizeof (sin6_t);
- *sin6 = sin6_null;
- sin6->sin6_family = AF_INET6;
- sin6->sin6_port = udp->udp_dstport;
- sin6->sin6_addr = udp->udp_v6dst;
- sin6->sin6_flowinfo = udp->udp_flowinfo;
- break;
- }
-
- return (0);
-}
-
-/* ARGSUSED */
+/* ARGSUSED3 */
int
udp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *sa,
socklen_t *salenp, cred_t *cr)
@@ -8972,104 +6737,29 @@ udp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *sa,
/* All Solaris components should pass a cred for this operation. */
ASSERT(cr != NULL);
- ASSERT(udp != NULL);
-
- rw_enter(&udp->udp_rwlock, RW_READER);
-
- error = udp_do_getpeername(udp, sa, salenp);
-
- rw_exit(&udp->udp_rwlock);
-
+ mutex_enter(&connp->conn_lock);
+ if (udp->udp_state != TS_DATA_XFER)
+ error = ENOTCONN;
+ else
+ error = conn_getpeername(connp, sa, salenp);
+ mutex_exit(&connp->conn_lock);
return (error);
}
-static int
-udp_do_getsockname(udp_t *udp, struct sockaddr *sa, uint_t *salenp)
-{
- sin_t *sin = (sin_t *)sa;
- sin6_t *sin6 = (sin6_t *)sa;
-
- ASSERT(udp != NULL);
- ASSERT(RW_LOCK_HELD(&udp->udp_rwlock));
-
- switch (udp->udp_family) {
- case AF_INET:
- ASSERT(udp->udp_ipversion == IPV4_VERSION);
-
- if (*salenp < sizeof (sin_t))
- return (EINVAL);
-
- *salenp = sizeof (sin_t);
- *sin = sin_null;
- sin->sin_family = AF_INET;
- if (udp->udp_state == TS_UNBND) {
- break;
- }
- sin->sin_port = udp->udp_port;
-
- if (!IN6_IS_ADDR_V4MAPPED_ANY(&udp->udp_v6src) &&
- !IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src)) {
- sin->sin_addr.s_addr = V4_PART_OF_V6(udp->udp_v6src);
- } else {
- /*
- * INADDR_ANY
- * udp_v6src is not set, we might be bound to
- * broadcast/multicast. Use udp_bound_v6src as
- * local address instead (that could
- * also still be INADDR_ANY)
- */
- sin->sin_addr.s_addr =
- V4_PART_OF_V6(udp->udp_bound_v6src);
- }
- break;
-
- case AF_INET6:
- if (*salenp < sizeof (sin6_t))
- return (EINVAL);
-
- *salenp = sizeof (sin6_t);
- *sin6 = sin6_null;
- sin6->sin6_family = AF_INET6;
- if (udp->udp_state == TS_UNBND) {
- break;
- }
- sin6->sin6_port = udp->udp_port;
-
- if (!IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src)) {
- sin6->sin6_addr = udp->udp_v6src;
- } else {
- /*
- * UNSPECIFIED
- * udp_v6src is not set, we might be bound to
- * broadcast/multicast. Use udp_bound_v6src as
- * local address instead (that could
- * also still be UNSPECIFIED)
- */
- sin6->sin6_addr = udp->udp_bound_v6src;
- }
- }
- return (0);
-}
-
-/* ARGSUSED */
+/* ARGSUSED3 */
int
udp_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *sa,
socklen_t *salenp, cred_t *cr)
{
conn_t *connp = (conn_t *)proto_handle;
- udp_t *udp = connp->conn_udp;
int error;
/* All Solaris components should pass a cred for this operation. */
ASSERT(cr != NULL);
- ASSERT(udp != NULL);
- rw_enter(&udp->udp_rwlock, RW_READER);
-
- error = udp_do_getsockname(udp, sa, salenp);
-
- rw_exit(&udp->udp_rwlock);
-
+ mutex_enter(&connp->conn_lock);
+ error = conn_getsockname(connp, sa, salenp);
+ mutex_exit(&connp->conn_lock);
return (error);
}
@@ -9078,7 +6768,6 @@ udp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
void *optvalp, socklen_t *optlen, cred_t *cr)
{
conn_t *connp = (conn_t *)proto_handle;
- udp_t *udp = connp->conn_udp;
int error;
t_uscalar_t max_optbuf_len;
void *optvalp_buf;
@@ -9090,7 +6779,6 @@ udp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len,
udp_opt_obj.odb_opt_des_arr,
udp_opt_obj.odb_opt_arr_cnt,
- udp_opt_obj.odb_topmost_tpiprovider,
B_FALSE, B_TRUE, cr);
if (error != 0) {
if (error < 0)
@@ -9099,28 +6787,22 @@ udp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
}
optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP);
- rw_enter(&udp->udp_rwlock, RW_READER);
len = udp_opt_get(connp, level, option_name, optvalp_buf);
- rw_exit(&udp->udp_rwlock);
-
- if (len < 0) {
- /*
- * Pass on to IP
- */
+ if (len == -1) {
kmem_free(optvalp_buf, max_optbuf_len);
- return (ip_get_options(connp, level, option_name,
- optvalp, optlen, cr));
- } else {
- /*
- * update optlen and copy option value
- */
- t_uscalar_t size = MIN(len, *optlen);
- bcopy(optvalp_buf, optvalp, size);
- bcopy(&size, optlen, sizeof (size));
-
- kmem_free(optvalp_buf, max_optbuf_len);
- return (0);
+ return (EINVAL);
}
+
+ /*
+ * update optlen and copy option value
+ */
+ t_uscalar_t size = MIN(len, *optlen);
+
+ bcopy(optvalp_buf, optvalp, size);
+ bcopy(&size, optlen, sizeof (size));
+
+ kmem_free(optvalp_buf, max_optbuf_len);
+ return (0);
}
int
@@ -9128,7 +6810,6 @@ udp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
const void *optvalp, socklen_t optlen, cred_t *cr)
{
conn_t *connp = (conn_t *)proto_handle;
- udp_t *udp = connp->conn_udp;
int error;
/* All Solaris components should pass a cred for this operation. */
@@ -9137,7 +6818,6 @@ udp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
error = proto_opt_check(level, option_name, optlen, NULL,
udp_opt_obj.odb_opt_des_arr,
udp_opt_obj.odb_opt_arr_cnt,
- udp_opt_obj.odb_topmost_tpiprovider,
B_TRUE, B_FALSE, cr);
if (error != 0) {
@@ -9146,19 +6826,11 @@ udp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
return (error);
}
- rw_enter(&udp->udp_rwlock, RW_WRITER);
error = udp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, option_name,
optlen, (uchar_t *)optvalp, (uint_t *)&optlen, (uchar_t *)optvalp,
NULL, cr);
- rw_exit(&udp->udp_rwlock);
- if (error < 0) {
- /*
- * Pass on to ip
- */
- error = ip_set_options(connp, level, option_name, optvalp,
- optlen, cr);
- }
+ ASSERT(error >= 0);
return (error);
}
@@ -9174,7 +6846,7 @@ udp_clr_flowctrl(sock_lower_handle_t proto_handle)
mutex_exit(&udp->udp_recv_lock);
}
-/* ARGSUSED */
+/* ARGSUSED2 */
int
udp_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr)
{
@@ -9204,6 +6876,27 @@ udp_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
/* All Solaris components should pass a cred for this operation. */
ASSERT(cr != NULL);
+ /*
+ * If we don't have a helper stream then create one.
+ * ip_create_helper_stream takes care of locking the conn_t,
+ * so this check for NULL is just a performance optimization.
+ */
+ if (connp->conn_helper_info == NULL) {
+ udp_stack_t *us = connp->conn_udp->udp_us;
+
+ ASSERT(us->us_ldi_ident != NULL);
+
+ /*
+ * Create a helper stream for non-STREAMS socket.
+ */
+ error = ip_create_helper_stream(connp, us->us_ldi_ident);
+ if (error != 0) {
+ ip0dbg(("tcp_ioctl: create of IP helper stream "
+ "failed %d\n", error));
+ return (error);
+ }
+ }
+
switch (cmd) {
case ND_SET:
case ND_GET:
diff --git a/usr/src/uts/common/inet/udp/udp_opt_data.c b/usr/src/uts/common/inet/udp/udp_opt_data.c
index 425d258697..02d9d3f8f8 100644
--- a/usr/src/uts/common/inet/udp/udp_opt_data.c
+++ b/usr/src/uts/common/inet/udp/udp_opt_data.c
@@ -56,227 +56,229 @@
*/
opdes_t udp_opt_arr[] = {
-{ SO_DEBUG, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
-{ SO_DONTROUTE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
-{ SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0
+{ SO_DEBUG, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SO_DONTROUTE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
},
-{ SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
-{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
-{ SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
-{ SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
-{ SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
-{ SO_SNDTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
+{ SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SO_SNDTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
sizeof (struct timeval), 0 },
-{ SO_RCVTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ SO_RCVTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
sizeof (struct timeval), 0 },
-{ SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int),
+{ SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
0 },
-{ SO_RECVUCRED, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0
+{ SO_RECVUCRED, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
},
-{ SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, OP_PASSNEXT, sizeof (int),
+{ SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int),
0 },
-{ SO_TIMESTAMP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0
+{ SO_TIMESTAMP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
},
-{ SO_ANON_MLP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int),
+{ SO_ANON_MLP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
0 },
-{ SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int),
+{ SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
0 },
-{ SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int),
+{ SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
0 },
{ SCM_UCRED, SOL_SOCKET, OA_W, OA_W, OP_NP, OP_VARLEN|OP_NODEFAULT, 512, 0 },
-{ SO_EXCLBIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
-{ SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
-{ SO_PROTOTYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
+{ SO_EXCLBIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
+{ SO_PROTOTYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
{ IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP,
- (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT),
+ (OP_VARLEN|OP_NODEFAULT),
IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
{ T_IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP,
- (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT),
+ (OP_VARLEN|OP_NODEFAULT),
IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
-{ IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
-{ T_IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
-{ IP_TTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
-{ IP_RECVOPTS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
-{ IP_RECVDSTADDR, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0
+{ IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ T_IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ IP_TTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ IP_RECVOPTS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ IP_RECVDSTADDR, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
},
-{ IP_RECVIF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
-{ IP_RECVSLLA, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
-{ IP_RECVTTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int),
+{ IP_RECVIF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ IP_RECVSLLA, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ IP_RECVTTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
0 },
-{ IP_MULTICAST_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ IP_MULTICAST_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0,
sizeof (struct in_addr), 0 /* INADDR_ANY */ },
-{ IP_MULTICAST_LOOP, IPPROTO_IP, OA_RW, OA_RW, OP_NP, (OP_PASSNEXT|OP_DEF_FN),
+{ IP_MULTICAST_LOOP, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
sizeof (uchar_t), -1 /* not initialized */},
-{ IP_MULTICAST_TTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, (OP_PASSNEXT|OP_DEF_FN),
+{ IP_MULTICAST_TTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
sizeof (uchar_t), -1 /* not initialized */ },
-{ IP_ADD_MEMBERSHIP, IPPROTO_IP, OA_X, OA_X, OP_NP, (OP_PASSNEXT|OP_NODEFAULT),
+{ IP_ADD_MEMBERSHIP, IPPROTO_IP, OA_X, OA_X, OP_NP, OP_NODEFAULT,
sizeof (struct ip_mreq), -1 /* not initialized */ },
-{ IP_DROP_MEMBERSHIP, IPPROTO_IP, OA_X, OA_X, OP_NP, (OP_PASSNEXT|OP_NODEFAULT),
+{ IP_DROP_MEMBERSHIP, IPPROTO_IP, OA_X, OA_X, OP_NP, OP_NODEFAULT,
sizeof (struct ip_mreq), -1 /* not initialized */ },
-{ IP_BLOCK_SOURCE, IPPROTO_IP, OA_X, OA_X, OP_NP, (OP_PASSNEXT|OP_NODEFAULT),
+{ IP_BLOCK_SOURCE, IPPROTO_IP, OA_X, OA_X, OP_NP, OP_NODEFAULT,
sizeof (struct ip_mreq_source), -1 /* not initialized */ },
-{ IP_UNBLOCK_SOURCE, IPPROTO_IP, OA_X, OA_X, OP_NP, (OP_PASSNEXT|OP_NODEFAULT),
+{ IP_UNBLOCK_SOURCE, IPPROTO_IP, OA_X, OA_X, OP_NP, OP_NODEFAULT,
sizeof (struct ip_mreq_source), -1 /* not initialized */ },
{ IP_ADD_SOURCE_MEMBERSHIP, IPPROTO_IP, OA_X, OA_X, OP_NP,
- (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct ip_mreq_source), -1 },
+ OP_NODEFAULT, sizeof (struct ip_mreq_source), -1 },
{ IP_DROP_SOURCE_MEMBERSHIP, IPPROTO_IP, OA_X, OA_X, OP_NP,
- (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct ip_mreq_source), -1 },
+ OP_NODEFAULT, sizeof (struct ip_mreq_source), -1 },
-{ IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, (OP_PASSNEXT|OP_NODEFAULT),
+{ IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
sizeof (ipsec_req_t), -1 /* not initialized */ },
-{ IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0,
sizeof (int), 0 /* no ifindex */ },
-{ IP_DHCPINIT_IF, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, OP_PASSNEXT,
+{ IP_DHCPINIT_IF, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0,
sizeof (int), 0 },
-{ IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, OP_PASSNEXT,
+{ IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0,
sizeof (int), 0 },
{ IP_BROADCAST_TTL, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0, sizeof (uchar_t),
0 /* disabled */ },
{ IP_PKTINFO, IPPROTO_IP, OA_RW, OA_RW, OP_NP,
- (OP_PASSNEXT|OP_NODEFAULT|OP_VARLEN),
+ (OP_NODEFAULT|OP_VARLEN),
sizeof (struct in_pktinfo), -1 /* not initialized */ },
-{ IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, OP_PASSNEXT,
+{ IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0,
sizeof (in_addr_t), -1 /* not initialized */ },
+{ IP_DONTFRAG, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+
{ MCAST_JOIN_GROUP, IPPROTO_IP, OA_X, OA_X, OP_NP,
- (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_req),
+ OP_NODEFAULT, sizeof (struct group_req),
-1 /* not initialized */ },
{ MCAST_LEAVE_GROUP, IPPROTO_IP, OA_X, OA_X, OP_NP,
- (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_req),
+ OP_NODEFAULT, sizeof (struct group_req),
-1 /* not initialized */ },
{ MCAST_BLOCK_SOURCE, IPPROTO_IP, OA_X, OA_X, OP_NP,
- (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_source_req),
+ OP_NODEFAULT, sizeof (struct group_source_req),
-1 /* not initialized */ },
{ MCAST_UNBLOCK_SOURCE, IPPROTO_IP, OA_X, OA_X, OP_NP,
- (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_source_req),
+ OP_NODEFAULT, sizeof (struct group_source_req),
-1 /* not initialized */ },
{ MCAST_JOIN_SOURCE_GROUP, IPPROTO_IP, OA_X, OA_X, OP_NP,
- (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_source_req),
+ OP_NODEFAULT, sizeof (struct group_source_req),
-1 /* not initialized */ },
{ MCAST_LEAVE_SOURCE_GROUP, IPPROTO_IP, OA_X, OA_X, OP_NP,
- (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_source_req),
+ OP_NODEFAULT, sizeof (struct group_source_req),
-1 /* not initialized */ },
-{ IPV6_MULTICAST_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ IPV6_MULTICAST_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
sizeof (int), 0 },
{ IPV6_MULTICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
- (OP_PASSNEXT|OP_DEF_FN), sizeof (int), -1 /* not initialized */ },
+ OP_DEF_FN, sizeof (int), -1 /* not initialized */ },
{ IPV6_MULTICAST_LOOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
- (OP_PASSNEXT|OP_DEF_FN), sizeof (int), -1 /* not initialized */},
+ OP_DEF_FN, sizeof (int), -1 /* not initialized */},
-{ IPV6_JOIN_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP, (OP_PASSNEXT|OP_NODEFAULT),
+{ IPV6_JOIN_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP, OP_NODEFAULT,
sizeof (struct ipv6_mreq), -1 /* not initialized */ },
{ IPV6_LEAVE_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP,
- (OP_PASSNEXT|OP_NODEFAULT),
+ OP_NODEFAULT,
sizeof (struct ipv6_mreq), -1 /* not initialized */ },
-{ IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, (OP_PASSNEXT|OP_DEF_FN),
+{ IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
sizeof (int), -1 /* not initialized */ },
-{ IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
sizeof (int), 0 /* no ifindex */ },
-{ IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, OP_PASSNEXT,
+{ IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0,
sizeof (int), 0 },
{ IPV6_PKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
- (OP_PASSNEXT|OP_NODEFAULT|OP_VARLEN),
+ (OP_NODEFAULT|OP_VARLEN),
sizeof (struct in6_pktinfo), -1 /* not initialized */ },
{ IPV6_HOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
- (OP_PASSNEXT|OP_NODEFAULT),
+ OP_NODEFAULT,
sizeof (int), -1 /* not initialized */ },
{ IPV6_NEXTHOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
- (OP_PASSNEXT|OP_NODEFAULT|OP_VARLEN),
+ (OP_NODEFAULT|OP_VARLEN),
sizeof (sin6_t), -1 /* not initialized */ },
{ IPV6_HOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
- (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT),
+ (OP_VARLEN|OP_NODEFAULT),
MAX_EHDR_LEN, -1 /* not initialized */ },
{ IPV6_DSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
- (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT),
+ (OP_VARLEN|OP_NODEFAULT),
MAX_EHDR_LEN, -1 /* not initialized */ },
{ IPV6_RTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
- (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT),
+ (OP_VARLEN|OP_NODEFAULT),
MAX_EHDR_LEN, -1 /* not initialized */ },
{ IPV6_RTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
- (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT),
+ (OP_VARLEN|OP_NODEFAULT),
MAX_EHDR_LEN, -1 /* not initialized */ },
{ IPV6_TCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
- (OP_PASSNEXT|OP_NODEFAULT),
+ OP_NODEFAULT,
sizeof (int), -1 /* not initialized */ },
{ IPV6_PATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
- (OP_PASSNEXT|OP_NODEFAULT),
- sizeof (int), -1 /* not initialized */ },
-{ IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+ OP_NODEFAULT,
+ sizeof (struct ip6_mtuinfo), -1 },
+{ IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
sizeof (int), 0 },
-{ IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
sizeof (int), 0 },
-{ IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
sizeof (int), 0 },
-{ IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
sizeof (int), 0 },
-{ IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
sizeof (int), 0 },
-{ IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
sizeof (int), 0 },
-{ _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
sizeof (int), 0 },
-{ IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
sizeof (int), 0 },
-{ IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
sizeof (int), 0 },
-{ IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
sizeof (int), 0 },
{ IPV6_RECVPATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
- OP_PASSNEXT, sizeof (int), 0 },
-{ IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+ 0, sizeof (int), 0 },
+{ IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
sizeof (int), 0 },
-{ IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, (OP_PASSNEXT|OP_NODEFAULT),
+{ IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
sizeof (ipsec_req_t), -1 /* not initialized */ },
-{ IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
+{ IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
sizeof (uint32_t), IPV6_PREFER_SRC_DEFAULT },
{ MCAST_JOIN_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP,
- (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_req),
+ OP_NODEFAULT, sizeof (struct group_req),
-1 /* not initialized */ },
{ MCAST_LEAVE_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP,
- (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_req),
+ OP_NODEFAULT, sizeof (struct group_req),
-1 /* not initialized */ },
{ MCAST_BLOCK_SOURCE, IPPROTO_IPV6, OA_X, OA_X, OP_NP,
- (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_source_req),
+ OP_NODEFAULT, sizeof (struct group_source_req),
-1 /* not initialized */ },
{ MCAST_UNBLOCK_SOURCE, IPPROTO_IPV6, OA_X, OA_X, OP_NP,
- (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_source_req),
+ OP_NODEFAULT, sizeof (struct group_source_req),
-1 /* not initialized */ },
{ MCAST_JOIN_SOURCE_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP,
- (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_source_req),
+ OP_NODEFAULT, sizeof (struct group_source_req),
-1 /* not initialized */ },
{ MCAST_LEAVE_SOURCE_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP,
- (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_source_req),
+ OP_NODEFAULT, sizeof (struct group_source_req),
-1 /* not initialized */ },
-{ UDP_ANONPRIVBIND, IPPROTO_UDP, OA_R, OA_RW, OP_PRIVPORT, OP_PASSNEXT,
+{ UDP_ANONPRIVBIND, IPPROTO_UDP, OA_R, OA_RW, OP_PRIVPORT, 0,
sizeof (int), 0 },
-{ UDP_EXCLBIND, IPPROTO_UDP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0
+{ UDP_EXCLBIND, IPPROTO_UDP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
},
{ UDP_RCVHDR, IPPROTO_UDP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
},
@@ -317,7 +319,6 @@ optdb_obj_t udp_opt_obj = {
udp_opt_default, /* UDP default value function pointer */
udp_tpi_opt_get, /* UDP get function pointer */
udp_tpi_opt_set, /* UDP set function pointer */
- B_TRUE, /* UDP is tpi provider */
UDP_OPT_ARR_CNT, /* UDP option database count of entries */
udp_opt_arr, /* UDP option database */
UDP_VALID_LEVELS_CNT, /* UDP valid level count of entries */
diff --git a/usr/src/uts/common/inet/udp_impl.h b/usr/src/uts/common/inet/udp_impl.h
index 1b4935f456..4da82a0377 100644
--- a/usr/src/uts/common/inet/udp_impl.h
+++ b/usr/src/uts/common/inet/udp_impl.h
@@ -51,84 +51,6 @@ extern "C" {
#define UDP_MOD_ID 5607
-typedef struct udp_bits_s {
-
- uint32_t
-
- udpb_debug : 1, /* SO_DEBUG "socket" option. */
- udpb_dontroute : 1, /* SO_DONTROUTE "socket" option. */
- udpb_broadcast : 1, /* SO_BROADCAST "socket" option. */
- udpb_useloopback : 1, /* SO_USELOOPBACK "socket" option */
-
- udpb_reuseaddr : 1, /* SO_REUSEADDR "socket" option. */
- udpb_dgram_errind : 1, /* SO_DGRAM_ERRIND option */
- udpb_recvdstaddr : 1, /* IP_RECVDSTADDR option */
- udpb_recvopts : 1, /* IP_RECVOPTS option */
-
- udpb_unspec_source : 1, /* IP*_UNSPEC_SRC option */
- udpb_ip_recvpktinfo : 1, /* IPV6_RECVPKTINFO option */
- udpb_ipv6_recvhoplimit : 1, /* IPV6_RECVHOPLIMIT option */
- udpb_ipv6_recvhopopts : 1, /* IPV6_RECVHOPOPTS option */
-
- udpb_ipv6_recvdstopts : 1, /* IPV6_RECVDSTOPTS option */
- udpb_ipv6_recvrthdr : 1, /* IPV6_RECVRTHDR option */
- udpb_ipv6_recvtclass : 1, /* IPV6_RECVTCLASS */
- udpb_ipv6_recvpathmtu : 1, /* IPV6_RECVPATHMTU */
-
- udpb_anon_priv_bind : 1,
- udpb_exclbind : 1, /* ``exclusive'' binding */
- udpb_recvif : 1, /* IP_RECVIF option */
- udpb_recvslla : 1, /* IP_RECVSLLA option */
-
- udpb_recvttl : 1, /* IP_RECVTTL option */
- udpb_recvucred : 1, /* IP_RECVUCRED option */
- udpb_old_ipv6_recvdstopts : 1, /* old form of IPV6_DSTOPTS */
- udpb_ipv6_recvrthdrdstopts : 1, /* IPV6_RECVRTHDRDSTOPTS */
-
- udpb_rcvhdr : 1, /* UDP_RCVHDR option */
- udpb_issocket : 1, /* socket mode; sockfs is on top */
- udpb_timestamp : 1, /* SO_TIMESTAMP "socket" option */
-
- udpb_nat_t_endpoint : 1, /* UDP_NAT_T_ENDPOINT option */
- udpb_pad_to_bit_31 : 4;
-} udp_bits_t;
-
-#define udp_debug udp_bits.udpb_debug
-#define udp_dontroute udp_bits.udpb_dontroute
-#define udp_broadcast udp_bits.udpb_broadcast
-#define udp_useloopback udp_bits.udpb_useloopback
-
-#define udp_reuseaddr udp_bits.udpb_reuseaddr
-#define udp_dgram_errind udp_bits.udpb_dgram_errind
-#define udp_recvdstaddr udp_bits.udpb_recvdstaddr
-#define udp_recvopts udp_bits.udpb_recvopts
-
-#define udp_unspec_source udp_bits.udpb_unspec_source
-#define udp_ip_recvpktinfo udp_bits.udpb_ip_recvpktinfo
-#define udp_ipv6_recvhoplimit udp_bits.udpb_ipv6_recvhoplimit
-#define udp_ipv6_recvhopopts udp_bits.udpb_ipv6_recvhopopts
-
-#define udp_ipv6_recvdstopts udp_bits.udpb_ipv6_recvdstopts
-#define udp_ipv6_recvrthdr udp_bits.udpb_ipv6_recvrthdr
-#define udp_ipv6_recvtclass udp_bits.udpb_ipv6_recvtclass
-#define udp_ipv6_recvpathmtu udp_bits.udpb_ipv6_recvpathmtu
-
-#define udp_anon_priv_bind udp_bits.udpb_anon_priv_bind
-#define udp_exclbind udp_bits.udpb_exclbind
-#define udp_recvif udp_bits.udpb_recvif
-#define udp_recvslla udp_bits.udpb_recvslla
-
-#define udp_recvttl udp_bits.udpb_recvttl
-#define udp_recvucred udp_bits.udpb_recvucred
-#define udp_old_ipv6_recvdstopts udp_bits.udpb_old_ipv6_recvdstopts
-#define udp_ipv6_recvrthdrdstopts udp_bits.udpb_ipv6_recvrthdrdstopts
-
-#define udp_rcvhdr udp_bits.udpb_rcvhdr
-#define udp_issocket udp_bits.udpb_issocket
-#define udp_timestamp udp_bits.udpb_timestamp
-
-#define udp_nat_t_endpoint udp_bits.udpb_nat_t_endpoint
-
/*
* Bind hash list size and hash function. It has to be a power of 2 for
* hashing.
@@ -148,49 +70,21 @@ typedef struct udp_fanout_s {
#endif
} udp_fanout_t;
-/*
- * dev_q is the write side queue of the entity below IP.
- * If there is a module below IP, we can't optimize by looking
- * at q_first of the queue below IP. If the driver is directly
- * below IP and if the q_first is NULL, we optimize by not doing
- * the canput check
- */
-#define DEV_Q_FLOW_BLOCKED(dev_q) \
- (((dev_q)->q_next != NULL || (dev_q)->q_first != NULL) && \
- !canput(dev_q))
-
/* Kstats */
typedef struct udp_stat { /* Class "net" kstats */
- kstat_named_t udp_ip_send;
- kstat_named_t udp_ip_ire_send;
- kstat_named_t udp_ire_null;
kstat_named_t udp_sock_fallback;
- kstat_named_t udp_out_sw_cksum;
- kstat_named_t udp_out_sw_cksum_bytes;
kstat_named_t udp_out_opt;
kstat_named_t udp_out_err_notconn;
kstat_named_t udp_out_err_output;
kstat_named_t udp_out_err_tudr;
- kstat_named_t udp_in_pktinfo;
- kstat_named_t udp_in_recvdstaddr;
- kstat_named_t udp_in_recvopts;
- kstat_named_t udp_in_recvif;
- kstat_named_t udp_in_recvslla;
- kstat_named_t udp_in_recvucred;
- kstat_named_t udp_in_recvttl;
- kstat_named_t udp_in_recvhopopts;
- kstat_named_t udp_in_recvhoplimit;
- kstat_named_t udp_in_recvdstopts;
- kstat_named_t udp_in_recvrtdstopts;
- kstat_named_t udp_in_recvrthdr;
- kstat_named_t udp_in_recvpktinfo;
- kstat_named_t udp_in_recvtclass;
- kstat_named_t udp_in_timestamp;
- kstat_named_t udp_ip_rcvpktinfo;
- kstat_named_t udp_cookie_coll;
#ifdef DEBUG
kstat_named_t udp_data_conn;
kstat_named_t udp_data_notconn;
+ kstat_named_t udp_out_lastdst;
+ kstat_named_t udp_out_diffdst;
+ kstat_named_t udp_out_ipv6;
+ kstat_named_t udp_out_mapped;
+ kstat_named_t udp_out_ipv4;
#endif
} udp_stat_t;
@@ -242,79 +136,43 @@ typedef struct udp_stack udp_stack_t;
/* Internal udp control structure, one per open stream */
typedef struct udp_s {
- krwlock_t udp_rwlock; /* Protects most of udp_t */
- t_scalar_t udp_pending_op; /* The current TPI operation */
/*
- * Following fields up to udp_ipversion protected by conn_lock,
- * and the fanout lock i.e.uf_lock. Need both locks to change the
- * field, either lock is sufficient for reading the field.
+ * The addresses and ports in the conn_t and udp_state are protected by
+ * conn_lock and the fanout lock i.e. uf_lock. Need both locks to change
+ * the fields, either lock is sufficient for reading the field.
+ * conn_lock also protects the content of udp_t.
*/
uint32_t udp_state; /* TPI state */
- in_port_t udp_port; /* Port bound to this stream */
- in_port_t udp_dstport; /* Connected port */
- in6_addr_t udp_v6src; /* Source address of this stream */
- in6_addr_t udp_bound_v6src; /* Explicitly bound address */
- in6_addr_t udp_v6dst; /* Connected destination */
- /*
- * IP format that packets transmitted from this struct should use.
- * Value can be IP4_VERSION or IPV6_VERSION.
- */
- ushort_t udp_ipversion;
- /* Written to only once at the time of opening the endpoint */
- sa_family_t udp_family; /* Family from socket() call */
-
- /* Following protected by udp_rwlock */
- uint32_t udp_flowinfo; /* Connected flow id and tclass */
- uint32_t udp_max_hdr_len; /* For write offset in stream head */
- uint32_t udp_ip_snd_options_len; /* Len of IPv4 options */
- uchar_t *udp_ip_snd_options; /* Ptr to IPv4 options */
- uint32_t udp_ip_rcv_options_len; /* Len of IPv4 options recvd */
- uchar_t *udp_ip_rcv_options; /* Ptr to IPv4 options recvd */
- uchar_t udp_multicast_ttl; /* IP*_MULTICAST_TTL/HOPS */
- ipaddr_t udp_multicast_if_addr; /* IP_MULTICAST_IF option */
- uint_t udp_multicast_if_index; /* IPV6_MULTICAST_IF option */
- int udp_bound_if; /* IP*_BOUND_IF option */
+ ip_pkt_t udp_recv_ipp; /* Used for IPv4 options received */
/* Written to only once at the time of opening the endpoint */
conn_t *udp_connp;
- /* Following protected by udp_rwlock */
- udp_bits_t udp_bits; /* Bit fields defined above */
- uint8_t udp_type_of_service; /* IP_TOS option */
- uint8_t udp_ttl; /* TTL or hoplimit */
- ip6_pkt_t udp_sticky_ipp; /* Sticky options */
- uint8_t *udp_sticky_hdrs; /* Prebuilt IPv6 hdrs */
- uint_t udp_sticky_hdrs_len; /* Incl. ip6h and any ip6i */
+ uint32_t
+ udp_issocket : 1, /* socket mode; sockfs is on top */
+ udp_nat_t_endpoint : 1, /* UDP_NAT_T_ENDPOINT option */
+ udp_rcvhdr : 1, /* UDP_RCVHDR option */
+
+ udp_pad_to_bit_31 : 29;
/* Following 2 fields protected by the uf_lock */
struct udp_s *udp_bind_hash; /* Bind hash chain */
struct udp_s **udp_ptpbhn; /* Pointer to previous bind hash next. */
- /* Following protected by udp_rwlock */
kmutex_t udp_recv_lock; /* recv lock */
size_t udp_rcv_disply_hiwat; /* user's view of rcvbuf */
size_t udp_rcv_hiwat; /* receive high watermark */
- size_t udp_rcv_lowat; /* receive low watermark */
- size_t udp_xmit_hiwat; /* Send buffer high watermark */
- size_t udp_xmit_lowat; /* Send buffer low watermark */
- uint_t udp_label_len; /* length of security label */
- uint_t udp_label_len_v6; /* len of v6 security label */
- in6_addr_t udp_v6lastdst; /* most recent destination */
- in_port_t udp_lastdstport; /* most recent dest port */
- cred_t *udp_last_cred; /* most recent credentials */
- cred_t *udp_effective_cred; /* cred with effective label */
-
- uint64_t udp_open_time; /* time when this was opened */
- pid_t udp_open_pid; /* process id when this was opened */
+
+ /* Set at open time and never changed */
udp_stack_t *udp_us; /* Stack instance for zone */
+
int udp_delayed_error;
mblk_t *udp_fallback_queue_head;
mblk_t *udp_fallback_queue_tail;
struct sockaddr_storage udp_delayed_addr;
} udp_t;
-/* UDP Protocol header */
/* UDP Protocol header aligned */
typedef struct udpahdr_s {
in_port_t uha_src_port; /* Source port */
@@ -334,6 +192,8 @@ typedef struct udpahdr_s {
#define us_xmit_lowat us_param_arr[8].udp_param_value
#define us_recv_hiwat us_param_arr[9].udp_param_value
#define us_max_buf us_param_arr[10].udp_param_value
+#define us_pmtu_discovery us_param_arr[11].udp_param_value
+#define us_sendto_ignerr us_param_arr[12].udp_param_value
#define UDP_STAT(us, x) ((us)->us_statistics.x.value.ui64++)
@@ -348,14 +208,11 @@ typedef struct udpahdr_s {
extern int udp_opt_default(queue_t *, t_scalar_t, t_scalar_t, uchar_t *);
extern int udp_tpi_opt_get(queue_t *, t_scalar_t, t_scalar_t, uchar_t *);
extern int udp_tpi_opt_set(queue_t *, uint_t, int, int, uint_t, uchar_t *,
- uint_t *, uchar_t *, void *, cred_t *, mblk_t *);
+ uint_t *, uchar_t *, void *, cred_t *);
extern mblk_t *udp_snmp_get(queue_t *, mblk_t *);
extern int udp_snmp_set(queue_t *, t_scalar_t, t_scalar_t, uchar_t *, int);
-extern void udp_close_free(conn_t *);
-extern void udp_quiesce_conn(conn_t *);
extern void udp_ddi_g_init(void);
extern void udp_ddi_g_destroy(void);
-extern void udp_g_q_inactive(udp_stack_t *);
extern void udp_output(conn_t *connp, mblk_t *mp, struct sockaddr *addr,
socklen_t addrlen);
extern void udp_wput(queue_t *, mblk_t *);
diff --git a/usr/src/uts/common/io/dld/dld_proto.c b/usr/src/uts/common/io/dld/dld_proto.c
index 338a1c96d0..79b88ca659 100644
--- a/usr/src/uts/common/io/dld/dld_proto.c
+++ b/usr/src/uts/common/io/dld/dld_proto.c
@@ -1478,7 +1478,7 @@ dld_capab_lso(dld_str_t *dsp, void *data, uint_t flags)
lso->lso_flags = 0;
/* translate the flag for mac clients */
if ((mac_lso.lso_flags & LSO_TX_BASIC_TCP_IPV4) != 0)
- lso->lso_flags |= DLD_LSO_TX_BASIC_TCP_IPV4;
+ lso->lso_flags |= DLD_LSO_BASIC_TCP_IPV4;
dsp->ds_lso = B_TRUE;
dsp->ds_lso_max = lso->lso_max;
} else {
diff --git a/usr/src/uts/common/io/ib/clients/rds/rds_opt.c b/usr/src/uts/common/io/ib/clients/rds/rds_opt.c
index 902d838ff4..639bb28bcc 100644
--- a/usr/src/uts/common/io/ib/clients/rds/rds_opt.c
+++ b/usr/src/uts/common/io/ib/clients/rds/rds_opt.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -29,9 +29,9 @@
#define rds_max_buf 2097152
opdes_t rds_opt_arr[] = {
-{ SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
-{ SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
-{ SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
+{ SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
+{ SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
};
/* ARGSUSED */
@@ -79,7 +79,7 @@ rds_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
int
rds_opt_set(queue_t *q, uint_t optset_context, int level,
int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp,
- uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, mblk_t *mblk)
+ uchar_t *outvalp, void *thisdg_attrs, cred_t *cr)
{
int *i1 = (int *)(uintptr_t)invalp;
boolean_t checkonly;
@@ -187,7 +187,6 @@ optdb_obj_t rds_opt_obj = {
rds_opt_default, /* RDS default value function pointer */
rds_opt_get, /* RDS get function pointer */
rds_opt_set, /* RDS set function pointer */
- B_TRUE, /* RDS is tpi provider */
RDS_OPT_ARR_CNT, /* RDS option database count of entries */
rds_opt_arr, /* RDS option database */
RDS_VALID_LEVELS_CNT, /* RDS valid level count of entries */
diff --git a/usr/src/uts/common/io/ib/clients/rds/rdsddi.c b/usr/src/uts/common/io/ib/clients/rds/rdsddi.c
index a4a9c6c8e0..13a1d4bf75 100644
--- a/usr/src/uts/common/io/ib/clients/rds/rdsddi.c
+++ b/usr/src/uts/common/io/ib/clients/rds/rdsddi.c
@@ -654,11 +654,9 @@ rds_wput_other(queue_t *q, mblk_t *mp)
}
if (((union T_primitives *)(uintptr_t)rptr)->type ==
T_SVR4_OPTMGMT_REQ) {
- (void) svr4_optcom_req(q, mp, cr, &rds_opt_obj,
- B_FALSE);
+ svr4_optcom_req(q, mp, cr, &rds_opt_obj);
} else {
- (void) tpi_optcom_req(q, mp, cr, &rds_opt_obj,
- B_FALSE);
+ tpi_optcom_req(q, mp, cr, &rds_opt_obj);
}
return;
case T_CONN_REQ:
diff --git a/usr/src/uts/common/io/ib/clients/rds/rdssubr.c b/usr/src/uts/common/io/ib/clients/rds/rdssubr.c
index 8e57cb783d..f9bbcd092f 100644
--- a/usr/src/uts/common/io/ib/clients/rds/rdssubr.c
+++ b/usr/src/uts/common/io/ib/clients/rds/rdssubr.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/ib/clients/rds/rds.h>
#include <sys/ib/clients/rds/rds_kstat.h>
@@ -135,9 +133,9 @@ rds_init()
* kstats
*/
rds_kstatsp = kstat_create("rds", 0,
- "rds_kstat", "misc", KSTAT_TYPE_NAMED,
- sizeof (rds_kstat) / sizeof (kstat_named_t),
- KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
+ "rds_kstat", "misc", KSTAT_TYPE_NAMED,
+ sizeof (rds_kstat) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
if (rds_kstatsp != NULL) {
rds_kstatsp->ks_lock = &rds_kstat_mutex;
rds_kstatsp->ks_data = (void *)&rds_kstat;
@@ -298,17 +296,14 @@ rds_fanout(ipaddr_t local_addr, ipaddr_t rem_addr,
boolean_t
rds_islocal(ipaddr_t addr)
{
- ire_t *ire;
ip_stack_t *ipst;
ipst = netstack_find_by_zoneid(GLOBAL_ZONEID)->netstack_ip;
ASSERT(ipst != NULL);
-
- ire = ire_ctable_lookup(addr, NULL, IRE_LOCAL | IRE_LOOPBACK |
- IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
- netstack_rele(ipst->ips_netstack);
- if (ire == NULL)
+ if (ip_laddr_verify_v4(addr, ALL_ZONES, ipst, B_FALSE) == IPVL_BAD) {
+ netstack_rele(ipst->ips_netstack);
return (B_FALSE);
- ire_refrele(ire);
+ }
+ netstack_rele(ipst->ips_netstack);
return (B_TRUE);
}
diff --git a/usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp.c b/usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp.c
index 944e61a067..3bb7d3a98c 100644
--- a/usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp.c
+++ b/usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp.c
@@ -26,41 +26,28 @@
#include <sys/types.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
-#include <sys/stropts.h>
-#include <sys/stream.h>
-#include <sys/strsun.h>
#include <sys/strsubr.h>
#include <sys/socket.h>
-#include <sys/stat.h>
#include <net/if_arp.h>
#include <net/if_types.h>
-#include <sys/file.h>
#include <sys/sockio.h>
#include <sys/pathname.h>
-#include <inet/arp.h>
-#include <sys/modctl.h>
#include <sys/ib/mgt/ibcm/ibcm_arp.h>
#include <sys/kstr.h>
-#include <sys/tiuser.h>
#include <sys/t_kuser.h>
extern char cmlog[];
-extern int ibcm_arp_pr_lookup(ibcm_arp_streams_t *ib_s, ibt_ip_addr_t *dst_addr,
- ibt_ip_addr_t *src_addr, ibcm_arp_pr_comp_func_t func);
-extern void ibcm_arp_pr_arp_ack(mblk_t *mp);
-extern void ibcm_arp_prwqn_delete(ibcm_arp_prwqn_t *wqnp);
+extern int ibcm_resolver_pr_lookup(ibcm_arp_streams_t *ib_s,
+ ibt_ip_addr_t *dst_addr, ibt_ip_addr_t *src_addr);
+extern void ibcm_arp_delete_prwqn(ibcm_arp_prwqn_t *wqnp);
-_NOTE(SCHEME_PROTECTS_DATA("Unshared data", datab))
_NOTE(SCHEME_PROTECTS_DATA("Unshared data", ibt_ip_addr_s))
_NOTE(SCHEME_PROTECTS_DATA("Unshared data", ibcm_arp_ip_t))
_NOTE(SCHEME_PROTECTS_DATA("Unshared data", ibcm_arp_ibd_insts_t))
_NOTE(SCHEME_PROTECTS_DATA("Unshared data", ibcm_arp_prwqn_t))
-_NOTE(SCHEME_PROTECTS_DATA("Unshared data", iocblk))
-_NOTE(SCHEME_PROTECTS_DATA("Unshared data", msgb))
-_NOTE(SCHEME_PROTECTS_DATA("Unshared data", queue))
_NOTE(SCHEME_PROTECTS_DATA("Unshared data", sockaddr_in))
_NOTE(SCHEME_PROTECTS_DATA("Unshared data", sockaddr_in6))
@@ -89,269 +76,6 @@ ibcm_ip_print(char *label, ibt_ip_addr_t *ipaddr)
}
}
-/*
- * ibcm_arp_get_ibaddr_cb
- */
-static int
-ibcm_arp_get_ibaddr_cb(void *arg, int status)
-{
- ibcm_arp_prwqn_t *wqnp = (ibcm_arp_prwqn_t *)arg;
- ibcm_arp_streams_t *ib_s = (ibcm_arp_streams_t *)wqnp->arg;
-
- IBTF_DPRINTF_L4(cmlog, "ibcm_arp_get_ibaddr_cb(ib_s: %p wqnp: %p)",
- ib_s, wqnp);
-
- mutex_enter(&ib_s->lock);
- ib_s->status = status;
- ib_s->done = B_TRUE;
-
- IBTF_DPRINTF_L3(cmlog, "ibcm_arp_get_ibaddr_cb: SGID %llX:%llX "
- "DGID: %llX:%llX", wqnp->sgid.gid_prefix, wqnp->sgid.gid_guid,
- wqnp->dgid.gid_prefix, wqnp->dgid.gid_guid);
-
- /* lock is held by the caller. */
- cv_signal(&ib_s->cv);
- mutex_exit(&ib_s->lock);
- return (0);
-}
-
-/*
- * Lower read service procedure (messages coming back from arp/ip).
- * Process messages based on queue type.
- */
-static int
-ibcm_arp_lrsrv(queue_t *q)
-{
- mblk_t *mp;
- ibcm_arp_streams_t *ib_s = q->q_ptr;
-
- IBTF_DPRINTF_L4(cmlog, "ibcm_arp_lrsrv(%p, ibd_s: 0x%p)", q, ib_s);
-
- if (WR(q) == ib_s->arpqueue) {
- while (mp = getq(q)) {
- ibcm_arp_pr_arp_ack(mp);
- }
- }
-
- return (0);
-}
-
-/*
- * Lower write service procedure.
- * Used when lower streams are flow controlled.
- */
-static int
-ibcm_arp_lwsrv(queue_t *q)
-{
- mblk_t *mp;
-
- IBTF_DPRINTF_L4(cmlog, "ibcm_arp_lwsrv(%p)", q);
-
- while (mp = getq(q)) {
- if (canputnext(q)) {
- putnext(q, mp);
- } else {
- (void) putbq(q, mp);
- qenable(q);
- break;
- }
- }
-
- return (0);
-}
-
-/*
- * Lower read put procedure. Arp/ip messages come here.
- */
-static int
-ibcm_arp_lrput(queue_t *q, mblk_t *mp)
-{
- IBTF_DPRINTF_L4(cmlog, "ibcm_arp_lrput(0x%p, db_type: %d)",
- q, DB_TYPE(mp));
-
- switch (DB_TYPE(mp)) {
- case M_FLUSH:
- /*
- * Turn around
- */
- if (*mp->b_rptr & FLUSHW) {
- *mp->b_rptr &= ~FLUSHR;
- qreply(q, mp);
- return (0);
- }
- freemsg(mp);
- break;
- case M_IOCACK:
- case M_IOCNAK:
- case M_DATA:
- /*
- * This could be in interrupt context.
- * Some of the ibt calls cannot be called in
- * interrupt context, so
- * put it in the queue and the message will be
- * processed by service proccedure
- */
- (void) putq(q, mp);
- qenable(q);
- break;
- default:
- IBTF_DPRINTF_L2(cmlog, "ibcm_arp_lrput: "
- "got unknown msg <0x%x>\n", mp->b_datap->db_type);
- ASSERT(0);
- break;
- }
-
- return (0);
-}
-
-/*
- * Streams write queue module info
- */
-static struct module_info ibcm_arp_winfo = {
- 0, /* module ID number */
- "ibcm", /* module name */
- 0, /* min packet size */
- INFPSZ,
- 49152, /* STREAM queue high water mark -- 49152 */
- 12 /* STREAM queue low water mark -- 12 */
-};
-
-/*
- * Streams lower write queue, for ibcm/ip requests.
- */
-static struct qinit ibcm_arp_lwinit = {
- NULL, /* qi_putp */
- ibcm_arp_lwsrv, /* qi_srvp */
- NULL, /* qi_qopen */
- NULL, /* qi_qclose */
- NULL, /* qi_qadmin */
- &ibcm_arp_winfo, /* module info */
- NULL, /* module statistics struct */
- NULL,
- NULL,
- STRUIOT_NONE /* stream uio type is standard uiomove() */
-};
-
-/*
- * Streams lower read queue: read reply messages from ibcm/ip.
- */
-static struct qinit ibcm_arp_lrinit = {
- ibcm_arp_lrput, /* qi_putp */
- ibcm_arp_lrsrv, /* qi_srvp */
- NULL, /* qi_qopen */
- NULL, /* qi_qclose */
- NULL, /* qi_qadmin */
- &ibcm_arp_winfo, /* module info */
- NULL, /* module statistics struct */
- NULL,
- NULL,
- STRUIOT_NONE /* stream uio type is standard uiomove() */
-};
-
-
-static int
-ibcm_arp_link_driver(ibcm_arp_streams_t *ib_s, char *path, queue_t **q,
- vnode_t **dev_vp)
-{
- struct stdata *dev_stp;
- vnode_t *vp;
- int error;
- queue_t *rq;
-
- IBTF_DPRINTF_L4(cmlog, "ibcm_arp_link_driver: Enter: %s", path);
-
- /* open the driver from inside the kernel */
- error = vn_open(path, UIO_SYSSPACE, FREAD|FWRITE, 0, &vp,
- 0, NULL);
- if (error) {
- IBTF_DPRINTF_L2(cmlog, "ibcm_arp_link_driver: "
- "vn_open('%s') failed\n", path);
- return (error);
- }
- *dev_vp = vp;
-
- dev_stp = vp->v_stream;
- *q = dev_stp->sd_wrq;
-
- VN_HOLD(vp);
-
- rq = RD(dev_stp->sd_wrq);
- RD(rq)->q_ptr = WR(rq)->q_ptr = ib_s;
- setq(rq, &ibcm_arp_lrinit, &ibcm_arp_lwinit, NULL, QMTSAFE,
- SQ_CI|SQ_CO, B_FALSE);
-
- return (0);
-}
-
-extern struct qinit strdata;
-extern struct qinit stwdata;
-
-/*
- * Unlink ip, ibcm, icmp6 drivers
- */
-/* ARGSUSED */
-static int
-ibcm_arp_unlink_driver(queue_t **q, vnode_t **dev_vp)
-{
- vnode_t *vp = *dev_vp;
- struct stdata *dev_stp = vp->v_stream;
- queue_t *wrq, *rq;
- int rc;
-
- IBTF_DPRINTF_L4(cmlog, "ibcm_arp_unlink_driver: Enter: 0x%p", q);
-
- wrq = dev_stp->sd_wrq;
- rq = RD(wrq);
-
- disable_svc(rq);
- wait_svc(rq);
- flushq(rq, FLUSHALL);
- flushq(WR(rq), FLUSHALL);
-
- rq->q_ptr = wrq->q_ptr = dev_stp;
-
- setq(rq, &strdata, &stwdata, NULL, QMTSAFE, SQ_CI|SQ_CO, B_TRUE);
-
- if ((rc = VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL)) != 0) {
- IBTF_DPRINTF_L2(cmlog, "ibcm_arp_unlink_driver: VOP_CLOSE "
- "failed %d\n", rc);
- }
- VN_RELE(vp);
-
- return (0);
-}
-
-static int
-ibcm_arp_unlink_drivers(ibcm_arp_streams_t *ib_s)
-{
- IBTF_DPRINTF_L4(cmlog, "ibcm_arp_unlink_drivers(%p)", ib_s);
-
- if (ib_s->arpqueue) {
- (void) ibcm_arp_unlink_driver(&ib_s->arpqueue, &ib_s->arp_vp);
- }
-
- return (0);
-}
-
-/*
- * Link ip, ibtl drivers below ibtl
- */
-static int
-ibcm_arp_link_drivers(ibcm_arp_streams_t *ib_s)
-{
- int rc;
-
- IBTF_DPRINTF_L4(cmlog, "ibcm_arp_link_drivers(%p)", ib_s);
-
- if ((rc = ibcm_arp_link_driver(ib_s, "/dev/arp", &ib_s->arpqueue,
- &ib_s->arp_vp)) != 0) {
- IBTF_DPRINTF_L2(cmlog, "ibcm_arp_link_drivers: "
- "ibcm_arp_link_driver failed: %d\n", rc);
- return (rc);
- }
-
- return (0);
-}
ibt_status_t
ibcm_arp_get_ibaddr(ibt_ip_addr_t srcaddr, ibt_ip_addr_t destaddr,
@@ -370,21 +94,13 @@ ibcm_arp_get_ibaddr(ibt_ip_addr_t srcaddr, ibt_ip_addr_t destaddr,
mutex_init(&ib_s->lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&ib_s->cv, NULL, CV_DRIVER, NULL);
- ret = ibcm_arp_link_drivers(ib_s);
- if (ret != 0) {
- IBTF_DPRINTF_L3(cmlog, "ibcm_arp_get_ibaddr: "
- "ibcm_arp_link_drivers failed %d", ret);
- goto arp_ibaddr_error;
- }
-
mutex_enter(&ib_s->lock);
ib_s->done = B_FALSE;
mutex_exit(&ib_s->lock);
- ret = ibcm_arp_pr_lookup(ib_s, &destaddr, &srcaddr,
- ibcm_arp_get_ibaddr_cb);
+ ret = ibcm_resolver_pr_lookup(ib_s, &destaddr, &srcaddr);
- IBTF_DPRINTF_L3(cmlog, "ibcm_arp_get_ibaddr: ibcm_arp_pr_lookup "
+ IBTF_DPRINTF_L3(cmlog, "ibcm_arp_get_ibaddr: ibcm_resolver_pr_lookup "
"returned: %d", ret);
if (ret == 0) {
mutex_enter(&ib_s->lock);
@@ -393,7 +109,6 @@ ibcm_arp_get_ibaddr(ibt_ip_addr_t srcaddr, ibt_ip_addr_t destaddr,
mutex_exit(&ib_s->lock);
}
- (void) ibcm_arp_unlink_drivers(ib_s);
mutex_enter(&ib_s->lock);
wqnp = ib_s->wqnp;
if (ib_s->status == 0) {
@@ -407,11 +122,11 @@ ibcm_arp_get_ibaddr(ibt_ip_addr_t srcaddr, ibt_ip_addr_t destaddr,
ib_s->wqnp->sgid.gid_prefix, ib_s->wqnp->sgid.gid_guid,
ib_s->wqnp->dgid.gid_prefix, ib_s->wqnp->dgid.gid_guid);
- ibcm_arp_prwqn_delete(wqnp);
+ ibcm_arp_delete_prwqn(wqnp);
} else if (ret == 0) {
/*
* We come here only when lookup has returned empty (failed)
- * via callback routine - ibcm_arp_get_ibaddr_cb
+ * via callback routine.
* i.e. ib_s->status is non-zero, while ret is zero.
*/
if (wqnp)
@@ -884,20 +599,3 @@ srcip_plist_end:
return (ret);
}
-/* Routines for warlock */
-
-/* ARGSUSED */
-static int
-ibcm_arp_dummy_ibaddr_hdl(void *arg, int status)
-{
- ibcm_arp_prwqn_t dummy_wqn1;
- ibcm_arp_prwqn_t dummy_wqn2;
-
- dummy_wqn1.func = ibcm_arp_get_ibaddr_cb;
- dummy_wqn2.func = ibcm_arp_dummy_ibaddr_hdl;
-
- IBTF_DPRINTF_L5(cmlog, "ibcm_arp_dummy_ibaddr_hdl: "
- "dummy_wqn1.func %p %p", dummy_wqn1.func, dummy_wqn2.func);
-
- return (0);
-}
diff --git a/usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp_link.c b/usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp_link.c
index 79d420d467..45fbfd7932 100644
--- a/usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp_link.c
+++ b/usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp_link.c
@@ -24,309 +24,32 @@
*/
#include <sys/types.h>
-#include <sys/stream.h>
-#include <sys/dlpi.h>
-#include <sys/stropts.h>
-#include <sys/strsun.h>
-#include <sys/sysmacros.h>
-#include <sys/strlog.h>
-#include <sys/ddi.h>
-#include <sys/cmn_err.h>
-#include <sys/socket.h>
#include <net/if.h>
#include <net/if_types.h>
-#include <netinet/in.h>
-#include <sys/ethernet.h>
-#include <inet/arp.h>
#include <inet/ip.h>
#include <inet/ip_ire.h>
#include <inet/ip_if.h>
#include <sys/ib/mgt/ibcm/ibcm_arp.h>
-#include <inet/ip_ftable.h>
-
-static areq_t ibcm_arp_areq_template = {
- AR_ENTRY_QUERY, /* cmd */
- sizeof (areq_t) + (2 * IP_ADDR_LEN), /* name offset */
- sizeof (areq_t), /* name len */
- IP_ARP_PROTO_TYPE, /* protocol, from arps perspective */
- sizeof (areq_t), /* target addr offset */
- IP_ADDR_LEN, /* target ADDR_length */
- 0, /* flags */
- sizeof (areq_t) + IP_ADDR_LEN, /* sender addr offset */
- IP_ADDR_LEN, /* sender addr length */
- IBCM_ARP_XMIT_COUNT, /* xmit_count */
- IBCM_ARP_XMIT_INTERVAL, /* (re)xmit_interval in milliseconds */
- 4 /* max # of requests to buffer */
- /*
- * anything else filled in by the code
- */
-};
-
-static area_t ibcm_arp_area_template = {
- AR_ENTRY_ADD, /* cmd */
- sizeof (area_t) + IPOIB_ADDRL + (2 * IP_ADDR_LEN), /* name offset */
- sizeof (area_t), /* name len */
- IP_ARP_PROTO_TYPE, /* protocol, from arps perspective */
- sizeof (area_t), /* proto addr offset */
- IP_ADDR_LEN, /* proto ADDR_length */
- sizeof (area_t) + (IP_ADDR_LEN), /* proto mask offset */
- 0, /* flags */
- sizeof (area_t) + (2 * IP_ADDR_LEN), /* hw addr offset */
- IPOIB_ADDRL /* hw addr length */
-};
extern char cmlog[];
-_NOTE(SCHEME_PROTECTS_DATA("Unshared data", msgb))
-_NOTE(SCHEME_PROTECTS_DATA("Unshared data", area_t))
_NOTE(SCHEME_PROTECTS_DATA("Unshared data", ibcm_arp_streams_t))
-static void ibcm_arp_timeout(void *arg);
-static void ibcm_arp_pr_callback(ibcm_arp_prwqn_t *wqnp, int status);
-static void ibcm_ipv6_resolver_ack(ip2mac_t *, void *);
-static int ibcm_ipv6_lookup(ibcm_arp_prwqn_t *wqnp, ill_t *ill, zoneid_t zid);
-
-/*
- * issue a AR_ENTRY_QUERY to arp driver and schedule a timeout.
- */
-static int
-ibcm_arp_query_arp(ibcm_arp_prwqn_t *wqnp)
-{
- int len;
- int name_len;
- int name_offset;
- char *cp;
- mblk_t *mp;
- mblk_t *mp1;
- areq_t *areqp;
- ibcm_arp_streams_t *ib_s = (ibcm_arp_streams_t *)wqnp->arg;
-
- IBTF_DPRINTF_L4(cmlog, "ibcm_arp_query_arp(ib_s: %p wqnp: %p)",
- ib_s, wqnp);
-
- name_offset = ibcm_arp_areq_template.areq_name_offset;
-
- /*
- * allocate mblk for AR_ENTRY_QUERY
- */
- name_len = strlen(wqnp->ifname) + 1;
- len = name_len + name_offset;
- if ((mp = allocb(len, BPRI_HI)) == NULL) {
- return (ENOMEM);
- }
- bzero(mp->b_rptr, len);
- mp->b_wptr += len;
-
- /*
- * allocate a mblk and set wqnp in the data
- */
- if ((mp1 = allocb(sizeof (void *), BPRI_HI)) == NULL) {
- freeb(mp);
- return (ENOMEM);
- }
-
- mp1->b_wptr += sizeof (void *);
- *(uintptr_t *)(void *)mp1->b_rptr = (uintptr_t)wqnp; /* store wqnp */
-
- cp = (char *)mp->b_rptr;
- bcopy(&ibcm_arp_areq_template, cp, sizeof (areq_t));
- areqp = (void *)cp;
- areqp->areq_name_length = name_len;
-
- cp = (char *)areqp + areqp->areq_name_offset;
- bcopy(wqnp->ifname, cp, name_len);
-
- areqp->areq_proto = wqnp->ifproto;
- bcopy(&wqnp->ifproto, areqp->areq_sap, 2);
- cp = (char *)areqp + areqp->areq_target_addr_offset;
- bcopy(&wqnp->dst_addr.un.ip4addr, cp, IP_ADDR_LEN);
- cp = (char *)areqp + areqp->areq_sender_addr_offset;
- bcopy(&wqnp->src_addr.un.ip4addr, cp, IP_ADDR_LEN);
-
- mp->b_cont = mp1;
-
- DB_TYPE(mp) = M_PROTO;
-
- /*
- * issue the request to arp
- */
- wqnp->flags |= IBCM_ARP_PR_RESOLVE_PENDING;
- wqnp->timeout_id = timeout(ibcm_arp_timeout, wqnp,
- drv_usectohz(IBCM_ARP_TIMEOUT * 1000));
- if (canputnext(ib_s->arpqueue)) {
- putnext(ib_s->arpqueue, mp);
- } else {
- (void) putq(ib_s->arpqueue, mp);
- qenable(ib_s->arpqueue);
- }
-
- return (0);
-}
-
-/*
- * issue AR_ENTRY_SQUERY to arp driver
- */
-static int
-ibcm_arp_squery_arp(ibcm_arp_prwqn_t *wqnp)
-{
- int len;
- int name_len;
- char *cp;
- mblk_t *mp;
- mblk_t *mp1;
- area_t *areap;
- uint32_t proto_mask = 0xffffffff;
- struct iocblk *ioc;
- ibcm_arp_streams_t *ib_s = (ibcm_arp_streams_t *)wqnp->arg;
-
- IBTF_DPRINTF_L4(cmlog, "ibcm_arp_squery_arp(ib_s: %p wqnp: %p)",
- ib_s, wqnp);
-
- /*
- * allocate mblk for AR_ENTRY_SQUERY
- */
- name_len = strlen(wqnp->ifname) + 1;
- len = ibcm_arp_area_template.area_name_offset + name_len +
- sizeof (uintptr_t);
- if ((mp = allocb(len, BPRI_HI)) == NULL) {
- return (ENOMEM);
- }
- bzero(mp->b_rptr, len);
- mp->b_wptr += len + sizeof (uintptr_t);
-
- *(uintptr_t *)(void *)mp->b_rptr = (uintptr_t)wqnp; /* store wqnp */
- mp->b_rptr += sizeof (uintptr_t);
-
-
- cp = (char *)mp->b_rptr;
- bcopy(&ibcm_arp_area_template, cp, sizeof (area_t));
-
- areap = (void *)cp;
- areap->area_cmd = AR_ENTRY_SQUERY;
- areap->area_name_length = name_len;
- cp = (char *)areap + areap->area_name_offset;
- bcopy(wqnp->ifname, cp, name_len);
-
- cp = (char *)areap + areap->area_proto_addr_offset;
- bcopy(&wqnp->dst_addr.un.ip4addr, cp, IP_ADDR_LEN);
-
- cp = (char *)areap + areap->area_proto_mask_offset;
- bcopy(&proto_mask, cp, IP_ADDR_LEN);
-
- mp1 = allocb(sizeof (struct iocblk), BPRI_HI);
- if (mp1 == NULL) {
- freeb(mp);
- return (ENOMEM);
- }
- ioc = (void *)mp1->b_rptr;
- ioc->ioc_cmd = AR_ENTRY_SQUERY;
- ioc->ioc_error = 0;
- ioc->ioc_cr = NULL;
- ioc->ioc_count = msgdsize(mp);
- mp1->b_wptr += sizeof (struct iocblk);
- mp1->b_cont = mp;
-
- DB_TYPE(mp1) = M_IOCTL;
-
- if (canputnext(ib_s->arpqueue)) {
- putnext(ib_s->arpqueue, mp1);
- } else {
- (void) putq(ib_s->arpqueue, mp1);
- qenable(ib_s->arpqueue);
- }
- return (0);
-}
-
-/*
- * issue a AR_ENTRY_ADD to arp driver
- * This is required as arp driver does not maintain a cache.
- */
-static int
-ibcm_arp_add(ibcm_arp_prwqn_t *wqnp)
-{
- int len;
- int name_len;
- char *cp;
- mblk_t *mp;
- area_t *areap;
- uint32_t proto_mask = 0xffffffff;
- ibcm_arp_streams_t *ib_s = (ibcm_arp_streams_t *)wqnp->arg;
-
- IBTF_DPRINTF_L4(cmlog, "ibcm_arp_add(ib_s: %p wqnp: %p)", ib_s, wqnp);
-
- /*
- * allocate mblk for AR_ENTRY_ADD
- */
-
- name_len = strlen(wqnp->ifname) + 1;
- len = ibcm_arp_area_template.area_name_offset + name_len;
- if ((mp = allocb(len, BPRI_HI)) == NULL) {
- return (ENOMEM);
- }
- bzero(mp->b_rptr, len);
- mp->b_wptr += len;
-
- cp = (char *)mp->b_rptr;
- bcopy(&ibcm_arp_area_template, cp, sizeof (area_t));
-
- areap = (void *)mp->b_rptr;
- areap->area_name_length = name_len;
- cp = (char *)areap + areap->area_name_offset;
- bcopy(wqnp->ifname, cp, name_len);
-
- cp = (char *)areap + areap->area_proto_addr_offset;
- bcopy(&wqnp->dst_addr.un.ip4addr, cp, IP_ADDR_LEN);
-
- cp = (char *)areap + areap->area_proto_mask_offset;
- bcopy(&proto_mask, cp, IP_ADDR_LEN);
-
- cp = (char *)areap + areap->area_hw_addr_offset;
- bcopy(&wqnp->dst_mac, cp, IPOIB_ADDRL);
-
- DB_TYPE(mp) = M_PROTO;
-
- if (canputnext(ib_s->arpqueue)) {
- putnext(ib_s->arpqueue, mp);
- } else {
- (void) putq(ib_s->arpqueue, mp);
- qenable(ib_s->arpqueue);
- }
- return (0);
-}
-
-
-/*
- * timeout routine when there is no response to AR_ENTRY_QUERY
- */
-static void
-ibcm_arp_timeout(void *arg)
-{
- ibcm_arp_prwqn_t *wqnp = (ibcm_arp_prwqn_t *)arg;
- ibcm_arp_streams_t *ib_s = (ibcm_arp_streams_t *)wqnp->arg;
-
- IBTF_DPRINTF_L4(cmlog, "ibcm_arp_timeout(ib_s: %p wqnp: %p)",
- ib_s, wqnp);
- wqnp->flags &= ~IBCM_ARP_PR_RESOLVE_PENDING;
- cv_broadcast(&ib_s->cv);
-
- /*
- * indicate to user
- */
- ibcm_arp_pr_callback(wqnp, EHOSTUNREACH);
-}
+static void ibcm_resolver_ack(ip2mac_t *, void *);
+static int ibcm_nce_lookup(ibcm_arp_prwqn_t *wqnp, ill_t *ill, zoneid_t zid);
/*
* delete a wait queue node from the list.
* assumes mutex is acquired
*/
void
-ibcm_arp_prwqn_delete(ibcm_arp_prwqn_t *wqnp)
+ibcm_arp_delete_prwqn(ibcm_arp_prwqn_t *wqnp)
{
ibcm_arp_streams_t *ib_s;
- IBTF_DPRINTF_L4(cmlog, "ibcm_arp_prwqn_delete(%p)", wqnp);
+ IBTF_DPRINTF_L4(cmlog, "ibcm_arp_delete_prwqn(%p)", wqnp);
- ib_s = (ibcm_arp_streams_t *)wqnp->arg;
+ ib_s = wqnp->ib_str;
ib_s->wqnp = NULL;
kmem_free(wqnp, sizeof (ibcm_arp_prwqn_t));
}
@@ -336,7 +59,7 @@ ibcm_arp_prwqn_delete(ibcm_arp_prwqn_t *wqnp)
*/
static ibcm_arp_prwqn_t *
ibcm_arp_create_prwqn(ibcm_arp_streams_t *ib_s, ibt_ip_addr_t *dst_addr,
- ibt_ip_addr_t *src_addr, ibcm_arp_pr_comp_func_t func)
+ ibt_ip_addr_t *src_addr)
{
ibcm_arp_prwqn_t *wqnp;
@@ -354,8 +77,7 @@ ibcm_arp_create_prwqn(ibcm_arp_streams_t *ib_s, ibt_ip_addr_t *dst_addr,
if (src_addr) {
wqnp->usrc_addr = *src_addr;
}
- wqnp->func = func;
- wqnp->arg = ib_s;
+ wqnp->ib_str = ib_s;
wqnp->ifproto = (dst_addr->family == AF_INET) ?
ETHERTYPE_IP : ETHERTYPE_IPV6;
@@ -366,17 +88,6 @@ ibcm_arp_create_prwqn(ibcm_arp_streams_t *ib_s, ibt_ip_addr_t *dst_addr,
return (wqnp);
}
-/*
- * call the user function
- * called with lock held
- */
-static void
-ibcm_arp_pr_callback(ibcm_arp_prwqn_t *wqnp, int status)
-{
- IBTF_DPRINTF_L4(cmlog, "ibcm_arp_pr_callback(%p, %d)", wqnp, status);
-
- wqnp->func((void *)wqnp, status);
-}
/*
* Check if the interface is loopback or IB.
@@ -391,23 +102,24 @@ ibcm_arp_check_interface(ill_t *ill)
}
int
-ibcm_arp_pr_lookup(ibcm_arp_streams_t *ib_s, ibt_ip_addr_t *dst_addr,
- ibt_ip_addr_t *src_addr, ibcm_arp_pr_comp_func_t func)
+ibcm_resolver_pr_lookup(ibcm_arp_streams_t *ib_s, ibt_ip_addr_t *dst_addr,
+ ibt_ip_addr_t *src_addr)
{
ibcm_arp_prwqn_t *wqnp;
ire_t *ire = NULL;
- ire_t *src_ire = NULL;
- ipif_t *ipif;
- ill_t *ill, *hwaddr_ill = NULL;
+ ipif_t *ipif = NULL;
+ ill_t *ill = NULL;
+ ill_t *hwaddr_ill = NULL;
ip_stack_t *ipst;
int len;
+ ipaddr_t setsrcv4;
+ in6_addr_t setsrcv6;
IBCM_PRINT_IP("ibcm_arp_pr_lookup: SRC", src_addr);
IBCM_PRINT_IP("ibcm_arp_pr_lookup: DST", dst_addr);
- if ((wqnp = ibcm_arp_create_prwqn(ib_s, dst_addr,
- src_addr, func)) == NULL) {
- IBTF_DPRINTF_L2(cmlog, "ibcm_arp_pr_lookup: "
+ if ((wqnp = ibcm_arp_create_prwqn(ib_s, dst_addr, src_addr)) == NULL) {
+ IBTF_DPRINTF_L2(cmlog, "ibcm_resolver_pr_lookup: "
"ibcm_arp_create_prwqn failed");
ib_s->status = ENOMEM;
return (1);
@@ -416,86 +128,111 @@ ibcm_arp_pr_lookup(ibcm_arp_streams_t *ib_s, ibt_ip_addr_t *dst_addr,
ipst = netstack_find_by_zoneid(GLOBAL_ZONEID)->netstack_ip;
if (dst_addr->family == AF_INET) {
/*
- * Get the ire for the local address
+ * A local address is always specified, and it is used
+ * to find the zoneid.
*/
- IBTF_DPRINTF_L5(cmlog, "ibcm_arp_pr_lookup: ire_ctable_lookup");
- src_ire = ire_ctable_lookup(src_addr->un.ip4addr, NULL,
- IRE_LOCAL, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
- if (src_ire == NULL) {
- IBTF_DPRINTF_L2(cmlog, "ibcm_arp_pr_lookup: "
- "ire_ctable_lookup failed");
+ ipif = ipif_lookup_addr(src_addr->un.ip4addr, NULL, ALL_ZONES,
+ ipst);
+ if (ipif == NULL) {
+ IBTF_DPRINTF_L2(cmlog, "ibcm_resolver_pr_lookup: "
+ "ipif_lookup_addr failed");
ib_s->status = EFAULT;
goto fail;
}
- IBTF_DPRINTF_L5(cmlog, "ibcm_arp_pr_lookup: ire_ctable_lookup");
/*
- * get an ire for the destination address with the matching
- * source address
+ * get an ire for the destination address.
+ * Note that we can't use MATCH_IRE_ILL since that would
+ * require that the first ill we find have ire_ill set. Thus
+ * we compare ire_ill against ipif_ill after the lookup.
*/
- ire = ire_ftable_lookup(dst_addr->un.ip4addr, 0, 0, 0,
- src_ire->ire_ipif, 0, src_ire->ire_zoneid, 0, NULL,
- MATCH_IRE_SRC, ipst);
- if (ire == NULL) {
- IBTF_DPRINTF_L2(cmlog, "ibcm_arp_pr_lookup: "
- "ire_ftable_lookup failed");
+ setsrcv4 = INADDR_ANY;
+ ire = ire_route_recursive_v4(dst_addr->un.ip4addr, 0, NULL,
+ ipif->ipif_zoneid, NULL, MATCH_IRE_DSTONLY, B_TRUE, 0, ipst,
+ &setsrcv4, NULL, NULL);
+
+ ASSERT(ire != NULL);
+ if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
+ IBTF_DPRINTF_L2(cmlog, "ibcm_resolver_pr_lookup: "
+ "ire_route_recursive_v4 failed");
+ ib_s->status = EFAULT;
+ goto fail;
+ }
+ ill = ire_nexthop_ill(ire);
+ if (ill == NULL) {
+ IBTF_DPRINTF_L2(cmlog, "ibcm_resolver_pr_lookup: "
+ "ire_nexthop_ill failed");
+ ib_s->status = EFAULT;
+ goto fail;
+ }
+ if (ill != ipif->ipif_ill) {
+ IBTF_DPRINTF_L2(cmlog, "ibcm_resolver_pr_lookup: "
+ "wrong ill");
ib_s->status = EFAULT;
goto fail;
}
- IBTF_DPRINTF_L5(cmlog, "ibcm_arp_pr_lookup: ire_ftable_lookup:"
- "done");
-
- wqnp->gateway.un.ip4addr =
- ((ire->ire_gateway_addr == INADDR_ANY) ?
- ire->ire_addr : ire->ire_gateway_addr);
+ wqnp->gateway.un.ip4addr = ire->ire_gateway_addr;
wqnp->netmask.un.ip4addr = ire->ire_mask;
- wqnp->src_addr.un.ip4addr = ire->ire_src_addr;
+ wqnp->src_addr.un.ip4addr = src_addr->un.ip4addr;
wqnp->src_addr.family = wqnp->gateway.family =
wqnp->netmask.family = AF_INET;
} else if (dst_addr->family == AF_INET6) {
/*
- * Get the ire for the local address
+ * A local address is always specified, and it is used
+ * to find the zoneid.
+ * We should really match on scopeid for link locals here.
*/
- src_ire = ire_ctable_lookup_v6(&src_addr->un.ip6addr, NULL,
- IRE_LOCAL, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
- if (src_ire == NULL) {
- IBTF_DPRINTF_L2(cmlog, "ibcm_arp_pr_lookup: "
- "ire_ctable_lookup_v6 failed");
+ ipif = ipif_lookup_addr_v6(&src_addr->un.ip6addr, NULL,
+ ALL_ZONES, ipst);
+ if (ipif == NULL) {
+ IBTF_DPRINTF_L2(cmlog, "ibcm_resolver_pr_lookup: "
+ "ipif_lookup_addr_v6 failed");
ib_s->status = EFAULT;
goto fail;
}
- IBTF_DPRINTF_L5(cmlog, "ibcm_arp_pr_lookup: "
- "ire_ctable_lookup_v6: done");
/*
- * get an ire for the destination address with the matching
- * source address
+ * get an ire for the destination address.
+ * Note that we can't use MATCH_IRE_ILL since that would
+ * require that the first ill we find have ire_ill set. Thus
+ * we compare ire_ill against ipif_ill after the lookup.
*/
- ire = ire_ftable_lookup_v6(&dst_addr->un.ip6addr, 0, 0, 0,
- src_ire->ire_ipif, 0, src_ire->ire_zoneid, 0, NULL,
- MATCH_IRE_SRC, ipst);
- if (ire == NULL) {
- IBTF_DPRINTF_L2(cmlog, "ibcm_arp_pr_lookup: "
- "ire_ftable_lookup_v6 failed");
+ setsrcv6 = ipv6_all_zeros;
+ ire = ire_route_recursive_v6(&dst_addr->un.ip6addr, 0, NULL,
+ ipif->ipif_zoneid, NULL, MATCH_IRE_DSTONLY, B_TRUE, 0, ipst,
+ &setsrcv6, NULL, NULL);
+
+ ASSERT(ire != NULL);
+ if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
+ IBTF_DPRINTF_L2(cmlog, "ibcm_resolver_pr_lookup: "
+ "ire_route_recursive_v6 failed");
+ ib_s->status = EFAULT;
+ goto fail;
+ }
+ ill = ire_nexthop_ill(ire);
+ if (ill == NULL) {
+ IBTF_DPRINTF_L2(cmlog, "ibcm_resolver_pr_lookup: "
+ "ire_nexthop_ill failed");
+ ib_s->status = EFAULT;
+ goto fail;
+ }
+
+ if (ill != ipif->ipif_ill) {
+ IBTF_DPRINTF_L2(cmlog, "ibcm_resolver_pr_lookup: "
+ "wrong ill");
ib_s->status = EFAULT;
goto fail;
}
- IBTF_DPRINTF_L5(cmlog, "ibcm_arp_pr_lookup: "
- "ire_ftable_lookup_v6: done");
- wqnp->gateway.un.ip6addr =
- (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6) ?
- ire->ire_addr_v6 : ire->ire_gateway_addr_v6);
+ wqnp->gateway.un.ip6addr = ire->ire_gateway_addr_v6;
wqnp->netmask.un.ip6addr = ire->ire_mask_v6;
- wqnp->src_addr.un.ip6addr = ire->ire_src_addr_v6;
+ wqnp->src_addr.un.ip6addr = src_addr->un.ip6addr;
wqnp->src_addr.family = wqnp->gateway.family =
wqnp->netmask.family = AF_INET6;
}
- ipif = src_ire->ire_ipif;
- ill = ipif->ipif_ill;
(void) strlcpy(wqnp->ifname, ill->ill_name, sizeof (wqnp->ifname));
/*
@@ -504,18 +241,19 @@ ibcm_arp_pr_lookup(ibcm_arp_streams_t *ib_s, ibt_ip_addr_t *dst_addr,
*/
if (IS_IPMP(ill)) {
if ((hwaddr_ill = ipmp_ipif_hold_bound_ill(ipif)) == NULL) {
- IBTF_DPRINTF_L2(cmlog, "ibcm_arp_pr_lookup: no bound "
- "ill for IPMP interface %s", ill->ill_name);
+ IBTF_DPRINTF_L2(cmlog, "ibcm_resolver_pr_lookup: "
+ "no bound ill for IPMP interface %s",
+ ill->ill_name);
ib_s->status = EFAULT;
goto fail;
}
} else {
hwaddr_ill = ill;
- ill_refhold(hwaddr_ill); /* for symmetry */
+ ill_refhold(hwaddr_ill); /* for symmetry */
}
if ((ib_s->status = ibcm_arp_check_interface(hwaddr_ill)) != 0) {
- IBTF_DPRINTF_L2(cmlog, "ibcm_arp_pr_lookup: "
+ IBTF_DPRINTF_L2(cmlog, "ibcm_resolver_pr_lookup: "
"ibcm_arp_check_interface failed");
goto fail;
}
@@ -523,7 +261,7 @@ ibcm_arp_pr_lookup(ibcm_arp_streams_t *ib_s, ibt_ip_addr_t *dst_addr,
bcopy(hwaddr_ill->ill_phys_addr, &wqnp->src_mac,
hwaddr_ill->ill_phys_addr_length);
- IBTF_DPRINTF_L4(cmlog, "ibcm_arp_pr_lookup: outgoing if:%s",
+ IBTF_DPRINTF_L4(cmlog, "ibcm_resolver_pr_lookup: outgoing if:%s",
wqnp->ifname);
/*
@@ -534,8 +272,8 @@ ibcm_arp_pr_lookup(ibcm_arp_streams_t *ib_s, ibt_ip_addr_t *dst_addr,
len = (wqnp->usrc_addr.family == AF_INET) ?
IP_ADDR_LEN : sizeof (in6_addr_t);
if (bcmp(&wqnp->usrc_addr.un, &wqnp->src_addr.un, len)) {
- IBTF_DPRINTF_L2(cmlog, "ibcm_arp_pr_lookup: srcaddr "
- "mismatch:%d", ENETUNREACH);
+ IBTF_DPRINTF_L2(cmlog, "ibcm_resolver_pr_lookup: "
+ "srcaddr mismatch:%d", ENETUNREACH);
goto fail;
}
}
@@ -545,253 +283,77 @@ ibcm_arp_pr_lookup(ibcm_arp_streams_t *ib_s, ibt_ip_addr_t *dst_addr,
* interface, now get the destination mac address from
* arp or ipv6 drivers
*/
- if (wqnp->dst_addr.family == AF_INET) {
- if ((ib_s->status = ibcm_arp_squery_arp(wqnp)) != 0) {
- IBTF_DPRINTF_L2(cmlog, "ibcm_arp_pr_lookup: "
- "ibcm_arp_squery_arp failed: %d", ib_s->status);
- goto fail;
- }
- } else {
- if ((ib_s->status = ibcm_ipv6_lookup(wqnp, ill, getzoneid())) !=
- 0) {
- IBTF_DPRINTF_L2(cmlog, "ibcm_arp_pr_lookup: "
- "ibcm_ipv6_lookup failed: %d", ib_s->status);
- goto fail;
- }
+ ib_s->status = ibcm_nce_lookup(wqnp, ill, getzoneid());
+ if (ib_s->status != 0) {
+ IBTF_DPRINTF_L2(cmlog, "ibcm_resolver_pr_lookup: "
+ "ibcm_nce_lookup failed: %d", ib_s->status);
+ goto fail;
}
ill_refrele(hwaddr_ill);
- IRE_REFRELE(ire);
- IRE_REFRELE(src_ire);
+ ill_refrele(ill);
+ ire_refrele(ire);
+ ipif_refrele(ipif);
netstack_rele(ipst->ips_netstack);
- IBTF_DPRINTF_L4(cmlog, "ibcm_arp_pr_lookup: Return: 0x%p", wqnp);
+ IBTF_DPRINTF_L4(cmlog, "ibcm_resolver_pr_lookup: Return: 0x%p", wqnp);
return (0);
fail:
if (hwaddr_ill != NULL)
ill_refrele(hwaddr_ill);
+ if (ill != NULL)
+ ill_refrele(ill);
if (ire != NULL)
- IRE_REFRELE(ire);
- if (src_ire != NULL)
- IRE_REFRELE(src_ire);
- ibcm_arp_prwqn_delete(wqnp);
+ ire_refrele(ire);
+ if (ipif != NULL)
+ ipif_refrele(ipif);
+ ibcm_arp_delete_prwqn(wqnp);
netstack_rele(ipst->ips_netstack);
return (1);
}
/*
- * called from lrsrv.
- * process a AR_ENTRY_QUERY reply from arp
- * the message should be M_DATA -->> dl_unitdata_req
- */
-static void
-ibcm_arp_pr_arp_query_ack(mblk_t *mp)
-{
- ibcm_arp_prwqn_t *wqnp;
- dl_unitdata_req_t *dlreq;
- ibcm_arp_streams_t *ib_s;
- char *cp;
- int rc;
-
- IBTF_DPRINTF_L4(cmlog, "ibcm_arp_pr_arp_query_ack(%p)", mp);
-
- /*
- * the first mblk contains the wqnp pointer for the request
- */
- if (MBLKL(mp) != sizeof (void *)) {
- freemsg(mp);
- return;
- }
-
- wqnp = *(ibcm_arp_prwqn_t **)(void *)mp->b_rptr; /* retrieve wqnp */
- ib_s = (ibcm_arp_streams_t *)wqnp->arg;
-
- mutex_enter(&ib_s->lock);
-
- /*
- * cancel the timeout for this request
- */
- (void) untimeout(wqnp->timeout_id);
-
- /*
- * sanity checks on the dl_unitdata_req block
- */
- if (!mp->b_cont) {
- IBTF_DPRINTF_L2(cmlog, "areq_ack: b_cont = NULL\n");
- rc = EPROTO;
- goto user_callback;
- }
- if (MBLKL(mp->b_cont) < (sizeof (dl_unitdata_req_t) + IPOIB_ADDRL)) {
- IBTF_DPRINTF_L2(cmlog, "areq_ack: invalid len in "
- "dl_unitdatareq_t block\n");
- rc = EPROTO;
- goto user_callback;
- }
- dlreq = (void *)mp->b_cont->b_rptr;
- if (dlreq->dl_primitive != DL_UNITDATA_REQ) {
- IBTF_DPRINTF_L2(cmlog, "areq_ack: invalid dl_primitive "
- "in dl_unitdatareq_t block\n");
- rc = EPROTO;
- goto user_callback;
- }
- if (dlreq->dl_dest_addr_length != (IPOIB_ADDRL + 2)) {
- IBTF_DPRINTF_L2(cmlog, "areq_ack: invalid hw len in "
- "dl_unitdatareq_t block %d\n", dlreq->dl_dest_addr_length);
- rc = EPROTO;
- goto user_callback;
- }
- cp = (char *)mp->b_cont->b_rptr + dlreq->dl_dest_addr_offset;
- bcopy(cp, &wqnp->dst_mac, IPOIB_ADDRL);
-
- /*
- * at this point we have src/dst gid's derived from the mac addresses
- * now get the hca, port
- */
- bcopy(&wqnp->src_mac.ipoib_gidpref, &wqnp->sgid, sizeof (ib_gid_t));
- bcopy(&wqnp->dst_mac.ipoib_gidpref, &wqnp->dgid, sizeof (ib_gid_t));
- freemsg(mp);
-
- IBCM_H2N_GID(wqnp->sgid);
- IBCM_H2N_GID(wqnp->dgid);
-
- (void) ibcm_arp_add(wqnp);
-
- mutex_exit(&ib_s->lock);
- ibcm_arp_pr_callback(wqnp, 0);
-
- return;
-user_callback:
- freemsg(mp);
- mutex_exit(&ib_s->lock);
-
- /*
- * indicate to user
- */
- ibcm_arp_pr_callback(wqnp, rc);
-}
-
-/*
- * process a AR_ENTRY_SQUERY reply from arp
- * the message should be M_IOCACK -->> area_t
+ * Query the neighbor cache for IPv4/IPv6 to mac address mapping.
*/
-static void
-ibcm_arp_pr_arp_squery_ack(mblk_t *mp)
+static int
+ibcm_nce_lookup(ibcm_arp_prwqn_t *wqnp, ill_t *ill, zoneid_t zoneid)
{
- struct iocblk *ioc;
- mblk_t *mp1;
- ibcm_arp_prwqn_t *wqnp;
- ibcm_arp_streams_t *ib_s;
- area_t *areap;
- char *cp;
-
- IBTF_DPRINTF_L4(cmlog, "ibcm_arp_pr_arp_squery_ack(%p)", mp);
-
- if (MBLKL(mp) < sizeof (struct iocblk)) {
- freemsg(mp);
- return;
- }
-
- ioc = (void *)mp->b_rptr;
- if ((ioc->ioc_cmd != AR_ENTRY_SQUERY) || (mp->b_cont == NULL)) {
- freemsg(mp);
- return;
- }
-
- mp1 = mp->b_cont;
-
- wqnp = *(ibcm_arp_prwqn_t **)((uintptr_t)mp1->b_rptr -
- sizeof (uintptr_t));
- ib_s = (ibcm_arp_streams_t *)wqnp->arg;
-
- mutex_enter(&ib_s->lock);
-
- /*
- * cancel the timeout for this request
- */
- (void) untimeout(wqnp->timeout_id);
-
- /* If the entry was not in arp cache, ioc_error is set */
- if (ioc->ioc_error) {
-
- /*
- * send out AR_ENTRY_QUERY which would send
- * arp-request on wire
- */
- IBTF_DPRINTF_L3(cmlog, "Sending a Query_ARP");
-
- (void) ibcm_arp_query_arp(wqnp);
- freemsg(mp);
- mutex_exit(&ib_s->lock);
- return;
+ ip2mac_t ip2m;
+ sin_t *sin;
+ sin6_t *sin6;
+ ip2mac_id_t ip2mid;
+ int err;
+
+ if (wqnp->src_addr.family != wqnp->dst_addr.family) {
+ IBTF_DPRINTF_L2(cmlog, "ibcm_nce_lookup: Mis-match SRC_ADDR "
+ "Family: %d, DST_ADDR Family %d", wqnp->src_addr.family,
+ wqnp->dst_addr.family);
+ return (1);
}
+ bzero(&ip2m, sizeof (ip2m));
- areap = (void *)mp1->b_rptr;
- cp = (char *)areap + areap->area_hw_addr_offset;
- bcopy(cp, &wqnp->dst_mac, IPOIB_ADDRL);
-
- /*
- * at this point we have src/dst gid's derived from the mac addresses
- * now get the hca, port
- */
- bcopy(&wqnp->src_mac.ipoib_gidpref, &wqnp->sgid, sizeof (ib_gid_t));
- bcopy(&wqnp->dst_mac.ipoib_gidpref, &wqnp->dgid, sizeof (ib_gid_t));
- freemsg(mp);
-
- IBCM_H2N_GID(wqnp->sgid);
- IBCM_H2N_GID(wqnp->dgid);
-
- mutex_exit(&ib_s->lock);
- ibcm_arp_pr_callback(wqnp, 0);
-}
-
-/*
- * Process arp ack's.
- */
-void
-ibcm_arp_pr_arp_ack(mblk_t *mp)
-{
- IBTF_DPRINTF_L4(cmlog, "ibcm_arp_pr_arp_ack(0x%p, DB_TYPE %lX)",
- mp, DB_TYPE(mp));
-
- if (DB_TYPE(mp) == M_DATA) {
- ibcm_arp_pr_arp_query_ack(mp);
- } else if ((DB_TYPE(mp) == M_IOCACK) ||
- (DB_TYPE(mp) == M_IOCNAK)) {
- ibcm_arp_pr_arp_squery_ack(mp);
+ if (wqnp->dst_addr.family == AF_INET) {
+ sin = (sin_t *)&ip2m.ip2mac_pa;
+ sin->sin_family = AF_INET;
+ sin->sin_addr.s_addr = wqnp->dst_addr.un.ip4addr;
+ } else if (wqnp->dst_addr.family == AF_INET6) {
+ sin6 = (sin6_t *)&ip2m.ip2mac_pa;
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_addr = wqnp->dst_addr.un.ip6addr;
} else {
- freemsg(mp);
- }
-}
-
-/*
- * query the ipv6 driver cache for ipv6 to mac address mapping.
- */
-static int
-ibcm_ipv6_lookup(ibcm_arp_prwqn_t *wqnp, ill_t *ill, zoneid_t zoneid)
-{
- ip2mac_t ip2m;
- sin6_t *sin6;
- ip2mac_id_t ip2mid;
- int err;
-
- if (wqnp->src_addr.family != AF_INET6) {
- IBTF_DPRINTF_L2(cmlog, "ibcm_ipv6_lookup: SRC_ADDR NOT INET6: "
- "%d", wqnp->src_addr.family);
+ IBTF_DPRINTF_L2(cmlog, "ibcm_nce_lookup: Invalid DST_ADDR "
+ "Family: %d", wqnp->dst_addr.family);
return (1);
}
- bzero(&ip2m, sizeof (ip2m));
- sin6 = (sin6_t *)&ip2m.ip2mac_pa;
- sin6->sin6_family = AF_INET6;
- sin6->sin6_addr = wqnp->dst_addr.un.ip6addr;
ip2m.ip2mac_ifindex = ill->ill_phyint->phyint_ifindex;
wqnp->flags |= IBCM_ARP_PR_RESOLVE_PENDING;
+
/*
- * XXX XTBD set the scopeid?
* issue the request to IP for Neighbor Discovery
*/
- ip2mid = ip2mac(IP2MAC_RESOLVE, &ip2m, ibcm_ipv6_resolver_ack, wqnp,
+ ip2mid = ip2mac(IP2MAC_RESOLVE, &ip2m, ibcm_resolver_ack, wqnp,
zoneid);
err = ip2m.ip2mac_err;
if (err == EINPROGRESS) {
@@ -799,7 +361,7 @@ ibcm_ipv6_lookup(ibcm_arp_prwqn_t *wqnp, ill_t *ill, zoneid_t zoneid)
wqnp->flags |= IBCM_ARP_PR_RESOLVE_PENDING;
err = 0;
} else if (err == 0) {
- ibcm_ipv6_resolver_ack(&ip2m, wqnp);
+ ibcm_resolver_ack(&ip2m, wqnp);
}
return (err);
}
@@ -822,16 +384,16 @@ ibcm_check_sockdl(struct sockaddr_dl *sdl)
* If Address resolution was succesful: return GID info.
*/
static void
-ibcm_ipv6_resolver_ack(ip2mac_t *ip2macp, void *arg)
+ibcm_resolver_ack(ip2mac_t *ip2macp, void *arg)
{
ibcm_arp_prwqn_t *wqnp = (ibcm_arp_prwqn_t *)arg;
ibcm_arp_streams_t *ib_s;
uchar_t *cp;
int err = 0;
- IBTF_DPRINTF_L4(cmlog, "ibcm_ipv6_resolver_ack(%p, %p)", ip2macp, wqnp);
+ IBTF_DPRINTF_L4(cmlog, "ibcm_resolver_ack(%p, %p)", ip2macp, wqnp);
- ib_s = (ibcm_arp_streams_t *)wqnp->arg;
+ ib_s = wqnp->ib_str;
mutex_enter(&ib_s->lock);
if (ip2macp->ip2mac_err != 0) {
@@ -842,7 +404,7 @@ ibcm_ipv6_resolver_ack(ip2mac_t *ip2macp, void *arg)
}
if (!ibcm_check_sockdl(&ip2macp->ip2mac_ha)) {
- IBTF_DPRINTF_L2(cmlog, "ibcm_ipv6_resolver_ack: Error: "
+ IBTF_DPRINTF_L2(cmlog, "ibcm_resolver_ack: Error: "
"interface %s is not IB\n", wqnp->ifname);
err = EHOSTUNREACH;
goto user_callback;
@@ -862,6 +424,11 @@ ibcm_ipv6_resolver_ack(ip2mac_t *ip2macp, void *arg)
IBCM_H2N_GID(wqnp->dgid);
user_callback:
+
+ ib_s->status = err;
+ ib_s->done = B_TRUE;
+
+ /* lock is held by the caller. */
+ cv_signal(&ib_s->cv);
mutex_exit(&ib_s->lock);
- ibcm_arp_pr_callback(wqnp, err);
}
diff --git a/usr/src/uts/common/io/mac/mac_util.c b/usr/src/uts/common/io/mac/mac_util.c
index 0d342fdd93..88468b353e 100644
--- a/usr/src/uts/common/io/mac/mac_util.c
+++ b/usr/src/uts/common/io/mac/mac_util.c
@@ -476,7 +476,7 @@ mac_ip_hdr_length_v6(mblk_t *mp, ip6_t *ip6h, uint16_t *hdr_length,
endptr = mp->b_wptr;
if (((uchar_t *)ip6h + IPV6_HDR_LEN) > endptr)
return (B_FALSE);
- ASSERT((IPH_HDR_VERSION(ip6h) & ~IP_FORWARD_PROG_BIT) == IPV6_VERSION);
+ ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION);
length = IPV6_HDR_LEN;
whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */
diff --git a/usr/src/uts/common/io/softmac/softmac_dev.c b/usr/src/uts/common/io/softmac/softmac_dev.c
index 23f43ced0b..eeb09fcb0b 100644
--- a/usr/src/uts/common/io/softmac/softmac_dev.c
+++ b/usr/src/uts/common/io/softmac/softmac_dev.c
@@ -146,6 +146,9 @@ static struct modlinkage softmac_modlinkage = {
NULL
};
+static void softmac_dedicated_rx(void *, mac_resource_handle_t, mblk_t *,
+ mac_header_info_t *);
+
/*ARGSUSED*/
static int
softmac_upper_constructor(void *buf, void *arg, int kmflag)
@@ -367,7 +370,8 @@ softmac_mod_rput(queue_t *rq, mblk_t *mp)
if (dlp->dl_primitive == DL_UNITDATA_IND) {
if ((rxinfo = slp->sl_rxinfo) != NULL) {
- rxinfo->slr_rx(rxinfo->slr_arg, NULL, mp, NULL);
+ softmac_dedicated_rx(slp->sl_sup, NULL, mp,
+ NULL);
break;
}
diff --git a/usr/src/uts/common/io/softmac/softmac_fp.c b/usr/src/uts/common/io/softmac/softmac_fp.c
index 7a10aa68b7..2fc66e9bd3 100644
--- a/usr/src/uts/common/io/softmac/softmac_fp.c
+++ b/usr/src/uts/common/io/softmac/softmac_fp.c
@@ -674,9 +674,12 @@ softmac_wput_single_nondata(softmac_upper_t *sup, mblk_t *mp)
t_uscalar_t prim;
dbtype = DB_TYPE(mp);
+ sup->su_is_arp = 0;
switch (dbtype) {
- case M_IOCTL:
- case M_CTL: {
+ case M_CTL:
+ sup->su_is_arp = 1;
+ /* FALLTHROUGH */
+ case M_IOCTL: {
uint32_t expected_mode;
if (((struct iocblk *)(mp->b_rptr))->ioc_cmd != SIOCSLIFNAME)
@@ -1132,7 +1135,10 @@ softmac_datapath_switch(softmac_t *softmac, boolean_t disable, boolean_t admin)
break;
req->ssq_expected_mode = expected_mode;
-
+ if (sup->su_is_arp) {
+ list_insert_tail(&reqlist, req);
+ continue;
+ }
/*
* Allocate the DL_NOTE_REPLUMB message.
*/
@@ -1174,18 +1180,19 @@ softmac_datapath_switch(softmac_t *softmac, boolean_t disable, boolean_t admin)
*/
for (sup = list_head(&softmac->smac_sup_list); sup != NULL;
sup = list_next(&softmac->smac_sup_list, sup)) {
- mp = head->b_next;
- head->b_next = NULL;
-
+ if (!sup->su_is_arp) {
+ mp = head->b_next;
+ head->b_next = NULL;
+ softmac_wput_nondata(sup, head);
+ head = mp;
+ }
/*
- * Add the swtich request to the requests list of the stream.
+ * Add the switch request to the requests list of the stream.
*/
req = list_head(&reqlist);
ASSERT(req != NULL);
list_remove(&reqlist, req);
list_insert_tail(&sup->su_req_list, req);
- softmac_wput_nondata(sup, head);
- head = mp;
}
mutex_exit(&softmac->smac_fp_mutex);
diff --git a/usr/src/uts/common/io/stream.c b/usr/src/uts/common/io/stream.c
index b23036e9c5..658735b784 100644
--- a/usr/src/uts/common/io/stream.c
+++ b/usr/src/uts/common/io/stream.c
@@ -1605,7 +1605,9 @@ pullupmsg(mblk_t *mp, ssize_t len)
ASSERT(bp->b_datap->db_ref > 0);
ASSERT(bp->b_wptr >= bp->b_rptr);
n = MIN(bp->b_wptr - bp->b_rptr, len);
- bcopy(bp->b_rptr, mp->b_wptr, (size_t)n);
+ ASSERT(n >= 0); /* allow zero-length mblk_t's */
+ if (n > 0)
+ bcopy(bp->b_rptr, mp->b_wptr, (size_t)n);
mp->b_wptr += n;
bp->b_rptr += n;
len -= n;
diff --git a/usr/src/uts/common/io/strplumb.c b/usr/src/uts/common/io/strplumb.c
index f43648fd7f..473f7bc72e 100644
--- a/usr/src/uts/common/io/strplumb.c
+++ b/usr/src/uts/common/io/strplumb.c
@@ -53,17 +53,6 @@
#include <sys/esunddi.h>
#include <sys/promif.h>
-#include <netinet/in.h>
-#include <netinet/ip6.h>
-#include <netinet/icmp6.h>
-#include <netinet/sctp.h>
-#include <inet/common.h>
-#include <inet/ip.h>
-#include <inet/ip6.h>
-#include <inet/tcp.h>
-#include <inet/sctp_ip.h>
-#include <inet/udp_impl.h>
-
#include <sys/strlog.h>
#include <sys/log.h>
#include <sys/ethernet.h>
@@ -222,104 +211,6 @@ strplumb_init(void)
return (0);
}
-static int
-strplumb_autopush(void)
-{
- major_t maj;
- minor_t min;
- char *mods[5];
- uint_t anchor = 1;
- int err;
-
- min = (minor_t)-1;
- mods[1] = NULL;
-
- /*
- * ARP
- */
- DBG0("setting up arp autopush\n");
-
- mods[0] = ARP;
-
- maj = ddi_name_to_major(ARP);
- if ((err = kstr_autopush(SET_AUTOPUSH, &maj, &min, NULL, &anchor,
- mods)) != 0) {
- printf("strplumb: kstr_autopush(SET/ARP) failed: %d\n", err);
- return (err);
- }
-
- return (0);
-}
-
-static int
-strplumb_sctpq(ldi_ident_t li)
-{
- ldi_handle_t lh = NULL;
- int err;
- int rval;
-
- DBG0("configuring SCTP default queue\n");
-
- if ((err = ldi_open_by_name(SCTP6DEV, FREAD|FWRITE, CRED(), &lh,
- li)) != 0) {
- printf("strplumb: open of SCTP6DEV failed: %d\n", err);
- return (err);
- }
-
- if ((err = ldi_ioctl(lh, SCTP_IOC_DEFAULT_Q, (intptr_t)0, FKIOCTL,
- CRED(), &rval)) != 0) {
- printf("strplumb: failed to set SCTP default queue: %d\n",
- err);
- (void) ldi_close(lh, FREAD|FWRITE, CRED());
- return (err);
- }
-
- return (0);
-}
-
-static int
-strplumb_tcpq(ldi_ident_t li)
-{
- ldi_handle_t lh = NULL;
- ldi_handle_t ip_lh = NULL;
- int err;
- int rval;
-
- DBG0("configuring TCP default queue\n");
-
- /*
- * We open IP6DEV here because we need to have it open to in
- * order to open TCP6DEV successfully.
- */
- if ((err = ldi_open_by_name(IP6DEV, FREAD|FWRITE, CRED(), &ip_lh,
- li)) != 0) {
- printf("strplumb: open of IP6DEV failed: %d\n", err);
- return (err);
- }
-
- /*
- * We set the tcp default queue to IPv6 because IPv4 falls back to
- * IPv6 when it can't find a client, but IPv6 does not fall back to
- * IPv4.
- */
- if ((err = ldi_open_by_name(TCP6DEV, FREAD|FWRITE, CRED(), &lh,
- li)) != 0) {
- printf("strplumb: open of TCP6DEV failed: %d\n", err);
- goto done;
- }
-
- if ((err = ldi_ioctl(lh, TCP_IOC_DEFAULT_Q, (intptr_t)0, FKIOCTL,
- CRED(), &rval)) != 0) {
- printf("strplumb: failed to set TCP default queue: %d\n",
- err);
- goto done;
- }
-
-done:
- (void) ldi_close(ip_lh, FREAD|FWRITE, CRED());
- return (err);
-}
-
/*
* Can be set in /etc/system in the case of local booting. See comment below.
*/
@@ -447,11 +338,8 @@ strplumb_dev(ldi_ident_t li)
/*
* Now set up the links. Ultimately, we should have two streams
- * permanently linked underneath UDP (which is actually IP with UDP
- * autopushed). One stream consists of the ARP-[ifname] combination,
- * while the other consists of ARP-IP-[ifname]. The second combination
- * seems a little weird, but is linked underneath UDP just to keep it
- * around.
+ * permanently linked under UDP. One stream consists of the
+ * ARP-[ifname] combination, while the other consists of IP-[ifname].
*
* We pin underneath UDP here to match what is done in ifconfig(1m);
* otherwise, ifconfig will be unable to unplumb the stream (the major
@@ -462,7 +350,7 @@ strplumb_dev(ldi_ident_t li)
*/
/*
- * Plumb UDP-ARP-IP-<dev>
+ * Plumb UDP-IP-<dev>
*/
if ((err = ldi_open_by_name(rootfs.bo_devname, FREAD|FWRITE, CRED(),
@@ -494,12 +382,6 @@ strplumb_dev(ldi_ident_t li)
lifr.lifr_flags &= ~IFF_IPV4;
name = UDP6DEV;
}
- if ((err = ldi_ioctl(lh, I_PUSH, (intptr_t)ARP, FKIOCTL, CRED(),
- &rval)) != 0) {
- printf("strplumb: push ARP failed: %d\n", err);
- goto done;
- }
-
(void) strlcpy(lifr.lifr_name, rootfs.bo_ifname,
sizeof (lifr.lifr_name));
lifr.lifr_ppa = rootfs.bo_ppa;
@@ -507,29 +389,17 @@ strplumb_dev(ldi_ident_t li)
if ((err = setifname(lh, &lifr)) != 0)
goto done;
- /* Get the flags and check if ARP is needed */
+ /* get the flags and check if ARP is needed */
if ((err = getifflags(lh, &lifr)) != 0) {
printf("strplumb: getifflags %s IP failed, error %d\n",
lifr.lifr_name, err);
goto done;
}
-
- /* Pop out ARP if not needed */
- if (lifr.lifr_flags & (IFF_NOARP | IFF_IPV6)) {
- err = ldi_ioctl(lh, I_POP, (intptr_t)0, FKIOCTL, CRED(),
- &rval);
- if (err != 0) {
- printf("strplumb: pop ARP failed, error %d\n", err);
- goto done;
- }
- }
-
if ((err = ldi_open_by_name(name, FREAD|FWRITE, CRED(), &mux_lh,
li)) != 0) {
printf("strplumb: open of %s failed: %d\n", name, err);
goto done;
}
-
if ((err = ldi_ioctl(mux_lh, I_PLINK, (intptr_t)lh,
FREAD|FWRITE|FNOCTTY|FKIOCTL, CRED(),
&(ifr.ifr_ip_muxid))) != 0) {
@@ -538,9 +408,9 @@ strplumb_dev(ldi_ident_t li)
goto done;
}
- if (af == AF_INET6) {
+ /* if ARP is not needed, we are done */
+ if (lifr.lifr_flags & (IFF_NOARP | IFF_IPV6))
goto done;
- }
DBG2("UDP-ARP-IP-%s muxid: %d\n", rootfs.bo_ifname, ifr.ifr_ip_muxid);
@@ -610,22 +480,9 @@ strplumb(void)
if ((err = strplumb_init()) != 0)
return (err);
- if ((err = strplumb_autopush()) != 0)
- return (err);
-
if ((err = ldi_ident_from_mod(&modlinkage, &li)) != 0)
return (err);
- /*
- * Setup the TCP and SCTP default queues for the global stack.
- * tcp/sctp_stack_init will do this for additional stack instances.
- */
- if ((err = strplumb_sctpq(li)) != 0)
- goto done;
-
- if ((err = strplumb_tcpq(li)) != 0)
- goto done;
-
if ((err = resolve_boot_path()) != 0)
goto done;
diff --git a/usr/src/uts/common/io/tl.c b/usr/src/uts/common/io/tl.c
index 7ddb24cddb..83f8cf6944 100644
--- a/usr/src/uts/common/io/tl.c
+++ b/usr/src/uts/common/io/tl.c
@@ -452,7 +452,7 @@ opdes_t tl_opt_arr[] = {
OA_R,
OA_R,
OP_NP,
- OP_PASSNEXT,
+ 0,
sizeof (t_scalar_t),
0
},
@@ -462,7 +462,7 @@ opdes_t tl_opt_arr[] = {
OA_RW,
OA_RW,
OP_NP,
- OP_PASSNEXT,
+ 0,
sizeof (int),
0
}
@@ -867,7 +867,7 @@ static void tl_fill_option(uchar_t *, cred_t *, pid_t, int, cred_t *);
static int tl_default_opt(queue_t *, int, int, uchar_t *);
static int tl_get_opt(queue_t *, int, int, uchar_t *);
static int tl_set_opt(queue_t *, uint_t, int, int, uint_t, uchar_t *, uint_t *,
- uchar_t *, void *, cred_t *, mblk_t *);
+ uchar_t *, void *, cred_t *);
static void tl_memrecover(queue_t *, mblk_t *, size_t);
static void tl_freetip(tl_endpt_t *, tl_icon_t *);
static void tl_free(tl_endpt_t *);
@@ -904,7 +904,6 @@ optdb_obj_t tl_opt_obj = {
tl_default_opt, /* TL default value function pointer */
tl_get_opt, /* TL get function pointer */
tl_set_opt, /* TL set function pointer */
- B_TRUE, /* TL is tpi provider */
TL_OPT_ARR_CNT, /* TL option database count of entries */
tl_opt_arr, /* TL option database */
TL_VALID_LEVELS_CNT, /* TL valid level count of entries */
@@ -2789,12 +2788,10 @@ tl_optmgmt(queue_t *wq, mblk_t *mp)
* call common option management routine from drv/ip
*/
if (prim->type == T_SVR4_OPTMGMT_REQ) {
- (void) svr4_optcom_req(wq, mp, cr, &tl_opt_obj,
- B_FALSE);
+ svr4_optcom_req(wq, mp, cr, &tl_opt_obj);
} else {
ASSERT(prim->type == T_OPTMGMT_REQ);
- (void) tpi_optcom_req(wq, mp, cr, &tl_opt_obj,
- B_FALSE);
+ tpi_optcom_req(wq, mp, cr, &tl_opt_obj);
}
}
@@ -6066,8 +6063,7 @@ tl_set_opt(
uint_t *outlenp,
uchar_t *outvalp,
void *thisdg_attrs,
- cred_t *cr,
- mblk_t *mblk)
+ cred_t *cr)
{
int error;
tl_endpt_t *tep;
diff --git a/usr/src/uts/common/io/warlock/ibcm.wlcmd b/usr/src/uts/common/io/warlock/ibcm.wlcmd
index b4ae04a925..e66149c4fd 100644
--- a/usr/src/uts/common/io/warlock/ibcm.wlcmd
+++ b/usr/src/uts/common/io/warlock/ibcm.wlcmd
@@ -66,11 +66,7 @@ root ibt_get_src_ip
root ibt_ofuvcm_get_req_data
root ibt_ofuvcm_proceed
-root ibcm_arp_timeout
root ibcm_arp_get_srcip_plist
-root ibcm_arp_lrput
-root ibcm_arp_lwsrv
-root ibcm_arp_lrsrv
root ibcm_arp_get_ibd_insts_cb
# callback entry points from ibmf
diff --git a/usr/src/uts/common/ipp/dlcosmk/dlcosmk.c b/usr/src/uts/common/ipp/dlcosmk/dlcosmk.c
index 27eaaba86f..c827fb9e82 100644
--- a/usr/src/uts/common/ipp/dlcosmk/dlcosmk.c
+++ b/usr/src/uts/common/ipp/dlcosmk/dlcosmk.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/stream.h>
#include <sys/dlpi.h>
@@ -88,8 +86,8 @@ dlcosmk_process(mblk_t **mpp, dlcosmk_data_t *dlcosmk_data, uint32_t ill_index,
}
if ((ill_index == 0) ||
- ((ill = ill_lookup_on_ifindex_global_instance(ill_index, B_FALSE,
- NULL, NULL, NULL, NULL)) == NULL)) {
+ ((ill = ill_lookup_on_ifindex_global_instance(ill_index,
+ B_FALSE)) == NULL)) {
dlcosmk2dbg(("dlcosmk_process:invalid ill index %u\n",
ill_index));
atomic_add_64(&dlcosmk_data->ipackets, 1);
diff --git a/usr/src/uts/common/ipp/ipgpc/classifierddi.c b/usr/src/uts/common/ipp/ipgpc/classifierddi.c
index 4d31da6396..e76c181d92 100644
--- a/usr/src/uts/common/ipp/ipgpc/classifierddi.c
+++ b/usr/src/uts/common/ipp/ipgpc/classifierddi.c
@@ -445,10 +445,9 @@ ipgpc_invoke_action(ipp_action_id_t aid, ipp_packet_t *packet)
pkt.direction = callout_pos; /* set packet direction */
/* The ill_index could be 0 when called from forwarding (read) path */
- if (ill_idx > 0) {
- ill = ill_lookup_on_ifindex_global_instance(ill_idx, B_FALSE,
- NULL, NULL, NULL, NULL);
- }
+ if (ill_idx > 0)
+ ill = ill_lookup_on_ifindex_global_instance(ill_idx, B_FALSE);
+
if (ill != NULL) {
/*
* Since all IPP actions in an IPMP group are performed
diff --git a/usr/src/uts/common/ktli/t_kutil.c b/usr/src/uts/common/ktli/t_kutil.c
index cfd153d873..ab762403fd 100644
--- a/usr/src/uts/common/ktli/t_kutil.c
+++ b/usr/src/uts/common/ktli/t_kutil.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -36,8 +36,6 @@
* contributors.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* Contains the following utility functions:
* tli_send:
@@ -230,7 +228,7 @@ t_kadvise(TIUSER *tiptr, uchar_t *addr, int addr_len)
bzero(ipid, sizeof (*ipid));
ipid->ipid_cmd = IP_IOC_IRE_DELETE_NO_REPLY;
- ipid->ipid_ire_type = IRE_CACHE;
+ ipid->ipid_ire_type = 0;
ipid->ipid_addr_offset = sizeof (ipid_t);
ipid->ipid_addr_length = addr_len;
diff --git a/usr/src/uts/common/net/route.h b/usr/src/uts/common/net/route.h
index 3e4307f25e..9c004b74b1 100644
--- a/usr/src/uts/common/net/route.h
+++ b/usr/src/uts/common/net/route.h
@@ -130,7 +130,8 @@ struct rtentry {
#define RTF_PROTO1 0x8000 /* protocol specific routing flag */
#define RTF_MULTIRT 0x10000 /* multiroute */
#define RTF_SETSRC 0x20000 /* set default outgoing src address */
-
+#define RTF_INDIRECT 0x40000 /* gateway not directly reachable */
+#define RTF_KERNEL 0x80000 /* created by kernel; can't delete */
/*
* OLD statistics not used by the kernel. The kernel uses <inet/mib2.h>.
diff --git a/usr/src/uts/common/netinet/in.h b/usr/src/uts/common/netinet/in.h
index fc2c750ba7..c1166fc34f 100644
--- a/usr/src/uts/common/netinet/in.h
+++ b/usr/src/uts/common/netinet/in.h
@@ -888,6 +888,7 @@ struct sockaddr_in6 {
*/
#define IP_PKTINFO 0x1a /* specify src address and/or index */
#define IP_RECVPKTINFO 0x1a /* recv dest/matched addr and index */
+#define IP_DONTFRAG 0x1b /* don't fragment packets */
#if !defined(_XPG4_2) || defined(__EXTENSIONS__)
/*
diff --git a/usr/src/uts/common/netinet/ip_mroute.h b/usr/src/uts/common/netinet/ip_mroute.h
index 8a658a0fca..b1dde41b1f 100644
--- a/usr/src/uts/common/netinet/ip_mroute.h
+++ b/usr/src/uts/common/netinet/ip_mroute.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -19,17 +18,16 @@
*
* CDDL HEADER END
*/
+
/*
- * Copyright 1991, 1997-1999, 2001, 2003 Sun Microsystems, Inc.
- * All rights reserved. Use is subject to license terms.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
#ifndef _NETINET_IP_MROUTE_H
#define _NETINET_IP_MROUTE_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -188,6 +186,7 @@ struct vif {
uint_t v_refcnt;
uchar_t v_marks;
kmutex_t v_lock;
+ ilm_t *v_ilm; /* allmulti join */
};
/*
diff --git a/usr/src/uts/common/os/ip_cksum.c b/usr/src/uts/common/os/ip_cksum.c
index 722c793b79..1fa1c9425b 100644
--- a/usr/src/uts/common/os/ip_cksum.c
+++ b/usr/src/uts/common/os/ip_cksum.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -93,9 +93,6 @@ ip_cksum(mblk_t *mp, int offset, uint_t sum)
#endif
ASSERT(dp);
- TRACE_2(TR_FAC_IP, TR_IP_CKSUM_START,
- "ip_cksum_start:%p (%X)", mp, sum);
-
if (mp->b_cont == NULL) {
/*
* May be fast-path, only one mblk.
@@ -277,9 +274,6 @@ slow1:
mlen = mp->b_wptr - (uchar_t *)w;
}
- TRACE_2(TR_FAC_IP, TR_IP_CKSUM_START,
- "ip_cksum_start:%p (%X)", mp, sum)
-
mp = mp->b_cont;
if (mlen > 0 && pmlen == -1) {
/*
diff --git a/usr/src/uts/common/os/strsubr.c b/usr/src/uts/common/os/strsubr.c
index 76ce1af025..22bdc86e03 100644
--- a/usr/src/uts/common/os/strsubr.c
+++ b/usr/src/uts/common/os/strsubr.c
@@ -8474,9 +8474,7 @@ hcksum_retrieve(mblk_t *mp, multidata_t *mmd, pdesc_t *pd,
ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_MULTIDATA);
if (mp->b_datap->db_type == M_DATA) {
if (flags != NULL) {
- *flags = DB_CKSUMFLAGS(mp) & (HCK_IPV4_HDRCKSUM |
- HCK_PARTIALCKSUM | HCK_FULLCKSUM |
- HCK_FULLCKSUM_OK);
+ *flags = DB_CKSUMFLAGS(mp) & HCK_FLAGS;
if ((*flags & (HCK_PARTIALCKSUM |
HCK_FULLCKSUM)) != 0) {
if (value != NULL)
diff --git a/usr/src/uts/common/sys/dld.h b/usr/src/uts/common/sys/dld.h
index 9542a15a8e..ed80269fbc 100644
--- a/usr/src/uts/common/sys/dld.h
+++ b/usr/src/uts/common/sys/dld.h
@@ -395,7 +395,8 @@ typedef struct dld_capab_poll_s {
/*
* Currently supported flags for LSO.
*/
-#define DLD_LSO_TX_BASIC_TCP_IPV4 0x01 /* TCP LSO capability */
+#define DLD_LSO_BASIC_TCP_IPV4 0x01 /* TCP LSO over IPv4 capability */
+#define DLD_LSO_BASIC_TCP_IPV6 0x02 /* TCP LSO over IPv6 capability */
typedef struct dld_capab_lso_s {
uint_t lso_flags; /* capability flags */
diff --git a/usr/src/uts/common/sys/dlpi.h b/usr/src/uts/common/sys/dlpi.h
index 8b0681e2d8..6b3a5801d7 100644
--- a/usr/src/uts/common/sys/dlpi.h
+++ b/usr/src/uts/common/sys/dlpi.h
@@ -593,10 +593,6 @@ union DL_qos_types {
/* dl_data is dl_capab_id_t */
#define DL_CAPAB_HCKSUM 0x01 /* Checksum offload */
/* dl_data is dl_capab_hcksum_t */
-#define DL_CAPAB_IPSEC_AH 0x02 /* IPsec AH acceleration */
- /* dl_data is dl_capab_ipsec_t */
-#define DL_CAPAB_IPSEC_ESP 0x03 /* IPsec ESP acceleration */
- /* dl_data is dl_capab_ipsec_t */
#define DL_CAPAB_MDT 0x04 /* Multidata Transmit capability */
/* dl_data is dl_capab_mdt_t */
#define DL_CAPAB_ZEROCOPY 0x05 /* Zero-copy capability */
@@ -611,45 +607,8 @@ typedef struct {
} dl_capability_sub_t;
/*
- * Definitions and structures needed for DL_CONTROL_REQ and DL_CONTROL_ACK
- * primitives.
- * Extensible message to send down control information to the DLS provider.
- * The response is a DL_CONTROL_ACK or DL_ERROR_ACK.
- *
- * Different types of control operations will define different format for the
- * key and data fields. ADD requires key and data fields; if the <type, key>
- * matches an already existing entry a DL_ERROR_ACK will be returned. DELETE
- * requires a key field; if the <type, key> does not exist, a DL_ERROR_ACK
- * will be returned. FLUSH requires neither a key nor data; it
- * unconditionally removes all entries for the specified type. GET requires a
- * key field; the get operation returns the data for the <type, key>. If
- * <type, key> doesn't exist a DL_ERROR_ACK is returned. UPDATE requires key
- * and data fields; if <type, key> doesn't exist a DL_ERROR_ACK is returned.
- */
-
-/*
- * Control operations
- */
-#define DL_CO_ADD 0x01 /* Add new entry matching for <type,key> */
-#define DL_CO_DELETE 0x02 /* Delete the entry matching <type,key> */
-#define DL_CO_FLUSH 0x03 /* Purge all entries of <type> */
-#define DL_CO_GET 0x04 /* Get the data for the <type,key> */
-#define DL_CO_UPDATE 0x05 /* Update the data for <type,key> */
-#define DL_CO_SET 0x06 /* Add or update as appropriate */
-
-/*
- * Control types (dl_type field of dl_control_req_t and dl_control_ack_t)
- */
-#define DL_CT_IPSEC_AH 0x01 /* AH; key=spi,dest_addr; */
- /* data=keying material */
-#define DL_CT_IPSEC_ESP 0x02 /* ESP; key=spi,des_taddr; */
- /* data=keying material */
-
-/*
* Module ID token to be included in new sub-capability structures.
- * Existing sub-capabilities lacking an identification token, e.g. IPSEC
- * hardware acceleration, need to be encapsulated within the ID sub-
- * capability. Access to this structure must be done through
+ * Access to this structure must be done through
* dlcapab{set,check}qid().
*/
typedef struct {
diff --git a/usr/src/uts/common/sys/ib/mgt/ibcm/ibcm_arp.h b/usr/src/uts/common/sys/ib/mgt/ibcm/ibcm_arp.h
index c307ed7575..e0b7e1e1e7 100644
--- a/usr/src/uts/common/sys/ib/mgt/ibcm/ibcm_arp.h
+++ b/usr/src/uts/common/sys/ib/mgt/ibcm/ibcm_arp.h
@@ -31,24 +31,11 @@ extern "C" {
#endif
#include <sys/ib/mgt/ibcm/ibcm_impl.h>
-#include <sys/modhash.h>
#include <sys/ib/clients/ibd/ibd.h>
-#include <sys/strsun.h>
-#include <sys/socket.h>
-#include <sys/stat.h> /* for S_IFCHR */
#include <inet/ip2mac.h>
#include <inet/ip6.h>
-/*
- * IPoIB addr lookup completion function
- */
-typedef int (*ibcm_arp_pr_comp_func_t) (void *usr_arg, int status);
-
#define IBCM_ARP_MAX_IFNAME_LEN 24
-#define IBCM_ARP_XMIT_COUNT 6
-#define IBCM_ARP_XMIT_INTERVAL 1000 /* timeout in milliseconds */
-#define IBCM_ARP_TIMEOUT \
- ((IBCM_ARP_XMIT_COUNT + 1) * IBCM_ARP_XMIT_INTERVAL)
#define IBCM_H2N_GID(gid) \
{ \
@@ -68,9 +55,7 @@ typedef int (*ibcm_arp_pr_comp_func_t) (void *usr_arg, int status);
* Path record wait queue node definition
*/
typedef struct ibcm_arp_prwqn {
- ibcm_arp_pr_comp_func_t func; /* user callback function */
- void *arg; /* callback function arg */
- timeout_id_t timeout_id;
+ struct ibcm_arp_streams_s *ib_str;
uint8_t flags;
ibt_ip_addr_t usrc_addr; /* user supplied src address */
ibt_ip_addr_t dst_addr; /* user supplied dest address */
@@ -89,15 +74,11 @@ typedef struct ibcm_arp_prwqn {
typedef struct ibcm_arp_streams_s {
kmutex_t lock;
kcondvar_t cv;
- queue_t *arpqueue;
- vnode_t *arp_vp;
int status;
boolean_t done;
ibcm_arp_prwqn_t *wqnp;
} ibcm_arp_streams_t;
-/* GID to IP-Addr and Ip-Addr to GID look-up functions. */
-
#define IBCM_ARP_IBD_INSTANCES 4
typedef struct ibcm_arp_ip_s {
diff --git a/usr/src/uts/common/sys/iphada.h b/usr/src/uts/common/sys/iphada.h
deleted file mode 100644
index 9d1a6e28e8..0000000000
--- a/usr/src/uts/common/sys/iphada.h
+++ /dev/null
@@ -1,144 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2002-2003 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SYS_IPHADA_H
-#define _SYS_IPHADA_H
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define DA_ICV_MAX_LEN 128 /* max ICV length [bytes] */
-
-/*
- * iphada.h header for IP Hardware Acceleration Data Attributes
- *
- * This is a contract private interface for use by the Sun
- * Hardware Accelerated Ethernet driver ONLY.
- */
-typedef struct da_ipsec {
- int da_type; /* M_CTL message ident */
- int da_flag;
- uint32_t da_icv_len; /* da_icv length in bytes */
- uchar_t da_icv[DA_ICV_MAX_LEN]; /* ICV for AH or ESP+auth */
-} da_ipsec_t;
-
-#define IPHADA_M_CTL 0xA1D53DE5u
-
-/*
- * IPSec algorithms capabilities (cip_data in dl_capab_ipsec_t)
- */
-typedef struct {
- t_uscalar_t alg_type;
- t_uscalar_t alg_prim; /* algorithm primitive */
- t_uscalar_t alg_thruput; /* approx throughput metric in Mb/s */
- t_uscalar_t alg_flag; /* flags */
- t_uscalar_t alg_minbits; /* minimum key len in bits */
- t_uscalar_t alg_maxbits; /* maximum key len in bits */
- t_uscalar_t alg_incrbits; /* key len increment in bits */
-} dl_capab_ipsec_alg_t;
-
-/*
- * IPSec sub-capability (follows dl_capability_sub_t)
- */
-typedef struct {
- t_uscalar_t cip_version; /* interface version */
- t_uscalar_t cip_nciphers; /* number ciphers supported */
- dl_capab_ipsec_alg_t cip_data[1]; /* data */
-} dl_capab_ipsec_t;
-
-/*
- * Algorithm types (alg_type field of dl_capab_ipsec_alg_t)
- */
-#define DL_CAPAB_IPSEC_ALG_AUTH 0x01 /* authentication alg. */
-#define DL_CAPAB_IPSEC_ALG_ENCR 0x02 /* encryption alg. */
-
-/* alg_prim ciphers */
-#define DL_CAPAB_IPSEC_ENCR_DES 0x02
-#define DL_CAPAB_IPSEC_ENCR_3DES 0x03
-#define DL_CAPAB_IPSEC_ENCR_BLOWFISH 0x07
-#define DL_CAPAB_IPSEC_ENCR_NULL 0x0b /* no encryption */
-#define DL_CAPAB_IPSEC_ENCR_AES 0x0c
-
-/* alg_prim authentications */
-#define DL_CAPAB_IPSEC_AUTH_NONE 0x00 /* no authentication */
-#define DL_CAPAB_IPSEC_AUTH_MD5HMAC 0x02
-#define DL_CAPAB_IPSEC_AUTH_SHA1HMAC 0x03
-
-/* alg_flag values */
-#define DL_CAPAB_ALG_ENABLE 0x01 /* enable this algorithm */
-
-/*
- * For DL_CT_IPSEC_AH and DL_CT_IPSEC_ESP, the optional dl_key data
- * that follows the dl_control_req_t or dl_control_ack_t will be the IPsec
- * SPI (Security Parameters Index) value and the destination address.
- * This is defined as being unique per protocol.
- */
-
-#define DL_CTL_IPSEC_ADDR_LEN 16 /* IP addr length in bytes */
-
-typedef struct dl_ct_ipsec_key {
- uint32_t dl_key_spi; /* Security Parameters Index value */
- uchar_t dl_key_dest_addr[DL_CTL_IPSEC_ADDR_LEN]; /* dest IP address */
- uint32_t dl_key_addr_family; /* family of dest IP address */
- /* (AF_INET or AF_INET6) */
-} dl_ct_ipsec_key_t;
-
-#define DL_CT_IPSEC_MAX_KEY_LEN 512 /* max key length in bytes */
-
-/*
- * Possible flags for sadb_sa_flags.
- */
-#define DL_CT_IPSEC_INBOUND 0x01 /* SA can be used for inbound pkts */
-#define DL_CT_IPSEC_OUTBOUND 0x02 /* SA can be used for outbound pkts */
-
-/*
- * minimal SADB entry content
- * fields are defined as per RFC 2367 and <net/pfkeyv2.h>
- * This defines the content and format of the dl_data portion of
- * the dl_control_req_t or dl_control_ack_t.
- */
-typedef struct dl_ct_ipsec {
- uint8_t sadb_sa_auth; /* Authentication algorithm */
- uint8_t sadb_sa_encrypt; /* Encryption algorithm */
- uint32_t sadb_sa_flags; /* SA flags. */
- uint16_t sadb_key_len_a; /* auth key length in bytes */
- uint16_t sadb_key_bits_a; /* auth key length in bits */
- uint16_t sadb_key_data_a[DL_CT_IPSEC_MAX_KEY_LEN]; /* key data */
- uint16_t sadb_key_len_e; /* encr key length in bytes */
- uint16_t sadb_key_bits_e; /* encr key length in bits */
- uint16_t sadb_key_data_e[DL_CT_IPSEC_MAX_KEY_LEN]; /* key data */
-} dl_ct_ipsec_t;
-
-
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_IPHADA_H */
diff --git a/usr/src/uts/common/sys/pattr.h b/usr/src/uts/common/sys/pattr.h
index cac046d675..f3b8397681 100644
--- a/usr/src/uts/common/sys/pattr.h
+++ b/usr/src/uts/common/sys/pattr.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_PATTR_H
#define _SYS_PATTR_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -92,6 +90,9 @@ typedef struct pattr_hcksum_s {
/* check the attached h/w computed */
/* checksum value to determine if */
/* checksum was bad */
+
+#define HCK_FLAGS (HCK_IPV4_HDRCKSUM | HCK_PARTIALCKSUM | \
+ HCK_FULLCKSUM | HCK_FULLCKSUM_OK)
/*
* Extended hardware offloading flags that also use hcksum_flags
*/
diff --git a/usr/src/uts/common/sys/softmac_impl.h b/usr/src/uts/common/sys/softmac_impl.h
index eb71063bc7..bd94d4982e 100644
--- a/usr/src/uts/common/sys/softmac_impl.h
+++ b/usr/src/uts/common/sys/softmac_impl.h
@@ -301,7 +301,9 @@ typedef struct softmac_upper_s {
uint32_t su_bound : 1, /* SL */
su_active : 1, /* SL */
- su_direct : 1; /* SL */
+ su_direct : 1, /* SL */
+ su_is_arp : 1,
+ su_pad_to_32:28;
/*
* Used for fastpath data path.
diff --git a/usr/src/uts/common/sys/squeue.h b/usr/src/uts/common/sys/squeue.h
index a2d808f647..de0f18bd4d 100644
--- a/usr/src/uts/common/sys/squeue.h
+++ b/usr/src/uts/common/sys/squeue.h
@@ -44,21 +44,19 @@ typedef struct squeue_s squeue_t;
(mp)->b_prev = (mblk_t *)(arg); \
}
-#define GET_SQUEUE(mp) ((conn_t *)((mp)->b_prev))->conn_sqp
-
#define SQ_FILL 0x0001
#define SQ_NODRAIN 0x0002
#define SQ_PROCESS 0x0004
-#define SQUEUE_ENTER(sqp, head, tail, cnt, flag, tag) { \
- sqp->sq_enter(sqp, head, tail, cnt, flag, tag); \
+#define SQUEUE_ENTER(sqp, head, tail, cnt, ira, flag, tag) { \
+ sqp->sq_enter(sqp, head, tail, cnt, ira, flag, tag); \
}
-#define SQUEUE_ENTER_ONE(sqp, mp, proc, arg, flag, tag) { \
+#define SQUEUE_ENTER_ONE(sqp, mp, proc, arg, ira, flag, tag) { \
ASSERT(mp->b_next == NULL); \
ASSERT(mp->b_prev == NULL); \
SET_SQUEUE(mp, proc, arg); \
- SQUEUE_ENTER(sqp, mp, mp, 1, flag, tag); \
+ SQUEUE_ENTER(sqp, mp, mp, 1, ira, flag, tag); \
}
/*
@@ -77,12 +75,13 @@ typedef enum {
SQPRIVATE_MAX
} sqprivate_t;
+struct ip_recv_attr_s;
extern void squeue_init(void);
extern squeue_t *squeue_create(clock_t, pri_t);
extern void squeue_bind(squeue_t *, processorid_t);
extern void squeue_unbind(squeue_t *);
extern void squeue_enter(squeue_t *, mblk_t *, mblk_t *,
- uint32_t, int, uint8_t);
+ uint32_t, struct ip_recv_attr_s *, int, uint8_t);
extern uintptr_t *squeue_getprivate(squeue_t *, sqprivate_t);
struct conn_s;
diff --git a/usr/src/uts/common/sys/squeue_impl.h b/usr/src/uts/common/sys/squeue_impl.h
index bd934cc0b3..22550886eb 100644
--- a/usr/src/uts/common/sys/squeue_impl.h
+++ b/usr/src/uts/common/sys/squeue_impl.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -79,9 +79,9 @@ typedef struct squeue_set_s {
processorid_t sqs_cpuid;
} squeue_set_t;
-typedef void (*sqproc_t)(void *, mblk_t *, void *);
+typedef void (*sqproc_t)(void *, mblk_t *, void *, struct ip_recv_attr_s *);
typedef void (*sq_enter_proc_t)(squeue_t *, mblk_t *, mblk_t *, uint32_t,
- int, uint8_t);
+ struct ip_recv_attr_s *, int, uint8_t);
typedef void (*sq_drain_proc_t)(squeue_t *, uint_t, hrtime_t);
extern void squeue_worker_wakeup(squeue_t *);
diff --git a/usr/src/uts/common/sys/stream.h b/usr/src/uts/common/sys/stream.h
index b9c96a8345..7a3b4e3448 100644
--- a/usr/src/uts/common/sys/stream.h
+++ b/usr/src/uts/common/sys/stream.h
@@ -404,9 +404,6 @@ typedef struct bcache {
#define STRUIO_IP 0x04 /* IP checksum stored in db_struioun */
#define STRUIO_ZC 0x08 /* mblk eligible for zero-copy */
#define STRUIO_ZCNOTIFY 0x10 /* notify stream head when mblk acked */
-#define STRUIO_EAGER 0x20 /* new eager; db_cksumstart has squeue to use */
-#define STRUIO_POLICY 0x40 /* new eager when IPsec is enabled */
-#define STRUIO_CONNECT 0x80 /* conn did a connect */
/*
* Message flags. These are interpreted by the stream head.
@@ -418,8 +415,7 @@ typedef struct bcache {
/* UNUSED 0x08 was MSGNOGET (can be recycled) */
#define MSGMARKNEXT 0x10 /* Private: first byte of next msg marked */
#define MSGNOTMARKNEXT 0x20 /* Private: ... not marked */
-#define MSGHASREF 0x40 /* Private: message has reference to owner */
-#define MSGWAITSYNC 0x80 /* Private: waiting for sync squeue enter */
+#define MSGWAITSYNC 0x40 /* Private: waiting for sync squeue enter */
/*
* Streams message types.
diff --git a/usr/src/uts/common/sys/tsol/tnet.h b/usr/src/uts/common/sys/tsol/tnet.h
index 221f4c775a..0da65ae5ca 100644
--- a/usr/src/uts/common/sys/tsol/tnet.h
+++ b/usr/src/uts/common/sys/tsol/tnet.h
@@ -46,35 +46,30 @@ extern "C" {
extern int tsol_tnrh_chk(tsol_tpent_t *, bslabel_t *, int);
extern tsol_tnrhc_t *find_rhc(const void *, uchar_t, boolean_t);
-extern int tsol_check_dest(const cred_t *, const void *, uchar_t, uint_t,
- cred_t **);
-extern int tsol_compute_label(const cred_t *, ipaddr_t, uchar_t *,
- ip_stack_t *);
-extern int tsol_compute_label_v6(const cred_t *, const in6_addr_t *, uchar_t *,
- ip_stack_t *);
-extern int tsol_check_label(const cred_t *, mblk_t **, uint_t,
- ip_stack_t *, pid_t);
-extern int tsol_check_label_v6(const cred_t *, mblk_t **, uint_t,
- ip_stack_t *, pid_t);
+extern int tsol_check_dest(const ts_label_t *, const void *, uchar_t,
+ uint_t, boolean_t, ts_label_t **);
+extern int tsol_compute_label_v4(const ts_label_t *, zoneid_t, ipaddr_t,
+ uchar_t *, ip_stack_t *);
+extern int tsol_compute_label_v6(const ts_label_t *, zoneid_t,
+ const in6_addr_t *, uchar_t *, ip_stack_t *);
+extern int tsol_check_label_v4(const ts_label_t *, zoneid_t, mblk_t **,
+ uint_t, boolean_t, ip_stack_t *, ts_label_t **);
+extern int tsol_check_label_v6(const ts_label_t *, zoneid_t, mblk_t **,
+ uint_t, boolean_t, ip_stack_t *, ts_label_t **);
extern int tsol_prepend_option(uchar_t *, ipha_t *, int);
extern int tsol_prepend_option_v6(uchar_t *, ip6_t *, int);
extern int tsol_remove_secopt(ipha_t *, int);
extern int tsol_remove_secopt_v6(ip6_t *, int);
-extern int tsol_update_sticky(ip6_pkt_t *, uint_t *, const uchar_t *);
-extern int tsol_update_options(uchar_t **, uint_t *, uint_t *,
- const uchar_t *);
-extern boolean_t tsol_option_set(uchar_t **, uint_t *, uint_t, const uchar_t *,
- uint_t);
extern tsol_ire_gw_secattr_t *ire_gw_secattr_alloc(int);
extern void ire_gw_secattr_free(tsol_ire_gw_secattr_t *);
-extern boolean_t tsol_can_reply_error(const mblk_t *);
+extern boolean_t tsol_can_reply_error(const mblk_t *, ip_recv_attr_t *);
extern boolean_t tsol_receive_local(const mblk_t *, const void *, uchar_t,
- boolean_t, const conn_t *);
-extern boolean_t tsol_can_accept_raw(mblk_t *, boolean_t);
-extern boolean_t tsol_get_pkt_label(mblk_t *, int);
-extern zoneid_t tsol_packet_to_zoneid(const mblk_t *);
+ ip_recv_attr_t *, const conn_t *);
+extern boolean_t tsol_can_accept_raw(mblk_t *, ip_recv_attr_t *, boolean_t);
+extern boolean_t tsol_get_pkt_label(mblk_t *, int, ip_recv_attr_t *);
+extern zoneid_t tsol_attr_to_zoneid(const ip_recv_attr_t *);
extern boolean_t tsol_get_option_v4(mblk_t *, tsol_ip_label_t *, uint8_t **);
extern boolean_t tsol_get_option_v6(mblk_t *, tsol_ip_label_t *, uint8_t **);
@@ -83,8 +78,8 @@ extern boolean_t tsol_find_secopt_v6(const uchar_t *, uint_t, uchar_t **,
extern int tsol_ire_match_gwattr(ire_t *, const ts_label_t *);
extern int tsol_rtsa_init(rt_msghdr_t *, tsol_rtsecattr_t *, caddr_t);
-extern int tsol_ire_init_gwattr(ire_t *, uchar_t, tsol_gc_t *, tsol_gcgrp_t *);
-extern mblk_t *tsol_ip_forward(ire_t *, mblk_t *);
+extern int tsol_ire_init_gwattr(ire_t *, uchar_t, tsol_gc_t *);
+extern mblk_t *tsol_ip_forward(ire_t *, mblk_t *, const ip_recv_attr_t *);
extern uint32_t tsol_pmtu_adjust(mblk_t *, uint32_t, int, int);
extern mlp_type_t tsol_mlp_addr_type(zoneid_t, uchar_t, const void *,
diff --git a/usr/src/uts/intel/Makefile.intel.shared b/usr/src/uts/intel/Makefile.intel.shared
index f1ceb0257e..6b20559ef4 100644
--- a/usr/src/uts/intel/Makefile.intel.shared
+++ b/usr/src/uts/intel/Makefile.intel.shared
@@ -371,7 +371,6 @@ DRV_KMODS += pppt
DRV_KMODS += ncall nsctl sdbc nskern sv
DRV_KMODS += ii rdc rdcsrv rdcstub
DRV_KMODS += iptun
-DRV_KMODS += iptunq
#
# Don't build some of these for OpenSolaris, since they will be
diff --git a/usr/src/uts/intel/arp/Makefile b/usr/src/uts/intel/arp/Makefile
index aff11806da..9b91950434 100644
--- a/usr/src/uts/intel/arp/Makefile
+++ b/usr/src/uts/intel/arp/Makefile
@@ -21,11 +21,9 @@
#
# uts/intel/arp/Makefile
#
-# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-# ident "%Z%%M% %I% %E% SMI"
-#
# This makefile drives the production of the arp driver kernel module.
#
# intel implementation architecture dependent
@@ -68,7 +66,7 @@ INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOTLINK) $(ROOT_CONFFILE)
#
# depends on ip
#
-LDFLAGS += -dy -Ndrv/ip -Ndrv/hook -Nmisc/neti
+LDFLAGS += -dy -Ndrv/ip
#
# For now, disable these lint checks; maintainers should endeavor
diff --git a/usr/src/uts/intel/arp/arp.global-objs.debug64 b/usr/src/uts/intel/arp/arp.global-objs.debug64
index 7f826ea213..f936276753 100644
--- a/usr/src/uts/intel/arp/arp.global-objs.debug64
+++ b/usr/src/uts/intel/arp/arp.global-objs.debug64
@@ -23,15 +23,6 @@
# Use is subject to license terms.
#
-ar_cmd_tbl
-ar_m_tbl
-arp_mod_info
-arp_no_defense
-arpinfo
-arprinit
-arpwinit
-arp_param_arr
-arp_netinfo
cb_inet_devops
fsw
inet_dev_info
diff --git a/usr/src/uts/intel/ia32/ml/modstubs.s b/usr/src/uts/intel/ia32/ml/modstubs.s
index 6cd415a78f..3837728d4c 100644
--- a/usr/src/uts/intel/ia32/ml/modstubs.s
+++ b/usr/src/uts/intel/ia32/ml/modstubs.s
@@ -509,7 +509,6 @@ fcnname/**/_info: \
MODULE(ipsecah,drv);
WSTUB(ipsecah, ipsec_construct_inverse_acquire, nomod_zero);
WSTUB(ipsecah, sadb_acquire, nomod_zero);
- WSTUB(ipsecah, sadb_ill_download, nomod_zero);
WSTUB(ipsecah, ipsecah_algs_changed, nomod_zero);
WSTUB(ipsecah, sadb_alg_update, nomod_zero);
WSTUB(ipsecah, sadb_unlinkassoc, nomod_zero);
@@ -1294,8 +1293,6 @@ fcnname/**/_info: \
STUB(iptun, iptun_create, nomod_einval);
STUB(iptun, iptun_delete, nomod_einval);
STUB(iptun, iptun_set_policy, nomod_void) ;
- STUB(iptun, iptun_set_g_q, nomod_einval);
- STUB(iptun, iptun_clear_g_q, nomod_void);
END_MODULE(iptun);
#endif
diff --git a/usr/src/uts/intel/ip/ip.global-objs.debug64 b/usr/src/uts/intel/ip/ip.global-objs.debug64
index 6009f5b006..07e9aaedde 100644
--- a/usr/src/uts/intel/ip/ip.global-objs.debug64
+++ b/usr/src/uts/intel/ip/ip.global-objs.debug64
@@ -23,19 +23,24 @@
# Use is subject to license terms.
#
+arp_m_tbl
+arp_mod_info
+arp_netinfo
+arp_no_defense
+arpinfo
cb_inet_devops
cl_inet_bind
+cl_inet_checkspi
cl_inet_connect2
+cl_inet_deletespi
cl_inet_disconnect
+cl_inet_getspi
+cl_inet_idlesa
cl_inet_ipident
cl_inet_isclusterwide
cl_inet_listen
cl_inet_unbind
cl_inet_unlisten
-cl_inet_getspi
-cl_inet_checkspi
-cl_inet_deletespi
-cl_inet_idlesa
cl_sctp_assoc_change
cl_sctp_check_addrs
cl_sctp_connect
@@ -43,6 +48,7 @@ cl_sctp_disconnect
cl_sctp_listen
cl_sctp_unlisten
conn_drain_nthreads
+dce_cache
default_ip6_asp_table
do_tcp_fusion
do_tcpzcopy
@@ -97,74 +103,45 @@ ill_no_arena
ill_null
inet_dev_info
inet_devops
-ip6_area_template
-ip6_ared_template
-ip6_cache_table_size
ip6_ftable_hash_size
-ip6_ire_max_bucket_cnt
-ip6_ire_min_bucket_cnt
-ip6_max_cache_table_size
ip6opt_ls
-ip_ard_template
-ip_area_template
-ip_ared_template
-ip_areq_template
-ip_arma_multi_template
-ip_aroff_template
-ip_aron_template
-ip_aru_template
-ip_cache_table_size
ip_cgtp_filter_rev
ip_conn_cache
ip_debug
ip_g_all_ones
-ip_helper_stream_cache
ip_helper_stream_info
ip_helper_stream_rinit
ip_helper_stream_winit
ip_ioctl_ftbl
-ip_ire_cleanup_cnt
-ip_ire_cpu_ratio
-ip_ire_max_bucket_cnt
-ip_ire_mem_ratio
-ip_ire_min_bucket_cnt
-ip_loopback_mtu
ip_loopback_mtu_v6plus
ip_loopback_mtuplus
ip_m_tbl
-ip_max_cache_table_size
ip_max_frag_dups
ip_min_frag_prune_time
-ip_minor_arena_sa
ip_minor_arena_la
+ip_minor_arena_sa
ip_misc_ioctl_count
ip_misc_ioctl_table
ip_mod_info
ip_modclose_ackwait_ms
ip_ndx_ioctl_count
ip_ndx_ioctl_table
-ip_opt_arr
-ip_opt_obj
ip_poll_normal_ms
ip_poll_normal_ticks
ip_rput_pullups
ip_six_byte_all_ones
ip_squeue_create_callback
ip_squeue_enter
-ip_squeue_enter_unbound
ip_squeue_fanout
ip_squeue_flag
ip_squeue_worker_wait
ip_thread_data
ip_thread_list
ip_thread_rwlock
-ip_use_helper_cache
-ip_wput_frag_mdt_min
ipcl_bind_fanout_size
ipcl_conn_hash_maxsize
ipcl_conn_hash_memfactor
ipcl_conn_hash_size
-ipcl_debug_level
ipcl_iptun_fanout_size
ipcl_raw_fanout_size
ipcl_udp_fanout_size
@@ -174,24 +151,16 @@ ipinfov4
ipinfov6
iplrinit
iplwinit
-ipmp_aract_template
-ipmp_ardeact_template
ipmp_kstats
iprinitv4
iprinitv6
ipsec_action_cache
ipsec_hdr_pullup_needed
-ipsec_info_cache
ipsec_pol_cache
ipsec_policy_failure_msgs
ipsec_sel_cache
ipsec_spd_hashsize
ipsec_weird_null_inbound_policy
-ipsechw_debug
-iptunq_info
-iptunq_modinfo
-iptunq_rinit
-iptunq_winit
ipv4_forward_suffix
ipv4info
ipv6_all_hosts_mcast
@@ -199,29 +168,22 @@ ipv6_all_ones
ipv6_all_rtrs_mcast
ipv6_all_v2rtrs_mcast
ipv6_all_zeros
-ipv6_areq_template
ipv6_forward_suffix
ipv6_ll_template
ipv6_loopback
ipv6_solicited_node_mcast
ipv6_unspecified_group
ipv6info
-ipwinitv4
-ipwinitv6
+ipwinit
ire_cache
ire_gw_secattr_cache
-ire_idle_cutoff_interval
ire_null
ire_nv_arr
ire_nv_tbl
-ire_uinfo_null
lcl_ndp_arr
lcl_param_arr
lcl_sctp_param_arr
lcl_sctp_wroff_xtra_param
-lcl_tcp_mdt_head_param
-lcl_tcp_mdt_max_pbufs_param
-lcl_tcp_mdt_tail_param
lcl_tcp_param_arr
lcl_tcp_wroff_xtra_param
mask_rnhead
@@ -230,6 +192,8 @@ modldrv
modlinkage
modlstrmod
multicast_encap_iphdr
+nce_cache
+ncec_cache
netdev_privs
prov_update_handle
radix_mask_cache
@@ -238,6 +202,7 @@ rawip_conn_cache
recvq_call
recvq_loop_cnt
req_arr
+rinit_arp
rn_mkfreelist
rn_ones
rn_zeros
@@ -260,25 +225,23 @@ sctp_kmem_faddr_cache
sctp_kmem_ftsn_set_cache
sctp_kmem_set_cache
sctp_mod_info
+sctp_opt_arr
+sctp_opt_arr_size
sctp_recvq_tq_task_max
sctp_recvq_tq_task_min
sctp_recvq_tq_thr_max
sctp_recvq_tq_thr_min
sctp_sin6_null
-sctp_taskq
sctpdebug
sctpinfo
sctprinit
sctpwinit
-sendq_collision
-sendq_empty
-sendq_loop_cnt
sin6_null
sin_null
skip_sctp_cksum
-sock_tcp_downcalls
-sock_rts_downcalls
sock_rawip_downcalls
+sock_rts_downcalls
+sock_tcp_downcalls
sock_udp_downcalls
sqset_global_list
sqset_global_size
@@ -300,12 +263,10 @@ tcp_g_statistics
tcp_g_t_info_ack
tcp_g_t_info_ack_v6
tcp_icmp_source_quench
-tcp_iphc_cache
tcp_max_optsize
-tcp_mdt_chain
-tcp_mdt_smss_threshold
tcp_opt_arr
tcp_opt_obj
+tcp_outbound_squeue_switch
tcp_random_anon_port
tcp_random_end_ptr
tcp_random_fptr
@@ -321,13 +282,11 @@ tcp_sock_winit
tcp_squeue_flag
tcp_squeue_wput
tcp_static_maxpsz
-tcp_taskq
tcp_timercache
tcp_tx_pull_len
tcp_valid_levels_arr
tcp_winfo
tcp_winit
-tcp_outbound_squeue_switch
tcpinfov4
tcpinfov6
tli_errs
@@ -352,4 +311,6 @@ udp_valid_levels_arr
udp_winit
udpinfov4
udpinfov6
-zero_info
+winit_arp
+eri_cksum_workaround
+nxge_cksum_workaround
diff --git a/usr/src/uts/intel/ip/ip.global-objs.obj64 b/usr/src/uts/intel/ip/ip.global-objs.obj64
index 1706a82aa7..526e907ab5 100644
--- a/usr/src/uts/intel/ip/ip.global-objs.obj64
+++ b/usr/src/uts/intel/ip/ip.global-objs.obj64
@@ -23,19 +23,24 @@
# Use is subject to license terms.
#
+arp_m_tbl
+arp_mod_info
+arp_netinfo
+arp_no_defense
+arpinfo
cb_inet_devops
cl_inet_bind
+cl_inet_checkspi
cl_inet_connect2
+cl_inet_deletespi
cl_inet_disconnect
+cl_inet_getspi
+cl_inet_idlesa
cl_inet_ipident
cl_inet_isclusterwide
cl_inet_listen
cl_inet_unbind
cl_inet_unlisten
-cl_inet_getspi
-cl_inet_checkspi
-cl_inet_deletespi
-cl_inet_idlesa
cl_sctp_assoc_change
cl_sctp_check_addrs
cl_sctp_connect
@@ -43,6 +48,7 @@ cl_sctp_disconnect
cl_sctp_listen
cl_sctp_unlisten
conn_drain_nthreads
+dce_cache
default_ip6_asp_table
do_tcp_fusion
do_tcpzcopy
@@ -97,69 +103,41 @@ ill_no_arena
ill_null
inet_dev_info
inet_devops
-ip6_area_template
-ip6_ared_template
-ip6_cache_table_size
ip6_ftable_hash_size
-ip6_ire_max_bucket_cnt
-ip6_ire_min_bucket_cnt
-ip6_max_cache_table_size
ip6opt_ls
-ip_ard_template
-ip_area_template
-ip_ared_template
-ip_areq_template
-ip_arma_multi_template
-ip_aroff_template
-ip_aron_template
-ip_aru_template
-ip_cache_table_size
ip_cgtp_filter_rev
ip_conn_cache
ip_debug
ip_g_all_ones
-ip_helper_stream_cache
ip_helper_stream_info
ip_helper_stream_rinit
ip_helper_stream_winit
ip_ioctl_ftbl
-ip_ire_cleanup_cnt
-ip_ire_cpu_ratio
-ip_ire_max_bucket_cnt
-ip_ire_mem_ratio
-ip_ire_min_bucket_cnt
-ip_loopback_mtu
ip_loopback_mtu_v6plus
ip_loopback_mtuplus
ip_m_tbl
-ip_max_cache_table_size
ip_max_frag_dups
ip_min_frag_prune_time
-ip_minor_arena_sa
ip_minor_arena_la
+ip_minor_arena_sa
ip_misc_ioctl_count
ip_misc_ioctl_table
ip_mod_info
ip_modclose_ackwait_ms
ip_ndx_ioctl_count
ip_ndx_ioctl_table
-ip_opt_arr
-ip_opt_obj
ip_poll_normal_ms
ip_poll_normal_ticks
ip_rput_pullups
ip_six_byte_all_ones
ip_squeue_create_callback
ip_squeue_enter
-ip_squeue_enter_unbound
ip_squeue_fanout
ip_squeue_flag
ip_squeue_worker_wait
ip_thread_data
ip_thread_list
ip_thread_rwlock
-ip_use_helper_cache
-ip_wput_frag_mdt_min
ipcl_bind_fanout_size
ipcl_conn_hash_maxsize
ipcl_conn_hash_memfactor
@@ -173,23 +151,16 @@ ipinfov4
ipinfov6
iplrinit
iplwinit
-ipmp_aract_template
-ipmp_ardeact_template
ipmp_kstats
iprinitv4
iprinitv6
ipsec_action_cache
ipsec_hdr_pullup_needed
-ipsec_info_cache
ipsec_pol_cache
ipsec_policy_failure_msgs
ipsec_sel_cache
ipsec_spd_hashsize
ipsec_weird_null_inbound_policy
-iptunq_info
-iptunq_modinfo
-iptunq_rinit
-iptunq_winit
ipv4_forward_suffix
ipv4info
ipv6_all_hosts_mcast
@@ -197,29 +168,22 @@ ipv6_all_ones
ipv6_all_rtrs_mcast
ipv6_all_v2rtrs_mcast
ipv6_all_zeros
-ipv6_areq_template
ipv6_forward_suffix
ipv6_ll_template
ipv6_loopback
ipv6_solicited_node_mcast
ipv6_unspecified_group
ipv6info
-ipwinitv4
-ipwinitv6
+ipwinit
ire_cache
ire_gw_secattr_cache
-ire_idle_cutoff_interval
ire_null
ire_nv_arr
ire_nv_tbl
-ire_uinfo_null
lcl_ndp_arr
lcl_param_arr
lcl_sctp_param_arr
lcl_sctp_wroff_xtra_param
-lcl_tcp_mdt_head_param
-lcl_tcp_mdt_max_pbufs_param
-lcl_tcp_mdt_tail_param
lcl_tcp_param_arr
lcl_tcp_wroff_xtra_param
mask_rnhead
@@ -228,12 +192,15 @@ modldrv
modlinkage
modlstrmod
multicast_encap_iphdr
+nce_cache
+ncec_cache
netdev_privs
prov_update_handle
radix_mask_cache
radix_node_cache
rawip_conn_cache
req_arr
+rinit_arp
rn_mkfreelist
rn_ones
rn_zeros
@@ -256,21 +223,22 @@ sctp_kmem_faddr_cache
sctp_kmem_ftsn_set_cache
sctp_kmem_set_cache
sctp_mod_info
+sctp_opt_arr
+sctp_opt_arr_size
sctp_recvq_tq_task_max
sctp_recvq_tq_task_min
sctp_recvq_tq_thr_max
sctp_recvq_tq_thr_min
sctp_sin6_null
-sctp_taskq
sctpdebug
sctpinfo
sctprinit
sctpwinit
sin6_null
sin_null
-sock_tcp_downcalls
-sock_rts_downcalls
sock_rawip_downcalls
+sock_rts_downcalls
+sock_tcp_downcalls
sock_udp_downcalls
sqset_global_list
sqset_global_size
@@ -292,12 +260,10 @@ tcp_g_statistics
tcp_g_t_info_ack
tcp_g_t_info_ack_v6
tcp_icmp_source_quench
-tcp_iphc_cache
tcp_max_optsize
-tcp_mdt_chain
-tcp_mdt_smss_threshold
tcp_opt_arr
tcp_opt_obj
+tcp_outbound_squeue_switch
tcp_random_anon_port
tcp_random_end_ptr
tcp_random_fptr
@@ -313,13 +279,11 @@ tcp_sock_winit
tcp_squeue_flag
tcp_squeue_wput
tcp_static_maxpsz
-tcp_taskq
tcp_timercache
tcp_tx_pull_len
tcp_valid_levels_arr
tcp_winfo
tcp_winit
-tcp_outbound_squeue_switch
tcpinfov4
tcpinfov6
tli_errs
@@ -344,4 +308,6 @@ udp_valid_levels_arr
udp_winit
udpinfov4
udpinfov6
-zero_info
+winit_arp
+eri_cksum_workaround
+nxge_cksum_workaround
diff --git a/usr/src/uts/sparc/Makefile.sparc.shared b/usr/src/uts/sparc/Makefile.sparc.shared
index 7aa463978d..873557cbd6 100644
--- a/usr/src/uts/sparc/Makefile.sparc.shared
+++ b/usr/src/uts/sparc/Makefile.sparc.shared
@@ -205,7 +205,7 @@ DRV_KMODS += aggr arp audio bl bofi clone cn conskbd consms cpuid
DRV_KMODS += crypto cryptoadm devinfo dump
DRV_KMODS += dtrace fasttrap fbt lockstat profile sdt systrace dcpc
DRV_KMODS += fssnap icmp icmp6 ip ip6 ipnet ipsecah
-DRV_KMODS += ipsecesp iptun iptunq iwscn keysock kmdb kstat ksyms llc1
+DRV_KMODS += ipsecesp iptun iwscn keysock kmdb kstat ksyms llc1
DRV_KMODS += lofi
DRV_KMODS += log logindmux kssl mm nca physmem pm poll pool
DRV_KMODS += pseudo ptc ptm pts ptsl ramdisk random rsm rts sad
diff --git a/usr/src/uts/sparc/arp/Makefile b/usr/src/uts/sparc/arp/Makefile
index 21c26c762e..6d1610da66 100644
--- a/usr/src/uts/sparc/arp/Makefile
+++ b/usr/src/uts/sparc/arp/Makefile
@@ -20,11 +20,9 @@
#
#
# uts/sparc/arp/Makefile
-# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-#ident "%Z%%M% %I% %E% SMI"
-#
# This makefile drives the production of the arp driver kernel module.
#
# sparc architecture dependent
@@ -72,7 +70,7 @@ CFLAGS += $(CCVERBOSE)
#
# depends on ip
#
-LDFLAGS += -dy -Ndrv/ip -Ndrv/hook -Nmisc/neti
+LDFLAGS += -dy -Ndrv/ip
#
# For now, disable these lint checks; maintainers should endeavor
diff --git a/usr/src/uts/sparc/arp/arp.global-objs.debug64 b/usr/src/uts/sparc/arp/arp.global-objs.debug64
index 7f826ea213..f936276753 100644
--- a/usr/src/uts/sparc/arp/arp.global-objs.debug64
+++ b/usr/src/uts/sparc/arp/arp.global-objs.debug64
@@ -23,15 +23,6 @@
# Use is subject to license terms.
#
-ar_cmd_tbl
-ar_m_tbl
-arp_mod_info
-arp_no_defense
-arpinfo
-arprinit
-arpwinit
-arp_param_arr
-arp_netinfo
cb_inet_devops
fsw
inet_dev_info
diff --git a/usr/src/uts/sparc/ip/ip.global-objs.debug64 b/usr/src/uts/sparc/ip/ip.global-objs.debug64
index 8df87d813d..07e9aaedde 100644
--- a/usr/src/uts/sparc/ip/ip.global-objs.debug64
+++ b/usr/src/uts/sparc/ip/ip.global-objs.debug64
@@ -23,19 +23,24 @@
# Use is subject to license terms.
#
+arp_m_tbl
+arp_mod_info
+arp_netinfo
+arp_no_defense
+arpinfo
cb_inet_devops
cl_inet_bind
+cl_inet_checkspi
cl_inet_connect2
+cl_inet_deletespi
cl_inet_disconnect
+cl_inet_getspi
+cl_inet_idlesa
cl_inet_ipident
cl_inet_isclusterwide
cl_inet_listen
cl_inet_unbind
cl_inet_unlisten
-cl_inet_getspi
-cl_inet_checkspi
-cl_inet_deletespi
-cl_inet_idlesa
cl_sctp_assoc_change
cl_sctp_check_addrs
cl_sctp_connect
@@ -43,6 +48,7 @@ cl_sctp_disconnect
cl_sctp_listen
cl_sctp_unlisten
conn_drain_nthreads
+dce_cache
default_ip6_asp_table
do_tcp_fusion
do_tcpzcopy
@@ -97,74 +103,45 @@ ill_no_arena
ill_null
inet_dev_info
inet_devops
-ip6_area_template
-ip6_ared_template
-ip6_cache_table_size
ip6_ftable_hash_size
-ip6_ire_max_bucket_cnt
-ip6_ire_min_bucket_cnt
-ip6_max_cache_table_size
ip6opt_ls
-ip_ard_template
-ip_area_template
-ip_ared_template
-ip_areq_template
-ip_arma_multi_template
-ip_aroff_template
-ip_aron_template
-ip_aru_template
-ip_cache_table_size
ip_cgtp_filter_rev
ip_conn_cache
ip_debug
ip_g_all_ones
-ip_helper_stream_cache
ip_helper_stream_info
ip_helper_stream_rinit
ip_helper_stream_winit
ip_ioctl_ftbl
-ip_ire_cleanup_cnt
-ip_ire_cpu_ratio
-ip_ire_max_bucket_cnt
-ip_ire_mem_ratio
-ip_ire_min_bucket_cnt
-ip_loopback_mtu
ip_loopback_mtu_v6plus
ip_loopback_mtuplus
ip_m_tbl
-ip_max_cache_table_size
ip_max_frag_dups
ip_min_frag_prune_time
-ip_minor_arena_sa
ip_minor_arena_la
+ip_minor_arena_sa
ip_misc_ioctl_count
ip_misc_ioctl_table
ip_mod_info
ip_modclose_ackwait_ms
ip_ndx_ioctl_count
ip_ndx_ioctl_table
-ip_opt_arr
-ip_opt_obj
ip_poll_normal_ms
ip_poll_normal_ticks
ip_rput_pullups
ip_six_byte_all_ones
ip_squeue_create_callback
ip_squeue_enter
-ip_squeue_enter_unbound
ip_squeue_fanout
ip_squeue_flag
ip_squeue_worker_wait
ip_thread_data
ip_thread_list
ip_thread_rwlock
-ip_use_helper_cache
-ip_wput_frag_mdt_min
ipcl_bind_fanout_size
ipcl_conn_hash_maxsize
ipcl_conn_hash_memfactor
ipcl_conn_hash_size
-ipcl_debug_level
ipcl_iptun_fanout_size
ipcl_raw_fanout_size
ipcl_udp_fanout_size
@@ -174,24 +151,16 @@ ipinfov4
ipinfov6
iplrinit
iplwinit
-ipmp_aract_template
-ipmp_ardeact_template
ipmp_kstats
iprinitv4
iprinitv6
ipsec_action_cache
ipsec_hdr_pullup_needed
-ipsec_info_cache
ipsec_pol_cache
ipsec_policy_failure_msgs
ipsec_sel_cache
ipsec_spd_hashsize
ipsec_weird_null_inbound_policy
-ipsechw_debug
-iptunq_info
-iptunq_modinfo
-iptunq_rinit
-iptunq_winit
ipv4_forward_suffix
ipv4info
ipv6_all_hosts_mcast
@@ -199,29 +168,22 @@ ipv6_all_ones
ipv6_all_rtrs_mcast
ipv6_all_v2rtrs_mcast
ipv6_all_zeros
-ipv6_areq_template
ipv6_forward_suffix
ipv6_ll_template
ipv6_loopback
ipv6_solicited_node_mcast
ipv6_unspecified_group
ipv6info
-ipwinitv4
-ipwinitv6
+ipwinit
ire_cache
ire_gw_secattr_cache
-ire_idle_cutoff_interval
ire_null
ire_nv_arr
ire_nv_tbl
-ire_uinfo_null
lcl_ndp_arr
lcl_param_arr
lcl_sctp_param_arr
lcl_sctp_wroff_xtra_param
-lcl_tcp_mdt_head_param
-lcl_tcp_mdt_max_pbufs_param
-lcl_tcp_mdt_tail_param
lcl_tcp_param_arr
lcl_tcp_wroff_xtra_param
mask_rnhead
@@ -230,6 +192,8 @@ modldrv
modlinkage
modlstrmod
multicast_encap_iphdr
+nce_cache
+ncec_cache
netdev_privs
prov_update_handle
radix_mask_cache
@@ -238,6 +202,7 @@ rawip_conn_cache
recvq_call
recvq_loop_cnt
req_arr
+rinit_arp
rn_mkfreelist
rn_ones
rn_zeros
@@ -260,19 +225,17 @@ sctp_kmem_faddr_cache
sctp_kmem_ftsn_set_cache
sctp_kmem_set_cache
sctp_mod_info
+sctp_opt_arr
+sctp_opt_arr_size
sctp_recvq_tq_task_max
sctp_recvq_tq_task_min
sctp_recvq_tq_thr_max
sctp_recvq_tq_thr_min
sctp_sin6_null
-sctp_taskq
sctpdebug
sctpinfo
sctprinit
sctpwinit
-sendq_collision
-sendq_empty
-sendq_loop_cnt
sin6_null
sin_null
skip_sctp_cksum
@@ -300,12 +263,10 @@ tcp_g_statistics
tcp_g_t_info_ack
tcp_g_t_info_ack_v6
tcp_icmp_source_quench
-tcp_iphc_cache
tcp_max_optsize
-tcp_mdt_chain
-tcp_mdt_smss_threshold
tcp_opt_arr
tcp_opt_obj
+tcp_outbound_squeue_switch
tcp_random_anon_port
tcp_random_end_ptr
tcp_random_fptr
@@ -321,13 +282,11 @@ tcp_sock_winit
tcp_squeue_flag
tcp_squeue_wput
tcp_static_maxpsz
-tcp_taskq
tcp_timercache
tcp_tx_pull_len
tcp_valid_levels_arr
tcp_winfo
tcp_winit
-tcp_outbound_squeue_switch
tcpinfov4
tcpinfov6
tli_errs
@@ -352,4 +311,6 @@ udp_valid_levels_arr
udp_winit
udpinfov4
udpinfov6
-zero_info
+winit_arp
+eri_cksum_workaround
+nxge_cksum_workaround
diff --git a/usr/src/uts/sparc/ip/ip.global-objs.obj64 b/usr/src/uts/sparc/ip/ip.global-objs.obj64
index 3df973b8f9..526e907ab5 100644
--- a/usr/src/uts/sparc/ip/ip.global-objs.obj64
+++ b/usr/src/uts/sparc/ip/ip.global-objs.obj64
@@ -23,19 +23,24 @@
# Use is subject to license terms.
#
+arp_m_tbl
+arp_mod_info
+arp_netinfo
+arp_no_defense
+arpinfo
cb_inet_devops
cl_inet_bind
+cl_inet_checkspi
cl_inet_connect2
+cl_inet_deletespi
cl_inet_disconnect
+cl_inet_getspi
+cl_inet_idlesa
cl_inet_ipident
cl_inet_isclusterwide
cl_inet_listen
cl_inet_unbind
cl_inet_unlisten
-cl_inet_getspi
-cl_inet_checkspi
-cl_inet_deletespi
-cl_inet_idlesa
cl_sctp_assoc_change
cl_sctp_check_addrs
cl_sctp_connect
@@ -43,6 +48,7 @@ cl_sctp_disconnect
cl_sctp_listen
cl_sctp_unlisten
conn_drain_nthreads
+dce_cache
default_ip6_asp_table
do_tcp_fusion
do_tcpzcopy
@@ -97,69 +103,41 @@ ill_no_arena
ill_null
inet_dev_info
inet_devops
-ip6_area_template
-ip6_ared_template
-ip6_cache_table_size
ip6_ftable_hash_size
-ip6_ire_max_bucket_cnt
-ip6_ire_min_bucket_cnt
-ip6_max_cache_table_size
ip6opt_ls
-ip_ard_template
-ip_area_template
-ip_ared_template
-ip_areq_template
-ip_arma_multi_template
-ip_aroff_template
-ip_aron_template
-ip_aru_template
-ip_cache_table_size
ip_cgtp_filter_rev
ip_conn_cache
ip_debug
ip_g_all_ones
-ip_helper_stream_cache
ip_helper_stream_info
ip_helper_stream_rinit
ip_helper_stream_winit
ip_ioctl_ftbl
-ip_ire_cleanup_cnt
-ip_ire_cpu_ratio
-ip_ire_max_bucket_cnt
-ip_ire_mem_ratio
-ip_ire_min_bucket_cnt
-ip_loopback_mtu
ip_loopback_mtu_v6plus
ip_loopback_mtuplus
ip_m_tbl
-ip_max_cache_table_size
ip_max_frag_dups
ip_min_frag_prune_time
-ip_minor_arena_sa
ip_minor_arena_la
+ip_minor_arena_sa
ip_misc_ioctl_count
ip_misc_ioctl_table
ip_mod_info
ip_modclose_ackwait_ms
ip_ndx_ioctl_count
ip_ndx_ioctl_table
-ip_opt_arr
-ip_opt_obj
ip_poll_normal_ms
ip_poll_normal_ticks
ip_rput_pullups
ip_six_byte_all_ones
ip_squeue_create_callback
ip_squeue_enter
-ip_squeue_enter_unbound
ip_squeue_fanout
ip_squeue_flag
ip_squeue_worker_wait
ip_thread_data
ip_thread_list
ip_thread_rwlock
-ip_use_helper_cache
-ip_wput_frag_mdt_min
ipcl_bind_fanout_size
ipcl_conn_hash_maxsize
ipcl_conn_hash_memfactor
@@ -173,23 +151,16 @@ ipinfov4
ipinfov6
iplrinit
iplwinit
-ipmp_aract_template
-ipmp_ardeact_template
ipmp_kstats
iprinitv4
iprinitv6
ipsec_action_cache
ipsec_hdr_pullup_needed
-ipsec_info_cache
ipsec_pol_cache
ipsec_policy_failure_msgs
ipsec_sel_cache
ipsec_spd_hashsize
ipsec_weird_null_inbound_policy
-iptunq_info
-iptunq_modinfo
-iptunq_rinit
-iptunq_winit
ipv4_forward_suffix
ipv4info
ipv6_all_hosts_mcast
@@ -197,29 +168,22 @@ ipv6_all_ones
ipv6_all_rtrs_mcast
ipv6_all_v2rtrs_mcast
ipv6_all_zeros
-ipv6_areq_template
ipv6_forward_suffix
ipv6_ll_template
ipv6_loopback
ipv6_solicited_node_mcast
ipv6_unspecified_group
ipv6info
-ipwinitv4
-ipwinitv6
+ipwinit
ire_cache
ire_gw_secattr_cache
-ire_idle_cutoff_interval
ire_null
ire_nv_arr
ire_nv_tbl
-ire_uinfo_null
lcl_ndp_arr
lcl_param_arr
lcl_sctp_param_arr
lcl_sctp_wroff_xtra_param
-lcl_tcp_mdt_head_param
-lcl_tcp_mdt_max_pbufs_param
-lcl_tcp_mdt_tail_param
lcl_tcp_param_arr
lcl_tcp_wroff_xtra_param
mask_rnhead
@@ -228,12 +192,15 @@ modldrv
modlinkage
modlstrmod
multicast_encap_iphdr
+nce_cache
+ncec_cache
netdev_privs
prov_update_handle
radix_mask_cache
radix_node_cache
rawip_conn_cache
req_arr
+rinit_arp
rn_mkfreelist
rn_ones
rn_zeros
@@ -256,12 +223,13 @@ sctp_kmem_faddr_cache
sctp_kmem_ftsn_set_cache
sctp_kmem_set_cache
sctp_mod_info
+sctp_opt_arr
+sctp_opt_arr_size
sctp_recvq_tq_task_max
sctp_recvq_tq_task_min
sctp_recvq_tq_thr_max
sctp_recvq_tq_thr_min
sctp_sin6_null
-sctp_taskq
sctpdebug
sctpinfo
sctprinit
@@ -292,12 +260,10 @@ tcp_g_statistics
tcp_g_t_info_ack
tcp_g_t_info_ack_v6
tcp_icmp_source_quench
-tcp_iphc_cache
tcp_max_optsize
-tcp_mdt_chain
-tcp_mdt_smss_threshold
tcp_opt_arr
tcp_opt_obj
+tcp_outbound_squeue_switch
tcp_random_anon_port
tcp_random_end_ptr
tcp_random_fptr
@@ -313,13 +279,11 @@ tcp_sock_winit
tcp_squeue_flag
tcp_squeue_wput
tcp_static_maxpsz
-tcp_taskq
tcp_timercache
tcp_tx_pull_len
tcp_valid_levels_arr
tcp_winfo
tcp_winit
-tcp_outbound_squeue_switch
tcpinfov4
tcpinfov6
tli_errs
@@ -344,4 +308,6 @@ udp_valid_levels_arr
udp_winit
udpinfov4
udpinfov6
-zero_info
+winit_arp
+eri_cksum_workaround
+nxge_cksum_workaround
diff --git a/usr/src/uts/sparc/ml/modstubs.s b/usr/src/uts/sparc/ml/modstubs.s
index 18eba0bdfa..24058b72e4 100644
--- a/usr/src/uts/sparc/ml/modstubs.s
+++ b/usr/src/uts/sparc/ml/modstubs.s
@@ -397,7 +397,6 @@ stubs_base:
MODULE(ipsecah,drv);
WSTUB(ipsecah, ipsec_construct_inverse_acquire, nomod_zero);
WSTUB(ipsecah, sadb_acquire, nomod_zero);
- WSTUB(ipsecah, sadb_ill_download, nomod_zero);
WSTUB(ipsecah, ipsecah_algs_changed, nomod_zero);
WSTUB(ipsecah, sadb_alg_update, nomod_zero);
WSTUB(ipsecah, sadb_unlinkassoc, nomod_zero);
@@ -1218,8 +1217,6 @@ stubs_base:
STUB(iptun, iptun_create, nomod_einval);
STUB(iptun, iptun_delete, nomod_einval);
STUB(iptun, iptun_set_policy, nomod_einval);
- STUB(iptun, iptun_set_g_q, nomod_einval);
- STUB(iptun, iptun_clear_g_q, nomod_void);
END_MODULE(iptun);
#endif